# Source repository: j3tracey/grading-on-a-curve
"""
Script to extract history features from a git repository.
"""
__author__ = "Oscar Svensson"
__copyright__ = "Copyright (c) 2018 Axis Communications AB"
__license__ = "MIT"

import csv
import json
import time

from argparse import ArgumentParser
from pygit2 import Repository, GIT_SORT_TOPOLOGICAL, GIT_SORT_REVERSE
from tqdm import tqdm

def set_to_list(obj):
    """
    Convert a set (or frozenset) to a list.

    Intended to be passed as the ``default`` hook of ``json.dump`` so that
    sets, which the json module cannot encode, are written as JSON arrays.
    The order of the resulting list is the iteration order of the set.

    :param obj: the object the JSON encoder could not serialise.
    :return: a list holding the elements of ``obj``.
    :raises TypeError: if ``obj`` is not a set or frozenset; the message names
        the offending type so the failing value can be tracked down.
    """
    if isinstance(obj, (set, frozenset)):
        return list(obj)
    raise TypeError(
        "Object of type {} is not JSON serializable".format(
            type(obj).__name__))

def get_files_in_tree(tree, repo):
    """
    Recursively collect the non-binary Java files found in a git tree.

    Entries whose type is "tree" are walked recursively and the files found
    in them are reported with the sub-tree name prepended, separated by "/".
    For every other entry the name is checked first and the object is only
    fetched from ``repo`` when the name matches. Entries that are filtered
    out by name are therefore never looked up, which avoids loading every
    blob and avoids a lookup error for entries whose id is not present in the
    repository (presumably e.g. submodule links; confirm against the
    repositories this is run on).

    :param tree: iterable of tree entries exposing ``type``, ``name``, ``id``
        and ``hex``.
    :param repo: object indexable by entry id, returning the git object.
    :return: set of ``(hex, path)`` tuples, one per matching file.
    """
    files = set()
    for entry in tree:
        # NOTE(review): the type is compared against the string "tree";
        # confirm the installed pygit2 version reports entry types as strings.
        if entry.type == "tree":
            files.update(
                (blob_hex, "{}/{}".format(entry.name, path))
                for blob_hex, path in get_files_in_tree(repo[entry.id], repo))
        # The suffix is "java" without a leading dot, as in the original
        # filter; the cheap name test runs before the object lookup.
        elif entry.name.endswith("java") and not repo[entry.id].is_binary:
            files.add((entry.hex, entry.name))
    return files


def get_diffing_files(commit, parent, repo):
    """
    Collect the non-binary files that differ between two commits.

    The diff is requested from ``repo`` as ``repo.diff(parent, commit)``.
    Patches whose delta is flagged as binary are ignored.

    :param commit: the newer side of the diff.
    :param parent: the older side of the diff.
    :param repo: repository object providing ``diff``.
    :return: set of ``(new file id, new file path, delta status)`` tuples.
    """
    deltas = (patch.delta for patch in repo.diff(parent, commit))
    return {
        (delta.new_file.id, delta.new_file.path, delta.status)
        for delta in deltas
        if not delta.is_binary
    }


def save_history_features_graph(repo_path, branch, graph_path):
    """
    Build a per-file change graph for a repository and store it as JSON.

    Every file returned by ``get_files_in_tree`` for the commit that
    ``repo.head`` points at is seeded with a node for that commit. The commits
    reachable from ``branch`` are then walked in reversed topological order;
    each commit except the first is diffed against the commit preceding it in
    the walk. Every changed file gets a node for the commit holding

    * ``"prevcommit"``: the commit previously recorded for that file (empty
      string if there is none), and
    * ``"authors"``: the committer name of the commit plus the authors of the
      previous node.

    The ``'lastcommit'`` key of each file tracks its most recent node.

    A file whose latest node already belongs to the commit being processed is
    left untouched. This happens when the seeded commit is itself the first
    walked commit that touches the file; replacing the node at that point
    would discard its authors before they are read and raise a ``KeyError``.

    :param repo_path: path to the local git repository.
    :param branch: name of the reference to walk, e.g. "refs/heads/master".
    :param graph_path: path of the JSON file the graph is written to.
    """
    repo = Repository(repo_path)
    head = repo.references.get(branch)

    commits = list(
        repo.walk(head.target, GIT_SORT_TOPOLOGICAL | GIT_SORT_REVERSE))

    start_time = time.time()

    # Seed the graph with the files of the commit repo.head points at.
    current_commit = repo.get(str(repo.head.target))
    current_hex = current_commit.hex
    all_files = {}
    for (_, name) in tqdm(get_files_in_tree(current_commit.tree, repo)):
        all_files[name] = {
            'lastcommit': current_hex,
            current_hex: {
                "prevcommit": "",
                "authors": [current_commit.committer.name],
            },
        }

    # commits[i] is the commit that precedes `commit` in the walk.
    for i, commit in enumerate(tqdm(commits[1:])):
        commit_hex = commit.hex
        for (_, name, _) in get_diffing_files(commit, commits[i], repo):
            node = all_files.setdefault(name, {})
            # Creates the 'lastcommit' key up front for new files so it stays
            # the first key of the node; "" means "no previous commit".
            last_commit = node.setdefault('lastcommit', "")

            if last_commit == commit_hex:
                # The file already has a node for this very commit (the
                # seed); keep it instead of overwriting it with a node that
                # would reference itself.
                continue

            authors = {commit.committer.name}
            if last_commit:
                authors.update(node[last_commit]["authors"])

            node[commit_hex] = {
                "prevcommit": last_commit,
                "authors": authors,
            }
            node['lastcommit'] = commit_hex

    # Author sets are not JSON encodable; set_to_list turns them into lists.
    with open(graph_path, 'w') as output:
        json.dump(all_files, output, default=set_to_list)

    end_time = time.time()

    print("Done")
    print("Overall processing time {}".format(end_time - start_time))

def load_history_features_graph(path):
    """
    Load a previously saved file graph from a JSON file.

    :param path: path of the JSON file to read.
    :return: the decoded JSON content.
    """
    with open(path, 'r') as graph_file:
        return json.load(graph_file)


def get_history_features(graph, repo_path, branch):
    """
    Compute the history features for every commit reachable from a branch.

    The commits are walked in reversed topological order. The first commit of
    the walk gets the fixed row ``[hex, "1.0", "0.0", "0.0"]``. Every later
    commit is diffed against the commit preceding it in the walk and, over the
    changed files, the following is derived from ``graph``:

    * the number of distinct authors recorded for those files,
    * the mean difference in commit time between the commit and the previous
      commit recorded for each file (0 if no file has a previous commit),
    * the number of distinct previous commits.

    :param graph: file graph indexed as ``graph[file path][commit hex]``, each
        node providing ``'authors'`` and ``'prevcommit'``.
    :param repo_path: path to the local git repository.
    :param branch: name of the reference to walk.
    :return: list of rows ``[commit hex, authors, age, unique changes]`` with
        every value rendered as a string.
    """
    repo = Repository(repo_path)
    branch_ref = repo.references.get(branch)
    history = list(
        repo.walk(branch_ref.target, GIT_SORT_TOPOLOGICAL | GIT_SORT_REVERSE))

    # The first commit has nothing to be compared against.
    rows = [[str(history[0].hex), "1.0", "0.0", "0.0"]]

    for idx, commit in enumerate(tqdm(history[1:])):
        authors = set()
        ages = []
        earlier_changes = set()

        # history[idx] is the commit that precedes `commit` in the walk.
        for (_, path, _) in get_diffing_files(commit, history[idx], repo):
            node = graph[path][commit.hex]
            authors.update(node['authors'])

            previous_hex = node['prevcommit']
            if not previous_hex:
                continue
            earlier_changes.add(previous_hex)
            previous_commit = repo.get(previous_hex)
            ages.append(commit.commit_time - previous_commit.commit_time)

        if ages:
            mean_age = float(sum(ages)) / len(ages)
        else:
            mean_age = 0

        rows.append([
            str(commit.hex),
            str(float(len(authors))),
            str(float(mean_age)),
            str(float(len(earlier_changes))),
        ])
    return rows


def save_history_features(history_features, path):
    """
    Write the history features to a csv file.

    A header row is written first, followed by the first four values of every
    non-empty row in ``history_features``. Empty rows are skipped.

    :param history_features: iterable of rows, each indexable with at least
        four values (commit, number of authors, age, number of unique changes).
    :param path: path of the csv file to write.
    :raises IndexError: if a non-empty row holds fewer than four values.
    """
    header = ["commit", "number_of_authors", "age", "number_unique_changes"]
    # The csv module expects files opened with newline=''; without it,
    # platforms that translate "\n" on write end every row with "\r\r\n".
    with open(path, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(header)
        for row in history_features:
            if row:
                writer.writerow([row[0], row[1], row[2], row[3]])


if __name__ == "__main__":
    # Command line entry point: optionally rebuild the file graph, then load
    # it, compute the history features and write them to a csv file.
    PARSER = ArgumentParser(description="Utility to extract history features" +
                            " from a git repository.")

    PARSER.add_argument(
        "--repository",
        "-r",
        type=str,
        default="./repos/jenkins",
        help="Path to local git repository.")
    PARSER.add_argument(
        "--branch",
        "-b",
        type=str,
        default="refs/heads/master",
        help="Which branch to use.")
    PARSER.add_argument(
        "--save-graph",
        "-sg",
        action="store_true",
        help="Generate a new graph for a repository.")
    PARSER.add_argument(
        "--graph-path",
        "-gp",
        type=str,
        default="./results/file_graph.json",
        help="The path to where the graph is stored.")
    PARSER.add_argument(
        "--output",
        "-o",
        type=str,
        default="./results/history_features.csv",
        help="The path where the output is written.")

    ARGS = PARSER.parse_args()
    REPO_PATH = ARGS.repository
    BRANCH = ARGS.branch
    SAVE_GRAPH = ARGS.save_graph
    GRAPH_PATH = ARGS.graph_path
    OUTPUT = ARGS.output

    # The graph is only regenerated on request; otherwise the file at
    # GRAPH_PATH must already exist.
    if SAVE_GRAPH:
        save_history_features_graph(REPO_PATH, BRANCH, GRAPH_PATH)
    GRAPH = load_history_features_graph(GRAPH_PATH)
    HISTORY_FEATURES = get_history_features(GRAPH, REPO_PATH, BRANCH)
    save_history_features(HISTORY_FEATURES, OUTPUT)