"""
Script for extracting the experience features in a software repository.
"""
- __author__ = "Oscar Svensson"
- __copyright__ = "Copyright (c) 2018 Axis Communications AB"
- __license__ = "MIT"
- import csv
- import json
- import sys
- import time
- from argparse import ArgumentParser
- from datetime import datetime
- from numpy import floor
- from pygit2 import Repository, GIT_SORT_TOPOLOGICAL, GIT_SORT_REVERSE
- from tqdm import tqdm
def set_to_list(obj):
    """
    Helper function to turn sets to lists and floats to strings.

    Suitable as the ``default`` hook of ``json.dump``.

    Args:
        obj: The object to convert.

    Returns:
        A list with the members of ``obj`` when it is a set, or ``obj``
        formatted with 15 significant digits when it is a float.

    Raises:
        TypeError: If ``obj`` is neither a set nor a float.
    """
    if isinstance(obj, set):
        return list(obj)
    if isinstance(obj, float):
        return '%.15g' % obj
    # Name the offending type so a failed dump can be traced to its source.
    raise TypeError(
        "Object of type {} is not JSON serializable".format(
            type(obj).__name__))
def get_files_in_tree(tree, repo):
    """
    Function to get the Java source files in a tree.

    Walks ``tree`` recursively and collects every non-binary blob whose
    name ends with ``.java``.

    Args:
        tree: Iterable of tree entries; each entry is read for its
            ``type``, ``name``, ``id`` and ``hex`` attributes.
        repo: Object that resolves an entry id to the tree or blob it
            names (indexed as ``repo[entry.id]``).

    Returns:
        A set of ``(hex, path)`` tuples, where ``path`` is relative to
        ``tree`` and uses ``/`` as separator.
    """
    files = set()
    for entry in tree:
        if entry.type == "tree":
            # Prefix everything found below with this directory's name.
            files.update(
                (blob_hex, "{}/{}".format(entry.name, path))
                for blob_hex, path in get_files_in_tree(repo[entry.id], repo))
        # Test the name first: entries that cannot be Java sources are never
        # looked up in the repository at all.
        elif entry.name.endswith(".java") and not repo[entry.id].is_binary:
            files.add((entry.hex, entry.name))
    return files
def get_diffing_files(commit, parent, repo):
    """
    Function to collect the non-binary files that differ between two commits.

    Args:
        commit: The newer side of the comparison.
        parent: The older side of the comparison.
        repo: Object whose ``diff(parent, commit)`` yields the patches.

    Returns:
        A set of ``(id, path, status)`` tuples, one per non-binary delta,
        where ``id`` and ``path`` come from the delta's new file.
    """
    deltas = (patch.delta for patch in repo.diff(parent, commit))
    return {
        (delta.new_file.id, delta.new_file.path, delta.status)
        for delta in deltas
        if not delta.is_binary
    }
def save_experience_features_graph(repo_path, branch, graph_path):
    """
    Function to get and save the experience graph.

    The graph is written as a JSON object keyed by committer name. Each
    committer maps ``'lastcommit'`` to the hex of their most recent commit,
    and each of their commit hexes to a record holding:

    * ``'prevcommit'``: hex of the committer's previous commit ("" if none),
    * ``'exp'``: number of commits by the committer up to and including
      this one,
    * ``'rexp'``: list of ``[changed_files, age]`` pairs, one per commit by
      the committer so far; every existing ``age`` is increased by
      ``abs(floor(days / 365))`` of the gap to the committer's previous
      commit,
    * ``'sexp'``: empty placeholder, present on a committer's first record
      only.

    Args:
        repo_path: Path to the local git repository.
        branch: Full reference name of the branch to walk.
        graph_path: Path of the JSON file to write.

    Raises:
        ValueError: If ``branch`` does not exist in the repository.
    """
    repo = Repository(repo_path)
    head = repo.references.get(branch)
    if head is None:
        raise ValueError(
            "Branch '{}' not found in repository '{}'".format(
                branch, repo_path))

    # Oldest commit first, so every commit is seen after its ancestors.
    commits = list(
        repo.walk(head.target, GIT_SORT_TOPOLOGICAL | GIT_SORT_REVERSE))

    start_time = time.time()

    # Seed the graph with the oldest commit of the walk. It has no
    # predecessor to diff against, so its whole tree counts as changed.
    # Seeding from any later commit would credit that committer with
    # experience they had not yet gained.
    first_commit = commits[0]
    files = get_files_in_tree(first_commit.tree, repo)
    all_authors = {
        first_commit.committer.name: {
            'lastcommit': first_commit.hex,
            first_commit.hex: {
                'prevcommit': "",
                'exp': 1,
                'rexp': [[len(files), 1]],
                'sexp': {},
            },
        },
    }

    for i, commit in enumerate(tqdm(commits[1:])):
        # commits[i] is the commit walked immediately before `commit`.
        files = get_diffing_files(commit, commits[i], repo)
        author = commit.committer.name

        if author not in all_authors:
            all_authors[author] = {
                'lastcommit': commit.hex,
                commit.hex: {
                    'prevcommit': "",
                    'exp': 1,
                    'rexp': [[len(files), 1.0]],
                    'sexp': {},
                },
            }
            continue

        history = all_authors[author]
        last_commit = history['lastcommit']
        previous = history[last_commit]

        date_current = datetime.fromtimestamp(commit.commit_time)
        date_last = datetime.fromtimestamp(repo.get(last_commit).commit_time)
        diffing_years = abs(
            floor(float((date_current - date_last).days) / 365))

        history['lastcommit'] = commit.hex
        history[commit.hex] = {
            'prevcommit': last_commit,
            'exp': 1 + previous['exp'],
            # Newest contribution first; older ones age by the elapsed gap.
            'rexp': [[len(files), 1.0]] + [[e[0], e[1] + diffing_years]
                                           for e in previous['rexp']],
        }

    with open(graph_path, 'w') as output:
        json.dump(all_authors, output, default=set_to_list)

    end_time = time.time()
    print("Done")
    print("Overall processing time {}".format(end_time - start_time))
def load_experience_features_graph(path="./results/author_graph.json"):
    """
    Function to load the features graph.

    Args:
        path: Path of the JSON file to read.

    Returns:
        The decoded JSON content of the file.
    """
    with open(path, 'r') as graph_file:
        return json.load(graph_file, parse_float=float)
def get_experience_features(graph, repo_path, branch):
    """
    Function that extracts the experience features from an experience graph.

    Args:
        graph: Experience graph, indexed as ``graph[committer][commit_hex]``
            and read for the ``'exp'`` and ``'rexp'`` entries of each record.
        repo_path: Path to the local git repository.
        branch: Full reference name of the branch to walk.

    Returns:
        A list with one row per commit, oldest first. Each row is
        ``[commit_hex, experience, rexp, sexp]`` with every value converted
        to a string. The first row uses an experience of 1.0 and the number
        of files returned by ``get_files_in_tree`` for that commit's tree
        as ``rexp``. For every other row ``rexp`` is the sum of
        ``files / (age + 1)`` over the commit's ``'rexp'`` pairs. ``sexp``
        is always 0.0.

    Raises:
        ValueError: If ``branch`` does not exist in the repository.
        SystemExit: If a ``'rexp'`` entry cannot be evaluated; the offending
            committer, commit and entry are printed first.
    """
    repo = Repository(repo_path)
    head = repo.references.get(branch)
    if head is None:
        raise ValueError(
            "Branch '{}' not found in repository '{}'".format(
                branch, repo_path))

    # Oldest commit first.
    commits = list(
        repo.walk(head.target, GIT_SORT_TOPOLOGICAL | GIT_SORT_REVERSE))

    # The first row describes commits[0], so count the files of that
    # commit's own tree rather than those of whatever HEAD points at.
    first_commit = commits[0]
    files = get_files_in_tree(first_commit.tree, repo)
    features = [
        [str(first_commit.hex), str(1.0), str(len(files)), str(0.0)],
    ]

    for commit in tqdm(commits[1:]):
        author = commit.committer.name
        record = graph[author][commit.hex]
        exp = record['exp']
        rexp = record['rexp']
        try:
            rrexp = sum(float(e[0]) / (float(e[1]) + 1) for e in rexp)
        except (TypeError, ValueError, LookupError, ZeroDivisionError):
            # Malformed graph data: dump what is needed to locate it, then
            # stop. Interrupts and exit requests are no longer intercepted.
            print(author)
            print(commit.hex)
            print(rexp)
            sys.exit(1)

        features.append([
            str(commit.hex),
            str(float(exp)),
            str(float(rrexp)),
            str(float(0)),
        ])
    return features
def save_experience_features(history_features, path):
    """
    Save the experience features to a csv file.

    Writes a ``commit,experience,rexp,sexp`` header followed by the first
    four values of every non-empty row.

    Args:
        history_features: Iterable of rows; empty rows are skipped and each
            remaining row must hold at least four values.
        path: Path of the csv file to write.

    Raises:
        IndexError: If a non-empty row holds fewer than four values.
    """
    # newline='' hands line-ending control to the csv writer; without it,
    # platforms that translate '\n' would emit '\r\r\n' row terminators.
    with open(path, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(["commit", "experience", "rexp", "sexp"])
        writer.writerows(
            [row[0], row[1], row[2], row[3]]
            for row in history_features
            if row)
if __name__ == "__main__":
    # Command-line entry point: optionally rebuild the experience graph,
    # then derive the per-commit experience features from it and write
    # them out as csv.
    PARSER = ArgumentParser(
        description="Utility to extract experience features from a " +
        "repository.")
    PARSER.add_argument(
        "--repository",
        "-r",
        type=str,
        default="./repos/jenkins",
        help="Path to local git repository.")
    PARSER.add_argument(
        "--branch",
        "-b",
        type=str,
        default="refs/heads/master",
        help="Which branch to use.")
    PARSER.add_argument(
        "--save-graph",
        "-sg",
        action="store_true",
        help="Generate a new graph for a repository.")
    PARSER.add_argument(
        "--graph-path",
        "-gp",
        type=str,
        default="./results/author_graph.json",
        help="The path to where the graph is stored.")
    PARSER.add_argument(
        "--output",
        "-o",
        type=str,
        default="./results/experience_features.csv",
        help="The path where the output is written.")

    ARGS = PARSER.parse_args()
    REPO_PATH = ARGS.repository
    BRANCH = ARGS.branch
    SAVE_GRAPH = ARGS.save_graph
    GRAPH_PATH = ARGS.graph_path
    OUTPUT = ARGS.output

    # Without --save-graph, a graph must already exist at GRAPH_PATH.
    if SAVE_GRAPH:
        save_experience_features_graph(REPO_PATH, BRANCH, GRAPH_PATH)
    GRAPH = load_experience_features_graph(GRAPH_PATH)
    EXPERIENCE_FEATURES = get_experience_features(GRAPH, REPO_PATH, BRANCH)
    save_experience_features(EXPERIENCE_FEATURES, OUTPUT)
|