assemble_experience_features.py 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244
  1. """
  2. Script for extracting the experience features in a software repository.
  3. """
  4. __author__ = "Oscar Svensson"
  5. __copyright__ = "Copyright (c) 2018 Axis Communications AB"
  6. __license__ = "MIT"
  7. import csv
  8. import json
  9. import sys
  10. import time
  11. from argparse import ArgumentParser
  12. from datetime import datetime
  13. from numpy import floor
  14. from pygit2 import Repository, GIT_SORT_TOPOLOGICAL, GIT_SORT_REVERSE
  15. from tqdm import tqdm
  16. def set_to_list(obj):
  17. """
  18. Helper function to turn sets to lists and floats to strings.
  19. """
  20. if isinstance(obj, set):
  21. return list(obj)
  22. if isinstance(obj, float):
  23. return str('%.15g' % obj)
  24. raise TypeError
  25. def get_files_in_tree(tree, repo):
  26. """
  27. Function to get the files in a tree.
  28. """
  29. files = set()
  30. for entry in tree:
  31. if entry.type == "tree":
  32. sub_files = [(f[0], "{}/{}".format(entry.name, f[1]))
  33. for f in get_files_in_tree(repo[entry.id], repo)]
  34. files.update(sub_files)
  35. else:
  36. blob = repo[entry.id]
  37. if not blob.is_binary:
  38. if entry.name.endswith("java"):
  39. files.add((entry.hex, entry.name))
  40. return files
  41. def get_diffing_files(commit, parent, repo):
  42. """
  43. Function to get the files that differs between two commits.
  44. """
  45. diff = repo.diff(parent, commit)
  46. patches = [p for p in diff]
  47. files = set()
  48. for patch in patches:
  49. if patch.delta.is_binary:
  50. continue
  51. nfile = patch.delta.new_file
  52. files.add((nfile.id, nfile.path, patch.delta.status))
  53. return files
  54. def save_experience_features_graph(repo_path, branch, graph_path):
  55. """
  56. Function to get and save the experience graph.
  57. """
  58. repo = Repository(repo_path)
  59. head = repo.references.get(branch)
  60. commits = list(
  61. repo.walk(head.target, GIT_SORT_TOPOLOGICAL | GIT_SORT_REVERSE))
  62. current_commit = repo.head.target
  63. start_time = time.time()
  64. current_commit = repo.get(str(current_commit))
  65. files = get_files_in_tree(current_commit.tree, repo)
  66. all_authors = {}
  67. author = current_commit.committer.name
  68. all_authors[author] = {}
  69. all_authors[author]['lastcommit'] = current_commit.hex
  70. all_authors[author][current_commit.hex] = {}
  71. all_authors[author][current_commit.hex]['prevcommit'] = ""
  72. all_authors[author][current_commit.hex]["exp"] = 1
  73. all_authors[author][current_commit.hex]["rexp"] = [[len(files), 1]]
  74. all_authors[author][current_commit.hex]["sexp"] = {}
  75. for i, commit in enumerate(tqdm(commits[1:])):
  76. files = get_diffing_files(commit, commits[i], repo)
  77. author = commit.committer.name
  78. if author not in all_authors:
  79. all_authors[author] = {}
  80. all_authors[author]['lastcommit'] = commit.hex
  81. all_authors[author][commit.hex] = {}
  82. all_authors[author][commit.hex]['prevcommit'] = ""
  83. all_authors[author][commit.hex]["exp"] = 1
  84. all_authors[author][commit.hex]["rexp"] = [[len(files), 1.0]]
  85. all_authors[author][commit.hex]["sexp"] = {}
  86. else:
  87. last_commit = all_authors[author]["lastcommit"]
  88. all_authors[author]["lastcommit"] = commit.hex
  89. all_authors[author][commit.hex] = {}
  90. all_authors[author][commit.hex]['prevcommit'] = last_commit
  91. all_authors[author][commit.hex][
  92. 'exp'] = 1 + all_authors[author][last_commit]['exp']
  93. date_current = datetime.fromtimestamp(commit.commit_time)
  94. date_last = datetime.fromtimestamp(repo.get(last_commit).commit_time)
  95. diffing_years = abs(floor(float((date_current - date_last).days) / 365))
  96. overall = all_authors[author][last_commit]['rexp']
  97. all_authors[author][commit.hex][
  98. 'rexp'] = [[len(files), 1.0]] + [[e[0], e[1] + diffing_years]
  99. for e in overall]
  100. with open(graph_path, 'w') as output:
  101. json.dump(all_authors, output, default=set_to_list)
  102. end_time = time.time()
  103. print("Done")
  104. print("Overall processing time {}".format(end_time - start_time))
  105. def load_experience_features_graph(path="./results/author_graph.json"):
  106. """
  107. Function to load the feeatures graph.
  108. """
  109. file_graph = {}
  110. with open(path, 'r') as inp:
  111. file_graph = json.load(inp, parse_float=lambda x: float(x))
  112. return file_graph
  113. def get_experience_features(graph, repo_path, branch):
  114. """
  115. Function that extracts the experience features from a experience graph.
  116. """
  117. repo = Repository(repo_path)
  118. head = repo.references.get(branch)
  119. commits = list(
  120. repo.walk(head.target, GIT_SORT_TOPOLOGICAL | GIT_SORT_REVERSE))
  121. current_commit = repo.head.target
  122. files = get_files_in_tree(repo.get(str(current_commit)).tree, repo)
  123. features = []
  124. commit_feat = []
  125. commit_feat.append(str(commits[0].hex))
  126. commit_feat.append(str(1.0))
  127. commit_feat.append(str(len(files)))
  128. commit_feat.append(str(0.0))
  129. features.append(commit_feat)
  130. for _, commit in enumerate(tqdm(commits[1:])):
  131. author = commit.committer.name
  132. exp = graph[author][commit.hex]['exp']
  133. rexp = graph[author][commit.hex]['rexp']
  134. try:
  135. rrexp = sum([float(float(e[0]) / (float(e[1]) + 1)) for e in rexp])
  136. except:
  137. print(author)
  138. print(commit.hex)
  139. print(rexp)
  140. sys.exit(1)
  141. commit_feat = []
  142. commit_feat.append(str(commit.hex))
  143. commit_feat.append(str(float(exp)))
  144. commit_feat.append(str(float(rrexp)))
  145. commit_feat.append(str(float(0)))
  146. features.append(commit_feat)
  147. return features
  148. def save_experience_features(history_features, path):
  149. """
  150. Save the experience features to a csv file.
  151. """
  152. with open(path, 'w') as csv_file:
  153. writer = csv.writer(csv_file)
  154. writer.writerow(["commit", "experience", "rexp", "sexp"])
  155. for row in history_features:
  156. if row:
  157. writer.writerow([row[0], row[1], row[2], row[3]])
  158. if __name__ == "__main__":
  159. PARSER = ArgumentParser(description="Utility to extract code churns from" +
  160. " a repository or a single commit.")
  161. PARSER.add_argument(
  162. "--repository",
  163. "-r",
  164. type=str,
  165. default="./repos/jenkins",
  166. help="Path to local git repository.")
  167. PARSER.add_argument(
  168. "--branch",
  169. "-b",
  170. type=str,
  171. default="refs/heads/master",
  172. help="Which branch to use.")
  173. PARSER.add_argument(
  174. "--save-graph",
  175. "-sg",
  176. action="store_true",
  177. help="Generate a new graph for a repository.")
  178. PARSER.add_argument(
  179. "--graph-path",
  180. "-gp",
  181. type=str,
  182. default="./results/author_graph.json",
  183. help="The path to where the graph is stored.")
  184. PARSER.add_argument(
  185. "--output",
  186. "-o",
  187. type=str,
  188. default="./results/experience_features.csv",
  189. help="The path where the output is written.")
  190. ARGS = PARSER.parse_args()
  191. REPO_PATH = ARGS.repository
  192. BRANCH = ARGS.branch
  193. SAVE_GRAPH = ARGS.save_graph
  194. GRAPH_PATH = ARGS.graph_path
  195. OUTPUT = ARGS.output
  196. if SAVE_GRAPH:
  197. save_experience_features_graph(REPO_PATH, BRANCH, GRAPH_PATH)
  198. GRAPH = load_experience_features_graph(GRAPH_PATH)
  199. EXPERIENCE_FEATURES = get_experience_features(GRAPH, REPO_PATH, BRANCH)
  200. save_experience_features(EXPERIENCE_FEATURES, OUTPUT)