assemble_history_features.py 7.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239
  1. """
  2. Script to extract history features from a git repository.
  3. """
  4. __author__ = "Oscar Svensson"
  5. __copyright__ = "Copyright (c) 2018 Axis Communications AB"
  6. __license__ = "MIT"
  7. import csv
  8. import json
  9. import time
  10. from argparse import ArgumentParser
  11. from pygit2 import Repository, GIT_SORT_TOPOLOGICAL, GIT_SORT_REVERSE
  12. from tqdm import tqdm
  13. def set_to_list(obj):
  14. """
  15. Helper function to convert a set to a list.
  16. """
  17. if isinstance(obj, set):
  18. return list(obj)
  19. raise TypeError
  20. def get_files_in_tree(tree, repo):
  21. """
  22. Extract the hex of all files and their name.
  23. """
  24. files = set()
  25. for entry in tree:
  26. if entry.type == "tree":
  27. sub_files = [(f[0], "{}/{}".format(entry.name, f[1]))
  28. for f in get_files_in_tree(repo[entry.id], repo)]
  29. files.update(sub_files)
  30. else:
  31. blob = repo[entry.id]
  32. if not blob.is_binary:
  33. if entry.name.endswith("java"):
  34. files.add((entry.hex, entry.name))
  35. return files
  36. def get_diffing_files(commit, parent, repo):
  37. """
  38. Get the files that diffed between two commits.
  39. """
  40. diff = repo.diff(parent, commit)
  41. patches = [p for p in diff]
  42. files = set()
  43. for patch in patches:
  44. if patch.delta.is_binary:
  45. continue
  46. nfile = patch.delta.new_file
  47. files.add((nfile.id, nfile.path, patch.delta.status))
  48. return files
  49. def save_history_features_graph(repo_path, branch, graph_path):
  50. """
  51. Track the number of developers that have worked in a repository and save the
  52. results in a graph which could be used for later use.
  53. """
  54. repo = Repository(repo_path)
  55. head = repo.references.get(branch)
  56. commits = list(
  57. repo.walk(head.target, GIT_SORT_TOPOLOGICAL | GIT_SORT_REVERSE))
  58. current_commit = repo.head.target
  59. start_time = time.time()
  60. all_files = {}
  61. current_commit = repo.get(str(current_commit))
  62. files = get_files_in_tree(current_commit.tree, repo)
  63. for (_, name) in tqdm(files):
  64. all_files[name] = {}
  65. all_files[name]['lastcommit'] = current_commit.hex
  66. all_files[name][current_commit.hex] = {}
  67. all_files[name][current_commit.hex]["prevcommit"] = ""
  68. all_files[name][current_commit.hex]["authors"] = [
  69. current_commit.committer.name
  70. ]
  71. for i, commit in enumerate(tqdm(commits[1:])):
  72. files = get_diffing_files(commit, commits[i], repo)
  73. for (_, name, _) in files:
  74. if name not in all_files:
  75. all_files[name] = {}
  76. last_commit = ""
  77. if 'lastcommit' not in all_files[name]:
  78. all_files[name]['lastcommit'] = commit.hex
  79. else:
  80. last_commit = all_files[name]['lastcommit']
  81. all_files[name][commit.hex] = {}
  82. all_files[name][commit.hex]["prevcommit"] = last_commit
  83. authors = set([commit.committer.name])
  84. if last_commit:
  85. authors.update(all_files[name][last_commit]["authors"])
  86. all_files[name][commit.hex]["authors"] = authors
  87. all_files[name]['lastcommit'] = commit.hex
  88. with open(graph_path, 'w') as output:
  89. json.dump(all_files, output, default=set_to_list)
  90. end_time = time.time()
  91. print("Done")
  92. print("Overall processing time {}".format(end_time - start_time))
  93. def load_history_features_graph(path):
  94. """
  95. Save the history features to a csv file.
  96. """
  97. file_graph = {}
  98. with open(path, 'r') as inp:
  99. file_graph = json.load(inp)
  100. return file_graph
  101. def get_history_features(graph, repo_path, branch):
  102. """
  103. Function that extracts the history features from a git repository.
  104. They are the total number of authors, the total age and the total
  105. number of unique changes.
  106. """
  107. repo = Repository(repo_path)
  108. head = repo.references.get(branch)
  109. commits = list(
  110. repo.walk(head.target, GIT_SORT_TOPOLOGICAL | GIT_SORT_REVERSE))
  111. features = []
  112. commit_feat = []
  113. commit_feat.append(str(commits[0].hex))
  114. commit_feat.append(str(1.0))
  115. commit_feat.append(str(0.0))
  116. commit_feat.append(str(0.0))
  117. features.append(commit_feat)
  118. for i, commit in enumerate(tqdm(commits[1:])):
  119. files = get_diffing_files(commit, commits[i], repo)
  120. total_number_of_authors = set()
  121. total_age = []
  122. total_unique_changes = set()
  123. for (_, name, _) in files:
  124. sub_graph = graph[name][commit.hex]
  125. total_number_of_authors.update(sub_graph['authors'])
  126. prev_commit = sub_graph['prevcommit']
  127. if prev_commit:
  128. total_unique_changes.add(prev_commit)
  129. prev_commit_obj = repo.get(prev_commit)
  130. total_age.append(commit.commit_time -
  131. prev_commit_obj.commit_time)
  132. total_age = float(sum(total_age)) / len(total_age) if total_age else 0
  133. commit_feat = []
  134. commit_feat.append(str(commit.hex))
  135. commit_feat.append(str(float(len(total_number_of_authors))))
  136. commit_feat.append(str(float(total_age)))
  137. commit_feat.append(str(float(len(total_unique_changes))))
  138. features.append(commit_feat)
  139. return features
  140. def save_history_features(history_features, path):
  141. """
  142. Function to save the history features as a csv file.
  143. """
  144. with open(path, 'w') as csv_file:
  145. writer = csv.writer(csv_file)
  146. writer.writerow(
  147. ["commit", "number_of_authors", "age", "number_unique_changes"])
  148. for row in history_features:
  149. if row:
  150. writer.writerow([row[0], row[1], row[2], row[3]])
  151. if __name__ == "__main__":
  152. PARSER = ArgumentParser(description="Utility to extract code churns from" +
  153. " a repository or a single commit.")
  154. PARSER.add_argument(
  155. "--repository",
  156. "-r",
  157. type=str,
  158. default="./repos/jenkins",
  159. help="Path to local git repository.")
  160. PARSER.add_argument(
  161. "--branch",
  162. "-b",
  163. type=str,
  164. default="refs/heads/master",
  165. help="Which branch to use.")
  166. PARSER.add_argument(
  167. "--save-graph",
  168. "-sg",
  169. action="store_true",
  170. help="Generate a new graph for a repository.")
  171. PARSER.add_argument(
  172. "--graph-path",
  173. "-gp",
  174. type=str,
  175. default="./results/file_graph.json",
  176. help="The path to where the graph is stored.")
  177. PARSER.add_argument(
  178. "--output",
  179. "-o",
  180. type=str,
  181. default="./results/history_features.csv",
  182. help="The path where the output is written.")
  183. ARGS = PARSER.parse_args()
  184. REPO_PATH = ARGS.repository
  185. BRANCH = ARGS.branch
  186. SAVE_GRAPH = ARGS.save_graph
  187. GRAPH_PATH = ARGS.graph_path
  188. OUTPUT = ARGS.output
  189. print(SAVE_GRAPH)
  190. if SAVE_GRAPH:
  191. save_history_features_graph(REPO_PATH, BRANCH, GRAPH_PATH)
  192. GRAPH = load_history_features_graph(GRAPH_PATH)
  193. HISTORY_FEATURES = get_history_features(GRAPH, REPO_PATH, BRANCH)
  194. save_history_features(HISTORY_FEATURES, OUTPUT)