# assemble_code_churns.py
  1. """
  2. Script to extract code churns.
  3. """
  4. __author__ = "Oscar Svensson"
  5. __copyright__ = "Copyright (c) 2018 Axis Communications AB"
  6. __license__ = "MIT"
  7. import csv
  8. import os
  9. import sys
  10. import time
  11. from argparse import ArgumentParser
  12. from multiprocessing import Process, Manager, cpu_count
  13. from pygit2 import Repository, GIT_SORT_REVERSE, GIT_SORT_TOPOLOGICAL
  14. from tqdm import tqdm
  15. # Global variables
  16. MANAGER = Manager()
  17. RES = MANAGER.dict()
  18. def parse_code_churns(pid, repo_path, branch, start, stop=-1):
  19. """
  20. Function that is intended to be runned by a process. It extracts the code churns
  21. for a set of commits and stores them in the RES dict.
  22. """
  23. repo = Repository(repo_path)
  24. head = repo.references.get(branch)
  25. commits = list(
  26. repo.walk(head.target, GIT_SORT_TOPOLOGICAL | GIT_SORT_REVERSE))
  27. start = start - 1 if (start > 0) else start
  28. commits = commits[start:stop] if (stop != -1) else commits[start:]
  29. code_churns = [[] for c in range(len(commits))]
  30. for i, commit in enumerate(tqdm(commits[1:], position=pid)):
  31. diff = repo.diff(commits[i], commit)
  32. tree = commit.tree
  33. patches = [p for p in diff]
  34. stats = diff.stats
  35. # Count the total lines of code and find the biggest file that have been changed
  36. total_tloc = 0
  37. line_of_code_old = 0
  38. for patch in patches:
  39. if patch.delta.is_binary:
  40. continue
  41. new_file = patch.delta.new_file
  42. # Total lines of code
  43. total_tloc += get_file_lines_of_code(repo, tree, new_file)
  44. old_file = patch.delta.old_file
  45. # Total lines of code in the old file
  46. line_of_code_old = max(
  47. line_of_code_old, get_file_lines_of_code(repo, tree, old_file))
  48. # Churned lines of code
  49. cloc = stats.insertions
  50. # Deleted lines of code
  51. dloc = stats.deletions
  52. # Churned files
  53. files_churned = len(patches)
  54. # File count
  55. num_files = count_files(tree, repo)
  56. # Apply relative code churns
  57. measure_one = float(cloc) / total_tloc if (total_tloc > 0) else float(cloc)
  58. measure_two = float(dloc) / total_tloc if (total_tloc > 0) else float(cloc)
  59. measure_three = (float(files_churned) / num_files if (num_files > 0)
  60. else float(files_churned))
  61. line_of_code_old = float(line_of_code_old)
  62. # Churn features
  63. code_churns[i].append(str(commit.hex))
  64. code_churns[i].append(str(measure_one))
  65. code_churns[i].append(str(measure_two))
  66. code_churns[i].append(str(measure_three))
  67. code_churns[i].append(str(line_of_code_old))
  68. RES[pid] = code_churns
  69. def count_files(tree, repo):
  70. """
  71. Count how many files there are in a repository.
  72. """
  73. num_files = 0
  74. trees = []
  75. visited = set()
  76. visited.add(tree.id)
  77. trees.append(tree)
  78. while trees:
  79. current_tree = trees.pop()
  80. for entry in current_tree:
  81. if entry.type == "tree":
  82. if entry.id not in visited:
  83. trees.append(repo[entry.id])
  84. visited.add(entry.id)
  85. else:
  86. num_files += 1
  87. return num_files
  88. def get_file_lines_of_code(repo, tree, dfile):
  89. """
  90. Count how many lines of code there are in a file.
  91. """
  92. tloc = 0
  93. try:
  94. blob = repo[tree[dfile.path].id]
  95. tloc = len(str(blob.data).split('\\n'))
  96. except Exception as _:
  97. return tloc
  98. return tloc
  99. def get_code_churns(repo_path, branch):
  100. """
  101. General function for extracting code churns. It first extracts the code churns for
  102. the first commit and then starts a number of processes(equal to the number of cores
  103. on the computer), which equally extracts the code churns for the remaining commits.
  104. """
  105. repo = Repository(repo_path)
  106. head = repo.references.get(branch)
  107. commits = list(
  108. repo.walk(head.target, GIT_SORT_TOPOLOGICAL | GIT_SORT_REVERSE))
  109. code_churns = [[]]
  110. initial = commits[0]
  111. # Relative code churns
  112. measure_one = 0.0
  113. measure_two = 0.0
  114. measure_three = 1.0
  115. line_of_code_old = 0.0
  116. code_churns[0].append(str(initial.hex))
  117. code_churns[0].append(str(measure_one))
  118. code_churns[0].append(str(measure_two))
  119. code_churns[0].append(str(measure_three))
  120. code_churns[0].append(str(line_of_code_old))
  121. # Check how many processes that could be spawned
  122. cpus = cpu_count()
  123. print("Using {} cpus...".format(cpus))
  124. # Equally split the commit set into the equally sized parts.
  125. quote, remainder = divmod(len(commits), cpus)
  126. processes = [
  127. Process(
  128. target=parse_code_churns,
  129. args=(i, repo_path, branch, i * quote + min(i, remainder),
  130. (i + 1) * quote + min(i + 1, remainder))) for i in range(cpus)
  131. ]
  132. for process in processes:
  133. process.start()
  134. start_time = time.time()
  135. for process in processes:
  136. process.join()
  137. end_time = time.time()
  138. print("Done")
  139. print("Overall processing time {}".format(end_time - start_time))
  140. # Assemble the results
  141. churns = []
  142. for _, churn in RES.items():
  143. churns.extend(churn)
  144. churns = list(reversed(churns))
  145. churns.append(code_churns[0])
  146. return churns
  147. def save_churns(churns, path="./results/code_churns_features_multithread.csv"):
  148. """
  149. Saves the code churns to a csv file.
  150. """
  151. with open(path, 'w') as csv_file:
  152. writer = csv.writer(csv_file)
  153. writer.writerow([
  154. "commit", "lines_of_code_added", "lines_of_code_deleted",
  155. "files_churned", "line_of_code_old"
  156. ])
  157. for row in churns:
  158. if row:
  159. writer.writerow([row[0], row[1], row[2], row[3], row[4]])
  160. if __name__ == "__main__":
  161. PARSER = ArgumentParser(description="Utility to extract code churns from" +
  162. " a repository or a single commit.")
  163. PARSER.add_argument(
  164. "--repository",
  165. "-r",
  166. type=str,
  167. default="./repos/jenkins",
  168. help="Path to local git repository.")
  169. PARSER.add_argument(
  170. "--branch",
  171. "-b",
  172. type=str,
  173. default="refs/heads/master",
  174. help="Which branch to use.")
  175. ARGS = PARSER.parse_args()
  176. REPOPATH = ARGS.repository
  177. BRANCH = ARGS.branch
  178. if not os.path.exists(REPOPATH):
  179. print("The repository path does not exist!")
  180. sys.exit(1)
  181. CHURNS = get_code_churns(REPOPATH, BRANCH)
  182. save_churns(CHURNS)