assemble_diffusion_features.py 7.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254
  1. """
  2. Script for extracting diffusion features from a git repository.
  3. """
  4. __author__ = "Oscar Svensson"
  5. __copyright__ = "Copyright (c) 2018 Axis Communications AB"
  6. __license__ = "MIT"
  7. import csv
  8. import os
  9. import sys
  10. import time
  11. from argparse import ArgumentParser
  12. from multiprocessing import Process, Manager, cpu_count
  13. from numpy import log2
  14. from pygit2 import Repository, GIT_SORT_TOPOLOGICAL, GIT_SORT_REVERSE
  15. from tqdm import tqdm
# Manager that owns the dictionary shared between the parent process and the
# worker processes.
MANAGER = Manager()
# Shared result store: parse_diffusion_features writes its feature rows under
# its worker id (RES[pid]) and get_diffusion_features reads every entry back.
# NOTE(review): the manager is created at import time, so a worker that
# re-imports this module would create a manager of its own — confirm the
# script is only run with a fork-based process start method.
RES = MANAGER.dict()
  18. def count_diffing_subsystems(subsystems):
  19. """
  20. Function for counting the number of subsystems in a repository.
  21. """
  22. number = 0
  23. for system in subsystems.values():
  24. number = number + count_diffing_subsystems(system)
  25. return number + len(subsystems.keys())
  26. def count_entropy(file_changes, total_change):
  27. """
  28. Function to count entropy for some file changes.
  29. """
  30. if total_change == 0:
  31. return 0
  32. return sum([
  33. -1 * (float(x) / total_change) * (log2(float(x) / total_change)
  34. if x > 0 else 0)
  35. for x in file_changes
  36. ])
  37. def parse_diffusion_features(pid, repo_path, branch, start, stop=-1):
  38. """
  39. Function to extract diffusion features from a set of commits.
  40. """
  41. repo = Repository(repo_path)
  42. head = repo.references.get(branch)
  43. commits = list(
  44. repo.walk(head.target, GIT_SORT_TOPOLOGICAL | GIT_SORT_REVERSE))
  45. start = start - 1 if (start > 0) else start
  46. commits = commits[start:stop] if (stop != -1) else commits[start:]
  47. features = [[] for c in range(len(commits))]
  48. for i, commit in enumerate(tqdm(commits[1:], position=pid)):
  49. diff = repo.diff(commits[i], commit)
  50. patches = [p for p in diff]
  51. # Extract all different subsystems that have been modified
  52. modules = set([])
  53. subsystems_mapping = {}
  54. entropy_change = 0
  55. file_changes = []
  56. total_change = 0
  57. for patch in patches:
  58. # Skip binary files
  59. if patch.delta.is_binary:
  60. continue
  61. _, addition, deletions = patch.line_stats
  62. total_change = total_change + (addition + deletions)
  63. file_changes.append(addition + deletions)
  64. # Store all subsystems
  65. fpath = patch.delta.new_file.path
  66. subsystems = fpath.split('/')[:-1]
  67. root = subsystems_mapping
  68. for system in subsystems:
  69. if system not in root:
  70. root[system] = {}
  71. root = root[system]
  72. if subsystems > 0:
  73. modules.add(subsystems[0])
  74. # Check how many subsystems that have been touched
  75. modified_systems = count_diffing_subsystems(subsystems_mapping)
  76. # Calculate the entropy for the commit
  77. entropy_change = count_entropy(file_changes, total_change)
  78. # Add all features
  79. features[i].append(str(commit.hex))
  80. features[i].append(str(float(modified_systems)))
  81. features[i].append(str(float(len(modules))))
  82. features[i].append(str(float(entropy_change)))
  83. RES[pid] = features
  84. def parse_tree(tree, repo):
  85. """
  86. Parse a git tree and get the number of files, the number of systems and
  87. the number of subdirectories.
  88. """
  89. found_sub_entries = 0
  90. additions = 0
  91. file_additions = []
  92. tree = repo[tree.id]
  93. for entry in tree:
  94. if entry.type == "bin":
  95. continue
  96. if entry.type == "tree":
  97. sub_additions, sub_file_additions, sub_entries = parse_tree(
  98. entry, repo)
  99. found_sub_entries += (1 + sub_entries)
  100. additions += sub_additions
  101. file_additions.extend(sub_file_additions)
  102. else:
  103. try:
  104. sub_addition = len(str(repo[entry.id]).split('\n'))
  105. additions += sub_addition
  106. file_additions.append(sub_addition)
  107. except Exception as ex:
  108. print(ex)
  109. continue
  110. return additions, file_additions, found_sub_entries
  111. def get_diffusion_features(repo_path, branch):
  112. """
  113. Function that extracts the first commits diffusion features. It then starts
  114. a number of processes(equal to the number of cores on the computer), and then
  115. distributes the remaining commits to them.
  116. """
  117. repo = Repository(repo_path)
  118. head = repo.references.get(branch)
  119. commits = list(
  120. repo.walk(head.target, GIT_SORT_TOPOLOGICAL | GIT_SORT_REVERSE))
  121. initial = commits[0]
  122. init_tree = initial.tree
  123. # Count inital total lines of code
  124. init_total_additions = 0
  125. init_file_addtions = []
  126. init_subdirectories = 0
  127. init_modules = 0
  128. for entry in init_tree:
  129. if entry.type == "tree":
  130. added, file_additions, subdirectories = parse_tree(entry, repo)
  131. init_modules += 1
  132. init_file_addtions.extend(file_additions)
  133. init_total_additions += added
  134. init_subdirectories += subdirectories
  135. else:
  136. try:
  137. additions = len(str(repo[entry.id]).split('\n'))
  138. init_total_additions += additions
  139. init_file_addtions.append(additions)
  140. except:
  141. continue
  142. diffusion_features = []
  143. diffusion_features.append(initial.hex)
  144. diffusion_features.append(init_subdirectories)
  145. diffusion_features.append(init_modules)
  146. diffusion_features.append(
  147. count_entropy(init_file_addtions, init_total_additions))
  148. # Check how many processes that could be spawned
  149. cpus = cpu_count()
  150. print("Using {} cpus...".format(cpus))
  151. # Divide the commits eqaully between the processes.
  152. quote, remainder = divmod(len(commits), cpus)
  153. processes = [
  154. Process(
  155. target=parse_diffusion_features,
  156. args=(i, repo_path, branch, i * quote + min(i, remainder),
  157. (i + 1) * quote + min(i + 1, remainder))) for i in range(cpus)
  158. ]
  159. for process in processes:
  160. process.start()
  161. start_time = time.time()
  162. for process in processes:
  163. process.join()
  164. end_time = time.time()
  165. print("Done")
  166. print("Overall processing time {}".format(end_time - start_time))
  167. # Assemble the results
  168. features = []
  169. for _, feat in RES.items():
  170. features.extend(feat)
  171. features = list(reversed(features))
  172. features.append(diffusion_features)
  173. return features
  174. def save_diffusion_features(diffusion_features,
  175. path="./results/diffusion_features.csv"):
  176. """
  177. Save the diffusion features to a csv file.
  178. """
  179. with open(path, 'w') as csv_file:
  180. writer = csv.writer(csv_file)
  181. writer.writerow([
  182. "commit", "modified_subsystems", "modified_subdirectories",
  183. "entropy"
  184. ])
  185. for row in diffusion_features:
  186. if row:
  187. writer.writerow([row[0], row[1], row[2], row[3]])
  188. if __name__ == "__main__":
  189. PARSER = ArgumentParser(
  190. description="Utility to extract diffusion features from" +
  191. " a repository or a single commit.")
  192. PARSER.add_argument(
  193. "--repository",
  194. "-r",
  195. type=str,
  196. default="./repos/jenkins",
  197. help="Path to local git repository.")
  198. PARSER.add_argument(
  199. "--branch",
  200. "-b",
  201. type=str,
  202. default="refs/heads/master",
  203. help="Which branch to use.")
  204. ARGS = PARSER.parse_args()
  205. REPOPATH = ARGS.repository
  206. BRANCH = ARGS.branch
  207. if not os.path.exists(REPOPATH):
  208. print("The repository path does not exist!")
  209. sys.exit(1)
  210. DIFFUSION_FEATURES = get_diffusion_features(REPOPATH, BRANCH)
  211. save_diffusion_features(DIFFUSION_FEATURES)