general_data.py 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257
  1. """
  2. Script that extracts general data about a git repository.
  3. """
  4. __author__ = "Oscar Svensson"
  5. __copyright__ = "Copyright (c) 2018 Axis Communications AB"
  6. __license__ = "MIT"
  7. import csv
  8. import json
  9. import re
  10. from argparse import ArgumentParser
  11. from datetime import datetime
  12. from numpy import median, mean
  13. from pygit2 import Repository
  14. def has_added(message):
  15. """
  16. Function to check if a message contains any word that indicates an addition of lines of code.
  17. """
  18. if (re.search(
  19. r"add(?:ed)*|implement(?:ed)*|introduce(?:d)*|improve(?:ment|ments)*",
  20. message.lower())):
  21. return True
  22. return False
  23. def has_updated(message):
  24. """
  25. Function to check if a message contains any word that indicates an update of lines of code.
  26. """
  27. if (re.search(
  28. r"update[d]*|mov(?:ing|e|ed)|refactor|modifying|switching|deprecate(?:d)*|"+
  29. "clean(?:up|ed)*",
  30. message.lower())):
  31. return True
  32. return False
  33. def has_bugfix(message):
  34. """
  35. Function to check if a message contains any word that indicates a bug fix.
  36. """
  37. if (re.search(r"jenkins[-]?\d|hudson[-]?\d|fix(?:es|ed)*|solve(?:d)*",
  38. message.lower())):
  39. return True
  40. return False
  41. def has_issue(message):
  42. """
  43. Function to check if a message contains any word that indicates a issue.
  44. """
  45. if re.search(r"issue number", message.lower()):
  46. return True
  47. return False
  48. def save_commit_messages(commits, repo):
  49. """
  50. Function to run some statistics on a number of commits in a git repository.
  51. """
  52. mapping = {}
  53. added = set()
  54. updated = set()
  55. bugfix = set()
  56. issue_set = set()
  57. for commit in commits:
  58. message = commit.message
  59. mapping[commit.hex] = commit.message
  60. if has_added(message):
  61. added.add(commit.hex)
  62. elif has_updated(message):
  63. updated.add(commit.hex)
  64. elif has_bugfix(message):
  65. bugfix.add(commit.hex)
  66. elif has_issue(message):
  67. issue_set.add(commit.hex)
  68. """
  69. Dumps all found commits to a file.
  70. """
  71. with open("./results/commit_messages.json", 'w') as output:
  72. json.dump(mapping, output)
  73. overall = set()
  74. overall.update(added)
  75. overall.update(updated)
  76. overall.update(bugfix)
  77. overall.update(issue_set)
  78. all_messages = set([commit.hex for commit in commits])
  79. not_defined = {c: repo.get(c).message for c in all_messages - overall}
  80. print("Number of commits that added something: {} ({}%)".format(
  81. len(added),
  82. float(len(added)) / len(all_messages)))
  83. print("Number of commits that updated something: {} ({}%)".format(
  84. len(updated),
  85. float(len(updated)) / len(all_messages)))
  86. print("Number of commits that fixed a bug: {} ({}%)".format(
  87. len(bugfix),
  88. float(len(bugfix)) / len(all_messages)))
  89. print("Number of commits that contained an issue number: {} ({}%)".format(
  90. len(issue_set),
  91. float(len(issue_set)) / len(all_messages)))
  92. """
  93. Dumps all undefined commits to a file as well.
  94. """
  95. with open("./results/undefined_commit_messages.json", 'w') as output:
  96. json.dump(not_defined, output)
  97. print("Number of undefined commits: {} ({}%)".format(
  98. len(not_defined),
  99. float(len(not_defined)) / len(all_messages)))
  100. def get_average_time_issues(issue_path):
  101. """
  102. Function to get the average times for issues.
  103. """
  104. issues_dict = {}
  105. with open(issue_path, 'r') as inp:
  106. issues_dict = json.load(inp)
  107. days = []
  108. lowest = (float('Inf'), 0, 0)
  109. highest = (0, None, None)
  110. for _, dates in issues_dict.items():
  111. creationdate = dates['creationdate']
  112. resolutiondate = dates['resolutiondate']
  113. creationdate = datetime.strptime(
  114. creationdate, "%Y-%m-%d %H:%M:%S %z").replace(tzinfo=None)
  115. resolutiondate = datetime.strptime(
  116. resolutiondate, "%Y-%m-%d %H:%M:%S %z").replace(tzinfo=None)
  117. days.append(((resolutiondate - creationdate).days))
  118. if days[-1] > highest[0]:
  119. highest = (days[-1], creationdate, resolutiondate)
  120. if days[-1] < lowest[0]:
  121. lowest = (days[-1], creationdate, resolutiondate)
  122. print("Lowest: {}".format(lowest))
  123. print("Highest: {}".format(highest))
  124. print("Mean time between resolution date and commit date: {} days".format(
  125. mean(days)))
  126. def get_general_data(repo_path, issue_path, labels, pairs):
  127. """
  128. Function to get general statistics for a git repository.
  129. """
  130. repo = Repository(repo_path)
  131. issue_list = {}
  132. labeled_commits = {}
  133. with open(labels, 'r') as inp:
  134. reader = csv.reader(inp)
  135. next(reader)
  136. for commit in reader:
  137. labeled_commits[commit[0]] = float(commit[1])
  138. print("Number of commits: {}".format(len(labeled_commits)))
  139. print("Number of found bugintroducing commits: {}".format(
  140. len([
  141. labeled_commits[f] for f in labeled_commits
  142. if labeled_commits[f] > 0
  143. ])))
  144. pair_map = []
  145. with open(pairs, 'r') as inp:
  146. pair_map = json.load(inp)
  147. total_fixes = set([p[0] for p in pair_map])
  148. print("Total number of fixes used: {}".format(len(total_fixes)))
  149. bug_labeled_commits = set(
  150. [l for l in labeled_commits if labeled_commits[l] > 0])
  151. fixes_in_bugs = set(bug_labeled_commits).intersection(total_fixes)
  152. print("Total number of fixes in bugs found : {}".format(
  153. len(fixes_in_bugs)))
  154. time_diff = []
  155. for pair in pair_map:
  156. fix = repo.get(pair[0])
  157. bug = repo.get(pair[1])
  158. fix_date = datetime.fromtimestamp(fix.commit_time).replace(tzinfo=None)
  159. bug_date = datetime.fromtimestamp(bug.commit_time).replace(tzinfo=None)
  160. diff = (fix_date - bug_date).days
  161. time_diff.append(diff)
  162. years, days = divmod(float(mean(time_diff)), 365.25)
  163. myears, mdays = divmod(float(median(time_diff)), 365.25)
  164. print(
  165. "Average time between bug introduction and fix: {} years and {} days".
  166. format(years, days))
  167. print("Median time between bug introduction and fix: {} years and {} days".
  168. format(myears, mdays))
  169. with open(issue_path, 'r') as inp:
  170. issue_list = json.load(inp)
  171. print("Total number of fixes found: {}".format(len(issue_list)))
  172. save_commit_messages([repo.get(c) for c in bug_labeled_commits], repo)
  173. get_average_time_issues(issue_path)
  174. if __name__ == "__main__":
  175. PARSER = ArgumentParser(
  176. description="Utility to extract purpose features from" +
  177. " a repository or a single commit.")
  178. PARSER.add_argument(
  179. "--repository",
  180. "-r",
  181. type=str,
  182. default="./repos/jenkins",
  183. help="Path to local git repository.")
  184. PARSER.add_argument(
  185. "--issues",
  186. "-i",
  187. type=str,
  188. default="../szz/issue_list_saved.json",
  189. help="Issues to analyze.")
  190. PARSER.add_argument(
  191. "--labels",
  192. "-l",
  193. type=str,
  194. default="./labels.csv",
  195. help="Found labels.")
  196. PARSER.add_argument(
  197. "--fixinpairs",
  198. "-fp",
  199. type=str,
  200. default="./fix_and_introducers_pairs.json",
  201. help="File with fix and introducing pair commits.")
  202. ARGS = PARSER.parse_args()
  203. REPO_PATH = ARGS.repository
  204. ISSUES = ARGS.issues
  205. LABELS = ARGS.labels
  206. PAIRS = ARGS.fixinpairs
  207. get_general_data(REPO_PATH, ISSUES, LABELS, PAIRS)