assemble_labels.py 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149
  1. """
  2. Script to generate a labels file from a file produced by the SZZ algorithm.
  3. """
  4. __author__ = "Oscar Svensson"
  5. __copyright__ = "Copyright (c) 2018 Axis Communications AB"
  6. __license__ = "MIT"
  7. import csv
  8. import json
  9. from argparse import ArgumentParser
  10. from datetime import datetime as dat
  11. from pygit2 import Repository, GIT_SORT_TOPOLOGICAL, GIT_SORT_REVERSE
  12. from tqdm import tqdm
  13. import matplotlib.pyplot as plt
  14. def get_labels(repo_path, branch, pair_file, last_commit):
  15. """
  16. Get the labels from a file produced by the SZZ algorithm. It contains
  17. bug fixing commits and their respective bug fixing commit.
  18. """
  19. repo = Repository(repo_path)
  20. head = repo.references.get(branch)
  21. commits = []
  22. for commit in list(
  23. repo.walk(head.target, GIT_SORT_TOPOLOGICAL | GIT_SORT_REVERSE)):
  24. commits.append(commit)
  25. if commit.hex == last_commit:
  26. break
  27. commits = list(reversed(commits))
  28. pairs = {}
  29. with open(pair_file, 'r') as inp:
  30. pairs = json.load(inp)
  31. unique_pairs = set([p[1] for p in pairs])
  32. labels = []
  33. for commit in tqdm(commits):
  34. label = [commit.hex, "1" if commit.hex in unique_pairs else "0"]
  35. labels.append(label)
  36. return labels
  37. def save_labels(labels, res_path):
  38. """
  39. Save the labels as a csv file.
  40. """
  41. with open(res_path, 'w') as out:
  42. writer = csv.writer(out)
  43. writer.writerow(["commit", "label"])
  44. for label in labels:
  45. writer.writerow(label)
  46. def save_label_distribution(repo_path, branch, labels, res_path):
  47. """
  48. Save a distribution of the labels over time.
  49. """
  50. ldict = set()
  51. for label in labels:
  52. if label[1] == "1":
  53. ldict.add(label[0])
  54. repo = Repository(repo_path)
  55. head = repo.references.get(branch)
  56. commits = list(repo.walk(head.target, GIT_SORT_TOPOLOGICAL))
  57. start_year = dat.fromtimestamp(commits[-1].commit_time).year
  58. end_year = dat.fromtimestamp(commits[0].commit_time).year
  59. num_years = end_year - start_year
  60. year_dist = [0 for y in range(num_years + 1)]
  61. years = [y for y in range(start_year, end_year + 1)]
  62. for commit in commits:
  63. if commit.hex in ldict:
  64. commit_year = dat.fromtimestamp(commit.commit_time).year
  65. year_dist[commit_year - start_year - 1] += 1
  66. fig = plt.figure()
  67. plt.bar(years, year_dist)
  68. plt.xticks(years)
  69. plt.xlim(xmin=years[0] - 1, xmax=years[-1] + 1)
  70. fig.autofmt_xdate()
  71. plt.savefig(res_path)
  72. if __name__ == "__main__":
  73. PARSER = ArgumentParser(
  74. description="Utility to extract unique bug " +
  75. "introducing commits from a set a bug fix and bug introducing pairs.")
  76. PARSER.add_argument(
  77. "--repository",
  78. "-r",
  79. type=str,
  80. default="../../jenkins_master/jenkins_master",
  81. help=
  82. "Path to a local git repository from which the pairs where extracted.")
  83. PARSER.add_argument(
  84. "--branch",
  85. "-b",
  86. type=str,
  87. default="refs/heads/master",
  88. help="Which branch to use.")
  89. PARSER.add_argument(
  90. "--file",
  91. "-f",
  92. type=str,
  93. default="../szz/results/fix_and_introducers_pairs.json",
  94. help="The file with the pairs.")
  95. PARSER.add_argument(
  96. "--resfile",
  97. "-rf",
  98. type=str,
  99. default="./labels.csv",
  100. help="The file to which the labels are written.")
  101. PARSER.add_argument(
  102. "--figfile",
  103. "-ff",
  104. type=str,
  105. default="./distribution.png",
  106. help="The file to which the bug introducing ditribution is written.")
  107. PARSER.add_argument(
  108. "--commit",
  109. "-c",
  110. type=str,
  111. default="02d6908ada70fcf8012833ddef628bc09c6f8389",
  112. help="The last commit that should be analyzed.")
  113. ARGS = PARSER.parse_args()
  114. REPOPATH = ARGS.repository
  115. BRANCH = ARGS.branch
  116. PAIRFILE = ARGS.file
  117. RESFILE = ARGS.resfile
  118. FIGFILE = ARGS.figfile
  119. LAST_COMMIT = ARGS.commit
  120. LABELS = get_labels(REPOPATH, BRANCH, PAIRFILE, LAST_COMMIT)
  121. save_labels(LABELS, RESFILE)
  122. save_label_distribution(REPOPATH, BRANCH, LABELS, FIGFILE)