vcclib.py 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147
  1. import subprocess
  2. class Commit:
  3. def __init__(self, commit_hash, author, vcc=None):
  4. self.commit_hash = commit_hash
  5. self.author = author
  6. self.is_vcc = vcc
  7. self.xp = None
  8. def get_experience(self, commits, exp_dir):
  9. if self.xp is None:
  10. with open(exp_dir + self.author.replace("/", "_")) as f:
  11. commit_history = f.readlines()
  12. for xp in range(len(commit_history)):
  13. commit_hash = commit_history[xp].split(',')[0]
  14. if commit_hash in commits.hash_to_commit:
  15. commits.hash_to_commit[commit_hash].xp = xp
  16. assert self.xp is not None, "author: {}\ncommit: {}\nis vcc: {}"\
  17. .format(self.author, self.commit_hash, self.is_vcc)
  18. return self.xp
  19. class Commits:
  20. def __init__(self, git_dir, paths, vccs=None):
  21. """
  22. Returns a list of Commits at the given paths, ordered chronologically
  23. by authored time from old to new (NOT the order they were applied).
  24. paths is a single string appended raw to the git command,
  25. so any necessary escaping, quoting, etc. should be applied prior
  26. """
  27. command = "git -C " + git_dir + " log " \
  28. "--full-history --reverse --no-merges --use-mailmap " \
  29. "--since-as-filter=2012-04-09 " \
  30. "--format='format:%ct %H %aN <%aE>' -- " \
  31. + paths + " | sort -n | cut -f2,3"
  32. lines = subprocess.check_output(command, shell=True,
  33. universal_newlines=True).strip()
  34. assert lines, "git command failed to return any commits: {}"\
  35. .format(command)
  36. self.commits = []
  37. self.hash_to_commit = {}
  38. matched_vccs = set()
  39. for line in lines.splitlines():
  40. if '\\' in line:
  41. # dark incantation to unescape string
  42. line = line.encode('latin1').decode('unicode_escape').encode(
  43. 'latin1').decode('utf-8')
  44. line = line.strip().split(' ') # tab
  45. commit_hash = line[0]
  46. author = line[1]
  47. if vccs:
  48. vcc = commit_hash in vccs
  49. if vcc:
  50. matched_vccs.add(commit_hash)
  51. else:
  52. vcc = None
  53. commit = Commit(line[0], author, vcc)
  54. self.commits.append(commit)
  55. self.hash_to_commit[commit_hash] = commit
  56. # unmatched_vccs = [vcc for vcc in vccs if vcc not in matched_vccs]
  57. # print("VCCs unmatched to any valid commit:", unmatched_vccs,
  58. # file=sys.stderr, flush=True)
  59. class GrowingList(list):
  60. def __init__(self, default):
  61. super().__init__()
  62. self.default = default
  63. def __setitem__(self, index, value):
  64. while index >= len(self):
  65. self.append(self.default())
  66. list.__setitem__(self, index, value)
  67. def __getitem__(self, index):
  68. while index >= len(self):
  69. self.append(self.default())
  70. return list.__getitem__(self, index)
  71. class Counts:
  72. def __init__(self, total=0, vccs=0):
  73. self.total = total
  74. self.vccs = vccs
  75. def count_commits(commits, vccs, exp_dir, counts=None):
  76. if not counts:
  77. counts = GrowingList(Counts)
  78. bugs = set()
  79. for commit in commits.commits:
  80. j = commit.get_experience(commits, exp_dir)
  81. if commit.is_vcc:
  82. for bug in vccs[commit.commit_hash]:
  83. if bug not in bugs:
  84. counts[j].vccs += 1
  85. bugs.add(bug)
  86. counts[j].total += 1
  87. return counts
  88. def count_all_commits(git_dirs, project_paths, exp_dirs, vccs):
  89. assert len(git_dirs) == len(exp_dirs) and \
  90. len(git_dirs) == len(project_paths), \
  91. "each git dir needs one project path and one experience dir"
  92. counts = None
  93. for i in range(len(git_dirs)):
  94. commits = Commits(git_dirs[i], project_paths[i], vccs)
  95. counts = count_commits(commits, vccs, exp_dirs[i], counts)
  96. # convert to a normal list
  97. return [c for c in counts]
  98. def get_vccs(vcc_file):
  99. vccs = {}
  100. with open(vcc_file) as f:
  101. for line in f.readlines():
  102. line = line.strip().split(',')
  103. issues = {issue for issue in line[1:]}
  104. if line[0] not in vccs:
  105. vccs[line[0]] = issues
  106. else:
  107. vccs[line[0]] |= issues
  108. return vccs
  109. def expdirs(exp_dirs):
  110. for exp_dir in exp_dirs:
  111. if exp_dir[-1] != '/':
  112. exp_dir += '/'
  113. return exp_dirs
  114. # takes an iterable of Decimal objects
  115. def sigfigs(vals):
  116. msds = [v.adjusted() for v in vals]
  117. if not all(msd == msds[0] for msd in msds):
  118. msd = -max(msds)
  119. return [round(vals[i], msd) for i in range(len(vals))]
  120. for i in range(-msds[0], 20): # arbitrarily high precision
  121. if any(round(v, i) != round(vals[0], i) for v in vals):
  122. return [round(v, i) for v in vals]
  123. return vals