123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147 |
- import subprocess
class Commit:
    """A commit, its author, and the author's experience when it was made.

    Attributes:
        commit_hash: hash identifying the commit.
        author: author string; with "/" replaced by "_" it is also the name
            of the author's experience file.
        is_vcc: the ``vcc`` value given at construction (None when unknown).
        xp: zero-based line index of this commit inside the author's
            experience file; None until get_experience() has resolved it.
    """

    def __init__(self, commit_hash, author, vcc=None):
        self.commit_hash = commit_hash
        self.author = author
        self.is_vcc = vcc
        self.xp = None

    def get_experience(self, commits, exp_dir):
        """Return this commit's experience value, loading it on first use.

        The author's experience file (one commit per line, hash before the
        first comma) is read once; every commit in it that is also known to
        ``commits`` gets its ``xp`` set to the line index, so later calls for
        the same author need no further I/O.

        Args:
            commits: object exposing ``hash_to_commit`` (hash -> Commit).
            exp_dir: directory holding one experience file per author; a
                trailing slash is optional.

        Raises:
            AssertionError: if this commit is not listed in its author's
                experience file.
        """
        if self.xp is None:
            import os  # local import keeps this block self-contained

            # os.path.join tolerates exp_dir with or without a trailing "/".
            history_path = os.path.join(exp_dir,
                                        self.author.replace("/", "_"))
            with open(history_path) as f:
                for xp, entry in enumerate(f):
                    commit_hash = entry.split(',', 1)[0]
                    if commit_hash in commits.hash_to_commit:
                        commits.hash_to_commit[commit_hash].xp = xp
        if self.xp is None:
            # Raised explicitly so the check is not stripped under -O.
            raise AssertionError(
                "author: {}\ncommit: {}\nis vcc: {}".format(
                    self.author, self.commit_hash, self.is_vcc))
        return self.xp
class Commits:
    """The commits of one repository touching given paths, indexed by hash.

    Attributes:
        commits: list of Commit objects, sorted old to new by the timestamp
            printed by git (see __init__).
        hash_to_commit: dict mapping each commit hash to its Commit.
    """

    def __init__(self, git_dir, paths, vccs=None):
        """
        Collects the Commits at the given paths, ordered chronologically
        from old to new by timestamp (NOT the order they were applied).
        paths is a single string appended raw to the git command,
        so any necessary escaping, quoting, etc. should be applied prior.
        git_dir is likewise inserted into the shell command unquoted.

        vccs, when given and non-empty, is a container of commit hashes;
        each Commit's is_vcc is then True/False by membership, else None.

        NOTE(review): the sort key is %ct, which git documents as the
        committer date, although this was described as "authored time";
        confirm which is intended.

        Raises AssertionError if git reports no commits, and
        subprocess.CalledProcessError if the shell pipeline fails.
        """
        # Fields are separated with %x09 (a tab emitted by git itself), so
        # that `cut -f2,3` (tab-delimited by default) drops the timestamp
        # and the hash/author split below cannot be confused by the spaces
        # inside "Name <email>".
        command = "git -C " + git_dir + " log " \
            "--full-history --reverse --no-merges --use-mailmap " \
            "--since-as-filter=2012-04-09 " \
            "--format='format:%ct%x09%H%x09%aN <%aE>' -- " \
            + paths + " | sort -n | cut -f2,3"
        lines = subprocess.check_output(command, shell=True,
                                        universal_newlines=True).strip()
        if not lines:
            # Raised explicitly so the check is not stripped under -O.
            raise AssertionError(
                "git command failed to return any commits: {}"
                .format(command))

        self.commits = []
        self.hash_to_commit = {}
        for line in lines.splitlines():
            if '\\' in line:
                # Interpret backslash escapes in the line, then re-read the
                # resulting bytes as UTF-8 (presumably to undo escaping of
                # non-ASCII author names -- confirm).
                line = line.encode('latin1').decode('unicode_escape').encode(
                    'latin1').decode('utf-8')
            fields = line.strip().split('\t')
            commit_hash = fields[0]
            author = fields[1]
            # None means "no VCC information was supplied at all".
            vcc = (commit_hash in vccs) if vccs else None
            commit = Commit(commit_hash, author, vcc)
            self.commits.append(commit)
            self.hash_to_commit[commit_hash] = commit
class GrowingList(list):
    """A list that grows on demand instead of raising IndexError.

    Reading or writing an integer index at or past the end first appends
    freshly built default values until that index exists. Negative indexes
    and slices keep ordinary list behaviour.
    """

    def __init__(self, default):
        """default: zero-argument callable producing each filler element."""
        super().__init__()
        self.default = default

    def _grow_to(self, index):
        """Append defaults until ``index`` is a valid position."""
        if isinstance(index, slice):
            # Slices cannot be compared with an int; leave them to list.
            return
        while index >= len(self):
            self.append(self.default())

    def __setitem__(self, index, value):
        self._grow_to(index)
        list.__setitem__(self, index, value)

    def __getitem__(self, index):
        self._grow_to(index)
        return list.__getitem__(self, index)
class Counts:
    """Pair of counters, ``total`` and ``vccs``, both starting at zero.

    In this file count_commits() bumps ``total`` once per commit and
    ``vccs`` once per newly seen bug of a VCC commit.
    """

    def __init__(self, total=0, vccs=0):
        self.total, self.vccs = total, vccs
def count_commits(commits, vccs, exp_dir, counts=None):
    """Tally commits and distinct bugs per author-experience level.

    For every commit in ``commits.commits`` the entry of ``counts`` at the
    commit's experience value gets ``total`` incremented; for VCC commits,
    ``vccs`` is also incremented once for each bug in
    ``vccs[commit.commit_hash]`` not yet seen during this call.

    A missing or empty ``counts`` is replaced by a fresh
    GrowingList(Counts); otherwise the given object is updated in place.
    Returns the counts object.
    """
    counts = counts or GrowingList(Counts)
    seen_bugs = set()
    for commit in commits.commits:
        level = commit.get_experience(commits, exp_dir)
        if commit.is_vcc:
            for bug in vccs[commit.commit_hash]:
                if bug in seen_bugs:
                    # each bug is credited only to the first VCC seen
                    continue
                seen_bugs.add(bug)
                counts[level].vccs += 1
        counts[level].total += 1
    return counts
def count_all_commits(git_dirs, project_paths, exp_dirs, vccs):
    """Accumulate count_commits() over several repositories.

    git_dirs, project_paths and exp_dirs are parallel sequences: entry i of
    each describes one repository. The same counts object is threaded
    through every repository so the tallies add up.

    Returns a plain list of the per-experience count entries (empty when no
    repositories are given).

    Raises AssertionError if the three sequences differ in length.
    """
    if not (len(git_dirs) == len(project_paths) == len(exp_dirs)):
        # Raised explicitly so the check is not stripped under -O.
        raise AssertionError(
            "each git dir needs one project path and one experience dir")
    counts = None
    for git_dir, project_path, exp_dir in zip(git_dirs, project_paths,
                                              exp_dirs):
        commits = Commits(git_dir, project_path, vccs)
        counts = count_commits(commits, vccs, exp_dir, counts)
    # convert to a normal list; nothing was counted if there were no repos
    return [] if counts is None else list(counts)
def get_vccs(vcc_file):
    """Parse a comma-separated VCC file into {commit hash: set of issues}.

    Each non-blank line holds a commit hash followed by zero or more issue
    identifiers. A hash appearing on several lines gets the union of their
    issues. Blank lines are ignored.
    """
    vccs = {}
    with open(vcc_file) as f:
        for line in f:
            line = line.strip()
            if not line:
                # a blank line would otherwise register an empty-hash entry
                continue
            commit_hash, *issues = line.split(',')
            vccs.setdefault(commit_hash, set()).update(issues)
    return vccs
def expdirs(exp_dirs):
    """Return the directory names as a list, each ending in '/'.

    Names that already end in '/' are kept unchanged. An empty string is
    also kept unchanged, since appending '/' would turn it into the
    filesystem root.
    """
    # Build a new list: reassigning the loop variable would not change the
    # caller's sequence.
    return [d if (not d or d.endswith('/')) else d + '/' for d in exp_dirs]
# takes an iterable of Decimal objects
def sigfigs(vals):
    """Round Decimals to the coarsest precision that still tells them apart.

    If the values differ in magnitude (their most significant digits sit at
    different positions), each is rounded to the position of the largest
    value's most significant digit. Otherwise digits are added one at a
    time, starting at the shared most significant digit, until at least two
    rounded values differ; the values rounded to that many digits are
    returned. If no difference shows up within 20 decimal places, or the
    input is empty, the values are returned unrounded.

    Lists and tuples are used as given; any other iterable is copied into a
    list first so it can be traversed more than once.
    """
    if not isinstance(vals, (list, tuple)):
        vals = list(vals)
    if not vals:
        return vals
    # adjusted() is the exponent of the most significant digit.
    msds = [v.adjusted() for v in vals]
    if any(msd != msds[0] for msd in msds):
        digits = -max(msds)
        return [round(v, digits) for v in vals]
    first = vals[0]
    for digits in range(-msds[0], 20):  # arbitrarily high precision
        if any(round(v, digits) != round(first, digits) for v in vals):
            return [round(v, digits) for v in vals]
    return vals
|