import subprocess


class Commit:
    """One commit plus the experience of its author at that commit.

    Attributes:
        commit_hash: the commit's hash string.
        author: author string; Commits builds it as "Name <email>".
        is_vcc: True/False when VCC data was supplied to Commits, else None.
        xp: zero-based line index of this commit in its author's history
            file; None until get_experience() has filled it in.
    """

    def __init__(self, commit_hash, author, vcc=None):
        self.commit_hash = commit_hash
        self.author = author
        self.is_vcc = vcc
        self.xp = None

    def get_experience(self, commits, exp_dir):
        """Return this commit's index in its author's history file.

        On first use the file ``exp_dir + author`` (with "/" in the author
        replaced by "_") is read. For every line whose first comma-separated
        field is a hash present in ``commits.hash_to_commit``, that commit's
        ``xp`` is set to the line's zero-based index, so one read fills in
        every known commit listed in the file.

        Args:
            commits: object exposing a ``hash_to_commit`` mapping.
            exp_dir: directory prefix, concatenated as-is (callers must
                include the trailing separator).

        Raises:
            OSError: if the history file cannot be opened.
            AssertionError: if this commit is not listed in the file.
        """
        if self.xp is None:
            history_path = exp_dir + self.author.replace("/", "_")
            with open(history_path) as history:
                for xp, entry in enumerate(history):
                    listed_hash = entry.split(',', 1)[0]
                    known = commits.hash_to_commit.get(listed_hash)
                    if known is not None:
                        known.xp = xp
        assert self.xp is not None, "author: {}\ncommit: {}\nis vcc: {}"\
            .format(self.author, self.commit_hash, self.is_vcc)
        return self.xp


class Commits:
    """The commits of one repository that touch a given set of paths.

    Attributes:
        commits: list of Commit objects in the order printed by the git
            pipeline (ascending timestamp).
        hash_to_commit: dict mapping commit hash -> Commit.
    """

    def __init__(self, git_dir, paths, vccs=None):
        """
        Collect the Commits at the given paths into self.commits and
        self.hash_to_commit, ordered from old to new by timestamp (NOT the
        order they were applied).

        NOTE(review): the original description said "authored time", but the
        sort key is %ct, which git documents as the committer timestamp
        (%at is the author timestamp) -- confirm which one is intended.

        paths is a single string appended raw to the git command, so any
        necessary escaping, quoting, etc. should be applied prior. git_dir
        is likewise inserted unquoted; the command runs through a shell, so
        neither value may come from untrusted input.

        vccs, when non-empty, is a container of commit hashes; each Commit's
        is_vcc is then True/False by membership. Otherwise is_vcc is None.

        Raises:
            subprocess.CalledProcessError: the shell pipeline exited non-zero.
            AssertionError: the pipeline produced no output.
        """
        # %x09 makes git emit a literal tab between fields. `cut -f2,3`
        # splits on tabs by default, so it drops the timestamp column once
        # `sort -n` has used it.
        command = (
            "git -C " + git_dir + " log "
            "--full-history --reverse --no-merges --use-mailmap "
            "--since-as-filter=2012-04-09 "
            "--format='format:%ct%x09%H%x09%aN <%aE>' -- "
            + paths + " | sort -n | cut -f2,3"
        )
        lines = subprocess.check_output(command, shell=True,
                                        universal_newlines=True).strip()
        assert lines, "git command failed to return any commits: {}"\
            .format(command)

        self.commits = []
        self.hash_to_commit = {}
        matched_vccs = set()
        for line in lines.splitlines():
            commit_hash, author = self._parse_log_line(line)
            if vccs:
                vcc = commit_hash in vccs
                if vcc:
                    matched_vccs.add(commit_hash)
            else:
                vcc = None
            commit = Commit(commit_hash, author, vcc)
            self.commits.append(commit)
            self.hash_to_commit[commit_hash] = commit
        # unmatched_vccs = [vcc for vcc in vccs if vcc not in matched_vccs]
        # print("VCCs unmatched to any valid commit:", unmatched_vccs,
        #       file=sys.stderr,

    @staticmethod
    def _parse_log_line(line):
        """Split one "<hash>TAB<author>" pipeline line into (hash, author)."""
        if '\\' in line:
            # dark incantation to unescape string: interpret backslash
            # escapes, then re-read the resulting bytes as UTF-8
            line = line.encode('latin1').decode('unicode_escape').encode(
                'latin1').decode('utf-8')
        fields = line.strip().split('\t')
        return fields[0], fields[1]
#       flush=True)


class GrowingList(list):
    """A list that grows on demand when indexed past its end.

    Reading or writing index i with i >= len(self) first appends
    ``default()`` results until i is valid. ``default`` is a zero-argument
    callable producing a fresh filler item.
    """

    def __init__(self, default):
        super().__init__()
        self.default = default

    def _grow_to(self, index):
        """Append default items until ``index`` is a valid position."""
        while index >= len(self):
            self.append(self.default())

    def __setitem__(self, index, value):
        self._grow_to(index)
        list.__setitem__(self, index, value)

    def __getitem__(self, index):
        self._grow_to(index)
        return list.__getitem__(self, index)


class Counts:
    """Pair of tallies: ``total`` commits and ``vccs`` among them."""

    def __init__(self, total=0, vccs=0):
        self.total = total
        self.vccs = vccs


def count_commits(commits, vccs, exp_dir, counts=None):
    """Tally commits and distinct VCC bugs per experience level.

    For every commit in ``commits.commits``, its experience index j is
    looked up via ``commit.get_experience(commits, exp_dir)`` and
    ``counts[j].total`` is incremented. If the commit is a VCC, each bug in
    ``vccs[commit.commit_hash]`` not yet seen during this call increments
    ``counts[j].vccs`` once.

    Args:
        commits: object with a ``commits`` sequence of Commit-like items.
        vccs: mapping of commit hash -> iterable of bug identifiers.
        exp_dir: passed through to ``get_experience``.
        counts: existing GrowingList of Counts to accumulate into; a new
            one is created when this is None or empty.

    Returns:
        The GrowingList of Counts that was updated.
    """
    if not counts:
        counts = GrowingList(Counts)
    bugs = set()  # bugs already attributed during this call
    for commit in commits.commits:
        j = commit.get_experience(commits, exp_dir)
        if commit.is_vcc:
            for bug in vccs[commit.commit_hash]:
                if bug not in bugs:
                    counts[j].vccs += 1
                    bugs.add(bug)
        counts[j].total += 1
    return counts


def count_all_commits(git_dirs, project_paths, exp_dirs, vccs):
    """Run count_commits over several repositories and merge the tallies.

    The three sequences are parallel: entry i of each describes one
    repository. Returns a plain list of Counts indexed by experience; the
    list is empty when no repositories are given.

    Raises:
        AssertionError: if the three sequences differ in length.
    """
    assert len(git_dirs) == len(exp_dirs) and \
        len(git_dirs) == len(project_paths), \
        "each git dir needs one project path and one experience dir"
    counts = None
    for git_dir, project_path, exp_dir in zip(git_dirs, project_paths,
                                              exp_dirs):
        commits = Commits(git_dir, project_path, vccs)
        counts = count_commits(commits, vccs, exp_dir, counts)
    if counts is None:
        # no repositories: nothing was counted
        return []
    # convert to a normal list
    return list(counts)


def get_vccs(vcc_file):
    """Read a comma-separated VCC file into {commit hash: set of issues}.

    Each line is ``hash,issue[,issue...]``. Lines repeating a hash have
    their issues merged. Blank lines are skipped.
    """
    vccs = {}
    with open(vcc_file) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            commit_hash, *issues = line.split(',')
            vccs.setdefault(commit_hash, set()).update(issues)
    return vccs


def expdirs(exp_dirs):
    """Return the directories as a list, each ending with '/'.

    Empty strings are left empty so they keep meaning "no prefix" rather
    than turning into the filesystem root.
    """
    return [
        exp_dir if not exp_dir or exp_dir.endswith('/') else exp_dir + '/'
        for exp_dir in exp_dirs
    ]


# takes a sequence (indexable, sized) of Decimal objects
def sigfigs(vals):
    """Round Decimals to just enough places to tell them apart.

    If the values' most significant digits sit at different positions, all
    are rounded at the position of the largest one. Otherwise precision is
    increased one decimal place at a time, starting at the leading digit,
    until some value differs from the first; all values rounded to that
    precision are returned. If nothing differs within the search range, or
    the input is empty, ``vals`` is returned unchanged.
    """
    msds = [v.adjusted() for v in vals]
    if not msds:
        return vals
    if any(msd != msds[0] for msd in msds):
        places = -max(msds)
        return [round(v, places) for v in vals]
    first = vals[0]
    for places in range(-msds[0], 20):  # arbitrarily high precision
        if any(round(v, places) != round(first, places) for v in vals):
            return [round(v, places) for v in vals]
    return vals