#!/usr/bin/env python3
"""Fit a learning curve to vulnerability-contributing commits (VCCs).

For each git repository given on the command line, every commit is bucketed
by its author's experience index ``j`` (the commit's position in that
author's history file).  The cumulative fraction of VCC-attributed issues per
commit is then regressed, in log-log space, against ``j + 1`` and the data
points plus the fitted curve are saved as a plot.

Usage:
    script VCC_FILE GIT_DIRS PROJECT_PATHS EXP_DIRS PLOT_PATH

GIT_DIRS, PROJECT_PATHS and EXP_DIRS are colon-separated lists of equal
length.
"""
import itertools
import sys
import subprocess

import numpy as np
import scipy.stats as sp


class Commit:
    """One commit together with its author and VCC flag.

    Attributes:
        commit_hash: the commit's hash string.
        author: author identity string, also used to derive the name of the
            author's history file.
        is_vcc: True/False when a VCC table was supplied, otherwise None.
        xp: experience index, filled in lazily by get_experience().
    """

    def __init__(self, commit_hash, author, vcc=None):
        self.commit_hash = commit_hash
        self.author = author
        self.is_vcc = vcc
        self.xp = None

    def get_experience(self, commits, exp_dir):
        """Return this commit's index in its author's history file.

        On first use the file ``exp_dir + author`` (with "/" replaced by "_")
        is read; the first comma-separated field of each line is taken as a
        commit hash, and every commit of `commits` found there gets its
        ``xp`` set to the zero-based line number, so later lookups for the
        same author need no further file access.

        Args:
            commits: object exposing ``hash_to_commit`` (hash -> Commit).
            exp_dir: prefix prepended verbatim to the history file name.

        Raises:
            OSError: if the history file cannot be opened.
            AssertionError: if this commit is not listed in the file.
        """
        if self.xp is None:
            history_path = exp_dir + self.author.replace("/", "_")
            with open(history_path) as f:
                for xp, entry in enumerate(f):
                    commit_hash = entry.split(',')[0]
                    known = commits.hash_to_commit.get(commit_hash)
                    if known is not None:
                        known.xp = xp
        assert self.xp is not None, "author: {}\ncommit: {}\nis vcc: {}"\
            .format(self.author, self.commit_hash, self.is_vcc)
        return self.xp


class Commits:
    """The non-merge commits of a repository that touch the given paths.

    Attributes:
        commits: list of Commit, in the order produced by the pipeline.
        hash_to_commit: dict mapping commit hash -> Commit.
    """

    def __init__(self, git_dir, paths, vccs=None, since="2012-04-09"):
        """Run ``git log`` and collect the resulting commits.

        Commits are sorted numerically by the ``%ct`` (committer) timestamp,
        oldest first.

        Args:
            git_dir: repository directory, passed to ``git -C``.
            paths: a single string appended raw to the git command, so any
                necessary escaping, quoting, etc. should be applied prior.
            vccs: optional container of VCC hashes; when truthy each commit's
                ``is_vcc`` is a bool, otherwise it is None.
            since: value for git's ``--since`` option.

        Raises:
            subprocess.CalledProcessError: if the pipeline exits non-zero.
            AssertionError: if the pipeline produced no output.
        """
        # NOTE: git_dir and paths are interpolated into a shell command
        # unescaped (callers rely on shell parsing of `paths`); do not feed
        # this untrusted input.
        # %x09 emits a literal tab so that `cut -f2,3` (tab-delimited by
        # default) strips the leading timestamp once `sort -n` has used it.
        command = "git -C " + git_dir + " log " \
            "--full-history --reverse --no-merges --use-mailmap " \
            "--since=" + since + \
            " --format='format:%ct%x09%H%x09%aN <%aE>' -- " \
            + paths + " | sort -n | cut -f2,3"
        lines = subprocess.check_output(command, shell=True,
                                        universal_newlines=True).strip()
        assert lines
        self.commits = []
        self.hash_to_commit = {}
        for line in lines.splitlines():
            if '\\' in line:
                # dark incantation to unescape string: interpret backslash
                # escapes, then re-read the resulting bytes as UTF-8
                line = line.encode('latin1').decode('unicode_escape').encode(
                    'latin1').decode('utf-8')
            fields = line.strip().split('\t')
            commit_hash = fields[0]
            author = fields[1]
            vcc = (commit_hash in vccs) if vccs else None
            commit = Commit(commit_hash, author, vcc)
            self.commits.append(commit)
            self.hash_to_commit[commit_hash] = commit


class GrowingList(list):
    """A list that pads itself with ``default()`` values on integer access.

    Reading or writing an integer index at or beyond the current length first
    appends fresh defaults up to that index.  Slices are handled exactly as
    by a plain list and never grow it.
    """

    def __init__(self, default):
        super().__init__()
        self.default = default

    def _grow_to(self, index):
        """Append defaults until integer `index` is a valid position."""
        while index >= len(self):
            self.append(self.default())

    def __setitem__(self, index, value):
        if not isinstance(index, slice):
            self._grow_to(index)
        list.__setitem__(self, index, value)

    def __getitem__(self, index):
        if not isinstance(index, slice):
            self._grow_to(index)
        return list.__getitem__(self, index)


class Counts:
    """Tallies for one experience level: commits seen and VCC issues."""

    def __init__(self, total=0, vccs=0):
        self.total = total
        self.vccs = vccs


def count_commits(commits, vccs, exp_dir, counts=None):
    """Tally commits and VCC issues per author-experience index.

    Each commit adds one to ``counts[j].total`` where ``j`` is its
    experience index.  For a VCC commit, every issue listed for it in `vccs`
    that has not already been seen during this call adds one to
    ``counts[j].vccs``.

    Args:
        commits: object exposing ``commits`` (iterable of Commit) and
            ``hash_to_commit``.
        vccs: mapping of VCC commit hash -> iterable of issue identifiers.
        exp_dir: prefix of the author history files.
        counts: optional tallies from a previous call to accumulate into.
            A plain list is accepted; its Counts objects are reused.

    Returns:
        GrowingList of Counts, indexed by experience.
    """
    if counts is None:
        counts = GrowingList(Counts)
    elif not isinstance(counts, GrowingList):
        # A plain list cannot grow on demand, so indexing a new experience
        # level would raise IndexError; re-wrap it (sharing the elements).
        grown = GrowingList(Counts)
        grown.extend(counts)
        counts = grown
    bugs = set()
    for commit in commits.commits:
        j = commit.get_experience(commits, exp_dir)
        if commit.is_vcc:
            for bug in vccs[commit.commit_hash]:
                if bug not in bugs:
                    counts[j].vccs += 1
                    bugs.add(bug)
        counts[j].total += 1
    return counts


def _divide(a, b):
    """make division errors (primarily, divide by zero) return None"""
    if a and b:
        return a / b
    elif b:
        return 0
    return None


def _as_dir_prefix(path):
    """Return `path` with a trailing "/" (an empty path is left alone)."""
    if not path or path.endswith('/'):
        return path
    return path + '/'


def _load_vccs(vcc_file):
    """Parse the VCC file into a dict of commit hash -> set of issues."""
    vccs = {}
    with open(vcc_file) as f:
        for line in f:
            fields = line.strip().split(',')
            if fields[0]:  # skip blank lines
                vccs[fields[0]] = set(fields[1:])
    return vccs


def main(argv):
    """Count commits, fit the learning curve, print it and save the plot.

    Args:
        argv: ``[prog, vcc_file, git_dirs, project_paths, exp_dirs,
            plot_path]``; the three middle values are colon-separated lists.
    """
    if len(argv) < 6:
        raise SystemExit(
            "usage: {} VCC_FILE GIT_DIRS PROJECT_PATHS EXP_DIRS PLOT_PATH\n"
            "  (GIT_DIRS, PROJECT_PATHS and EXP_DIRS are colon-separated)"
            .format(argv[0] if argv else "learning_curve"))
    # Only the plotting step needs matplotlib; importing it here keeps the
    # module importable without a plotting backend.
    import matplotlib.pyplot as plt

    # a file where each line is a VCC commit hash, followed by the issues it
    # contributed to, comma separated
    vcc_file = argv[1]
    git_dirs = argv[2].split(':')
    # the paths in the git dir to filter on (use "" or . to use everything)
    project_paths = argv[3].split(':')
    # the directory where experiences are stored
    exp_dirs = [_as_dir_prefix(d) for d in argv[4].split(':')]
    assert len(git_dirs) == len(exp_dirs) and \
        len(git_dirs) == len(project_paths), \
        "each git dir needs one project path and one experience dir"
    # the path+name of where to save the resulting plot
    plot_path = argv[5]

    vccs = _load_vccs(vcc_file)
    counts = None
    for git_dir, paths, exp_dir in zip(git_dirs, project_paths, exp_dirs):
        commits = Commits(git_dir, paths, vccs)
        counts = count_commits(commits, vccs, exp_dir, counts)

    cuml_vccs = list(itertools.accumulate(c.vccs for c in counts))
    cuml_tot = list(itertools.accumulate(c.total for c in counts))
    cuml_frac = [_divide(v, t) for v, t in zip(cuml_vccs, cuml_tot)]

    # to prevent regressing on leading 0 values (i.e., the first n values of j
    # where there were 0 contributors of those j's, so we have no data to
    # regress on, or to take the log of), we need to count and skip them
    offset = next((i for i, v in enumerate(cuml_vccs) if v != 0), None)
    if offset is None:
        raise SystemExit("no VCCs found among the selected commits; "
                         "nothing to regress on")

    xs = np.log([x + 1 for x in range(offset, len(counts))])
    ys = np.log(cuml_frac[offset:])
    regression = sp.linregress(xs, ys)
    print(regression)
    learning_coef = -regression.slope
    learning_intercept = -np.exp(regression.intercept) * (learning_coef - 1)
    print("l={}, T1={}".format(learning_coef, learning_intercept))

    xs = np.log([x + 1 for x in range(len(counts))])
    plt.plot(
        list(range(offset, len(counts))), cuml_frac[offset:], 'b.',
        list(range(len(counts))),
        np.exp(xs * regression.slope + regression.intercept), 'r--'
    )
    plt.xlabel("j=Experience")
    plt.ylabel("Tj=P(error)")
    plt.xlim(left=0)
    plt.savefig(plot_path)


if __name__ == '__main__':
    main(sys.argv)