123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172 |
- #!/usr/bin/env python3
- import sys
- import numpy as np
- import scipy.stats as sp
- import subprocess
- import matplotlib.pyplot as plt
class Commit:
    """One git commit plus the (lazily resolved) experience of its author."""

    def __init__(self, commit_hash, author, vcc=None):
        """
        commit_hash -- hash identifying the commit
        author      -- author string; also names the author's experience file
        vcc         -- stored unchanged as ``is_vcc`` (None when not supplied)
        """
        self.xp = None  # filled in by get_experience()
        self.is_vcc = vcc
        self.author = author
        self.commit_hash = commit_hash

    def get_experience(self, commits, exp_dir):
        """
        Return this commit's position in its author's history file.

        The history file is ``exp_dir`` concatenated with the author string
        (every "/" replaced by "_"); the first comma-separated field of each
        line is a commit hash and the line's index is the experience value.
        One read of the file fills in ``xp`` for every commit of that file
        found in ``commits.hash_to_commit``, so later calls for the same
        author return the cached value without reopening the file.

        Fails an assertion if the file never mentions this commit.
        """
        if self.xp is not None:
            return self.xp

        history_path = exp_dir + self.author.replace("/", "_")
        with open(history_path) as history_file:
            history = history_file.readlines()

        for position, entry in enumerate(history):
            listed_hash = entry.split(',')[0]
            if listed_hash in commits.hash_to_commit:
                commits.hash_to_commit[listed_hash].xp = position

        assert self.xp is not None, "author: {}\ncommit: {}\nis vcc: {}"\
            .format(self.author, self.commit_hash, self.is_vcc)
        return self.xp
class Commits:
    """The commits of one repository, as an ordered list and a hash index."""

    def __init__(self, git_dir, paths, vccs=None):
        """
        Collect the non-merge commits touching the given paths since
        2012-04-09, ordered from old to new by the timestamp git prints for
        ``%ct`` (NOT the order they were applied).

        NOTE(review): this docstring used to say "authored time", but ``%ct``
        is git's committer timestamp (``%at`` is the author one) -- confirm
        which ordering is intended.

        git_dir -- repository directory, interpolated raw into the command
        paths   -- a single string appended raw to the git command, so any
                   necessary escaping, quoting, etc. should be applied prior
        vccs    -- optional container of VCC hashes; when given (and
                   non-empty) each Commit's ``is_vcc`` is a bool, otherwise
                   it is None

        Populates ``self.commits`` (ordered list of Commit) and
        ``self.hash_to_commit`` (hash -> Commit).  Fails an assertion when
        git reports no commits.
        """
        # The fields must be TAB-separated: `cut -f2,3` splits on tabs by
        # default and the parsing below splits on tabs as well.  %x09 makes
        # git emit a literal tab, so no raw tab has to live in this source.
        # The command goes through a shell (for the sort/cut pipeline), hence
        # the escaping requirement on `paths` documented above.
        command = (
            "git -C " + git_dir + " log "
            "--full-history --reverse --no-merges --use-mailmap "
            "--since=2012-04-09 "
            "--format='format:%ct%x09%H%x09%aN <%aE>' -- "
            + paths + " | sort -n | cut -f2,3"
        )
        lines = subprocess.check_output(command, shell=True,
                                        universal_newlines=True).strip()
        assert lines

        self.commits = []
        self.hash_to_commit = {}
        for line in lines.splitlines():
            if '\\' in line:
                # Interpret backslash escape sequences, then re-read the
                # resulting bytes as UTF-8 (latin1 maps bytes 1:1 to chars).
                line = line.encode('latin1').decode('unicode_escape').encode(
                    'latin1').decode('utf-8')
            fields = line.strip().split('\t')
            commit_hash = fields[0]
            author = fields[1]
            vcc = commit_hash in vccs if vccs else None
            commit = Commit(commit_hash, author, vcc)
            self.commits.append(commit)
            self.hash_to_commit[commit_hash] = commit
class GrowingList(list):
    """
    A list that extends itself on demand.

    Reading or writing an integer index at or past the end first appends
    ``default()`` values until that index exists.  Negative indices and
    slices never trigger growth; they behave exactly as on a plain list
    (slicing returns a plain ``list``).
    """

    def __init__(self, default):
        """default -- zero-argument callable producing each filler element."""
        super().__init__()
        self.default = default

    def _grow_to(self, index):
        """Append default elements until integer ``index`` is in range."""
        # A slice cannot be compared with an int, and list slicing already
        # tolerates out-of-range bounds, so there is nothing to grow.
        if isinstance(index, slice):
            return
        while index >= len(self):
            self.append(self.default())

    def __setitem__(self, index, value):
        self._grow_to(index)
        super().__setitem__(index, value)

    def __getitem__(self, index):
        self._grow_to(index)
        return super().__getitem__(index)
class Counts:
    """A pair of tallies kept for a single experience level."""

    def __init__(self, total=0, vccs=0):
        # total: running count of commits
        # vccs:  running count of VCC-attributed bugs
        self.total, self.vccs = total, vccs
def count_commits(commits, vccs, exp_dir, counts=None):
    """
    Tally commits and VCC bugs per author-experience level.

    commits -- a Commits instance whose commits are to be counted
    vccs    -- mapping from VCC commit hash to the bugs it contributed to
    exp_dir -- experience directory passed on to Commit.get_experience
    counts  -- optional tallies from an earlier call to keep accumulating
               into; any sequence of Counts is accepted

    Returns a GrowingList of Counts indexed by experience.  Every commit adds
    one to ``total`` at its experience level; each bug adds one to ``vccs``
    only the first time it is seen during this call.
    """
    if not counts:
        counts = GrowingList(Counts)
    elif not isinstance(counts, GrowingList):
        # A plain list cannot grow when a new experience level shows up and
        # would raise IndexError below, so carry its entries over.
        grown = GrowingList(Counts)
        grown.extend(counts)
        counts = grown

    seen_bugs = set()
    for commit in commits.commits:
        j = commit.get_experience(commits, exp_dir)
        bucket = counts[j]
        if commit.is_vcc:
            for bug in vccs[commit.commit_hash]:
                if bug not in seen_bugs:
                    bucket.vccs += 1
                    seen_bugs.add(bug)
        bucket.total += 1
    return counts
def main(argv):
    """
    Fit a log-log regression of cumulative VCC fraction against author
    experience, print the fit, and save a plot of data and fitted curve.

    argv[1] -- a file where each line is a VCC commit hash, followed by the
               issues it contributed to, comma separated
    argv[2] -- colon-separated git directories
    argv[3] -- colon-separated paths in each git dir to filter on
               (use "" or . to use everything)
    argv[4] -- colon-separated directories where experiences are stored
    argv[5] -- the path+name of where to save the resulting plot
    """
    vcc_file = argv[1]
    git_dirs = argv[2].split(':')
    project_paths = argv[3].split(':')
    # Experience file names are appended directly to these strings, so each
    # directory needs a trailing '/'.  A new list is built because rebinding
    # a loop variable would leave the list itself unchanged.  An empty entry
    # is left as-is (i.e. relative to the current directory).
    exp_dirs = [d if not d or d.endswith('/') else d + '/'
                for d in argv[4].split(':')]
    assert len(git_dirs) == len(exp_dirs) and \
        len(git_dirs) == len(project_paths), \
        "each git dir needs one project path and one experience dir"
    plot_path = argv[5]

    vccs = {}
    with open(vcc_file) as f:
        for line in f:
            commit_hash, *issues = line.strip().split(',')
            vccs[commit_hash] = set(issues)

    # Hand the object returned by count_commits straight back in, so the
    # tallies keep accumulating (and can keep growing) across repositories;
    # only afterwards flatten it to a plain list.
    counts = None
    for git_dir, project_path, exp_dir in zip(git_dirs, project_paths,
                                              exp_dirs):
        commits = Commits(git_dir, project_path, vccs)
        counts = count_commits(commits, vccs, exp_dir, counts)
    counts = list(counts)

    def divide(a, b):
        """make division errors (primarily, divide by zero) return None"""
        if a and b:
            return a / b
        elif b:
            return 0
        return None

    # running totals in a single pass
    cuml_vccs = []
    cuml_tot = []
    running_vccs = running_tot = 0
    for c in counts:
        running_vccs += c.vccs
        running_tot += c.total
        cuml_vccs.append(running_vccs)
        cuml_tot.append(running_tot)
    cuml_frac = [divide(v, t) for v, t in zip(cuml_vccs, cuml_tot)]

    # Skip the leading experience levels whose cumulative VCC count is still
    # zero: there is nothing to regress on there, and no logarithm to take.
    offset = next((i for i, v in enumerate(cuml_vccs) if v != 0), 0)

    # linear fit in log-log space: log(fraction) against log(j + 1)
    xs = np.log([x + 1 for x in range(offset, len(counts))])
    ys = np.log(cuml_frac[offset:])
    regression = sp.linregress(xs, ys)
    print(regression)
    learning_coef = -regression.slope
    learning_intercept = -np.exp(regression.intercept) * (learning_coef - 1)
    print("l={}, T1={}".format(learning_coef, learning_intercept))

    # data points from `offset` on, fitted curve over the whole range
    xs = np.log([x + 1 for x in range(len(counts))])
    plt.plot(
        list(range(offset, len(counts))), cuml_frac[offset:], 'b.',
        list(range(len(counts))),
        np.exp(xs * regression.slope + regression.intercept), 'r--'
    )
    plt.xlabel("j=Experience")
    plt.ylabel("Tj=P(error)")
    plt.xlim(left=0)
    plt.savefig(plot_path)
# Run the analysis only when executed as a script, not when imported.
if __name__ == "__main__":
    main(sys.argv)
|