learningCurve.py 5.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172
  1. #!/usr/bin/env python3
  2. import sys
  3. import numpy as np
  4. import scipy.stats as sp
  5. import subprocess
  6. import matplotlib.pyplot as plt
  7. class Commit:
  8. def __init__(self, commit_hash, author, vcc=None):
  9. self.commit_hash = commit_hash
  10. self.author = author
  11. self.is_vcc = vcc
  12. self.xp = None
  13. def get_experience(self, commits, exp_dir):
  14. if self.xp is None:
  15. with open(exp_dir + self.author.replace("/", "_")) as f:
  16. commit_history = f.readlines()
  17. for xp in range(len(commit_history)):
  18. commit_hash = commit_history[xp].split(',')[0]
  19. if commit_hash in commits.hash_to_commit:
  20. commits.hash_to_commit[commit_hash].xp = xp
  21. assert self.xp is not None, "author: {}\ncommit: {}\nis vcc: {}"\
  22. .format(self.author, self.commit_hash, self.is_vcc)
  23. return self.xp
  24. class Commits:
  25. def __init__(self, git_dir, paths, vccs=None):
  26. """
  27. Returns a list of Commits at the given paths, ordered chronologically
  28. by authored time from old to new (NOT the order they were applied).
  29. paths is a single string appended raw to the git command,
  30. so any necessary escaping, quoting, etc. should be applied prior
  31. """
  32. command = "git -C " + git_dir + " log " \
  33. "--full-history --reverse --no-merges --use-mailmap "\
  34. "--since=2012-04-09 --format='format:%ct %H %aN <%aE>' -- " \
  35. + paths + " | sort -n | cut -f2,3"
  36. lines = subprocess.check_output(command, shell=True,
  37. universal_newlines=True).strip()
  38. assert lines
  39. self.commits = []
  40. self.hash_to_commit = {}
  41. for line in lines.splitlines():
  42. if '\\' in line:
  43. # dark incantation to unescape string
  44. line = line.encode('latin1').decode('unicode_escape').encode(
  45. 'latin1').decode('utf-8')
  46. line = line.strip().split(' ') # tab
  47. commit_hash = line[0]
  48. author = line[1]
  49. if vccs:
  50. vcc = commit_hash in vccs
  51. else:
  52. vcc = None
  53. commit = Commit(line[0], author, vcc)
  54. self.commits.append(commit)
  55. self.hash_to_commit[commit_hash] = commit
  56. class GrowingList(list):
  57. def __init__(self, default):
  58. super().__init__()
  59. self.default = default
  60. def __setitem__(self, index, value):
  61. while index >= len(self):
  62. self.append(self.default())
  63. list.__setitem__(self, index, value)
  64. def __getitem__(self, index):
  65. while index >= len(self):
  66. self.append(self.default())
  67. return list.__getitem__(self, index)
  68. class Counts:
  69. def __init__(self, total=0, vccs=0):
  70. self.total = total
  71. self.vccs = vccs
  72. def count_commits(commits, vccs, exp_dir, counts=None):
  73. if not counts:
  74. counts = GrowingList(Counts)
  75. bugs = set()
  76. for commit in commits.commits:
  77. j = commit.get_experience(commits, exp_dir)
  78. if commit.is_vcc:
  79. for bug in vccs[commit.commit_hash]:
  80. if bug not in bugs:
  81. counts[j].vccs += 1
  82. bugs.add(bug)
  83. counts[j].total += 1
  84. return counts
  85. def main(argv):
  86. # a file where each line is a VCC commit hash, followed by the issues it
  87. # contributed to, comma separated
  88. vcc_file = argv[1]
  89. git_dirs = argv[2].split(':')
  90. # the paths in the git dir to filter on (use "" or . to use everything)
  91. project_paths = argv[3].split(':')
  92. # the directory where experiences are stored
  93. exp_dirs = argv[4].split(':')
  94. for exp_dir in exp_dirs:
  95. if exp_dir[-1] != '/':
  96. exp_dir += '/'
  97. assert len(git_dirs) == len(exp_dirs) and \
  98. len(git_dirs) == len(project_paths), \
  99. "each git dir needs one project path and one experience dir"
  100. # the path+name of where to save the resulting plot
  101. plot_path = argv[5]
  102. vccs = {}
  103. with open(vcc_file) as f:
  104. for line in f.readlines():
  105. line = line.strip().split(',')
  106. vccs[line[0]] = {issue for issue in line[1:]}
  107. counts = None
  108. for i in range(len(git_dirs)):
  109. commits = Commits(git_dirs[i], project_paths[i], vccs)
  110. counts = [c for c in count_commits(commits, vccs, exp_dirs[i], counts)]
  111. def divide(a, b):
  112. """make division errors (primarily, divide by zero) return None"""
  113. if a and b:
  114. return a / b
  115. elif b:
  116. return 0
  117. return None
  118. cuml_vccs = [sum(c.vccs for c in counts[:j+1]) for j in range(len(counts))]
  119. cuml_tot = [sum(c.total for c in counts[:j+1]) for j in range(len(counts))]
  120. cuml_frac = [divide(cuml_vccs[j], cuml_tot[j]) for j in range(len(counts))]
  121. # to prevent regressing on leading 0 values (i.e., the first n values of j
  122. # where there were 0 contributors of those j's, so we have no data to
  123. # regress on, or to take the log of), we need to count and skip them
  124. offset = 0
  125. for i in range(len(cuml_vccs)):
  126. if cuml_vccs[i] != 0:
  127. offset = i
  128. break
  129. xs = np.log([x+1 for x in range(offset, len(counts))])
  130. ys = np.log(cuml_frac[offset:])
  131. regression = sp.linregress(xs, ys)
  132. print(regression)
  133. learning_coef = -regression.slope
  134. learning_intercept = -np.exp(regression.intercept) * (learning_coef - 1)
  135. print("l={}, T1={}".format(learning_coef, learning_intercept))
  136. xs = np.log([x+1 for x in range(len(counts))])
  137. plt.plot(
  138. [x for x in range(offset, len(counts))], cuml_frac[offset:], 'b.',
  139. [x for x in range(len(counts))],
  140. np.exp(xs*regression.slope+regression.intercept), 'r--'
  141. )
  142. plt.xlabel("j=Experience")
  143. plt.ylabel("Tj=P(error)")
  144. plt.xlim(left=0)
  145. plt.savefig(plot_path)
# Script entry point: pass the raw command-line arguments to main().
if __name__ == '__main__':
    main(sys.argv)