learningCurve_gradientDescent.py

#!/usr/bin/env python3
import sys
import math
import numpy as np
import vcclib

# n.b.: in the following functions,
# j0/k0 are the 0-indexed versions (used mostly for indexing lists),
# j1/k1 are the 1-indexed versions (used mostly for arithmetic)
# ell is l spelled out to prevent confusion with the 1 character


def loss(t1, ell, j1, a):
    return (t1*j1**-ell - a)**2


def loss_cuml(t1, ell, j1, cs, as_cuml, expected_vuln):
    return (t1*expected_vuln - as_cuml[j1-1])**2
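
# Sketch of the model these losses encode (as read from the code here, not
# from upstream docs): the fraction of commits that introduce a vulnerability
# (VCCs) at author experience j is assumed to follow a power-law learning
# curve,
#     a_j ~= T1 * j**(-ell)
# so loss() is the squared error on that per-experience fraction, e.g.
#     loss(t1=0.1, ell=1.0, j1=2, a=0.04) == (0.1*0.5 - 0.04)**2 ~= 1e-4
# loss_cuml() instead compares the cumulative expected VCC count up to
# experience j (T1 * sum over k<=j of k**-ell * cs[k-1], with cs[k-1] commits
# at experience k) against the observed cumulative count as_cuml[j-1].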


def gradient(t1, ell, j1, a):
    # returns (d/dt1, d/dl)
    ret = (j1**(-2*ell)*(2*t1 - 2*a*j1**ell),
           2*t1*j1**(-2*ell)*np.log(j1)*(a*j1**ell - t1))
    return ret


def gradient_cuml(t1, ell, j1, cs, as_cuml, expected_vuln, log_factor):
    # returns (d/dt1, d/dl)
    # the memoized arguments written out, without the t1 factor (too slow to
    # use in practice, but left here to explain the code):
    # expected_vuln_c = sum([k1**-ell * cs[k1-1] for k1 in range(1, j1+1)])
    # assert(expected_vuln == expected_vuln_c)
    # log_factor_c = sum([-k1**-ell * cs[k1-1] * np.log(k1)
    #                     for k1 in range(1, j1+1)])
    # assert(log_factor == log_factor_c)
    gap = t1*expected_vuln - as_cuml[j1-1]
    d_dt1 = expected_vuln*gap  # *2 removed from both since it's common and
    d_dl = t1*log_factor*gap   # positive, so preserves direction
    return (d_dt1, d_dl)
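
# Derivation sketch for gradient_cuml (reconstructed from the code above,
# with the common factor of 2 dropped exactly as the comments note):
#   L_j(t1, ell) = (t1*E_j - A_j)**2
#   E_j          = sum over k<=j of  k**-ell * cs[k-1]          (expected_vuln)
#   F_j          = sum over k<=j of -k**-ell * cs[k-1] * ln(k)  (log_factor)
#   dL_j/dt1     = 2 * E_j * (t1*E_j - A_j)
#   dL_j/dell    = 2 * t1 * F_j * (t1*E_j - A_j)
# where A_j = as_cuml[j-1] is the observed cumulative VCC count.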


def gradient_descent(initial_T1, initial_ell, counts, learning_rate):
    # per-experience-bin variant; main() uses gradient_descent_cuml instead
    minloss = 1000000000
    guess = (initial_T1, initial_ell)
    for i in range(2000):
        gradients = [gradient(guess[0], guess[1], j0+1,
                              counts[j0].vccs/counts[j0].total)
                     for j0 in range(len(counts)) if counts[j0].total > 0]
        grad = (sum(grad[0] for grad in gradients),
                sum(grad[1] for grad in gradients))
        guess = (guess[0] - learning_rate*grad[0],
                 guess[1] - learning_rate*grad[1])
        losss = sum([loss(guess[0], guess[1], j0+1,
                          counts[j0].vccs/counts[j0].total)
                     for j0 in range(len(counts)) if counts[j0].total > 0])
        if losss >= minloss:
            break
        minloss = losss
    return guess


def calc_expected_vulns(ell, cs):
    """
    generates values giving the expected number of vulns at exp. <=j,
    without the T1 factor, for memoization
    """
    total = 0
    j1 = 1
    while True:
        try:
            total += j1**-ell * cs[j1-1]
        except OverflowError:
            total = float("inf")
        yield total
        j1 += 1


def calc_log_factors(ell, cs):
    """
    generates the matching log factors (running sum of -j**-ell * cs[j-1] *
    ln(j)) used by the d/dl component of the cumulative gradient, for
    memoization
    """
    total = 0
    j1 = 1
    while True:
        try:
            total += -j1**-ell * cs[j1-1] * np.log(j1)
        except OverflowError:
            total = float("inf")
        yield total
        j1 += 1
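
# For example (hypothetical numbers, just to show what the generators yield):
# with ell = 1.0 and cs = [4, 2, 1], calc_expected_vulns yields
#   4.0        (1**-1 * 4)
#   5.0        (+ 2**-1 * 2)
#   5.333...   (+ 3**-1 * 1)
# i.e. the running sums that gradient_descent_cuml memoizes per experience.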


def gradient_descent_cuml(initial_T1, initial_ell,
                          cs, cuml_vccs, learning_rate):
    t1, ell = (initial_T1, initial_ell)
    # memoized sums, without the t1 factor
    g = calc_expected_vulns(ell, cs)
    expected_vulns = [next(g) for _ in range(len(cs))]
    g = calc_log_factors(ell, cs)
    log_factors = [next(g) for _ in range(len(cs))]
    losses = [loss_cuml(t1, ell, j0+1, cs, cuml_vccs,
                        expected_vulns[j0])
              for j0 in range(len(cs)) if cs[j0] > 0]
    minloss = sum(losses)
    for i in range(1000):
        gradients = [gradient_cuml(t1, ell, j0+1, cs, cuml_vccs,
                                   expected_vulns[j0], log_factors[j0])
                     for j0 in range(len(cs)) if cs[j0] > 0]
        grad = (sum(gradient[0] for gradient in gradients),
                sum(gradient[1] for gradient in gradients))
        old_t1 = t1
        old_ell = ell
        old_expected_vulns = expected_vulns
        old_log_factors = log_factors
        t1 = t1 - grad[0]*learning_rate[0]
        ell = ell - grad[1]*learning_rate[1]
        # recompute the memoized sums for the new ell, without the t1 factor
        g = calc_expected_vulns(ell, cs)
        expected_vulns = [next(g) for _ in range(len(cs))]
        g = calc_log_factors(ell, cs)
        log_factors = [next(g) for _ in range(len(cs))]
        losses = [loss_cuml(t1, ell, j0+1, cs, cuml_vccs,
                            expected_vulns[j0])
                  for j0 in range(len(cs)) if cs[j0] > 0]
        losss = sum(losses)
        if losss > minloss:
            # the step overshot: revert it and halve the learning rate
            t1 = old_t1
            ell = old_ell
            expected_vulns = old_expected_vulns
            log_factors = old_log_factors
            learning_rate = (learning_rate[0]/2, learning_rate[1]/2)
        else:
            # the step helped: periodically speed back up
            if i % 100 == 0:
                learning_rate = (learning_rate[0]*2, learning_rate[1]*2)
            minloss = losss
    return t1, ell, grad, minloss
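
# Hypothetical direct call (synthetic numbers only, not part of the pipeline),
# showing the expected shapes of the arguments:
#   cs = [100, 80, 60, 40, 20]        # total commits at experience 1..5
#   cuml_vccs = [5, 9, 12, 14, 15]    # cumulative VCCs up to experience j
#   t1, ell, grad, minloss = gradient_descent_cuml(0.05, 0.5, cs, cuml_vccs,
#                                                  (1e-6, 1e-6))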


def main(argv):
    # a file where each line is a VCC commit hash, followed by the issues it
    # contributed to, comma separated
    vcc_file = argv[1]
    git_dirs = argv[2].split(':')
    # the paths in the git dirs to filter on (use "" or . to use everything)
    project_paths = argv[3].split(':')
    # the directories where experiences are stored
    exp_dirs = vcclib.expdirs(argv[4].split(':'))
    assert len(git_dirs) == len(exp_dirs) and \
        len(git_dirs) == len(project_paths), \
        "each git dir needs one project path and one experience dir"
    guesses_t1 = [float(f) for f in argv[5].split(':')]
    guesses_ell = [float(f) for f in argv[6].split(':')]
    search = len(argv) > 7 and argv[7] == "search"
    vccs = vcclib.get_vccs(vcc_file)
    assert len(vccs) > 0, "found no VCCs (vcc_file: {})".format(vcc_file)
    counts = vcclib.count_all_commits(git_dirs, project_paths, exp_dirs, vccs)
    cs = [count.total for count in counts]
    learning_rate = (1e-14, 1e-14)
    if not search:
        # normal mode: run grad descent on the actual data, then +/-1 for
        # error bars
        for bias in [0, -1, +1]:
            cuml_vccs = [max(0, bias + sum(c.vccs for c in counts[:j+1]))
                         for j in range(len(counts))]
            for t1_guess in guesses_t1:
                for ell_guess in guesses_ell:
                    t1, ell, grad, minloss = \
                        gradient_descent_cuml(t1_guess, ell_guess,
                                              cs, cuml_vccs, learning_rate)
                    print(bias, t1_guess, ell_guess, t1, ell, grad, minloss,
                          flush=True)
    else:
        # search mode: binary search on the number of added experience-1 VCCs
        # needed to flip ell positive; a reasonable starting range is from no
        # change to "1/2 of VCCs are at experience 1"
        bias_low = 0
        bias_high = len(vccs)
        # first, an initial backoff to find an upper bound
        best_ell = 0
        while best_ell <= 0:
            cuml_vccs = [max(0, bias_high + sum(c.vccs for c in counts[:j+1]))
                         for j in range(len(counts))]
            best_minloss = math.inf
            best_t1 = 0
            for t1_guess in guesses_t1:
                for ell_guess in guesses_ell:
                    t1, ell, _grad, minloss = \
                        gradient_descent_cuml(t1_guess, ell_guess,
                                              cs, cuml_vccs, learning_rate)
                    if minloss < best_minloss:
                        best_minloss = minloss
                        best_ell = ell
                        best_t1 = t1
            print(bias_high, best_t1, best_ell, flush=True)
            # no do-while loop in python
            if best_ell <= 0:
                bias_low = bias_high
                bias_high *= 2
        # now do the actual bisecting search, with the previous loop having
        # given us two values we know are above and below the target
        while bias_high - bias_low > 1:
            bias = (bias_high - bias_low)//2 + bias_low
            cuml_vccs = [max(0, bias + sum(c.vccs for c in counts[:j+1]))
                         for j in range(len(counts))]
            best_minloss = math.inf
            best_ell = 0
            for t1_guess in guesses_t1:
                for ell_guess in guesses_ell:
                    t1, ell, _grad, minloss = \
                        gradient_descent_cuml(t1_guess, ell_guess,
                                              cs, cuml_vccs, learning_rate)
                    if minloss < best_minloss:
                        best_minloss = minloss
                        best_ell = ell
                        best_t1 = t1
            print(bias, best_t1, best_ell, flush=True)
            if best_ell > 0:
                bias_high = bias
            else:
                bias_low = bias
        print("Learning exponent ell becomes positive going from {} to {}".format(
            bias_low, bias_high))
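
# Example invocation (hypothetical paths and guesses; arguments are
# colon-separated lists as parsed in main above):
#   ./learningCurve_gradientDescent.py vccs.txt /path/to/repo . \
#       /path/to/exp_dir 0.01:0.1 0.5:1.0
# append "search" as a final argument to bisect for the experience-1 bias
# that flips ell positive:
#   ./learningCurve_gradientDescent.py vccs.txt /path/to/repo . \
#       /path/to/exp_dir 0.01:0.1 0.5:1.0 search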


if __name__ == '__main__':
    main(sys.argv)