- #!/usr/bin/env python3
- import sys
- import math
- import numpy as np
- import vcclib
- # n.b.: in the following functions,
- # j0/k0 are the 0-indexed versions (used mostly for indexing lists),
- # j1/k1 are the 1-indexed versions (used mostly for arithmetic)
- # ell is l spelled out to prevent confusion with the 1 character
def loss(t1, ell, j1, a):
    """Squared error of the power-law prediction t1*j1**-ell against a.

    j1 is the 1-indexed experience level; a is the observed VCC fraction.
    """
    predicted = t1 * j1 ** -ell
    residual = predicted - a
    return residual * residual
def loss_cuml(t1, ell, j1, cs, as_cuml, expected_vuln):
    """Squared error of the cumulative prediction t1*expected_vuln against
    the observed cumulative count as_cuml[j1-1].

    ell and cs are unused here; they are kept for signature parity with
    gradient_cuml (expected_vuln is the memoized sum that already folds
    them in).
    """
    residual = t1 * expected_vuln - as_cuml[j1 - 1]
    return residual * residual
def gradient(t1, ell, j1, a):
    """Gradient of loss(t1, ell, j1, a) with respect to (t1, ell).

    loss = (t1*j1**-ell - a)**2, so:
      d/dt1  = 2*j1**-ell*(t1*j1**-ell - a)
             = j1**(-2*ell)*(2*t1 - 2*a*j1**ell)
      d/dell = -2*t1*log(j1)*j1**-ell*(t1*j1**-ell - a)
             = 2*t1*j1**(-2*ell)*np.log(j1)*(a*j1**ell - t1)

    Returns the tuple (d/dt1, d/dell).
    """
    # bugfix: the d/dt1 component previously used 2*a*j1 instead of
    # 2*a*j1**ell, which is only correct in the special case ell == 1
    ret = (j1**(-2*ell)*(2*t1 - 2*a*j1**ell),
           2*t1*j1**(-2*ell)*np.log(j1)*(a*j1**ell - t1))
    return ret
def gradient_cuml(t1, ell, j1, cs, as_cuml, expected_vuln, log_factor):
    """Gradient of loss_cuml with respect to (t1, ell), using memoized sums.

    expected_vuln is sum(k1**-ell * cs[k1-1]) and log_factor is
    sum(-k1**-ell * cs[k1-1] * log(k1)), both over k1 in 1..j1, so ell and
    cs themselves are not read here (they stay in the signature to mirror
    the non-memoized formulation).

    The common positive factor of 2 is dropped from both components since
    it does not change the descent direction.
    """
    residual = t1 * expected_vuln - as_cuml[j1 - 1]
    d_dt1 = expected_vuln * residual
    d_dl = t1 * log_factor * residual
    return (d_dt1, d_dl)
def gradient_descent(initial_T1, initial_ell, counts, learning_rate):
    """Plain (non-cumulative) gradient descent on the per-bucket loss.

    counts: sequence of objects with .vccs and .total attributes (buckets
    with total == 0 are skipped).
    learning_rate: single scalar applied to both components.

    Returns (t1, ell, minloss) for the best guess found.

    bugfix: this function previously computed the fit but returned
    nothing, discarding the result.  A proposed step that does not
    improve the loss is now also discarded rather than kept.
    """
    minloss = 1000000000
    guess = (initial_T1, initial_ell)
    for _ in range(2000):
        gradients = [gradient(guess[0], guess[1], j0+1,
                              counts[j0].vccs/counts[j0].total)
                     for j0 in range(len(counts)) if counts[j0].total > 0]
        grad = (sum(g[0] for g in gradients),
                sum(g[1] for g in gradients))
        new_guess = (guess[0] - learning_rate*grad[0],
                     guess[1] - learning_rate*grad[1])
        losss = sum(loss(new_guess[0], new_guess[1], j0+1,
                         counts[j0].vccs/counts[j0].total)
                    for j0 in range(len(counts)) if counts[j0].total > 0)
        if losss >= minloss:
            # stopped improving: keep the previous (better) guess
            break
        guess = new_guess
        minloss = losss
    return guess[0], guess[1], minloss
def calc_expected_vulns(ell, cs):
    """
    Generator of running sums: the j-th value yielded is
    sum(k1**-ell * cs[k1-1] for k1 in 1..j), i.e. the expected number of
    vulns at experience <= j without the T1 factor (consumed once per ell
    as a memoization of the prefix sums).

    On float overflow the running sum is pinned to +inf; consuming past
    len(cs) raises IndexError, as with direct indexing.
    """
    running = 0
    j1 = 0
    while True:
        j1 += 1
        try:
            running += j1 ** -ell * cs[j1 - 1]
        except OverflowError:
            running = float("inf")
        yield running
def calc_log_factors(ell, cs):
    """
    Generator of running sums: the j-th value yielded is
    sum(-k1**-ell * cs[k1-1] * log(k1) for k1 in 1..j), the ell-derivative
    factor matching calc_expected_vulns (again without the T1 factor).

    On float overflow the running sum is pinned to +inf; consuming past
    len(cs) raises IndexError, as with direct indexing.
    """
    running = 0
    j1 = 0
    while True:
        j1 += 1
        try:
            running += -j1 ** -ell * cs[j1 - 1] * np.log(j1)
        except OverflowError:
            running = float("inf")
        yield running
def gradient_descent_cuml(initial_T1, initial_ell,
                          cs, cuml_vccs, learning_rate):
    """Gradient descent on the cumulative loss, fitting (t1, ell).

    cs: per-experience commit counts (cs[j0] is the count at experience
        j0+1); buckets with cs[j0] == 0 are skipped in loss/gradient sums.
    cuml_vccs: observed cumulative VCC counts, indexed like cs.
    learning_rate: (t1_rate, ell_rate) pair; adapted during the run.

    Returns (t1, ell, grad, minloss), where grad is the gradient from the
    final iteration.  A step that increases the loss is rolled back and
    the learning rate halved; on iterations where i % 100 == 0 an
    accepted step doubles the learning rate instead (note this includes
    i == 0).
    """
    t1, ell = (initial_T1, initial_ell)
    # memoized prefix sums (without the t1 factor); these depend on ell
    # and so must be recomputed every time ell changes
    g = calc_expected_vulns(ell, cs)
    expected_vulns = [next(g) for _ in range(len(cs))]
    g = calc_log_factors(ell, cs)
    log_factors = [next(g) for _ in range(len(cs))]
    losses = [loss_cuml(t1, ell, j0+1, cs, cuml_vccs,
                        expected_vulns[j0])
              for j0 in range(len(cs)) if cs[j0] > 0]
    minloss = sum(losses)
    for i in range(1000):
        gradients = [gradient_cuml(t1, ell, j0+1, cs, cuml_vccs,
                                   expected_vulns[j0], log_factors[j0])
                     for j0 in range(len(cs)) if cs[j0] > 0]
        # sum the per-bucket gradients into a single (d/dt1, d/dell) pair
        grad = (sum(gradient[0] for gradient in gradients),
                sum(gradient[1] for gradient in gradients))
        # snapshot the current state so a bad step can be undone cheaply
        old_t1 = t1
        old_ell = ell
        old_expected_vulns = expected_vulns
        old_log_factors = log_factors
        t1 = t1 - grad[0]*learning_rate[0]
        ell = ell - grad[1]*learning_rate[1]
        # ell changed, so rebuild the memoized sums (without t1 factor)
        g = calc_expected_vulns(ell, cs)
        expected_vulns = [next(g) for _ in range(len(cs))]
        g = calc_log_factors(ell, cs)
        log_factors = [next(g) for _ in range(len(cs))]
        losses = [loss_cuml(t1, ell, j0+1, cs, cuml_vccs,
                            expected_vulns[j0])
                  for j0 in range(len(cs)) if cs[j0] > 0]
        losss = sum(losses)
        if losss > minloss:
            # step made things worse: roll back and shrink the step size
            t1 = old_t1
            ell = old_ell
            expected_vulns = old_expected_vulns
            log_factors = old_log_factors
            learning_rate = (learning_rate[0]/2, learning_rate[1]/2)
        else:
            # periodically grow the step size while progress is being made
            if i % 100 == 0:
                learning_rate = (learning_rate[0]*2, learning_rate[1]*2)
            minloss = losss
    return t1, ell, grad, minloss
def main(argv):
    """Fit the cumulative power-law VCC model to counted commit data.

    argv (colon-separated where noted):
      argv[1] vcc_file        file of VCC hashes + comma-separated issues
      argv[2] git_dirs        ':'-separated git directories
      argv[3] project_paths   ':'-separated path filters ("" or . for all)
      argv[4] exp_dirs        ':'-separated experience directories
      argv[5] guesses_t1      ':'-separated initial T1 guesses
      argv[6] guesses_ell     ':'-separated initial ell guesses
      argv[7] (optional)      "search" to bisect for the ell sign flip

    Normal mode prints one fit line per (bias, t1_guess, ell_guess);
    search mode bisects the bias added to the cumulative VCC counts until
    the best-fit ell flips positive, printing progress as it goes.
    """
    # a file where each line is a VCC commit hash, followed by the issues it
    # contributed to, comma separated
    vcc_file = argv[1]
    git_dirs = argv[2].split(':')
    # the paths in the git dir to filter on (use "" or . to use everything)
    project_paths = argv[3].split(':')
    # the directory where experiences are stored
    exp_dirs = vcclib.expdirs(argv[4].split(':'))
    assert len(git_dirs) == len(exp_dirs) and \
        len(git_dirs) == len(project_paths), \
        "each git dir needs one project path and one experience dir"
    guesses_t1 = [float(f) for f in argv[5].split(':')]
    guesses_ell = [float(f) for f in argv[6].split(':')]
    search = len(argv) > 7 and argv[7] == "search"
    vccs = vcclib.get_vccs(vcc_file)
    assert len(vccs) > 0, "found no VCCs (vcc_file: {})".format(vcc_file)
    counts = vcclib.count_all_commits(git_dirs, project_paths, exp_dirs, vccs)
    cs = [count.total for count in counts]
    learning_rate = (1e-14, 1e-14)
    if not search:
        # normal mode, run grad descent on actual data, then +/-1 for err bars
        for bias in [0, -1, +1]:
            cuml_vccs = [max(0, bias + sum(c.vccs for c in counts[:j+1]))
                         for j in range(len(counts))]
            for t1_guess in guesses_t1:
                for ell_guess in guesses_ell:
                    t1, ell, grad, minloss = \
                        gradient_descent_cuml(t1_guess, ell_guess,
                                              cs, cuml_vccs, learning_rate)
                    print(bias, t1_guess, ell_guess, t1, ell, grad, minloss,
                          flush=True)
    else:
        # search mode, run a binary search on 1 exp VCCs to flip ell to +
        # a reasonable starting point is from no change to "1/2 of VCCs are 1"
        bias_low = 0
        bias_high = len(vccs)
        # first, initial backoff to find upper bound
        best_ell = 0
        while best_ell <= 0:
            cuml_vccs = [max(0, bias_high + sum(c.vccs for c in counts[:j+1]))
                         for j in range(len(counts))]
            best_minloss = math.inf
            best_t1 = 0
            for t1_guess in guesses_t1:
                for ell_guess in guesses_ell:
                    t1, ell, _grad, minloss = \
                        gradient_descent_cuml(t1_guess, ell_guess,
                                              cs, cuml_vccs, learning_rate)
                    if minloss < best_minloss:
                        best_minloss = minloss
                        best_ell = ell
                        best_t1 = t1
            print(bias_high, best_t1, best_ell, flush=True)
            # no do-while loop in python
            if best_ell <= 0:
                bias_low = bias_high
                bias_high *= 2
        # now do the actual bisecting search, with the previous loop having
        # given us two values we know are above and below the target
        while bias_high - bias_low > 1:
            bias = (bias_high - bias_low)//2 + bias_low
            cuml_vccs = [max(0, bias + sum(c.vccs for c in counts[:j+1]))
                         for j in range(len(counts))]
            best_minloss = math.inf
            best_ell = 0
            for t1_guess in guesses_t1:
                for ell_guess in guesses_ell:
                    t1, ell, _grad, minloss = \
                        gradient_descent_cuml(t1_guess, ell_guess,
                                              cs, cuml_vccs, learning_rate)
                    if minloss < best_minloss:
                        best_minloss = minloss
                        best_ell = ell
                        best_t1 = t1
            print(bias, best_t1, best_ell, flush=True)
            if best_ell > 0:
                bias_high = bias
            else:
                bias_low = bias
        # bugfix: the search bisects the sign of the fitted exponent ell,
        # not the learning rate; the old message said "Learning rate"
        print("ell becomes positive going from {} to {}".format(
            bias_low, bias_high))