#!/usr/bin/env python3
import sys
import math
import numpy as np

import vcclib

# n.b.: in the following functions,
# j0/k0 are the 0-indexed versions (used mostly for indexing lists),
# j1/k1 are the 1-indexed versions (used mostly for arithmetic)
# ell is l spelled out to prevent confusion with the 1 character


def loss(t1, ell, j1, a):
    """Squared error of the power-law model t1 * j1**-ell against the
    observed VCC rate `a` at experience level j1 (1-indexed)."""
    return (t1*j1**-ell - a)**2


def loss_cuml(t1, ell, j1, cs, as_cuml, expected_vuln):
    """Squared error of the cumulative model against the observed
    cumulative VCC count at experience <= j1.

    expected_vuln is the memoized sum(k1**-ell * cs[k1-1], k1=1..j1),
    i.e. the model's expectation without the t1 factor; as_cuml[j1-1]
    is the observed cumulative count.  `ell` and `cs` are unused here
    but kept for signature parity with gradient_cuml.
    """
    return (t1*expected_vuln - as_cuml[j1-1])**2


def gradient(t1, ell, j1, a):
    """Gradient (d/dt1, d/dell) of loss() at one data point.

    d/dt1 = 2*(t1*j1**-ell - a)*j1**-ell, factored below with the
    common j1**(-2*ell) pulled out.
    """
    # BUGFIX: the d/dt1 term previously read `2*a*j1` instead of
    # `2*a*j1**ell`, which is only correct when ell == 1.
    ret = (j1**(-2*ell)*(2*t1 - 2*a*j1**ell),
           2*t1*j1**(-2*ell)*np.log(j1)*(a*j1**ell - t1))
    return ret


def gradient_cuml(t1, ell, j1, cs, as_cuml, expected_vuln, log_factor):
    """Gradient (d/dt1, d/dl) of loss_cuml() at one experience level,
    using the memoized expected_vuln and log_factor sums."""
    # without t1 (too slow to use in practice, but left here to explain code
    # expected_vuln_c = sum([k1**-l * cs[k1-1] for k1 in range(1, j1+1)])
    # assert(expected_vuln == expected_vuln_c)
    # log_factor_c = sum([-k1**-l * cs[k1-1] * np.log(k1)
    #                     for k1 in range(1, j1+1)])
    # assert(log_factor == log_factor_c)
    gap = t1*expected_vuln - as_cuml[j1-1]
    d_dt1 = expected_vuln*gap  # *2 removed from both since it's common and
    d_dl = t1*log_factor*gap   # positive, so preserves direction
    return (d_dt1, d_dl)


def gradient_descent(initial_T1, initial_ell, counts, learning_rate):
    """Plain per-point gradient descent on loss(); superseded by
    gradient_descent_cuml but kept for reference.

    counts is a sequence of objects with .vccs and .total attributes;
    levels with total == 0 are skipped.  Stops after 2000 steps or as
    soon as the loss stops improving.

    Returns (t1, ell, minloss).  Note: the early-stop keeps the last
    (non-improving) guess, matching the original update-then-test
    behaviour.
    """
    minloss = 1000000000
    guess = (initial_T1, initial_ell)
    for i in range(2000):
        gradients = [gradient(guess[0], guess[1], j0+1,
                              counts[j0].vccs/counts[j0].total)
                     for j0 in range(len(counts)) if counts[j0].total > 0]
        grad = (sum(g[0] for g in gradients),
                sum(g[1] for g in gradients))
        guess = (guess[0] - learning_rate*grad[0],
                 guess[1] - learning_rate*grad[1])
        losss = sum(loss(guess[0], guess[1], j0+1,
                         counts[j0].vccs/counts[j0].total)
                    for j0 in range(len(counts)) if counts[j0].total > 0)
        if losss >= minloss:
            break
        minloss = losss
    # BUGFIX: the fitted parameters were computed but never returned,
    # so every caller received None.
    return guess[0], guess[1], minloss
def calc_expected_vulns(ell, cs):
    """Generate running totals of the expected number of vulns at
    experience <= j1, without the T1 factor, for memoization.

    The j1-th yielded value is sum(k1**-ell * cs[k1-1] for k1 in 1..j1).
    Overflow saturates the running total to infinity.  Only the first
    len(cs) values are retrievable; after that, indexing cs raises
    IndexError (callers only ever take len(cs) values).
    """
    total = 0
    j1 = 1
    while True:
        try:
            total += j1**-ell * cs[j1-1]
        except OverflowError:
            total = float("inf")
        yield total
        j1 += 1


def calc_log_factors(ell, cs):
    """Generate running totals of the d/dell factor used by
    gradient_cuml: the j1-th yielded value is
    sum(-k1**-ell * cs[k1-1] * log(k1) for k1 in 1..j1).

    Overflow saturates to infinity, mirroring calc_expected_vulns.
    """
    total = 0
    j1 = 1
    while True:
        try:
            total += -j1**-ell * cs[j1-1] * np.log(j1)
        except OverflowError:
            total = float("inf")
        yield total
        j1 += 1


def _memos_and_loss(t1, ell, cs, cuml_vccs):
    # Recompute the ell-dependent memo tables (without the t1 factor)
    # and the total cumulative loss at (t1, ell).
    g = calc_expected_vulns(ell, cs)
    expected_vulns = [next(g) for _ in range(len(cs))]
    g = calc_log_factors(ell, cs)
    log_factors = [next(g) for _ in range(len(cs))]
    losss = sum(loss_cuml(t1, ell, j0+1, cs, cuml_vccs, expected_vulns[j0])
                for j0 in range(len(cs)) if cs[j0] > 0)
    return expected_vulns, log_factors, losss


def gradient_descent_cuml(initial_T1, initial_ell, cs, cuml_vccs,
                          learning_rate):
    """Gradient descent on the cumulative loss with adaptive,
    per-parameter learning rates.

    A step that increases the loss is rolled back (parameters and memo
    tables restored) and the rates are halved; every 100th iteration on
    the accepting path doubles them again.  Runs a fixed 1000
    iterations.

    Returns (t1, ell, last_gradient, minloss).
    """
    t1, ell = (initial_T1, initial_ell)
    expected_vulns, log_factors, minloss = \
        _memos_and_loss(t1, ell, cs, cuml_vccs)
    for i in range(1000):
        gradients = [gradient_cuml(t1, ell, j0+1, cs, cuml_vccs,
                                   expected_vulns[j0], log_factors[j0])
                     for j0 in range(len(cs)) if cs[j0] > 0]
        grad = (sum(g[0] for g in gradients),
                sum(g[1] for g in gradients))
        old_t1, old_ell = t1, ell
        old_expected_vulns = expected_vulns
        old_log_factors = log_factors
        t1 = t1 - grad[0]*learning_rate[0]
        ell = ell - grad[1]*learning_rate[1]
        expected_vulns, log_factors, losss = \
            _memos_and_loss(t1, ell, cs, cuml_vccs)
        if losss > minloss:
            # overshot: roll back the step and halve the rates
            t1, ell = old_t1, old_ell
            expected_vulns = old_expected_vulns
            log_factors = old_log_factors
            learning_rate = (learning_rate[0]/2, learning_rate[1]/2)
        else:
            if i % 100 == 0:
                learning_rate = (learning_rate[0]*2, learning_rate[1]*2)
            minloss = losss
    return t1, ell, grad, minloss


def _biased_cuml_vccs(counts, bias):
    # Cumulative VCC counts shifted by `bias` and clamped at zero.
    # Running sum keeps this O(n); the original re-summed counts[:j+1]
    # for every j, which is O(n^2).
    cuml = []
    running = bias
    for c in counts:
        running += c.vccs
        cuml.append(max(0, running))
    return cuml


def _best_fit(guesses_t1, guesses_ell, cs, cuml_vccs, learning_rate):
    # Run the descent from every pair of starting guesses and keep the
    # fit with the lowest loss.  Returns (best_t1, best_ell, best_minloss).
    best_minloss = math.inf
    best_t1 = 0
    best_ell = 0
    for t1_guess in guesses_t1:
        for ell_guess in guesses_ell:
            t1, ell, _grad, minloss = \
                gradient_descent_cuml(t1_guess, ell_guess, cs,
                                      cuml_vccs, learning_rate)
            if minloss < best_minloss:
                best_minloss = minloss
                best_ell = ell
                best_t1 = t1
    return best_t1, best_ell, best_minloss


def main(argv):
    """CLI entry point; see the argv[...] comments for the interface."""
    # a file where each line is a VCC commit hash, followed by the issues it
    # contributed to, comma separated
    vcc_file = argv[1]
    git_dirs = argv[2].split(':')
    # the paths in the git dir to filter on (use "" or . to use everything)
    project_paths = argv[3].split(':')
    # the directory where experiences are stored
    exp_dirs = vcclib.expdirs(argv[4].split(':'))
    assert len(git_dirs) == len(exp_dirs) and \
        len(git_dirs) == len(project_paths), \
        "each git dir needs one project path and one experience dir"
    guesses_t1 = [float(f) for f in argv[5].split(':')]
    guesses_ell = [float(f) for f in argv[6].split(':')]
    search = len(argv) > 7 and argv[7] == "search"
    vccs = vcclib.get_vccs(vcc_file)
    assert len(vccs) > 0, "found no VCCs (vcc_file: {})".format(vcc_file)
    counts = vcclib.count_all_commits(git_dirs, project_paths, exp_dirs, vccs)
    cs = [count.total for count in counts]
    learning_rate = (1e-14, 1e-14)
    if not search:
        # normal mode, run grad descent on actual data, then +/-1 for err bars
        for bias in [0, -1, +1]:
            cuml_vccs = _biased_cuml_vccs(counts, bias)
            for t1_guess in guesses_t1:
                for ell_guess in guesses_ell:
                    t1, ell, grad, minloss = \
                        gradient_descent_cuml(t1_guess, ell_guess, cs,
                                              cuml_vccs, learning_rate)
                    print(bias, t1_guess, ell_guess, t1, ell, grad, minloss,
                          flush=True)
    else:
        # search mode, run a binary search on 1 exp VCCs to flip ell to +
        # a reasonable starting point is from no change to "1/2 of VCCs are 1"
        bias_low = 0
        bias_high = len(vccs)
        # first, initial backoff to find upper bound
        best_ell = 0
        while best_ell <= 0:
            cuml_vccs = _biased_cuml_vccs(counts, bias_high)
            best_t1, best_ell, _ = _best_fit(guesses_t1, guesses_ell, cs,
                                             cuml_vccs, learning_rate)
            print(bias_high, best_t1, best_ell, flush=True)
            # no do-while loop in python
            if best_ell <= 0:
                bias_low = bias_high
                bias_high *= 2
        # now do the actual bisecting search, with the previous loop having
        # given us two values we know are above and below the target
        while bias_high - bias_low > 1:
            bias = (bias_high - bias_low)//2 + bias_low
            cuml_vccs = _biased_cuml_vccs(counts, bias)
            best_t1, best_ell, _ = _best_fit(guesses_t1, guesses_ell, cs,
                                             cuml_vccs, learning_rate)
            print(bias, best_t1, best_ell, flush=True)
            if best_ell > 0:
                bias_high = bias
            else:
                bias_low = bias
        # BUGFIX: the message previously said "Learning rate becomes
        # positive"; the quantity the bisection drives across zero is ell
        # (see the best_ell > 0 test above).
        print("ell becomes positive going from {} to {}".format(
            bias_low, bias_high))


if __name__ == '__main__':
    main(sys.argv)