#!/usr/bin/env python3

import sys
import numpy as np
import scipy.stats as sp
import subprocess
import matplotlib.pyplot as plt


class Commit:
    """A single commit: its hash, its author, and an optional VCC flag.

    ``xp`` (the commit's experience index) starts out as ``None`` and is
    filled in by ``get_experience``.
    """

    def __init__(self, commit_hash, author, vcc=None):
        self.commit_hash = commit_hash
        self.author = author
        # stored exactly as given; defaults to None when no flag is supplied
        self.is_vcc = vcc
        # 0-based line index of this commit in its author's history file
        self.xp = None

    def get_experience(self, commits, exp_dir):
        """Return the 0-based position of this commit in its author's history.

        The history is read from the file in ``exp_dir`` named after the
        author (with every "/" replaced by "_").  Each line of that file is
        expected to start with a commit hash, optionally followed by
        comma-separated fields.  While reading, the ``xp`` of *every* commit
        in ``commits.hash_to_commit`` whose hash appears in the file is set,
        so the file is read at most once per author as long as all of that
        author's commits are listed.

        commits -- object exposing a ``hash_to_commit`` mapping
        exp_dir -- directory holding the history files; a trailing "/" is
                   optional, and "" means paths relative to the current
                   directory

        Raises AssertionError if this commit's hash is not in the file.
        """
        if self.xp is None:
            # tolerate a directory given without its trailing separator
            if exp_dir and not exp_dir.endswith('/'):
                exp_dir += '/'
            with open(exp_dir + self.author.replace("/", "_")) as f:
                for xp, entry in enumerate(f):
                    # strip so a line holding only a hash (no comma) still
                    # matches despite its trailing newline
                    commit_hash = entry.split(',')[0].strip()
                    known = commits.hash_to_commit.get(commit_hash)
                    if known is not None:
                        known.xp = xp
        if self.xp is None:
            # raised explicitly (rather than via `assert`) so the check
            # survives `python -O`
            raise AssertionError(
                "author: {}\ncommit: {}\nis vcc: {}".format(
                    self.author, self.commit_hash, self.is_vcc))
        return self.xp


class Commits:
    """The commits reported by ``git log`` for one repository and path set.

    ``commits`` is the list of Commit objects in pipeline output order and
    ``hash_to_commit`` maps each commit hash to its Commit object.
    """

    def __init__(self, git_dir, paths, vccs=None):
        """
        Run ``git log`` in git_dir, limited to paths, and build one Commit
        per output line. The output is sorted numerically on the leading
        %ct timestamp (old to new), which is NOT necessarily the order the
        commits were applied in.
        NOTE(review): this was previously described as ordering by
        "authored time", but the format string uses %ct -- confirm which
        timestamp is intended.
        paths is a single string appended raw to the git command,
        so any necessary escaping, quoting, etc. should be applied prior.
        When vccs is non-empty, each Commit is flagged with whether its hash
        is in vccs; otherwise the flag is left as None.
        """
        git_log = "git -C " + git_dir + (
            " log --full-history --reverse --no-merges --use-mailmap"
            " --since=2012-04-09 --format='format:%ct\t%H\t%aN <%aE>' -- "
        ) + paths
        # sort on the timestamp column, then drop it (keep hash and author)
        command = git_log + " | sort -n | cut -f2,3"
        output = subprocess.check_output(
            command, shell=True, universal_newlines=True).strip()
        assert output

        self.commits = []
        self.hash_to_commit = {}
        for raw in output.splitlines():
            if '\\' in raw:
                # backslash escapes present: decode them, then reinterpret
                # the resulting bytes as UTF-8
                unescaped = raw.encode('latin1').decode('unicode_escape')
                raw = unescaped.encode('latin1').decode('utf-8')
            fields = raw.strip().split('\t')
            commit_hash, author = fields[0], fields[1]
            flag = (commit_hash in vccs) if vccs else None
            entry = Commit(commit_hash, author, flag)
            self.commits.append(entry)
            self.hash_to_commit[commit_hash] = entry


class GrowingList(list):
    """A list that pads itself when indexed at or past its current end.

    Reading or writing ``lst[i]`` with an integer ``i >= len(lst)`` first
    appends ``default()`` results until position ``i`` exists.  Negative
    indices and slices never trigger growth and behave as on a plain list.

    default -- zero-argument callable producing a fresh filler element
    """

    def __init__(self, default):
        super().__init__()
        self.default = default

    def _grow_to(self, index):
        """Append default elements until ``index`` is a valid position."""
        while index >= len(self):
            self.append(self.default())

    def __setitem__(self, index, value):
        # slices cannot be compared with an int; hand them to list untouched
        if not isinstance(index, slice):
            self._grow_to(index)
        list.__setitem__(self, index, value)

    def __getitem__(self, index):
        if not isinstance(index, slice):
            self._grow_to(index)
        return list.__getitem__(self, index)


class Counts:
    """Mutable pair of tallies: ``total`` and ``vccs``, both defaulting to 0."""

    def __init__(self, total=0, vccs=0):
        self.total, self.vccs = total, vccs


def count_commits(commits, vccs, exp_dir, counts=None):
    """Tally commits and VCC-attributed bugs per author-experience level.

    For every commit in ``commits.commits`` the experience index j is looked
    up via ``commit.get_experience(commits, exp_dir)`` and ``counts[j].total``
    is incremented.  When the commit is flagged as a VCC,
    ``counts[j].vccs`` is incremented once for each bug in
    ``vccs[commit.commit_hash]`` that has not already been seen earlier in
    this call, so a bug is attributed only to the first commit encountered
    that contributed to it.

    commits -- object with a ``commits`` sequence of Commit objects
    vccs    -- mapping of commit hash to an iterable of bug identifiers
    exp_dir -- directory of per-author history files
    counts  -- optional accumulator from a previous call; any sequence of
               Counts is accepted and is extended as needed

    Returns the accumulator as a GrowingList of Counts.
    """
    if not isinstance(counts, GrowingList):
        # A plain list (or None) cannot auto-extend when a later commit has
        # a larger experience index, so move its entries into a GrowingList.
        grown = GrowingList(Counts)
        if counts:
            grown.extend(counts)
        counts = grown
    seen_bugs = set()
    for commit in commits.commits:
        bucket = counts[commit.get_experience(commits, exp_dir)]
        if commit.is_vcc:
            for bug in vccs[commit.commit_hash]:
                if bug not in seen_bugs:
                    bucket.vccs += 1
                    seen_bugs.add(bug)
        bucket.total += 1
    return counts


def main(argv):
    """Fit and plot a learning curve of VCC rate against author experience.

    argv is the full command line (argv[0] is the program name):
      argv[1] -- a file where each line is a VCC commit hash, followed by
                 the issues it contributed to, comma separated
      argv[2] -- ':'-separated git directories
      argv[3] -- ':'-separated paths in each git dir to filter on
                 (use "" or . to use everything)
      argv[4] -- ':'-separated directories where experiences are stored
      argv[5] -- the path+name of where to save the resulting plot

    Prints the regression result and the derived learning coefficient and
    intercept, then saves the plot.  Exits with a message when the
    arguments are malformed or no VCC is found among the commits.
    """
    if len(argv) < 6:
        prog = argv[0] if argv else "learning_curve"
        sys.exit("usage: {} VCC_FILE GIT_DIRS PROJECT_PATHS EXP_DIRS "
                 "PLOT_PATH".format(prog))
    vcc_file = argv[1]
    git_dirs = argv[2].split(':')
    project_paths = argv[3].split(':')
    # Guarantee a trailing separator, since file names are appended
    # directly.  An empty entry is left alone (paths relative to the cwd).
    exp_dirs = [d if not d or d.endswith('/') else d + '/'
                for d in argv[4].split(':')]
    if not len(git_dirs) == len(exp_dirs) == len(project_paths):
        sys.exit("each git dir needs one project path and one experience dir")
    plot_path = argv[5]

    vccs = {}
    with open(vcc_file) as f:
        for line in f:
            fields = line.strip().split(',')
            if not fields[0]:
                continue  # blank line
            vccs[fields[0]] = set(fields[1:])

    # Keep the accumulator returned by count_commits as-is between
    # repositories so it can keep growing; flatten it only afterwards.
    counts = None
    for git_dir, project_path, exp_dir in zip(git_dirs, project_paths,
                                              exp_dirs):
        commits = Commits(git_dir, project_path, vccs)
        counts = count_commits(commits, vccs, exp_dir, counts)
    counts = list(counts)
    n = len(counts)

    def divide(a, b):
        """make division errors (primarily, divide by zero) return None"""
        if not b:
            return None
        return a / b

    # running totals up to and including each experience level
    cuml_vccs, cuml_tot = [], []
    running_vccs = running_tot = 0
    for c in counts:
        running_vccs += c.vccs
        running_tot += c.total
        cuml_vccs.append(running_vccs)
        cuml_tot.append(running_tot)
    cuml_frac = [divide(v, t) for v, t in zip(cuml_vccs, cuml_tot)]

    # to prevent regressing on leading 0 values (i.e., the first n values of j
    # where no VCC has been seen yet, so there is nothing to take the log
    # of), we need to find and skip them
    offset = next((i for i, v in enumerate(cuml_vccs) if v != 0), None)
    if offset is None:
        sys.exit("no VCCs found among the selected commits; nothing to fit")

    # experience j is 1-based in the model, hence the +1 on the indices
    xs = np.log(list(range(offset + 1, n + 1)))
    ys = np.log(cuml_frac[offset:])
    regression = sp.linregress(xs, ys)

    print(regression)
    learning_coef = -regression.slope
    learning_intercept = -np.exp(regression.intercept) * (learning_coef - 1)
    print("l={}, T1={}".format(learning_coef, learning_intercept))

    # observed fractions as blue dots, fitted curve as a red dashed line
    all_xs = np.log(list(range(1, n + 1)))
    plt.plot(
        list(range(offset, n)), cuml_frac[offset:], 'b.',
        list(range(n)),
        np.exp(all_xs * regression.slope + regression.intercept), 'r--'
    )
    plt.xlabel("j=Experience")
    plt.ylabel("Tj=P(error)")
    plt.xlim(left=0)
    plt.savefig(plot_path)

# Run the analysis only when this file is executed as a script; the full
# command line (including the program name at index 0) is handed to main.
if __name__ == '__main__':
    main(sys.argv)