Browse source

Added feature scripts

Runs docker containers on a repository and generates features such as
coupling.

Added scripts for extracting code_churns, coupling_features,
diffusion_features, experience_features, history_features and
purpose_features.
Oscar Svensson 5 years ago
parent
commit
a0e8e9537f

+ 15 - 0
code/data_assembler/CodeMaatDockerFile

@@ -0,0 +1,15 @@
+FROM clojure:alpine
+VOLUME /data
+LABEL description="code-maat docker image."
+
+ARG dest=/usr/src/code-maat
+
+RUN mkdir -p $dest
+WORKDIR $dest
+COPY project.clj $dest
+RUN lein deps
+COPY . $dest
+RUN mv "$(lein uberjar | sed -n 's/^Created \(.*standalone\.jar\)/\1/p')" app-standalone.jar
+
+RUN apk update
+RUN apk add git
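
A minimal sketch of how this image might be built with the Python docker SDK that assemble_features.py below already uses; the tag "code-maat" matches that script's default --image value, while the build context path is an assumption.

import docker

client = docker.from_env()
# Build the code-maat image from the directory containing CodeMaatDockerFile
# (the path is an assumption). Newer docker SDK versions return (image, logs),
# older ones return only the image, so the result is not unpacked here.
result = client.images.build(
    path="code/data_assembler",
    dockerfile="CodeMaatDockerFile",
    tag="code-maat")
print("built:", result)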

+ 232 - 0
code/data_assembler/assemble_code_churns.py

@@ -0,0 +1,232 @@
+"""
+Script to extract code churns.
+"""
+__author__ = "Oscar Svensson"
+__copyright__ = "Copyright (c) 2018 Axis Communications AB"
+__license__ = "MIT"
+
+import csv
+import os
+import sys
+import time
+
+from argparse import ArgumentParser
+
+from multiprocessing import Process, Manager, cpu_count
+from pygit2 import Repository, GIT_SORT_REVERSE, GIT_SORT_TOPOLOGICAL
+from tqdm import tqdm
+
+# Global variables
+MANAGER = Manager()
+RES = MANAGER.dict()
+
+
+def parse_code_churns(pid, repo_path, branch, start, stop=-1):
+    """
+    Function intended to be run by a worker process. It extracts the code churns
+    for a set of commits and stores them in the RES dict.
+    """
+    repo = Repository(repo_path)
+
+    head = repo.references.get(branch)
+    commits = list(
+        repo.walk(head.target, GIT_SORT_TOPOLOGICAL | GIT_SORT_REVERSE))
+
+    start = start - 1 if (start > 0) else start
+    commits = commits[start:stop] if (stop != -1) else commits[start:]
+
+    code_churns = [[] for c in range(len(commits))]
+    for i, commit in enumerate(tqdm(commits[1:], position=pid)):
+        diff = repo.diff(commits[i], commit)
+
+        tree = commit.tree
+        patches = [p for p in diff]
+        stats = diff.stats
+
+        # Count the total lines of code and find the biggest file that has been changed
+        total_tloc = 0
+        line_of_code_old = 0
+        for patch in patches:
+            if patch.delta.is_binary:
+                continue
+            new_file = patch.delta.new_file
+
+            # Total lines of code
+            total_tloc += get_file_lines_of_code(repo, tree, new_file)
+
+            old_file = patch.delta.old_file
+            # Total lines of code in the old file
+            line_of_code_old = max(
+                line_of_code_old, get_file_lines_of_code(repo, tree, old_file))
+
+        # Churned lines of code
+        cloc = stats.insertions
+        # Deleted lines of code
+        dloc = stats.deletions
+
+        # Churned files
+        files_churned = len(patches)
+
+        # File count
+        num_files = count_files(tree, repo)
+
+        # Apply relative code churns
+        measure_one = float(cloc) / total_tloc if (total_tloc > 0) else float(cloc)
+        measure_two = float(dloc) / total_tloc if (total_tloc > 0) else float(dloc)
+        measure_three = (float(files_churned) / num_files if (num_files > 0)
+                         else float(files_churned))
+
+        line_of_code_old = float(line_of_code_old)
+
+        # Churn features
+        code_churns[i].append(str(commit.hex))
+        code_churns[i].append(str(measure_one))
+        code_churns[i].append(str(measure_two))
+        code_churns[i].append(str(measure_three))
+        code_churns[i].append(str(line_of_code_old))
+
+    RES[pid] = code_churns
+
+
+def count_files(tree, repo):
+    """
+    Count how many files there are in a repository.
+    """
+    num_files = 0
+    trees = []
+    visited = set()
+    visited.add(tree.id)
+    trees.append(tree)
+
+    while trees:
+        current_tree = trees.pop()
+        for entry in current_tree:
+            if entry.type == "tree":
+                if entry.id not in visited:
+                    trees.append(repo[entry.id])
+                    visited.add(entry.id)
+            else:
+                num_files += 1
+    return num_files
+
+
+def get_file_lines_of_code(repo, tree, dfile):
+    """
+    Count how many lines of code there are in a file.
+    """
+    tloc = 0
+    try:
+        blob = repo[tree[dfile.path].id]
+
+        tloc = len(str(blob.data).split('\\n'))
+    except Exception as _:
+        return tloc
+    return tloc
+
+
+def get_code_churns(repo_path, branch):
+    """
+    General function for extracting code churns. It first extracts the code churns for
+    the first commit and then starts a number of processes (equal to the number of cores
+    on the machine), each of which extracts the code churns for a share of the remaining commits.
+    """
+    repo = Repository(repo_path)
+
+    head = repo.references.get(branch)
+
+    commits = list(
+        repo.walk(head.target, GIT_SORT_TOPOLOGICAL | GIT_SORT_REVERSE))
+    code_churns = [[]]
+
+    initial = commits[0]
+
+    # Relative code churns
+    measure_one = 0.0
+    measure_two = 0.0
+    measure_three = 1.0
+
+    line_of_code_old = 0.0
+
+    code_churns[0].append(str(initial.hex))
+    code_churns[0].append(str(measure_one))
+    code_churns[0].append(str(measure_two))
+    code_churns[0].append(str(measure_three))
+    code_churns[0].append(str(line_of_code_old))
+
+    # Check how many processes can be spawned
+    cpus = cpu_count()
+    print("Using {} cpus...".format(cpus))
+
+    # Split the commits into equally sized parts.
+    quote, remainder = divmod(len(commits), cpus)
+
+    processes = [
+        Process(
+            target=parse_code_churns,
+            args=(i, repo_path, branch, i * quote + min(i, remainder),
+                  (i + 1) * quote + min(i + 1, remainder))) for i in range(cpus)
+    ]
+
+    for process in processes:
+        process.start()
+
+    start_time = time.time()
+    for process in processes:
+        process.join()
+    end_time = time.time()
+
+    print("Done")
+    print("Overall processing time {}".format(end_time - start_time))
+
+    # Assemble the results
+    churns = []
+    for _, churn in RES.items():
+        churns.extend(churn)
+
+    churns = list(reversed(churns))
+    churns.append(code_churns[0])
+    return churns
+
+def save_churns(churns, path="./results/code_churns_features_multithread.csv"):
+    """
+    Saves the code churns to a csv file.
+    """
+    with open(path, 'w') as csv_file:
+        writer = csv.writer(csv_file)
+        writer.writerow([
+            "commit", "lines_of_code_added", "lines_of_code_deleted",
+            "files_churned", "line_of_code_old"
+        ])
+
+        for row in churns:
+            if row:
+                writer.writerow([row[0], row[1], row[2], row[3], row[4]])
+
+
+if __name__ == "__main__":
+    PARSER = ArgumentParser(description="Utility to extract code churns from" +
+                            " a repository or a single commit.")
+
+    PARSER.add_argument(
+        "--repository",
+        "-r",
+        type=str,
+        default="./repos/jenkins",
+        help="Path to local git repository.")
+    PARSER.add_argument(
+        "--branch",
+        "-b",
+        type=str,
+        default="refs/heads/master",
+        help="Which branch to use.")
+
+    ARGS = PARSER.parse_args()
+    REPOPATH = ARGS.repository
+    BRANCH = ARGS.branch
+
+    if not os.path.exists(REPOPATH):
+        print("The repository path does not exist!")
+        sys.exit(1)
+
+    CHURNS = get_code_churns(REPOPATH, BRANCH)
+    save_churns(CHURNS)
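
The commit list is split among the worker processes with divmod; a small illustration with assumed numbers (10 commits, 4 cpus) of the start/stop bounds handed to parse_code_churns, which additionally moves each start back by one commit inside the worker so every commit can be diffed against its predecessor.

def chunk_bounds(n_commits, cpus):
    # Same arithmetic as in get_code_churns: quote commits per process plus
    # one extra commit for the first `remainder` processes.
    quote, remainder = divmod(n_commits, cpus)
    return [(i * quote + min(i, remainder),
             (i + 1) * quote + min(i + 1, remainder)) for i in range(cpus)]

print(chunk_bounds(10, 4))   # [(0, 3), (3, 6), (6, 8), (8, 10)]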

+ 135 - 0
code/data_assembler/assemble_coupling_features.py

@@ -0,0 +1,135 @@
+"""
+Script to extract coupling features from code maat analysis files.
+"""
+__author__ = "Oscar Svensson"
+__copyright__ = "Copyright (c) 2018 Axis Communications AB"
+__license__ = "MIT"
+
+import csv
+import os
+
+from git import Repo
+import numpy as np
+from tqdm import tqdm
+
+def save_features(features, res_path):
+    """
+    Save the coupling features to a csv file.
+    """
+    print("Saving to {}".format(os.path.abspath(res_path)))
+    with open(os.path.abspath(res_path), 'w') as feat_file:
+        feat_writer = csv.writer(feat_file)
+
+        feat_writer.writerow([
+            "commit", "number_of_cruical_files",
+            "number_of_moderate_risk_cruical_files",
+            "number_of_high_risk_cruical_files",
+            "number_of_non_modified_change_couplings"
+        ])
+        for feature in features:
+            feat_writer.writerow(feature)
+
+
+def get_features():
+    """
+    Get the coupling features from the per-commit code-maat coupling analysis files.
+    """
+    commits = list(REPO.iter_commits('master'))
+
+    couplings = {}
+    features = []
+
+    for hexsha in os.listdir("/h/oskars/data_all"):
+        couplings[hexsha] = os.path.join(
+            os.path.join("/h/oskars/data_all", hexsha),
+            "{}_coupling.log.res".format(hexsha))
+
+    features.append([commits[0].hexsha, 0, 0, 0, 0])
+    for i in tqdm(range(1, len(commits))):
+        first = commits[i - 1]
+        second = commits[i]
+
+        diff = first.diff(second)
+
+        paths = [d.b_path for d in diff]
+
+        cruical_moderate = 0
+        cruical_high = 0
+        cruical_files = 0
+        cruical_non_modified_couplings = 0
+
+        if second.hexsha in couplings:
+            cruical_commits = 0
+            cruical_degrees = []
+
+            with open(couplings[second.hexsha], 'r') as csvfile:
+                coup_rows = csv.reader(csvfile)
+                files = {}
+                file_coupling_graph = {}
+
+                next(coup_rows)
+                for row in coup_rows:
+                    degree = float(row[2])
+
+                    # Is this correct?
+                    in_files = bool(row[0] in files)
+                    if in_files and files[row[0]] > degree:
+                        files[row[0]] = degree
+                    elif not in_files:
+                        files[row[0]] = degree
+
+                    is_in_coupling_graph = bool(row[0] in file_coupling_graph)
+                    if is_in_coupling_graph and degree >= 75:
+                        file_coupling_graph[row[0]].append(row[1])
+                    elif degree >= 50:
+                        file_coupling_graph[row[0]] = [row[1]]
+
+                    # Is this correct?
+                    in_files = bool(row[1] in files)
+                    if in_files and files[row[1]] > degree:
+                        files[row[1]] = degree
+                    elif not in_files:
+                        files[row[1]] = degree
+
+                    is_in_coupling_graph = bool(row[1] in file_coupling_graph)
+                    if is_in_coupling_graph and degree >= 75:
+                        file_coupling_graph[row[1]].append(row[0])
+                    elif degree >= 50:
+                        file_coupling_graph[row[1]] = [row[0]]
+
+                for path in paths:
+                    if path in files:
+                        cruical_commits = cruical_commits + 1
+                        cruical_degrees.append(files[path])
+                        cruical_files = cruical_files + 1
+
+                # Count couplings from modified files to files that were not modified.
+                set_path = set(paths)
+                for path in paths:
+                    if path in file_coupling_graph:
+                        file_couplings = set(file_coupling_graph[path])
+                        cruical_non_modified_couplings = cruical_non_modified_couplings + len(
+                            file_couplings - set_path)
+
+                inds = np.digitize(cruical_degrees, [25, 50, 75, 100])
+                cruical_moderate = sum([1 for i in inds if i == 3])
+                cruical_high = sum([1 for i in inds if i == 4])
+
+        features.append([
+            second.hexsha,
+            str(cruical_files),
+            str(cruical_moderate),
+            str(cruical_high),
+            str(cruical_non_modified_couplings)
+        ])
+
+    return features
+
+
+if __name__ == "__main__":
+    global REPO
+    REPO = Repo("../../jenkins")
+    REPO = Repo("./repos/jenkins")
+
+    FEATURES = get_features()
+    save_features(FEATURES, './results/coupling_features.csv')
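
A quick check, with made-up coupling degrees, of how np.digitize bins the degrees collected in get_features: with bins [25, 50, 75, 100], index 3 covers degrees in [75, 100) and index 4 covers degrees of 100 or more.

import numpy as np

degrees = [10, 40, 60, 80, 100]
inds = np.digitize(degrees, [25, 50, 75, 100])
print(list(inds))                        # [0, 1, 2, 3, 4]
print(sum(1 for i in inds if i == 3))    # counted as "moderate risk" -> 1
print(sum(1 for i in inds if i == 4))    # counted as "high risk" -> 1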

+ 254 - 0
code/data_assembler/assemble_diffusion_features.py

@@ -0,0 +1,254 @@
+"""
+Script for extracting diffusion features from a git repository.
+"""
+__author__ = "Oscar Svensson"
+__copyright__ = "Copyright (c) 2018 Axis Communications AB"
+__license__ = "MIT"
+
+import csv
+import os
+import sys
+import time
+
+from argparse import ArgumentParser
+from multiprocessing import Process, Manager, cpu_count
+from numpy import log2
+from pygit2 import Repository, GIT_SORT_TOPOLOGICAL, GIT_SORT_REVERSE
+from tqdm import tqdm
+
+MANAGER = Manager()
+RES = MANAGER.dict()
+
+
+def count_diffing_subsystems(subsystems):
+    """
+    Recursively count the number of subsystems in a nested subsystem mapping.
+    """
+    number = 0
+    for system in subsystems.values():
+        number = number + count_diffing_subsystems(system)
+
+    return number + len(subsystems.keys())
+
+def count_entropy(file_changes, total_change):
+    """
+    Function to count entropy for some file changes.
+    """
+    if total_change == 0:
+        return 0
+    return sum([
+        -1 * (float(x) / total_change) * (log2(float(x) / total_change)
+                                          if x > 0 else 0)
+        for x in file_changes
+    ])
+
+
+def parse_diffusion_features(pid, repo_path, branch, start, stop=-1):
+    """
+    Function to extract diffusion features from a set of commits.
+    """
+    repo = Repository(repo_path)
+
+    head = repo.references.get(branch)
+    commits = list(
+        repo.walk(head.target, GIT_SORT_TOPOLOGICAL | GIT_SORT_REVERSE))
+
+    start = start - 1 if (start > 0) else start
+    commits = commits[start:stop] if (stop != -1) else commits[start:]
+
+    features = [[] for c in range(len(commits))]
+    for i, commit in enumerate(tqdm(commits[1:], position=pid)):
+        diff = repo.diff(commits[i], commit)
+
+        patches = [p for p in diff]
+
+        # Extract all different subsystems that have been modified
+        modules = set([])
+        subsystems_mapping = {}
+        entropy_change = 0
+
+        file_changes = []
+        total_change = 0
+        for patch in patches:
+            # Skip binary files
+            if patch.delta.is_binary:
+                continue
+            _, addition, deletions = patch.line_stats
+            total_change = total_change + (addition + deletions)
+            file_changes.append(addition + deletions)
+
+            # Store all subsystems
+            fpath = patch.delta.new_file.path
+            subsystems = fpath.split('/')[:-1]
+
+            root = subsystems_mapping
+            for system in subsystems:
+                if system not in root:
+                    root[system] = {}
+                root = root[system]
+            if subsystems:
+                modules.add(subsystems[0])
+
+        # Check how many subsystems that have been touched
+        modified_systems = count_diffing_subsystems(subsystems_mapping)
+
+        # Calculate the entropy for the commit
+        entropy_change = count_entropy(file_changes, total_change)
+
+        # Add all features
+        features[i].append(str(commit.hex))
+        features[i].append(str(float(modified_systems)))
+        features[i].append(str(float(len(modules))))
+        features[i].append(str(float(entropy_change)))
+
+    RES[pid] = features
+
+def parse_tree(tree, repo):
+    """
+    Recursively parse a git tree and return the total number of lines, the
+    number of lines per file and the number of subdirectories.
+    """
+    found_sub_entries = 0
+    additions = 0
+    file_additions = []
+    tree = repo[tree.id]
+
+    for entry in tree:
+        if entry.type == "bin":
+            continue
+        if entry.type == "tree":
+            sub_additions, sub_file_additions, sub_entries = parse_tree(
+                entry, repo)
+            found_sub_entries += (1 + sub_entries)
+            additions += sub_additions
+            file_additions.extend(sub_file_additions)
+        else:
+            try:
+                sub_addition = len(str(repo[entry.id]).split('\n'))
+                additions += sub_addition
+                file_additions.append(sub_addition)
+            except Exception as ex:
+                print(ex)
+                continue
+
+    return additions, file_additions, found_sub_entries
+
+def get_diffusion_features(repo_path, branch):
+    """
+    Function that extracts the diffusion features of the first commit. It then starts
+    a number of processes (equal to the number of cores on the machine) and
+    distributes the remaining commits among them.
+    """
+    repo = Repository(repo_path)
+
+    head = repo.references.get(branch)
+
+    commits = list(
+        repo.walk(head.target, GIT_SORT_TOPOLOGICAL | GIT_SORT_REVERSE))
+    initial = commits[0]
+    init_tree = initial.tree
+
+    # Count initial total lines of code
+    init_total_additions = 0
+    init_file_addtions = []
+
+    init_subdirectories = 0
+    init_modules = 0
+
+    for entry in init_tree:
+        if entry.type == "tree":
+            added, file_additions, subdirectories = parse_tree(entry, repo)
+
+            init_modules += 1
+            init_file_addtions.extend(file_additions)
+            init_total_additions += added
+            init_subdirectories += subdirectories
+        else:
+            try:
+                additions = len(str(repo[entry.id]).split('\n'))
+                init_total_additions += additions
+                init_file_addtions.append(additions)
+            except:
+                continue
+    diffusion_features = []
+    diffusion_features.append(initial.hex)
+    diffusion_features.append(init_subdirectories)
+    diffusion_features.append(init_modules)
+    diffusion_features.append(
+        count_entropy(init_file_addtions, init_total_additions))
+
+    # Check how many processes can be spawned
+    cpus = cpu_count()
+    print("Using {} cpus...".format(cpus))
+    # Divide the commits equally among the processes.
+    quote, remainder = divmod(len(commits), cpus)
+
+    processes = [
+        Process(
+            target=parse_diffusion_features,
+            args=(i, repo_path, branch, i * quote + min(i, remainder),
+                  (i + 1) * quote + min(i + 1, remainder))) for i in range(cpus)
+    ]
+
+    for process in processes:
+        process.start()
+
+    start_time = time.time()
+    for process in processes:
+        process.join()
+    end_time = time.time()
+
+    print("Done")
+    print("Overall processing time {}".format(end_time - start_time))
+
+    # Assemble the results
+    features = []
+    for _, feat in RES.items():
+        features.extend(feat)
+    features = list(reversed(features))
+    features.append(diffusion_features)
+    return features
+
+def save_diffusion_features(diffusion_features,
+                            path="./results/diffusion_features.csv"):
+    """
+    Save the diffusion features to a csv file.
+    """
+    with open(path, 'w') as csv_file:
+        writer = csv.writer(csv_file)
+        writer.writerow([
+            "commit", "modified_subsystems", "modified_subdirectories",
+            "entropy"
+        ])
+        for row in diffusion_features:
+            if row:
+                writer.writerow([row[0], row[1], row[2], row[3]])
+
+
+if __name__ == "__main__":
+    PARSER = ArgumentParser(
+        description="Utility to extract diffusion features from" +
+        " a repository or a single commit.")
+
+    PARSER.add_argument(
+        "--repository",
+        "-r",
+        type=str,
+        default="./repos/jenkins",
+        help="Path to local git repository.")
+    PARSER.add_argument(
+        "--branch",
+        "-b",
+        type=str,
+        default="refs/heads/master",
+        help="Which branch to use.")
+
+    ARGS = PARSER.parse_args()
+    REPOPATH = ARGS.repository
+    BRANCH = ARGS.branch
+    if not os.path.exists(REPOPATH):
+        print("The repository path does not exist!")
+        sys.exit(1)
+
+    DIFFUSION_FEATURES = get_diffusion_features(REPOPATH, BRANCH)
+    save_diffusion_features(DIFFUSION_FEATURES)
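
A tiny worked example, with toy change counts, of the count_entropy function above: a change spread evenly over two files gives one bit of entropy, a skewed change gives less.

from numpy import log2

def count_entropy(file_changes, total_change):
    # Shannon entropy of the change distribution, as defined above.
    if total_change == 0:
        return 0
    return sum([
        -1 * (float(x) / total_change) * (log2(float(x) / total_change)
                                          if x > 0 else 0)
        for x in file_changes
    ])

print(count_entropy([5, 5], 10))   # 1.0
print(count_entropy([7, 3], 10))   # ~0.88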

+ 244 - 0
code/data_assembler/assemble_experience_features.py

@@ -0,0 +1,244 @@
+"""
+Script for extracting the experience features in a software repository.
+"""
+__author__ = "Oscar Svensson"
+__copyright__ = "Copyright (c) 2018 Axis Communications AB"
+__license__ = "MIT"
+
+import csv
+import json
+import sys
+import time
+
+from argparse import ArgumentParser
+from datetime import datetime
+from numpy import floor
+from pygit2 import Repository, GIT_SORT_TOPOLOGICAL, GIT_SORT_REVERSE
+from tqdm import tqdm
+
+def set_to_list(obj):
+    """
+    Helper function to turn sets to lists and floats to strings.
+    """
+    if isinstance(obj, set):
+        return list(obj)
+    if isinstance(obj, float):
+        return str('%.15g' % obj)
+    raise TypeError
+
+
+def get_files_in_tree(tree, repo):
+    """
+    Function to get the files in a tree.
+    """
+    files = set()
+    for entry in tree:
+        if entry.type == "tree":
+            sub_files = [(f[0], "{}/{}".format(entry.name, f[1]))
+                         for f in get_files_in_tree(repo[entry.id], repo)]
+            files.update(sub_files)
+        else:
+            blob = repo[entry.id]
+            if not blob.is_binary:
+                if entry.name.endswith("java"):
+                    files.add((entry.hex, entry.name))
+    return files
+
+
+def get_diffing_files(commit, parent, repo):
+    """
+    Function to get the files that differ between two commits.
+    """
+    diff = repo.diff(parent, commit)
+
+    patches = [p for p in diff]
+
+    files = set()
+
+    for patch in patches:
+        if patch.delta.is_binary:
+            continue
+        nfile = patch.delta.new_file
+        files.add((nfile.id, nfile.path, patch.delta.status))
+
+    return files
+
+def save_experience_features_graph(repo_path, branch, graph_path):
+    """
+    Function to get and save the experience graph.
+    """
+    repo = Repository(repo_path)
+    head = repo.references.get(branch)
+
+    commits = list(
+        repo.walk(head.target, GIT_SORT_TOPOLOGICAL | GIT_SORT_REVERSE))
+    current_commit = repo.head.target
+
+    start_time = time.time()
+
+    current_commit = repo.get(str(current_commit))
+    files = get_files_in_tree(current_commit.tree, repo)
+
+    all_authors = {}
+
+    author = current_commit.committer.name
+
+    all_authors[author] = {}
+    all_authors[author]['lastcommit'] = current_commit.hex
+    all_authors[author][current_commit.hex] = {}
+    all_authors[author][current_commit.hex]['prevcommit'] = ""
+    all_authors[author][current_commit.hex]["exp"] = 1
+    all_authors[author][current_commit.hex]["rexp"] = [[len(files), 1]]
+    all_authors[author][current_commit.hex]["sexp"] = {}
+
+    for i, commit in enumerate(tqdm(commits[1:])):
+        files = get_diffing_files(commit, commits[i], repo)
+
+        author = commit.committer.name
+        if author not in all_authors:
+            all_authors[author] = {}
+            all_authors[author]['lastcommit'] = commit.hex
+            all_authors[author][commit.hex] = {}
+            all_authors[author][commit.hex]['prevcommit'] = ""
+            all_authors[author][commit.hex]["exp"] = 1
+            all_authors[author][commit.hex]["rexp"] = [[len(files), 1.0]]
+            all_authors[author][commit.hex]["sexp"] = {}
+        else:
+            last_commit = all_authors[author]["lastcommit"]
+            all_authors[author]["lastcommit"] = commit.hex
+            all_authors[author][commit.hex] = {}
+            all_authors[author][commit.hex]['prevcommit'] = last_commit
+            all_authors[author][commit.hex][
+                'exp'] = 1 + all_authors[author][last_commit]['exp']
+
+            date_current = datetime.fromtimestamp(commit.commit_time)
+            date_last = datetime.fromtimestamp(repo.get(last_commit).commit_time)
+
+            diffing_years = abs(floor(float((date_current - date_last).days) / 365))
+
+            overall = all_authors[author][last_commit]['rexp']
+
+            all_authors[author][commit.hex][
+                'rexp'] = [[len(files), 1.0]] + [[e[0], e[1] + diffing_years]
+                                                 for e in overall]
+
+    with open(graph_path, 'w') as output:
+        json.dump(all_authors, output, default=set_to_list)
+
+    end_time = time.time()
+
+    print("Done")
+    print("Overall processing time {}".format(end_time - start_time))
+
+def load_experience_features_graph(path="./results/author_graph.json"):
+    """
+    Function to load the experience features graph.
+    """
+    file_graph = {}
+    with open(path, 'r') as inp:
+        file_graph = json.load(inp, parse_float=lambda x: float(x))
+    return file_graph
+
+
+def get_experience_features(graph, repo_path, branch):
+    """
+    Function that extracts the experience features from an experience graph.
+    """
+    repo = Repository(repo_path)
+    head = repo.references.get(branch)
+
+    commits = list(
+        repo.walk(head.target, GIT_SORT_TOPOLOGICAL | GIT_SORT_REVERSE))
+    current_commit = repo.head.target
+
+    files = get_files_in_tree(repo.get(str(current_commit)).tree, repo)
+
+    features = []
+
+    commit_feat = []
+    commit_feat.append(str(commits[0].hex))
+    commit_feat.append(str(1.0))
+    commit_feat.append(str(len(files)))
+    commit_feat.append(str(0.0))
+    features.append(commit_feat)
+
+    for _, commit in enumerate(tqdm(commits[1:])):
+        author = commit.committer.name
+
+        exp = graph[author][commit.hex]['exp']
+        rexp = graph[author][commit.hex]['rexp']
+        try:
+            rrexp = sum([float(float(e[0]) / (float(e[1]) + 1)) for e in rexp])
+        except:
+            print(author)
+            print(commit.hex)
+            print(rexp)
+            sys.exit(1)
+
+        commit_feat = []
+        commit_feat.append(str(commit.hex))
+        commit_feat.append(str(float(exp)))
+        commit_feat.append(str(float(rrexp)))
+        commit_feat.append(str(float(0)))
+        features.append(commit_feat)
+    return features
+
+
+def save_experience_features(history_features, path):
+    """
+    Save the experience features to a csv file.
+    """
+    with open(path, 'w') as csv_file:
+        writer = csv.writer(csv_file)
+        writer.writerow(["commit", "experience", "rexp", "sexp"])
+        for row in history_features:
+            if row:
+                writer.writerow([row[0], row[1], row[2], row[3]])
+
+
+if __name__ == "__main__":
+    PARSER = ArgumentParser(description="Utility to extract code churns from" +
+                            " a repository or a single commit.")
+
+    PARSER.add_argument(
+        "--repository",
+        "-r",
+        type=str,
+        default="./repos/jenkins",
+        help="Path to local git repository.")
+    PARSER.add_argument(
+        "--branch",
+        "-b",
+        type=str,
+        default="refs/heads/master",
+        help="Which branch to use.")
+    PARSER.add_argument(
+        "--save-graph",
+        "-sg",
+        action="store_true",
+        help="Generate a new graph for a repository.")
+    PARSER.add_argument(
+        "--graph-path",
+        "-gp",
+        type=str,
+        default="./results/author_graph.json",
+        help="The path to where the graph is stored.")
+    PARSER.add_argument(
+        "--output",
+        "-o",
+        type=str,
+        default="./results/experience_features.csv",
+        help="The path where the output is written.")
+
+    ARGS = PARSER.parse_args()
+    REPO_PATH = ARGS.repository
+    BRANCH = ARGS.branch
+    SAVE_GRAPH = ARGS.save_graph
+    GRAPH_PATH = ARGS.graph_path
+    OUTPUT = ARGS.output
+
+    if SAVE_GRAPH:
+        save_experience_features_graph(REPO_PATH, BRANCH, GRAPH_PATH)
+    GRAPH = load_experience_features_graph(GRAPH_PATH)
+    EXPERIENCE_FEATURES = get_experience_features(GRAPH, REPO_PATH, BRANCH)
+    save_experience_features(EXPERIENCE_FEATURES, OUTPUT)
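
A small sketch, with made-up author data and commit hashes, of the per-author node that save_experience_features_graph writes to author_graph.json, and of how get_experience_features turns the rexp list into the weighted recent-experience sum: each [files, age_in_years] pair contributes files / (age + 1).

node = {
    "lastcommit": "c2",
    "c2": {"prevcommit": "c1", "exp": 2,
           "rexp": [[3, 1.0], [10, 2.0]], "sexp": {}},
}

rexp = node["c2"]["rexp"]
rrexp = sum(float(e[0]) / (float(e[1]) + 1) for e in rexp)
print(rrexp)   # 3/2 + 10/3 = 4.83...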

+ 292 - 0
code/data_assembler/assemble_features.py

@@ -0,0 +1,292 @@
+"""
+Script that runs several docker containers which in turn run an analysis on
+a git repository.
+"""
+__author__ = "Oscar Svensson"
+__copyright__ = "Copyright (c) 2018 Axis Communications AB"
+__license__ = "MIT"
+
+import os
+import sys
+import shutil
+import time
+
+from argparse import ArgumentParser
+from distutils.dir_util import copy_tree
+from multiprocessing import Process, cpu_count
+from git import Repo
+from tqdm import tqdm
+
+import docker
+
+def start_container(client, image, name, repo_dir, result_dir):
+    """
+    Function that starts a docker container and mounts the repository and a
+    directory for the results into it.
+    """
+    for container in client.containers.list(all=True):
+        if name == container.name:
+            if container.status == "running":
+                container.kill()
+            container.remove()
+
+    path = os.path.abspath('./')
+
+    container = client.containers.run(
+        image,
+        name=name,
+        stdin_open=True,
+        detach=True,
+        volumes={
+            str(path + "/scripts"): {
+                'bind': '/root/scripts',
+                'mode': 'rw'
+            },
+            result_dir: {
+                'bind': '/root/results',
+                'mode': 'rw'
+            },
+            os.path.abspath(repo_dir): {
+                'bind': '/root/repo',
+                'mode': 'rw'
+            }
+        },
+        command="bash")
+
+    return container
+
+def run_command(container, command):
+    """
+    Function that executes a command inside a container.
+    """
+    return container.exec_run(
+        cmd="bash -c \"" + command + "\"", tty=True, privileged=True)
+
+
+def run_analysis(t_id, container, commits):
+    """
+    Function that runs the analysis command inside a docker container for each given commit.
+    """
+    for commit in tqdm(
+            commits, desc="Progress process {}".format(t_id), position=t_id):
+        run_command(container,
+                    "/root/scripts/analyse_commit {}".format(commit))
+
+def copy_repo(src, dest):
+    """
+    Helper function to copy a repository to another destination.
+    """
+    try:
+        shutil.copytree(src, dest)
+    except shutil.Error as exp:
+        print("Directory not copied. Error: {}".format(exp))
+    except OSError as exp:
+        print("Directory not copied. Error: {}".format(exp))
+
+def partion_commits(commits, partitions):
+    """
+    Function that divides the commits into evenly sized partitions.
+    """
+    quote, remainder = divmod(len(commits), partitions)
+    chunk_commits = [(i * quote + min(i, remainder), (i + 1) * quote + min(i + 1, remainder) - 1)
+                     for i in range(partitions)]
+    chunk_commits[-1] = (chunk_commits[-1][0], chunk_commits[-1][1] + 1)
+
+    commits = [[commit for commit in commits[chunk[0]:chunk[1]]]
+               for chunk in chunk_commits]
+    return commits
+
+def start_analysis(image, result_dir, commits=None, cpus=cpu_count()):
+    """
+    This function starts a docker container that can analyze a git repository. It starts several
+    containers if more than one cpu is available.
+    """
+    client = docker.from_env()
+    repo = Repo(REPO)
+
+    # Since the containers work directly on the repository, each one needs
+    # its own copy.
+    if not os.path.exists("./repos"):
+        os.makedirs("./repos")
+
+    repo_name = os.path.basename(os.path.normpath(REPO))
+
+    for cpu in range(cpus):
+        copy_repo(REPO, "./repos/{}{}".format(repo_name, cpu))
+
+    # Split the commits into even parts.
+    if not commits:
+        commits = [
+            str(commit.hexsha) for commit in list(repo.iter_commits('master'))
+        ]
+
+    commits = partion_commits(commits, cpus)
+
+    containers = []
+    for cpu in range(cpus):
+        container = start_container(
+            client,
+            image=image,
+            name="analysis_{}_cpu_{}".format(repo_name, cpu),
+            repo_dir="./repos/{}{}".format(repo_name, cpu),
+            result_dir=result_dir + "/data{}".format(cpu))
+        containers.append(container)
+
+    processes = [
+        Process(target=run_analysis, args=(i, containers[i], commits[i]))
+        for i in range(cpus)
+    ]
+    for process in processes:
+        process.start()
+    for process in processes:
+        process.join()
+
+    for container in containers:
+        print(container.status)
+        print(container.name)
+        if (container.status != "exited" and container.status != "dead"):
+            container.kill()
+        container.remove()
+
+    shutil.rmtree("./repos", ignore_errors=True)
+
+def parse_commits(commit_file):
+    """
+    Read the commits from a file and return the content.
+    """
+    if not os.path.exists(commit_file):
+        print("commit_file doesn't exist!!", file=sys.stderr)
+        sys.exit(1)
+
+    commits = []
+    with open(commit_file, 'r') as cfile:
+        commits = [line.strip() for line in cfile.readlines()]
+    return commits
+
+def assemble_directories(result_path, cpus=cpu_count()):
+    """
+    Copy all results into a single directory.
+    """
+    result_path = os.path.abspath(result_path)
+    paths = ["{}/data{}".format(result_path, i) for i in range(cpus)]
+
+    if not all([os.path.exists(p) for p in paths]):
+        print("data paths doesn't exists!", file=sys.stderr)
+        return
+
+    files = []
+
+    for path in paths:
+        for item in os.listdir(path):
+            commit = os.path.join(path, item)
+            corrupt = False if (len(os.listdir(commit)) == 2) else True
+
+            if (os.path.isdir(commit) and not corrupt):
+                files.append((commit, item))
+
+    print("Saving all analysed commits into a single directory: {}/data_all".
+          format(result_path))
+    if not os.path.exists("{}/data_all".format(result_path)):
+        os.makedirs("{}/data_all".format(result_path))
+
+    for file_tuple in files:
+        if not os.path.exists("{}/data_all/{}".format(result_path, file_tuple[1])):
+            copy_tree(file_tuple[0], "{}/data_all/{}".format(result_path, file_tuple[1]))
+
+def check_for_missing_commits(repo_path, result_path):
+    """
+    Controller function that checks if all commits have been analyzed.
+    """
+    result_dir = os.path.abspath(result_path)
+    if not os.path.exists(result_path):
+        print("Result path doesn't exist!", file=sys.stderr)
+        return
+
+    repo = Repo(repo_path)
+
+    current_commits = []
+    for item in os.listdir(result_dir):
+        current_commits.append(item)
+
+    all_repo_commits = [c.hexsha for c in list(repo.iter_commits('master'))]
+
+    missing_commits = set(all_repo_commits) - set(current_commits)
+
+    if missing_commits:
+        with open("./missing_commits.txt", 'w') as cfile:
+            for commit in missing_commits:
+                cfile.write(commit)
+                cfile.write('\n')
+        print("Wrote missing commits to missing_commits.txt")
+
+if __name__ == "__main__":
+    PARSER = ArgumentParser(description="Utility to run several docker " +
+                            "containers onto a git repository. " +
+                            "Each container is given a set of " +
+                            "commits and is instructed to run " +
+                            "an analysis on each one of them.")
+    PARSER.add_argument(
+        "--analyse", "-a", action="store_true", help="Run an analysation.")
+    PARSER.add_argument(
+        "--image",
+        "-i",
+        type=str,
+        default="code-maat",
+        help="Specification of which image to use.")
+    PARSER.add_argument(
+        "--repo-dir",
+        "-r",
+        type=str,
+        default="../../jenkins",
+        help="Specification of which repo to use.")
+    PARSER.add_argument(
+        "--result-dir",
+        "-rd",
+        type=str,
+        default="/h/oskars",
+        help="Specification of where to store the result.")
+    PARSER.add_argument(
+        "--commits",
+        "-c",
+        type=str,
+        default=None,
+        help="Direction to a file containing commits to analyse.")
+    PARSER.add_argument(
+        "--assemble",
+        "-as",
+        action="store_true",
+        help="Assemble the results into a single directory.")
+    PARSER.add_argument(
+        "--missing-commits",
+        "-mc",
+        action="store_true",
+        help="Check for non analysed commits.")
+
+    ARGS = PARSER.parse_args()
+
+    global REPO
+    REPO = os.path.abspath(ARGS.repo_dir)
+
+    if ARGS.commits:
+        COMMITS = parse_commits(ARGS.commits)
+    else:
+        COMMITS = []
+
+    CLIENT = docker.from_env()
+    if ARGS.analyse:
+        print("Starting the analysis using {} cpus...".format(cpu_count()))
+        START = time.time()
+        if COMMITS:
+            start_analysis(ARGS.image, ARGS.result_dir, commits=COMMITS)
+        else:
+            start_analysis(ARGS.image, ARGS.result_dir)
+        STOP = time.time()
+        print("Done in {}".format(
+            time.strftime('%H:%M:%S', time.gmtime(STOP - START))))
+        print("Results can be found in {}".format(
+            ARGS.result_dir + "/data{" +
+            ','.join(["{}".format(i) for i in range(cpu_count())]) + "}"))
+    if ARGS.assemble:
+        assemble_directories(ARGS.result_dir)
+    if ARGS.missing_commits:
+        check_for_missing_commits(ARGS.repo_dir, ARGS.result_dir)
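
The --commits option expects a plain text file with one commit hash per line (see parse_commits); a minimal sketch of producing such a file with GitPython, assuming a local clone at the path used elsewhere in these scripts.

from git import Repo

repo = Repo("./repos/jenkins")   # assumed local clone
with open("commits.txt", "w") as out:
    for commit in repo.iter_commits("master"):
        out.write(commit.hexsha + "\n")

# The file can then be passed to the analysis, e.g.:
#   python assemble_features.py --analyse --commits commits.txt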

+ 239 - 0
code/data_assembler/assemble_history_features.py

@@ -0,0 +1,239 @@
+"""
+Script to extract history features from a git repository.
+"""
+__author__ = "Oscar Svensson"
+__copyright__ = "Copyright (c) 2018 Axis Communications AB"
+__license__ = "MIT"
+
+import csv
+import json
+import time
+
+from argparse import ArgumentParser
+from pygit2 import Repository, GIT_SORT_TOPOLOGICAL, GIT_SORT_REVERSE
+from tqdm import tqdm
+
+def set_to_list(obj):
+    """
+    Helper function to convert a set to a list.
+    """
+    if isinstance(obj, set):
+        return list(obj)
+    raise TypeError
+
+def get_files_in_tree(tree, repo):
+    """
+    Extract the hex id and name of every file in a tree.
+    """
+    files = set()
+    for entry in tree:
+        if entry.type == "tree":
+            sub_files = [(f[0], "{}/{}".format(entry.name, f[1]))
+                         for f in get_files_in_tree(repo[entry.id], repo)]
+            files.update(sub_files)
+        else:
+            blob = repo[entry.id]
+            if not blob.is_binary:
+                if entry.name.endswith("java"):
+                    files.add((entry.hex, entry.name))
+    return files
+
+
+def get_diffing_files(commit, parent, repo):
+    """
+    Get the files that differ between two commits.
+    """
+    diff = repo.diff(parent, commit)
+
+    patches = [p for p in diff]
+
+    files = set()
+
+    for patch in patches:
+        if patch.delta.is_binary:
+            continue
+        nfile = patch.delta.new_file
+        files.add((nfile.id, nfile.path, patch.delta.status))
+
+    return files
+
+
+def save_history_features_graph(repo_path, branch, graph_path):
+    """
+    Track the developers that have worked on each file in a repository and save the
+    results in a graph for later use.
+    """
+    repo = Repository(repo_path)
+    head = repo.references.get(branch)
+
+    commits = list(
+        repo.walk(head.target, GIT_SORT_TOPOLOGICAL | GIT_SORT_REVERSE))
+    current_commit = repo.head.target
+
+    start_time = time.time()
+
+    all_files = {}
+    current_commit = repo.get(str(current_commit))
+    files = get_files_in_tree(current_commit.tree, repo)
+
+    for (_, name) in tqdm(files):
+        all_files[name] = {}
+        all_files[name]['lastcommit'] = current_commit.hex
+        all_files[name][current_commit.hex] = {}
+        all_files[name][current_commit.hex]["prevcommit"] = ""
+        all_files[name][current_commit.hex]["authors"] = [
+            current_commit.committer.name
+        ]
+
+    for i, commit in enumerate(tqdm(commits[1:])):
+        files = get_diffing_files(commit, commits[i], repo)
+        for (_, name, _) in files:
+            if name not in all_files:
+                all_files[name] = {}
+
+            last_commit = ""
+            if 'lastcommit' not in all_files[name]:
+                all_files[name]['lastcommit'] = commit.hex
+            else:
+                last_commit = all_files[name]['lastcommit']
+
+            all_files[name][commit.hex] = {}
+            all_files[name][commit.hex]["prevcommit"] = last_commit
+
+            authors = set([commit.committer.name])
+            if last_commit:
+                authors.update(all_files[name][last_commit]["authors"])
+            all_files[name][commit.hex]["authors"] = authors
+
+            all_files[name]['lastcommit'] = commit.hex
+
+    with open(graph_path, 'w') as output:
+        json.dump(all_files, output, default=set_to_list)
+
+    end_time = time.time()
+
+    print("Done")
+    print("Overall processing time {}".format(end_time - start_time))
+
+def load_history_features_graph(path):
+    """
+    Load the history features graph from a JSON file.
+    """
+    file_graph = {}
+    with open(path, 'r') as inp:
+        file_graph = json.load(inp)
+    return file_graph
+
+
+def get_history_features(graph, repo_path, branch):
+    """
+    Function that extracts the history features from a git repository.
+    They are the total number of authors, the total age and the total
+    number of unique changes.
+    """
+    repo = Repository(repo_path)
+    head = repo.references.get(branch)
+
+    commits = list(
+        repo.walk(head.target, GIT_SORT_TOPOLOGICAL | GIT_SORT_REVERSE))
+
+    features = []
+
+    commit_feat = []
+    commit_feat.append(str(commits[0].hex))
+    commit_feat.append(str(1.0))
+    commit_feat.append(str(0.0))
+    commit_feat.append(str(0.0))
+    features.append(commit_feat)
+
+    for i, commit in enumerate(tqdm(commits[1:])):
+        files = get_diffing_files(commit, commits[i], repo)
+
+        total_number_of_authors = set()
+        total_age = []
+        total_unique_changes = set()
+
+        for (_, name, _) in files:
+            sub_graph = graph[name][commit.hex]
+            total_number_of_authors.update(sub_graph['authors'])
+
+            prev_commit = sub_graph['prevcommit']
+            if prev_commit:
+                total_unique_changes.add(prev_commit)
+
+                prev_commit_obj = repo.get(prev_commit)
+
+                total_age.append(commit.commit_time -
+                                 prev_commit_obj.commit_time)
+
+        total_age = float(sum(total_age)) / len(total_age) if total_age else 0
+
+        commit_feat = []
+        commit_feat.append(str(commit.hex))
+        commit_feat.append(str(float(len(total_number_of_authors))))
+        commit_feat.append(str(float(total_age)))
+        commit_feat.append(str(float(len(total_unique_changes))))
+        features.append(commit_feat)
+    return features
+
+
+def save_history_features(history_features, path):
+    """
+    Function to save the history features as a csv file.
+    """
+    with open(path, 'w') as csv_file:
+        writer = csv.writer(csv_file)
+        writer.writerow(
+            ["commit", "number_of_authors", "age", "number_unique_changes"])
+        for row in history_features:
+            if row:
+                writer.writerow([row[0], row[1], row[2], row[3]])
+
+
+if __name__ == "__main__":
+    PARSER = ArgumentParser(description="Utility to extract code churns from" +
+                            " a repository or a single commit.")
+
+    PARSER.add_argument(
+        "--repository",
+        "-r",
+        type=str,
+        default="./repos/jenkins",
+        help="Path to local git repository.")
+    PARSER.add_argument(
+        "--branch",
+        "-b",
+        type=str,
+        default="refs/heads/master",
+        help="Which branch to use.")
+    PARSER.add_argument(
+        "--save-graph",
+        "-sg",
+        action="store_true",
+        help="Generate a new graph for a repository.")
+    PARSER.add_argument(
+        "--graph-path",
+        "-gp",
+        type=str,
+        default="./results/file_graph.json",
+        help="The path to where the graph is stored.")
+    PARSER.add_argument(
+        "--output",
+        "-o",
+        type=str,
+        default="./results/history_features.csv",
+        help="The path where the output is written.")
+
+    ARGS = PARSER.parse_args()
+    REPO_PATH = ARGS.repository
+    BRANCH = ARGS.branch
+    SAVE_GRAPH = ARGS.save_graph
+    GRAPH_PATH = ARGS.graph_path
+    OUTPUT = ARGS.output
+    print(SAVE_GRAPH)
+
+    if SAVE_GRAPH:
+        save_history_features_graph(REPO_PATH, BRANCH, GRAPH_PATH)
+    GRAPH = load_history_features_graph(GRAPH_PATH)
+    HISTORY_FEATURES = get_history_features(GRAPH, REPO_PATH, BRANCH)
+    save_history_features(HISTORY_FEATURES, OUTPUT)
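
The file graph written by save_history_features_graph keeps, per file, the latest commit plus one node per commit with the previous commit touching the file and the accumulated set of authors; a small sketch with made-up hashes of how get_history_features reads it.

file_node = {
    "lastcommit": "c3",
    "c1": {"prevcommit": "", "authors": ["alice"]},
    "c3": {"prevcommit": "c1", "authors": ["alice", "bob"]},
}

# Number of distinct authors seen for this file up to commit c3.
print(len(set(file_node["c3"]["authors"])))   # 2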

+ 149 - 0
code/data_assembler/assemble_labels.py

@@ -0,0 +1,149 @@
+"""
+Script to generate a labels file from a file produced by the SZZ algorithm.
+"""
+__author__ = "Oscar Svensson"
+__copyright__ = "Copyright (c) 2018 Axis Communications AB"
+__license__ = "MIT"
+
+import csv
+import json
+
+from argparse import ArgumentParser
+from datetime import datetime as dat
+from pygit2 import Repository, GIT_SORT_TOPOLOGICAL, GIT_SORT_REVERSE
+from tqdm import tqdm
+
+import matplotlib.pyplot as plt
+
+
+def get_labels(repo_path, branch, pair_file, last_commit):
+    """
+    Get the labels from a file produced by the SZZ algorithm. It contains
+    bug fixing commits and their respective bug introducing commits.
+    """
+    repo = Repository(repo_path)
+    head = repo.references.get(branch)
+
+    commits = []
+    for commit in list(
+            repo.walk(head.target, GIT_SORT_TOPOLOGICAL | GIT_SORT_REVERSE)):
+        commits.append(commit)
+        if commit.hex == last_commit:
+            break
+
+    commits = list(reversed(commits))
+
+    pairs = {}
+    with open(pair_file, 'r') as inp:
+        pairs = json.load(inp)
+
+    unique_pairs = set([p[1] for p in pairs])
+    labels = []
+
+    for commit in tqdm(commits):
+        label = [commit.hex, "1" if commit.hex in unique_pairs else "0"]
+        labels.append(label)
+
+    return labels
+
+
+def save_labels(labels, res_path):
+    """
+    Save the labels as a csv file.
+    """
+    with open(res_path, 'w') as out:
+        writer = csv.writer(out)
+        writer.writerow(["commit", "label"])
+        for label in labels:
+            writer.writerow(label)
+
+
+def save_label_distribution(repo_path, branch, labels, res_path):
+    """
+    Save a distribution of the labels over time.
+    """
+    ldict = set()
+    for label in labels:
+        if label[1] == "1":
+            ldict.add(label[0])
+
+    repo = Repository(repo_path)
+    head = repo.references.get(branch)
+
+    commits = list(repo.walk(head.target, GIT_SORT_TOPOLOGICAL))
+
+    start_year = dat.fromtimestamp(commits[-1].commit_time).year
+    end_year = dat.fromtimestamp(commits[0].commit_time).year
+
+    num_years = end_year - start_year
+    year_dist = [0 for y in range(num_years + 1)]
+    years = [y for y in range(start_year, end_year + 1)]
+
+    for commit in commits:
+        if commit.hex in ldict:
+            commit_year = dat.fromtimestamp(commit.commit_time).year
+            year_dist[commit_year - start_year] += 1
+
+    fig = plt.figure()
+    plt.bar(years, year_dist)
+    plt.xticks(years)
+    plt.xlim(xmin=years[0] - 1, xmax=years[-1] + 1)
+    fig.autofmt_xdate()
+    plt.savefig(res_path)
+
+
+if __name__ == "__main__":
+    PARSER = ArgumentParser(
+        description="Utility to extract unique bug " +
+        "introducing commits from a set a bug fix and bug introducing pairs.")
+    PARSER.add_argument(
+        "--repository",
+        "-r",
+        type=str,
+        default="../../jenkins_master/jenkins_master",
+        help=
+        "Path to a local git repository from which the pairs where extracted.")
+    PARSER.add_argument(
+        "--branch",
+        "-b",
+        type=str,
+        default="refs/heads/master",
+        help="Which branch to use.")
+    PARSER.add_argument(
+        "--file",
+        "-f",
+        type=str,
+        default="../szz/results/fix_and_introducers_pairs.json",
+        help="The file with the pairs.")
+    PARSER.add_argument(
+        "--resfile",
+        "-rf",
+        type=str,
+        default="./labels.csv",
+        help="The file to which the labels are written.")
+    PARSER.add_argument(
+        "--figfile",
+        "-ff",
+        type=str,
+        default="./distribution.png",
+        help="The file to which the bug introducing ditribution is written.")
+    PARSER.add_argument(
+        "--commit",
+        "-c",
+        type=str,
+        default="02d6908ada70fcf8012833ddef628bc09c6f8389",
+        help="The last commit that should be analyzed.")
+
+    ARGS = PARSER.parse_args()
+    REPOPATH = ARGS.repository
+    BRANCH = ARGS.branch
+    PAIRFILE = ARGS.file
+    RESFILE = ARGS.resfile
+    FIGFILE = ARGS.figfile
+    LAST_COMMIT = ARGS.commit
+
+    LABELS = get_labels(REPOPATH, BRANCH, PAIRFILE, LAST_COMMIT)
+
+    save_labels(LABELS, RESFILE)
+
+    save_label_distribution(REPOPATH, BRANCH, LABELS, FIGFILE)
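
A minimal sketch, with made-up hashes, of the pair file produced by the SZZ step and of the labelling rule in get_labels: a commit is labelled 1 exactly when it appears as the bug-introducing half of some (fix, introducer) pair.

pairs = [["fix1", "bad1"], ["fix2", "bad1"], ["fix3", "bad2"]]
unique_introducers = set(p[1] for p in pairs)

for commit in ["bad1", "bad2", "clean1"]:
    print(commit, "1" if commit in unique_introducers else "0")
# bad1 1, bad2 1, clean1 0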

+ 83 - 0
code/data_assembler/assemble_purpose_features.py

@@ -0,0 +1,83 @@
+"""
+Script to extract the purpose features.
+"""
+__author__ = "Oscar Svensson"
+__copyright__ = "Copyright (c) 2018 Axis Communications AB"
+__license__ = "MIT"
+
+import csv
+import re
+
+from argparse import ArgumentParser
+from tqdm import tqdm
+from pygit2 import Repository, GIT_SORT_TOPOLOGICAL, GIT_SORT_REVERSE
+
+PATTERNS = [r"bug", r"fix", r"defect", r"patch"]
+
+def is_fix(message):
+    """
+    Check if a message contains any of the fix patterns.
+    """
+    for pattern in PATTERNS:
+        if re.search(pattern, message):
+            return True
+    return False
+
+def get_purpose_features(repo_path, branch):
+    """
+    Extract the purpose features for each commit.
+    """
+    repo = Repository(repo_path)
+    head = repo.references.get(branch)
+
+    commits = list(
+        repo.walk(head.target, GIT_SORT_TOPOLOGICAL | GIT_SORT_REVERSE))
+
+    features = []
+    for _, commit in enumerate(tqdm(commits)):
+        message = commit.message
+
+        fix = 1.0 if (is_fix(message)) else 0.0
+
+        feat = []
+        feat.append(str(commit.hex))
+        feat.append(str(fix))
+        features.append(feat)
+    return features
+
+def save_features(purpose_features, path="./results/purpose_features.csv"):
+    """
+    Save the purpose features to a csv file.
+    """
+    with open(path, 'w') as csv_file:
+        writer = csv.writer(csv_file)
+        writer.writerow(["commit", "purpose"])
+        for row in purpose_features:
+            if row:
+                writer.writerow([row[0], row[1]])
+
+
+if __name__ == "__main__":
+    PARSER = ArgumentParser(
+        description="Utility to extract purpose features from" +
+        " a repository or a single commit.")
+
+    PARSER.add_argument(
+        "--repository",
+        "-r",
+        type=str,
+        default="./repos/jenkins",
+        help="Path to local git repository.")
+    PARSER.add_argument(
+        "--branch",
+        "-b",
+        type=str,
+        default="refs/heads/master",
+        help="Which branch to use.")
+
+    ARGS = PARSER.parse_args()
+    REPOPATH = ARGS.repository
+    BRANCH = ARGS.branch
+
+    FEATURES = get_purpose_features(REPOPATH, BRANCH)
+    save_features(FEATURES)
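
A quick check of the is_fix patterns above on two sample messages (the search is case sensitive, so it is the lower-case "bug" that matches in the first one).

import re

PATTERNS = [r"bug", r"fix", r"defect", r"patch"]

def is_fix(message):
    # Same logic as above: return True if any pattern occurs in the message.
    return any(re.search(pattern, message) for pattern in PATTERNS)

print(is_fix("Fixed a nasty bug in the queue"))   # True
print(is_fix("Add a new CLI option"))             # False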

+ 257 - 0
code/data_assembler/general_data.py

@@ -0,0 +1,257 @@
+"""
+Script that extracts general data about a git repository.
+"""
+__author__ = "Oscar Svensson"
+__copyright__ = "Copyright (c) 2018 Axis Communications AB"
+__license__ = "MIT"
+
+import csv
+import json
+import re
+
+from argparse import ArgumentParser
+from datetime import datetime
+from numpy import median, mean
+from pygit2 import Repository
+
+def has_added(message):
+    """
+    Function to check if a message contains any word that indicates an addition of lines of code.
+    """
+    if (re.search(
+            r"add(?:ed)*|implement(?:ed)*|introduce(?:d)*|improve(?:ment|ments)*",
+            message.lower())):
+        return True
+    return False
+
+
+def has_updated(message):
+    """
+    Function to check if a message contains any word that indicates an update of lines of code.
+    """
+    if (re.search(
+            r"update[d]*|mov(?:ing|e|ed)|refactor|modifying|switching|deprecate(?:d)*|"+
+            "clean(?:up|ed)*",
+            message.lower())):
+        return True
+    return False
+
+
+def has_bugfix(message):
+    """
+    Function to check if a message contains any word that indicates a bug fix.
+    """
+    if (re.search(r"jenkins[-]?\d|hudson[-]?\d|fix(?:es|ed)*|solve(?:d)*",
+                  message.lower())):
+        return True
+    return False
+
+
+def has_issue(message):
+    """
+    Function to check if a message contains any word that indicates an issue.
+    """
+    if re.search(r"issue number", message.lower()):
+        return True
+    return False
+
+
+def save_commit_messages(commits, repo):
+    """
+    Function to run some statistics on a number of commits in a git repository.
+    """
+
+    mapping = {}
+
+    added = set()
+    updated = set()
+    bugfix = set()
+    issue_set = set()
+    for commit in commits:
+        message = commit.message
+        mapping[commit.hex] = commit.message
+
+        if has_added(message):
+            added.add(commit.hex)
+        elif has_updated(message):
+            updated.add(commit.hex)
+        elif has_bugfix(message):
+            bugfix.add(commit.hex)
+        elif has_issue(message):
+            issue_set.add(commit.hex)
+
+    """
+    Dumps all found commits to a file.
+    """
+    with open("./results/commit_messages.json", 'w') as output:
+        json.dump(mapping, output)
+
+    overall = set()
+    overall.update(added)
+    overall.update(updated)
+    overall.update(bugfix)
+    overall.update(issue_set)
+
+    all_messages = set([commit.hex for commit in commits])
+    not_defined = {c: repo.get(c).message for c in all_messages - overall}
+
+    print("Number of commits that added something: {} ({}%)".format(
+        len(added),
+        float(len(added)) / len(all_messages)))
+    print("Number of commits that updated something: {} ({}%)".format(
+        len(updated),
+        float(len(updated)) / len(all_messages)))
+    print("Number of commits that fixed a bug: {} ({}%)".format(
+        len(bugfix),
+        float(len(bugfix)) / len(all_messages)))
+    print("Number of commits that contained an issue number: {} ({}%)".format(
+        len(issue_set),
+        float(len(issue_set)) / len(all_messages)))
+
+    """
+    Dumps all undefined commits to a file as well.
+    """
+    with open("./results/undefined_commit_messages.json", 'w') as output:
+        json.dump(not_defined, output)
+    print("Number of undefined commits: {} ({}%)".format(
+        len(not_defined),
+        float(len(not_defined)) / len(all_messages)))
+
+
+def get_average_time_issues(issue_path):
+    """
+    Function to get the average time between issue creation and resolution.
+    """
+    issues_dict = {}
+    with open(issue_path, 'r') as inp:
+        issues_dict = json.load(inp)
+
+    days = []
+
+    lowest = (float('Inf'), 0, 0)
+    highest = (0, None, None)
+
+    for _, dates in issues_dict.items():
+        creationdate = dates['creationdate']
+        resolutiondate = dates['resolutiondate']
+
+        creationdate = datetime.strptime(
+            creationdate, "%Y-%m-%d %H:%M:%S %z").replace(tzinfo=None)
+        resolutiondate = datetime.strptime(
+            resolutiondate, "%Y-%m-%d %H:%M:%S %z").replace(tzinfo=None)
+
+        days.append(((resolutiondate - creationdate).days))
+        if days[-1] > highest[0]:
+            highest = (days[-1], creationdate, resolutiondate)
+        if days[-1] < lowest[0]:
+            lowest = (days[-1], creationdate, resolutiondate)
+
+    print("Lowest: {}".format(lowest))
+    print("Highest: {}".format(highest))
+    print("Mean time between resolution date and commit date: {} days".format(
+        mean(days)))
+
+
+def get_general_data(repo_path, issue_path, labels, pairs):
+    """
+    Function to get general statistics for a git repository.
+    """
+    repo = Repository(repo_path)
+
+    issue_list = {}
+    labeled_commits = {}
+
+    with open(labels, 'r') as inp:
+        reader = csv.reader(inp)
+        next(reader)
+
+        for commit in reader:
+            labeled_commits[commit[0]] = float(commit[1])
+
+    print("Number of commits: {}".format(len(labeled_commits)))
+    print("Number of found bugintroducing commits: {}".format(
+        len([
+            labeled_commits[f] for f in labeled_commits
+            if labeled_commits[f] > 0
+        ])))
+
+    pair_map = []
+    with open(pairs, 'r') as inp:
+        pair_map = json.load(inp)
+
+    total_fixes = set([p[0] for p in pair_map])
+    print("Total number of fixes used: {}".format(len(total_fixes)))
+
+    bug_labeled_commits = set(
+        [l for l in labeled_commits if labeled_commits[l] > 0])
+
+    fixes_in_bugs = set(bug_labeled_commits).intersection(total_fixes)
+    print("Total number of fixes in bugs found : {}".format(
+        len(fixes_in_bugs)))
+
+    time_diff = []
+    for pair in pair_map:
+        fix = repo.get(pair[0])
+        bug = repo.get(pair[1])
+
+        fix_date = datetime.fromtimestamp(fix.commit_time).replace(tzinfo=None)
+        bug_date = datetime.fromtimestamp(bug.commit_time).replace(tzinfo=None)
+
+        diff = (fix_date - bug_date).days
+
+        time_diff.append(diff)
+    years, days = divmod(float(mean(time_diff)), 365.25)
+    myears, mdays = divmod(float(median(time_diff)), 365.25)
+
+    print(
+        "Average time between bug introduction and fix: {} years and {} days".
+        format(years, days))
+    print("Median time between bug introduction and fix: {} years and {} days".
+          format(myears, mdays))
+
+    with open(issue_path, 'r') as inp:
+        issue_list = json.load(inp)
+
+    print("Total number of fixes found: {}".format(len(issue_list)))
+
+    save_commit_messages([repo.get(c) for c in bug_labeled_commits], repo)
+    get_average_time_issues(issue_path)
+
+
+if __name__ == "__main__":
+    PARSER = ArgumentParser(
+        description="Utility to extract purpose features from" +
+        " a repository or a single commit.")
+
+    PARSER.add_argument(
+        "--repository",
+        "-r",
+        type=str,
+        default="./repos/jenkins",
+        help="Path to local git repository.")
+    PARSER.add_argument(
+        "--issues",
+        "-i",
+        type=str,
+        default="../szz/issue_list_saved.json",
+        help="Issues to analyze.")
+    PARSER.add_argument(
+        "--labels",
+        "-l",
+        type=str,
+        default="./labels.csv",
+        help="Found labels.")
+    PARSER.add_argument(
+        "--fixinpairs",
+        "-fp",
+        type=str,
+        default="./fix_and_introducers_pairs.json",
+        help="File with fix and introducing pair commits.")
+
+    ARGS = PARSER.parse_args()
+    REPO_PATH = ARGS.repository
+    ISSUES = ARGS.issues
+    LABELS = ARGS.labels
+    PAIRS = ARGS.fixinpairs
+
+    get_general_data(REPO_PATH, ISSUES, LABELS, PAIRS)

+ 7 - 0
code/data_assembler/scripts/analyse_commit

@@ -0,0 +1,7 @@
+#!/bin/sh
+cd /root/repo
+git checkout $1
+git log --all --numstat --date=short --pretty=format:'--%h--%ad--%aN' --no-renames >> /tmp/$1_log.log
+mkdir /root/results/$1
+java -jar /usr/src/code-maat/app-standalone.jar -l /tmp/$1_log.log -o /root/results/$1/$1_coupling.log.res -c git2 -a coupling
+java -jar /usr/src/code-maat/app-standalone.jar -l /tmp/$1_log.log -o /root/results/$1/$1_revisions.log.res -c git2 -a revisions