Browse source

Added feature scripts

Runs docker containers on a repository and generates features such as
coupling.

Added scripts for extracting code_churns, coupling_features,
diffusion_features, experience_features, history_features and
purpose_features.
Oscar Svensson 5 years ago
parent
commit
a0e8e9537f

+ 15 - 0
code/data_assembler/CodeMaatDockerFile

@@ -0,0 +1,15 @@
+FROM clojure:alpine
+VOLUME /data
+LABEL description="code-maat docker image."
+
+ARG dest=/usr/src/code-maat
+
+RUN mkdir -p $dest
+WORKDIR $dest
+COPY project.clj $dest
+RUN lein deps
+COPY . $dest
+RUN mv "$(lein uberjar | sed -n 's/^Created \(.*standalone\.jar\)/\1/p')" app-standalone.jar
+
+RUN apk update
+RUN apk add git
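
A minimal sketch of how this image might be built with the Python docker SDK that assemble_features.py below already uses; the tag "code-maat" matches that script's default --image value, while the build context path is an assumption.

import docker

client = docker.from_env()
# Build the code-maat image from the directory containing CodeMaatDockerFile
# (the path is an assumption). Newer docker SDK versions return (image, logs),
# older ones return only the image, so the result is not unpacked here.
result = client.images.build(
    path="code/data_assembler",
    dockerfile="CodeMaatDockerFile",
    tag="code-maat")
print("built:", result)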

+ 232 - 0
code/data_assembler/assemble_code_churns.py

@@ -0,0 +1,232 @@
+"""
+Script to extract code churns.
+"""
+__author__ = "Oscar Svensson"
+__copyright__ = "Copyright (c) 2018 Axis Communications AB"
+__license__ = "MIT"
+
+import csv
+import os
+import sys
+import time
+
+from argparse import ArgumentParser
+
+from multiprocessing import Process, Manager, cpu_count
+from pygit2 import Repository, GIT_SORT_REVERSE, GIT_SORT_TOPOLOGICAL
+from tqdm import tqdm
+
+# Global variables
+MANAGER = Manager()
+RES = MANAGER.dict()
+
+
+def parse_code_churns(pid, repo_path, branch, start, stop=-1):
+    """
+    Function intended to be run by a worker process. It extracts the code churns
+    for a set of commits and stores them in the RES dict.
+    """
+    repo = Repository(repo_path)
+
+    head = repo.references.get(branch)
+    commits = list(
+        repo.walk(head.target, GIT_SORT_TOPOLOGICAL | GIT_SORT_REVERSE))
+
+    start = start - 1 if (start > 0) else start
+    commits = commits[start:stop] if (stop != -1) else commits[start:]
+
+    code_churns = [[] for c in range(len(commits))]
+    for i, commit in enumerate(tqdm(commits[1:], position=pid)):
+        diff = repo.diff(commits[i], commit)
+
+        tree = commit.tree
+        patches = [p for p in diff]
+        stats = diff.stats
+
+        # Count the total lines of code and find the biggest file that has been changed
+        total_tloc = 0
+        line_of_code_old = 0
+        for patch in patches:
+            if patch.delta.is_binary:
+                continue
+            new_file = patch.delta.new_file
+
+            # Total lines of code
+            total_tloc += get_file_lines_of_code(repo, tree, new_file)
+
+            old_file = patch.delta.old_file
+            # Total lines of code in the old file
+            line_of_code_old = max(
+                line_of_code_old, get_file_lines_of_code(repo, tree, old_file))
+
+        # Churned lines of code
+        cloc = stats.insertions
+        # Deleted lines of code
+        dloc = stats.deletions
+
+        # Churned files
+        files_churned = len(patches)
+
+        # File count
+        num_files = count_files(tree, repo)
+
+        # Apply relative code churns
+        measure_one = float(cloc) / total_tloc if (total_tloc > 0) else float(cloc)
+        measure_two = float(dloc) / total_tloc if (total_tloc > 0) else float(dloc)
+        measure_three = (float(files_churned) / num_files if (num_files > 0)
+                         else float(files_churned))
+
+        line_of_code_old = float(line_of_code_old)
+
+        # Churn features
+        code_churns[i].append(str(commit.hex))
+        code_churns[i].append(str(measure_one))
+        code_churns[i].append(str(measure_two))
+        code_churns[i].append(str(measure_three))
+        code_churns[i].append(str(line_of_code_old))
+
+    RES[pid] = code_churns
+
+
+def count_files(tree, repo):
+    """
+    Count how many files there are in a repository.
+    """
+    num_files = 0
+    trees = []
+    visited = set()
+    visited.add(tree.id)
+    trees.append(tree)
+
+    while trees:
+        current_tree = trees.pop()
+        for entry in current_tree:
+            if entry.type == "tree":
+                if entry.id not in visited:
+                    trees.append(repo[entry.id])
+                    visited.add(entry.id)
+            else:
+                num_files += 1
+    return num_files
+
+
+def get_file_lines_of_code(repo, tree, dfile):
+    """
+    Count how many lines of code there are in a file.
+    """
+    tloc = 0
+    try:
+        blob = repo[tree[dfile.path].id]
+
+        tloc = len(str(blob.data).split('\\n'))
+    except Exception as _:
+        return tloc
+    return tloc
+
+
+def get_code_churns(repo_path, branch):
+    """
+    General function for extracting code churns. It first extracts the code churns for
+    the first commit and then starts a number of processes (equal to the number of cores
+    on the machine), each of which extracts the code churns for a share of the remaining commits.
+    """
+    repo = Repository(repo_path)
+
+    head = repo.references.get(branch)
+
+    commits = list(
+        repo.walk(head.target, GIT_SORT_TOPOLOGICAL | GIT_SORT_REVERSE))
+    code_churns = [[]]
+
+    initial = commits[0]
+
+    # Relative code churns
+    measure_one = 0.0
+    measure_two = 0.0
+    measure_three = 1.0
+
+    line_of_code_old = 0.0
+
+    code_churns[0].append(str(initial.hex))
+    code_churns[0].append(str(measure_one))
+    code_churns[0].append(str(measure_two))
+    code_churns[0].append(str(measure_three))
+    code_churns[0].append(str(line_of_code_old))
+
+    # Check how many processes can be spawned
+    cpus = cpu_count()
+    print("Using {} cpus...".format(cpus))
+
+    # Split the commits into equally sized parts.
+    quote, remainder = divmod(len(commits), cpus)
+
+    processes = [
+        Process(
+            target=parse_code_churns,
+            args=(i, repo_path, branch, i * quote + min(i, remainder),
+                  (i + 1) * quote + min(i + 1, remainder))) for i in range(cpus)
+    ]
+
+    for process in processes:
+        process.start()
+
+    start_time = time.time()
+    for process in processes:
+        process.join()
+    end_time = time.time()
+
+    print("Done")
+    print("Overall processing time {}".format(end_time - start_time))
+
+    # Assemble the results
+    churns = []
+    for _, churn in RES.items():
+        churns.extend(churn)
+
+    churns = list(reversed(churns))
+    churns.append(code_churns[0])
+    return churns
+
+def save_churns(churns, path="./results/code_churns_features_multithread.csv"):
+    """
+    Saves the code churns to a csv file.
+    """
+    with open(path, 'w') as csv_file:
+        writer = csv.writer(csv_file)
+        writer.writerow([
+            "commit", "lines_of_code_added", "lines_of_code_deleted",
+            "files_churned", "line_of_code_old"
+        ])
+
+        for row in churns:
+            if row:
+                writer.writerow([row[0], row[1], row[2], row[3], row[4]])
+
+
+if __name__ == "__main__":
+    PARSER = ArgumentParser(description="Utility to extract code churns from" +
+                            " a repository or a single commit.")
+
+    PARSER.add_argument(
+        "--repository",
+        "-r",
+        type=str,
+        default="./repos/jenkins",
+        help="Path to local git repository.")
+    PARSER.add_argument(
+        "--branch",
+        "-b",
+        type=str,
+        default="refs/heads/master",
+        help="Which branch to use.")
+
+    ARGS = PARSER.parse_args()
+    REPOPATH = ARGS.repository
+    BRANCH = ARGS.branch
+
+    if not os.path.exists(REPOPATH):
+        print("The repository path does not exist!")
+        sys.exit(1)
+
+    CHURNS = get_code_churns(REPOPATH, BRANCH)
+    save_churns(CHURNS)
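
The commit list is split among the worker processes with divmod; a small illustration with assumed numbers (10 commits, 4 cpus) of the start/stop bounds handed to parse_code_churns, which additionally moves each start back by one commit inside the worker so every commit can be diffed against its predecessor.

def chunk_bounds(n_commits, cpus):
    # Same arithmetic as in get_code_churns: quote commits per process plus
    # one extra commit for the first `remainder` processes.
    quote, remainder = divmod(n_commits, cpus)
    return [(i * quote + min(i, remainder),
             (i + 1) * quote + min(i + 1, remainder)) for i in range(cpus)]

print(chunk_bounds(10, 4))   # [(0, 3), (3, 6), (6, 8), (8, 10)]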

+ 135 - 0
code/data_assembler/assemble_coupling_features.py

@@ -0,0 +1,135 @@
+"""
+Script to extract coupling features from code maat analysis files.
+"""
+__author__ = "Oscar Svensson"
+__copyright__ = "Copyright (c) 2018 Axis Communications AB"
+__license__ = "MIT"
+
+import csv
+import os
+
+from git import Repo
+import numpy as np
+from tqdm import tqdm
+
+def save_features(features, res_path):
+    """
+    Save the coupling features to a csv file.
+    """
+    print("Saving to {}".format(os.path.abspath(res_path)))
+    with open(os.path.abspath(res_path), 'w') as feat_file:
+        feat_writer = csv.writer(feat_file)
+
+        feat_writer.writerow([
+            "commit", "number_of_cruical_files",
+            "number_of_moderate_risk_cruical_files",
+            "number_of_high_risk_cruical_files",
+            "number_of_non_modified_change_couplings"
+        ])
+        for feature in features:
+            feat_writer.writerow(feature)
+
+
+def get_features():
+    """
+    Get the coupling features from the per-commit code-maat coupling analysis files.
+    """
+    commits = list(REPO.iter_commits('master'))
+
+    couplings = {}
+    features = []
+
+    for hexsha in os.listdir("/h/oskars/data_all"):
+        couplings[hexsha] = os.path.join(
+            os.path.join("/h/oskars/data_all", hexsha),
+            "{}_coupling.log.res".format(hexsha))
+
+    features.append([commits[0].hexsha, 0, 0, 0, 0])
+    for i in tqdm(range(1, len(commits))):
+        first = commits[i - 1]
+        second = commits[i]
+
+        diff = first.diff(second)
+
+        paths = [d.b_path for d in diff]
+
+        cruical_moderate = 0
+        cruical_high = 0
+        cruical_files = 0
+        cruical_non_modified_couplings = 0
+
+        if second.hexsha in couplings:
+            cruical_commits = 0
+            cruical_degrees = []
+
+            with open(couplings[second.hexsha], 'r') as csvfile:
+                coup_rows = csv.reader(csvfile)
+                files = {}
+                file_coupling_graph = {}
+
+                next(coup_rows)
+                for row in coup_rows:
+                    degree = float(row[2])
+
+                    # Is this correct?
+                    in_files = bool(row[0] in files)
+                    if in_files and files[row[0]] > degree:
+                        files[row[0]] = degree
+                    elif not in_files:
+                        files[row[0]] = degree
+
+                    is_in_coupling_graph = bool(row[0] in file_coupling_graph)
+                    if is_in_coupling_graph and degree >= 75:
+                        file_coupling_graph[row[0]].append(row[1])
+                    elif degree >= 50:
+                        file_coupling_graph[row[0]] = [row[1]]
+
+                    # Is this correct?
+                    in_files = bool(row[1] in files)
+                    if in_files and files[row[1]] > degree:
+                        files[row[1]] = degree
+                    elif not in_files:
+                        files[row[1]] = degree
+
+                    is_in_coupling_graph = bool(row[1] in file_coupling_graph)
+                    if is_in_coupling_graph and degree >= 75:
+                        file_coupling_graph[row[1]].append(row[0])
+                    elif degree >= 50:
+                        file_coupling_graph[row[1]] = [row[0]]
+
+                for path in paths:
+                    if path in files:
+                        cruical_commits = cruical_commits + 1
+                        cruical_degrees.append(files[path])
+                        cruical_files = cruical_files + 1
+
+                # Count couplings from modified files to files that were not modified.
+                set_path = set(paths)
+                for path in paths:
+                    if path in file_coupling_graph:
+                        file_couplings = set(file_coupling_graph[path])
+                        cruical_non_modified_couplings = cruical_non_modified_couplings + len(
+                            file_couplings - set_path)
+
+                inds = np.digitize(cruical_degrees, [25, 50, 75, 100])
+                cruical_moderate = sum([1 for i in inds if i == 3])
+                cruical_high = sum([1 for i in inds if i == 4])
+
+        features.append([
+            second.hexsha,
+            str(cruical_files),
+            str(cruical_moderate),
+            str(cruical_high),
+            str(cruical_non_modified_couplings)
+        ])
+
+    return features
+
+
+if __name__ == "__main__":
+    global REPO
+    REPO = Repo("../../jenkins")
+    REPO = Repo("./repos/jenkins")
+
+    FEATURES = get_features()
+    save_features(FEATURES, './results/coupling_features.csv')
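
A quick check, with made-up coupling degrees, of how np.digitize bins the degrees collected in get_features: with bins [25, 50, 75, 100], index 3 covers degrees in [75, 100) and index 4 covers degrees of 100 or more.

import numpy as np

degrees = [10, 40, 60, 80, 100]
inds = np.digitize(degrees, [25, 50, 75, 100])
print(list(inds))                        # [0, 1, 2, 3, 4]
print(sum(1 for i in inds if i == 3))    # counted as "moderate risk" -> 1
print(sum(1 for i in inds if i == 4))    # counted as "high risk" -> 1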

+ 254 - 0
code/data_assembler/assemble_diffusion_features.py

@@ -0,0 +1,254 @@
+"""
+Script for extracting diffusion features from a git repository.
+"""
+__author__ = "Oscar Svensson"
+__copyright__ = "Copyright (c) 2018 Axis Communications AB"
+__license__ = "MIT"
+
+import csv
+import os
+import sys
+import time
+
+from argparse import ArgumentParser
+from multiprocessing import Process, Manager, cpu_count
+from numpy import log2
+from pygit2 import Repository, GIT_SORT_TOPOLOGICAL, GIT_SORT_REVERSE
+from tqdm import tqdm
+
+MANAGER = Manager()
+RES = MANAGER.dict()
+
+
+def count_diffing_subsystems(subsystems):
+    """
+    Recursively count the number of subsystems in a nested subsystem mapping.
+    """
+    number = 0
+    for system in subsystems.values():
+        number = number + count_diffing_subsystems(system)
+
+    return number + len(subsystems.keys())
+
+def count_entropy(file_changes, total_change):
+    """
+    Function to count entropy for some file changes.
+    """
+    if total_change == 0:
+        return 0
+    return sum([
+        -1 * (float(x) / total_change) * (log2(float(x) / total_change)
+                                          if x > 0 else 0)
+        for x in file_changes
+    ])
+
+
+def parse_diffusion_features(pid, repo_path, branch, start, stop=-1):
+    """
+    Function to extract diffusion features from a set of commits.
+    """
+    repo = Repository(repo_path)
+
+    head = repo.references.get(branch)
+    commits = list(
+        repo.walk(head.target, GIT_SORT_TOPOLOGICAL | GIT_SORT_REVERSE))
+
+    start = start - 1 if (start > 0) else start
+    commits = commits[start:stop] if (stop != -1) else commits[start:]
+
+    features = [[] for c in range(len(commits))]
+    for i, commit in enumerate(tqdm(commits[1:], position=pid)):
+        diff = repo.diff(commits[i], commit)
+
+        patches = [p for p in diff]
+
+        # Extract all different subsystems that have been modified
+        modules = set([])
+        subsystems_mapping = {}
+        entropy_change = 0
+
+        file_changes = []
+        total_change = 0
+        for patch in patches:
+            # Skip binary files
+            if patch.delta.is_binary:
+                continue
+            _, addition, deletions = patch.line_stats
+            total_change = total_change + (addition + deletions)
+            file_changes.append(addition + deletions)
+
+            # Store all subsystems
+            fpath = patch.delta.new_file.path
+            subsystems = fpath.split('/')[:-1]
+
+            root = subsystems_mapping
+            for system in subsystems:
+                if system not in root:
+                    root[system] = {}
+                root = root[system]
+            if subsystems:
+                modules.add(subsystems[0])
+
+        # Check how many subsystems that have been touched
+        modified_systems = count_diffing_subsystems(subsystems_mapping)
+
+        # Calculate the entropy for the commit
+        entropy_change = count_entropy(file_changes, total_change)
+
+        # Add all features
+        features[i].append(str(commit.hex))
+        features[i].append(str(float(modified_systems)))
+        features[i].append(str(float(len(modules))))
+        features[i].append(str(float(entropy_change)))
+
+    RES[pid] = features
+
+def parse_tree(tree, repo):
+    """
+    Recursively parse a git tree and return the total number of lines, the
+    number of lines per file and the number of subdirectories.
+    """
+    found_sub_entries = 0
+    additions = 0
+    file_additions = []
+    tree = repo[tree.id]
+
+    for entry in tree:
+        if entry.type == "bin":
+            continue
+        if entry.type == "tree":
+            sub_additions, sub_file_additions, sub_entries = parse_tree(
+                entry, repo)
+            found_sub_entries += (1 + sub_entries)
+            additions += sub_additions
+            file_additions.extend(sub_file_additions)
+        else:
+            try:
+                sub_addition = len(str(repo[entry.id]).split('\n'))
+                additions += sub_addition
+                file_additions.append(sub_addition)
+            except Exception as ex:
+                print(ex)
+                continue
+
+    return additions, file_additions, found_sub_entries
+
+def get_diffusion_features(repo_path, branch):
+    """
+    Function that extracts the diffusion features of the first commit. It then starts
+    a number of processes (equal to the number of cores on the machine) and
+    distributes the remaining commits among them.
+    """
+    repo = Repository(repo_path)
+
+    head = repo.references.get(branch)
+
+    commits = list(
+        repo.walk(head.target, GIT_SORT_TOPOLOGICAL | GIT_SORT_REVERSE))
+    initial = commits[0]
+    init_tree = initial.tree
+
+    # Count initial total lines of code
+    init_total_additions = 0
+    init_file_addtions = []
+
+    init_subdirectories = 0
+    init_modules = 0
+
+    for entry in init_tree:
+        if entry.type == "tree":
+            added, file_additions, subdirectories = parse_tree(entry, repo)
+
+            init_modules += 1
+            init_file_addtions.extend(file_additions)
+            init_total_additions += added
+            init_subdirectories += subdirectories
+        else:
+            try:
+                additions = len(str(repo[entry.id]).split('\n'))
+                init_total_additions += additions
+                init_file_addtions.append(additions)
+            except:
+                continue
+    diffusion_features = []
+    diffusion_features.append(initial.hex)
+    diffusion_features.append(init_subdirectories)
+    diffusion_features.append(init_modules)
+    diffusion_features.append(
+        count_entropy(init_file_addtions, init_total_additions))
+
+    # Check how many processes can be spawned
+    cpus = cpu_count()
+    print("Using {} cpus...".format(cpus))
+    # Divide the commits equally among the processes.
+    quote, remainder = divmod(len(commits), cpus)
+
+    processes = [
+        Process(
+            target=parse_diffusion_features,
+            args=(i, repo_path, branch, i * quote + min(i, remainder),
+                  (i + 1) * quote + min(i + 1, remainder))) for i in range(cpus)
+    ]
+
+    for process in processes:
+        process.start()
+
+    start_time = time.time()
+    for process in processes:
+        process.join()
+    end_time = time.time()
+
+    print("Done")
+    print("Overall processing time {}".format(end_time - start_time))
+
+    # Assemble the results
+    features = []
+    for _, feat in RES.items():
+        features.extend(feat)
+    features = list(reversed(features))
+    features.append(diffusion_features)
+    return features
+
+def save_diffusion_features(diffusion_features,
+                            path="./results/diffusion_features.csv"):
+    """
+    Save the diffusion features to a csv file.
+    """
+    with open(path, 'w') as csv_file:
+        writer = csv.writer(csv_file)
+        writer.writerow([
+            "commit", "modified_subsystems", "modified_subdirectories",
+            "entropy"
+        ])
+        for row in diffusion_features:
+            if row:
+                writer.writerow([row[0], row[1], row[2], row[3]])
+
+
+if __name__ == "__main__":
+    PARSER = ArgumentParser(
+        description="Utility to extract diffusion features from" +
+        " a repository or a single commit.")
+
+    PARSER.add_argument(
+        "--repository",
+        "-r",
+        type=str,
+        default="./repos/jenkins",
+        help="Path to local git repository.")
+    PARSER.add_argument(
+        "--branch",
+        "-b",
+        type=str,
+        default="refs/heads/master",
+        help="Which branch to use.")
+
+    ARGS = PARSER.parse_args()
+    REPOPATH = ARGS.repository
+    BRANCH = ARGS.branch
+    if not os.path.exists(REPOPATH):
+        print("The repository path does not exist!")
+        sys.exit(1)
+
+    DIFFUSION_FEATURES = get_diffusion_features(REPOPATH, BRANCH)
+    save_diffusion_features(DIFFUSION_FEATURES)
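
A tiny worked example, with toy change counts, of the count_entropy function above: a change spread evenly over two files gives one bit of entropy, a skewed change gives less.

from numpy import log2

def count_entropy(file_changes, total_change):
    # Shannon entropy of the change distribution, as defined above.
    if total_change == 0:
        return 0
    return sum([
        -1 * (float(x) / total_change) * (log2(float(x) / total_change)
                                          if x > 0 else 0)
        for x in file_changes
    ])

print(count_entropy([5, 5], 10))   # 1.0
print(count_entropy([7, 3], 10))   # ~0.88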

+ 244 - 0
code/data_assembler/assemble_experience_features.py

@@ -0,0 +1,244 @@
+"""
+Script for extracting the experience features in a software repository.
+"""
+__author__ = "Oscar Svensson"
+__copyright__ = "Copyright (c) 2018 Axis Communications AB"
+__license__ = "MIT"
+
+import csv
+import json
+import sys
+import time
+
+from argparse import ArgumentParser
+from datetime import datetime
+from numpy import floor
+from pygit2 import Repository, GIT_SORT_TOPOLOGICAL, GIT_SORT_REVERSE
+from tqdm import tqdm
+
+def set_to_list(obj):
+    """
+    Helper function to turn sets to lists and floats to strings.
+    """
+    if isinstance(obj, set):
+        return list(obj)
+    if isinstance(obj, float):
+        return str('%.15g' % obj)
+    raise TypeError
+
+
+def get_files_in_tree(tree, repo):
+    """
+    Function to get the files in a tree.
+    """
+    files = set()
+    for entry in tree:
+        if entry.type == "tree":
+            sub_files = [(f[0], "{}/{}".format(entry.name, f[1]))
+                         for f in get_files_in_tree(repo[entry.id], repo)]
+            files.update(sub_files)
+        else:
+            blob = repo[entry.id]
+            if not blob.is_binary:
+                if entry.name.endswith("java"):
+                    files.add((entry.hex, entry.name))
+    return files
+
+
+def get_diffing_files(commit, parent, repo):
+    """
+    Function to get the files that differ between two commits.
+    """
+    diff = repo.diff(parent, commit)
+
+    patches = [p for p in diff]
+
+    files = set()
+
+    for patch in patches:
+        if patch.delta.is_binary:
+            continue
+        nfile = patch.delta.new_file
+        files.add((nfile.id, nfile.path, patch.delta.status))
+
+    return files
+
+def save_experience_features_graph(repo_path, branch, graph_path):
+    """
+    Function to get and save the experience graph.
+    """
+    repo = Repository(repo_path)
+    head = repo.references.get(branch)
+
+    commits = list(
+        repo.walk(head.target, GIT_SORT_TOPOLOGICAL | GIT_SORT_REVERSE))
+    current_commit = repo.head.target
+
+    start_time = time.time()
+
+    current_commit = repo.get(str(current_commit))
+    files = get_files_in_tree(current_commit.tree, repo)
+
+    all_authors = {}
+
+    author = current_commit.committer.name
+
+    all_authors[author] = {}
+    all_authors[author]['lastcommit'] = current_commit.hex
+    all_authors[author][current_commit.hex] = {}
+    all_authors[author][current_commit.hex]['prevcommit'] = ""
+    all_authors[author][current_commit.hex]["exp"] = 1
+    all_authors[author][current_commit.hex]["rexp"] = [[len(files), 1]]
+    all_authors[author][current_commit.hex]["sexp"] = {}
+
+    for i, commit in enumerate(tqdm(commits[1:])):
+        files = get_diffing_files(commit, commits[i], repo)
+
+        author = commit.committer.name
+        if author not in all_authors:
+            all_authors[author] = {}
+            all_authors[author]['lastcommit'] = commit.hex
+            all_authors[author][commit.hex] = {}
+            all_authors[author][commit.hex]['prevcommit'] = ""
+            all_authors[author][commit.hex]["exp"] = 1
+            all_authors[author][commit.hex]["rexp"] = [[len(files), 1.0]]
+            all_authors[author][commit.hex]["sexp"] = {}
+        else:
+            last_commit = all_authors[author]["lastcommit"]
+            all_authors[author]["lastcommit"] = commit.hex
+            all_authors[author][commit.hex] = {}
+            all_authors[author][commit.hex]['prevcommit'] = last_commit
+            all_authors[author][commit.hex][
+                'exp'] = 1 + all_authors[author][last_commit]['exp']
+
+            date_current = datetime.fromtimestamp(commit.commit_time)
+            date_last = datetime.fromtimestamp(repo.get(last_commit).commit_time)
+
+            diffing_years = abs(floor(float((date_current - date_last).days) / 365))
+
+            overall = all_authors[author][last_commit]['rexp']
+
+            all_authors[author][commit.hex][
+                'rexp'] = [[len(files), 1.0]] + [[e[0], e[1] + diffing_years]
+                                                 for e in overall]
+
+    with open(graph_path, 'w') as output:
+        json.dump(all_authors, output, default=set_to_list)
+
+    end_time = time.time()
+
+    print("Done")
+    print("Overall processing time {}".format(end_time - start_time))
+
+def load_experience_features_graph(path="./results/author_graph.json"):
+    """
+    Function to load the experience features graph.
+    """
+    file_graph = {}
+    with open(path, 'r') as inp:
+        file_graph = json.load(inp, parse_float=lambda x: float(x))
+    return file_graph
+
+
+def get_experience_features(graph, repo_path, branch):
+    """
+    Function that extracts the experience features from an experience graph.
+    """
+    repo = Repository(repo_path)
+    head = repo.references.get(branch)
+
+    commits = list(
+        repo.walk(head.target, GIT_SORT_TOPOLOGICAL | GIT_SORT_REVERSE))
+    current_commit = repo.head.target
+
+    files = get_files_in_tree(repo.get(str(current_commit)).tree, repo)
+
+    features = []
+
+    commit_feat = []
+    commit_feat.append(str(commits[0].hex))
+    commit_feat.append(str(1.0))
+    commit_feat.append(str(len(files)))
+    commit_feat.append(str(0.0))
+    features.append(commit_feat)
+
+    for _, commit in enumerate(tqdm(commits[1:])):
+        author = commit.committer.name
+
+        exp = graph[author][commit.hex]['exp']
+        rexp = graph[author][commit.hex]['rexp']
+        try:
+            rrexp = sum([float(float(e[0]) / (float(e[1]) + 1)) for e in rexp])
+        except:
+            print(author)
+            print(commit.hex)
+            print(rexp)
+            sys.exit(1)
+
+        commit_feat = []
+        commit_feat.append(str(commit.hex))
+        commit_feat.append(str(float(exp)))
+        commit_feat.append(str(float(rrexp)))
+        commit_feat.append(str(float(0)))
+        features.append(commit_feat)
+    return features
+
+
+def save_experience_features(history_features, path):
+    """
+    Save the experience features to a csv file.
+    """
+    with open(path, 'w') as csv_file:
+        writer = csv.writer(csv_file)
+        writer.writerow(["commit", "experience", "rexp", "sexp"])
+        for row in history_features:
+            if row:
+                writer.writerow([row[0], row[1], row[2], row[3]])
+
+
+if __name__ == "__main__":
+    PARSER = ArgumentParser(description="Utility to extract code churns from" +
+                            " a repository or a single commit.")
+
+    PARSER.add_argument(
+        "--repository",
+        "-r",
+        type=str,
+        default="./repos/jenkins",
+        help="Path to local git repository.")
+    PARSER.add_argument(
+        "--branch",
+        "-b",
+        type=str,
+        default="refs/heads/master",
+        help="Which branch to use.")
+    PARSER.add_argument(
+        "--save-graph",
+        "-sg",
+        action="store_true",
+        help="Generate a new graph for a repository.")
+    PARSER.add_argument(
+        "--graph-path",
+        "-gp",
+        type=str,
+        default="./results/author_graph.json",
+        help="The path to where the graph is stored.")
+    PARSER.add_argument(
+        "--output",
+        "-o",
+        type=str,
+        default="./results/experience_features.csv",
+        help="The path where the output is written.")
+
+    ARGS = PARSER.parse_args()
+    REPO_PATH = ARGS.repository
+    BRANCH = ARGS.branch
+    SAVE_GRAPH = ARGS.save_graph
+    GRAPH_PATH = ARGS.graph_path
+    OUTPUT = ARGS.output
+
+    if SAVE_GRAPH:
+        save_experience_features_graph(REPO_PATH, BRANCH, GRAPH_PATH)
+    GRAPH = load_experience_features_graph(GRAPH_PATH)
+    EXPERIENCE_FEATURES = get_experience_features(GRAPH, REPO_PATH, BRANCH)
+    save_experience_features(EXPERIENCE_FEATURES, OUTPUT)
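
A small sketch, with made-up author data and commit hashes, of the per-author node that save_experience_features_graph writes to author_graph.json, and of how get_experience_features turns the rexp list into the weighted recent-experience sum: each [files, age_in_years] pair contributes files / (age + 1).

node = {
    "lastcommit": "c2",
    "c2": {"prevcommit": "c1", "exp": 2,
           "rexp": [[3, 1.0], [10, 2.0]], "sexp": {}},
}

rexp = node["c2"]["rexp"]
rrexp = sum(float(e[0]) / (float(e[1]) + 1) for e in rexp)
print(rrexp)   # 3/2 + 10/3 = 4.83...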

+ 292 - 0
code/data_assembler/assemble_features.py

@@ -0,0 +1,292 @@
+"""
+Script that runs several docker containers which in turn run an analysis on
+a git repository.
+"""
+__author__ = "Oscar Svensson"
+__copyright__ = "Copyright (c) 2018 Axis Communications AB"
+__license__ = "MIT"
+
+import os
+import sys
+import shutil
+import time
+
+from argparse import ArgumentParser
+from distutils.dir_util import copy_tree
+from multiprocessing import Process, cpu_count
+from git import Repo
+from tqdm import tqdm
+
+import docker
+
+def start_container(client, image, name, repo_dir, result_dir):
+    """
+    Function that starts a docker container and mounts the repository and a
+    directory for the results into it.
+    """
+    for container in client.containers.list(all=True):
+        if name == container.name:
+            if container.status == "running":
+                container.kill()
+            container.remove()
+
+    path = os.path.abspath('./')
+
+    container = client.containers.run(
+        image,
+        name=name,
+        stdin_open=True,
+        detach=True,
+        volumes={
+            str(path + "/scripts"): {
+                'bind': '/root/scripts',
+                'mode': 'rw'
+            },
+            result_dir: {
+                'bind': '/root/results',
+                'mode': 'rw'
+            },
+            os.path.abspath(repo_dir): {
+                'bind': '/root/repo',
+                'mode': 'rw'
+            }
+        },
+        command="bash")
+
+    return container
+
+def run_command(container, command):
+    """
+    Function that executes a command inside a container.
+    """
+    return container.exec_run(
+        cmd="bash -c \"" + command + "\"", tty=True, privileged=True)
+
+
+def run_analysis(t_id, container, commits):
+    """
+    Function that runs the analysis command inside a docker container for each given commit.
+    """
+    for commit in tqdm(
+            commits, desc="Progress process {}".format(t_id), position=t_id):
+        run_command(container,
+                    "/root/scripts/analyse_commit {}".format(commit))
+
+def copy_repo(src, dest):
+    """
+    Helper function to copy a repository to another destination.
+    """
+    try:
+        shutil.copytree(src, dest)
+    except shutil.Error as exp:
+        print("Directory not copied. Error: {}".format(exp))
+    except OSError as exp:
+        print("Directory not copied. Error: {}".format(exp))
+
+def partion_commits(commits, partitions):
+    """
+    Function that divides the commits into evenly sized partitions.
+    """
+    quote, remainder = divmod(len(commits), partitions)
+    chunk_commits = [(i * quote + min(i, remainder), (i + 1) * quote + min(i + 1, remainder) - 1)
+                     for i in range(partitions)]
+    chunk_commits[-1] = (chunk_commits[-1][0], chunk_commits[-1][1] + 1)
+
+    commits = [[commit for commit in commits[chunk[0]:chunk[1]]]
+               for chunk in chunk_commits]
+    return commits
+
+def start_analysis(image, result_dir, commits=None, cpus=cpu_count()):
+    """
+    This function starts a docker container that can analyze a git repository. It starts several
+    containers if more than one cpu is available.
+    """
+    client = docker.from_env()
+    repo = Repo(REPO)
+
+    # Since the containers work directly on the repository, each one needs
+    # its own copy.
+    if not os.path.exists("./repos"):
+        os.makedirs("./repos")
+
+    repo_name = os.path.basename(os.path.normpath(REPO))
+
+    for cpu in range(cpus):
+        copy_repo(REPO, "./repos/{}{}".format(repo_name, cpu))
+
+    # Split the commits into even parts.
+    if not commits:
+        commits = [
+            str(commit.hexsha) for commit in list(repo.iter_commits('master'))
+        ]
+
+    commits = partion_commits(commits, cpus)
+
+    containers = []
+    for cpu in range(cpus):
+        container = start_container(
+            client,
+            image=image,
+            name="analysis_{}_cpu_{}".format(repo_name, cpu),
+            repo_dir="./repos/{}{}".format(repo_name, cpu),
+            result_dir=result_dir + "/data{}".format(cpu))
+        containers.append(container)
+
+    processes = [
+        Process(target=run_analysis, args=(i, containers[i], commits[i]))
+        for i in range(cpus)
+    ]
+    for process in processes:
+        process.start()
+    for process in processes:
+        process.join()
+
+    for container in containers:
+        print(container.status)
+        print(container.name)
+        if (container.status != "exited" and container.status != "dead"):
+            container.kill()
+        container.remove()
+
+    shutil.rmtree("./repos", ignore_errors=True)
+
+def parse_commits(commit_file):
+    """
+    Read the commits from a file and return the content.
+    """
+    if not os.path.exists(commit_file):
+        print("commit_file doesn't exist!!", file=sys.stderr)
+        sys.exit(1)
+
+    commits = []
+    with open(commit_file, 'r') as cfile:
+        commits = [line.strip() for line in cfile.readlines()]
+    return commits
+
+def assemble_directories(result_path, cpus=cpu_count()):
+    """
+    Copy all results into a single directory.
+    """
+    result_path = os.path.abspath(result_path)
+    paths = ["{}/data{}".format(result_path, i) for i in range(cpus)]
+
+    if not all([os.path.exists(p) for p in paths]):
+        print("data paths doesn't exists!", file=sys.stderr)
+        return
+
+    files = []
+
+    for path in paths:
+        for item in os.listdir(path):
+            commit = os.path.join(path, item)
+            corrupt = False if (len(os.listdir(commit)) == 2) else True
+
+            if (os.path.isdir(commit) and not corrupt):
+                files.append((commit, item))
+
+    print("Saving all analysed commits into a single directory: {}/data_all".
+          format(result_path))
+    if not os.path.exists("{}/data_all".format(result_path)):
+        os.makedirs("{}/data_all".format(result_path))
+
+    for file_tuple in files:
+        if not os.path.exists("{}/data_all/{}".format(result_path, file_tuple[1])):
+            copy_tree(file_tuple[0], "{}/data_all/{}".format(result_path, file_tuple[1]))
+
+def check_for_missing_commits(repo_path, result_path):
+    """
+    Controller function that checks if all commits have been analyzed.
+    """
+    result_dir = os.path.abspath(result_path)
+    if not os.path.exists(result_path):
+        print("Result path doesn't exist!", file=sys.stderr)
+        return
+
+    repo = Repo(repo_path)
+
+    current_commits = []
+    for item in os.listdir(result_dir):
+        current_commits.append(item)
+
+    all_repo_commits = [c.hexsha for c in list(repo.iter_commits('master'))]
+
+    missing_commits = set(all_repo_commits) - set(current_commits)
+
+    if missing_commits:
+        with open("./missing_commits.txt", 'w') as cfile:
+            for commit in missing_commits:
+                cfile.write(commit)
+                cfile.write('\n')
+        print("Wrote missing commits to missing_commits.txt")
+
+if __name__ == "__main__":
+    PARSER = ArgumentParser(description="Utility to run several docker " +
+                            "containers onto a git repository. " +
+                            "Each container is given a set of " +
+                            "commits and is instructed to run " +
+                            "an analysis on each one of them.")
+    PARSER.add_argument(
+        "--analyse", "-a", action="store_true", help="Run an analysation.")
+    PARSER.add_argument(
+        "--image",
+        "-i",
+        type=str,
+        default="code-maat",
+        help="Specification of which image to use.")
+    PARSER.add_argument(
+        "--repo-dir",
+        "-r",
+        type=str,
+        default="../../jenkins",
+        help="Specification of which repo to use.")
+    PARSER.add_argument(
+        "--result-dir",
+        "-rd",
+        type=str,
+        default="/h/oskars",
+        help="Specification of where to store the result.")
+    PARSER.add_argument(
+        "--commits",
+        "-c",
+        type=str,
+        default=None,
+        help="Direction to a file containing commits to analyse.")
+    PARSER.add_argument(
+        "--assemble",
+        "-as",
+        action="store_true",
+        help="Assemble the results into a single directory.")
+    PARSER.add_argument(
+        "--missing-commits",
+        "-mc",
+        action="store_true",
+        help="Check for non analysed commits.")
+
+    ARGS = PARSER.parse_args()
+
+    global REPO
+    REPO = os.path.abspath(ARGS.repo_dir)
+
+    if ARGS.commits:
+        COMMITS = parse_commits(ARGS.commits)
+    else:
+        COMMITS = []
+
+    CLIENT = docker.from_env()
+    if ARGS.analyse:
+        print("Starting the analysis using {} cpus...".format(cpu_count()))
+        START = time.time()
+        if COMMITS:
+            start_analysis(ARGS.image, ARGS.result_dir, commits=COMMITS)
+        else:
+            start_analysis(ARGS.image, ARGS.result_dir)
+        STOP = time.time()
+        print("Done in {}".format(
+            time.strftime('%H:%M:%S', time.gmtime(STOP - START))))
+        print("Results can be found in {}".format(
+            ARGS.result_dir + "/data{" +
+            ','.join(["{}".format(i) for i in range(cpu_count())]) + "}"))
+    if ARGS.assemble:
+        assemble_directories(ARGS.result_dir)
+    if ARGS.missing_commits:
+        check_for_missing_commits(ARGS.repo_dir, ARGS.result_dir)
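
The --commits option expects a plain text file with one commit hash per line (see parse_commits); a minimal sketch of producing such a file with GitPython, assuming a local clone at the path used elsewhere in these scripts.

from git import Repo

repo = Repo("./repos/jenkins")   # assumed local clone
with open("commits.txt", "w") as out:
    for commit in repo.iter_commits("master"):
        out.write(commit.hexsha + "\n")

# The file can then be passed to the analysis, e.g.:
#   python assemble_features.py --analyse --commits commits.txt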

+ 239 - 0
code/data_assembler/assemble_history_features.py

@@ -0,0 +1,239 @@
+"""
+Script to extract history features from a git repository.
+"""
+__author__ = "Oscar Svensson"
+__copyright__ = "Copyright (c) 2018 Axis Communications AB"
+__license__ = "MIT"
+
+import csv
+import json
+import time
+
+from argparse import ArgumentParser
+from pygit2 import Repository, GIT_SORT_TOPOLOGICAL, GIT_SORT_REVERSE
+from tqdm import tqdm
+
+def set_to_list(obj):
+    """
+    Helper function to convert a set to a list.
+    """
+    if isinstance(obj, set):
+        return list(obj)
+    raise TypeError
+
+def get_files_in_tree(tree, repo):
+    """
+    Extract the hex id and name of every file in a tree.
+    """
+    files = set()
+    for entry in tree:
+        if entry.type == "tree":
+            sub_files = [(f[0], "{}/{}".format(entry.name, f[1]))
+                         for f in get_files_in_tree(repo[entry.id], repo)]
+            files.update(sub_files)
+        else:
+            blob = repo[entry.id]
+            if not blob.is_binary:
+                if entry.name.endswith("java"):
+                    files.add((entry.hex, entry.name))
+    return files
+
+
+def get_diffing_files(commit, parent, repo):
+    """
+    Get the files that differ between two commits.
+    """
+    diff = repo.diff(parent, commit)
+
+    patches = [p for p in diff]
+
+    files = set()
+
+    for patch in patches:
+        if patch.delta.is_binary:
+            continue
+        nfile = patch.delta.new_file
+        files.add((nfile.id, nfile.path, patch.delta.status))
+
+    return files
+
+
+def save_history_features_graph(repo_path, branch, graph_path):
+    """
+    Track the developers that have worked on each file in a repository and save the
+    results in a graph for later use.
+    """
+    repo = Repository(repo_path)
+    head = repo.references.get(branch)
+
+    commits = list(
+        repo.walk(head.target, GIT_SORT_TOPOLOGICAL | GIT_SORT_REVERSE))
+    current_commit = repo.head.target
+
+    start_time = time.time()
+
+    all_files = {}
+    current_commit = repo.get(str(current_commit))
+    files = get_files_in_tree(current_commit.tree, repo)
+
+    for (_, name) in tqdm(files):
+        all_files[name] = {}
+        all_files[name]['lastcommit'] = current_commit.hex
+        all_files[name][current_commit.hex] = {}
+        all_files[name][current_commit.hex]["prevcommit"] = ""
+        all_files[name][current_commit.hex]["authors"] = [
+            current_commit.committer.name
+        ]
+
+    for i, commit in enumerate(tqdm(commits[1:])):
+        files = get_diffing_files(commit, commits[i], repo)
+        for (_, name, _) in files:
+            if name not in all_files:
+                all_files[name] = {}
+
+            last_commit = ""
+            if 'lastcommit' not in all_files[name]:
+                all_files[name]['lastcommit'] = commit.hex
+            else:
+                last_commit = all_files[name]['lastcommit']
+
+            all_files[name][commit.hex] = {}
+            all_files[name][commit.hex]["prevcommit"] = last_commit
+
+            authors = set([commit.committer.name])
+            if last_commit:
+                authors.update(all_files[name][last_commit]["authors"])
+            all_files[name][commit.hex]["authors"] = authors
+
+            all_files[name]['lastcommit'] = commit.hex
+
+    with open(graph_path, 'w') as output:
+        json.dump(all_files, output, default=set_to_list)
+
+    end_time = time.time()
+
+    print("Done")
+    print("Overall processing time {}".format(end_time - start_time))
+
+def load_history_features_graph(path):
+    """
+    Load the history features graph from a JSON file.
+    """
+    file_graph = {}
+    with open(path, 'r') as inp:
+        file_graph = json.load(inp)
+    return file_graph
+
+
+def get_history_features(graph, repo_path, branch):
+    """
+    Function that extracts the history features from a git repository.
+    They are the total number of authors, the total age and the total
+    number of unique changes.
+    """
+    repo = Repository(repo_path)
+    head = repo.references.get(branch)
+
+    commits = list(
+        repo.walk(head.target, GIT_SORT_TOPOLOGICAL | GIT_SORT_REVERSE))
+
+    features = []
+
+    commit_feat = []
+    commit_feat.append(str(commits[0].hex))
+    commit_feat.append(str(1.0))
+    commit_feat.append(str(0.0))
+    commit_feat.append(str(0.0))
+    features.append(commit_feat)
+
+    for i, commit in enumerate(tqdm(commits[1:])):
+        files = get_diffing_files(commit, commits[i], repo)
+
+        total_number_of_authors = set()
+        total_age = []
+        total_unique_changes = set()
+
+        for (_, name, _) in files:
+            sub_graph = graph[name][commit.hex]
+            total_number_of_authors.update(sub_graph['authors'])
+
+            prev_commit = sub_graph['prevcommit']
+            if prev_commit:
+                total_unique_changes.add(prev_commit)
+
+                prev_commit_obj = repo.get(prev_commit)
+
+                total_age.append(commit.commit_time -
+                                 prev_commit_obj.commit_time)
+
+        total_age = float(sum(total_age)) / len(total_age) if total_age else 0
+
+        commit_feat = []
+        commit_feat.append(str(commit.hex))
+        commit_feat.append(str(float(len(total_number_of_authors))))
+        commit_feat.append(str(float(total_age)))
+        commit_feat.append(str(float(len(total_unique_changes))))
+        features.append(commit_feat)
+    return features
+
+
+def save_history_features(history_features, path):
+    """
+    Function to save the history features as a csv file.
+    """
+    with open(path, 'w') as csv_file:
+        writer = csv.writer(csv_file)
+        writer.writerow(
+            ["commit", "number_of_authors", "age", "number_unique_changes"])
+        for row in history_features:
+            if row:
+                writer.writerow([row[0], row[1], row[2], row[3]])
+
+
+if __name__ == "__main__":
+    PARSER = ArgumentParser(description="Utility to extract code churns from" +
+                            " a repository or a single commit.")
+
+    PARSER.add_argument(
+        "--repository",
+        "-r",
+        type=str,
+        default="./repos/jenkins",
+        help="Path to local git repository.")
+    PARSER.add_argument(
+        "--branch",
+        "-b",
+        type=str,
+        default="refs/heads/master",
+        help="Which branch to use.")
+    PARSER.add_argument(
+        "--save-graph",
+        "-sg",
+        action="store_true",
+        help="Generate a new graph for a repository.")
+    PARSER.add_argument(
+        "--graph-path",
+        "-gp",
+        type=str,
+        default="./results/file_graph.json",
+        help="The path to where the graph is stored.")
+    PARSER.add_argument(
+        "--output",
+        "-o",
+        type=str,
+        default="./results/history_features.csv",
+        help="The path where the output is written.")
+
+    ARGS = PARSER.parse_args()
+    REPO_PATH = ARGS.repository
+    BRANCH = ARGS.branch
+    SAVE_GRAPH = ARGS.save_graph
+    GRAPH_PATH = ARGS.graph_path
+    OUTPUT = ARGS.output
+    print(SAVE_GRAPH)
+
+    if SAVE_GRAPH:
+        save_history_features_graph(REPO_PATH, BRANCH, GRAPH_PATH)
+    GRAPH = load_history_features_graph(GRAPH_PATH)
+    HISTORY_FEATURES = get_history_features(GRAPH, REPO_PATH, BRANCH)
+    save_history_features(HISTORY_FEATURES, OUTPUT)
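
The file graph written by save_history_features_graph keeps, per file, the latest commit plus one node per commit with the previous commit touching the file and the accumulated set of authors; a small sketch with made-up hashes of how get_history_features reads it.

file_node = {
    "lastcommit": "c3",
    "c1": {"prevcommit": "", "authors": ["alice"]},
    "c3": {"prevcommit": "c1", "authors": ["alice", "bob"]},
}

# Number of distinct authors seen for this file up to commit c3.
print(len(set(file_node["c3"]["authors"])))   # 2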

+ 149 - 0
code/data_assembler/assemble_labels.py

@@ -0,0 +1,149 @@
+"""
+Script to generate a labels file from a file produced by the SZZ algorithm.
+"""
+__author__ = "Oscar Svensson"
+__copyright__ = "Copyright (c) 2018 Axis Communications AB"
+__license__ = "MIT"
+
+import csv
+import json
+
+from argparse import ArgumentParser
+from datetime import datetime as dat
+from pygit2 import Repository, GIT_SORT_TOPOLOGICAL, GIT_SORT_REVERSE
+from tqdm import tqdm
+
+import matplotlib.pyplot as plt
+
+
+def get_labels(repo_path, branch, pair_file, last_commit):
+    """
+    Get the labels from a file produced by the SZZ algorithm. It contains
+    bug fixing commits and their respective bug introducing commits.
+    """
+    repo = Repository(repo_path)
+    head = repo.references.get(branch)
+
+    commits = []
+    for commit in list(
+            repo.walk(head.target, GIT_SORT_TOPOLOGICAL | GIT_SORT_REVERSE)):
+        commits.append(commit)
+        if commit.hex == last_commit:
+            break
+
+    commits = list(reversed(commits))
+
+    pairs = {}
+    with open(pair_file, 'r') as inp:
+        pairs = json.load(inp)
+
+    unique_pairs = set([p[1] for p in pairs])
+    labels = []
+
+    for commit in tqdm(commits):
+        label = [commit.hex, "1" if commit.hex in unique_pairs else "0"]
+        labels.append(label)
+
+    return labels
+
+
+def save_labels(labels, res_path):
+    """
+    Save the labels as a csv file.
+    """
+    with open(res_path, 'w') as out:
+        writer = csv.writer(out)
+        writer.writerow(["commit", "label"])
+        for label in labels:
+            writer.writerow(label)
+
+
+def save_label_distribution(repo_path, branch, labels, res_path):
+    """
+    Save a distribution of the labels over time.
+    """
+    ldict = set()
+    for label in labels:
+        if label[1] == "1":
+            ldict.add(label[0])
+
+    repo = Repository(repo_path)
+    head = repo.references.get(branch)
+
+    commits = list(repo.walk(head.target, GIT_SORT_TOPOLOGICAL))
+
+    start_year = dat.fromtimestamp(commits[-1].commit_time).year
+    end_year = dat.fromtimestamp(commits[0].commit_time).year
+
+    num_years = end_year - start_year
+    year_dist = [0 for y in range(num_years + 1)]
+    years = [y for y in range(start_year, end_year + 1)]
+
+    for commit in commits:
+        if commit.hex in ldict:
+            commit_year = dat.fromtimestamp(commit.commit_time).year
+            year_dist[commit_year - start_year] += 1
+
+    fig = plt.figure()
+    plt.bar(years, year_dist)
+    plt.xticks(years)
+    plt.xlim(xmin=years[0] - 1, xmax=years[-1] + 1)
+    fig.autofmt_xdate()
+    plt.savefig(res_path)
+
+
+if __name__ == "__main__":
+    PARSER = ArgumentParser(
+        description="Utility to extract unique bug " +
+        "introducing commits from a set a bug fix and bug introducing pairs.")
+    PARSER.add_argument(
+        "--repository",
+        "-r",
+        type=str,
+        default="../../jenkins_master/jenkins_master",
+        help=
+        "Path to a local git repository from which the pairs where extracted.")
+    PARSER.add_argument(
+        "--branch",
+        "-b",
+        type=str,
+        default="refs/heads/master",
+        help="Which branch to use.")
+    PARSER.add_argument(
+        "--file",
+        "-f",
+        type=str,
+        default="../szz/results/fix_and_introducers_pairs.json",
+        help="The file with the pairs.")
+    PARSER.add_argument(
+        "--resfile",
+        "-rf",
+        type=str,
+        default="./labels.csv",
+        help="The file to which the labels are written.")
+    PARSER.add_argument(
+        "--figfile",
+        "-ff",
+        type=str,
+        default="./distribution.png",
+        help="The file to which the bug introducing ditribution is written.")
+    PARSER.add_argument(
+        "--commit",
+        "-c",
+        type=str,
+        default="02d6908ada70fcf8012833ddef628bc09c6f8389",
+        help="The last commit that should be analyzed.")
+
+    ARGS = PARSER.parse_args()
+    REPOPATH = ARGS.repository
+    BRANCH = ARGS.branch
+    PAIRFILE = ARGS.file
+    RESFILE = ARGS.resfile
+    FIGFILE = ARGS.figfile
+    LAST_COMMIT = ARGS.commit
+
+    LABELS = get_labels(REPOPATH, BRANCH, PAIRFILE, LAST_COMMIT)
+
+    save_labels(LABELS, RESFILE)
+
+    save_label_distribution(REPOPATH, BRANCH, LABELS, FIGFILE)
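
A minimal sketch, with made-up hashes, of the pair file produced by the SZZ step and of the labelling rule in get_labels: a commit is labelled 1 exactly when it appears as the bug-introducing half of some (fix, introducer) pair.

pairs = [["fix1", "bad1"], ["fix2", "bad1"], ["fix3", "bad2"]]
unique_introducers = set(p[1] for p in pairs)

for commit in ["bad1", "bad2", "clean1"]:
    print(commit, "1" if commit in unique_introducers else "0")
# bad1 1, bad2 1, clean1 0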

+ 83 - 0
code/data_assembler/assemble_purpose_features.py

@@ -0,0 +1,83 @@
+"""
+Script to extract the purpose features.
+"""
+__author__ = "Oscar Svensson"
+__copyright__ = "Copyright (c) 2018 Axis Communications AB"
+__license__ = "MIT"
+
+import csv
+import re
+
+from argparse import ArgumentParser
+from tqdm import tqdm
+from pygit2 import Repository, GIT_SORT_TOPOLOGICAL, GIT_SORT_REVERSE
+
+PATTERNS = [r"bug", r"fix", r"defect", r"patch"]
+
+def is_fix(message):
+    """
+    Check if a message contains any of the fix patterns.
+    """
+    for pattern in PATTERNS:
+        if re.search(pattern, message):
+            return True
+    return False
+
+def get_purpose_features(repo_path, branch):
+    """
+    Extract the purpose features for each commit.
+    """
+    repo = Repository(repo_path)
+    head = repo.references.get(branch)
+
+    commits = list(
+        repo.walk(head.target, GIT_SORT_TOPOLOGICAL | GIT_SORT_REVERSE))
+
+    features = []
+    for _, commit in enumerate(tqdm(commits)):
+        message = commit.message
+
+        fix = 1.0 if (is_fix(message)) else 0.0
+
+        feat = []
+        feat.append(str(commit.hex))
+        feat.append(str(fix))
+        features.append(feat)
+    return features
+
+def save_features(purpose_features, path="./results/purpose_features.csv"):
+    """
+    Save the purpose features to a csv file.
+    """
+    with open(path, 'w') as csv_file:
+        writer = csv.writer(csv_file)
+        writer.writerow(["commit", "purpose"])
+        for row in purpose_features:
+            if row:
+                writer.writerow([row[0], row[1]])
+
+
+if __name__ == "__main__":
+    PARSER = ArgumentParser(
+        description="Utility to extract purpose features from" +
+        " a repository or a single commit.")
+
+    PARSER.add_argument(
+        "--repository",
+        "-r",
+        type=str,
+        default="./repos/jenkins",
+        help="Path to local git repository.")
+    PARSER.add_argument(
+        "--branch",
+        "-b",
+        type=str,
+        default="refs/heads/master",
+        help="Which branch to use.")
+
+    ARGS = PARSER.parse_args()
+    REPOPATH = ARGS.repository
+    BRANCH = ARGS.branch
+
+    FEATURES = get_purpose_features(REPOPATH, BRANCH)
+    save_features(FEATURES)
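
A quick check of the is_fix patterns above on two sample messages (the search is case sensitive, so it is the lower-case "bug" that matches in the first one).

import re

PATTERNS = [r"bug", r"fix", r"defect", r"patch"]

def is_fix(message):
    # Same logic as above: return True if any pattern occurs in the message.
    return any(re.search(pattern, message) for pattern in PATTERNS)

print(is_fix("Fixed a nasty bug in the queue"))   # True
print(is_fix("Add a new CLI option"))             # False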

+ 257 - 0
code/data_assembler/general_data.py

@@ -0,0 +1,257 @@
+"""
+Script that extracts general data about a git repository.
+"""
+__author__ = "Oscar Svensson"
+__copyright__ = "Copyright (c) 2018 Axis Communications AB"
+__license__ = "MIT"
+
+import csv
+import json
+import re
+
+from argparse import ArgumentParser
+from datetime import datetime
+from numpy import median, mean
+from pygit2 import Repository
+
+def has_added(message):
+    """
+    Function to check if a message contains any word that indicates an addition of lines of code.
+    """
+    if (re.search(
+            r"add(?:ed)*|implement(?:ed)*|introduce(?:d)*|improve(?:ment|ments)*",
+            message.lower())):
+        return True
+    return False
+
+
+def has_updated(message):
+    """
+    Function to check if a message contains any word that indicates an update of lines of code.
+    """
+    if (re.search(
+            r"update[d]*|mov(?:ing|e|ed)|refactor|modifying|switching|deprecate(?:d)*|"+
+            "clean(?:up|ed)*",
+            message.lower())):
+        return True
+    return False
+
+
+def has_bugfix(message):
+    """
+    Function to check if a message contains any word that indicates a bug fix.
+    """
+    if (re.search(r"jenkins[-]?\d|hudson[-]?\d|fix(?:es|ed)*|solve(?:d)*",
+                  message.lower())):
+        return True
+    return False
+
+
+def has_issue(message):
+    """
+    Function to check if a message contains any word that indicates an issue.
+    """
+    if re.search(r"issue number", message.lower()):
+        return True
+    return False
+
+
+def save_commit_messages(commits, repo):
+    """
+    Function to run some statistics on a number of commits in a git repository.
+    """
+
+    mapping = {}
+
+    added = set()
+    updated = set()
+    bugfix = set()
+    issue_set = set()
+    for commit in commits:
+        message = commit.message
+        mapping[commit.hex] = commit.message
+
+        if has_added(message):
+            added.add(commit.hex)
+        elif has_updated(message):
+            updated.add(commit.hex)
+        elif has_bugfix(message):
+            bugfix.add(commit.hex)
+        elif has_issue(message):
+            issue_set.add(commit.hex)
+
+    """
+    Dumps all found commits to a file.
+    """
+    with open("./results/commit_messages.json", 'w') as output:
+        json.dump(mapping, output)
+
+    overall = set()
+    overall.update(added)
+    overall.update(updated)
+    overall.update(bugfix)
+    overall.update(issue_set)
+
+    all_messages = set([commit.hex for commit in commits])
+    not_defined = {c: repo.get(c).message for c in all_messages - overall}
+
+    print("Number of commits that added something: {} ({}%)".format(
+        len(added),
+        float(len(added)) / len(all_messages)))
+    print("Number of commits that updated something: {} ({}%)".format(
+        len(updated),
+        float(len(updated)) / len(all_messages)))
+    print("Number of commits that fixed a bug: {} ({}%)".format(
+        len(bugfix),
+        float(len(bugfix)) / len(all_messages)))
+    print("Number of commits that contained an issue number: {} ({}%)".format(
+        len(issue_set),
+        float(len(issue_set)) / len(all_messages)))
+
+    """
+    Dumps all undefined commits to a file as well.
+    """
+    with open("./results/undefined_commit_messages.json", 'w') as output:
+        json.dump(not_defined, output)
+    print("Number of undefined commits: {} ({}%)".format(
+        len(not_defined),
+        float(len(not_defined)) / len(all_messages)))
+
+
+def get_average_time_issues(issue_path):
+    """
+    Function to get the average time between issue creation and resolution.
+    """
+    issues_dict = {}
+    with open(issue_path, 'r') as inp:
+        issues_dict = json.load(inp)
+
+    days = []
+
+    lowest = (float('Inf'), 0, 0)
+    highest = (0, None, None)
+
+    for _, dates in issues_dict.items():
+        creationdate = dates['creationdate']
+        resolutiondate = dates['resolutiondate']
+
+        creationdate = datetime.strptime(
+            creationdate, "%Y-%m-%d %H:%M:%S %z").replace(tzinfo=None)
+        resolutiondate = datetime.strptime(
+            resolutiondate, "%Y-%m-%d %H:%M:%S %z").replace(tzinfo=None)
+
+        days.append(((resolutiondate - creationdate).days))
+        if days[-1] > highest[0]:
+            highest = (days[-1], creationdate, resolutiondate)
+        if days[-1] < lowest[0]:
+            lowest = (days[-1], creationdate, resolutiondate)
+
+    print("Lowest: {}".format(lowest))
+    print("Highest: {}".format(highest))
+    print("Mean time between resolution date and commit date: {} days".format(
+        mean(days)))
+
+
+def get_general_data(repo_path, issue_path, labels, pairs):
+    """
+    Function to get general statistics for a git repository.
+    """
+    repo = Repository(repo_path)
+
+    issue_list = {}
+    labeled_commits = {}
+
+    with open(labels, 'r') as inp:
+        reader = csv.reader(inp)
+        next(reader)
+
+        for commit in reader:
+            labeled_commits[commit[0]] = float(commit[1])
+
+    print("Number of commits: {}".format(len(labeled_commits)))
+    print("Number of found bugintroducing commits: {}".format(
+        len([
+            labeled_commits[f] for f in labeled_commits
+            if labeled_commits[f] > 0
+        ])))
+
+    pair_map = []
+    with open(pairs, 'r') as inp:
+        pair_map = json.load(inp)
+
+    total_fixes = set([p[0] for p in pair_map])
+    print("Total number of fixes used: {}".format(len(total_fixes)))
+
+    bug_labeled_commits = set(
+        [l for l in labeled_commits if labeled_commits[l] > 0])
+
+    fixes_in_bugs = set(bug_labeled_commits).intersection(total_fixes)
+    print("Total number of fixes in bugs found : {}".format(
+        len(fixes_in_bugs)))
+
+    time_diff = []
+    for pair in pair_map:
+        fix = repo.get(pair[0])
+        bug = repo.get(pair[1])
+
+        fix_date = datetime.fromtimestamp(fix.commit_time).replace(tzinfo=None)
+        bug_date = datetime.fromtimestamp(bug.commit_time).replace(tzinfo=None)
+
+        diff = (fix_date - bug_date).days
+
+        time_diff.append(diff)
+    years, days = divmod(float(mean(time_diff)), 365.25)
+    myears, mdays = divmod(float(median(time_diff)), 365.25)
+
+    print(
+        "Average time between bug introduction and fix: {} years and {} days".
+        format(years, days))
+    print("Median time between bug introduction and fix: {} years and {} days".
+          format(myears, mdays))
+
+    with open(issue_path, 'r') as inp:
+        issue_list = json.load(inp)
+
+    print("Total number of fixes found: {}".format(len(issue_list)))
+
+    save_commit_messages([repo.get(c) for c in bug_labeled_commits], repo)
+    get_average_time_issues(issue_path)
+
+
+if __name__ == "__main__":
+    PARSER = ArgumentParser(
+        description="Utility to extract purpose features from" +
+        " a repository or a single commit.")
+
+    PARSER.add_argument(
+        "--repository",
+        "-r",
+        type=str,
+        default="./repos/jenkins",
+        help="Path to local git repository.")
+    PARSER.add_argument(
+        "--issues",
+        "-i",
+        type=str,
+        default="../szz/issue_list_saved.json",
+        help="Issues to analyze.")
+    PARSER.add_argument(
+        "--labels",
+        "-l",
+        type=str,
+        default="./labels.csv",
+        help="Found labels.")
+    PARSER.add_argument(
+        "--fixinpairs",
+        "-fp",
+        type=str,
+        default="./fix_and_introducers_pairs.json",
+        help="File with fix and introducing pair commits.")
+
+    ARGS = PARSER.parse_args()
+    REPO_PATH = ARGS.repository
+    ISSUES = ARGS.issues
+    LABELS = ARGS.labels
+    PAIRS = ARGS.fixinpairs
+
+    get_general_data(REPO_PATH, ISSUES, LABELS, PAIRS)

+ 7 - 0
code/data_assembler/scripts/analyse_commit

@@ -0,0 +1,7 @@
+#!/bin/sh
+cd /root/repo
+git checkout $1
+git log --all --numstat --date=short --pretty=format:'--%h--%ad--%aN' --no-renames >> /tmp/$1_log.log
+mkdir /root/results/$1
+java -jar /usr/src/code-maat/app-standalone.jar -l /tmp/$1_log.log -o /root/results/$1/$1_coupling.log.res -c git2 -a coupling
+java -jar /usr/src/code-maat/app-standalone.jar -l /tmp/$1_log.log -o /root/results/$1/$1_revisions.log.res -c git2 -a revisions