Browse Source

initial grading-on-a-curve commit

Justin Tracey 8 months ago
parent
commit
afdd747eb6
37 changed files with 3461 additions and 14 deletions
  1. 16 0
      README.md
  2. 183 0
      code/author-identities.py
  3. 21 0
      code/bugcounts.sh
  4. 21 0
      code/compare_results-v2.sh
  5. 25 0
      code/compare_results.sh
  6. 33 0
      code/fetch_bugzilla_bugs/fetch-all-approval-reqs.sh
  7. 74 0
      code/fetch_bugzilla_bugs/fetch-bugzilla.py
  8. 150 0
      code/fetch_bugzilla_bugs/fetch_bugs.sh
  9. 496 0
      code/fetch_bugzilla_bugs/filter.sh
  10. 139 0
      code/fetch_bugzilla_bugs/find_bugzilla_fixes.py
  11. 24 0
      code/fetch_bugzilla_bugs/get-approval-request-comment.py
  12. 79 0
      code/fetch_bugzilla_bugs/get_bugzilla_patches.py
  13. 5 6
      code/fetch_jira_bugs/git_log_to_array.py
  14. 25 0
      code/learning_curves/genExp.sh
  15. 25 0
      code/learning_curves/gen_learning_curve_table.sh
  16. 183 0
      code/learning_curves/grid_search.sh
  17. 172 0
      code/learning_curves/learningCurve.py
  18. 240 0
      code/learning_curves/learningCurve_gradientDescent.py
  19. 91 0
      code/learning_curves/model-vs-model.py
  20. 95 0
      code/learning_curves/model-vs-real.py
  21. 67 0
      code/learning_curves/plot-experience.py
  22. 68 0
      code/learning_curves/plot_T1s.py
  23. 37 0
      code/learning_curves/plot_rust_data.sh
  24. 147 0
      code/learning_curves/vcclib.py
  25. 249 0
      code/reproduceResults.sh
  26. 3 0
      code/szz/build.gradle
  27. 62 0
      code/szz/src/main/java/diff/CPPFileExtension.java
  28. 18 0
      code/szz/src/main/java/diff/TokenComparator.java
  29. 18 0
      code/szz/src/main/java/diff/TokenSequence.java
  30. 1 0
      code/szz/src/main/java/heuristics/SimpleBugIntroducerFinder.java
  31. 184 8
      code/szz/src/main/java/parser/GitParser.java
  32. 10 0
      data/bugzilla-data/template.json
  33. 71 0
      data/hand-annotated/c++-blame.csv
  34. 66 0
      data/hand-annotated/c++.csv
  35. 338 0
      data/hand-annotated/relevant-c++
  36. 11 0
      data/hand-annotated/relevant-dirs.csv
  37. 14 0
      data/hand-annotated/rust-blame.csv

+ 16 - 0
README.md

@@ -0,0 +1,16 @@
+This repository contains the scripts and data necessary to reproduce the results of the "Grading on a Curve: How Rust can Facilitate New Contributors while Decreasing Vulnerabilities" paper.
+The results should be reproducible by installing the requirements, cloning the repo, and running the `reproduceResults.sh` script in the `code` directory.
+Note that this will take some time, as it requires cloning seven additional git repositories (one of which is the very large Firefox git mirror), fetching data from the relatively slow Bugzilla API, and running gradient descent on a non-trivial amount of data.
+The exact amount of time will depend on your network connection and CPU, but running it overnight might be a good idea.
+
+The software requirements for running the `reproduceResults.sh` script are as follows:
+
+ - git
+ - bash
+ - jq
+ - gradle
+ - java and javac
+ - python3 with the following modules:
+   - numpy scipy matplotlib urllib3
+
+The `reproduceResults.sh` script is intended to be fairly human-readable for a shell script, and its source can be used to find descriptions of what each step is doing and the role of each directory and file.

+ 183 - 0
code/author-identities.py

@@ -0,0 +1,183 @@
+import sys
+import subprocess
+import re
+
+
class Author:
    """One author identity: the sets of names, emails, and full
    "Name <email>" ids observed for that person, plus an optional
    preferred (main) id taken from an existing .mailmap entry."""

    # this "banned" list is just to prevent matching,
    # full IDs that contain them will still be properly rewritten
    banned = [
        None, "", "None", "none@none", "unknown",  # common null identities
        "bugs", "bugzilla", "dev",  # common context-specific identities
        # mononyms with likely collisions
        "Adam", "Alex", "alex", "Ben", "Bob", "Daniel", "Dmitry", "Ian",
        "james", "Jason", "jason", "John", "kevin", "Martin", "martin",
        "Michael", "Peter", "Petru", "Pranav", "Tyler"
    ]

    def __init__(self, names=None, emails=None, full_ids=None, main_id=None):
        # Use None instead of mutable default arguments ([]): mutable
        # defaults are shared between every call of __init__.
        names = names if names is not None else ()
        emails = emails if emails is not None else ()
        full_ids = full_ids if full_ids is not None else ()
        self.names = {name for name in names if name not in Author.banned}
        self.emails = {email for email in emails
                       if email not in Author.banned}
        # old mozilla commits used % instead of @ a lot
        for email in set(self.emails):
            if '%' in email:
                self.emails.add(email.replace('%', '@'))
        self.full_ids = set(full_ids)
        self.main_id = main_id

    def match_identities(self, other):
        """Return True if this author shares any name or email with other."""
        if self.names & other.names:
            return True
        return bool(self.emails & other.emails)

    def merge_identities(self, other):
        """Fold other's names/emails/ids into this author (other unchanged)."""
        self.names |= other.names
        self.emails |= other.emails
        self.full_ids |= other.full_ids
        # keep our own preferred id if we already have one
        if other.main_id and not self.main_id:
            self.main_id = other.main_id

    def generate_mailmap(self):
        """Print mailmap lines mapping every secondary id to one canonical id.

        Preference order for the canonical id: the explicit main_id, then the
        shortest id that has both a name and an email, then the shortest id
        with a name, then a synthesized placeholder.
        """
        full_ids = sorted(self.full_ids)

        if self.main_id:
            main_id = self.main_id
            print(main_id)
        else:
            ids_with_names = [full_id for full_id in full_ids
                              if full_id[0] != '<']
            if ids_with_names:
                ids_with_emails = [full_id for full_id in ids_with_names
                                   if '<>' not in full_id]
                if ids_with_emails:
                    main_id = min(ids_with_emails, key=len)
                else:
                    main_id = min(ids_with_names, key=len)
            else:
                # no id carries a name; synthesize a placeholder name.
                # NOTE(review): this produces "none@none<email>" with no space
                # before '<' — confirm git's mailmap parser accepts that form.
                main_id = "none@none" + min(full_ids, key=len)
        secondary_ids = [full_id for full_id in full_ids if full_id != main_id]
        for full_id in secondary_ids:
            print("{} {}".format(main_id, full_id))
+
+
class Authors:
    """Registry of Author objects, indexed by every known name and email."""

    def __init__(self, author_set=None):
        # Bug fix: the default used to be author_set=set(), a mutable default
        # that is shared (and mutated, via self.authors.add/remove below)
        # across *every* Authors() instance. Use None and build fresh.
        self.names = {}
        self.emails = {}
        self.authors = set() if author_set is None else author_set
        # iterate over a snapshot: add_author()/dedup() mutate self.authors
        for author in list(self.authors):
            self.add_author(author)

    def add_author(self, author):
        """Merge an Author into the registry, returning the canonical
        Author object it ended up unified with."""
        existing_names = author.names & self.names.keys()
        existing_emails = author.emails & self.emails.keys()
        if existing_names:
            name = existing_names.pop()
            self.names[name].merge_identities(author)
            author = self.names[name]
        if existing_emails:
            email = existing_emails.pop()
            self.emails[email].merge_identities(author)
            author = self.emails[email]
        for name in author.names - self.names.keys():
            self.names[name] = author
        for email in author.emails - self.emails.keys():
            self.emails[email] = author
        self.authors.add(author)
        self.dedup(author)

        return author

    def add_edge_cases(self):
        # Edge cases that can't be cleanly handled generically
        # NOTE(review): raises KeyError if these names were never registered
        self.names["Jason Orendorff"].full_ids.add("jason <none@none>")
        self.names["Glenn Watson"].full_ids.add("dev <dev@devs-MacBook-Pro.local>")

    def dedup(self, author):
        """Point every name/email of author back at author itself, dropping
        any superseded Author objects from the registry."""
        for name in author.names:
            other = self.names[name]
            if author != other:
                self.names[name] = author
                if other in self.authors:
                    self.authors.remove(other)
        for email in author.emails:
            other = self.emails[email]
            if author != other:
                self.emails[email] = author
                if other in self.authors:
                    self.authors.remove(other)

    def scan_and_merge(self, author, identities):
        """Merge author into the first matching member of identities
        (recursing so transitive matches collapse too), else add it as new."""
        for other in identities:
            if author != other and author.match_identities(other):
                other.merge_identities(author)
                self.scan_and_merge(other, identities)
                return
        identities.add(author)
+
+
def build_full_id(name, email):
    """Render a git-style "Name <email>" identity; either part may be None,
    in which case it collapses (e.g. (None, None) -> "<>")."""
    name_part = "" if name is None else name + " "
    email_part = "" if email is None else email
    return name_part + "<" + email_part + ">"
+
+
# --- script body: build a consolidated .mailmap for one repository ---
# sys.argv[1] is the path to a git checkout; its existing .mailmap (if any)
# and its full commit history are mined for author identities, which are
# merged and printed as mailmap lines on stdout.

authors = Authors()

# first, include the existing .mailmap, if there is one
# NOTE(review): this raises FileNotFoundError when the repo has no .mailmap —
# confirm every repo this is run against ships one.
with open(sys.argv[1] + '/.mailmap') as f:
    for line in f.readlines():
        # strip trailing '#' comments; a comment-only line becomes ""
        line = line.split('#')[0]
        if not len(line) > 0:
            continue
        # In English: capture a string we're calling name1, which does not
        # contain the character to start an email, followed by a non-zero
        # number of whitespace characters, but let all of this be optional.
        name1_regex = r"(?:(?P<name1>[^<]+)\s+)?"
        # In English: capture a string we're calling email1, which is preceded
        # by a < and followed by a >, containing no >. Non-optional.
        email1_regex = r"<(?P<email1>[^>]+)>"
        # In English: same as the name1 regex, but with the non-zero whitespace
        # preceding instead of following (and called name2).
        name2_regex = r"(?:\s+(?P<name2>[^<]*[^<\s]))?"
        # In English: same as email1 regex, but with non-zero whitespace
        # preceding the matched email, and all optional (and called email2).
        email2_regex = r"(?:\s+<(?P<email2>[^>]+)>)?"
        d = re.match(name1_regex + email1_regex + name2_regex + email2_regex,
                     line).groupdict()
        # drop unmatched groups and trim whitespace from the matched ones
        d = {k: d[k].strip() for k in d if d[k] is not None}
        # someone took the time to add this to the .mailmap, so we should
        # respect their preferred name, rather than treat them all the same
        main_id = build_full_id(d.get("name1"), d.get("email1"))
        if d.get("email2") is not None:
            full_ids = {build_full_id(d.get("name2"), d.get("email2"))}
        else:
            full_ids = set()
        author = Author({d.get("name1"), d.get("name2")},
                        {d.get("email1"), d.get("email2")},
                        full_ids, main_id)
        authors.add_author(author)

# one "Name<TAB>email" line per distinct (mailmap-normalized) author in the
# history; the format string contains a literal tab separator
command = "git log --full-history --no-merges --use-mailmap "\
    "--format='format:%aN	%aE' -- {} | sort | uniq".format(sys.argv[1])
log = subprocess.check_output(command, shell=True,
                              universal_newlines=True).rstrip()


for author in log.splitlines():
    # split on the literal tab emitted by the git format string above
    name, email = author.split('	')
    full_id = build_full_id(name, email)
    author = Author({name}, {email}, {full_id})
    authors.add_author(author)

for author in authors.authors:
    author.generate_mailmap()

+ 21 - 0
code/bugcounts.sh

@@ -0,0 +1,21 @@
#!/bin/sh
# Count, per project, the total number of fetched security issues and the
# number of issues that were blamed to an introducing commit.
# Output: one CSV line per project: "<project>,<all>,<blamed>".
#
# $1: space-separated list of project names
# $2: directory holding <project>-issues/res0.json files
# $3: directory holding the per-project blame tables (CSV, issue id in col 2)
projects="$1"
issuesDir="$2"
tablesDir="$3"

for p in $projects; do
    f="$issuesDir/$p-issues/res0.json"
    #echo $f
    #jq '.issues | length' "$f"
    if [ "$(jq '.issues | length' "$f")" -gt 0 ]; then
        # issue keys come out of jq quoted and '-'-prefixed: strip both
        allIssues="$(jq '.issues | .[] | .key' "$f" | tr -d '"-' | sort | uniq)"
        blamedIssues="$(cut -d, -f2 "$tablesDir/$p" | sort | uniq)"
        # NOTE(review): the next two echoes interleave raw issue lists with
        # the CSV output — they look like debug leftovers; confirm intended
        echo $allIssues
        echo $blamedIssues
        # grep -v '^$' so an empty list counts as 0, not 1
        allIssuesCount=$(echo "$allIssues" | grep -v '^$' | wc -l)
        blamedIssuesCount=$(echo "$blamedIssues" | grep -v '^$' | wc -l)
        echo "$p,$allIssuesCount,$blamedIssuesCount"
    else
        echo "$p,0,0"
    fi
done

+ 21 - 0
code/compare_results-v2.sh

@@ -0,0 +1,21 @@
#!/usr/bin/env bash
# Bug fix: the shebang was "#!/bin/env bash" — env lives in /usr/bin on
# virtually every system, so the script failed to start; the portable
# form is /usr/bin/env.
#
# For every hand-matched bug of a project, compare the commits SZZ blamed
# against the hand-annotated ground truth.
# Output: one CSV line per bug: "<bug>,<szz count>,<matches>,<truth count>".
#
# $1: project name (first, tab-separated column of the hand-matched file)
# $2: JSON file mapping "-<bug>" keys to the fixing commit hash
# $3: SZZ results file (JSON array of [fix, blamed] pairs)
# $4: hand-matched TSV file (ground-truth commits, comma-separated, col 5)
# $5: directory of bug -> introducing-commit files (currently unused here)

project="$1"
issuesFile="$2"
resultsFile="$3"
handMatched="$4"
bugToIntroDir="$5"

# the grep patterns below contain a literal tab after the project/bug fields
bugs=$(grep "^$project	" "$handMatched"| cut -f2)
for bug in $bugs ; do
    # the commit recorded as fixing this bug
    fix=$(jq -r ".[\"-$bug\"].hash" "$issuesFile")
    # every commit SZZ blamed for that fix
    blamed=$(jq "unique | .[] | select(.[0] == \"$fix\")[1]" "$resultsFile" | tr -d '"')
    # hand-annotated introducing commits for this bug
    groundTruth=$(grep -E "^$project	$bug" "$handMatched" | cut -f 5 | tr , ' ')
    match=0
    for commit in $blamed ; do
        if grep -q "$commit" <(echo $groundTruth) ; then ((match++)) ; fi
    done
    szz=$(echo "$blamed" | wc -w)
    form=$(echo "$groundTruth" | wc -w)
    echo "$bug,$szz,$match,$form"
done

+ 25 - 0
code/compare_results.sh

@@ -0,0 +1,25 @@
#!/bin/bash
# Compare SZZ's blamed commits against Bugzilla-derived ground truth for
# every fix commit in the results file, and record introducing commits.
# Output: "<bug>,<fix>,<szz count>,<matches>,<truth count>" for each bug
# that has ground truth; bugs without it just get their blamed commits
# written to $bugToIntroDir.
#
# $1: JSON file mapping issue keys to fixing commit hashes
# $2: SZZ results file (JSON array of [fix, blamed] pairs)
# $3: bugzilla data dir (contains bugs/<bug>.json ground-truth files)
# $4: output directory for bug -> introducing-commit files

issuesFile="$1"
resultsFile="$2"
bugzillaDir="$3/bugs/"
bugToIntroDir="$4"

fixes=$(jq 'map(.[0]) | unique' "$resultsFile" | grep -Eo '[[:alnum:]]+')
for fix in $fixes ; do
    # Bug fix: the tr set was the unquoted [0-9], which the shell treats as
    # a glob — it would expand to a matching single-character filename in
    # the CWD if one existed. Quote it (plain '0-9' is the intended set).
    bug=$(jq "to_entries[] | select(.value.\"hash\" == \"$fix\") | .key" "$issuesFile" | tr -dc '0-9')
    blamed=$(jq "unique | .[] | select(.[0] == \"$fix\")[1]" "$resultsFile" | tr -d '"')
    if [ -f "$bugzillaDir/$bug.json" ] ; then
        groundTruth=$(jq '.inducedBy | map(.revisions)[]' "$bugzillaDir/$bug.json" | grep -Eo '[[:alnum:]]+')
        match=0
        for commit in $blamed ; do
            if grep -q "$commit" <(echo $groundTruth) ; then ((match++)) ; fi
        done
        szz=$(echo "$blamed" | wc -w)
        form=$(echo "$groundTruth" | wc -w)
        echo "$bug,$fix,$szz,$match,$form"
        echo "$groundTruth" > "$bugToIntroDir/$bug"
    else
        echo "$blamed" > "$bugToIntroDir/$bug"
    fi
done

+ 33 - 0
code/fetch_bugzilla_bugs/fetch-all-approval-reqs.sh

@@ -0,0 +1,33 @@
# For every bug in the issues file, look for approval-request comments
# naming the bug(s) that caused it, and build a ground-truth JSON file
# ($bugzillaDir/bugs/<bug>.json) mapping each causing bug to the git
# revisions whose commit messages match its patch titles.
#
# $1: JSON file whose keys are bug numbers
# $2: path to the Firefox git clone
# $3: bugzilla data directory (holds template.json; bugs/ is populated)
# $4: path to this project's code directory
issuesFile=$1
firefoxDir=$2
bugzillaDir=$3
codeDir=$4

gitDir="$firefoxDir/.git"

mkdir -p "$bugzillaDir/bugs"

bugs=$(jq 'keys' "$issuesFile" | grep -Eo '[0-9]+')

for bug in $bugs; do
    # bug numbers cited in this bug's approval-request comment, if any
    causes=$(python3 "$codeDir/fetch_bugzilla_bugs/get-approval-request-comment.py" $bug)
    if [ "$causes" ] ; then
        # keep only digits and separating spaces
        causes=$(echo $causes | tr -dc '0-9 ')
        # grep -v for long indentation because it's easier if we construct the "inducedBy" field ourselves
        grep -v '        ' "$bugzillaDir/template.json" > "$bugzillaDir/bugs/$bug.json"
        # fill in the bug's own id in the template
        sed -i "s/^    \"id\": \"\",/    \"id\": \"$bug\",/" "$bugzillaDir/bugs/$bug.json"
        comma=""
        for cause in $causes ; do
            causeRevs=""
            # map each causing bug's patch titles to commit hashes in the git
            # mirror; result is a comma-separated list of quoted hashes
            causeRevs=$(python3 "$codeDir/fetch_bugzilla_bugs/get_bugzilla_patches.py" 'titles' $cause |
                            while read -r line ; do
                                rev=$(git --git-dir "$gitDir" log --grep="$line" -F --pretty='tformat:"%H"' | tr '\n' ',' | head -c -1)
                                if [ "$rev" ] ; then
                                    echo "$rev"
                                fi
                            done | tr '\n' ',' | head -c -1)
            # splice an inducedBy entry into the JSON just before the
            # closing ']' of the array
            sed -i "s/^    ]/$comma        {\n            \"id\": \"$cause\",\n            \"revisions\": [$causeRevs]\n        }\n    ]/" "$bugzillaDir/bugs/$bug.json"
            comma=","
        done
    fi
done

+ 74 - 0
code/fetch_bugzilla_bugs/fetch-bugzilla.py

@@ -0,0 +1,74 @@
+""" Fetch issues that match given jql query """
+__author__ = "Kristian Berg"
+__copyright__ = "Copyright (c) 2018 Axis Communications AB"
+__license__ = "MIT"
+
+from urllib.parse import quote
+from urllib3 import PoolManager
+
+import json
+import os
+import argparse
+import io
+
+
def translate_json(j):
    """Rewrite a Bugzilla REST response in place into the JIRA-like shape
    the downstream tooling expects: 'bugs' becomes 'issues', each id becomes
    a '-'-prefixed 'key', and the timestamps move under 'fields'."""
    j['issues'] = j.pop('bugs')
    for issue in j['issues']:
        issue['key'] = '-' + str(issue.pop('id'))
        created = issue.pop('creation_time').replace('Z', ' +0000')
        resolved = issue.pop('cf_last_resolved').replace('Z', '+0000')
        issue['fields'] = {'created': created,
                           'resolutiondate': resolved}
+
+
def fetch(keys, bugzilla_project_name):
    """Fetch fixed bugs matching the given Bugzilla query and write them,
    translated to the JIRA-like layout, in pages of up to 1000 issues to
    issues/res<offset>.json.

    keys -- list of 'key=value' strings appended to the query
    bugzilla_project_name -- the Bugzilla domain, e.g. bugzilla.mozilla.org
    """
    # Bugzilla query with necessary fields for fixed bugs
    q = 'include_fields=id,creation_time,cf_last_resolved' \
        + '&resolution=fixed'
    for key in keys:
        q += '&' + quote(key, safe='=')

    start_at = 0

    # max_results parameter is capped at 1000, specifying a higher value will
    # still return only the first 1000 results
    max_results = 1000

    os.makedirs('issues/', exist_ok=True)
    # NOTE(review): startAt/maxResults are JIRA-style parameter names; the
    # Bugzilla REST API documents offset/limit for pagination — confirm the
    # server actually honors these.
    request = 'https://' + bugzilla_project_name + '/rest/bug?' + q \
        + '&startAt={}&maxResults=' + str(max_results)

    http = PoolManager()

    print('Progress: | = ' + str(max_results) + ' issues')
    while True:
        rurl = request.format(start_at)
        print(rurl)
        # Bug fix: the GET must use the formatted page URL (rurl); the old
        # code sent `request`, the unformatted template with a literal '{}'
        # in it, so pagination never advanced.
        res = http.request('GET', rurl, retries=10)
        j = json.loads(res.data)
        translate_json(j)
        with io.open('issues/res' + str(start_at) + '.json', 'w',
                     encoding="utf-8") as f:
            json.dump(j, f)
        # a short page means we have drained the results
        if len(j['issues']) < max_results:
            break
        print('|', end='', flush=True)  # flush=True, not the string 'True'
        start_at += max_results

    print('\nDone!')
+
+
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="""Fetch issues from a Bugzilla REST API.
                                                 """)
    parser.add_argument('--bugzilla', type=str,
                        help="The name of the Bugzilla repository of the project (i.e. the domain).")
    # default=[]: without it argparse leaves key_value as None when no
    # --key-value flag is given, and fetch() would crash iterating None
    parser.add_argument('--key-value', type=str, action='append', default=[],
                        help="Custom key-value string for Bugzilla query (MUST be a string of the form 'key=value'). You can find some of the options here: https://bugzilla.readthedocs.io/en/latest/api/core/v1/bug.html#search-bugs")
    #parser.add_argument('--products', type=str,
    #                    help="Comma-separated list of Products to query in (shorthand for multiple '--key-value product=Foo' options).")

    args = parser.parse_args()
    keys = args.key_value  # + ['product=' + p for p in args.products.split(',')]
    bugzilla_project_name = args.bugzilla
    fetch(keys, bugzilla_project_name)

+ 150 - 0
code/fetch_bugzilla_bugs/fetch_bugs.sh

@@ -0,0 +1,150 @@
#!/bin/sh

## Fetches security bugs from Bugzilla.
## These queries are particular; modifications are made by editing this file,
## the only arguments are the path to this project's code (to call a helper
## script), and the path of where the issues should be output.

codeDir="$1"
issuesDir="$2"
fetchScript="$codeDir/fetch_bugzilla_bugs/fetch-bugzilla.py"
# queries whose results still need filter.sh applied land here
toFilter="$issuesDir/to_filter/"
#products='Core,Firefox,Firefox for Android,Firefox for iOS,Firefox OS,Focus,Focus-iOS,NSS,Privacy,Servo'
securityLevels='keywords=sec-critical, sec-high, sec-moderate, sec-low'

mkdir -p "$issuesDir"
mkdir -p "$toFilter"

# Each block below runs one Bugzilla query; the fetch script always writes
# to ./issues, which is then moved (or copied, when one query feeds several
# project lists) into place.

echo "mp4 parsing (stagefright)"
python3 "$fetchScript" \
        --bugzilla bugzilla.mozilla.org \
        --key-value "$securityLevels" \
        --key-value 'keywords_type=anywords' \
        --key-value 'short_desc=stagefright' \
        --key-value 'short_desc_type=allwordssubstr' \
        --key-value 'chfieldvalue=FIXED' \
        --key-value 'chfield=resolution' \
        --key-value 'chfieldto=2018-01-09'
rm -rf "$issuesDir/stagefright-issues" && mv issues "$issuesDir/stagefright-issues"

echo "unicode encoder (uconv)"
python3 "$fetchScript" \
        --bugzilla bugzilla.mozilla.org \
        --key-value "$securityLevels" \
        --key-value 'keywords_type=anywords' \
        --key-value 'component=Internationalization' \
        --key-value 'longdesc=uconv' \
        --key-value 'longdesc_type=allwordssubstr' \
        --key-value 'chfieldvalue=FIXED' \
        --key-value 'chfield=resolution' \
        --key-value 'chfieldto=2017-06-13'
rm -rf "$issuesDir/uconv-issues" && mv issues "$issuesDir/uconv-issues"

echo "CSS"
python3 "$fetchScript" \
        --bugzilla bugzilla.mozilla.org \
        --key-value "$securityLevels" \
        --key-value 'keywords_type=anywords' \
        --key-value 'component=CSS Parsing and Computation' \
        --key-value 'chfieldvalue=FIXED' \
        --key-value 'chfield=resolution' \
        --key-value 'chfieldto=2017-11-14' \
        --key-value 'o1=anywords' --key-value 'n1=1' --key-value 'f1=short_desc' --key-value 'v1=stylo'
rm -rf "$toFilter/css-issues" && mv issues "$toFilter/css-issues"

# component interaction (XPCOM)
## This doesn't seem valid, so removing until/unless it's clear there's
## something comparable.
#python3 "$fetchScript" \
#        --bugzilla bugzilla.mozilla.org \
#        --key-value "$securityLevels" \
#        --key-value 'keywords_type=anywords' \
#        --key-value 'component=XPCOM'
#rm -rf "$issuesDir/xpcom-issues" && mv issues "$issuesDir/xpcom-issues"

echo "audio (cubeb)"
python3 "$fetchScript" \
        --bugzilla bugzilla.mozilla.org \
        --key-value "$securityLevels" \
        --key-value 'keywords_type=anywords' \
        --key-value 'component=Audio/Video: cubeb' \
        --key-value 'chfieldvalue=FIXED' \
        --key-value 'chfield=resolution' \
        --key-value 'chfieldto=2020-01-17'
rm -rf "$toFilter/cubeb-linux-issues" && cp -r issues "$toFilter/cubeb-linux-issues"
rm -rf "$toFilter/cubeb-macos-issues" && mv issues "$toFilter/cubeb-macos-issues"

# (typo fix: was "prefrences parsing")
echo "preferences parsing"
python3 "$fetchScript" \
        --bugzilla bugzilla.mozilla.org \
        --key-value "$securityLevels" \
        --key-value 'keywords_type=anywords' \
        --key-value 'component=Preferences: Backend' \
        --key-value 'chfieldvalue=FIXED' \
        --key-value 'chfield=resolution' \
        --key-value 'chfieldto=2018-02-01'
rm -rf "$toFilter/prefs-parser-issues" && mv issues "$toFilter/prefs-parser-issues"

echo "rendering (layers)"
python3 "$fetchScript" \
        --bugzilla bugzilla.mozilla.org \
        --key-value "$securityLevels" \
        --key-value 'keywords_type=anywords' \
        --key-value 'component=Graphics: Layers' \
        --key-value 'component=Graphics'
rm -rf "$toFilter/layers-issues" && mv issues "$toFilter/layers-issues"

# (typo fix: was "renering")
echo "rendering (webrender)"
python3 "$fetchScript" \
        --bugzilla bugzilla.mozilla.org \
        --key-value "$securityLevels" \
        --key-value 'keywords_type=anywords' \
        --key-value 'component=Graphics: WebRender'
rm -rf "$issuesDir/webrender-nonrust-issues" && mv issues "$issuesDir/webrender-nonrust-issues"

echo "certificate blocklist"
python3 "$fetchScript" \
        --bugzilla bugzilla.mozilla.org \
        --key-value "$securityLevels" \
        --key-value 'keywords_type=anywords' \
        --key-value 'component= Security: PSM'
# NOTE(review): the leading space in 'component= Security: PSM' is preserved
# from the original query — confirm whether Bugzilla trims it.
rm -rf "$toFilter/cert-blocklist-issues" && mv issues "$toFilter/cert-blocklist-issues"

echo "Japanese encoding detector"
echo "Unicode language identifier"
echo "language negotiation"
echo "encoding detector"
python3 "$fetchScript" \
        --bugzilla bugzilla.mozilla.org \
        --key-value "$securityLevels" \
        --key-value 'keywords_type=anywords' \
        --key-value 'component=Internationalization' \
        --key-value 'chfieldvalue=FIXED' \
        --key-value 'chfield=resolution' \
        --key-value 'chfieldto=2019-12-12'
rm -rf "$toFilter/japanese-encoding-issues" && cp -r issues "$toFilter/japanese-encoding-issues"
rm -rf "$toFilter/language-identifier-issues" && cp -r issues "$toFilter/language-identifier-issues"
rm -rf "$toFilter/language-negotiation-issues" && cp -r issues "$toFilter/language-negotiation-issues"
rm -rf "$toFilter/encoding-detector-issues" && mv issues "$toFilter/encoding-detector-issues"

echo "hyphenation (libhyphen)"
python3 "$fetchScript" \
        --bugzilla bugzilla.mozilla.org \
        --key-value "$securityLevels" \
        --key-value 'keywords_type=anywords' \
        --key-value 'component=Layout: Text and Fonts' \
        --key-value 'chfieldvalue=FIXED' \
        --key-value 'chfield=resolution' \
        --key-value 'chfieldto=2019-11-12'
rm -rf "$toFilter/hyphenation-issues" && mv issues "$toFilter/hyphenation-issues"

echo "color management (qcms)"
python3 "$fetchScript" \
        --bugzilla bugzilla.mozilla.org \
        --key-value "$securityLevels" \
        --key-value 'keywords_type=anywords' \
        --key-value 'component=Graphics: Color Management' \
        --key-value 'chfieldvalue=FIXED' \
        --key-value 'chfield=resolution' \
        --key-value 'chfieldto=2020-09-21'
rm -rf "$issuesDir/qcms-issues" && mv issues "$issuesDir/qcms-issues"

+ 496 - 0
code/fetch_bugzilla_bugs/filter.sh

@@ -0,0 +1,496 @@
+#!/bin/bash
+
+## Filters the result of Bugzilla queries.
+## Intended to be modified directly, args can only specify the paths of the
+## project root and the Firefox clone.
+
+## Most of these projects turn out to have 0 security bugs, so this ends up
+## being a reproducible way to create empty lists.
+
+## To find out if a file was moved in a git history, check the output of:
+# git log --diff-filter=A -- $filepath
+## and
+# git diff-tree --no-commit-id --name-only -r $commitThatAddedFileAbove
+## (these work even if the file has since been removed)
+
codeDir="$1"    # path to this project's code checkout
issuesDir="$2"  # directory holding the fetched issue lists
repoDir="$3"    # path to the Firefox git clone
toFilter="$issuesDir/to_filter/"

# Print the sorted, de-duplicated bug numbers found in each given
# <dir>/res0.json issue list (keys are quoted, '-'-prefixed strings,
# so strip both the quotes and the dash).
getIssues () {
    for name in "$@" ; do
        jsonFile="$name/res0.json"
        jq '.issues | .[] | .key' "$jsonFile" | tr -d '"-'
    done | sort -n | uniq
}
+
+
# only include bugs whose fixes touch at least one file in $2:...
# $1: directory containing res0.json; remaining args: file paths to keep.
# Emits a res0.json-shaped {"issues": [...]} document on stdout holding only
# the issues whose attached patches touch one of the given files.
filter () {
    project="$1"
    echo "filtering $project" >&2
    issues=$(getIssues "$project")
    shift
    matches=$(
        for issue in $issues ; do
            echo "checking bug $issue" >&2
            # files touched by the patches attached to this bug
            affectedFiles=$(python3 "$codeDir/fetch_bugzilla_bugs/get_bugzilla_patches.py" 'files' $issue)
            # NOTE(review): the file name is used as an unanchored grep
            # pattern, so '.' matches any character and substrings match too
            match=$(for filteredFile in $@ ; do echo $affectedFiles | grep "$filteredFile" ; done)
            if [ -n "$match" ] ; then
                echo "$issue"
                continue
            fi
        done)
    # re-emit the matching issue objects as a JSON issue list, inserting
    # commas between entries
    echo '{"issues": ['
    unset comma
    for match in $matches ; do
        if [ -n "$comma" ]; then echo ","; fi
        jq ".issues | map(select(.key == \"-$match\")) | .[]" "$project/res0.json"
        comma=true
    done
    echo ']}'
}
+
# only include bugs whose fixes touch at least one file *not* in the tree as of commit $2
# $1: directory containing res0.json; $2: git revision to compare against.
# NOTE: this checks out $2 in $repoDir as a side effect.
inverse_filter() {
    project="$1"
    echo "(inverse) filtering $project" >&2
    issues=$(getIssues "$project")
    shift
    gitrev="$1"
    git -C "$repoDir" checkout "$gitrev"
    matches=$(
        for issue in $issues ; do
            echo "checking bug $issue" >&2
            affectedFiles=$(python3 "$codeDir/fetch_bugzilla_bugs/get_bugzilla_patches.py" 'files' $issue)
            # keep the bug as soon as one touched file is absent from the
            # checked-out tree
            for affectedFile in $affectedFiles; do
                if [ ! -f "$repoDir/$affectedFile" ]; then
                    echo "$issue"
                    break
                fi
            done
        done)
    # re-emit the matching issue objects as a JSON issue list
    echo '{"issues": ['
    unset comma
    for match in $matches ; do
        if [ -n "$comma" ]; then echo ","; fi
        jq ".issues | map(select(.key == \"-$match\")) | .[]" "$project/res0.json"
        comma=true
    done
    echo ']}'
}
+
+
+# css/style
+mkdir -p "$issuesDir/css-issues"
+filter "$toFilter/css-issues" \
+               'dom/animation/AnimValuesStyleRule.cpp' \
+               'dom/animation/AnimValuesStyleRule.h' \
+               'layout/base/GeckoRestyleManager.cpp' \
+               'layout/base/GeckoRestyleManager.h' \
+               'layout/base/RestyleTracker.cpp' \
+               'layout/style/CSSStyleSheet.cpp' \
+               'layout/style/CSSStyleSheet.h' \
+               'layout/style/CSSVariableDeclarations.cpp' \
+               'layout/style/CSSVariableDeclarations.h' \
+               'layout/style/CSSVariableResolver.cpp' \
+               'layout/style/CSSVariableResolver.h' \
+               'layout/style/CSSVariableValues.cpp' \
+               'layout/style/CSSVariableValues.h' \
+               'layout/style/Declaration.cpp' \
+               'layout/style/Declaration.h' \
+               'layout/style/GeckoStyleContext.cpp' \
+               'layout/style/GeckoStyleContext.h' \
+               'layout/style/ImportRule.h' \
+               'layout/style/IncrementalClearCOMRuleArray.cpp' \
+               'layout/style/IncrementalClearCOMRuleArray.h' \
+               'layout/style/NameSpaceRule.h' \
+               'layout/style/RuleNodeCacheConditions.cpp' \
+               'layout/style/RuleProcessorCache.cpp' \
+               'layout/style/RuleProcessorCache.h' \
+               'layout/style/StyleRule.cpp' \
+               'layout/style/StyleRule.h' \
+               'layout/style/nsCSSDataBlock.cpp' \
+               'layout/style/nsCSSParser.cpp' \
+               'layout/style/nsCSSRuleProcessor.cpp' \
+               'layout/style/nsCSSRuleProcessor.h' \
+               'layout/style/nsCSSRules.cpp' \
+               'layout/style/nsIStyleRule.h' \
+               'layout/style/nsIStyleRuleProcessor.h' \
+               'layout/style/nsMediaList.cpp' \
+               'layout/style/nsMediaList.h' \
+               'layout/style/nsNthIndexCache.cpp' \
+               'layout/style/nsRuleData.cpp' \
+               'layout/style/nsRuleData.h' \
+               'layout/style/nsRuleNode.cpp' \
+               'layout/style/nsRuleNode.h' \
+               'layout/style/nsRuleWalker.h' \
+               'layout/style/nsStyleSet.cpp' \
+               'layout/style/nsStyleSet.h' \
+               'layout/base/RestyleManager.cpp' \
+               'layout/base/RestyleManager.h' \
+               'layout/style/nsCSSStyleSheet.cpp' \
+               'layout/style/nsCSSStyleSheet.h' \
+               'layout/style/nsCSSDeclaration.cpp' \
+               'layout/style/nsCSSDeclaration.h' \
+               'layout/style/nsICSSImportRule.h' \
+               'layout/style/nsICSSNameSpaceRule.h' \
+               'layout/style/nsCSSStyleRule.cpp' \
+               'layout/style/nsICSSStyleRule.h' \
+               'layout/style/nsIMediaList.h' \
+               'layout/base/RestyleManagerBase.cpp' \
+               'layout/style/AnimationCommon.cpp' \
+               > "$issuesDir/css-issues/res0.json"
+
+# layers
+mkdir -p "$issuesDir/layers-issues"
+filter "$toFilter/layers-issues" \
+       'gfx/2d/CaptureCommandList.cpp' \
+       'gfx/2d/CaptureCommandList.h' \
+       'gfx/2d/DrawCommand.h' \
+       'gfx/2d/DrawCommands.h' \
+       'gfx/2d/DrawTargetCapture.cpp' \
+       'gfx/2d/DrawTargetCapture.h' \
+       'gfx/2d/DrawTargetDual.cpp' \
+       'gfx/2d/DrawTargetDual.h' \
+       'gfx/2d/DrawTargetTiled.cpp' \
+       'gfx/2d/DrawTargetTiled.h' \
+       'gfx/2d/DrawTargetWrapAndRecord.cpp' \
+       'gfx/2d/DrawTargetWrapAndRecord.h' \
+       'gfx/2d/FilterNodeCapture.cpp' \
+       'gfx/2d/FilterNodeCapture.h' \
+       'gfx/2d/PathCapture.cpp' \
+       'gfx/2d/PathCapture.h' \
+       'gfx/2d/SourceSurfaceCapture.cpp' \
+       'gfx/2d/SourceSurfaceCapture.h' \
+       'gfx/2d/SourceSurfaceDual.h' \
+       'gfx/gl/SharedSurfaceGLX.cpp' \
+       'gfx/gl/SharedSurfaceGLX.h' \
+       'gfx/layers/apz/public/MetricsSharingController.h' \
+       'gfx/layers/apz/test/gtest/InternalHitTester.cpp' \
+       'gfx/layers/apz/test/gtest/InternalHitTester.h' \
+       'gfx/layers/basic/AutoMaskData.h' \
+       'gfx/layers/basic/BasicCanvasLayer.cpp' \
+       'gfx/layers/basic/BasicCanvasLayer.h' \
+       'gfx/layers/basic/BasicColorLayer.cpp' \
+       'gfx/layers/basic/BasicCompositor.cpp' \
+       'gfx/layers/basic/BasicCompositor.h' \
+       'gfx/layers/basic/BasicContainerLayer.cpp' \
+       'gfx/layers/basic/BasicContainerLayer.h' \
+       'gfx/layers/basic/BasicImageLayer.cpp' \
+       'gfx/layers/basic/BasicImages.cpp' \
+       'gfx/layers/basic/BasicImplData.h' \
+       'gfx/layers/basic/BasicLayerManager.cpp' \
+       'gfx/layers/basic/BasicLayers.h' \
+       'gfx/layers/basic/BasicLayersImpl.cpp' \
+       'gfx/layers/basic/BasicLayersImpl.h' \
+       'gfx/layers/basic/BasicPaintedLayer.cpp' \
+       'gfx/layers/basic/BasicPaintedLayer.h' \
+       'gfx/layers/basic/MacIOSurfaceTextureHostBasic.cpp' \
+       'gfx/layers/basic/MacIOSurfaceTextureHostBasic.h' \
+       'gfx/layers/basic/TextureClientX11.cpp' \
+       'gfx/layers/basic/TextureClientX11.h' \
+       'gfx/layers/basic/TextureHostBasic.cpp' \
+       'gfx/layers/basic/TextureHostBasic.h' \
+       'gfx/layers/basic/X11BasicCompositor.cpp' \
+       'gfx/layers/basic/X11BasicCompositor.h' \
+       'gfx/layers/basic/X11TextureSourceBasic.cpp' \
+       'gfx/layers/basic/X11TextureSourceBasic.h' \
+       'gfx/layers/client/ClientCanvasLayer.cpp' \
+       'gfx/layers/client/ClientCanvasLayer.h' \
+       'gfx/layers/client/ClientCanvasRenderer.cpp' \
+       'gfx/layers/client/ClientCanvasRenderer.h' \
+       'gfx/layers/client/ClientColorLayer.cpp' \
+       'gfx/layers/client/ClientContainerLayer.cpp' \
+       'gfx/layers/client/ClientContainerLayer.h' \
+       'gfx/layers/client/ClientImageLayer.cpp' \
+       'gfx/layers/client/ClientLayerManager.cpp' \
+       'gfx/layers/client/ClientLayerManager.h' \
+       'gfx/layers/client/ClientPaintedLayer.cpp' \
+       'gfx/layers/client/ClientPaintedLayer.h' \
+       'gfx/layers/client/ClientReadbackLayer.h' \
+       'gfx/layers/client/ClientTiledPaintedLayer.cpp' \
+       'gfx/layers/client/ClientTiledPaintedLayer.h' \
+       'gfx/layers/client/ContentClient.cpp' \
+       'gfx/layers/client/ContentClient.h' \
+       'gfx/layers/client/MultiTiledContentClient.cpp' \
+       'gfx/layers/client/MultiTiledContentClient.h' \
+       'gfx/layers/client/SingleTiledContentClient.cpp' \
+       'gfx/layers/client/SingleTiledContentClient.h' \
+       'gfx/layers/client/TiledContentClient.cpp' \
+       'gfx/layers/client/TiledContentClient.h' \
+       'gfx/layers/composite/AsyncCompositionManager.cpp' \
+       'gfx/layers/composite/AsyncCompositionManager.h' \
+       'gfx/layers/composite/CanvasLayerComposite.cpp' \
+       'gfx/layers/composite/CanvasLayerComposite.h' \
+       'gfx/layers/composite/ColorLayerComposite.cpp' \
+       'gfx/layers/composite/ColorLayerComposite.h' \
+       'gfx/layers/composite/ConsolasFontData.h' \
+       'gfx/layers/composite/ContainerLayerComposite.cpp' \
+       'gfx/layers/composite/ContainerLayerComposite.h' \
+       'gfx/layers/composite/ContentHost.cpp' \
+       'gfx/layers/composite/ContentHost.h' \
+       'gfx/layers/composite/Diagnostics.cpp' \
+       'gfx/layers/composite/FPSCounter.cpp' \
+       'gfx/layers/composite/FPSCounter.h' \
+       'gfx/layers/composite/ImageHost.cpp' \
+       'gfx/layers/composite/ImageHost.h' \
+       'gfx/layers/composite/ImageLayerComposite.cpp' \
+       'gfx/layers/composite/ImageLayerComposite.h' \
+       'gfx/layers/composite/LayerManagerComposite.cpp' \
+       'gfx/layers/composite/LayerManagerComposite.h' \
+       'gfx/layers/composite/LayerManagerCompositeUtils.h' \
+       'gfx/layers/composite/PaintCounter.cpp' \
+       'gfx/layers/composite/PaintCounter.h' \
+       'gfx/layers/composite/PaintedLayerComposite.cpp' \
+       'gfx/layers/composite/PaintedLayerComposite.h' \
+       'gfx/layers/composite/TextRenderer.cpp' \
+       'gfx/layers/composite/TextRenderer.h' \
+       'gfx/layers/composite/TiledContentHost.cpp' \
+       'gfx/layers/composite/TiledContentHost.h' \
+       'gfx/layers/composite/X11TextureHost.cpp' \
+       'gfx/layers/composite/X11TextureHost.h' \
+       'gfx/layers/d3d11/BlendingHelpers.hlslh' \
+       'gfx/layers/d3d11/mlgshaders/blend-common.hlsl' \
+       'gfx/layers/d3d11/mlgshaders/blend-ps-generated.hlslh' \
+       'gfx/layers/d3d11/mlgshaders/blend-ps-generated.hlslh.tpl' \
+       'gfx/layers/d3d11/mlgshaders/blend-ps.hlsl' \
+       'gfx/layers/d3d11/mlgshaders/blend-vs.hlsl' \
+       'gfx/layers/d3d11/mlgshaders/clear-common.hlsl' \
+       'gfx/layers/d3d11/mlgshaders/clear-ps.hlsl' \
+       'gfx/layers/d3d11/mlgshaders/clear-vs.hlsl' \
+       'gfx/layers/d3d11/mlgshaders/color-common.hlsl' \
+       'gfx/layers/d3d11/mlgshaders/color-ps.hlsl' \
+       'gfx/layers/d3d11/mlgshaders/color-vs.hlsl' \
+       'gfx/layers/d3d11/mlgshaders/common.hlsl' \
+       'gfx/layers/d3d11/mlgshaders/common-ps.hlsl' \
+       'gfx/layers/d3d11/mlgshaders/common-vs.hlsl' \
+       'gfx/layers/d3d11/mlgshaders/component-alpha-ps.hlsl' \
+       'gfx/layers/d3d11/mlgshaders/diagnostics-common.hlsl' \
+       'gfx/layers/d3d11/mlgshaders/diagnostics-ps.hlsl' \
+       'gfx/layers/d3d11/mlgshaders/diagnostics-vs.hlsl' \
+       'gfx/layers/d3d11/mlgshaders/mask-combiner-common.hlsl' \
+       'gfx/layers/d3d11/mlgshaders/mask-combiner-ps.hlsl' \
+       'gfx/layers/d3d11/mlgshaders/mask-combiner-vs.hlsl' \
+       'gfx/layers/d3d11/mlgshaders/test-features-vs.hlsl' \
+       'gfx/layers/d3d11/mlgshaders/textured-common.hlsl' \
+       'gfx/layers/d3d11/mlgshaders/textured-ps.hlsl' \
+       'gfx/layers/d3d11/mlgshaders/textured-vs.hlsl' \
+       'gfx/layers/d3d11/mlgshaders/ycbcr-ps.hlsl' \
+       'gfx/layers/d3d11/ReadbackManagerD3D11.cpp' \
+       'gfx/layers/d3d11/ReadbackManagerD3D11.h' \
+       'gfx/layers/DirectedGraph.h' \
+       'gfx/layers/ImageLayers.cpp' \
+       'gfx/layers/ImageLayers.h' \
+       'gfx/layers/ipc/LayerTransactionChild.cpp' \
+       'gfx/layers/ipc/LayerTransactionChild.h' \
+       'gfx/layers/ipc/LayerTransactionParent.cpp' \
+       'gfx/layers/ipc/LayerTransactionParent.h' \
+       'gfx/layers/ipc/ShadowLayers.cpp' \
+       'gfx/layers/ipc/ShadowLayers.h' \
+       'gfx/layers/ipc/ShadowLayerUtilsMac.cpp' \
+       'gfx/layers/ipc/ShadowLayerUtilsX11.cpp' \
+       'gfx/layers/ipc/ShadowLayerUtilsX11.h' \
+       'gfx/layers/LayerManager.cpp' \
+       'gfx/layers/LayerMetricsWrapper.h' \
+       'gfx/layers/LayerScope.cpp' \
+       'gfx/layers/LayerScope.h' \
+       'gfx/layers/Layers.cpp' \
+       'gfx/layers/Layers.h' \
+       'gfx/layers/LayersHelpers.cpp' \
+       'gfx/layers/LayersHelpers.h' \
+       'gfx/layers/LayerSorter.cpp' \
+       'gfx/layers/LayerSorter.h' \
+       'gfx/layers/LayerTreeInvalidation.cpp' \
+       'gfx/layers/LayerTreeInvalidation.h' \
+       'gfx/layers/opengl/GLBlitTextureImageHelper.cpp' \
+       'gfx/layers/opengl/GLBlitTextureImageHelper.h' \
+       'gfx/layers/opengl/X11TextureSourceOGL.cpp' \
+       'gfx/layers/opengl/X11TextureSourceOGL.h' \
+       'gfx/layers/PaintThread.cpp' \
+       'gfx/layers/PaintThread.h' \
+       'gfx/layers/protobuf/LayerScopePacket.pb.h' \
+       'gfx/layers/ReadbackProcessor.cpp' \
+       'gfx/layers/ReadbackProcessor.h' \
+       'gfx/layers/RenderTrace.cpp' \
+       'gfx/layers/RenderTrace.h' \
+       'gfx/layers/RotatedBuffer.cpp' \
+       'gfx/layers/RotatedBuffer.h' \
+       'gfx/layers/SourceSurfaceVolatileData.cpp' \
+       'gfx/layers/SourceSurfaceVolatileData.h' \
+       'gfx/layers/TextureDIB.cpp' \
+       'gfx/layers/TextureDIB.h' \
+       'gfx/layers/TiledLayerBuffer.h' \
+       'gfx/src/TiledRegion.cpp' \
+       'gfx/src/TiledRegion.h' \
+       'gfx/tests/gtest/TestCompositor.cpp' \
+       'gfx/tests/gtest/TestLayers.h' \
+       'gfx/tests/gtest/TestTextureCompatibility.cpp' \
+       'gfx/thebes/gfxGdkNativeRenderer.cpp' \
+       'gfx/thebes/gfxGdkNativeRenderer.h' \
+       'gfx/thebes/gfxXlibNativeRenderer.cpp' \
+       'gfx/thebes/gfxXlibNativeRenderer.h' \
+       'layout/painting/FrameLayerBuilder.cpp' \
+       'layout/painting/FrameLayerBuilder.h' \
+       'widget/gtk/WindowSurfaceXRender.cpp' \
+       'widget/gtk/WindowSurfaceXRender.h' \
+       'gfx/2d/DrawTargetRecording.cpp' \
+       'gfx/2d/DrawTargetRecording.h' \
+       'gfx/gl/GLBlitTextureImageHelper.cpp' \
+       'gfx/gl/GLBlitTextureImageHelper.h' \
+       'gfx/layers/basic/BasicCanvasLayer.cpp' \
+       'gfx/layers/basic/BasicCanvasLayer.h' \
+       'gfx/layers/basic/BasicColorLayer.cpp' \
+       'gfx/layers/basic/BasicContainerLayer.cpp' \
+       'gfx/layers/basic/BasicContainerLayer.h' \
+       'gfx/layers/basic/BasicImageLayer.cpp' \
+       'gfx/layers/basic/BasicLayerManager.cpp' \
+       'gfx/layers/basic/BasicLayers.cpp' \
+       'gfx/layers/basic/BasicLayers.h' \
+       'gfx/layers/basic/BasicThebesLayer.cpp' \
+       'gfx/layers/basic/BasicThebesLayer.h' \
+       'gfx/layers/BasicLayers.h' \
+       'gfx/layers/basic/TextureHostX11.cpp' \
+       'gfx/layers/basic/TextureHostX11.h' \
+       'gfx/layers/client/ClientThebesLayer.cpp' \
+       'gfx/layers/client/ClientThebesLayer.h' \
+       'gfx/layers/client/ClientTiledThebesLayer.cpp' \
+       'gfx/layers/client/ClientTiledThebesLayer.h' \
+       'gfx/layers/client/TiledContentClient.cpp' \
+       'gfx/layers/client/TiledContentClient.h' \
+       'gfx/layers/composite/ThebesLayerComposite.cpp' \
+       'gfx/layers/composite/ThebesLayerComposite.h' \
+       'gfx/layers/ipc/CompositorParent.cpp' \
+       'gfx/layers/ipc/CompositorParent.h' \
+       'gfx/layers/ipc/LayerTransactionChild.cpp' \
+       'gfx/layers/ipc/LayerTransactionChild.h' \
+       'gfx/layers/ipc/ShadowLayersChild.cpp' \
+       'gfx/layers/ipc/ShadowLayersChild.h' \
+       'gfx/layers/ipc/ShadowLayersParent.cpp' \
+       'gfx/layers/ipc/ShadowLayersParent.h' \
+       'gfx/layers/opengl/FPSCounter.h' \
+       'gfx/layers/ThebesLayerBuffer.cpp' \
+       'gfx/layers/ThebesLayerBuffer.h' \
+       'gfx/thebes/public/gfxGdkNativeRenderer.h' \
+       'gfx/thebes/public/gfxXlibNativeRenderer.h' \
+       'gfx/thebes/src/gfxGdkNativeRenderer.cpp' \
+       'gfx/thebes/src/gfxXlibNativeRenderer.cpp' \
+       'layout/base/FrameLayerBuilder.cpp' \
+       'layout/base/FrameLayerBuilder.h' \
+       'gfx/layers/d3d11/MLGDeviceD3D11.cpp' \
+       'gfx/layers/d3d11/MLGDeviceD3D11.h' \
+       'gfx/layers/mlgpu/BufferCache.cpp' \
+       'gfx/layers/mlgpu/BufferCache.h' \
+       'gfx/layers/mlgpu/CanvasLayerMLGPU.cpp' \
+       'gfx/layers/mlgpu/CanvasLayerMLGPU.h' \
+       'gfx/layers/mlgpu/ClearRegionHelper.h' \
+       'gfx/layers/mlgpu/ContainerLayerMLGPU.cpp' \
+       'gfx/layers/mlgpu/ContainerLayerMLGPU.h' \
+       'gfx/layers/mlgpu/FrameBuilder.cpp' \
+       'gfx/layers/mlgpu/FrameBuilder.h' \
+       'gfx/layers/mlgpu/ImageLayerMLGPU.cpp' \
+       'gfx/layers/mlgpu/ImageLayerMLGPU.h' \
+       'gfx/layers/mlgpu/LayerManagerMLGPU.cpp' \
+       'gfx/layers/mlgpu/LayerManagerMLGPU.h' \
+       'gfx/layers/mlgpu/LayerMLGPU.cpp' \
+       'gfx/layers/mlgpu/LayerMLGPU.h' \
+       'gfx/layers/mlgpu/MaskOperation.cpp' \
+       'gfx/layers/mlgpu/MaskOperation.h' \
+       'gfx/layers/mlgpu/MemoryReportingMLGPU.cpp' \
+       'gfx/layers/mlgpu/MemoryReportingMLGPU.h' \
+       'gfx/layers/mlgpu/MLGDevice.cpp' \
+       'gfx/layers/mlgpu/MLGDevice.h' \
+       'gfx/layers/mlgpu/MLGDeviceTypes.h' \
+       'gfx/layers/mlgpu/MLGPUScreenshotGrabber.cpp' \
+       'gfx/layers/mlgpu/MLGPUScreenshotGrabber.h' \
+       'gfx/layers/mlgpu/PaintedLayerMLGPU.cpp' \
+       'gfx/layers/mlgpu/PaintedLayerMLGPU.h' \
+       'gfx/layers/mlgpu/RenderPassMLGPU.cpp' \
+       'gfx/layers/mlgpu/RenderPassMLGPU.h' \
+       'gfx/layers/mlgpu/RenderPassMLGPU-inl.h' \
+       'gfx/layers/mlgpu/RenderViewMLGPU.cpp' \
+       'gfx/layers/mlgpu/RenderViewMLGPU.h' \
+       'gfx/layers/mlgpu/ShaderDefinitionsMLGPU.h' \
+       'gfx/layers/mlgpu/ShaderDefinitionsMLGPU-inl.h' \
+       'gfx/layers/mlgpu/SharedBufferMLGPU.cpp' \
+       'gfx/layers/mlgpu/SharedBufferMLGPU.h' \
+       'gfx/layers/mlgpu/StagingBuffer.cpp' \
+       'gfx/layers/mlgpu/StagingBuffer.h' \
+       'gfx/layers/mlgpu/TexturedLayerMLGPU.cpp' \
+       'gfx/layers/mlgpu/TexturedLayerMLGPU.h' \
+       'gfx/layers/mlgpu/TextureSourceProviderMLGPU.cpp' \
+       'gfx/layers/mlgpu/TextureSourceProviderMLGPU.h' \
+       'gfx/layers/mlgpu/UtilityMLGPU.h' \
+       'gfx/layers/LayerAttributes.h' \
+       > "$issuesDir/layers-issues/res0.json"
+
+
+# cubeb-linux
+mkdir -p "$issuesDir/cubeb-linux-issues"
+filter "$toFilter/cubeb-linux-issues" \
+       'media/libcubeb/src/cubeb_pulse.c' \
+       > "$issuesDir/cubeb-linux-issues/res0.json"
+
+# cubeb-macos
+mkdir -p "$issuesDir/cubeb-macos-issues"
+filter "$toFilter/cubeb-macos-issues" \
+       'media/libcubeb/src/cubeb_audiounit.c' \
+       'media/libcubeb/src/cubeb_audiounit.cpp' \
+       > "$issuesDir/cubeb-macos-issues/res0.json"
+
+# prefs-parser
+mkdir -p "$issuesDir/prefs-parser-issues"
+filter "$toFilter/prefs-parser-issues" \
+       'modules/libpref/src/nsPrefService.cpp' \
+       'modules/libpref/src/Preferences.cpp' \
+       'modules/libpref/Preferences.cpp' \
+       > "$issuesDir/prefs-parser-issues/res0.json"
+
+# cert-blocklist
+mkdir -p "$issuesDir/cert-blocklist-issues"
+filter "$toFilter/cert-blocklist-issues" \
+       'security/manager/boot/src/CertBlocklist.cpp' \
+       'security/manager/ssl/CertBlocklist.cpp' \
+       > "$issuesDir/cert-blocklist-issues/res0.json"
+
+# japanese-encoding
+mkdir -p "$issuesDir/japanese-encoding-issues"
+filter "$toFilter/japanese-encoding-issues" \
+       'extensions/universalchardet/src/base/CharDistribution.cpp' \
+       'extensions/universalchardet/src/base/JpCntx.cpp' \
+       'extensions/universalchardet/src/base/nsCharSetProber.cpp' \
+       'extensions/universalchardet/src/base/nsEUCJPProber.cpp' \
+       'extensions/universalchardet/src/base/nsEscCharsetProber.cpp' \
+       'extensions/universalchardet/src/base/nsEscSM.cpp' \
+       'extensions/universalchardet/src/base/nsMBCSGroupProber.cpp' \
+       'extensions/universalchardet/src/base/nsMBCSSM.cpp' \
+       'extensions/universalchardet/src/base/nsSJISProber.cpp' \
+       'extensions/universalchardet/src/base/nsUTF8Prober.cpp' \
+       'extensions/universalchardet/src/base/nsUniversalDetector.cpp' \
+       'extensions/universalchardet/src/xpcom/nsUdetXPCOMWrapper.cpp' \
+       > "$issuesDir/japanese-encoding-issues/res0.json"
+
+# language-identifier
+mkdir -p "$issuesDir/language-identifier-issues"
+filter "$toFilter/language-identifier-issues" \
+       'intl/locale/MozLocale.cpp' \
+       > "$issuesDir/language-identifier-issues/res0.json"
+
+# language-negotiation
+mkdir -p "$issuesDir/language-negotiation-issues"
+filter "$toFilter/language-negotiation-issues" \
+       'intl/locale/LocaleService.cpp' \
+       > "$issuesDir/language-negotiation-issues/res0.json"
+
+# encoding-detector
+mkdir -p "$issuesDir/encoding-detector-issues"
+filter "$toFilter/encoding-detector-issues" \
+       'intl/chardet/' \
+       > "$issuesDir/encoding-detector-issues/res0.json"
+
+# hyphenation
+mkdir -p "$issuesDir/hyphenation-issues"
+filter "$toFilter/hyphenation-issues" \
+       'intl/hyphenation/' \
+       > "$issuesDir/hyphenation-issues/res0.json"

+ 139 - 0
code/fetch_bugzilla_bugs/find_bugzilla_fixes.py

@@ -0,0 +1,139 @@
+""" Identify bugfixes in Bugzilla repository given a list of issues """
+__author__ = "Justin Tracey and Kristian Berg"
+__copyright__ = "Copyright (c) 2018 Axis Communications AB"
+__license__ = "MIT"
+
+import os
+import json
+import argparse
+import subprocess
+import datetime
+from get_bugzilla_patches import get_title_lines
+
+
+class Commit:
+    """A single git commit: repository location, hash, and author date."""
+
+    def __init__(self, git_path=None, git_hash=None, author_date=None):
+        self.git_path = git_path        # path to the local git repository
+        self.git_hash = git_hash        # full commit hash (%H)
+        self.author_date = author_date  # author date string (--date=iso)
+
+    def files(self):
+        # Return the paths touched by this commit, one per line,
+        # as reported by `git diff-tree --name-only -r`.
+        return subprocess.check_output(['git', '-C', self.git_path,
+                                        'diff-tree', '--no-commit-id',
+                                        '--name-only', '-r', self.git_hash],
+                                       universal_newlines=True)
+
+
+def find_bug_fixes(issue_path, git_path):
+    """ Identify bugfixes in Bugzilla repository given a list of issues
+
+    For each issue in `issue_path`, search the git log of `git_path` for
+    commits whose message contains the title line of one of the bug's
+    attached patches, then select a single fix commit via
+    commit_selector_heuristic().  Issues with no surviving match are
+    removed; the rest get 'hash' and 'commitdate' fields filled in.
+    """
+
+    progress = 0
+    no_matches = []          # issue keys with no usable fix commit
+    matches_per_issue = {}   # issue key -> number of candidate commits
+    total_matches = 0
+
+    issue_list = build_issue_list(issue_path)
+
+    for key in issue_list:
+        # Bugzilla bug number is the part after '-' in the issue key.
+        nbr = key.split('-')[1]
+        matches = []
+
+        # The first line of each attached patch serves as a search pattern.
+        patterns = list(get_title_lines(nbr))
+
+        for pattern in patterns:
+            # -F: treat the pattern as a fixed string, not a regex.
+            commits = subprocess.check_output(['git', '-C', git_path, 'log',
+                                               '--date=iso',
+                                               '--format=format:%H|%ad',
+                                               '--grep={}'.format(pattern),
+                                               '-F'],
+                                              universal_newlines=True).strip()
+            for commit in commits.splitlines():
+                if commit:
+                    # Each line is "<hash>|<author date>".
+                    commit = Commit(git_path, *(commit.split('|')))
+                    matches.append(commit)
+        total_matches += len(matches)
+        matches_per_issue[key] = len(matches)
+
+        if matches:
+            selected_commit = commit_selector_heuristic(matches)
+            # The heuristic may reject every candidate
+            # (e.g. none of them touch a C/C++ file).
+            if not selected_commit:
+                no_matches.append(key)
+            else:
+                issue_list[key]['hash'] = selected_commit.git_hash
+                issue_list[key]['commitdate'] = selected_commit.author_date
+        else:
+            no_matches.append(key)
+
+        progress += 1
+        if progress % 10 == 0:
+            print(progress, end='\r')
+
+    print('Total issues: ' + str(len(issue_list)))
+    print('Issues matched to a bugfix: ' +
+          str(len(issue_list) - len(no_matches)))
+    # NOTE(review): raises ZeroDivisionError if the issue directory is empty.
+    print('Percent of issues matched to a bugfix: ' +
+          str((len(issue_list) - len(no_matches)) / len(issue_list)))
+    for key in no_matches:
+        issue_list.pop(key)
+
+    return issue_list
+
+
+def build_issue_list(path):
+    """ Helper method for find_bug_fixes
+
+    Read every JSON file in `path` and return a dict mapping each issue
+    key to its creation and resolution dates.  Timestamps are normalized
+    by replacing the 'T' separator and the '.000' millisecond suffix.
+    """
+    issue_list = {}
+    for filename in os.listdir(path):
+        with open(path + '/' + filename) as f:
+            for issue in json.loads(f.read())['issues']:
+                issue_list[issue['key']] = {}
+
+                created_date = issue['fields']['created'].replace('T', ' ')
+                created_date = created_date.replace('.000', ' ')
+                issue_list[issue['key']]['creationdate'] = created_date
+
+                res_date = issue['fields']['resolutiondate'].replace('T', ' ')
+                res_date = res_date.replace('.000', ' ')
+                issue_list[issue['key']]['resolutiondate'] = res_date
+    return issue_list
+
+
+suffixes = ["c", "C", "cc", "cpp", "cxx", "c++",
+            "h", ".H", "hh", "hpp", "hxx", "h++"]
+
+
+def commit_selector_heuristic(commits):
+    """ SZZUnleashed only allows one fix commit per issue.
+        We follow its norm of using the most recent associated commit.
+        We also filter on commits touching C/C++ files.
+    """
+    def touches_c_file(commit):
+        return any(filename for filename in commit.files().splitlines()
+                   if filename.split('.')[-1] in suffixes)
+    commits = [c for c in commits if touches_c_file(c)]
+    # the weird string manipulation is to fix timezones formatted as +0000
+    # (that git produces) to +00:00 (that python wants)
+    return min(commits, key=lambda x:
+               datetime.datetime.fromisoformat(x.author_date[:-2] + ':' +
+                                               x.author_date[-2:]),
+               default=None)
+
+
+def main():
+    """ Main method: parse arguments, match issues to fix commits,
+        and write the result to issue_list.json in the current directory.
+    """
+    parser = argparse.ArgumentParser(
+        description="Identify bugfixes. Use this script together with a git "
+        "repo and a path with issues. The issue directory is created and "
+        "populated using the fetch-bugzilla.py script.")
+    parser.add_argument('--git-path', type=str,
+                        help='Path to local git repository')
+    parser.add_argument('--issue-list', type=str,
+                        help='Path to directory containing issue json files')
+    args = parser.parse_args()
+
+    issue_list = find_bug_fixes(args.issue_list, args.git_path)
+    with open('issue_list.json', 'w') as f:
+        f.write(json.dumps(issue_list))
+
+
+if __name__ == '__main__':
+    main()

+ 24 - 0
code/fetch_bugzilla_bugs/get-approval-request-comment.py

@@ -0,0 +1,24 @@
+from urllib3 import PoolManager
+import json
+import sys
+
+
+if __name__ == '__main__':
+    # Usage: get-approval-request-comment.py BUGZILLA_ID
+    # Scans all comments of the given bug for lines mentioning "[Feature"
+    # (presumably the "[Feature/Bug causing the regression]" field of an
+    # uplift approval request -- inferred from the filename; confirm) and
+    # prints the bug IDs found on those lines as a JSON list.
+    bugzillaID = sys.argv[1]
+    bugzilla_project_name = 'bugzilla.mozilla.org'
+    request = 'https://{}/rest/bug/{}/comment'.format(bugzilla_project_name,
+                                                      bugzillaID)
+    introducerIDs = set()
+    res = PoolManager().request('GET', request, retries=10)
+    j = json.loads(res.data)
+    for comment in j['bugs'][bugzillaID]['comments']:
+        for line in comment['text'].split('\n'):
+            if '[Feature' in line:
+                # Treat common separators as spaces, then keep numeric
+                # tokens; "> 1000" is a heuristic to drop small numbers
+                # that are unlikely to be real bug IDs.
+                introducerIDs |= {word for word in line.
+                                  replace(',', ' ').
+                                  replace(';', ' ').
+                                  replace(':', ' ').
+                                  replace('#', ' ').split()
+                                  if word.isdigit() and int(word) > 1000}
+    if len(introducerIDs) > 0:
+        print(json.dumps(list(introducerIDs)))

+ 79 - 0
code/fetch_bugzilla_bugs/get_bugzilla_patches.py

@@ -0,0 +1,79 @@
+from urllib3 import PoolManager
+import json
+import sys
+import re
+
+
+def get_patches(bugzillaID):
+    """Yield the raw text of each patch that fixed the given bug.
+
+    Looks up when the bug was marked FIXED in its history, then scans the
+    comments posted at exactly those times for hg.mozilla.org revision
+    URLs and downloads each revision as a raw patch.
+    """
+    http = PoolManager()
+
+    # first find when the bug was fixed
+    history = 'https://bugzilla.mozilla.org/rest/bug/{}/history'.format(
+        bugzillaID)
+    request = http.request('GET', history, retries=10)
+    j = json.loads(request.data)
+    # Timestamps of every history entry where some change added "FIXED".
+    times = [change['when'] for change in j['bugs'][0]['history']
+             if "FIXED" in [c["added"] for c in change["changes"]]]
+
+    # then look through all the fix comments for patch URLs
+    comments = 'https://bugzilla.mozilla.org/rest/bug/{}/comment'.format(
+        bugzillaID)
+    request = http.request('GET', comments, retries=10)
+    j = json.loads(request.data)
+    # Matches mozilla-central and release-branch revision URLs.
+    urlregex = r'https?://hg\.mozilla\.org/(?:mozilla-central|releases/[^/]+)'\
+        '/rev/[0-9a-f]+'
+    # A "fix comment" is one posted at the same instant the bug was FIXED.
+    fix_comments = [comment for comment in j['bugs'][bugzillaID]['comments']
+                    if comment['creation_time'] in times]
+    for comment in fix_comments:
+        for urlmatch in re.findall(urlregex, comment['text']):
+            # /raw-rev/ serves the plain-text patch for a revision.
+            patch_url = urlmatch.replace('/rev/', '/raw-rev/')
+            yield http.request('GET', patch_url, retries=10).data.decode(
+                errors='ignore')
+
+
+def get_title_lines(bugzillaID):
+    """Yield the title (first message line) of each fix patch for a bug."""
+    for patch in get_patches(bugzillaID):
+        for line in patch.split('\n'):
+            line = line.strip()
+            # Patch header lines start with '#'; the first non-empty,
+            # non-header line is the commit message's first line.
+            if len(line) != 0 and line[0] != '#':
+                yield line
+                break
+
+
+def get_affected_files(bugzillaID):
+    """Yield the paths touched by each fix patch for a bug.
+
+    Paths are taken from the '--- a/<path>' and '+++ b/<path>' diff
+    headers, so each changed file is normally yielded twice.
+    """
+    for patch in get_patches(bugzillaID):
+        for line in patch.split('\n'):
+            if len(line) > 5 and (
+                    line[:5] == '--- a' or
+                    line[:5] == '+++ b'):
+                # Strip the 6-character '--- a/' / '+++ b/' prefix.
+                yield line[6:]
+
+
+def get_deleted_files(bugzillaID):
+    """Yield the paths of files deleted by each fix patch for a bug."""
+    fetch_next = False
+    for patch in get_patches(bugzillaID):
+        for line in patch.split('\n'):
+            # Assumes a 'deleted file mode' header is immediately followed
+            # by the '--- a/<path>' line (true for hg's git-style diffs,
+            # which lack git's 'index' line -- TODO confirm); the assert
+            # below enforces this.
+            if len(line) >= 17 and \
+               line[:17] == "deleted file mode" and \
+               not fetch_next:
+                fetch_next = True
+            elif fetch_next:
+                assert(line[:5] == '--- a')
+                # Strip the 6-character '--- a/' prefix.
+                yield line[6:]
+                fetch_next = False
+
+
+if __name__ == '__main__':
+    # Usage: get_bugzilla_patches.py {titles|files|deleted} BUGZILLA_ID
+    # Prints one item per line from the chosen generator.
+    command = sys.argv[1]
+    bugzillaID = sys.argv[2]
+    if command == 'titles':
+        line_generator = get_title_lines(bugzillaID)
+    elif command == 'files':
+        line_generator = get_affected_files(bugzillaID)
+    elif command == 'deleted':
+        line_generator = get_deleted_files(bugzillaID)
+    else:
+        print("Bad command: " + command)
+        sys.exit(1)
+    for item in line_generator:
+        print(item)

+ 5 - 6
code/fetch_jira_bugs/git_log_to_array.py

@@ -6,30 +6,30 @@ __credits__ = ["Kristian Berg", "Oscar Svensson"]
 
 import argparse
 import subprocess
-import sys
 import json
 
+
 def git_log_to_json(init_hash, path_to_repo):
     hashes = subprocess.run(['git', 'rev-list', init_hash], cwd=path_to_repo,
-        stdout=subprocess.PIPE).stdout.decode('ascii').split()
+                            stdout=subprocess.PIPE).stdout.decode('ascii').split()
 
     logs = []
     i = 0
     for hash in hashes:
         entry = subprocess.run(['git', 'show', '--quiet', '--date=iso', hash],
-            cwd=path_to_repo, stdout=subprocess.PIPE)\
+                               cwd=path_to_repo, stdout=subprocess.PIPE)\
             .stdout.decode(errors='replace')
         logs.append(entry)
         i += 1
         if i % 10 == 0:
-            print(i, end='\r')
+            print("{} / {}".format(i, len(hashes)), end='\r')
 
     with open('gitlog.json', 'w') as f:
         f.write(json.dumps(logs))
 
+
 # Commits are saved in reverse chronological order from newest to oldest
 if __name__ == '__main__':
-
     parser = argparse.ArgumentParser(description="""Convert a git log output to json.
                                                  """)
     parser.add_argument('--from-commit', type=str,
@@ -41,4 +41,3 @@ if __name__ == '__main__':
     path_to_repo = args.repo_path
     init_hash = args.from_commit
     git_log_to_json(init_hash, path_to_repo)
-

+ 25 - 0
code/learning_curves/genExp.sh

@@ -0,0 +1,25 @@
+#!/bin/bash
+
+# Build per-author "experience" files for a git repository.
+# For every non-merge commit reachable from $recentCommit, in author-time
+# order (oldest first), append "<hash>,<running commit count>" to a file
+# named after the author in $experienceDir.
+#
+# Usage: genExp.sh GIT_DIR RECENT_COMMIT EXPERIENCE_DIR
+gitDir="$1"
+recentCommit="$2"
+experienceDir="$3"
+# Refuse to run into a non-empty output directory.
+if [ ! -z "$(ls -A "$experienceDir")" ] ; then
+   echo "experience dir is not empty: $experienceDir"
+   exit 1
+fi
+count=0
+echo "counting commits..."
+maxCount=$(git -C "$gitDir" log --full-history --no-merges --use-mailmap --format='format:' "$recentCommit" | wc -l)
+# Emit "<timestamp>\t<hash>\t<author>", sort chronologically, drop the
+# timestamp; the trailing `echo` terminates the last line for `read`.
+# NOTE(review): `read` without -r mangles backslashes in author names.
+(git -C "$gitDir" log --full-history --reverse --no-merges --use-mailmap --format='format:%ct	%H	%aN <%aE>' "$recentCommit" | sort -n | cut -f2,3; echo) | while IFS= read commit
+do
+    echo -ne "commit $count / $maxCount\r"
+    #echo "commit: $commit"
+    # '/' would break the output filename; replace it with '_'.
+    author=$(echo "$commit" | cut -f2 | tr '/' '_')
+    #echo "author: $author"
+    commit_hash=$(echo "$commit" | cut -f1)
+    if [ ! -z "$author" ] ; then
+        echo "$commit_hash,$count" >> "$experienceDir/$author"
+    fi
+    count=$(($count + 1))
+done
+echo

+ 25 - 0
code/learning_curves/gen_learning_curve_table.sh

@@ -0,0 +1,25 @@
+#!/bin/bash
+
+# For each bug listed in a res0.json issue file, look up its
+# bug-introducing commits (VCCs) in $bugIntroDir and print
+# "<vcc>,<bug files containing that vcc>" as CSV.
+#
+# $1 is issues/${foo}/res0.json
+# $2 is $bugIntroDir
+# $3 is the firefox repo dir
+
+# Issue keys with '-' removed match the per-bug filenames in $bugIntroDir.
+bugs=$(jq -r '.issues | map(.key) | .[]' "$1" | tr -d '-')
+cd "$2"
+for bug in $bugs ; do
+    #echo $bug
+    if [ -f "$bug" ] ; then
+        while read vcc ; do
+            if [ -z "$vcc" ]; then
+                echo "ERROR: empty introducer file: $bug"
+                echo "(continuing best we can, but you need to fix a bug somewhere)"
+                continue
+            fi
+            # Skip VCCs authored before 2012-04-09
+            # (--since-as-filter requires git >= 2.36).
+            if [ -z "$(git  -C "$3" log --since-as-filter=2012-04-09 -1 "$vcc")" ]; then
+                continue
+            fi
+            # All bug files in $bugIntroDir that list this VCC.
+            # NOTE(review): $vcc is unquoted in the grep pattern.
+            contribto=$(grep -l $vcc *)
+            echo "$vcc",$contribto | tr ' ' ,
+        done < $bug
+    fi
+done

+ 183 - 0
code/learning_curves/grid_search.sh

@@ -0,0 +1,183 @@
+#!/bin/bash
+
+CODE_DIR="$1"
+DATA_DIR="$2"
+ANNOTATED_DIR="$3"
+REPO_DIR="$4"
+EXP_DIR="$5"
+PLOT_DIR="$6"
+RUST_PROJECTS="$7"
+GRAD_DESCENT_DIR="$DATA_DIR/gradient_descent/"
+mkdir -p "$GRAD_DESCENT_DIR"
+
+# assumes:
+# "$ANNOTATED_DIR/rust-blame.csv"
+# "$ANNOTATED_DIR/relevant-dirs.csv"
+# "$REPO_DIR/firefox"
+# "$EXP_DIR/firefox"
+# "$REPO_DIR/{$RUST_PROJECTS}"
+# "$EXP_DIR/{$RUST_PROJECTS}"
+
+t1_guesses="0.000001:0.00001:0.0001:0.001:0.01:0.1:0.2:0.4:0.8:0.99"
+l_guesses_1="0.0:0.1:0.2:0.3:0.4:0.5"
+l_guesses_2="-0.1:-0.2:-0.3:-0.4:-0.5"
+
+rust_repos=$(for project in $RUST_PROJECTS ; do
+                 echo -n "$REPO_DIR/$project/:"
+             done | head -c -1)
+rust_exps=$(for project in $RUST_PROJECTS ; do
+                echo -n "$EXP_DIR/$project/:"
+            done | head -c -1)
+rust_relevant=$(for project in $RUST_PROJECTS ; do
+                    echo -n "$(grep "^$project," "$ANNOTATED_DIR/relevant-dirs.csv" | cut -d, -f2):"
+                done | head -c -1)
+# mozilla is kind enough to have no files with spaces in names in gecko-dev
+c_relevant="$(tr '\n' ' ' <"$ANNOTATED_DIR/relevant-c++")"
+
+echo "running gradient descent (this will take a while)..."
+
+# Run four gradient-descent jobs as background subshells: for each
+# language, one job covers the non-negative l guesses and one the
+# negative ones.  Each writes its half of the grid-search results to a
+# separate file, merged below.
+(
+    python3 "$CODE_DIR/learning_curves/learningCurve_gradientDescent.py" \
+            "$ANNOTATED_DIR/rust-blame.csv" \
+            "$rust_repos" \
+            "$rust_relevant" \
+            "$rust_exps" \
+            "$t1_guesses" \
+            "$l_guesses_1" > "$GRAD_DESCENT_DIR/grid_search.rust.1.cuml.txt" &&
+        echo "thread1 rust complete"
+) &
+
+(
+    python3 "$CODE_DIR/learning_curves/learningCurve_gradientDescent.py" \
+            "$ANNOTATED_DIR/rust-blame.csv" \
+            "$rust_repos" \
+            "$rust_relevant" \
+            "$rust_exps" \
+            "$t1_guesses" \
+            "$l_guesses_2" > "$GRAD_DESCENT_DIR/grid_search.rust.2.cuml.txt" &&
+        echo "thread2 rust complete"
+) &
+
+# With fewer than 4 cores, finish the Rust jobs before starting the C++ ones.
+if [ $(nproc) -lt 4 ] ; then wait ; fi
+
+(
+    python3 "$CODE_DIR/learning_curves/learningCurve_gradientDescent.py" \
+            "$ANNOTATED_DIR/c++-blame.csv" \
+            "$REPO_DIR/firefox/" \
+            "$c_relevant" \
+            "$EXP_DIR/firefox/" \
+            "$t1_guesses" \
+            "$l_guesses_1" > "$GRAD_DESCENT_DIR/grid_search.c.1.cuml.txt" &&
+        echo "thread1 c complete"
+) &
+
+(
+    python3 "$CODE_DIR/learning_curves/learningCurve_gradientDescent.py" \
+            "$ANNOTATED_DIR/c++-blame.csv" \
+            "$REPO_DIR/firefox/" \
+            "$c_relevant" \
+            "$EXP_DIR/firefox/" \
+            "$t1_guesses" \
+            "$l_guesses_2" > "$GRAD_DESCENT_DIR/grid_search.c.2.cuml.txt" &&
+        echo "thread2 c complete"
+) &
+
+wait
+echo "processing complete"
+
+echo
+# Merge the two halves of each language's results and drop the partials.
+cat "$GRAD_DESCENT_DIR/grid_search.rust.1.cuml.txt" "$GRAD_DESCENT_DIR/grid_search.rust.2.cuml.txt" > "$GRAD_DESCENT_DIR/grid_search.rust.cuml.txt"
+cat "$GRAD_DESCENT_DIR/grid_search.c.1.cuml.txt" "$GRAD_DESCENT_DIR/grid_search.c.2.cuml.txt" > "$GRAD_DESCENT_DIR/grid_search.c.cuml.txt"
+rm "$GRAD_DESCENT_DIR/grid_search.rust.1.cuml.txt" "$GRAD_DESCENT_DIR/grid_search.rust.2.cuml.txt" "$GRAD_DESCENT_DIR/grid_search.c.1.cuml.txt" "$GRAD_DESCENT_DIR/grid_search.c.2.cuml.txt"
+
+# get_t1_l BIAS FILE: among result rows whose first field equals BIAS,
+# pick the one with the smallest 8th field (presumably the fit error --
+# confirm against learningCurve_gradientDescent.py's output format) and
+# print its T1 (field 4) and l (field 5).
+function get_t1_l() {
+    bias="$1"
+    file="$2"
+    best=$(grep "^$bias" "$file" | sort -nk 8 | head -1)
+    t1=$(echo $best | cut -f4 -d' ')
+    l=$(echo $best | cut -f5 -d' ')
+    echo $t1 $l
+}
+
+c_t1_l="$(get_t1_l 0 "$GRAD_DESCENT_DIR/grid_search.c.cuml.txt")"
+c_t1_l_err_low="$(get_t1_l -1 "$GRAD_DESCENT_DIR/grid_search.c.cuml.txt")"
+c_t1_l_err_up="$(get_t1_l 1 "$GRAD_DESCENT_DIR/grid_search.c.cuml.txt")"
+
+c_t1="$(echo $c_t1_l | cut -f1 -d' ')"
+c_l="$(echo $c_t1_l | cut -f2 -d' ')"
+c_t1_err_low="$(echo $c_t1_l_err_low | cut -f1 -d' ')"
+c_l_err_low="$(echo $c_t1_l_err_low | cut -f2 -d' ')"
+c_t1_err_up="$(echo $c_t1_l_err_up | cut -f1 -d' ')"
+c_l_err_up="$(echo $c_t1_l_err_up | cut -f2 -d' ')"
+
+echo "C++ T1: $c_t1 ($c_t1_err_low, $c_t1_err_up)"
+echo "C++ l: $c_l ($c_l_err_low, $c_l_err_up)"
+
+rust_t1_l="$(get_t1_l 0 "$GRAD_DESCENT_DIR/grid_search.rust.cuml.txt")"
+rust_t1_l_err_low="$(get_t1_l -1 "$GRAD_DESCENT_DIR/grid_search.rust.cuml.txt")"
+rust_t1_l_err_up="$(get_t1_l 1 "$GRAD_DESCENT_DIR/grid_search.rust.cuml.txt")"
+
+rust_t1="$(echo $rust_t1_l | cut -f1 -d' ')"
+rust_l="$(echo $rust_t1_l | cut -f2 -d' ')"
+rust_t1_err_low="$(echo $rust_t1_l_err_low | cut -f1 -d' ')"
+rust_l_err_low="$(echo $rust_t1_l_err_low | cut -f2 -d' ')"
+rust_t1_err_up="$(echo $rust_t1_l_err_up | cut -f1 -d' ')"
+rust_l_err_up="$(echo $rust_t1_l_err_up | cut -f2 -d' ')"
+
+echo "Rust T1: $rust_t1 ($rust_t1_err_low, $rust_t1_err_up)"
+echo "Rust l: $rust_l ($rust_l_err_low, $rust_l_err_up)"
+
+echo
+echo "plotting data..."
+python3 "$CODE_DIR/learning_curves/model-vs-real.py" \
+        "$ANNOTATED_DIR/c++-blame.csv" \
+        "$REPO_DIR/firefox/" \
+        "$c_relevant" \
+        "$EXP_DIR/firefox/" \
+        "$PLOT_DIR/C++.pdf" \
+        $c_t1 $c_l $c_t1_err_low $c_l_err_low $c_t1_err_up $c_l_err_up &
+
+python3 "$CODE_DIR/learning_curves/model-vs-real.py" \
+        "$ANNOTATED_DIR/rust-blame.csv" \
+        "$rust_repos" \
+        "$rust_relevant" \
+        "$rust_exps" \
+        "$PLOT_DIR/Rust.pdf" \
+        $rust_t1 $rust_l $rust_t1_err_low $rust_l_err_low \
+        $rust_t1_err_up $rust_l_err_up &
+
+cpp_tex='C\nolinebreak\hspace{-.05em}\raisebox{.4ex}{\relsize{-3}{\textbf{+}}}\nolinebreak\hspace{-.10em}\raisebox{.4ex}{\relsize{-3}{\textbf{+}}}\xspace'
+
+python3 "$CODE_DIR/learning_curves/model-vs-model.py" \
+        "$PLOT_DIR/C++-vs-Rust.pdf" \
+        "$cpp_tex" \
+        "$c_t1" "$c_l" "$c_t1_err_low" "$c_l_err_low" \
+        "$c_t1_err_up" "$c_l_err_up" \
+        "Rust" "$rust_t1" "$rust_l" "$rust_t1_err_low" "$rust_l_err_low" \
+        "$rust_t1_err_up" "$rust_l_err_up" \
+        200
+
+python3 "$CODE_DIR/learning_curves/plot-experience.py" \
+        "$REPO_DIR/firefox/" \
+        "$c_relevant" \
+        "$EXP_DIR/firefox/" \
+        "$cpp_tex" \
+        "$rust_repos" \
+        "$rust_relevant" \
+        "$rust_exps" \
+        "Rust" \
+        "$PLOT_DIR/experience.frac.pdf" \
+        "$PLOT_DIR/experience.tot.pdf"
+
+wait
+echo "plotting complete"
+
+echo "calculating number of 0-exp. VCCs to flip sign of Rust l..."
+python3 "$CODE_DIR/learning_curves/learningCurve_gradientDescent.py" \
+        "$ANNOTATED_DIR/rust-blame.csv" \
+        "$rust_repos" \
+        "$rust_relevant" \
+        "$rust_exps" \
+        "$t1_guesses" \
+        "-0.3:-0.2:-0.1:0:0.1:0.2:0.3" \
+        "search"

+ 172 - 0
code/learning_curves/learningCurve.py

@@ -0,0 +1,172 @@
+#!/usr/bin/env python3
+
+import sys
+import numpy as np
+import scipy.stats as sp
+import subprocess
+import matplotlib.pyplot as plt
+
+
class Commit:
    """A commit's hash, author string, VCC flag, and (lazy) experience."""

    def __init__(self, commit_hash, author, vcc=None):
        self.commit_hash = commit_hash
        self.author = author
        self.is_vcc = vcc  # True/False when known; None when undetermined
        self.xp = None  # author's prior-commit count; computed on demand

    def get_experience(self, commits, exp_dir):
        """Return this commit's 0-based index in its author's history.

        Reads the per-author file "<exp_dir><author>" (with '/' in the
        author name replaced by '_').  As a side effect it caches the
        experience of every commit by this author that also appears in
        `commits`, so later lookups need no file I/O.
        """
        if self.xp is None:
            with open(exp_dir + self.author.replace("/", "_")) as f:
                commit_history = f.readlines()
            for xp in range(len(commit_history)):
                commit_hash = commit_history[xp].split(',')[0]
                if commit_hash in commits.hash_to_commit:
                    commits.hash_to_commit[commit_hash].xp = xp
        # fails if the experience file does not mention this commit
        assert self.xp is not None, "author: {}\ncommit: {}\nis vcc: {}"\
                   .format(self.author, self.commit_hash, self.is_vcc)
        return self.xp
+
+
class Commits:
    def __init__(self, git_dir, paths, vccs=None):
        """
        Returns a list of Commits at the given paths, ordered chronologically
        by authored time from old to new (NOT the order they were applied).
        paths is a single string appended raw to the git command,
        so any necessary escaping, quoting, etc. should be applied prior
        """
        # NOTE(review): git_dir and paths are interpolated unquoted into a
        # shell command -- callers must pass trusted, pre-escaped values.
        # Fields are tab-separated: commit time, hash, mailmapped author.
        command = "git -C " + git_dir + " log " \
            "--full-history --reverse --no-merges --use-mailmap "\
            "--since=2012-04-09 --format='format:%ct	%H	%aN <%aE>' -- " \
            + paths + " | sort -n | cut -f2,3"
        lines = subprocess.check_output(command, shell=True,
                                        universal_newlines=True).strip()
        assert lines
        self.commits = []
        self.hash_to_commit = {}
        for line in lines.splitlines():
            if '\\' in line:
                # dark incantation to unescape string
                # (git emits non-ASCII names as backslash-escaped bytes)
                line = line.encode('latin1').decode('unicode_escape').encode(
                    'latin1').decode('utf-8')
            line = line.strip().split('	')  # tab
            commit_hash = line[0]
            author = line[1]
            if vccs:
                vcc = commit_hash in vccs
            else:
                vcc = None  # VCC status unknown when no VCC set is supplied
            commit = Commit(line[0], author, vcc)
            self.commits.append(commit)
            self.hash_to_commit[commit_hash] = commit
+
+
class GrowingList(list):
    """A list that transparently grows on out-of-range reads and writes,
    filling the gap by calling default() once per new element."""

    def __init__(self, default):
        super().__init__()
        self.default = default  # zero-argument factory, e.g. a class

    def __setitem__(self, index, value):
        while index >= len(self):
            self.append(self.default())
        list.__setitem__(self, index, value)

    def __getitem__(self, index):
        while index >= len(self):
            self.append(self.default())
        return list.__getitem__(self, index)
+
+
class Counts:
    """Per-experience-level tally: total commits and VCCs at that level."""

    def __init__(self, total=0, vccs=0):
        self.total = total
        self.vccs = vccs
+
+
def count_commits(commits, vccs, exp_dir, counts=None):
    """Tally commits (and VCC-caused bugs) by author experience level.

    counts[j].total is the number of commits authored at experience j;
    counts[j].vccs credits each bug once, at the experience of the first
    VCC commit (chronologically) that contributed to it.  Pass an
    existing `counts` to accumulate across repositories.
    """
    if not counts:
        counts = GrowingList(Counts)
    bugs = set()  # bugs already attributed, to avoid double counting
    for commit in commits.commits:
        j = commit.get_experience(commits, exp_dir)
        if commit.is_vcc:
            for bug in vccs[commit.commit_hash]:
                if bug not in bugs:
                    counts[j].vccs += 1
                    bugs.add(bug)
        counts[j].total += 1
    return counts
+
+
def main(argv):
    """Fit a log-log learning curve to VCC data and plot it.

    argv: vcc_file, git_dirs, project_paths, exp_dirs, plot_path.
    The multi-valued arguments are ':'-separated and must be parallel.
    """
    # a file where each line is a VCC commit hash, followed by the issues it
    # contributed to, comma separated
    vcc_file = argv[1]
    git_dirs = argv[2].split(':')
    # the paths in the git dir to filter on (use "" or . to use everything)
    project_paths = argv[3].split(':')
    # the directory where experiences are stored, normalized to end in '/'
    # (the original mutated the loop variable, which never changed the list)
    exp_dirs = [d if d.endswith('/') else d + '/'
                for d in argv[4].split(':')]
    assert len(git_dirs) == len(exp_dirs) and \
        len(git_dirs) == len(project_paths), \
        "each git dir needs one project path and one experience dir"
    # the path+name of where to save the resulting plot
    plot_path = argv[5]

    vccs = {}
    with open(vcc_file) as f:
        for line in f.readlines():
            line = line.strip().split(',')
            vccs[line[0]] = {issue for issue in line[1:]}

    counts = None
    for i in range(len(git_dirs)):
        commits = Commits(git_dirs[i], project_paths[i], vccs)
        counts = count_commits(commits, vccs, exp_dirs[i], counts)
    # Convert the GrowingList accumulator to a plain list only after all
    # repos are tallied.  (The original converted inside the loop, which
    # handed a plain, non-growing list back to count_commits and would
    # raise IndexError when a later repo reached a higher experience.)
    counts = list(counts)

    def divide(a, b):
        """make division errors (primarily, divide by zero) return None"""
        if a and b:
            return a / b
        elif b:
            return 0
        return None
    cuml_vccs = [sum(c.vccs for c in counts[:j+1]) for j in range(len(counts))]
    cuml_tot = [sum(c.total for c in counts[:j+1]) for j in range(len(counts))]
    cuml_frac = [divide(cuml_vccs[j], cuml_tot[j]) for j in range(len(counts))]

    # to prevent regressing on leading 0 values (i.e., the first n values of j
    # where there were 0 contributors of those j's, so we have no data to
    # regress on, or to take the log of), we need to count and skip them
    offset = 0
    for i in range(len(cuml_vccs)):
        if cuml_vccs[i] != 0:
            offset = i
            break

    # linear regression in log-log space: log(T_j) = log(T_1) - l*log(j)
    xs = np.log([x+1 for x in range(offset, len(counts))])
    ys = np.log(cuml_frac[offset:])
    regression = sp.linregress(xs, ys)

    print(regression)
    learning_coef = -regression.slope
    learning_intercept = -np.exp(regression.intercept) * (learning_coef - 1)
    print("l={}, T1={}".format(learning_coef, learning_intercept))

    xs = np.log([x+1 for x in range(len(counts))])
    plt.plot(
        [x for x in range(offset, len(counts))], cuml_frac[offset:], 'b.',
        [x for x in range(len(counts))],
        np.exp(xs*regression.slope+regression.intercept), 'r--'
    )
    plt.xlabel("j=Experience")
    plt.ylabel("Tj=P(error)")
    plt.xlim(left=0)
    plt.savefig(plot_path)


if __name__ == '__main__':
    main(sys.argv)

+ 240 - 0
code/learning_curves/learningCurve_gradientDescent.py

@@ -0,0 +1,240 @@
+#!/usr/bin/env python3
+
+import sys
+import math
+import numpy as np
+
+import vcclib
+
+# n.b.: in the following functions,
+# j0/k0 are the 0-indexed versions (used mostly for indexing lists),
+# j1/k1 are the 1-indexed versions (used mostly for arithmetic)
+# ell is l spelled out to prevent confusion with the 1 character
+
+
def loss(t1, ell, j1, a):
    """Squared error of the model value t1 * j1**-ell against observation a."""
    residual = t1 * j1 ** -ell - a
    return residual ** 2
+
+
def loss_cuml(t1, ell, j1, cs, as_cuml, expected_vuln):
    """Squared error of predicted vs. actual cumulative VCCs at experience j1.

    ell and cs are unused here: expected_vuln is the already-memoized
    sum over k <= j1 of k**-ell * cs[k-1]; both parameters are kept so
    the signature stays parallel with gradient_cuml.
    """
    return (t1*expected_vuln - as_cuml[j1-1])**2
+
+
def gradient(t1, ell, j1, a):
    """Gradient of loss(t1, ell, j1, a) with respect to (t1, ell).

    d/dt1 [(t1*j1**-ell - a)**2] = j1**(-2*ell) * (2*t1 - 2*a*j1**ell)
    d/dell [(t1*j1**-ell - a)**2] = 2*t1*j1**(-2*ell)*ln(j1)*(a*j1**ell - t1)

    Fix: the original computed the d/dt1 term with `2*a*j1` instead of
    `2*a*j1**ell`, which is only correct when ell == 1.
    """
    # returns (d/dt1, d/dl)
    ret = (j1**(-2*ell)*(2*t1 - 2*a*j1**ell),
           2*t1*j1**(-2*ell)*np.log(j1)*(a*j1**ell - t1))
    return ret
+
+
def gradient_cuml(t1, ell, j1, cs, as_cuml, expected_vuln, log_factor):
    """Gradient (up to a positive constant) of loss_cuml w.r.t. (t1, ell).

    expected_vuln and log_factor are memoized partial sums over k <= j1
    (see calc_expected_vulns / calc_log_factors); ell and cs are not used
    directly but document what those sums were built from.
    """
    # returns (d/dt1, d/dl)

    # without t1 (too slow to use in practice, but left here to explain code
    # expected_vuln_c = sum([k1**-l * cs[k1-1] for k1 in range(1, j1+1)])
    # assert(expected_vuln == expected_vuln_c)
    # log_factor_c = sum([-k1**-l * cs[k1-1] * np.log(k1)
    #                     for k1 in range(1, j1+1)])
    # assert(log_factor == log_factor_c)

    gap = t1*expected_vuln - as_cuml[j1-1]
    d_dt1 = expected_vuln*gap  # *2 removed from both since it's common and
    d_dl = t1*log_factor*gap  # positive, so preserves direction
    return (d_dt1, d_dl)
+
+
def gradient_descent(initial_T1, initial_ell, counts, learning_rate):
    """Fixed-rate gradient descent on the per-experience (non-cumulative) loss.

    Steps at most 2000 times, stopping once the summed loss stops
    improving, and returns the final (t1, ell) guess.  Fixes: the
    original had no return statement, so its result was silently
    discarded (returning the guess is backward compatible); the
    arbitrary 1e9 loss sentinel is replaced with infinity so a huge
    first-step loss cannot terminate the descent prematurely.
    """
    minloss = float("inf")
    guess = (initial_T1, initial_ell)
    for _ in range(2000):
        gradients = [gradient(guess[0], guess[1], j0+1,
                              counts[j0].vccs/counts[j0].total)
                     for j0 in range(len(counts)) if counts[j0].total > 0]
        grad = (sum(g[0] for g in gradients),
                sum(g[1] for g in gradients))
        guess = (guess[0] - learning_rate*grad[0],
                 guess[1] - learning_rate*grad[1])
        total_loss = sum(loss(guess[0], guess[1], j0+1,
                              counts[j0].vccs/counts[j0].total)
                         for j0 in range(len(counts)) if counts[j0].total > 0)
        if total_loss >= minloss:
            break
        minloss = total_loss
    return guess
+
+
def calc_expected_vulns(ell, cs):
    """
    generates values giving the expected number of vulns at exp. <=j,
    without the T1 factor, for memoization

    That is, yields for j1 = 1, 2, ...: sum_{k<=j1} k**-ell * cs[k-1].
    Callers must take at most len(cs) values (cs is indexed directly).
    """
    total = 0
    j1 = 1
    while True:
        try:
            total += j1**-ell * cs[j1-1]
        except OverflowError:
            # once a term overflows, the running sum is pinned at infinity
            total = float("inf")
        yield total
        j1 += 1
+
+
def calc_log_factors(ell, cs):
    """
    Yields, for j1 = 1, 2, ...: sum_{k<=j1} -k**-ell * cs[k-1] * ln(k) --
    the d/d(ell) companion of calc_expected_vulns, without the T1 factor,
    for memoization.  Callers must take at most len(cs) values.
    """
    total = 0
    j1 = 1
    while True:
        try:
            total += -j1**-ell * cs[j1-1] * np.log(j1)
        except OverflowError:
            # once a term overflows, the running sum is pinned at infinity
            total = float("inf")
        yield total
        j1 += 1
+
+
def gradient_descent_cuml(initial_T1, initial_ell,
                          cs, cuml_vccs, learning_rate):
    """Adaptive-rate gradient descent on the cumulative-VCC loss.

    cs[j] is the number of commits at experience j; cuml_vccs[j] the
    cumulative VCC count up to j; learning_rate is a (t1_rate, ell_rate)
    pair.  Runs 1000 iterations, rewinding and halving the rates on any
    step that increases the loss.  Returns (t1, ell, last_gradient,
    best_loss).
    """
    t1, ell = (initial_T1, initial_ell)
    # without t1 factor
    g = calc_expected_vulns(ell, cs)
    expected_vulns = [next(g) for _ in range(len(cs))]
    g = calc_log_factors(ell, cs)
    log_factors = [next(g) for _ in range(len(cs))]

    losses = [loss_cuml(t1, ell, j0+1, cs, cuml_vccs,
                        expected_vulns[j0])
              for j0 in range(len(cs)) if cs[j0] > 0]
    minloss = sum(losses)
    for i in range(1000):
        gradients = [gradient_cuml(t1, ell, j0+1, cs, cuml_vccs,
                                   expected_vulns[j0], log_factors[j0])
                     for j0 in range(len(cs)) if cs[j0] > 0]
        grad = (sum(gradient[0] for gradient in gradients),
                sum(gradient[1] for gradient in gradients))
        # remember the current state so a bad step can be rolled back
        old_t1 = t1
        old_ell = ell
        old_expected_vulns = expected_vulns
        old_log_factors = log_factors
        t1 = t1 - grad[0]*learning_rate[0]
        ell = ell - grad[1]*learning_rate[1]
        # recompute the memoized partial sums for the new ell
        # (still without the t1 factor)
        g = calc_expected_vulns(ell, cs)
        expected_vulns = [next(g) for _ in range(len(cs))]
        g = calc_log_factors(ell, cs)
        log_factors = [next(g) for _ in range(len(cs))]
        losses = [loss_cuml(t1, ell, j0+1, cs, cuml_vccs,
                            expected_vulns[j0])
                  for j0 in range(len(cs)) if cs[j0] > 0]
        losss = sum(losses)

        if losss > minloss:
            # overshot: rewind the step and halve both rates
            t1 = old_t1
            ell = old_ell
            expected_vulns = old_expected_vulns
            log_factors = old_log_factors
            learning_rate = (learning_rate[0]/2, learning_rate[1]/2)
        else:
            # step accepted; periodically double the rates to speed descent
            if i % 100 == 0:
                learning_rate = (learning_rate[0]*2, learning_rate[1]*2)
            minloss = losss

    # grad is the gradient computed on the final iteration
    return t1, ell, grad, minloss
+
+
def main(argv):
    """Fit (T1, l) by gradient descent on cumulative VCC counts.

    argv: vcc_file, git_dirs, project_paths, exp_dirs (':'-separated,
    parallel), T1 guesses (':'-separated), ell guesses (':'-separated),
    and optionally the literal "search" to bisect for how many extra
    lowest-experience VCCs would flip the sign of the fitted ell.
    """
    # a file where each line is a VCC commit hash, followed by the issues it
    # contributed to, comma separated
    vcc_file = argv[1]
    git_dirs = argv[2].split(':')
    # the paths in the git dir to filter on (use "" or . to use everything)
    project_paths = argv[3].split(':')
    # the directory where experiences are stored
    exp_dirs = vcclib.expdirs(argv[4].split(':'))
    assert len(git_dirs) == len(exp_dirs) and \
        len(git_dirs) == len(project_paths), \
        "each git dir needs one project path and one experience dir"

    guesses_t1 = [float(f) for f in argv[5].split(':')]
    guesses_ell = [float(f) for f in argv[6].split(':')]

    search = len(argv) > 7 and argv[7] == "search"

    vccs = vcclib.get_vccs(vcc_file)
    assert len(vccs) > 0, "found no VCCs (vcc_file: {})".format(vcc_file)

    counts = vcclib.count_all_commits(git_dirs, project_paths, exp_dirs, vccs)
    cs = [count.total for count in counts]

    learning_rate = (1e-14, 1e-14)

    if not search:
        # normal mode, run grad descent on actual data, then +/-1 for err bars
        for bias in [0, -1, +1]:
            cuml_vccs = [max(0, bias + sum(c.vccs for c in counts[:j+1]))
                         for j in range(len(counts))]
            for t1_guess in guesses_t1:
                for ell_guess in guesses_ell:
                    t1, ell, grad, minloss = \
                        gradient_descent_cuml(t1_guess, ell_guess,
                                              cs, cuml_vccs, learning_rate)
                    print(bias, t1_guess, ell_guess, t1, ell, grad, minloss,
                          flush=True)
    else:
        # search mode, run a binary search on 1 exp VCCs to flip ell to +

        # a reasonable starting point is from no change to "1/2 of VCCs are 1"
        bias_low = 0
        bias_high = len(vccs)

        # first, initial backoff to find upper bound
        # (keep doubling bias_high until the best fit has positive ell)
        best_ell = 0
        while best_ell <= 0:
            cuml_vccs = [max(0, bias_high + sum(c.vccs for c in counts[:j+1]))
                         for j in range(len(counts))]
            best_minloss = math.inf
            best_t1 = 0
            for t1_guess in guesses_t1:
                for ell_guess in guesses_ell:
                    t1, ell, _grad, minloss = \
                        gradient_descent_cuml(t1_guess, ell_guess,
                                              cs, cuml_vccs, learning_rate)
                    if minloss < best_minloss:
                        best_minloss = minloss
                        best_ell = ell
                        best_t1 = t1

            print(bias_high, best_t1, best_ell, flush=True)

            # no do-while loop in python
            if best_ell <= 0:
                bias_low = bias_high
                bias_high *= 2

        # now do the actual bisecting search, with the previous loop having
        # given us two values we know are above and below the target
        while bias_high - bias_low > 1:
            bias = (bias_high - bias_low)//2 + bias_low
            cuml_vccs = [max(0, bias + sum(c.vccs for c in counts[:j+1]))
                         for j in range(len(counts))]
            best_minloss = math.inf
            best_ell = 0
            for t1_guess in guesses_t1:
                for ell_guess in guesses_ell:
                    t1, ell, _grad, minloss = \
                        gradient_descent_cuml(t1_guess, ell_guess,
                                              cs, cuml_vccs, learning_rate)
                    if minloss < best_minloss:
                        best_minloss = minloss
                        best_ell = ell
                        best_t1 = t1

            print(bias, best_t1, best_ell, flush=True)

            if best_ell > 0:
                bias_high = bias
            else:
                bias_low = bias
        # NOTE(review): "Learning rate" in this message refers to the fitted
        # learning exponent l (ell), not the descent step size.
        print("Learning rate becomes positive going from {} to {}".format(
            bias_low, bias_high))


if __name__ == '__main__':
    main(sys.argv)

+ 91 - 0
code/learning_curves/model-vs-model.py

@@ -0,0 +1,91 @@
+#!/usr/bin/env python3
+
+import sys
+from decimal import Decimal
+import numpy as np
+import matplotlib.pyplot as plt
+import matplotlib as mpl
+
+import vcclib
+
+# n.b.: ell is l spelled out to prevent confusion with the 1 character
+
+
def main(argv):
    """Plot two fitted learning-curve models with their error bands.

    argv: plot_path, then for each of the two models: name, t1, ell,
    t1_err_low, ell_err_low, t1_err_up, ell_err_up; finally max_j, the
    largest experience value (x axis) to draw.
    """
    argv2 = argv[1:]
    plot_path = argv2.pop(0)

    model1_name = argv2.pop(0)
    model1_t1 = Decimal(argv2.pop(0))
    model1_ell = Decimal(argv2.pop(0))
    model1_t1_err_low = Decimal(argv2.pop(0))
    model1_ell_err_low = Decimal(argv2.pop(0))
    model1_t1_err_up = Decimal(argv2.pop(0))
    model1_ell_err_up = Decimal(argv2.pop(0))

    model2_name = argv2.pop(0)
    model2_t1 = Decimal(argv2.pop(0))
    model2_ell = Decimal(argv2.pop(0))
    model2_t1_err_low = Decimal(argv2.pop(0))
    model2_ell_err_low = Decimal(argv2.pop(0))
    model2_t1_err_up = Decimal(argv2.pop(0))
    model2_ell_err_up = Decimal(argv2.pop(0))

    max_j = int(argv2.pop(0))

    # round each estimate together with its error bounds to a shared
    # significant precision before display
    model1_t1_sig = vcclib.sigfigs([model1_t1,
                                    model1_t1_err_low,
                                    model1_t1_err_up])[0]
    model1_ell_sig = vcclib.sigfigs([model1_ell,
                                     model1_ell_err_low,
                                     model1_ell_err_up])[0]
    model2_t1_sig = vcclib.sigfigs([model2_t1,
                                    model2_t1_err_low,
                                    model2_t1_err_up])[0]
    model2_ell_sig = vcclib.sigfigs([model2_ell,
                                     model2_ell_err_low,
                                     model2_ell_err_up])[0]

    model1_t1_str = np.format_float_positional(model1_t1_sig, 3,
                                               fractional=False)
    model1_ell_str = np.format_float_positional(-model1_ell_sig, 3,
                                                fractional=False)
    model2_t1_str = np.format_float_positional(model2_t1_sig, 3,
                                               fractional=False)
    model2_ell_str = np.format_float_positional(-model2_ell_sig, 3,
                                                fractional=False)

    xs = [x+1 for x in range(max_j)]

    # P_j = T1 * j**-l for the point estimates and both error envelopes
    ys_model1 = [model1_t1 * (x)**-model1_ell for x in xs]
    ys_model1_low = [model1_t1_err_low * (x)**-model1_ell_err_low for x in xs]
    ys_model1_up = [model1_t1_err_up * (x)**-model1_ell_err_up for x in xs]

    ys_model2 = [model2_t1 * (x)**-model2_ell for x in xs]
    ys_model2_low = [model2_t1_err_low * (x)**-model2_ell_err_low for x in xs]
    ys_model2_up = [model2_t1_err_up * (x)**-model2_ell_err_up for x in xs]

    plt.rc('text', usetex=True)
    plt.rc('font', family='serif', size=18)
    mpl.rcParams["text.latex.preamble"] = \
        "\\usepackage{relsize}\n\\usepackage{xspace}"
    plt.plot(xs, ys_model1, 'b--',
             label=r"{}: $P_j={} j^{{{}}}$".format(
                 model1_name, model1_t1_str, model1_ell_str))
    plt.fill_between(xs, ys_model1_low, ys_model1_up,
                     color='blue', alpha=0.2)
    plt.plot(xs, ys_model2, 'r-',
             label=r"{}: $P_j={} j^{{{}}}$".format(
                 model2_name, model2_t1_str, model2_ell_str))
    plt.fill_between(xs, ys_model2_low, ys_model2_up,
                     color='red', alpha=0.2)
    plt.xlabel("$j=$ Experience")
    plt.ylabel("$P_j$")
    plt.xlim(left=1)
    plt.legend(loc="upper right")
    plt.tight_layout()
    plt.savefig(plot_path)


if __name__ == '__main__':
    main(sys.argv)

+ 95 - 0
code/learning_curves/model-vs-real.py

@@ -0,0 +1,95 @@
+#!/usr/bin/env python3
+
+import sys
+from decimal import Decimal
+import numpy as np
+import matplotlib.pyplot as plt
+from matplotlib.ticker import MaxNLocator
+
+import vcclib
+
+# n.b.: ell is l spelled out to prevent confusion with the 1 character
+
+
def project_from_model(t1, ell, xs, counts):
    """Cumulative expected vulnerability counts predicted by the model.

    For each experience x in xs, adds t1 * x**-ell times the number of
    commits at that experience, and yields the running total.
    """
    ys = []
    running = 0
    for x in xs:
        running += t1 * x ** -ell * counts[x - 1].total
        ys.append(running)
    return ys
+
+
def main(argv):
    """Plot empirical cumulative VCCs against the fitted model's projection.

    argv: vcc_file, git_dirs, project_paths, exp_dirs (':'-separated,
    parallel), plot_path, then the fitted t1 and ell followed by their
    lower and upper error-bound counterparts.
    """
    # a file where each line is a VCC commit hash, followed by the issues it
    # contributed to, comma separated
    vcc_file = argv[1]
    git_dirs = argv[2].split(':')
    # the paths in the git dir to filter on (use "" or . to use everything)
    project_paths = argv[3].split(':')
    # the directory where experiences are stored
    exp_dirs = vcclib.expdirs(argv[4].split(':'))
    assert len(git_dirs) == len(exp_dirs) and \
        len(git_dirs) == len(project_paths), \
        "each git dir needs one project path and one experience dir"
    # the path+name of where to save the resulting plot
    plot_path = argv[5]

    model_t1 = Decimal(argv[6])
    model_ell = Decimal(argv[7])
    model_t1_err_low = Decimal(argv[8])
    model_ell_err_low = Decimal(argv[9])
    model_t1_err_up = Decimal(argv[10])
    model_ell_err_up = Decimal(argv[11])
    # round the estimates and their error bounds to a shared precision
    mt1_sig = vcclib.sigfigs([model_t1, model_t1_err_low, model_t1_err_up])[0]
    ml_sig = vcclib.sigfigs([model_ell, model_ell_err_low,
                             model_ell_err_up])[0]

    model_t1_str = np.format_float_positional(mt1_sig, 3, fractional=False)
    model_ell_str = np.format_float_positional(-ml_sig, 3, fractional=False)

    vccs = vcclib.get_vccs(vcc_file)

    counts = vcclib.count_all_commits(git_dirs, project_paths, exp_dirs, vccs)
    cuml_vccs = [sum(c.vccs for c in counts[:j+1]) for j in range(len(counts))]
    cuml_tot = [sum(c.total for c in counts[:j+1]) for j in range(len(counts))]

    # skip values where there's no data to compare against
    offset = 0
    for i in range(len(cuml_vccs)):
        if cuml_tot[i] != 0:
            offset = i
            break

    xs_empirical = [x+1 for x in range(offset, len(counts))]
    xs_model = [x+1 for x in range(len(counts))]
    ys_model = project_from_model(model_t1, model_ell, xs_model, counts)
    print(model_t1, model_ell)
    ys_err_low = project_from_model(model_t1_err_low, model_ell_err_low,
                                    xs_model, counts)
    ys_err_up = project_from_model(model_t1_err_up, model_ell_err_up,
                                   xs_model, counts)

    plt.rc('text', usetex=True)
    plt.rc('font', family='serif', size=18)
    ax = plt.figure().gca()
    # vulnerability counts are integral; keep the y-axis ticks integral too
    ax.yaxis.set_major_locator(MaxNLocator(integer=True))

    plt.plot(xs_empirical, cuml_vccs, 'm.',
             label=r"Empirical $v_{\le j}$")
    plt.plot(xs_model, ys_model, 'g--',
             label=r"$V_{\le j}=\sum_{k=0}^{j}" + model_t1_str + " c_k k^{" +
             model_ell_str + "}$")
    plt.fill_between(xs_model, ys_err_low, ys_err_up,
                     color='green', alpha=0.2)
    plt.xlabel("$j=$ Experience")
    plt.ylabel("Vulnerabilities")
    plt.xlim(left=0)
    plt.legend(loc="lower right")
    plt.tight_layout()
    plt.savefig(plot_path)


if __name__ == '__main__':
    main(sys.argv)

+ 67 - 0
code/learning_curves/plot-experience.py

@@ -0,0 +1,67 @@
+#!/usr/bin/env python3
+
+import sys
+import matplotlib.pyplot as plt
+import matplotlib as mpl
+
+import vcclib
+
+
def run(args):
    """Return cumulative commit counts by experience for one project group.

    args: [git_dirs, project_paths, exp_dirs], each ':'-separated and
    parallel.
    """
    git_dirs = args[0].split(':')
    # the paths in the git dir to filter on (use "" or . to use everything)
    project_paths = args[1].split(':')
    # the directory where experiences are stored
    exp_dirs = vcclib.expdirs(args[2].split(':'))

    # empty vccs dict: only totals are needed here, not VCC attribution
    counts = vcclib.count_all_commits(git_dirs, project_paths, exp_dirs, {})
    cuml_tot = [sum(c.total for c in counts[:j+1]) for j in range(len(counts))]
    return cuml_tot
+
+
def main(argv):
    """Plot cumulative commit counts (fraction and absolute) by experience.

    argv: [1-3] first project's git_dirs/paths/exp_dirs, [4] its label,
    [5-7] second project's git_dirs/paths/exp_dirs, [8] its label,
    [9] output path for the fraction plot, [10] for the totals plot.
    """
    # the path+name of where to save the resulting plot
    frac_path = argv[9]
    tot_path = argv[10]

    cuml_tot1 = run(argv[1:4])
    label1 = argv[4]
    cuml_tot2 = run(argv[5:8])
    label2 = argv[8]

    cuml_frac1 = [v/cuml_tot1[-1] for v in cuml_tot1]
    cuml_frac2 = [v/cuml_tot2[-1] for v in cuml_tot2]
    xs1 = [x+1 for x in range(len(cuml_tot1))]
    xs2 = [x+1 for x in range(len(cuml_tot2))]

    plt.rc('text', usetex=True)
    plt.rc('font', family='serif', size=18)
    mpl.rcParams["text.latex.preamble"] = \
        "\\usepackage{relsize}\n\\usepackage{xspace}"
    l1 = plt.plot(xs1, cuml_frac1, 'b+',
                  label=r"{} $c_{{\le j}}$".format(label1))
    l2 = plt.plot(xs2, cuml_frac2, 'rs',
                  label=r"{} $c_{{\le j}}$".format(label2))
    plt.xlabel("$j=$ Experience")
    plt.ylabel("Fraction of projects' commits")
    plt.xscale('log')
    plt.yscale('log')
    plt.xlim(left=1)
    ax = plt.gca()
    plt.legend(loc='lower right')
    plt.tight_layout()
    plt.savefig(frac_path)

    # strip the fraction series, then reuse the same styled figure for
    # the absolute-totals plot
    l1.pop().remove()
    l2.pop().remove()

    plt.ylabel("Number of projects' commits")
    ax.set_ylim([1, 50000])
    plt.plot(xs1, cuml_tot1, 'b+', label=r"{} $c_{{\le j}}$".format(label1))
    plt.plot(xs2, cuml_tot2, 'rs', label=r"{} $c_{{\le j}}$".format(label2))
    plt.tight_layout()
    plt.savefig(tot_path)


if __name__ == '__main__':
    main(sys.argv)

+ 68 - 0
code/learning_curves/plot_T1s.py

@@ -0,0 +1,68 @@
+#!/usr/bin/env python3
+
+import sys
+import subprocess
+import math
+import numpy as np
+import matplotlib.pyplot as plt
+
+
def get_commit_xps(git_dir, paths):
    """Return each commit's author experience, in authored-time order.

    Experience is 1-based here: an author's first commit in the log is 1.
    paths is appended raw to the shell command; callers must escape it.
    """
    command = "git -C " + git_dir + " log " \
        "--full-history --reverse --no-merges --use-mailmap "\
        "--format='format:%ct	%aN <%aE>' -- " + paths + \
        " | sort -n | cut -f2"
    lines = subprocess.check_output(command, shell=True,
                                    universal_newlines=True).strip()
    assert lines
    author_xps = {}  # author -> experience their next commit will have
    xps = []
    for line in lines.splitlines():
        author = line.strip()
        if author not in author_xps:
            author_xps[author] = 1
        xps.append(author_xps[author])
        author_xps[author] += 1
    return xps
+
+
def find_T1_at_l(commit_xps, num_vccs, l, precision):
    """Binary-search the largest T1 consistent with the observed VCC count.

    Models a commit with experience c as vulnerable with probability
    T1*c**-l, and finds (to within `precision`) the T1 at which the tail
    probability of seeing at least s = len(commit_xps) - num_vccs
    vulnerability-free commits crosses 0.05.
    NOTE(review): assumes 0 < precision < 1 so the loop runs at least
    once (otherwise T1_guess is unbound); the asserts below also require
    s to exceed the expected number of vuln-free commits.
    """
    left = 0.0
    right = 1.0
    s = len(commit_xps) - num_vccs
    print("l: {} s: {}".format(l, s))
    while (right - left) > precision:
        T1_guess = (left + right) / 2
        # expected number of vuln-free commits under T1_guess
        mean = sum([1.0-T1_guess*c**-l for c in commit_xps])
        assert(s > mean)
        assert(math.log(s/mean) > 0)
        # Pr[S>=s], i.e. the probability that a distribution with
        # T1_guess would have produced at least this many vuln-free
        # commits
        p = math.exp(s - mean - s * math.log(s/mean))
        print(T1_guess, mean, p)
        if p < 0.05:
            # the probability of fewer good commits is <5%,
            # so this T1 would have produced more vulns with >95% prob.
            # we can lower our guess for T1
            right = T1_guess
        else:
            left = T1_guess
    return T1_guess
+
+
def main(argv):
    """Plot, for a sweep of l values, the largest T1 the data admits.

    argv: git_dirs (':'-separated), paths (':'-separated, parallel),
    plot_path, num_vccs (number of known VCCs across all the repos).
    """
    git_dirs = argv[1].split(':')
    paths = argv[2].split(':')
    plot_path = argv[3]
    # concatenate per-commit experiences across every listed repository
    commit_xps = [xp for i in range(len(git_dirs)) for xp in
                  get_commit_xps(git_dirs[i], paths[i])]
    num_vccs = int(argv[4])
    l_vals = np.arange(0.01, 0.2, 0.02)
    print(l_vals)
    T1s = [find_T1_at_l(commit_xps, num_vccs, l, 0.0001) for l in l_vals]
    print(T1s)
    plt.plot(l_vals, T1s)
    plt.savefig(plot_path)


if __name__ == '__main__':
    main(sys.argv)

+ 37 - 0
code/learning_curves/plot_rust_data.sh

@@ -0,0 +1,37 @@
# NOTE(review): paths are hard-coded to one developer's home directory and
# there is no shebang or error handling; run with an explicit shell on a
# machine where these checkouts exist.
DATA_DIR=/home/j3tracey/Documents/rustsafety/data/
CODE_DIR=/home/j3tracey/Documents/rustsafety/code/

# plot_T1s.py args: git_dirs, paths, output plot, number of known VCCs;
# each repo gets a freshly generated .mailmap first.

# CSS styling
# stylo
cd "$DATA_DIR/servo"
python3 "$CODE_DIR/author-identities.py" . | sort > .mailmap
python3 "$CODE_DIR/learning_curves/plot_T1s.py" "." "components/style/" "$DATA_DIR/learning-curve-plots/stylo.svg" 4

# Rendering
# webrender
cd "$DATA_DIR/webrender"
python3 "$CODE_DIR/author-identities.py" . | sort > .mailmap
python3 "$CODE_DIR/learning_curves/plot_T1s.py" "../gecko-dev:." "gfx/webrender_bindings:." "$DATA_DIR/learning-curve-plots/webrender.svg" 2

# Color management
# qcms
cd "$DATA_DIR/qcms"
python3 "$CODE_DIR/author-identities.py" . | sort > .mailmap
python3 "$CODE_DIR/learning_curves/plot_T1s.py" "." "." "$DATA_DIR/learning-curve-plots/qcms-rust.svg" 0

# MP4 Parser
# mp4parse-rust
cd "$DATA_DIR/mp4parse-rust"
python3 "$CODE_DIR/author-identities.py" . | sort > .mailmap
python3 "$CODE_DIR/learning_curves/plot_T1s.py" "." "." "$DATA_DIR/learning-curve-plots/mp4parse-rust.svg" 0

# Unicode Encoder
# encoding_rs
cd "$DATA_DIR/encoding_rs"
python3 "$CODE_DIR/author-identities.py" . | sort > .mailmap
python3 "$CODE_DIR/learning_curves/plot_T1s.py" "." "." "$DATA_DIR/learning-curve-plots/encoding_rs.svg" 0

# Combined (above + Libcubeb: MacOS)
cd "$DATA_DIR/cubeb-coreaudio-rs"
python3 "$CODE_DIR/author-identities.py" . | sort > .mailmap
python3 "$CODE_DIR/learning_curves/plot_T1s.py" "$DATA_DIR/gecko-dev:$DATA_DIR/servo:$DATA_DIR/webrender:$DATA_DIR/qcms:$DATA_DIR/mp4parse-rust:$DATA_DIR/encoding_rs:$DATA_DIR/cubeb-coreaudio-rs" "gfx/webrender_bindings:components/style/:::::" "$DATA_DIR/learning-curve-plots/rust-combined.svg" 7

+ 147 - 0
code/learning_curves/vcclib.py

@@ -0,0 +1,147 @@
+import subprocess
+
+
class Commit:
    """A commit's hash, author string, VCC flag, and (lazy) experience."""

    def __init__(self, commit_hash, author, vcc=None):
        self.commit_hash = commit_hash
        self.author = author
        self.is_vcc = vcc  # True/False when known; None when undetermined
        self.xp = None  # author's prior-commit count; computed on demand

    def get_experience(self, commits, exp_dir):
        """Return this commit's 0-based index in its author's history.

        Reads the per-author file "<exp_dir><author>" (with '/' in the
        author name replaced by '_'), and as a side effect caches the
        experience of every commit by this author present in `commits`.
        """
        if self.xp is None:
            with open(exp_dir + self.author.replace("/", "_")) as f:
                commit_history = f.readlines()
            for xp in range(len(commit_history)):
                commit_hash = commit_history[xp].split(',')[0]
                if commit_hash in commits.hash_to_commit:
                    commits.hash_to_commit[commit_hash].xp = xp
        # fails if the experience file does not mention this commit
        assert self.xp is not None, "author: {}\ncommit: {}\nis vcc: {}"\
                   .format(self.author, self.commit_hash, self.is_vcc)
        return self.xp
+
+
class Commits:
    def __init__(self, git_dir, paths, vccs=None):
        """
        Returns a list of Commits at the given paths, ordered chronologically
        by authored time from old to new (NOT the order they were applied).
        paths is a single string appended raw to the git command,
        so any necessary escaping, quoting, etc. should be applied prior
        """
        # NOTE(review): --since-as-filter is only available in recent git
        # releases -- confirm the target environment's git version.
        # Fields are tab-separated: commit time, hash, mailmapped author.
        command = "git -C " + git_dir + " log " \
            "--full-history --reverse --no-merges --use-mailmap " \
            "--since-as-filter=2012-04-09 " \
            "--format='format:%ct	%H	%aN <%aE>' -- " \
            + paths + " | sort -n | cut -f2,3"
        lines = subprocess.check_output(command, shell=True,
                                        universal_newlines=True).strip()
        assert lines, "git command failed to return any commits: {}"\
            .format(command)
        self.commits = []
        self.hash_to_commit = {}
        matched_vccs = set()  # VCC hashes actually seen in this repo's log
        for line in lines.splitlines():
            if '\\' in line:
                # dark incantation to unescape string
                # (git emits non-ASCII names as backslash-escaped bytes)
                line = line.encode('latin1').decode('unicode_escape').encode(
                    'latin1').decode('utf-8')
            line = line.strip().split('	')  # tab
            commit_hash = line[0]
            author = line[1]
            if vccs:
                vcc = commit_hash in vccs
                if vcc:
                    matched_vccs.add(commit_hash)
            else:
                vcc = None  # VCC status unknown when no VCC set is supplied
        # debugging aid; re-enabling it would also require `import sys`
            commit = Commit(line[0], author, vcc)
            self.commits.append(commit)
            self.hash_to_commit[commit_hash] = commit
        # unmatched_vccs = [vcc for vcc in vccs if vcc not in matched_vccs]
        # print("VCCs unmatched to any valid commit:", unmatched_vccs,
        #      file=sys.stderr, flush=True)
+
+
class GrowingList(list):
    """A list that auto-extends on out-of-range access, padding with
    fresh values produced by the default() factory."""

    def __init__(self, default):
        super().__init__()
        self.default = default

    def _grow_to(self, index):
        # pad with freshly constructed defaults until index is in range
        while len(self) <= index:
            self.append(self.default())

    def __setitem__(self, index, value):
        self._grow_to(index)
        list.__setitem__(self, index, value)

    def __getitem__(self, index):
        self._grow_to(index)
        return list.__getitem__(self, index)
+
+
class Counts:
    """Per-experience-level tally: total commits and VCC-attributed bugs."""

    def __init__(self, total=0, vccs=0):
        self.total = total
        self.vccs = vccs
+
+
def count_commits(commits, vccs, exp_dir, counts=None):
    """Tally commits (and VCC-caused bugs) by author experience level.

    counts[j].total is the number of commits authored at experience j;
    counts[j].vccs credits each bug once, at the experience of the first
    contributing VCC commit encountered.  Pass an existing `counts` to
    accumulate across repositories.
    NOTE(review): the `bugs` dedup set is per call, so a bug whose VCCs
    span multiple repositories is counted once per repository -- confirm
    that is intended.
    """
    if not counts:
        counts = GrowingList(Counts)
    bugs = set()
    for commit in commits.commits:
        j = commit.get_experience(commits, exp_dir)
        if commit.is_vcc:
            for bug in vccs[commit.commit_hash]:
                if bug not in bugs:
                    counts[j].vccs += 1
                    bugs.add(bug)
        counts[j].total += 1
    return counts
+
+
def count_all_commits(git_dirs, project_paths, exp_dirs, vccs):
    """Accumulate per-experience commit/VCC tallies across several repos.

    The three list arguments must be parallel; vccs maps VCC hashes to
    the sets of issues they contributed to.  Returns a plain list of
    Counts indexed by author experience.
    """
    assert len(git_dirs) == len(exp_dirs) and \
        len(git_dirs) == len(project_paths), \
        "each git dir needs one project path and one experience dir"

    counts = None
    for git_dir, paths, exp_dir in zip(git_dirs, project_paths, exp_dirs):
        commits = Commits(git_dir, paths, vccs)
        counts = count_commits(commits, vccs, exp_dir, counts)

    # convert the GrowingList accumulator to a normal list
    return list(counts)
+
+
def get_vccs(vcc_file):
    """Parse a VCC file into {commit_hash: set_of_issue_ids}.

    Each line is a commit hash followed by the issues it contributed to,
    comma separated.  Hashes appearing on multiple lines have their
    issue sets merged.  Blank lines are skipped (the original created a
    bogus entry under the empty-string key for them).
    """
    vccs = {}
    with open(vcc_file) as f:
        for line in f:
            fields = line.strip().split(',')
            if not fields[0]:
                continue  # blank line
            vccs.setdefault(fields[0], set()).update(fields[1:])
    return vccs
+
+
def expdirs(exp_dirs):
    """Return exp_dirs with a trailing '/' appended to entries lacking one.

    Fix: the original mutated the loop variable, which does not modify
    the list, so it returned its input unchanged and callers built paths
    like "expdirauthor" instead of "expdir/author".
    """
    return [d if d.endswith('/') else d + '/' for d in exp_dirs]
+
+
# takes an iterable of Decimal objects
def sigfigs(vals):
    """Round vals to a common precision just sufficient to tell them apart.

    If the most significant digits sit at different magnitudes, round
    everything to the decimal place of the largest magnitude; otherwise
    increase precision one digit at a time until the rounded values
    differ.  NOTE(review): despite the comment above, vals must be an
    indexable sequence (it is indexed below), not just any iterable.
    """
    msds = [v.adjusted() for v in vals]
    if not all(msd == msds[0] for msd in msds):
        msd = -max(msds)
        return [round(vals[i], msd) for i in range(len(vals))]

    for i in range(-msds[0], 20):  # arbitrarily high precision
        if any(round(v, i) != round(vals[0], i) for v in vals):
            return [round(v, i) for v in vals]
    return vals

+ 249 - 0
code/reproduceResults.sh

@@ -0,0 +1,249 @@
#!/bin/bash

## How to Use
# Takes 0 inputs, and outputs the data from our techniques.
# Can also take optional flags to run subset of commands, see -h output.

## Fields to Maintain
# git repos of projects we analyze
# N.b: "webrender_bindings", the only Rust code without its own git repo,
#      is handled specially in the last step of this script.
declare -A repos
repos["firefox"]='https://github.com/mozilla/gecko-dev.git'
repos["webrender"]='https://github.com/servo/webrender.git'
repos["servo"]='https://github.com/jtracey/servo-mirror.git'
repos["mp4parse-rust"]='https://github.com/mozilla/mp4parse-rust.git'
#repos["qcms"]='https://github.com/FirefoxGraphics/qcms.git'
repos["encoding_rs"]='https://github.com/hsivonen/encoding_rs.git'
repos["mapped_hyph"]='https://github.com/jfkthame/mapped_hyph.git'
#repos["chardetng"]='https://github.com/hsivonen/chardetng.git'
#repos["shift_or_euc"]='https://github.com/hsivonen/shift_or_euc'
repos["cubeb-coreaudio-rs"]='https://github.com/mozilla/cubeb-coreaudio-rs.git'

# git hash of the most recent commit in the tree we consider
# (keys must match the `repos` table above)
declare -A commits
commits["firefox"]=e5d3122984cea27576ad55b9898f2ec46529c5c9
commits["webrender"]=cb2b55394892ef9ea1e89dbe41fd3a8cebd61468
commits["servo"]=b1578947ef369a1810d1a83373f68bfd7fe23fe1
commits["mp4parse-rust"]=4f70fc9ec2b43f17003c476dcc0ad1737ae100dc
#commits["qcms"]=f2fdcde3912967fa06a5fff0957eebc7901c0645
commits["encoding_rs"]=a962ef4f8e569ccf5a22104d19cc10e8a0b458e6
commits["mapped_hyph"]=c7651a0cffff41996ad13c44f689bd9cd2192c01
#commits["chardetng"]=143dadde20e283a46ef33ba960b517a3283a3d22
commits["cubeb-coreaudio-rs"]=3ea3897147fa52ee3586b81d6d48315f0fba2777

# whitespace-separated list of C++ projects to analyze
cprojects="layers css stagefright qcms uconv hyphenation japanese-encoding cubeb-macos"

# whitespace-separated list of Rust projects to analyze
# (webrender_bindings is in gecko, stylo is in the servo git repo)
# you would also need to update the following scripts/files:
# /code/fetch_bugzilla_bugs/fetch_bugs.sh
# /code/fetch_bugzilla_bugs/filter.sh
# /data/hand-annotated/relevant-dirs.csv
rustprojects="webrender webrender_bindings servo mp4parse-rust encoding_rs mapped_hyph cubeb-coreaudio-rs"

## Project Directory Structure
# (data dirs are made prior to being populated)
# directory this script is in
scriptDir="$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)"
# root dir of this project
rootDir="$(dirname $scriptDir)"
# dir that stores all code, scripts, etc. (that we run, not analyze)
codeDir="$rootDir/code/"
# dir where we store all generated results and the few hand-annotated files
dataDir="$rootDir/data/"
# dir to clone repos into
repoDir="$dataDir/repos/"
# dir to store structured representation of issues
issuesDir="$dataDir/issues/"
# dir to store structured representation of fixes
fixesDir="$dataDir/fixes/"
# dir to store the output of SZZ
resultsDir="$dataDir/szz-results/"
# dir to store our best guess of which commits introduced which bugs
bugIntroDir="$dataDir/introducers/"
# dir to store the results of comparison of SZZ with available ground truth
groundDir="$dataDir/ground-truth/"
# dir where we store hand-annotated data
# (i.e., data not generated by this script)
annotatedDir="$dataDir/hand-annotated/"
# dir where we store structured data about issues, their fixes, and inducers
bugzillaDir="$dataDir/bugzilla-data/"
# dir where we store the experience of each contributor for each project
experienceDir="$dataDir/experiences/"
# dir where we store the learning curve plots
plotDir="$dataDir/learning-curve-plots/"

# abort on the first failing command
set -e

# running with no flags at all is equivalent to -a (run every step)
if (($# == 0)); then
    allFlag=true
fi
# parse the single-letter step-selection flags; each one enables exactly
# one pipeline step below (or -a for all of them)
while getopts hacdistuv opt; do
    case $opt in
        h)
            echo "usage: $0 [flags]"
            echo "  -h   print this help and exit"
            echo "  -a   run all steps (this is the default if no args given)"
            echo "  -c   clone each project git repo into $repoDir, if it doesn't already exist"
            echo "  -d   download relevant issues"
            echo "  -i   identify fix commits"
            echo "  -s   run SZZ"
            echo "  -t   compare to ground truth"
            echo "  -u   update experience files"
            echo "  -v   create and visualize learning curves"
            exit
            ;;
        a) allFlag=true ;;
        c) cloneFlag=true ;;
        d) downloadFlag=true ;;
        i) identifyFlag=true ;;
        s) szzFlag=true ;;
        t) truthFlag=true ;;
        u) experienceFlag=true ;;
        # unknown flags only warn; execution continues with known flags
        \?) echo "unknown flag: -$OPTARG" >&2 ;;
    esac
done
+
+
# Step -1: check dependencies (only if running everything)
if [ "$allFlag" = true ] ; then
    missing=false
    # external tools invoked directly by this script and its children
    for dependency in git jq python3 gradle java javac ; do
        if ! type "$dependency" > /dev/null; then
            echo "Missing dependency: $dependency" >&2
            missing=true
        fi
    done
    # python modules imported by the analysis/plotting scripts
    for pyMod in numpy scipy matplotlib.pyplot urllib3 ; do
        if ! python3 -c "import $pyMod" > /dev/null; then
            echo "Missing python module: $pyMod"
            missing=true
        fi
    done
    # dvipng texlive-latex-extra texlive-fonts-recommended cm-super
    if [ "$missing" = true ] ; then
        echo "Aborting due to missing dependencies." >&2
        exit 1
    fi
    # `git --version` prints e.g. "git version 2.37.1": take the third
    # word before splitting on dots.  (The old `git --version | cut -f1 -d.`
    # left the "git version " prefix in git_major, so the numeric test
    # below always errored out.)
    git_version=$(git --version | cut -f3 -d' ')
    git_major=$(echo "$git_version" | cut -f1 -d.)
    git_minor=$(echo "$git_version" | cut -f2 -d.)
    # need --since-as-filter, added in git 2.37; any major > 2 is fine too
    # (the old `major < 2 || minor < 37` check wrongly rejected e.g. 3.0)
    if [ "$git_major" -lt 2 ] || { [ "$git_major" -eq 2 ] && [ "$git_minor" -lt 37 ]; }; then
        echo "Aborting: git needs --since-as-filter, added in 2.37" >&2
        exit 1
    fi
fi
+
+
# Step 0: for each repo that doesn't already exist:
#  - clone
#  - uncap rename limits
#  - generate the .mailmap file
for repo in "${!repos[@]}" ; do
    # clone only when the checkout is absent AND cloning was requested
    # (explicitly with -c, or implicitly via -a / no flags)
    if ! [ -d "$repoDir/$repo" ] && ([ "$allFlag" = true ] || [ "$cloneFlag" = true ])
    then
        mkdir -p "$repoDir"
        git clone "${repos[$repo]}" "$repoDir/$repo"
        cd "$repoDir/$repo"
        # renameLimit 0 = unlimited, so rename detection is never skipped
        git config diff.renameLimit 0
        # pin the working tree to the commit recorded at the top of this script
        git checkout "${commits[$repo]}"
        # unify author identities for all later per-author analyses
        python3 "$codeDir/author-identities.py" . | sort > .mailmap
        cd "$rootDir"
    elif [ "$cloneFlag" = true ] ; then
        echo "You're trying to clone $repo, but it seems to already exist." >&2
        echo "Remove or move $repoDir/$repo if you really want to clone it again." >&2
        echo "Continuing as though this succeeded." >&2
    fi
done
+
+
# Step 1: get (filtered) issues
if [ "$allFlag" = true ] || [ "$downloadFlag" = true ] ; then
    # Pulls and structures bug data we need, as available from Bugzilla
    "$codeDir"/fetch_bugzilla_bugs/fetch_bugs.sh "$codeDir" "$issuesDir"
    # Additional filters based on conditions not visible in Bugzilla metadata
    "$codeDir"/fetch_bugzilla_bugs/filter.sh "$codeDir" "$issuesDir" "$repoDir/firefox"
fi


# Step 2: identify fix commits
if [ "$allFlag" = true ] || [ "$identifyFlag" = true ] ; then
    mkdir -p "$fixesDir"
    cd "$dataDir"
    for project in $cprojects ; do
        echo "getting $project fixes"
        python3 "$codeDir/fetch_bugzilla_bugs/find_bugzilla_fixes.py" \
                --git-path="$repoDir/firefox" \
                --issue-list="$issuesDir/$project-issues/"
        # find_bugzilla_fixes.py writes issue_list.json into the cwd;
        # stash it away under a per-project name
        mv issue_list.json "$fixesDir/$project.json"
    done
    cd "$rootDir"
fi
+
+
# Step 3: run SZZ
if [ "$allFlag" = true ] || [ "$szzFlag" = true ] ; then
    cd "$codeDir/szz"
    mkdir -p "$resultsDir"
    gradle fatJar
    for project in $cprojects ; do
        echo "running SZZ on $project"
        # clear any stale results before moving the fresh ones into place
        rm -rf "$resultsDir/$project"
        java -Xmx5g -jar ./build/libs/szz_find_bug_introducers-0.1.jar -i \
             "$fixesDir/$project.json" -r "$repoDir/firefox" -d 1
        # SZZ writes into ./results and ./issues in the cwd
        mv results "$resultsDir/$project"
        rm -r issues
    done
    cd "$rootDir"
fi


# Step 4: compare to ground truth
if [ "$allFlag" = true ] || [ "$truthFlag" = true ] ; then
    #for project in $cprojects ; do
    #    echo "pulling approval-reqs for $project"
    #    "$codeDir"/fetch_bugzilla_bugs/fetch-all-approval-reqs.sh \
    #              "$fixesDir/$project.json" "$repoDir/firefox" "$bugzillaDir" "$codeDir"
    #done
    cd "$resultsDir"
    mkdir -p "$groundDir"
    mkdir -p "$bugIntroDir"
    for project in $cprojects ; do
        echo "comparing results for $project"
        mkdir -p "$bugIntroDir/$project"
        # emits one CSV row per issue comparing SZZ output against the
        # hand-annotated VCCs in c++.csv
        "$codeDir"/compare_results-v2.sh \
                  "$project" \
                  "$fixesDir/$project.json" \
                  "$resultsDir/$project/fix_and_introducers_pairs.json" \
                  "$annotatedDir/c++.csv" \
                  "$bugIntroDir/$project/" \
                  > "$groundDir/$project.csv"
    done
    cd "$rootDir"
fi
+
+
# Step 5: update/generate experience files
if [ "$allFlag" = true ] || [ "$experienceFlag" = true ] ; then
    mkdir -p "$plotDir"
    for repo in "${!repos[@]}" ; do
        # regenerate from scratch so stale per-author files never linger
        rm -rf "$experienceDir/$repo"
        mkdir -p "$experienceDir/$repo"
        "$codeDir"/learning_curves/genExp.sh "$repoDir/$repo" \
                  "${commits[$repo]}" "$experienceDir/$repo"
    done
fi


# Step 6: generate and plot learning curve
if [ "$allFlag" = true ] || [ "$learningFlag" = true ] ; then
    # webrender_bindings lives inside the gecko (firefox) repo, so fake
    # its repo and experience dirs with symlinks to firefox's
    rm -rf "$repoDir/webrender_bindings" "$experienceDir/webrender_bindings"
    ln -s "$repoDir/firefox" "$repoDir/webrender_bindings"
    ln -s "$experienceDir/firefox" "$experienceDir/webrender_bindings"
    echo "creating and plotting learning curves..."
    "$codeDir/learning_curves/grid_search.sh" \
        "$codeDir" "$dataDir" "$annotatedDir" "$repoDir" \
        "$experienceDir" "$plotDir" "$rustprojects"
fi

+ 3 - 0
code/szz/build.gradle

@@ -53,6 +53,9 @@ dependencies {
         compile group: 'info.debatty', name: 'java-string-similarity', version: '1.0.1'
         compile group: 'com.googlecode.json-simple', name: 'json-simple', version: '1.1'
         compile group: 'org.incava', name: 'diffj', version: '1.6.4'
+        compile group: 'net.sourceforge.pmd', name: 'pmd', version: '5.8.1'
+        compile group: 'net.sourceforge.pmd', name: 'pmd-core', version: '5.8.1'
+        compile group: 'net.sourceforge.pmd', name: 'pmd-cpp', version: '5.8.1'
 }
 
 task runJar(type: JavaExec) {

+ 62 - 0
code/szz/src/main/java/diff/CPPFileExtension.java

@@ -0,0 +1,62 @@
+package diff;
+
+import net.sourceforge.pmd.lang.cpp.ast.CppParserConstants;
+import net.sourceforge.pmd.lang.cpp.ast.Token;
+import net.sourceforge.pmd.lang.cpp.CppTokenManager;
+
+import org.eclipse.jgit.diff.HistogramDiff;
+import org.eclipse.jgit.diff.Edit;
+
+import java.util.List;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.Set;
+import java.io.Reader;
+import java.io.IOException;
+
+import org.slf4j.Logger;
+
+public class CPPFileExtension {
+  private CppTokenManager tokenManager;
+  private List<Token> tokens;
+  public CPPFileExtension(String filePath, Reader reader) {
+    this.tokenManager = new CppTokenManager(reader);
+  }
+
+  private List<Token> getTokens() {
+    if (tokens == null) {
+      this.tokens = new ArrayList<Token>();
+      for(Token token = (Token) tokenManager.getNextToken();
+          token.kind != CppParserConstants.EOF;
+          token = (Token) tokenManager.getNextToken()) {
+        this.tokens.add(token);
+      }
+    }
+    return this.tokens;
+  }
+
+  public Set<Integer> allLineNumbers() {
+    List<Token> tokens = this.getTokens();
+    Set<Integer> lines = new HashSet<>();
+    for (Token t : tokens) {
+      lines.add(t.beginLine);
+    }
+    return lines;
+  }
+
+  public Set<Integer> affectedLineNumbers(CPPFileExtension fileToCompare) {
+    Set<Integer> affectedLines = new HashSet<>();
+    TokenSequence tokens1 = new TokenSequence(this.getTokens());
+    TokenSequence tokens2 = new TokenSequence(fileToCompare.getTokens());
+    HistogramDiff diff = new HistogramDiff();
+    List<Edit> tokdiff = diff.diff(new TokenComparator(), tokens1, tokens2);
+    tokdiff.forEach(it -> {
+        for (int i = it.getBeginB(); i <= it.getEndB(); i++) {
+          Token t = tokens2.tokens.get(i);
+          affectedLines.add(t.beginLine);
+        }
+      }
+      );
+    return affectedLines;
+  }
+}

+ 18 - 0
code/szz/src/main/java/diff/TokenComparator.java

@@ -0,0 +1,18 @@
package diff;

import org.eclipse.jgit.diff.SequenceComparator;
import net.sourceforge.pmd.lang.cpp.ast.Token;

import java.util.List;

/**
 * Element-wise comparator over {@link TokenSequence}s for JGit's diff
 * machinery.  Two tokens are equal exactly when their source text
 * (PMD's {@code Token.image}) matches.
 */
public class TokenComparator extends SequenceComparator<TokenSequence> {
    @Override
    public boolean equals(TokenSequence a, int ai, TokenSequence b, int bi) {
        return a.tokens.get(ai).image.equals(b.tokens.get(bi).image);
    }

    /** Hash agrees with equals(): both use only the token text. */
    @Override
    public int hash(TokenSequence seq, int ptr) {
        return seq.tokens.get(ptr).image.hashCode();
    }
}

+ 18 - 0
code/szz/src/main/java/diff/TokenSequence.java

@@ -0,0 +1,18 @@
package diff;

import org.eclipse.jgit.diff.Sequence;
import net.sourceforge.pmd.lang.cpp.ast.Token;

import java.util.List;

/**
 * Adapts a list of PMD C++ tokens to JGit's {@link Sequence} interface
 * so that two token streams can be diffed with JGit's algorithms.
 */
public class TokenSequence extends Sequence {
    // exposed directly: TokenComparator indexes into this list
    public List<Token> tokens;

    public TokenSequence(List<Token> tokens) {
        this.tokens = tokens;
    }

    /** @return the number of tokens in this sequence */
    @Override
    public int size() {
        return tokens.size();
    }
}

+ 1 - 0
code/szz/src/main/java/heuristics/SimpleBugIntroducerFinder.java

@@ -142,6 +142,7 @@ public class SimpleBugIntroducerFinder implements BugIntroducerFinder {
 
       for (FileAnnotationGraph fileGraph : subGraphs) {
         Iterator<String> revisions = fileGraph.revisions.iterator();
+        if (!revisions.hasNext()) continue;
         revisions.next();
         if (!revisions.hasNext()) continue;
 

+ 184 - 8
code/szz/src/main/java/parser/GitParser.java

@@ -26,6 +26,7 @@ package parser;
 
 import data.Issues;
 import diff.JavaFileExtension;
+import diff.CPPFileExtension;
 import graph.AnnotationMap;
 import graph.FileAnnotationGraph;
 import org.eclipse.jgit.api.BlameCommand;
@@ -53,9 +54,11 @@ import java.io.File;
 import java.io.FileNotFoundException;
 import java.io.FileReader;
 import java.io.IOException;
+import java.io.StringReader;
 import java.nio.charset.StandardCharsets;
 import java.util.*;
 import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.function.IntPredicate;
 import java.util.stream.Collectors;
 
 /**
@@ -134,6 +137,22 @@ public class GitParser {
     this.logger = logger;
   }
 
+  private boolean isCPlusPlus(String filePath) {
+    return
+      filePath.endsWith(".cpp") ||
+      filePath.endsWith(".cxx") ||
+      filePath.endsWith(".c++") ||
+      filePath.endsWith(".hpp") ||
+      filePath.endsWith(".hxx") ||
+      filePath.endsWith(".h++") ||
+      filePath.endsWith(".cc") ||
+      filePath.endsWith(".hh") ||
+      filePath.endsWith(".c") ||
+      filePath.endsWith(".C") ||
+      filePath.endsWith(".h") ||
+      filePath.endsWith(".H");
+  }
+
   private int getSourceLine(BlameResult foundCommit, int index) throws IOException {
     foundCommit.computeAll();
 
@@ -155,18 +174,39 @@ public class GitParser {
 
     if (step == 0 || !source.diffWithParent.containsKey(filePath)) return null;
 
+    if (!isCPlusPlus(filePath)) {
+        logger.warn("early skipping non-C++ file: " + filePath);
+        return createEmptyGraph(filePath);
+    }
+
     /*
-     * Save all line numbers for the source commits deletions.
+     * Save all line numbers for the source commits deletions,
+     * and the lines immediately above and below source commit additions.
      */
     List<Integer> delIndexes = buildDelIndexes(filePath, source);
 
+    /*foundRevisions1.forEach((k,v) -> { logger.warn(k + "[" + v + "]"); });
+    Map<Integer, Integer> lineMap = foundRevisions1.get(parentRev);
+    logger.warn("" + lineMap.get(7));
+    List<Integer> tracedAddIndexes = addIndexes.stream().map(i -> lineMap.get(i)).collect(Collectors.toList());*/
+
     FileAnnotationGraph graph = createEmptyGraph(filePath);
     graph.revisions.add(ObjectId.toString(source.commit.toObjectId()));
 
-    BlameResult found = callBlameCommand(filePath, source.commit.getParent(0));
-    if (found == null) return graph;
+    BlameResult foundDel = callBlameCommand(filePath, source.commit.getParent(0));
 
-    Map<RevCommit, Map<Integer, Integer>> foundRevisions = linkRevisionsWithLineNumbers(delIndexes, found);
+    Map<RevCommit, Map<Integer, Integer>> foundRevisions = null;
+    if (foundDel != null) {
+      foundRevisions = linkRevisionsWithLineNumbers(delIndexes, foundDel);
+    } else {
+      List<Integer> addIndexes = buildAddIndexes(filePath, source);
+      BlameResult foundAdd = callBlameCommand(filePath, source.commit);
+      if (foundAdd != null) {
+        foundRevisions = linkRevisionsWithLineNumbers(addIndexes, foundAdd);
+      } else {
+        return graph;
+      }
+    }
     populateGraphWithMappings(graph, foundRevisions);
     populateSubgraphs(filePath, step, graph, foundRevisions);
 
@@ -185,15 +225,121 @@ public class GitParser {
     if(filePath.endsWith(".java")) {
       Set<Integer> changesFromDiffJ = changesFromDiffJ(filePath, source);
       delIndexes = delIndexes.stream().filter(changesFromDiffJ::contains).collect(Collectors.toList());
+    } else if(isCPlusPlus(filePath)) {
+      Set<Integer> changesFromCPP = changesFromCPP(filePath, source);
+      delIndexes = delIndexes.stream().filter(changesFromCPP::contains).collect(Collectors.toList());
     }
 
     return delIndexes;
   }
 
  /**
   * For a commit's additions to {@code filePath}, compute the 0-based
   * line indexes that should be blamed: not the added lines themselves
   * (their parent cannot be blamed for them), but the surviving lines
   * immediately surrounding each contiguous block of additions,
   * skipping "empty" lines that contain no C++ tokens.
   *
   * NOTE(review): assumes insertions entries are String[] whose first
   * element is the line number — confirm against Commit.diffWithParent.
   */
  private List<Integer> buildAddIndexes(String filePath, Commit source) {
    Set<Integer> addIndexes = source
      .diffWithParent
      .get(filePath)
      .insertions
      .stream()
      .map(s -> parseInt(s[0]))
      .collect(Collectors.toSet());

    // nothing was added to this file in this commit
    if (addIndexes.isEmpty()) {
      return new ArrayList();
    }
    // default: every line counts as having a token (non-C++ fallback)
    IntPredicate hasToken = (i) -> true;
    int lastLine = Collections.max(addIndexes);
    if(isCPlusPlus(filePath)) {
      try {
        CPPFileExtension file = getCPPFileContentAtRevision(filePath, source.commit);
        // shift the lexer's 1-based line numbers to 0-based indexes
        final Set<Integer> tokenLines = file.allLineNumbers().stream().map(it -> it-1).collect(Collectors.toSet());
        addIndexes = addIndexes.stream().filter(tokenLines::contains).collect(Collectors.toSet());
        hasToken = (i) -> tokenLines.contains(i);
        if (!tokenLines.isEmpty()) {
          lastLine = Collections.max(tokenLines);
        }
      } catch (IOException e){
        logger.warn("buildAddIndexes Failed to parse " + filePath + '\n');
      }
    } else {
      //logger.warn("blaming non-C++ file");
      logger.warn("skipping non-C++ file: " + filePath);
      return new ArrayList(new TreeSet());
    }
    // TreeSet gives the ascending order the sweep below relies on
    List<Integer> sortedAddIndexes = new ArrayList(new TreeSet(addIndexes));
    if (sortedAddIndexes.isEmpty()) {
      return sortedAddIndexes;
    }

    /* How this works conceptually and how it's implemented slightly differ.

       conceptual algorithm:
       For each add line, find the line immediately preceding it (skipping
       "empty" lines without relevant tokens). If that line is not an add line,
       track it, as well as the (non-empty) line after the last add line.
       So in a sense, we're actually tracking the ranges of non-add blocks.

       actual algorithm:
       Track the index of the last add line, but include "empty" lines after
       that line, even if they aren't actually add lines.
       If this add line is the one after that, we are in a contiguous add block,
       so keep going, updating the aforementioned index.
       Once that's not the case, we're in a new add block, so add the non-empty
       line after the tracked index, and the non-empty line before this one.

       Edge cases to be aware of:
        - fencepost errors from the first/last add lines
        - add lines that are the first/last lines in the file
        - add lines that are second to first/last lines in the file
        - files without any add lines
     */
    List<Integer> trackedIndexes = new ArrayList<>();
    // last added line, including "empty" lines
    int lastAddIndex = -2; // more than one less than the first valid index
    //List<Integer> addIndexList = addIndexes.stream().sorted().collect(Collectors.toList());
    //logger.warn("addIndexes: " + sortedAddIndexes);
    //logger.warn("lastLine: " + lastLine);
    for (int i : sortedAddIndexes) {
        //logger.warn(i + ",");
      if (i == lastAddIndex + 1) {
        // contiguous add block, keep going (skipping contiguous "empty" lines)
        while (!hasToken.test(++lastAddIndex + 1) && lastAddIndex + 1 <= lastLine){};
        continue;
      }

      if (lastAddIndex >= 0) {
        // lastAddIndex points to first line before a non-"empty" line,
        // after the last add block
        trackedIndexes.add(lastAddIndex + 1);
      }

      // find the most recent non-"empty" line,
      // checking that it's not what we just added
      int prevline = i;
      while (--prevline > lastAddIndex + 1 && !hasToken.test(prevline)){};
      if (prevline > lastAddIndex + 1 && prevline >= 0) {
        trackedIndexes.add(prevline);
      }

      lastAddIndex = i;
      // skip contiguous "empty" lines
      while (!hasToken.test(lastAddIndex + 1) && lastAddIndex + 1 <= lastLine){lastAddIndex++;}
    }
    // close out the final add block: track the non-"empty" line after it
    if (lastAddIndex >= 0 && hasToken.test(lastAddIndex + 1)) {
      trackedIndexes.add(lastAddIndex + 1);
    }
    /*logger.warn("\n");

    logger.warn("trackedIndexes: ");
    for (int i : trackedIndexes) {
      logger.warn(i + ",");
    }
    logger.warn("\n");*/
    return trackedIndexes;
  }
+
   private Set<Integer> changesFromDiffJ(String filePath, Commit source) {
     try {
-      JavaFileExtension revision = getFileContentAtRevision(filePath, source.commit);
-      JavaFileExtension parentRev = getFileContentAtRevision(filePath, source.commit.getParent(0));
+      JavaFileExtension revision = getJavaFileContentAtRevision(filePath, source.commit);
+      JavaFileExtension parentRev = getJavaFileContentAtRevision(filePath, source.commit.getParent(0));
 
       if(revision == null || parentRev == null) {
         return Collections.emptySet();
@@ -208,7 +354,28 @@ public class GitParser {
     }
   }
 
-  private JavaFileExtension getFileContentAtRevision(String filePath, RevCommit revision) throws IOException, DiffJException {
  /**
   * Token-diff {@code filePath} between {@code source} and its first
   * parent, returning the 0-based indexes of changed lines in the parent
   * revision.  Any failure (missing file, lexer error, no parent) is
   * logged and treated as "no changes".
   */
  private Set<Integer> changesFromCPP(String filePath, Commit source) {
    try {
      CPPFileExtension revision = getCPPFileContentAtRevision(filePath, source.commit);
      CPPFileExtension parentRev = getCPPFileContentAtRevision(filePath, source.commit.getParent(0));
      //logger.warn("revision: " + revision.allLineNumbers());
      //logger.warn("parentRev: " + parentRev.allLineNumbers());

      // either revision may be null when the file is absent from the tree
      if(revision == null || parentRev == null) {
        logger.warn("Found empty revision: " + revision + " " + parentRev);
        return Collections.emptySet();
      }
      // Converting line numbers to indexes.
      return revision.affectedLineNumbers(parentRev).stream().map(it ->
              it-1
      ).collect(Collectors.toSet());
    } catch (Exception e) {
        logger.warn(String.format("Exception %s ### File %s from: %s to: %s", e.toString(), filePath, source.commit.toString(), source.commit.getParent(0).toString()));
      return Collections.emptySet();
    }
  }
+
+  private String getFileContentAtRevision(String filePath, RevCommit revision) throws IOException {
     RevTree tree = revWalk.parseCommit(revision.getId()).getTree();
     TreeWalk treeWalk = TreeWalk.forPath(repo, filePath, tree);
     if(treeWalk == null) {
@@ -218,8 +385,17 @@ public class GitParser {
     ObjectReader objectReader = repo.newObjectReader();
     ObjectLoader objectLoader = objectReader.open(blobId);
     byte[] bytes = objectLoader.getBytes();
+    return new String(bytes, StandardCharsets.UTF_8);
+  }
+
+  private JavaFileExtension getJavaFileContentAtRevision(String filePath, RevCommit revision) throws IOException, DiffJException {
+    String string = getFileContentAtRevision(filePath, revision);
+    return new JavaFileExtension(string);
+  }
 
-    return new JavaFileExtension(new String(bytes, StandardCharsets.UTF_8));
+  private CPPFileExtension getCPPFileContentAtRevision(String filePath, RevCommit revision) throws IOException {
+    String string = getFileContentAtRevision(filePath, revision);
+    return new CPPFileExtension(filePath + revision.toString(), new StringReader(string));
   }
 
   /*

+ 10 - 0
data/bugzilla-data/template.json

@@ -0,0 +1,10 @@
+{
+    "id": "",
+    "revisions": [],
+    "inducedBy": [
+        {
+            "id": "",
+            "revisions": []
+        }
+    ]
+}

+ 71 - 0
data/hand-annotated/c++-blame.csv

@@ -0,0 +1,71 @@
+6adcd0523ceceff6b609201bcfa44cac3ef920fb,827591
+a8cf86e4b78c83844db3af3982a3df662c0327c0,955913
+4db79cc0f9d8e38b30c8d00238baf53da67b0e36,1025267
+8174f98f222d09734a870a7797e7a434afac9372,1041512
+c011970f03885bda3ab81cf5af35201b56b3f406,1077687
+99097814d60268b4e707abb46b501cb3d1a501ed,1092363
+a60c564803f80b1f456ad6cd7014723a5a3914d7,1127198
+99097814d60268b4e707abb46b501cb3d1a501ed,1146101
+225f3dd70d8a89a1d87ea8a631e6c859fb17457b,1181011
+82ee685e15e57711c652bb1682a71330741707b3,1230639
+a1dea9b4fc8568e83db4fc71132be54ea227e272,1353312
+b2ea3d02939d7951a9c1bb788f1c5763b9cdbdfb,1614971
+4322faf5044b966380fab44fad57801264b72345,780979
+4322faf5044b966380fab44fad57801264b72345,801330
+62ced97631409332ffe5a5b9b32794316f141a9b,1390550
+62ced97631409332ffe5a5b9b32794316f141a9b,1448771
+368f4de891c58dae46a25de9c8e789a20c868fcc,793065
+6c5bb2cc06966fa16bc180c45f4f07169ccb2011,963974
+d301df5d459b7ccec7c5fd99149b3645aa667a2a,1072877
+b951f3597d034341473c2b17df3e7dea68d2c5b9,1074280
+41fa2ab6fd1e8df907fff0905078c11efa81e4b1,1082986
+2949a9f18bd68bf06170da90821b61b5ad5bf5f5,1107009
+8baed26d9b7b9f02fa63ec3c0397b8da4ed99141,1122722
+b605651fddd381523feaa11bef0026d3da6d3b96,1167356b1
+fdb4be9e33f50503d3a75ea815f99528da32a5bf,1167356b2
+6624d1aefe3368807d498df3759c2a2a0b1d2588,1191463
+733dca91239848758542adf4489bb06172efe7f4,1283826
+5b3ab326b6956c181338a1b39c3e6f9a17d24d13,1307458
+afdf8e01dd69dc75c88acebafd773508f97f91e4,1363280
+95a17ea52c6c4d614adf2aebf4dc4d873b52c897,1369560
+6376e2c6bb8b771dd6513156d84ac13b0f15c7f0,1382829
+504484a45696d5d089e3781af3e4595295be9eda,1387659
+8592c4c12d6d5a87c5e14e0268ca5e78af2291e0,1388020
+4f8f5212b2407a12a4611eb1711ce5d57799faa4,1395138
+a5fc6a819fb89b6b587fc92d82f592eef3dc5bd5,1452375
+7e8f4657993e913c8ed6707db170142d51e2b289,1496413
+1a997e96bc34f00d04e7f42fdb8dd7d9f80bd471,1538736
+8f2c88cbf670912edcd70596f387e941cdc434c1,1613009
+e6e07e47b00c0c7765c0c5a1f4b6e1001a55d70f,761014
+d60517eb9d826a54b80d6d7d25022bae0585e24e,839621
+52c97edf1ed8b51f5cac4b2e7d824cd7e5f29c0a,969226
+52c97edf1ed8b51f5cac4b2e7d824cd7e5f29c0a,1132467
+e6e07e47b00c0c7765c0c5a1f4b6e1001a55d70f,1132468b1
+52c97edf1ed8b51f5cac4b2e7d824cd7e5f29c0a,1132468b2
+52c97edf1ed8b51f5cac4b2e7d824cd7e5f29c0a,1166252
+e6e07e47b00c0c7765c0c5a1f4b6e1001a55d70f,1464039
+7aa64494dfef6c6b9967f500885e30a03b74c48a,1048517
+7aa64494dfef6c6b9967f500885e30a03b74c48a,1144107b1
+f156664cf8f5e355339f39d6e1e64fdb9cae4e91,1144107b2
+ba9a83adedce80e906f183c317455ba242487417,1144107b3
+7aa64494dfef6c6b9967f500885e30a03b74c48a,1149605
+7aa64494dfef6c6b9967f500885e30a03b74c48a,1154672
+7aa64494dfef6c6b9967f500885e30a03b74c48a,1154683
+7aa64494dfef6c6b9967f500885e30a03b74c48a,1158568
+7aa64494dfef6c6b9967f500885e30a03b74c48a,1184871
+469e420982a31ef9d9d33acbf80871ca5dab5692,1185115b1
+7aa64494dfef6c6b9967f500885e30a03b74c48a,1185115b2
+7aa64494dfef6c6b9967f500885e30a03b74c48a,1186715
+7aa64494dfef6c6b9967f500885e30a03b74c48a,1186718
+d66cd54dfd76e11e6be7fd610a02be30345af212,1204580
+7aa64494dfef6c6b9967f500885e30a03b74c48a,1216748
+7aa64494dfef6c6b9967f500885e30a03b74c48a,1227052
+a0d1bf988dc1fc47631c884edd810f5d8ad2c5b8,1254721
+44c2409acd437d71e60896c61289a7960db69a39,415491
+c912fce892fd377511d8d77c8fbca6e3a331da17,814254
+99011ec3ca19e2e3b16e08bb9e6fa572e597a2b6,1170794b1
+f6972dcca3d6551ec2e4892cf86bf33db19dce09,1170794b2
+9997d13b08d84383be2f58b91c3492660ce2e304,1170794b3
+5b4bb14a66a24e433c5a5695ce27710cbb1070c9,1170794b4
+d58e92eb5da3013cdcb77b441312ed3b8e60274f,1255863
+5e143ef9ca98972e12e5a07cd755c0a8b0a15581,1336836

+ 66 - 0
data/hand-annotated/c++.csv

@@ -0,0 +1,66 @@
+project	issue	bugzilla form?	fix(es)	inducer(s)	memory safety	valid	count	notes	
+css	827591		da1c0a3d9e951b3a5f4cfa9f763304014958524d	6adcd0523ceceff6b609201bcfa44cac3ef920fb	1		1		
+css	955913		f46f8c14366bbcf507bd3656fe1c9fe73249dfeb	a8cf86e4b78c83844db3af3982a3df662c0327c0	1		1		
+css	1025267		7b5cb65169005f6967ac39b64b58534e2ece6ba7	4db79cc0f9d8e38b30c8d00238baf53da67b0e36	0	no		VCC predates security bug tracking	
+css	1041512	yes	9e08b4d2488e4b86959c18645643f830a0e0ad60	8174f98f222d09734a870a7797e7a434afac9372	1	no		VCC was introduced in a file that was not deleted as part of oxidation, and was later moved to a file that was	
+css	1077687	yes	1aceb17cf63a9b317abb3310d5d276471e4bb392	c011970f03885bda3ab81cf5af35201b56b3f406	1		1		
+css	1092363	yes	301463dc2087ffeb3a76d9c34f906b5945acf4b6	99097814d60268b4e707abb46b501cb3d1a501ed	1		1		
+css	1127198	yes	635933d3f87fa2b076eace741e4df207c41f6043	a60c564803f80b1f456ad6cd7014723a5a3914d7	1		1		
+css	1146101	yes	5ed02337c87c3d07c7f4b8dc6c20b23560deb8f5	99097814d60268b4e707abb46b501cb3d1a501ed	1		1		
+css	1181011		3585bedd89bc8a96ce921b76990d3e5a4ed313be	225f3dd70d8a89a1d87ea8a631e6c859fb17457b	1		1		
+css	1230639	yes	8eff629f46edf8095e4abe17d0e4ae6458fb4c5e	82ee685e15e57711c652bb1682a71330741707b3	1		1		
+css	1353312	yes	db8e759c6654a09aa851384768d580fb632b7274	a1dea9b4fc8568e83db4fc71132be54ea227e272	1		1		
+cubeb-macos	1614971		210f8ccbc5968c5765699020746bf42fecff8548	b2ea3d02939d7951a9c1bb788f1c5763b9cdbdfb	1		1	fix is from version import even though macos portion of C++ was already unused by then (see Rust analysis)	
+japanese-encoding	780979		c987a78b883b2fc328b5113e7bde64ec95473f2	4322faf5044b966380fab44fad57801264b72345	1	no		VCC predates security bug tracking	
+japanese-encoding	801330		a1592dc47c8e72c30103ad21d72f45a79c628e5d	4322faf5044b966380fab44fad57801264b72345	1	no		VCC predates security bug tracking	
+hyphenation	1390550		bd606d11904d4c62fe4cac0a80da0df9e8f4a2d1	62ced97631409332ffe5a5b9b32794316f141a9b	1		1	upstream commit is	5a60cb75a9dd9034331df216c2d3f59e1a08fc9a
+hyphenation	1448771		9ff23d9c3af0e58b2b16fca7b3c04642ebee9f2a	62ced97631409332ffe5a5b9b32794316f141a9b	1		1	upstream commit is	5a60cb75a9dd9034331df216c2d3f59e1a08fc9a
+layers	793065		ed5ba29431970bbbc6ee0184f4a03f145769c598	368f4de891c58dae46a25de9c8e789a20c868fcc	1	no		VCC predates security bug tracking	
+layers	963974		fc6b63874da4a24440d6fad4f8c6314545255e83	6c5bb2cc06966fa16bc180c45f4f07169ccb2011	1		1		
+layers	1072877	yes (incorrect for our purposes)	e381d1456da7626ee710e73a081964ec7a47fddd	d301df5d459b7ccec7c5fd99149b3645aa667a2a	1		1		
+layers	1074280	yes	391dd9ad59b71afa76919f80b8f8a019318e2e45	b951f3597d034341473c2b17df3e7dea68d2c5b9	1	no		VCC predates security bug tracking	
+layers	1082986	yes	11b001307a623cd4222e65a387b8aa9055546cfa	41fa2ab6fd1e8df907fff0905078c11efa81e4b1	1		1		
+layers	1107009		421e78dd674dccca2009fde198722dd9459816f7	2949a9f18bd68bf06170da90821b61b5ad5bf5f5	1		1	VCC is a best guess (it added the code that should only be called from the main thread and that was causing the bug but it is unclear if the other threads started using it immediately or in some later commit). In any case the commit could have prevented or detected the bug via assertions.	
+layers	1122722	yes	bad8bf235d06b3592a6b858afcb1ddea67ee5932	8baed26d9b7b9f02fa63ec3c0397b8da4ed99141	0		1		
+layers	1167356		101316199629532508a83c130893c1e88c8a9c40	b605651fddd381523feaa11bef0026d3da6d3b96,fdb4be9e33f50503d3a75ea815f99528da32a5bf	1	yes,no	1	actually fixes a good-sized list of vulnerabilities all of the same form (forgetting to check the return value of a Map method before using memory it was responsible for allocating). Two VCCs are for the files removed by Oxidation.	
+layers	1191463		991ab71c3494c5d2b22700ede3e96212f74487b5	6624d1aefe3368807d498df3759c2a2a0b1d2588	1		1		
+layers	1283826	yes	85a734ff8d13149a6b1468e6162d0a4075d02bec	733dca91239848758542adf4489bb06172efe7f4	1		1		
+layers	1307458	yes	2df50d5706d74b5601b2ec6a234bfdd074ad2e39	5b3ab326b6956c181338a1b39c3e6f9a17d24d13	1		1		
+layers	1363280	yes	9825a21a174d8cf9aba12d911c209bee8e57ece8	afdf8e01dd69dc75c88acebafd773508f97f91e4	1		1		
+layers	1369560	yes	9df1db2e879a1b8d8496bd03dea15a064274beb8	95a17ea52c6c4d614adf2aebf4dc4d873b52c897	0		1		
+layers	1382829		33346c4c1b93009ae9c2a7f6e2620041fd14ec65	6376e2c6bb8b771dd6513156d84ac13b0f15c7f0	1		1		
+layers	1387659	yes	9ec076488969b9083ed934904aac753a99691b71	504484a45696d5d089e3781af3e4595295be9eda	1		1		
+layers	1388020	yes	679c14a9c452896516541efaf20bcacbfee07ea9	8592c4c12d6d5a87c5e14e0268ca5e78af2291e0	1		1		
+layers	1395138		7417b9b0d4140286cbf485b6ac010f7db95b940f	4f8f5212b2407a12a4611eb1711ce5d57799faa4	1		1	Mozilla never identified cause but this seems like the most likely origin even if something else ultimately exposed it	
+layers	1452375	yes	fc530ca3167058f975212a75c0524f50d0321f24	a5fc6a819fb89b6b587fc92d82f592eef3dc5bd5	1		1		
+layers	1496413	yes	d511b3b696392984359ae7957a8ed1ac1388eef9	7e8f4657993e913c8ed6707db170142d51e2b289	1		1		
+layers	1538736		546c396de9a999d1994fb861fb08dee2e47bb0e0	1a997e96bc34f00d04e7f42fdb8dd7d9f80bd471	1		1	Actual cause never established but this introduced the code that could have prevented it	
+layers	1613009		4795d60bd927118475ecae0a5db8c6002c4f3317	8f2c88cbf670912edcd70596f387e941cdc434c1	1		1		
+qcms	761014		5885617fc0ee6e15cb15192c77febd69e27e3cb8	e6e07e47b00c0c7765c0c5a1f4b6e1001a55d70f	1	no		Fixed via 764181 to avoid attention on vuln, VCC predates security bug tracking	
+qcms	839621		8dbfb840d9a2410455b61688d5c5067a8ece17e4	d60517eb9d826a54b80d6d7d25022bae0585e24e	1	no			
+qcms	969226		8fcbd56508d6fd1c37add850d4f5728a7ec73a5a	52c97edf1ed8b51f5cac4b2e7d824cd7e5f29c0a	1	no		Not actually exploitable	
+qcms	1132467		509337a1a0b16cbc5c364ed6b5b2b596b0d1c728	52c97edf1ed8b51f5cac4b2e7d824cd7e5f29c0a	1	no			
+qcms	1132468		27b42e9142414650884354d0087070c6220d8718	e6e07e47b00c0c7765c0c5a1f4b6e1001a55d70f,52c97edf1ed8b51f5cac4b2e7d824cd7e5f29c0a	1	no,no		multiple vulns over two commits	
+qcms	1166252		58bb8fc72f9e3021efd9874d39dcf63d27a1afb1	52c97edf1ed8b51f5cac4b2e7d824cd7e5f29c0a	1	no			
+qcms	1464039		fdf1d9bbb2518dc4f17d79d0d10b48b33ab3376d	e6e07e47b00c0c7765c0c5a1f4b6e1001a55d70f	1	no		This could arguably be blamed on 58bb8fc72f9e3021efd9874d39dcf63d27a1afb1 since it attempted to address this problem and failed to do so completely. The fix commit should also be 7b8229b86bb69152d6720a8df02cc614691eb89c which was added because the initial fix turned valid execution paths into crashes.	
+stagefright	1048517		6a71a89b014d84d884ec79176318588797412bb9	7aa64494dfef6c6b9967f500885e30a03b74c48a	1		1		
+stagefright	1144107	yes	03d7a25152938863695b54f317cc8fb7bc34ca09	7aa64494dfef6c6b9967f500885e30a03b74c48a	0		1	Actually 3 potential vulns in one part of the code	
+stagefright	1144107	yes	76090b241e15eca718438c73930229594ad6830f	f156664cf8f5e355339f39d6e1e64fdb9cae4e91	0		1		
+stagefright	1144107	yes	95bf127594a4b581fa3d3cd510c06b4b76da85f0	ba9a83adedce80e906f183c317455ba242487417	0		1		
+stagefright	1149605	yes	d51ead79d52511af0e2bb1cb164e5785e6ee5302	7aa64494dfef6c6b9967f500885e30a03b74c48a	1		1		
+stagefright	1154672		0e46a78213c30262d81ddef4777460d1ebf1b317	7aa64494dfef6c6b9967f500885e30a03b74c48a	1		1		
+stagefright	1154683	yes	0e46a78213c30262d81ddef4777460d1ebf1b317	7aa64494dfef6c6b9967f500885e30a03b74c48a	1		1		
+stagefright	1158568	yes (incorrect for our purposes)	aa8f8fc0d2151d8fc7bf49573fbb583e655d16d6	7aa64494dfef6c6b9967f500885e30a03b74c48a	1		1		
+stagefright	1184871		89e5d96fae85abb21170de8456a887650ebce673	7aa64494dfef6c6b9967f500885e30a03b74c48a	1		1		
+stagefright	1185115	yes	fdad9271bf2eab01ef92e344307d0e7583c236c4	469e420982a31ef9d9d33acbf80871ca5dab5692,7aa64494dfef6c6b9967f500885e30a03b74c48a	1		2	many vulns but we’ll count as two	
+stagefright	1186715		03b4683f2cbf360e9f6071ba49a71a8df5f712b2	7aa64494dfef6c6b9967f500885e30a03b74c48a	1		1	actual cause never established (just a guess)	
+stagefright	1186718	yes	89e5d96fae85abb21170de8456a887650ebce673	7aa64494dfef6c6b9967f500885e30a03b74c48a	1		1		
+stagefright	1204580		1fe648369985406d4fc84ae864cc08796d4c2a4f	d66cd54dfd76e11e6be7fd610a02be30345af212	0		1		
+stagefright	1216748		11139b4935301dfa24e779b1ae4bd000409957aa	7aa64494dfef6c6b9967f500885e30a03b74c48a	1		1		
+stagefright	1227052		ed2eee2066b0cf9f9ef980ca05e41dd61493965f	7aa64494dfef6c6b9967f500885e30a03b74c48a	1		1	not exploitable in ff but abstractly a security bug	
+stagefright	1254721	yes	c2467e583e0d638d1ede5fe88f1a2bfd73f303ee	a0d1bf988dc1fc47631c884edd810f5d8ad2c5b8	1		1		
+uconv	415491		decd558ea256f528f4d3a07d32f10d024c4307a6	44c2409acd437d71e60896c61289a7960db69a39		no		VCC predates security bug tracking	
+uconv	814254		51efa01fca4e1c8cd7a6a603950b2e84f24af8a2	c912fce892fd377511d8d77c8fbca6e3a331da17		no		VCC predates security bug tracking	
+uconv	1170794		45578e03fa30c4d604c6e78786d02ee32f5e7873	99011ec3ca19e2e3b16e08bb9e6fa572e597a2b6,f6972dcca3d6551ec2e4892cf86bf33db19dce09,9997d13b08d84383be2f58b91c3492660ce2e304,5b4bb14a66a24e433c5a5695ce27710cbb1070c9		no		multiple vulns of same form (int overflow) fixed in one patch, VCC predates security bug tracking	
+uconv	1255863		9696b4bdf60c24bd552387e123fd0ac8965f1461	d58e92eb5da3013cdcb77b441312ed3b8e60274f		no		VCC predates security bug tracking	
+uconv	1336836		26a8c610e53b2f116de8205c701c2a2a90788857	5e143ef9ca98972e12e5a07cd755c0a8b0a15581	1		1		

+ 338 - 0
data/hand-annotated/relevant-c++

@@ -0,0 +1,338 @@
+dom/animation/AnimValuesStyleRule.cpp
+dom/animation/AnimValuesStyleRule.h
+layout/base/GeckoRestyleManager.cpp
+layout/base/GeckoRestyleManager.h
+layout/base/RestyleTracker.cpp
+layout/style/CSSStyleSheet.cpp
+layout/style/CSSStyleSheet.h
+layout/style/CSSVariableDeclarations.cpp
+layout/style/CSSVariableDeclarations.h
+layout/style/CSSVariableResolver.cpp
+layout/style/CSSVariableResolver.h
+layout/style/CSSVariableValues.cpp
+layout/style/CSSVariableValues.h
+layout/style/Declaration.cpp
+layout/style/Declaration.h
+layout/style/GeckoStyleContext.cpp
+layout/style/GeckoStyleContext.h
+layout/style/ImportRule.h
+layout/style/IncrementalClearCOMRuleArray.cpp
+layout/style/IncrementalClearCOMRuleArray.h
+layout/style/NameSpaceRule.h
+layout/style/RuleNodeCacheConditions.cpp
+layout/style/RuleProcessorCache.cpp
+layout/style/RuleProcessorCache.h
+layout/style/StyleRule.cpp
+layout/style/StyleRule.h
+layout/style/nsCSSDataBlock.cpp
+layout/style/nsCSSParser.cpp
+layout/style/nsCSSRuleProcessor.cpp
+layout/style/nsCSSRuleProcessor.h
+layout/style/nsCSSRules.cpp
+layout/style/nsIStyleRule.h
+layout/style/nsIStyleRuleProcessor.h
+layout/style/nsMediaList.cpp
+layout/style/nsMediaList.h
+layout/style/nsNthIndexCache.cpp
+layout/style/nsRuleData.cpp
+layout/style/nsRuleData.h
+layout/style/nsRuleNode.cpp
+layout/style/nsRuleNode.h
+layout/style/nsRuleWalker.h
+layout/style/nsStyleSet.cpp
+layout/style/nsStyleSet.h
+layout/base/RestyleManager.cpp
+layout/base/RestyleManager.h
+layout/style/nsCSSStyleSheet.cpp
+layout/style/nsCSSStyleSheet.h
+layout/style/nsCSSDeclaration.cpp
+layout/style/nsCSSDeclaration.h
+layout/style/nsICSSImportRule.h
+layout/style/nsICSSNameSpaceRule.h
+layout/style/nsCSSStyleRule.cpp
+layout/style/nsICSSStyleRule.h
+layout/style/nsIMediaList.h
+layout/base/RestyleManagerBase.cpp
+layout/style/AnimationCommon.cpp
+gfx/2d/CaptureCommandList.cpp
+gfx/2d/CaptureCommandList.h
+gfx/2d/DrawCommand.h
+gfx/2d/DrawCommands.h
+gfx/2d/DrawTargetCapture.cpp
+gfx/2d/DrawTargetCapture.h
+gfx/2d/DrawTargetDual.cpp
+gfx/2d/DrawTargetDual.h
+gfx/2d/DrawTargetTiled.cpp
+gfx/2d/DrawTargetTiled.h
+gfx/2d/DrawTargetWrapAndRecord.cpp
+gfx/2d/DrawTargetWrapAndRecord.h
+gfx/2d/FilterNodeCapture.cpp
+gfx/2d/FilterNodeCapture.h
+gfx/2d/PathCapture.cpp
+gfx/2d/PathCapture.h
+gfx/2d/SourceSurfaceCapture.cpp
+gfx/2d/SourceSurfaceCapture.h
+gfx/2d/SourceSurfaceDual.h
+gfx/gl/SharedSurfaceGLX.cpp
+gfx/gl/SharedSurfaceGLX.h
+gfx/layers/apz/public/MetricsSharingController.h
+gfx/layers/apz/test/gtest/InternalHitTester.cpp
+gfx/layers/apz/test/gtest/InternalHitTester.h
+gfx/layers/basic/AutoMaskData.h
+gfx/layers/basic/BasicCanvasLayer.cpp
+gfx/layers/basic/BasicCanvasLayer.h
+gfx/layers/basic/BasicColorLayer.cpp
+gfx/layers/basic/BasicCompositor.cpp
+gfx/layers/basic/BasicCompositor.h
+gfx/layers/basic/BasicContainerLayer.cpp
+gfx/layers/basic/BasicContainerLayer.h
+gfx/layers/basic/BasicImageLayer.cpp
+gfx/layers/basic/BasicImages.cpp
+gfx/layers/basic/BasicImplData.h
+gfx/layers/basic/BasicLayerManager.cpp
+gfx/layers/basic/BasicLayers.h
+gfx/layers/basic/BasicLayersImpl.cpp
+gfx/layers/basic/BasicLayersImpl.h
+gfx/layers/basic/BasicPaintedLayer.cpp
+gfx/layers/basic/BasicPaintedLayer.h
+gfx/layers/basic/MacIOSurfaceTextureHostBasic.cpp
+gfx/layers/basic/MacIOSurfaceTextureHostBasic.h
+gfx/layers/basic/TextureClientX11.cpp
+gfx/layers/basic/TextureClientX11.h
+gfx/layers/basic/TextureHostBasic.cpp
+gfx/layers/basic/TextureHostBasic.h
+gfx/layers/basic/X11BasicCompositor.cpp
+gfx/layers/basic/X11BasicCompositor.h
+gfx/layers/basic/X11TextureSourceBasic.cpp
+gfx/layers/basic/X11TextureSourceBasic.h
+gfx/layers/client/ClientCanvasLayer.cpp
+gfx/layers/client/ClientCanvasLayer.h
+gfx/layers/client/ClientCanvasRenderer.cpp
+gfx/layers/client/ClientCanvasRenderer.h
+gfx/layers/client/ClientColorLayer.cpp
+gfx/layers/client/ClientContainerLayer.cpp
+gfx/layers/client/ClientContainerLayer.h
+gfx/layers/client/ClientImageLayer.cpp
+gfx/layers/client/ClientLayerManager.cpp
+gfx/layers/client/ClientLayerManager.h
+gfx/layers/client/ClientPaintedLayer.cpp
+gfx/layers/client/ClientPaintedLayer.h
+gfx/layers/client/ClientReadbackLayer.h
+gfx/layers/client/ClientTiledPaintedLayer.cpp
+gfx/layers/client/ClientTiledPaintedLayer.h
+gfx/layers/client/ContentClient.cpp
+gfx/layers/client/ContentClient.h
+gfx/layers/client/MultiTiledContentClient.cpp
+gfx/layers/client/MultiTiledContentClient.h
+gfx/layers/client/SingleTiledContentClient.cpp
+gfx/layers/client/SingleTiledContentClient.h
+gfx/layers/client/TiledContentClient.cpp
+gfx/layers/client/TiledContentClient.h
+gfx/layers/composite/AsyncCompositionManager.cpp
+gfx/layers/composite/AsyncCompositionManager.h
+gfx/layers/composite/CanvasLayerComposite.cpp
+gfx/layers/composite/CanvasLayerComposite.h
+gfx/layers/composite/ColorLayerComposite.cpp
+gfx/layers/composite/ColorLayerComposite.h
+gfx/layers/composite/ConsolasFontData.h
+gfx/layers/composite/ContainerLayerComposite.cpp
+gfx/layers/composite/ContainerLayerComposite.h
+gfx/layers/composite/ContentHost.cpp
+gfx/layers/composite/ContentHost.h
+gfx/layers/composite/Diagnostics.cpp
+gfx/layers/composite/FPSCounter.cpp
+gfx/layers/composite/FPSCounter.h
+gfx/layers/composite/ImageHost.cpp
+gfx/layers/composite/ImageHost.h
+gfx/layers/composite/ImageLayerComposite.cpp
+gfx/layers/composite/ImageLayerComposite.h
+gfx/layers/composite/LayerManagerComposite.cpp
+gfx/layers/composite/LayerManagerComposite.h
+gfx/layers/composite/LayerManagerCompositeUtils.h
+gfx/layers/composite/PaintCounter.cpp
+gfx/layers/composite/PaintCounter.h
+gfx/layers/composite/PaintedLayerComposite.cpp
+gfx/layers/composite/PaintedLayerComposite.h
+gfx/layers/composite/TextRenderer.cpp
+gfx/layers/composite/TextRenderer.h
+gfx/layers/composite/TiledContentHost.cpp
+gfx/layers/composite/TiledContentHost.h
+gfx/layers/composite/X11TextureHost.cpp
+gfx/layers/composite/X11TextureHost.h
+gfx/layers/d3d11/BlendingHelpers.hlslh
+gfx/layers/d3d11/mlgshaders/blend-common.hlsl
+gfx/layers/d3d11/mlgshaders/blend-ps-generated.hlslh
+gfx/layers/d3d11/mlgshaders/blend-ps-generated.hlslh.tpl
+gfx/layers/d3d11/mlgshaders/blend-ps.hlsl
+gfx/layers/d3d11/mlgshaders/blend-vs.hlsl
+gfx/layers/d3d11/mlgshaders/clear-common.hlsl
+gfx/layers/d3d11/mlgshaders/clear-ps.hlsl
+gfx/layers/d3d11/mlgshaders/clear-vs.hlsl
+gfx/layers/d3d11/mlgshaders/color-common.hlsl
+gfx/layers/d3d11/mlgshaders/color-ps.hlsl
+gfx/layers/d3d11/mlgshaders/color-vs.hlsl
+gfx/layers/d3d11/mlgshaders/common.hlsl
+gfx/layers/d3d11/mlgshaders/common-ps.hlsl
+gfx/layers/d3d11/mlgshaders/common-vs.hlsl
+gfx/layers/d3d11/mlgshaders/component-alpha-ps.hlsl
+gfx/layers/d3d11/mlgshaders/diagnostics-common.hlsl
+gfx/layers/d3d11/mlgshaders/diagnostics-ps.hlsl
+gfx/layers/d3d11/mlgshaders/diagnostics-vs.hlsl
+gfx/layers/d3d11/mlgshaders/mask-combiner-common.hlsl
+gfx/layers/d3d11/mlgshaders/mask-combiner-ps.hlsl
+gfx/layers/d3d11/mlgshaders/mask-combiner-vs.hlsl
+gfx/layers/d3d11/mlgshaders/test-features-vs.hlsl
+gfx/layers/d3d11/mlgshaders/textured-common.hlsl
+gfx/layers/d3d11/mlgshaders/textured-ps.hlsl
+gfx/layers/d3d11/mlgshaders/textured-vs.hlsl
+gfx/layers/d3d11/mlgshaders/ycbcr-ps.hlsl
+gfx/layers/d3d11/ReadbackManagerD3D11.cpp
+gfx/layers/d3d11/ReadbackManagerD3D11.h
+gfx/layers/DirectedGraph.h
+gfx/layers/ImageLayers.cpp
+gfx/layers/ImageLayers.h
+gfx/layers/ipc/LayerTransactionChild.cpp
+gfx/layers/ipc/LayerTransactionChild.h
+gfx/layers/ipc/LayerTransactionParent.cpp
+gfx/layers/ipc/LayerTransactionParent.h
+gfx/layers/ipc/ShadowLayers.cpp
+gfx/layers/ipc/ShadowLayers.h
+gfx/layers/ipc/ShadowLayerUtilsMac.cpp
+gfx/layers/ipc/ShadowLayerUtilsX11.cpp
+gfx/layers/ipc/ShadowLayerUtilsX11.h
+gfx/layers/LayerManager.cpp
+gfx/layers/LayerMetricsWrapper.h
+gfx/layers/LayerScope.cpp
+gfx/layers/LayerScope.h
+gfx/layers/Layers.cpp
+gfx/layers/Layers.h
+gfx/layers/LayersHelpers.cpp
+gfx/layers/LayersHelpers.h
+gfx/layers/LayerSorter.cpp
+gfx/layers/LayerSorter.h
+gfx/layers/LayerTreeInvalidation.cpp
+gfx/layers/LayerTreeInvalidation.h
+gfx/layers/opengl/GLBlitTextureImageHelper.cpp
+gfx/layers/opengl/GLBlitTextureImageHelper.h
+gfx/layers/opengl/X11TextureSourceOGL.cpp
+gfx/layers/opengl/X11TextureSourceOGL.h
+gfx/layers/PaintThread.cpp
+gfx/layers/PaintThread.h
+gfx/layers/protobuf/LayerScopePacket.pb.h
+gfx/layers/ReadbackProcessor.cpp
+gfx/layers/ReadbackProcessor.h
+gfx/layers/RenderTrace.cpp
+gfx/layers/RenderTrace.h
+gfx/layers/RotatedBuffer.cpp
+gfx/layers/RotatedBuffer.h
+gfx/layers/SourceSurfaceVolatileData.cpp
+gfx/layers/SourceSurfaceVolatileData.h
+gfx/layers/TextureDIB.cpp
+gfx/layers/TextureDIB.h
+gfx/layers/TiledLayerBuffer.h
+gfx/src/TiledRegion.cpp
+gfx/src/TiledRegion.h
+gfx/tests/gtest/TestCompositor.cpp
+gfx/tests/gtest/TestLayers.h
+gfx/tests/gtest/TestTextureCompatibility.cpp
+gfx/thebes/gfxGdkNativeRenderer.cpp
+gfx/thebes/gfxGdkNativeRenderer.h
+gfx/thebes/gfxXlibNativeRenderer.cpp
+gfx/thebes/gfxXlibNativeRenderer.h
+layout/painting/FrameLayerBuilder.cpp
+layout/painting/FrameLayerBuilder.h
+widget/gtk/WindowSurfaceXRender.cpp
+widget/gtk/WindowSurfaceXRender.h
+gfx/2d/DrawTargetRecording.cpp
+gfx/2d/DrawTargetRecording.h
+gfx/gl/GLBlitTextureImageHelper.cpp
+gfx/gl/GLBlitTextureImageHelper.h
+gfx/layers/basic/BasicCanvasLayer.cpp
+gfx/layers/basic/BasicCanvasLayer.h
+gfx/layers/basic/BasicColorLayer.cpp
+gfx/layers/basic/BasicContainerLayer.cpp
+gfx/layers/basic/BasicContainerLayer.h
+gfx/layers/basic/BasicImageLayer.cpp
+gfx/layers/basic/BasicLayerManager.cpp
+gfx/layers/basic/BasicLayers.cpp
+gfx/layers/basic/BasicLayers.h
+gfx/layers/basic/BasicThebesLayer.cpp
+gfx/layers/basic/BasicThebesLayer.h
+gfx/layers/BasicLayers.h
+gfx/layers/basic/TextureHostX11.cpp
+gfx/layers/basic/TextureHostX11.h
+gfx/layers/client/ClientThebesLayer.cpp
+gfx/layers/client/ClientThebesLayer.h
+gfx/layers/client/ClientTiledThebesLayer.cpp
+gfx/layers/client/ClientTiledThebesLayer.h
+gfx/layers/client/TiledContentClient.cpp
+gfx/layers/client/TiledContentClient.h
+gfx/layers/composite/ThebesLayerComposite.cpp
+gfx/layers/composite/ThebesLayerComposite.h
+gfx/layers/ipc/CompositorParent.cpp
+gfx/layers/ipc/CompositorParent.h
+gfx/layers/ipc/LayerTransactionChild.cpp
+gfx/layers/ipc/LayerTransactionChild.h
+gfx/layers/ipc/ShadowLayersChild.cpp
+gfx/layers/ipc/ShadowLayersChild.h
+gfx/layers/ipc/ShadowLayersParent.cpp
+gfx/layers/ipc/ShadowLayersParent.h
+gfx/layers/opengl/FPSCounter.h
+gfx/layers/ThebesLayerBuffer.cpp
+gfx/layers/ThebesLayerBuffer.h
+gfx/thebes/public/gfxGdkNativeRenderer.h
+gfx/thebes/public/gfxXlibNativeRenderer.h
+gfx/thebes/src/gfxGdkNativeRenderer.cpp
+gfx/thebes/src/gfxXlibNativeRenderer.cpp
+layout/base/FrameLayerBuilder.cpp
+layout/base/FrameLayerBuilder.h
+gfx/layers/d3d11/MLGDeviceD3D11.cpp
+gfx/layers/d3d11/MLGDeviceD3D11.h
+gfx/layers/mlgpu/BufferCache.cpp
+gfx/layers/mlgpu/BufferCache.h
+gfx/layers/mlgpu/CanvasLayerMLGPU.cpp
+gfx/layers/mlgpu/CanvasLayerMLGPU.h
+gfx/layers/mlgpu/ClearRegionHelper.h
+gfx/layers/mlgpu/ContainerLayerMLGPU.cpp
+gfx/layers/mlgpu/ContainerLayerMLGPU.h
+gfx/layers/mlgpu/FrameBuilder.cpp
+gfx/layers/mlgpu/FrameBuilder.h
+gfx/layers/mlgpu/ImageLayerMLGPU.cpp
+gfx/layers/mlgpu/ImageLayerMLGPU.h
+gfx/layers/mlgpu/LayerManagerMLGPU.cpp
+gfx/layers/mlgpu/LayerManagerMLGPU.h
+gfx/layers/mlgpu/LayerMLGPU.cpp
+gfx/layers/mlgpu/LayerMLGPU.h
+gfx/layers/mlgpu/MaskOperation.cpp
+gfx/layers/mlgpu/MaskOperation.h
+gfx/layers/mlgpu/MemoryReportingMLGPU.cpp
+gfx/layers/mlgpu/MemoryReportingMLGPU.h
+gfx/layers/mlgpu/MLGDevice.cpp
+gfx/layers/mlgpu/MLGDevice.h
+gfx/layers/mlgpu/MLGDeviceTypes.h
+gfx/layers/mlgpu/MLGPUScreenshotGrabber.cpp
+gfx/layers/mlgpu/MLGPUScreenshotGrabber.h
+gfx/layers/mlgpu/PaintedLayerMLGPU.cpp
+gfx/layers/mlgpu/PaintedLayerMLGPU.h
+gfx/layers/mlgpu/RenderPassMLGPU.cpp
+gfx/layers/mlgpu/RenderPassMLGPU.h
+gfx/layers/mlgpu/RenderPassMLGPU-inl.h
+gfx/layers/mlgpu/RenderViewMLGPU.cpp
+gfx/layers/mlgpu/RenderViewMLGPU.h
+gfx/layers/mlgpu/ShaderDefinitionsMLGPU.h
+gfx/layers/mlgpu/ShaderDefinitionsMLGPU-inl.h
+gfx/layers/mlgpu/SharedBufferMLGPU.cpp
+gfx/layers/mlgpu/SharedBufferMLGPU.h
+gfx/layers/mlgpu/StagingBuffer.cpp
+gfx/layers/mlgpu/StagingBuffer.h
+gfx/layers/mlgpu/TexturedLayerMLGPU.cpp
+gfx/layers/mlgpu/TexturedLayerMLGPU.h
+gfx/layers/mlgpu/TextureSourceProviderMLGPU.cpp
+gfx/layers/mlgpu/TextureSourceProviderMLGPU.h
+gfx/layers/mlgpu/UtilityMLGPU.h
+gfx/layers/LayerAttributes.h
+media/libstagefright/
+intl/uconv/
+intl/hyphenation/
+media/libcubeb/src/cubeb_audiounit.c
+media/libcubeb/src/cubeb_audiounit.cpp

+ 11 - 0
data/hand-annotated/relevant-dirs.csv

@@ -0,0 +1,11 @@
+layers,gfx/layers/
+css,layout/style/
+stagefright,media/libstagefright/
+C-qcms,gfx/qcms/
+uconv,intl/uconv/
+hyphenation,intl/hyphenation/
+japanese-encoding,extensions/universalchardet
+cubeb-macos,media/libcubeb/src/cubeb_audiounit.c media/libcubeb/src/cubeb_audiounit.cpp
+combined,gfx/layers/ layout/style/ media/libstagefright/ intl/uconv/ intl/hyphenation/ media/libcubeb/src/cubeb_audiounit.c media/libcubeb/src/cubeb_audiounit.cpp
+webrender_bindings,gfx/webrender_bindings/
+servo,components/style/ ports/geckolib/

+ 14 - 0
data/hand-annotated/rust-blame.csv

@@ -0,0 +1,14 @@
+57a0250fe01954a9a57dad5a75c31b8ac7fd09a4,1577439
+cb2f5a30ab9e0d14945dc13c1bf566a8ad86e536,1599181
+b1ef172149c701865ea154db07dbc7907d27ce4b,1631232
+4bf22b5d44eb06d633b3d841814fd1da0b2506a1,1680084
+276a6667ec759cef6e9495f2e5e0704cdb72588f,1758223
+3911cad670c63d559179d25b86504d5126be633f,1614971
+4000bfa8591916482e03a8cef932e61c05d40a3d,1557208
+492b36638ff93322c6e41e30982a90f6998a1e8e,1637112
+519a918649fda466d775244c003ce6064b2c7d09,1622291
+69b2a9fff27f2fb2eb9d12b195202a71ba7d10c5,1685145
+cc2f6f28d95788ffce80d3d7a2b0488f14e09b33,1696312
+dd38c376a7b2121f3abee1484946e5666ecc2977,1700235
+c99cdf315bb18860e76dcc61b9f6fe07c8fa625d,1701834
+2cca20ab5815c6ea13571afb7942cdf58926610f,1746545