Explorar el Código

Scripts and some pre-processed data

Vecna hace 1 mes
commit
8d386a5f85

+ 10 - 0
data/bridge-extra-infos.sha256

@@ -0,0 +1,10 @@
+6fa2aafdf9e3d89bc16dd1e3b74180c58df2de8c8949d7825644467d0e9efa77  data/bridge-extra-infos-2020-07.tar.xz
+7197feecdb2231a3b889ec46bb4b8172e2f1cdba4581beb571fcaa10faa27df4  data/bridge-extra-infos-2020-08.tar.xz
+ad38ac703aab6cd4f9224b72e0678eeef0b07dbd5c4d192c28e62a53871f87cd  data/bridge-extra-infos-2020-09.tar.xz
+753771d113cfdd40a698ae56d199e757c147a2a1c3556bca0a0fd78e61c0b2c7  data/bridge-extra-infos-2020-10.tar.xz
+4152439f604abb15ff5d9b878f5a5b4dc17adbab63a414cb37e9b851af6645c0  data/bridge-extra-infos-2020-11.tar.xz
+125fccc56b7ffa068b6c6de74124319ce14d401685274d1f877807d299ebb8f3  data/bridge-extra-infos-2020-12.tar.xz
+165bc89b4aed09acf08632a0aafbeb7b9c3277a6d158752ee25fdd0f370bcef6  data/bridge-extra-infos-2021-01.tar.xz
+f4f91383728afb2e4bfcf699e11e79874b5d77bfdb4d125bd0b595fd8c21181c  data/bridge-extra-infos-2021-02.tar.xz
+b0f278962fca4b9e7380e0b749948313bc7bdf169b48b58421064d3a9cee8909  data/bridge-extra-infos-2021-03.tar.xz
+be1ff1eb2828d9ddbc7a775ea8501fdfc62274a63ccf55c61b749d5a1d56a1ca  data/bridge-extra-infos-2021-04.tar.xz

+ 1 - 0
data/bridge-pool-assignments.sha256

@@ -0,0 +1 @@
+502b09078fc04567fd4a26860480d027d8816fcd904832375a38274982d4d3d6  data/bridge-pool-assignments-2021-02.tar.xz

BIN
data/bridge_data.tar.gz


+ 32 - 0
run.sh

@@ -0,0 +1,32 @@
#!/bin/bash

# Top-level driver: check dependencies, fetch (or extract) the bridge
# data, then run each processing stage in order.
#
# Usage: ./run.sh [--fast]
#   --fast  extract the bundled pre-processed data instead of
#           downloading and processing everything (~12.5 hours).

# Check that dependencies are installed before going further.
# (>/dev/null so command -v doesn't print the tool's path.)
if ! command -v curl >/dev/null; then
    echo "This script needs curl to be installed."
    exit 1
fi
if ! command -v python3 >/dev/null; then
    echo "This script needs python3 to be installed."
    exit 1
fi
# BUG FIX: the original `cmd || echo msg && exit 1` always exited 1,
# even when the check succeeded, because `A || B && C` parses as
# `(A || B) && C` in shell.
if ! ./scripts/check-python-deps.py; then
    echo "This script needs numpy to be installed."
    exit 1
fi

# Get bridge data
if [ "$1" == "--fast" ]; then
    echo "Extracting some pre-processed data..."
    cd data && tar xf bridge_data.tar.gz && cd ..
else
    echo "Downloading and processing data from step 1..."
    echo "This will take quite a long time (around 12.5 hours on my device)"
    echo "and require a few GB of free space while running."
    ./scripts/get-bridge-data.sh
fi

# Get list of email-distributed bridges
./scripts/get-email-bridges.sh

# Clean up bridge data for the format we want
./scripts/clean-bridge-data.sh

# Evaluate blockages and get stats
./scripts/get-stats.sh

+ 6 - 0
scripts/check-python-deps.py

@@ -0,0 +1,6 @@
#!/usr/bin/env python3

"""Dependency probe: performs no work of its own.

Running this module simply imports every module the other scripts in
this project depend on, so a missing dependency is caught up front:
the import fails and the interpreter exits non-zero.
"""

import csv
import numpy
import os
import sys

+ 32 - 0
scripts/clean-bridge-data.sh

@@ -0,0 +1,32 @@
#!/bin/bash

# Normalise the per-bridge data files for the bridges distributed in
# 2021 February: keep only the highest observed connection count per
# day and write the result, sorted, to data/bridge_data_cleaned/.

# If we haven't already extracted our set of bridges distributed in 2021
# Feb, do that now.
if [ ! -f data/all-bridges ]; then
    echo "Getting list of bridges distributed in 2021 February"
    ./scripts/get-email-bridges.sh
fi

# Clean bridge data (sort, remove duplicates)
if [ ! -d data/bridge_data_cleaned ]; then
    echo "Cleaning data for bridges distributed in 2021 February"
    mkdir data/bridge_data_cleaned
    # read -r: don't let backslashes in the input be interpreted
    while read -r fpr; do
        # Per-bridge files are named by upper-case fingerprint
        fpr=$(printf '%s' "$fpr" | tr '[:lower:]' '[:upper:]')
        if [ -n "$fpr" ]; then
            if [ -f "data/bridge_data/${fpr}" ]; then
                # Each line is "<julian day>,<count>". Sort so the
                # highest count for a given day comes first, then keep
                # only the first line seen per 7-char day prefix.
                pref="placeholder"
                sort -r -n -k1.1,1.7 -k1.9 "data/bridge_data/${fpr}" \
                    | while read -r line; do
                    if [[ "$line" != "$pref"* ]]; then
                        echo "$line"
                        pref="${line:0:7}"
                    fi
                done | sort > "data/bridge_data_cleaned/${fpr}"
            else
                echo "No data/bridge_data/${fpr}"
            fi
        fi
    done < data/all-bridges
fi

+ 73 - 0
scripts/evaluate-blockages.py

@@ -0,0 +1,73 @@
+#!/usr/bin/env python3
+
+import csv
+import os
+
# If abs is True, we need to have observed more than 32 connections on
# some day to be considered blocked.
def is_blocked (harshness, bridge_ips_max, bridge_ips_today, abs, no_min):
    """Decide whether today's connection count indicates a blockage.

    harshness scales the threshold (threshold = 8 * harshness).  With
    no prior observation (bridge_ips_max is None) nothing can be
    judged.  When `abs` is true an absolute rule applies (the bridge
    must have peaked above 32 connections, unless no_min); otherwise
    the rule is relative to the prior maximum.
    """
    if bridge_ips_max is None:
        return False

    threshold = 8 * harshness

    # In every mode, today's count must be at or below the threshold.
    if bridge_ips_today > threshold:
        return False

    if no_min:
        return True

    if abs:
        # absolute rule: the bridge must once have seen real traffic
        return bridge_ips_max > 32

    # relative rule: today's count must sit well below the prior peak
    return bridge_ips_today <= bridge_ips_max - 32 + threshold
+
def evaluate (harshness, fingerprint, abs=True, no_min=False):
    """Scan a bridge's cleaned data file and return the first date on
    which the bridge looks blocked, or None if it never does.

    Per-day counts are read from data/bridge_data_cleaned/<FPR>.
    Leading zero-count days are skipped; the first non-zero count seen
    becomes the reference maximum handed to is_blocked.
    """
    fingerprint = fingerprint.upper()
    bridge_ips_max = None

    with open (f"data/bridge_data_cleaned/{fingerprint}", 'r') as file:
        for row in csv.reader(file, delimiter=','):
            count = int(row[1])

            if bridge_ips_max is None:
                # Still waiting for the first non-zero observation
                if count > 0:
                    bridge_ips_max = count
            elif is_blocked (harshness, bridge_ips_max, count, abs, no_min):
                # Column 0 holds the (Julian) date of the blockage
                return row[0]

    # The bridge never looked blocked
    return None
+
# Remove any previous blocked_* files, start over, so the appends
# below begin from a clean slate.
for i in range(5):
    for suffix in ("", "_abs", "_nomin"):
        path = f"data/blocked_{i}{suffix}"
        if os.path.exists (path):
            os.remove (path)

def _record (path, fingerprint, blocked):
    # Append one "fingerprint,date" line when a blockage was detected
    # (blocked is the date string returned by evaluate, or None).
    if blocked is not None:
        with open (path, 'a') as f:
            f.write(f"{fingerprint},{blocked}\n")

with open ("data/all-bridges", 'r') as all_bridges:
    for fingerprint in all_bridges:
        fingerprint = fingerprint.strip()
        if fingerprint:
            # Go through all harshness values, evaluating each of the
            # three threshold modes: relative, absolute, and absolute
            # without the 32-connection minimum.
            for harshness in range(5):
                _record (f"data/blocked_{harshness}",
                         fingerprint,
                         evaluate (harshness, fingerprint, False))
                _record (f"data/blocked_{harshness}_abs",
                         fingerprint,
                         evaluate (harshness, fingerprint, True))
                _record (f"data/blocked_{harshness}_nomin",
                         fingerprint,
                         evaluate (harshness, fingerprint, False, True))

+ 74 - 0
scripts/get-bridge-data.sh

@@ -0,0 +1,74 @@
#!/bin/bash

# Download the bridge-extra-infos archives from CollecTor, verify
# their checksums, and extract per-bridge "<julian date>,<count>"
# records into data/bridge_data/<FINGERPRINT>.

# Populate array of months we care about
months=()
# 2020
for i in $(seq 7 12); do
    months+=( 2020-$(printf %02d "$i") )
done
# 2021
for i in $(seq 1 4); do
    months+=( 2021-$(printf %02d "$i") )
done

# Download the archives if we don't have them already
for i in "${months[@]}"; do
    if [ ! -f "data/bridge-extra-infos-${i}.tar.xz" ]; then
        curl -Lo "data/bridge-extra-infos-${i}.tar.xz" "https://collector.torproject.org/archive/bridge-descriptors/extra-infos/bridge-extra-infos-${i}.tar.xz" || exit 1
    fi
done

# Check that we have the right archives
sha256sum -c data/bridge-extra-infos.sha256 || exit 1

# If we haven't already extracted the bridge data, then do so. This will
# take a long time (around 12.5 hours on my device) because it needs to
# process around 3 million small files, and it will require a few GB of
# free space while running. In the end, this results in about 91 MB of
# bridge data that we care about.
if [ ! -d data/bridge_data ]; then

    cd data || exit 1

    # BUG FIX: the output directory was never created, so the appends
    # below failed on a fresh checkout.
    mkdir -p bridge_data

    # This is around 20 GB of data uncompressed, so don't extract it all
    # at once. Instead, extract and process one month at a time.
    for i in "${months[@]}"; do
        if [ ! -d "bridge-extra-infos-${i}" ]; then
            echo "Extracting bridge-extra-infos-${i}.tar.xz"
            tar xf "bridge-extra-infos-${i}.tar.xz" || exit 1
        fi

        echo "Processing bridge-extra-infos-${i}"
        for j in bridge-extra-infos-${i}/*; do
            for k in "${j}"/*; do
                for l in "${k}"/*; do
                    # BUG FIX: this used to test -s "${k}" (the parent
                    # directory, which is essentially always non-empty)
                    # rather than the descriptor file itself.
                    if [[ -s "${l}" ]]; then
                        fingerprint=$(grep -Po '(?<=^extra-info )(.*)(?=$)' "${l}" | grep -Po '(?<= )(.*)(?=$)')
                        date=$(grep -Po '(?<=^published )(.*)(?= )' "${l}")
                        # Convert to Julian date, thanks to
                        # https://stackoverflow.com/a/43318209
                        date_julian=$(( $(date +%s -d "${date}") / 86400 + 2440587 ))
                        count=$(grep -Po '(?<=^bridge-ips )(.*)(?=$)' "${l}" | grep -Po '(?<=by=)(.*?)(?=(,|$))')
                        if [ -z "$count" ]; then
                            count=0
                        fi

                        if [[ -n "${date_julian}" && -n "${fingerprint}" ]]; then
                            echo "${date_julian},${count}" >> "bridge_data/${fingerprint}"
                        else
                            echo "Error for ${l}"
                            echo "    fingerprint: ${fingerprint}"
                            echo "    date:        ${date_julian}"
                            echo "    count:       ${count}"
                        fi
                    fi
                done
            done
        done

        echo "Removing bridge-extra-infos-${i} directory to free up space"
        rm -r "bridge-extra-infos-${i}"
    done

    cd ..
fi

+ 42 - 0
scripts/get-email-bridges.sh

@@ -0,0 +1,42 @@
#!/bin/bash

# Download the 2021-02 bridge-pool-assignments archive, extract the
# per-distribution-method bridge lists, and print summary counts.

# If we don't already have the archive, download it.
# Consistency fix: abort on download failure, like get-bridge-data.sh.
if [ ! -f data/bridge-pool-assignments-2021-02.tar.xz ]; then
    curl -Lo data/bridge-pool-assignments-2021-02.tar.xz https://collector.torproject.org/archive/bridge-pool-assignments/bridge-pool-assignments-2021-02.tar.xz || exit 1
fi

# Check that we have the right archive
sha256sum -c data/bridge-pool-assignments.sha256 || exit 1

# If we haven't already extracted the archive, extract it
if [ ! -d data/bridge-pool-assignments-2021-02 ]; then
    cd data && tar xf bridge-pool-assignments-2021-02.tar.xz && cd ..
fi

# Print the sorted, de-duplicated fingerprints of every bridge whose
# assignment line matches $1, across all daily files for Feb 1-21.
# An empty pattern ("") matches every line, i.e. all bridges.
list_pool() {
    for i in $(seq 1 21); do
        grep -v "bridge-pool-assignment" data/bridge-pool-assignments-2021-02/$(printf %02d "$i")/* \
            | grep "$1" \
            | grep -Po '(?<=:)(.*?)(?= )'
    done | sort | uniq
}

# Extract obfs4 email bridges (different filter shape: match the
# " email " pool first, then require the obfs4 transport)
for i in $(seq 1 21); do
    grep " email " data/bridge-pool-assignments-2021-02/$(printf %02d "$i")/* \
        | grep "obfs4" \
        | grep -Po '(?<=:)(.*?)(?= )'
done | sort | uniq > data/obfs4-email-bridges

# Get list of all bridges
list_pool "" > data/all-bridges

# Count bridges in each category (wc -l < file avoids the useless cat)
all_bridges=$(wc -l < data/all-bridges)
obfs4_email_bridges=$(wc -l < data/obfs4-email-bridges)
email_bridges=$(list_pool "email" | wc -l)
https_bridges=$(list_pool "https" | wc -l)
moat_bridges=$(list_pool "moat" | wc -l)
unallocated_bridges=$(list_pool "unallocated" | wc -l)

echo "Total number of bridges: ${all_bridges}"
echo "Number of obfs4 email bridges: ${obfs4_email_bridges}"
echo "Number of email bridges: ${email_bridges}"
echo "Number of HTTPS bridges: ${https_bridges}"
echo "Number of moat bridges: ${moat_bridges}"
echo "Number of unallocated bridges: ${unallocated_bridges}"

+ 217 - 0
scripts/get-stats.py

@@ -0,0 +1,217 @@
+#!/usr/bin/env python3
+
+import csv
+import numpy
+import os
+import sys
+
# Starting day: February Nth
#N = 1
N = 20

# See note in readme
# Julian day number of 2021 January 31 (so FIRST_DAY below lands on
# February Nth).
JAN_31 = 2459245

# 2021 February Nth as Julian date
FIRST_DAY = JAN_31 + N

# Total number of bridges in the 2021-02 pool assignments
# (presumably the count printed by get-email-bridges.sh -- TODO confirm)
TOTAL_BRIDGES = 1890

# Number of obfs4 bridges distributed via email in 2021-02
# (presumably from get-email-bridges.sh output -- TODO confirm)
OBFS4_EMAIL_BRIDGES = 93
+
def sigfigs(n):
    """Round n to one significant figure.

    0.0 comes back as the int 0.  Values >= 1 round to the nearest
    integer; values below 1 are scaled up by powers of ten until they
    reach 1, rounded, then scaled back down.
    """
    if n == 0.0:
        return 0  # as an int

    shift = 0
    while n * (10 ** shift) < 1:
        shift += 1
    return round(n * 10 ** shift) / 10 ** shift
+
# Fingerprints of the obfs4 bridges distributed over email, as written
# by get-email-bridges.sh.
email_bridges = set()

with open ("data/obfs4-email-bridges", 'r') as f:
    for line in f:
        # NOTE(review): lines read from a file keep their trailing
        # newline, so this test is always true; the strip() below is
        # what normalises the fingerprint.
        if line != "":
            email_bridges.add(line.strip())

# Shared LaTeX table header for the three result tables.
rel_table = """
\\hline
$h$ & \\textbf{TP} & \\textbf{TN} & \\textbf{FP} & \\textbf{FN} & \\textbf{Precision} & \\textbf{Recall} \\\\
\\hline
"""
abs_table = rel_table
nomin_table = rel_table

# Build one confusion-matrix row per harshness level for each of the
# three evaluation modes (relative "", absolute "_abs", absolute
# without the 32-connection minimum "_nomin"), reading the files that
# evaluate-blockages.py produced.
for harshness in range(5):
    for suffix in ["", "_abs", "_nomin"]:
        with open (f"data/blocked_{harshness}{suffix}", 'r') as f:
            # obfs4 email bridges correctly identified as blocked
            correct = 0

            # obfs4 email bridges identified as blocked before they actually were
            too_soon = 0

            # non-obfs4-email bridges incorrectly identified as blocked
            incorrect = 0

            for line in f:
                if line != "":
                    line = line.strip()
                    # Each line is "<40-char fingerprint>,<julian date>";
                    # slice around the comma at index 40.
                    fingerprint = line[:40]
                    date = int(line[41:])

                    if fingerprint in email_bridges:
                        if date >= FIRST_DAY:
                            correct += 1
                        else:
                            too_soon += 1
                    else:
                        incorrect += 1

            # Derive the confusion matrix against the known ground truth
            # counts (TOTAL_BRIDGES / OBFS4_EMAIL_BRIDGES constants).
            tn = TOTAL_BRIDGES - OBFS4_EMAIL_BRIDGES - incorrect
            tp = correct
            fn = OBFS4_EMAIL_BRIDGES - correct - too_soon
            fp = too_soon + incorrect

            # NOTE(review): raises ZeroDivisionError if tp + fp == 0
            # (i.e. an empty blocked_* file) -- confirm that never happens.
            precision = sigfigs(tp / (tp + fp))

            recall = sigfigs(tp / (tp + fn))

            newline = f"{harshness} & {tp} & {tn} & {fp} & {fn} & {precision} & {recall} \\\\\n"

            if suffix == "":
                rel_table += newline
            elif suffix == "_abs":
                abs_table += newline
            else:
                nomin_table += newline

print ("Absolute threshold without a minimum:")
print (nomin_table)

print ("Absolute threshold:")
print (abs_table)

print ("Relative threshold:")
print (rel_table)
+
+
# Now let's look at stddevs

# Per-bridge daily counts for the email bridges:
# email_bridge_data[i] maps julian day -> count for email_bridges[i]
# (only bridges with at least one non-zero count are kept), and
# email_bridge_max[i] is the largest count seen for that bridge.
email_bridges = list(email_bridges)
email_bridge_data = []
email_bridge_max = []

for fingerprint in email_bridges:
    # We're going to get all the data for each bridge
    bridge_data = dict()
    begun = False  # flips to True at the first non-zero count
    max_count = 0

    filename = f"data/bridge_data_cleaned/{fingerprint.upper()}"

    if os.path.isfile(filename) and os.path.getsize(filename) > 0:
        with open(filename, 'r') as csvfile:
            data = csv.reader(csvfile, delimiter=',')

            for line in data:

                # Ignore 0 values until we see a non-zero value
                if not begun:
                    if line[1] != "0":
                        begun = True

                if begun:
                    # NOTE(review): if the date field contains no space,
                    # find(' ') returns -1 and the slice silently drops
                    # the last character of the date -- confirm the
                    # cleaned files really contain a space here.
                    date = int(line[0][:line[0].find(' ')])
                    val = int(line[1])
                    bridge_data[date] = val
                    max_count = max(max_count, val)

            # Only keep bridges that ever reported a non-zero count
            if begun:
                email_bridge_data.append(bridge_data)
                email_bridge_max.append(max_count)
+
# Look at bridges individually
for i in range(len(email_bridge_data)):
    bridge = email_bridge_data[i]

    # Connection counts observed before the censorship start date
    vals = []

    # Get smallest key, i.e., first date we have data for
    start_date = min(bridge)

    # Get counts before censorship started
    #for d in range(start_date, FIRST_DAY):
    #    if d in bridge:
    #        vals.append(bridge[d])
        # If this day is not represented, the bridge did not report
        # stats; this is different from 0.

    # Note: This is cheaper than the above implementation.
    for date, val in bridge.items():
        if date < FIRST_DAY:
            vals.append(val)

    # If we have no data, don't worry about it
    if len(vals) == 0:
        continue

    mu = numpy.mean(vals)
    sigma = numpy.std(vals)

    # Report how many standard deviations a count of zero lies from
    # this bridge's pre-censorship mean; skip constant series
    # (sigma == 0) to avoid dividing by zero.
    if sigma > 0:
        print (f"Single: Bridge {i}: max={email_bridge_max[i]}, mean={mu}, std={sigma}")
        print (f"Single: Zero is {mu / sigma} standard deviations away from the mean ({mu})")
        print (f"Single: We are looking at data from {len(vals)} days, starting on {start_date}\n")
+
# Look at pairs of bridges
for i in range(len(email_bridge_data)):
    for j in range(i+1, len(email_bridge_data)):
        max_count = 0

        bridge_i = email_bridge_data[i]
        bridge_j = email_bridge_data[j]

        # Combined (summed) daily counts for the pair
        vals = []

        # Get smallest key, i.e., the first date BOTH bridges have data for
        start_date = max(min(bridge_i), min(bridge_j))

        # Get counts before censorship started
        #for d in range(start_date, FIRST_DAY):

        # Get set of keys between start_date and FIRST_DAY
        # NOTE(review): these bounds include FIRST_DAY itself, whereas
        # the single-bridge loop above uses date < FIRST_DAY -- confirm
        # whether the censorship start day is meant to be included.
        keys = set()
        for d in bridge_i:
            if d >= start_date and d <= FIRST_DAY:
                keys.add(d)
        for d in bridge_j:
            if d >= start_date and d <= FIRST_DAY:
                keys.add(d)

        # Sum the two bridges' counts for each day either one reported
        for d in keys:
            val = 0
            if d in bridge_i and d in bridge_j:
                val = bridge_i[d] + bridge_j[d]
            elif d in bridge_i:
                val = bridge_i[d]
            elif d in bridge_j:
                val = bridge_j[d]

            vals.append(val)
            max_count = max(max_count, val)

        # If we have no data, don't worry about it
        if len(vals) == 0:
            continue

        mu = numpy.mean(vals)
        sigma = numpy.std(vals)

        # Same zero-distance report as the single-bridge case, but for
        # the pair's combined counts.
        if sigma > 0:
            print (f"Double: Bridges {i} and {j}: max={max_count}, mean={mu}, std={sigma}")
            print (f"Double: Zero is {mu / sigma} standard deviations away from the mean ({mu})")
            print (f"Double: We are looking at data from {len(vals)} days, starting on {start_date}\n")

+ 55 - 0
scripts/get-stats.sh

@@ -0,0 +1,55 @@
#!/bin/bash
# NOTE: shebang changed from /bin/sh -- this script relies on
# `echo -n`, which POSIX sh does not guarantee (dash prints "-n").

# Summarise the results: print the confusion-matrix tables and various
# counts extracted from the get-stats.py output file.

# If the blockages have not already been evaluated, do that now
if [ ! -f data/blocked_0 ]; then
    echo "Running python code to evaluate blockages..."
    ./scripts/evaluate-blockages.py
fi

# Do the actual math in python
if [ ! -f output ]; then
    echo "Running python code to compute stats..."
    ./scripts/get-stats.py > output
fi

echo "Tables:"
head -33 output

# BUG FIX: the exclusion patterns now include the trailing comma from
# "max={n}, mean=..."; a bare 'max=8' would also (wrongly) exclude
# max=80, max=89, etc.
# NOTE(review): counting ">8" by excluding only max=8 assumes all
# observed maxima are multiples of 8 -- TODO confirm.
echo -n "Number of bridges that received more than 8 connections: "
grep '^Single: ' output | grep 'max=' | grep -v 'max=8,' | wc -l

echo -n "Number of bridges that received more than 16 connections: "
grep '^Single: ' output | grep 'max=' | grep -v 'max=8,' | grep -v 'max=16,' | wc -l

echo -n "Number of bridges that received more than 24 connections: "
grep '^Single: ' output | grep 'max=' | grep -v 'max=8,' | grep -v 'max=16,' | grep -v 'max=24,' | wc -l

echo ""

echo -n "Number of bridges with connection count mean more than 1 stddev away from 0: "
grep "^Single: Zero is " output | grep -v "^Single: Zero is 0." | wc -l

# BUG FIX: sort numerically (-n); a plain lexical sort ranks "9.5"
# above "10.2" and can report the wrong maximum.
echo -n "Max number of stddevs from 0: "
grep "^Single: Zero is " output | grep -v "^Single: Zero is 0." | grep -Po '(?<=^Single: Zero is )(.*?)(?= standard deviations away from the mean)' | sort -rn | head -1

echo ""

echo -n "Number of pairs of bridges that received more than 8 connections: "
grep '^Double: ' output | grep 'max=' | grep -v 'max=8,' | wc -l

echo -n "Number of pairs of bridges that received more than 16 connections: "
grep '^Double: ' output | grep 'max=' | grep -v 'max=8,' | grep -v 'max=16,' | wc -l

echo -n "Number of pairs of bridges that received more than 24 connections: "
grep '^Double: ' output | grep 'max=' | grep -v 'max=8,' | grep -v 'max=16,' | grep -v 'max=24,' | wc -l

echo -n "Number of pairs of bridges that received more than 32 connections: "
grep '^Double: ' output | grep 'max=' | grep -v 'max=8,' | grep -v 'max=16,' | grep -v 'max=24,' | grep -v 'max=32,' | wc -l

echo ""

echo -n "Number of pairs of bridges with connection count mean more than 1 stddev away from 0: "
grep "^Double: Zero is " output | grep -v "^Double: Zero is 0." | wc -l

echo -n "Max number of stddevs from 0: "
grep "^Double: Zero is " output | grep -v "^Double: Zero is 0." | grep -Po '(?<=^Double: Zero is )(.*?)(?= standard deviations away from the mean)' | sort -rn | head -1