#!/usr/bin/env python3
# Source repository: vvecna/belarus-2020-2021

import csv
import numpy
import os
import sys

# Population sizes; accuracy() uses them to derive true/false negatives.
OBFS4_EMAIL_BRIDGES = 93
TOTAL_BRIDGES = 1890

# Julian-date value used for 2021 January 31 (see note in readme).
JAN_31 = 2459245

# Starting day is 2021 February Nth; set N = 1 to start on February 1st.
N = 20

# 2021 February Nth expressed as a Julian date.
FIRST_DAY = JAN_31 + N

def sigfigs(n):
    """Round ``n`` to one significant figure when its magnitude is below 1.

    The value is scaled by the smallest power of ten that brings its
    magnitude to at least 1, rounded to an integer, and scaled back. Values
    whose magnitude is already at least 1 are therefore rounded to the
    nearest integer (returned as a float because of the final division).

    Zero is returned as the int ``0`` so that it formats as ``0`` rather
    than ``0.0``.

    Negative values are handled by scaling on ``abs(n)``; comparing the
    signed value against 1 would never terminate for ``n < 0``.
    """
    if n == 0.0:
        return 0  # as an int

    # Smallest exponent for which |n| * 10**exponent reaches 1.
    exponent = 0
    while abs(n) * (10 ** exponent) < 1:
        exponent += 1

    scale = 10 ** exponent
    return round(n * scale) / scale

# Set of obfs4 email bridge fingerprints, one per line of the data file.
email_bridges = set()

with open("data/obfs4-email-bridges", 'r') as f:
    for line in f:
        # Lines read from a file keep their trailing newline, so emptiness
        # must be tested after stripping; otherwise every blank line would
        # add an empty string to the set.
        fingerprint = line.strip()
        if fingerprint:
            email_bridges.add(fingerprint)


# Shared pieces of the LaTeX table headers. Each table starts with a blank
# line, a rule, the column titles and another rule; result rows are appended
# to these strings further down.
_HLINE = "\\hline\n"
_METRIC_COLUMNS = " & ".join(
    f"\\textbf{{{title}}}"
    for title in ("TP", "TN", "FP", "FN", "Precision", "Recall")
) + " \\\\\n"

# Loesing's algorithm has no parameters, so only the metric columns appear.
loesing_table = "\n" + _HLINE + _METRIC_COLUMNS + _HLINE

# The absolute-threshold table is keyed by the parameters t and m.
abs_table = "\n" + _HLINE + "$t$ & $m$ & " + _METRIC_COLUMNS + _HLINE

# The relative-threshold table is keyed by the parameter d.
rel_table = "\n" + _HLINE + "$d$ & " + _METRIC_COLUMNS + _HLINE

def accuracy(filename):
    """Score one detector output file and return it as a LaTeX table row.

    Each non-blank line of ``filename`` is read as a 40-character bridge
    fingerprint, one separator character, and an integer date (compared
    against ``FIRST_DAY``). Blank and whitespace-only lines are ignored.

    A listed bridge counts as:
      * correct   - it is in ``email_bridges`` and its date is on or after
                    ``FIRST_DAY``;
      * too soon  - it is in ``email_bridges`` but its date is earlier;
      * incorrect - it is not in ``email_bridges``.

    Returns:
        The string "tp & tn & fp & fn & precision & recall" followed by a
        LaTeX row break and a newline. Precision and recall go through
        ``sigfigs``; a ratio whose denominator is zero is reported as 0
        instead of raising ZeroDivisionError.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if the date part of a non-blank line is not an integer.
    """
    # obfs4 email bridges correctly identified as blocked
    correct = 0

    # obfs4 email bridges identified as blocked before they actually were
    too_soon = 0

    # non-obfs4-email bridges incorrectly identified as blocked
    incorrect = 0

    with open(filename, 'r') as f:
        for raw_line in f:
            # Strip first: lines keep their newline, so comparing the raw
            # line with "" would never detect a blank line.
            line = raw_line.strip()
            if not line:
                continue

            fingerprint = line[:40]
            date = int(line[41:])

            if fingerprint not in email_bridges:
                incorrect += 1
            elif date >= FIRST_DAY:
                correct += 1
            else:
                too_soon += 1

    tn = TOTAL_BRIDGES - OBFS4_EMAIL_BRIDGES - incorrect
    tp = correct
    fn = OBFS4_EMAIL_BRIDGES - correct - too_soon
    fp = too_soon + incorrect

    # Nothing flagged at all (tp + fp == 0), or no positives left to find
    # (tp + fn == 0), leaves the ratio undefined; report 0 in that case.
    precision = sigfigs(tp / (tp + fp)) if tp + fp else 0
    recall = sigfigs(tp / (tp + fn)) if tp + fn else 0

    return f"{tp} & {tn} & {fp} & {fn} & {precision} & {recall} \\\\\n"

# Loesing: a single result file, hence a single row.
loesing_table += accuracy("data/blocked_loesing")

# Absolute threshold: one row per (t, m) pair, with t running over
# 8, 16, 24, 32 and m from t+8 up to 104, both in steps of 8.
abs_table += "".join(
    f"{t} & {m} & " + accuracy(f"data/blocked_abs_{t}_{m}")
    for t in range(8, 40, 8)
    for m in range(t + 8, 112, 8)
)

# Relative threshold: one row per d from 8 up to 104 in steps of 8.
rel_table += "".join(
    f"{d} & " + accuracy(f"data/blocked_rel_{d}")
    for d in range(8, 112, 8)
)

# Emit each finished table under its heading.
for heading, table in (
    ("Loesing's algorithm:", loesing_table),
    ("Absolute threshold:", abs_table),
    ("Relative threshold:", rel_table),
):
    print(heading)
    print(table)


# Now let's look at stddevs

# Sort the fingerprints: a set of strings iterates in hash order, which can
# differ from one interpreter run to the next. The bridge indices printed
# later are positions in email_bridge_data, which is filled in this order,
# so sorting makes those indices reproducible.
email_bridges = sorted(email_bridges)

# Parallel lists: per-bridge {date: count} mapping and per-bridge max count.
email_bridge_data = []
email_bridge_max = []

for fingerprint in email_bridges:
    filename = f"data/bridge_data_cleaned/{fingerprint.upper()}"

    # Bridges with a missing or empty data file are skipped.
    if not (os.path.isfile(filename) and os.path.getsize(filename) > 0):
        continue

    # We're going to get all the data for each bridge
    bridge_data = dict()
    begun = False
    max_count = 0

    with open(filename, 'r') as csvfile:
        for row in csv.reader(csvfile, delimiter=','):
            # csv.reader yields an empty list for a blank line.
            if not row:
                continue

            # Ignore 0 values until we see a non-zero value
            if not begun:
                if row[1] == "0":
                    continue
                begun = True

            # The first column holds the date, optionally followed by a
            # space and further text. partition() keeps the whole field
            # when there is no space, whereas slicing up to find(' ')
            # would then cut off the last character.
            date = int(row[0].partition(' ')[0])
            val = int(row[1])
            bridge_data[date] = val
            max_count = max(max_count, val)

    # Only keep bridges that reported at least one non-zero value.
    if begun:
        email_bridge_data.append(bridge_data)
        email_bridge_max.append(max_count)

# Look at bridges individually
for i, bridge in enumerate(email_bridge_data):
    # Counts recorded before FIRST_DAY. A day that is absent from the mapping
    # means the bridge did not report stats, which is different from
    # reporting 0, so only the days actually present contribute.
    vals = [count for day, count in bridge.items() if day < FIRST_DAY]

    # If we have no data, don't worry about it
    if not vals:
        continue

    # Smallest key, i.e., the first date we have data for
    start_date = min(bridge)

    mu = numpy.mean(vals)
    sigma = numpy.std(vals)

    # Nothing is reported for a bridge whose counts never vary; the ratio
    # mu / sigma would be undefined.
    if sigma > 0:
        print(f"Single: Bridge {i}: max={email_bridge_max[i]}, mean={mu}, std={sigma}")
        print(f"Single: Zero is {mu / sigma} standard deviations away from the mean ({mu})")
        print(f"Single: We are looking at data from {len(vals)} days, starting on {start_date}\n")

# Look at pairs of bridges
for i, bridge_i in enumerate(email_bridge_data):
    for j in range(i + 1, len(email_bridge_data)):
        bridge_j = email_bridge_data[j]

        # The first date BOTH bridges have data for
        start_date = max(min(bridge_i), min(bridge_j))

        # Days on which at least one of the two bridges reported, from
        # start_date up to but NOT including FIRST_DAY. We want counts from
        # before censorship started, matching the single-bridge pass which
        # also uses date < FIRST_DAY. Sorting keeps the order of the values
        # handed to numpy independent of set iteration order.
        days = sorted(
            d
            for d in bridge_i.keys() | bridge_j.keys()
            if start_date <= d < FIRST_DAY
        )

        # Combined count per day; a bridge with no entry for a day
        # contributes nothing to that day's sum.
        vals = [bridge_i.get(d, 0) + bridge_j.get(d, 0) for d in days]

        # If we have no data, don't worry about it
        if not vals:
            continue

        # Floored at 0, like the per-bridge maxima collected while loading.
        max_count = max(0, max(vals))

        mu = numpy.mean(vals)
        sigma = numpy.std(vals)

        # Nothing is reported for pairs whose combined counts never vary.
        if sigma > 0:
            print(f"Double: Bridges {i} and {j}: max={max_count}, mean={mu}, std={sigma}")
            print(f"Double: Zero is {mu / sigma} standard deviations away from the mean ({mu})")
            print(f"Double: We are looking at data from {len(vals)} days, starting on {start_date}\n")