#!/usr/bin/env python3
"""Score blocked-bridge detectors and summarise pre-censorship bridge counts.

When run as a script this:

1. reads the obfs4 email bridge fingerprints from ``data/obfs4-email-bridges``;
2. prints LaTeX table rows (TP, TN, FP, FN, precision, recall) for the
   ``data/blocked_*`` result files via :func:`accuracy`;
3. prints the mean and standard deviation of the per-day counts recorded
   before ``FIRST_DAY`` for every email bridge and for every pair of them.
"""

import csv
import itertools
import os
import sys

import numpy

# Starting day: February Nth
#N = 1
N = 20

# See note in readme
JAN_31 = 2459245

# 2021 February Nth as Julian date
FIRST_DAY = JAN_31 + N

TOTAL_BRIDGES = 1890
OBFS4_EMAIL_BRIDGES = 93

# Fingerprints of the obfs4 email bridges.  main() fills this set in place;
# accuracy() reads it.  It is never rebound, so holders of a reference see
# the loaded contents.
email_bridges = set()

# LaTeX header fragments; the rows produced by accuracy() are appended to them.
loesing_table = """ \\hline \\textbf{TP} & \\textbf{TN} & \\textbf{FP} & \\textbf{FN} & \\textbf{Precision} & \\textbf{Recall} \\\\ \\hline """

abs_table = """ \\hline $t$ & $m$ & \\textbf{TP} & \\textbf{TN} & \\textbf{FP} & \\textbf{FN} & \\textbf{Precision} & \\textbf{Recall} \\\\ \\hline """

rel_table = """ \\hline $d$ & \\textbf{TP} & \\textbf{TN} & \\textbf{FP} & \\textbf{FN} & \\textbf{Precision} & \\textbf{Recall} \\\\ \\hline """


def sigfigs(n):
    """Round *n* to its leading significant digit.

    A value with magnitude below 1 keeps only its first non-zero decimal
    digit (``0.0456`` -> ``0.05``); a value with magnitude of 1 or more is
    rounded to the nearest whole number and returned as a float
    (``3.7`` -> ``4.0``).  Zero is returned as the int ``0``.  Ties are
    resolved by the built-in ``round``.
    """
    if n == 0.0:
        return 0  # as an int
    i = 0
    # abs() so that a negative input terminates instead of looping forever.
    while abs(n) * (10**i) < 1:
        i += 1
    return round(n * 10**i) / 10**i


def _ratio(numerator, denominator):
    """Return ``sigfigs(numerator / denominator)``, or 0 if the denominator is 0."""
    if denominator == 0:
        return 0
    return sigfigs(numerator / denominator)


def accuracy(filename):
    """Return one LaTeX table row scoring the blocked-bridge list in *filename*.

    Each non-blank line of the file is read as a 40-character fingerprint,
    one separator character, and an integer date.  A line counts as

    * correct   - fingerprint is in ``email_bridges`` and date >= FIRST_DAY;
    * too soon  - fingerprint is in ``email_bridges`` and date <  FIRST_DAY;
    * incorrect - fingerprint is not in ``email_bridges``.

    The row has the form ``"TP & TN & FP & FN & precision & recall \\\\\\n"``
    where precision and recall are passed through :func:`sigfigs` and are 0
    when their denominator is 0.

    Raises ``OSError`` if the file cannot be opened and ``ValueError`` if a
    date field is not an integer.
    """
    # obfs4 email bridges correctly identified as blocked
    correct = 0
    # obfs4 email bridges identified as blocked before they actually were
    too_soon = 0
    # non-obfs4-email bridges incorrectly identified as blocked
    incorrect = 0

    with open(filename, 'r') as f:
        for line in f:
            # Strip first: raw lines still carry "\n", so a blank line is
            # only recognisable after stripping.
            line = line.strip()
            if not line:
                continue
            fingerprint = line[:40]
            date = int(line[41:])
            if fingerprint not in email_bridges:
                incorrect += 1
            elif date >= FIRST_DAY:
                correct += 1
            else:
                too_soon += 1

    tn = TOTAL_BRIDGES - OBFS4_EMAIL_BRIDGES - incorrect
    tp = correct
    fn = OBFS4_EMAIL_BRIDGES - correct - too_soon
    fp = too_soon + incorrect
    precision = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn)
    return f"{tp} & {tn} & {fp} & {fn} & {precision} & {recall} \\\\\n"


def _load_fingerprints(path):
    """Return the set of non-blank, stripped lines of the file at *path*."""
    with open(path, 'r') as f:
        stripped = (line.strip() for line in f)
        return {entry for entry in stripped if entry}


def _read_bridge_counts(filename):
    """Read one bridge's CSV and return ``(counts, max_count)``.

    ``counts`` maps date -> count for rows dated up to and including
    FIRST_DAY.  Rows are ignored until the first one whose count field is not
    the string "0"; rows with fewer than two fields are skipped.  The date is
    the part of the first field before its first space.
    """
    counts = {}
    max_count = 0
    begun = False
    with open(filename, 'r', newline='') as csvfile:
        for row in csv.reader(csvfile, delimiter=','):
            if len(row) < 2:
                continue
            # Ignore 0 values until we see a non-zero value
            if not begun:
                if row[1] == "0":
                    continue
                begun = True
            # partition() keeps the whole field when it contains no space.
            date = int(row[0].partition(' ')[0])
            if date > FIRST_DAY:
                break
            val = int(row[1])
            counts[date] = val
            max_count = max(max_count, val)
    return counts, max_count


def _report_single(i, bridge, max_count):
    """Print mean/std of one bridge's counts recorded before FIRST_DAY."""
    # Smallest key, i.e., first date we have data for
    start_date = min(bridge)

    # Counts before censorship started.  A day that is not represented means
    # the bridge did not report stats; this is different from 0.
    vals = [val for date, val in bridge.items() if date < FIRST_DAY]

    # If we have no data, don't worry about it
    if not vals:
        return

    mu = numpy.mean(vals)
    sigma = numpy.std(vals)
    if sigma > 0:
        # NOTE(review): max_count comes from the loader and therefore also
        # covers the FIRST_DAY row, unlike mean/std - confirm this is wanted.
        print (f"Single: Bridge {i}: max={max_count}, mean={mu}, std={sigma}")
        print (f"Single: Zero is {mu / sigma} standard deviations away from the mean ({mu})")
        print (f"Single: We are looking at data from {len(vals)} days, starting on {start_date}\n")


def _report_pair(i, j, bridge_i, bridge_j):
    """Print mean/std of the summed counts of two bridges before FIRST_DAY."""
    # The first date BOTH bridges have data for
    start_date = max(min(bridge_i), min(bridge_j))

    # Days on which at least one bridge reported, from start_date up to but
    # excluding FIRST_DAY (counts before censorship started, matching the
    # single-bridge report).
    days = sorted(
        d for d in bridge_i.keys() | bridge_j.keys()
        if start_date <= d < FIRST_DAY
    )

    # If we have no data, don't worry about it
    if not days:
        return

    # A bridge that did not report on a day contributes nothing to the sum.
    vals = [bridge_i.get(d, 0) + bridge_j.get(d, 0) for d in days]
    max_count = max(0, max(vals))

    mu = numpy.mean(vals)
    sigma = numpy.std(vals)
    if sigma > 0:
        print (f"Double: Bridges {i} and {j}: max={max_count}, mean={mu}, std={sigma}")
        print (f"Double: Zero is {mu / sigma} standard deviations away from the mean ({mu})")
        print (f"Double: We are looking at data from {len(vals)} days, starting on {start_date}\n")


def main():
    """Load the data files, print the accuracy tables, then the std-dev reports."""
    email_bridges.update(_load_fingerprints("data/obfs4-email-bridges"))

    # Loesing
    loesing_out = loesing_table + accuracy ("data/blocked_loesing")

    # Absolute threshold
    abs_out = abs_table + "".join(
        f"{t} & {m} & " + accuracy (f"data/blocked_abs_{t}_{m}")
        for t in range (8, 40, 8)
        for m in range (t+8, 112, 8)
    )

    # Relative threshold
    rel_out = rel_table + "".join(
        f"{d} & " + accuracy (f"data/blocked_rel_{d}")
        for d in range (8, 112, 8)
    )

    print ("Loesing's algorithm:")
    print (loesing_out)

    print ("Absolute threshold:")
    print (abs_out)

    print ("Relative threshold:")
    print (rel_out)

    # Now let's look at stddevs
    email_bridge_data = []
    email_bridge_max = []

    # sorted() so that the bridge indices printed below are stable across
    # runs (set iteration order of strings is not).
    for fingerprint in sorted(email_bridges):
        filename = f"data/bridge_data_cleaned/{fingerprint.upper()}"
        if not (os.path.isfile(filename) and os.path.getsize(filename) > 0):
            continue
        bridge_data, max_count = _read_bridge_counts(filename)
        # Skip bridges with no usable rows; min() on them would fail later.
        if bridge_data:
            email_bridge_data.append(bridge_data)
            email_bridge_max.append(max_count)

    # Look at bridges individually
    for i, (bridge, max_count) in enumerate(zip(email_bridge_data, email_bridge_max)):
        _report_single(i, bridge, max_count)

    # Look at pairs of bridges
    indexed = enumerate(email_bridge_data)
    for (i, bridge_i), (j, bridge_j) in itertools.combinations(indexed, 2):
        _report_pair(i, j, bridge_i, bridge_j)


if __name__ == "__main__":
    main()