| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264 |
- #!/usr/bin/env python3
- import csv
- import numpy
- import os
- import sys
# Day of February 2021 on which the period of interest starts.
# (An alternative setting, N = 1, was left commented out here.)
N = 20

# Julian date used as the base for January 31st; see note in readme.
JAN_31 = 2459245

# 2021 February Nth expressed as a Julian date.
FIRST_DAY = N + JAN_31

# Bridge population sizes used when deriving the confusion-matrix counts.
TOTAL_BRIDGES = 1890
OBFS4_EMAIL_BRIDGES = 93
def sigfigs(n):
    """Round n to a single leading digit.

    Values with magnitude below 1 are rounded to one significant figure
    (0.0456 -> 0.05); values with magnitude of 1 or more are rounded to the
    nearest whole number and returned as a float. Zero is returned as the
    int 0 so that it is rendered as "0" rather than "0.0".

    The decimal shift is derived from abs(n): scaling a negative value can
    never reach 1, so comparing n itself would loop forever on negative
    input.
    """
    if n == 0.0:
        return 0  # as an int

    magnitude = abs(n)
    # Find the smallest power of ten that moves the leading digit in front
    # of the decimal point.
    i = 0
    while magnitude * (10**i) < 1:
        i += 1
    return round(n * 10**i) / 10**i
# Load the obfs4 email bridge list: one entry per line, surrounding
# whitespace removed.
email_bridges = set()
with open("data/obfs4-email-bridges", 'r') as f:
    for line in f:
        fingerprint = line.strip()
        # A line read from a file still carries its newline, so it is never
        # equal to "" before stripping. Test the stripped text so that blank
        # lines are really skipped instead of adding "" to the set.
        if fingerprint:
            email_bridges.add(fingerprint)
## Set up all the LaTeX
# The templates are raw strings so the LaTeX can be read as it will appear in
# the generated file (a row break is written \\ rather than \\\\).

# Document preamble, table caption and the opening of the first tabular.
# NOTE(review): this tabular declares nine columns while every row and the
# continuation tabular use eight -- confirm whether that is intended.
table_start = r"""
\documentclass[sigconf]{acmart}
\begin{document}
\begin{table*}[t]
\caption{Results from running Algorithm~1 (using an absolute threshold of 32 and an absolute connection count minimum of 100), Algorithm~2 (using an absolute threshold $t$ and an absolute connection count minimum $m$), and Algorithm~3 (using a relative connection count difference $d$).}
\begin{tabular}{c c | c c c c c c c}
\hline
"""

# Header rows for the Algorithm 1 section.
loesing_table = r"""
\multicolumn{8}{c}{\textbf{Algorithm~1}} \\
\multicolumn{2}{c@{}}{} & \textbf{TP} & \textbf{TN} & \textbf{FP} & \textbf{FN} & \textbf{Precision} & \textbf{Recall} \\
\hline
"""

# Header rows for the Algorithm 2 section.
abs_table = r"""
\\
\hline
\multicolumn{8}{c}{\textbf{Algorithm~2}} \\
$t$ & $m$ & \textbf{TP} & \textbf{TN} & \textbf{FP} & \textbf{FN} & \textbf{Precision} & \textbf{Recall} \\
\hline
"""

# Closes the first tabular and opens a second one that continues Algorithm 2.
abs_table_start_2 = r"""
\end{tabular}
\hspace{3em}
\begin{tabular}{c c | c c c c c c}
\hline
\multicolumn{8}{c}{\textbf{Algorithm~2 (continued)}} \\
$t$ & $m$ & \textbf{TP} & \textbf{TN} & \textbf{FP} & \textbf{FN} & \textbf{Precision} & \textbf{Recall} \\
\hline
"""

# Header rows for the Algorithm 3 section.
rel_table = r"""
\\
\hline
\multicolumn{8}{c}{\textbf{Algorithm~3}} \\
& $d$ & \textbf{TP} & \textbf{TN} & \textbf{FP} & \textbf{FN} & \textbf{Precision} & \textbf{Recall} \\
\hline
"""

# Closes the open tabular, the table and the document.
table_end = r"""
\end{tabular}
\end{table*}
\end{document}
"""
def accuracy(filename):
    """Score one results file and return its counts as a LaTeX row fragment.

    Each non-blank line of the file is read as a fingerprint (the first 40
    characters) followed, from index 41 onwards, by an integer date. Every
    line is sorted into one of three groups:

    * correct:   the fingerprint is in email_bridges and the date is on or
                 after FIRST_DAY;
    * too soon:  the fingerprint is in email_bridges but the date is before
                 FIRST_DAY;
    * incorrect: the fingerprint is not in email_bridges.

    Args:
        filename: path of the results file to score.

    Returns:
        The string "TP & TN & FP & FN & precision & recall \\\\" followed by
        a newline, with precision and recall passed through sigfigs().
        A ratio whose denominator is zero (for example, when the file lists
        no bridges at all) is reported as 0 instead of raising
        ZeroDivisionError.

    Raises:
        ValueError: if the date part of a non-blank line is not an integer.
    """
    # obfs4 email bridges correctly identified as blocked
    correct = 0
    # obfs4 email bridges identified as blocked before they actually were
    too_soon = 0
    # non-obfs4-email bridges incorrectly identified as blocked
    incorrect = 0

    with open(filename, 'r') as f:
        for line in f:
            # Strip before testing: an unstripped line always contains at
            # least its newline, so blank lines would otherwise reach int().
            line = line.strip()
            if not line:
                continue
            fingerprint = line[:40]
            date = int(line[41:])
            if fingerprint not in email_bridges:
                incorrect += 1
            elif date >= FIRST_DAY:
                correct += 1
            else:
                too_soon += 1

    tn = TOTAL_BRIDGES - OBFS4_EMAIL_BRIDGES - incorrect
    tp = correct
    fn = OBFS4_EMAIL_BRIDGES - correct - too_soon
    # A detection made too early is counted as a false positive.
    fp = too_soon + incorrect

    # Precision and recall are undefined for an empty denominator; report 0.
    precision = sigfigs(tp / (tp + fp)) if tp + fp else 0
    recall = sigfigs(tp / (tp + fn)) if tp + fn else 0
    return f"{tp} & {tn} & {fp} & {fn} & {precision} & {recall} \\\\\n"
## Add the data to the table

# Algorithm 1 (Loesing): a single row whose two parameter columns are blank.
loesing_table = loesing_table + "\\multicolumn{2}{c@{}}{} & " + accuracy("data/blocked_loesing")

# Algorithm 2 (absolute threshold): one row per (t, m) combination, with
# t in 8, 16, 24, 32 and m running from t + 8 up to 104 in steps of 8.
abs_rows = []
for t in range(8, 40, 8):
    for m in range(t + 8, 112, 8):
        abs_rows.append(f"{t} & {m} & ")
        abs_rows.append(accuracy(f"data/blocked_abs_{t}_{m}"))
        if (t, m) == (24, 56):
            # Everything after this row goes into the second tabular.
            abs_rows.append(abs_table_start_2)
abs_table += "".join(abs_rows)

# Algorithm 3 (relative threshold): one row per d from 8 to 104 in steps
# of 8, with the first parameter column left blank.
rel_table += "".join(
    f"& {d} & " + accuracy(f"data/blocked_rel_{d}")
    for d in range(8, 112, 8)
)

## Output the LaTeX file
table = "".join((table_start, loesing_table, abs_table, rel_table, table_end))
with open("appendix-a-results.tex", 'w') as out:
    out.write(table)
# Now let's look at stddevs

# Sort the fingerprints so that the bridge indices printed further down are
# the same on every run; iterating a set of strings gives an order that can
# change from one interpreter run to the next.
email_bridges = sorted(email_bridges)

# Parallel lists: for each bridge we have data for, a {date: count} dict and
# the largest count seen in that dict.
email_bridge_data = []
email_bridge_max = []

for fingerprint in email_bridges:
    # We're going to get all the data for each bridge
    bridge_data = {}
    begun = False
    max_count = 0
    filename = f"data/bridge_data_cleaned/{fingerprint.upper()}"
    if os.path.isfile(filename) and os.path.getsize(filename) > 0:
        # newline='' is what the csv module asks for when opening its input.
        with open(filename, 'r', newline='') as csvfile:
            for row in csv.reader(csvfile, delimiter=','):
                if not row:
                    # Blank line in the file: nothing to read from it.
                    continue
                # Ignore 0 values until we see a non-zero value
                if not begun and row[1] != "0":
                    begun = True
                if begun:
                    # The date is the text before the first space of the
                    # first column. split() keeps the whole field when there
                    # is no space, where slicing up to find(' ') == -1 would
                    # silently drop the last character.
                    date = int(row[0].split(' ', 1)[0])
                    if date > FIRST_DAY:
                        break
                    val = int(row[1])
                    bridge_data[date] = val
                    max_count = max(max_count, val)
    # Keep the bridge only if at least one row was recorded. A bridge whose
    # first non-zero row is already past FIRST_DAY yields an empty dict,
    # which the min() calls in the analyses below cannot handle.
    if bridge_data:
        email_bridge_data.append(bridge_data)
        email_bridge_max.append(max_count)
# Look at bridges individually
for i, (bridge, bridge_max) in enumerate(zip(email_bridge_data, email_bridge_max)):
    # A bridge with no recorded days has no first date; min() would raise
    # ValueError on it, so skip it before asking.
    if not bridge:
        continue

    # Get smallest key, i.e., first date we have data for
    start_date = min(bridge)

    # Get counts before censorship started. Only days that are actually
    # present are used: if a day is not represented, the bridge did not
    # report stats, which is different from reporting 0.
    vals = [val for date, val in bridge.items() if date < FIRST_DAY]

    # If we have no data, don't worry about it
    if not vals:
        continue

    mu = numpy.mean(vals)
    sigma = numpy.std(vals)
    # Bridges whose counts never vary are not reported; this also keeps
    # mu / sigma from dividing by zero.
    if sigma > 0:
        print(f"Single: Bridge {i}: max={bridge_max}, mean={mu}, std={sigma}")
        print(f"Single: Zero is {mu / sigma} standard deviations away from the mean ({mu})")
        print(f"Single: We are looking at data from {len(vals)} days, starting on {start_date}\n")
# Look at pairs of bridges
for i, bridge_i in enumerate(email_bridge_data):
    # min() below needs at least one date per bridge.
    if not bridge_i:
        continue
    for j in range(i + 1, len(email_bridge_data)):
        bridge_j = email_bridge_data[j]
        if not bridge_j:
            continue

        # Get the first date BOTH bridges have data for
        start_date = max(min(bridge_i), min(bridge_j))

        # Days from start_date up to, but not including, FIRST_DAY on which
        # at least one of the two bridges reported. FIRST_DAY itself is left
        # out so that this window matches the single-bridge analysis, which
        # only uses dates before FIRST_DAY.
        keys = sorted(
            d for d in bridge_i.keys() | bridge_j.keys()
            if start_date <= d < FIRST_DAY
        )

        # Combined count per day; a bridge that did not report on a given
        # day contributes nothing to that day's sum.
        vals = [bridge_i.get(d, 0) + bridge_j.get(d, 0) for d in keys]

        # If we have no data, don't worry about it
        if not vals:
            continue

        # Largest combined count, never reported below 0.
        max_count = max(0, max(vals))

        mu = numpy.mean(vals)
        sigma = numpy.std(vals)
        # Pairs whose combined counts never vary are not reported; this also
        # keeps mu / sigma from dividing by zero.
        if sigma > 0:
            print(f"Double: Bridges {i} and {j}: max={max_count}, mean={mu}, std={sigma}")
            print(f"Double: Zero is {mu / sigma} standard deviations away from the mean ({mu})")
            print(f"Double: We are looking at data from {len(vals)} days, starting on {start_date}\n")
|