get-stats.py 7.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265
  1. #!/usr/bin/env python3
  2. import csv
  3. import numpy
  4. import os
  5. import sys
  6. # Starting day: February Nth
  7. #N = 1
  8. N = 20
  9. # See note in readme
  10. JAN_31 = 2459245
  11. # 2021 February Nth as Julian date
  12. FIRST_DAY = JAN_31 + N
  13. TOTAL_BRIDGES = 1890
  14. OBFS4_EMAIL_BRIDGES = 93
  15. def sigfigs(n):
  16. if n == 0.0:
  17. n = 0 # as an int
  18. else:
  19. i=0
  20. while n * (10**i) < 1:
  21. i += 1
  22. n = round(n * 10**i) / 10**i
  23. return n
  24. email_bridges = set()
  25. with open ("data/obfs4-email-bridges", 'r') as f:
  26. for line in f:
  27. if line != "":
  28. email_bridges.add(line.strip())
  29. ## Set up all the LaTeX
  30. table_start = """
  31. \\documentclass[sigconf]{acmart}
  32. \\begin{document}
  33. \\begin{table*}[t]
  34. \\renewcommand\\thetable{4}
  35. \\caption{Results from running Algorithm~1 (using an absolute threshold of 32 and an absolute connection count minimum of 100), Algorithm~2 (using an absolute threshold $t$ and an absolute connection count minimum $m$), and Algorithm~3 (using a relative connection count difference $d$).}
  36. \\begin{tabular}{c c | c c c c c c c}
  37. \\hline
  38. """
  39. loesing_table = """
  40. \\multicolumn{8}{c}{\\textbf{Algorithm~1}} \\\\
  41. \\multicolumn{2}{c@{}}{} & \\textbf{TP} & \\textbf{TN} & \\textbf{FP} & \\textbf{FN} & \\textbf{Precision} & \\textbf{Recall} \\\\
  42. \\hline
  43. """
  44. abs_table = """
  45. \\\\
  46. \\hline
  47. \\multicolumn{8}{c}{\\textbf{Algorithm~2}} \\\\
  48. $t$ & $m$ & \\textbf{TP} & \\textbf{TN} & \\textbf{FP} & \\textbf{FN} & \\textbf{Precision} & \\textbf{Recall} \\\\
  49. \\hline
  50. """
  51. abs_table_start_2 = """
  52. \\end{tabular}
  53. \\hspace{3em}
  54. \\begin{tabular}{c c | c c c c c c}
  55. \\hline
  56. \\multicolumn{8}{c}{\\textbf{Algorithm~2 (continued)}} \\\\
  57. $t$ & $m$ & \\textbf{TP} & \\textbf{TN} & \\textbf{FP} & \\textbf{FN} & \\textbf{Precision} & \\textbf{Recall} \\\\
  58. \\hline
  59. """
  60. rel_table = """
  61. \\\\
  62. \\hline
  63. \\multicolumn{8}{c}{\\textbf{Algorithm~3}} \\\\
  64. & $d$ & \\textbf{TP} & \\textbf{TN} & \\textbf{FP} & \\textbf{FN} & \\textbf{Precision} & \\textbf{Recall} \\\\
  65. \\hline
  66. """
  67. table_end = """
  68. \\end{tabular}
  69. \\end{table*}
  70. \\end{document}
  71. """
  72. def accuracy (filename):
  73. with open (filename, 'r') as f:
  74. # obfs4 email bridges correctly identified as blocked
  75. correct = 0
  76. # obfs4 email bridges identified as blocked before they actually were
  77. too_soon = 0
  78. # non-obfs4-email bridges incorrectly identified as blocked
  79. incorrect = 0
  80. for line in f:
  81. if line != "":
  82. line = line.strip()
  83. fingerprint = line[:40]
  84. date = int(line[41:])
  85. if fingerprint in email_bridges:
  86. if date >= FIRST_DAY:
  87. correct += 1
  88. else:
  89. too_soon += 1
  90. else:
  91. incorrect += 1
  92. tn = TOTAL_BRIDGES - OBFS4_EMAIL_BRIDGES - incorrect
  93. tp = correct
  94. fn = OBFS4_EMAIL_BRIDGES - correct - too_soon
  95. fp = too_soon + incorrect
  96. precision = sigfigs(tp / (tp + fp))
  97. recall = sigfigs(tp / (tp + fn))
  98. return f"{tp} & {tn} & {fp} & {fn} & {precision} & {recall} \\\\\n"
  99. ## Add the data to the table
  100. # Loesing
  101. loesing_table += "\\multicolumn{2}{c@{}}{} & " + accuracy ("data/blocked_loesing")
  102. # Absolute threshold
  103. for t in range (8, 40, 8):
  104. for m in range (t+8, 112, 8):
  105. abs_table += f"{t} & {m} & " + accuracy (f"data/blocked_abs_{t}_{m}")
  106. if t == 24 and m == 56:
  107. # push to second column
  108. abs_table += abs_table_start_2
  109. # Relative threshold
  110. for d in range (8, 112, 8):
  111. rel_table += f"& {d} & " + accuracy (f"data/blocked_rel_{d}")
  112. ## Output the LaTeX file
  113. table = table_start + loesing_table + abs_table + rel_table + table_end
  114. with open("appendix-a-results.tex", 'w') as f:
  115. f.write(table)
  116. # Now let's look at stddevs
  117. email_bridges = list(email_bridges)
  118. email_bridge_data = []
  119. email_bridge_max = []
  120. for fingerprint in email_bridges:
  121. # We're going to get all the data for each bridge
  122. bridge_data = dict()
  123. begun = False
  124. max_count = 0
  125. filename = f"data/bridge_data_cleaned/{fingerprint.upper()}"
  126. if os.path.isfile(filename) and os.path.getsize(filename) > 0:
  127. with open(filename, 'r') as csvfile:
  128. data = csv.reader(csvfile, delimiter=',')
  129. for line in data:
  130. # Ignore 0 values until we see a non-zero value
  131. if not begun:
  132. if line[1] != "0":
  133. begun = True
  134. if begun:
  135. date = int(line[0][:line[0].find(' ')])
  136. if date > FIRST_DAY:
  137. break
  138. val = int(line[1])
  139. bridge_data[date] = val
  140. max_count = max(max_count, val)
  141. if begun:
  142. email_bridge_data.append(bridge_data)
  143. email_bridge_max.append(max_count)
  144. # Look at bridges individually
  145. for i in range(len(email_bridge_data)):
  146. bridge = email_bridge_data[i]
  147. vals = []
  148. # Get smallest key, i.e., first date we have data for
  149. start_date = min(bridge)
  150. # Get counts before censorship started
  151. #for d in range(start_date, FIRST_DAY):
  152. # if d in bridge:
  153. # vals.append(bridge[d])
  154. # If this day is not represented, the bridge did not report
  155. # stats; this is different from 0.
  156. # Note: This is cheaper than the above impelmentation.
  157. for date, val in bridge.items():
  158. if date < FIRST_DAY:
  159. vals.append(val)
  160. # If we have no data, don't worry about it
  161. if len(vals) == 0:
  162. continue
  163. mu = numpy.mean(vals)
  164. sigma = numpy.std(vals)
  165. if sigma > 0:
  166. print (f"Single: Bridge {i}: max={email_bridge_max[i]}, mean={mu}, std={sigma}")
  167. print (f"Single: Zero is {mu / sigma} standard deviations away from the mean ({mu})")
  168. print (f"Single: We are looking at data from {len(vals)} days, starting on {start_date}\n")
  169. # Look at pairs of bridges
  170. for i in range(len(email_bridge_data)):
  171. for j in range(i+1, len(email_bridge_data)):
  172. max_count = 0
  173. bridge_i = email_bridge_data[i]
  174. bridge_j = email_bridge_data[j]
  175. vals = []
  176. # Get smallest key, i.e., the first date BOTH bridges have data for
  177. start_date = max(min(bridge_i), min(bridge_j))
  178. # Get set of keys between start_date and FIRST_DAY
  179. keys = set()
  180. for d in bridge_i:
  181. if d >= start_date and d <= FIRST_DAY:
  182. keys.add(d)
  183. for d in bridge_j:
  184. if d >= start_date and d <= FIRST_DAY:
  185. keys.add(d)
  186. for d in keys:
  187. val = 0
  188. if d in bridge_i and d in bridge_j:
  189. val = bridge_i[d] + bridge_j[d]
  190. elif d in bridge_i:
  191. val = bridge_i[d]
  192. elif d in bridge_j:
  193. val = bridge_j[d]
  194. vals.append(val)
  195. max_count = max(max_count, val)
  196. # If we have no data, don't worry about it
  197. if len(vals) == 0:
  198. continue
  199. mu = numpy.mean(vals)
  200. sigma = numpy.std(vals)
  201. if sigma > 0:
  202. print (f"Double: Bridges {i} and {j}: max={max_count}, mean={mu}, std={sigma}")
  203. print (f"Double: Zero is {mu / sigma} standard deviations away from the mean ({mu})")
  204. print (f"Double: We are looking at data from {len(vals)} days, starting on {start_date}\n")