get-stats.py 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217
  1. #!/usr/bin/env python3
  2. import csv
  3. import numpy
  4. import os
  5. import sys
  6. # Starting day: February Nth
  7. #N = 1
  8. N = 20
  9. # See note in readme
  10. JAN_31 = 2459245
  11. # 2021 February Nth as Julian date
  12. FIRST_DAY = JAN_31 + N
  13. TOTAL_BRIDGES = 1890
  14. OBFS4_EMAIL_BRIDGES = 93
  15. def sigfigs(n):
  16. if n == 0.0:
  17. n = 0 # as an int
  18. else:
  19. i=0
  20. while n * (10**i) < 1:
  21. i += 1
  22. n = round(n * 10**i) / 10**i
  23. return n
  24. email_bridges = set()
  25. with open ("data/obfs4-email-bridges", 'r') as f:
  26. for line in f:
  27. if line != "":
  28. email_bridges.add(line.strip())
  29. rel_table = """
  30. \\hline
  31. $h$ & \\textbf{TP} & \\textbf{TN} & \\textbf{FP} & \\textbf{FN} & \\textbf{Precision} & \\textbf{Recall} \\\\
  32. \\hline
  33. """
  34. abs_table = rel_table
  35. nomin_table = rel_table
  36. for harshness in range(5):
  37. for suffix in ["", "_abs", "_nomin"]:
  38. with open (f"data/blocked_{harshness}{suffix}", 'r') as f:
  39. # obfs4 email bridges correctly identified as blocked
  40. correct = 0
  41. # obfs4 email bridges identified as blocked before they actually were
  42. too_soon = 0
  43. # non-obfs4-email bridges incorrectly identified as blocked
  44. incorrect = 0
  45. for line in f:
  46. if line != "":
  47. line = line.strip()
  48. fingerprint = line[:40]
  49. date = int(line[41:])
  50. if fingerprint in email_bridges:
  51. if date >= FIRST_DAY:
  52. correct += 1
  53. else:
  54. too_soon += 1
  55. else:
  56. incorrect += 1
  57. tn = TOTAL_BRIDGES - OBFS4_EMAIL_BRIDGES - incorrect
  58. tp = correct
  59. fn = OBFS4_EMAIL_BRIDGES - correct - too_soon
  60. fp = too_soon + incorrect
  61. precision = sigfigs(tp / (tp + fp))
  62. recall = sigfigs(tp / (tp + fn))
  63. newline = f"{harshness} & {tp} & {tn} & {fp} & {fn} & {precision} & {recall} \\\\\n"
  64. if suffix == "":
  65. rel_table += newline
  66. elif suffix == "_abs":
  67. abs_table += newline
  68. else:
  69. nomin_table += newline
  70. print ("Absolute threshold without a minimum:")
  71. print (nomin_table)
  72. print ("Absolute threshold:")
  73. print (abs_table)
  74. print ("Relative threshold:")
  75. print (rel_table)
  76. # Now let's look at stddevs
  77. email_bridges = list(email_bridges)
  78. email_bridge_data = []
  79. email_bridge_max = []
  80. for fingerprint in email_bridges:
  81. # We're going to get all the data for each bridge
  82. bridge_data = dict()
  83. begun = False
  84. max_count = 0
  85. filename = f"data/bridge_data_cleaned/{fingerprint.upper()}"
  86. if os.path.isfile(filename) and os.path.getsize(filename) > 0:
  87. with open(filename, 'r') as csvfile:
  88. data = csv.reader(csvfile, delimiter=',')
  89. for line in data:
  90. # Ignore 0 values until we see a non-zero value
  91. if not begun:
  92. if line[1] != "0":
  93. begun = True
  94. if begun:
  95. date = int(line[0][:line[0].find(' ')])
  96. val = int(line[1])
  97. bridge_data[date] = val
  98. max_count = max(max_count, val)
  99. if begun:
  100. email_bridge_data.append(bridge_data)
  101. email_bridge_max.append(max_count)
  102. # Look at bridges individually
  103. for i in range(len(email_bridge_data)):
  104. bridge = email_bridge_data[i]
  105. vals = []
  106. # Get smallest key, i.e., first date we have data for
  107. start_date = min(bridge)
  108. # Get counts before censorship started
  109. #for d in range(start_date, FIRST_DAY):
  110. # if d in bridge:
  111. # vals.append(bridge[d])
  112. # If this day is not represented, the bridge did not report
  113. # stats; this is different from 0.
  114. # Note: This is cheaper than the above impelmentation.
  115. for date, val in bridge.items():
  116. if date < FIRST_DAY:
  117. vals.append(val)
  118. # If we have no data, don't worry about it
  119. if len(vals) == 0:
  120. continue
  121. mu = numpy.mean(vals)
  122. sigma = numpy.std(vals)
  123. if sigma > 0:
  124. print (f"Single: Bridge {i}: max={email_bridge_max[i]}, mean={mu}, std={sigma}")
  125. print (f"Single: Zero is {mu / sigma} standard deviations away from the mean ({mu})")
  126. print (f"Single: We are looking at data from {len(vals)} days, starting on {start_date}\n")
  127. # Look at pairs of bridges
  128. for i in range(len(email_bridge_data)):
  129. for j in range(i+1, len(email_bridge_data)):
  130. max_count = 0
  131. bridge_i = email_bridge_data[i]
  132. bridge_j = email_bridge_data[j]
  133. vals = []
  134. # Get smallest key, i.e., the first date BOTH bridges have data for
  135. start_date = max(min(bridge_i), min(bridge_j))
  136. # Get counts before censorship started
  137. #for d in range(start_date, FIRST_DAY):
  138. # Get set of keys between start_date and FIRST_DAY
  139. keys = set()
  140. for d in bridge_i:
  141. if d >= start_date and d <= FIRST_DAY:
  142. keys.add(d)
  143. for d in bridge_j:
  144. if d >= start_date and d <= FIRST_DAY:
  145. keys.add(d)
  146. for d in keys:
  147. val = 0
  148. if d in bridge_i and d in bridge_j:
  149. val = bridge_i[d] + bridge_j[d]
  150. elif d in bridge_i:
  151. val = bridge_i[d]
  152. elif d in bridge_j:
  153. val = bridge_j[d]
  154. vals.append(val)
  155. max_count = max(max_count, val)
  156. # If we have no data, don't worry about it
  157. if len(vals) == 0:
  158. continue
  159. mu = numpy.mean(vals)
  160. sigma = numpy.std(vals)
  161. if sigma > 0:
  162. print (f"Double: Bridges {i} and {j}: max={max_count}, mean={mu}, std={sigma}")
  163. print (f"Double: Zero is {mu / sigma} standard deviations away from the mean ({mu})")
  164. print (f"Double: We are looking at data from {len(vals)} days, starting on {start_date}\n")