analysis.rs 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462
  1. use crate::{BridgeInfo, BridgeInfoType};
  2. use lox_library::proto::trust_promotion::UNTRUSTED_INTERVAL;
  3. use nalgebra::{Cholesky, DMatrix, DVector};
  4. use rand::Rng;
  5. use statrs::distribution::{Continuous, MultivariateNormal, Normal};
  6. use std::{
  7. cmp::min,
  8. collections::{BTreeMap, HashSet},
  9. };
  10. /// Provides a function for predicting which countries block this bridge
  11. pub trait Analyzer {
  12. /// Evaluate open-entry bridge. Returns true if blocked, false otherwise.
  13. fn stage_one(
  14. &self,
  15. confidence: f64,
  16. bridge_ips: &[u32],
  17. bridge_ips_today: u32,
  18. negative_reports: &[u32],
  19. negative_reports_today: u32,
  20. ) -> bool;
  21. /// Evaluate invite-only bridge without positive reports. Return true if
  22. /// blocked, false otherwise.
  23. fn stage_two(
  24. &self,
  25. confidence: f64,
  26. bridge_ips: &[u32],
  27. bridge_ips_today: u32,
  28. negative_reports: &[u32],
  29. negative_reports_today: u32,
  30. ) -> bool;
  31. /// Evaluate invite-only bridge with positive reports. Return true if
  32. /// blocked, false otherwise.
  33. fn stage_three(
  34. &self,
  35. confidence: f64,
  36. bridge_ips: &[u32],
  37. bridge_ips_today: u32,
  38. negative_reports: &[u32],
  39. negative_reports_today: u32,
  40. positive_reports: &[u32],
  41. positive_reports_today: u32,
  42. ) -> bool;
  43. }
  44. /// Accepts an analyzer, information about a bridge, and a confidence value.
  45. /// Returns a set of country codes where the bridge is believed to be blocked.
  46. pub fn blocked_in(
  47. analyzer: &dyn Analyzer,
  48. bridge_info: &BridgeInfo,
  49. confidence: f64,
  50. date: u32,
  51. ) -> HashSet<String> {
  52. let mut blocked_in = HashSet::<String>::new();
  53. let today = date;
  54. for (country, info) in &bridge_info.info_by_country {
  55. let age = today - info.first_seen;
  56. if info.blocked {
  57. // Assume bridges never become unblocked
  58. blocked_in.insert(country.to_string());
  59. } else {
  60. // Get today's values
  61. let new_map_binding = BTreeMap::<BridgeInfoType, u32>::new();
  62. // TODO: Evaluate on yesterday if we don't have data for today?
  63. let today_info = match info.info_by_day.get(&today) {
  64. Some(v) => v,
  65. None => &new_map_binding,
  66. };
  67. let bridge_ips_today = match today_info.get(&BridgeInfoType::BridgeIps) {
  68. Some(&v) => v,
  69. None => 0,
  70. };
  71. let negative_reports_today = match today_info.get(&BridgeInfoType::NegativeReports) {
  72. Some(&v) => v,
  73. None => 0,
  74. };
  75. let positive_reports_today = match today_info.get(&BridgeInfoType::PositiveReports) {
  76. Some(&v) => v,
  77. None => 0,
  78. };
  79. let num_days = min(age, UNTRUSTED_INTERVAL);
  80. // Get time series for last num_days
  81. let mut bridge_ips = vec![0; num_days as usize];
  82. let mut negative_reports = vec![0; num_days as usize];
  83. let mut positive_reports = vec![0; num_days as usize];
  84. for i in 0..num_days {
  85. let date = today - num_days + i - 1;
  86. let new_map_binding = BTreeMap::<BridgeInfoType, u32>::new();
  87. let day_info = match info.info_by_day.get(&date) {
  88. Some(v) => v,
  89. None => &new_map_binding,
  90. };
  91. bridge_ips[i as usize] = match day_info.get(&BridgeInfoType::BridgeIps) {
  92. Some(&v) => v,
  93. None => 0,
  94. };
  95. negative_reports[i as usize] = match day_info.get(&BridgeInfoType::NegativeReports)
  96. {
  97. Some(&v) => v,
  98. None => 0,
  99. };
  100. positive_reports[i as usize] = match day_info.get(&BridgeInfoType::PositiveReports)
  101. {
  102. Some(&v) => v,
  103. None => 0,
  104. };
  105. }
  106. // Evaluate using appropriate stage based on age of the bridge
  107. if age < UNTRUSTED_INTERVAL {
  108. // open-entry bridge
  109. if analyzer.stage_one(
  110. confidence,
  111. &bridge_ips,
  112. bridge_ips_today,
  113. &negative_reports,
  114. negative_reports_today,
  115. ) {
  116. blocked_in.insert(country.to_string());
  117. }
  118. } else if info.first_pr.is_none() || today < info.first_pr.unwrap() + UNTRUSTED_INTERVAL
  119. {
  120. // invite-only bridge without 30+ days of historical data on
  121. // positive reports
  122. if analyzer.stage_two(
  123. confidence,
  124. &bridge_ips,
  125. bridge_ips_today,
  126. &negative_reports,
  127. negative_reports_today,
  128. ) {
  129. blocked_in.insert(country.to_string());
  130. }
  131. } else {
  132. // invite-only bridge that has been up long enough that it
  133. // might have 30+ days of historical data on positive reports
  134. if analyzer.stage_three(
  135. confidence,
  136. &bridge_ips,
  137. bridge_ips_today,
  138. &negative_reports,
  139. negative_reports_today,
  140. &positive_reports,
  141. positive_reports_today,
  142. ) {
  143. blocked_in.insert(country.to_string());
  144. }
  145. }
  146. }
  147. }
  148. blocked_in
  149. }
  150. // Analyzer implementations
  151. /// Dummy example that never thinks bridges are blocked
  152. pub struct ExampleAnalyzer {}
  153. impl Analyzer for ExampleAnalyzer {
  154. fn stage_one(
  155. &self,
  156. _confidence: f64,
  157. _bridge_ips: &[u32],
  158. _bridge_ips_today: u32,
  159. _negative_reports: &[u32],
  160. _negative_reports_today: u32,
  161. ) -> bool {
  162. false
  163. }
  164. fn stage_two(
  165. &self,
  166. _confidence: f64,
  167. _bridge_ips: &[u32],
  168. _bridge_ips_today: u32,
  169. _negative_reports: &[u32],
  170. _negative_reports_today: u32,
  171. ) -> bool {
  172. false
  173. }
  174. fn stage_three(
  175. &self,
  176. _confidence: f64,
  177. _bridge_ips: &[u32],
  178. _bridge_ips_today: u32,
  179. _negative_reports: &[u32],
  180. _negative_reports_today: u32,
  181. _positive_reports: &[u32],
  182. _positive_reports_today: u32,
  183. ) -> bool {
  184. false
  185. }
  186. }
  187. /// Model data as multivariate normal distribution
  188. pub struct NormalAnalyzer {
  189. max_threshold: u32,
  190. scaling_factor: f64,
  191. }
  192. impl NormalAnalyzer {
  193. pub fn new(max_threshold: u32, scaling_factor: f64) -> Self {
  194. Self {
  195. max_threshold,
  196. scaling_factor,
  197. }
  198. }
  199. // Returns the mean vector, vector of individual standard deviations, and
  200. // covariance matrix. If the standard deviation for a variable is 0 and/or
  201. // the covariance matrix is not positive definite, add some noise to the
  202. // data and recompute.
  203. fn stats(data: &[&[u32]]) -> (Vec<f64>, Vec<f64>, Vec<f64>) {
  204. let n = data.len();
  205. // Compute mean and standard deviation vectors
  206. let (mean_vec, sd_vec) = {
  207. let mut mean_vec = Vec::<f64>::new();
  208. let mut sd_vec = Vec::<f64>::new();
  209. for var in data {
  210. // Compute mean
  211. let mut sum = 0.0;
  212. for count in *var {
  213. sum += *count as f64;
  214. }
  215. let mean = sum / var.len() as f64;
  216. // Compute standard deviation
  217. let mut sum = 0.0;
  218. for count in *var {
  219. sum += (*count as f64 - mean).powi(2);
  220. }
  221. let sd = (sum / var.len() as f64).sqrt();
  222. mean_vec.push(mean);
  223. sd_vec.push(sd);
  224. }
  225. (mean_vec, sd_vec)
  226. };
  227. // Compute covariance matrix
  228. let cov_mat = {
  229. let mut cov_mat = Vec::<f64>::new();
  230. // We don't need to recompute Syx, but we currently do
  231. for i in 0..n {
  232. for j in 0..n {
  233. cov_mat.push({
  234. let var1 = data[i];
  235. let var1_mean = mean_vec[i];
  236. let var2 = data[j];
  237. let var2_mean = mean_vec[j];
  238. assert_eq!(var1.len(), var2.len());
  239. let mut sum = 0.0;
  240. for index in 0..var1.len() {
  241. sum +=
  242. (var1[index] as f64 - var1_mean) * (var2[index] as f64 - var2_mean);
  243. }
  244. sum / (var1.len() - 1) as f64
  245. });
  246. }
  247. }
  248. cov_mat
  249. };
  250. // If any standard deviation is 0 or the covariance matrix is not
  251. // positive definite, add some noise and recompute.
  252. let mut recompute = false;
  253. for sd in &sd_vec {
  254. if *sd <= 0.0 {
  255. recompute = true;
  256. }
  257. }
  258. if Cholesky::new(DMatrix::from_vec(n, n, cov_mat.clone())).is_none() {
  259. recompute = true;
  260. }
  261. if !recompute {
  262. (mean_vec, sd_vec, cov_mat)
  263. } else {
  264. // Add random noise and recompute
  265. let mut new_data = vec![vec![0; data[0].len()]; n];
  266. let mut rng = rand::thread_rng();
  267. for i in 0..n {
  268. for j in 0..data[i].len() {
  269. // Add 1 to some randomly selected values
  270. new_data[i][j] = data[i][j] + rng.gen_range(0..=1);
  271. }
  272. }
  273. // Compute stats on modified data
  274. Self::stats(&new_data.iter().map(Vec::as_slice).collect::<Vec<&[u32]>>())
  275. }
  276. }
  277. }
  278. impl Analyzer for NormalAnalyzer {
  279. /// Evaluate open-entry bridge based on only today's data
  280. fn stage_one(
  281. &self,
  282. _confidence: f64,
  283. _bridge_ips: &[u32],
  284. bridge_ips_today: u32,
  285. _negative_reports: &[u32],
  286. negative_reports_today: u32,
  287. ) -> bool {
  288. negative_reports_today > self.max_threshold
  289. || f64::from(negative_reports_today) > self.scaling_factor * f64::from(bridge_ips_today)
  290. }
  291. /// Evaluate invite-only bridge based on last 30 days
  292. fn stage_two(
  293. &self,
  294. confidence: f64,
  295. bridge_ips: &[u32],
  296. bridge_ips_today: u32,
  297. negative_reports: &[u32],
  298. negative_reports_today: u32,
  299. ) -> bool {
  300. assert!(bridge_ips.len() >= UNTRUSTED_INTERVAL as usize);
  301. assert_eq!(bridge_ips.len(), negative_reports.len());
  302. let alpha = 1.0 - confidence;
  303. let (mean_vec, sd_vec, cov_mat) = Self::stats(&[bridge_ips, negative_reports]);
  304. let negative_reports_mean = mean_vec[1];
  305. let bridge_ips_sd = sd_vec[0];
  306. let negative_reports_sd = sd_vec[1];
  307. // Artificially create data for alternative hypothesis
  308. let num_days = bridge_ips.len() as usize;
  309. let mut bridge_ips_blocked = vec![0; num_days];
  310. let mut negative_reports_blocked = vec![0; num_days];
  311. let bridge_ips_deviation = (2.0 * bridge_ips_sd).round() as u32;
  312. for i in 0..num_days {
  313. // Suppose bridge stats will go down by 2 SDs
  314. bridge_ips_blocked[i] = if bridge_ips_deviation > bridge_ips[i] {
  315. 0
  316. } else {
  317. bridge_ips[i] - bridge_ips_deviation
  318. };
  319. // Suppose negative reports will go up by 2 SDs
  320. negative_reports_blocked[i] =
  321. negative_reports[i] + (2.0 * negative_reports_sd).round() as u32;
  322. }
  323. let (mean_vec_blocked, _sd_vec_blocked, cov_mat_blocked) =
  324. Self::stats(&[&bridge_ips_blocked, &negative_reports_blocked]);
  325. let mvn = MultivariateNormal::new(mean_vec, cov_mat).unwrap();
  326. let pdf = mvn.pdf(&DVector::from_vec(vec![
  327. bridge_ips_today as f64,
  328. negative_reports_today as f64,
  329. ]));
  330. let mvn_blocked = MultivariateNormal::new(mean_vec_blocked, cov_mat_blocked).unwrap();
  331. let pdf_blocked = mvn_blocked.pdf(&DVector::from_vec(vec![
  332. bridge_ips_today as f64,
  333. negative_reports_today as f64,
  334. ]));
  335. // Also model negative reports in isolation
  336. let nr_normal = Normal::new(negative_reports_mean, negative_reports_sd).unwrap();
  337. let nr_pdf = nr_normal.pdf(negative_reports_today as f64);
  338. let nr_normal_blocked = Normal::new(
  339. negative_reports_mean + 2.0 * negative_reports_sd,
  340. negative_reports_sd,
  341. )
  342. .unwrap();
  343. let nr_pdf_blocked = nr_normal_blocked.pdf(negative_reports_today as f64);
  344. (pdf / pdf_blocked).ln() < alpha || (nr_pdf / nr_pdf_blocked).ln() < alpha
  345. }
  346. /// Evaluate invite-only bridge with lv3+ users submitting positive reports
  347. fn stage_three(
  348. &self,
  349. confidence: f64,
  350. bridge_ips: &[u32],
  351. bridge_ips_today: u32,
  352. negative_reports: &[u32],
  353. negative_reports_today: u32,
  354. positive_reports: &[u32],
  355. positive_reports_today: u32,
  356. ) -> bool {
  357. assert!(bridge_ips.len() >= UNTRUSTED_INTERVAL as usize);
  358. assert_eq!(bridge_ips.len(), negative_reports.len());
  359. assert_eq!(bridge_ips.len(), positive_reports.len());
  360. let alpha = 1.0 - confidence;
  361. let (mean_vec, sd_vec, cov_mat) =
  362. Self::stats(&[bridge_ips, negative_reports, positive_reports]);
  363. let negative_reports_mean = mean_vec[1];
  364. let bridge_ips_sd = sd_vec[0];
  365. let negative_reports_sd = sd_vec[1];
  366. let positive_reports_sd = sd_vec[2];
  367. // Artificially create data for alternative hypothesis
  368. let num_days = bridge_ips.len() as usize;
  369. let mut bridge_ips_blocked = vec![0; num_days];
  370. let mut negative_reports_blocked = vec![0; num_days];
  371. let mut positive_reports_blocked = vec![0; num_days];
  372. let bridge_ips_deviation = (2.0 * bridge_ips_sd).round() as u32;
  373. let positive_reports_deviation = (2.0 * positive_reports_sd).round() as u32;
  374. for i in 0..num_days {
  375. // Suppose positive reports will go down by 2 SDs
  376. positive_reports_blocked[i] = if positive_reports_deviation > positive_reports[i] {
  377. 0
  378. } else {
  379. positive_reports[i] - positive_reports_deviation
  380. };
  381. // Suppose bridge stats will go down by 2 SDs
  382. bridge_ips_blocked[i] = if bridge_ips_deviation > bridge_ips[i] {
  383. 0
  384. } else {
  385. bridge_ips[i] - bridge_ips_deviation
  386. };
  387. // Suppose each user who would have submitted a positive report but
  388. // didn't submits a negative report instead.
  389. negative_reports_blocked[i] =
  390. negative_reports[i] + positive_reports[i] - positive_reports_blocked[i];
  391. }
  392. let (mean_vec_blocked, _sd_vec_blocked, cov_mat_blocked) = Self::stats(&[
  393. &bridge_ips_blocked,
  394. &negative_reports_blocked,
  395. &positive_reports_blocked,
  396. ]);
  397. let mvn = MultivariateNormal::new(mean_vec, cov_mat).unwrap();
  398. let pdf = mvn.pdf(&DVector::from_vec(vec![
  399. bridge_ips_today as f64,
  400. negative_reports_today as f64,
  401. positive_reports_today as f64,
  402. ]));
  403. let mvn_blocked = MultivariateNormal::new(mean_vec_blocked, cov_mat_blocked).unwrap();
  404. let pdf_blocked = mvn_blocked.pdf(&DVector::from_vec(vec![
  405. bridge_ips_today as f64,
  406. negative_reports_today as f64,
  407. positive_reports_today as f64,
  408. ]));
  409. // Also model negative reports in isolation
  410. let nr_normal = Normal::new(negative_reports_mean, negative_reports_sd).unwrap();
  411. let nr_pdf = nr_normal.pdf(negative_reports_today as f64);
  412. // Note we do NOT make this a function of positive signals
  413. let nr_normal_blocked = Normal::new(
  414. negative_reports_mean + 2.0 * negative_reports_sd,
  415. negative_reports_sd,
  416. )
  417. .unwrap();
  418. let nr_pdf_blocked = nr_normal_blocked.pdf(negative_reports_today as f64);
  419. (pdf / pdf_blocked).ln() < alpha || (nr_pdf / nr_pdf_blocked).ln() < alpha
  420. }
  421. }