analysis.rs 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452
  1. use crate::{BridgeInfo, BridgeInfoType};
  2. use lox_library::proto::trust_promotion::UNTRUSTED_INTERVAL;
  3. use nalgebra::DVector;
  4. use statrs::distribution::{Continuous, ContinuousCDF, MultivariateNormal, Normal};
  5. use statrs::statistics::Statistics;
  6. use std::{
  7. cmp::min,
  8. collections::{BTreeMap, HashSet},
  9. };
  10. #[cfg(feature = "simulation")]
  11. use crate::get_date;
  /// Strategy for deciding whether a bridge is blocked in a single country.
  ///
  /// [`blocked_in`] calls exactly one of the three stages per country,
  /// depending on how long the bridge has been known and on whether positive
  /// reports are available. Every stage receives the same kind of inputs:
  ///
  /// * `age` - number of days since the bridge was first seen in the country.
  /// * `confidence` - the confidence value handed to [`blocked_in`], forwarded
  ///   unchanged; how it is interpreted is up to the implementation.
  /// * `*_today` - the count recorded for the day being evaluated (0 if none).
  /// * the slice arguments - historical daily counts, oldest day first, one
  ///   entry per day; all slices passed to one call have the same length.
  pub trait Analyzer {
      /// Judges a bridge that is younger than the untrusted interval or that
      /// does not yet have the minimum number of historical days.
      ///
      /// Returns `true` if the bridge is considered blocked, `false` otherwise.
      fn stage_one(
          &self,
          age: u32,
          confidence: f64,
          bridge_ips: &[u32],
          bridge_ips_today: u32,
          negative_reports: &[u32],
          negative_reports_today: u32,
      ) -> bool;

      /// Judges an older bridge for which positive reports cannot be used yet
      /// (none seen so far, or not enough days since the first one).
      ///
      /// Returns `true` if the bridge is considered blocked, `false` otherwise.
      fn stage_two(
          &self,
          age: u32,
          confidence: f64,
          bridge_ips: &[u32],
          bridge_ips_today: u32,
          negative_reports: &[u32],
          negative_reports_today: u32,
      ) -> bool;

      /// Judges an older bridge using positive reports in addition to the
      /// bridge IP counts and negative reports.
      ///
      /// Returns `true` if the bridge is considered blocked, `false` otherwise.
      fn stage_three(
          &self,
          age: u32,
          confidence: f64,
          bridge_ips: &[u32],
          bridge_ips_today: u32,
          negative_reports: &[u32],
          negative_reports_today: u32,
          positive_reports: &[u32],
          positive_reports_today: u32,
      ) -> bool;
  }
  49. /// Accepts an analyzer, information about a bridge, and a confidence value.
  50. /// Returns a set of country codes where the bridge is believed to be blocked.
  51. pub fn blocked_in(
  52. analyzer: &dyn Analyzer,
  53. bridge_info: &BridgeInfo,
  54. confidence: f64,
  55. date: u32,
  56. min_historical_days: u32,
  57. max_historical_days: u32,
  58. ) -> HashSet<String> {
  59. let mut blocked_in = HashSet::<String>::new();
  60. let today = date;
  61. for (country, info) in &bridge_info.info_by_country {
  62. let age = today - info.first_seen;
  63. if info.blocked {
  64. // Assume bridges never become unblocked
  65. blocked_in.insert(country.to_string());
  66. } else {
  67. // Get today's values
  68. let new_map_binding = BTreeMap::<BridgeInfoType, u32>::new();
  69. // TODO: Evaluate on yesterday if we don't have data for today?
  70. let today_info = match info.info_by_day.get(&today) {
  71. Some(v) => v,
  72. None => &new_map_binding,
  73. };
  74. let bridge_ips_today = match today_info.get(&BridgeInfoType::BridgeIps) {
  75. Some(&v) => v,
  76. None => 0,
  77. };
  78. let negative_reports_today = match today_info.get(&BridgeInfoType::NegativeReports) {
  79. Some(&v) => v,
  80. None => 0,
  81. };
  82. let positive_reports_today = match today_info.get(&BridgeInfoType::PositiveReports) {
  83. Some(&v) => v,
  84. None => 0,
  85. };
  86. let num_days = min(age, max_historical_days);
  87. // Get time series for last num_days
  88. let mut bridge_ips = vec![0; num_days as usize];
  89. let mut negative_reports = vec![0; num_days as usize];
  90. let mut positive_reports = vec![0; num_days as usize];
  91. for i in 0..num_days {
  92. let date = today - num_days + i - 1;
  93. let new_map_binding = BTreeMap::<BridgeInfoType, u32>::new();
  94. let day_info = match info.info_by_day.get(&date) {
  95. Some(v) => v,
  96. None => &new_map_binding,
  97. };
  98. bridge_ips[i as usize] = match day_info.get(&BridgeInfoType::BridgeIps) {
  99. Some(&v) => v,
  100. None => 0,
  101. };
  102. negative_reports[i as usize] = match day_info.get(&BridgeInfoType::NegativeReports)
  103. {
  104. Some(&v) => v,
  105. None => 0,
  106. };
  107. positive_reports[i as usize] = match day_info.get(&BridgeInfoType::PositiveReports)
  108. {
  109. Some(&v) => v,
  110. None => 0,
  111. };
  112. }
  113. // Evaluate using appropriate stage based on age of the bridge
  114. if age < UNTRUSTED_INTERVAL || age < min_historical_days {
  115. // open-entry bridge and/or not enough days of
  116. // historical days for stages 2 and 3
  117. if analyzer.stage_one(
  118. age,
  119. confidence,
  120. &bridge_ips,
  121. bridge_ips_today,
  122. &negative_reports,
  123. negative_reports_today,
  124. ) {
  125. blocked_in.insert(country.to_string());
  126. }
  127. } else if info.first_pr.is_none()
  128. || today < info.first_pr.unwrap() + min_historical_days
  129. {
  130. // invite-only bridge without min_historical_days of
  131. // historical data on positive reports
  132. if analyzer.stage_two(
  133. age,
  134. confidence,
  135. &bridge_ips,
  136. bridge_ips_today,
  137. &negative_reports,
  138. negative_reports_today,
  139. ) {
  140. blocked_in.insert(country.to_string());
  141. }
  142. } else {
  143. // invite-only bridge that has min_historical_days or
  144. // more of historical data since the first positive report
  145. if analyzer.stage_three(
  146. age,
  147. confidence,
  148. &bridge_ips,
  149. bridge_ips_today,
  150. &negative_reports,
  151. negative_reports_today,
  152. &positive_reports,
  153. positive_reports_today,
  154. ) {
  155. blocked_in.insert(country.to_string());
  156. } else {
  157. // Logging in simulation mode
  158. #[cfg(feature = "simulation")]
  159. if analyzer.stage_two(
  160. age,
  161. confidence,
  162. &bridge_ips,
  163. bridge_ips_today,
  164. &negative_reports,
  165. negative_reports_today,
  166. ) {
  167. println!(
  168. "{} detected not blocked due to positive reports on day {}",
  169. array_bytes::bytes2hex("", bridge_info.fingerprint),
  170. get_date()
  171. );
  172. }
  173. }
  174. }
  175. }
  176. }
  177. blocked_in
  178. }
  179. // Analyzer implementations
  180. /// Dummy example that never thinks bridges are blocked
  181. pub struct ExampleAnalyzer {}
  182. impl Analyzer for ExampleAnalyzer {
  183. fn stage_one(
  184. &self,
  185. _age: u32,
  186. _confidence: f64,
  187. _bridge_ips: &[u32],
  188. _bridge_ips_today: u32,
  189. _negative_reports: &[u32],
  190. _negative_reports_today: u32,
  191. ) -> bool {
  192. false
  193. }
  194. fn stage_two(
  195. &self,
  196. _age: u32,
  197. _confidence: f64,
  198. _bridge_ips: &[u32],
  199. _bridge_ips_today: u32,
  200. _negative_reports: &[u32],
  201. _negative_reports_today: u32,
  202. ) -> bool {
  203. false
  204. }
  205. fn stage_three(
  206. &self,
  207. _age: u32,
  208. _confidence: f64,
  209. _bridge_ips: &[u32],
  210. _bridge_ips_today: u32,
  211. _negative_reports: &[u32],
  212. _negative_reports_today: u32,
  213. _positive_reports: &[u32],
  214. _positive_reports_today: u32,
  215. ) -> bool {
  216. false
  217. }
  218. }
  /// Analyzer that models a bridge's historical daily counts with normal
  /// distributions (univariate for stage two, multivariate for stage three).
  pub struct NormalAnalyzer {
      /// Stage one reports a block when today's negative reports exceed this
      /// absolute count.
      max_threshold: u32,
      /// Stage one reports a block when today's negative reports exceed this
      /// multiple of today's bridge IP count.
      scaling_factor: f64,
  }

  impl NormalAnalyzer {
      /// Builds an analyzer from the two stage-one limits.
      ///
      /// * `max_threshold` - absolute cap on negative reports per day.
      /// * `scaling_factor` - cap on negative reports relative to the number
      ///   of bridge IPs seen the same day.
      pub fn new(max_threshold: u32, scaling_factor: f64) -> Self {
          NormalAnalyzer {
              max_threshold,
              scaling_factor,
          }
      }
  }
  232. impl Analyzer for NormalAnalyzer {
  233. /// Evaluate open-entry bridge based on only today's data
  234. fn stage_one(
  235. &self,
  236. _age: u32,
  237. _confidence: f64,
  238. _bridge_ips: &[u32],
  239. bridge_ips_today: u32,
  240. _negative_reports: &[u32],
  241. negative_reports_today: u32,
  242. ) -> bool {
  243. negative_reports_today > self.max_threshold
  244. || f64::from(negative_reports_today) > self.scaling_factor * f64::from(bridge_ips_today)
  245. }
  246. /// Evaluate invite-only bridge based on historical data
  247. fn stage_two(
  248. &self,
  249. _age: u32,
  250. confidence: f64,
  251. bridge_ips: &[u32],
  252. bridge_ips_today: u32,
  253. negative_reports: &[u32],
  254. negative_reports_today: u32,
  255. ) -> bool {
  256. assert!(bridge_ips.len() >= UNTRUSTED_INTERVAL as usize);
  257. assert_eq!(bridge_ips.len(), negative_reports.len());
  258. let alpha = 1.0 - confidence;
  259. // Convert to f64 for stats
  260. let bridge_ips_f64 = &bridge_ips.iter().map(|n| *n as f64).collect::<Vec<f64>>();
  261. let negative_reports_f64 = &negative_reports
  262. .iter()
  263. .map(|n| *n as f64)
  264. .collect::<Vec<f64>>();
  265. // Evaluate based on negative reports
  266. let negative_reports_mean = negative_reports_f64.mean();
  267. let negative_reports_sd = negative_reports_f64.std_dev();
  268. // Only use CCDF test if today's numbers are worse than average
  269. if (negative_reports_today as f64) > negative_reports_mean {
  270. let nr_normal = Normal::new(negative_reports_mean, negative_reports_sd);
  271. if negative_reports_sd > 0.0 {
  272. // We use CCDF because more negative reports is worse.
  273. if (1.0 - nr_normal.unwrap().cdf(negative_reports_today as f64)) < alpha {
  274. return true;
  275. }
  276. } else {
  277. // If the standard deviation is 0, we need another option.
  278. // Consider the bridge blocked negative reports increase by
  279. // more than 1 after a long static period. (Note that the
  280. // mean is the exact value because we had no deviation.)
  281. if (negative_reports_today as f64) > negative_reports_mean + 1.0 {
  282. return true;
  283. }
  284. }
  285. }
  286. // Evaluate based on bridge stats
  287. let bridge_ips_mean = bridge_ips_f64.mean();
  288. let bridge_ips_sd = bridge_ips_f64.std_dev();
  289. // Only use CDF test if today's numbers are worse than average
  290. if (bridge_ips_today as f64) < bridge_ips_mean {
  291. let bip_normal = Normal::new(bridge_ips_mean, bridge_ips_sd);
  292. if bridge_ips_sd > 0.0 {
  293. if bip_normal.unwrap().cdf(bridge_ips_today as f64) < alpha {
  294. return true;
  295. }
  296. } else {
  297. // If the standard deviation is 0, we need another option.
  298. // Consider the bridge blocked if its usage dropped by more
  299. // than 1 bin. (Note that the mean is the exact value
  300. // because we had no deviation.)
  301. if (bridge_ips_today as f64) < bridge_ips_mean - 8.0 {
  302. return true;
  303. }
  304. }
  305. }
  306. // If none of the tests concluded that the bridge is blocked,
  307. // return false
  308. false
  309. }
  310. /// Evaluate invite-only bridge with lv3+ users submitting positive reports
  311. fn stage_three(
  312. &self,
  313. age: u32,
  314. confidence: f64,
  315. bridge_ips: &[u32],
  316. bridge_ips_today: u32,
  317. negative_reports: &[u32],
  318. negative_reports_today: u32,
  319. positive_reports: &[u32],
  320. positive_reports_today: u32,
  321. ) -> bool {
  322. assert!(bridge_ips.len() >= UNTRUSTED_INTERVAL as usize);
  323. assert_eq!(bridge_ips.len(), negative_reports.len());
  324. assert_eq!(bridge_ips.len(), positive_reports.len());
  325. let alpha = 1.0 - confidence;
  326. // Convert to f64 for stats
  327. let bridge_ips_f64 = &bridge_ips.iter().map(|n| *n as f64).collect::<Vec<f64>>();
  328. let negative_reports_f64 = &negative_reports
  329. .iter()
  330. .map(|n| *n as f64)
  331. .collect::<Vec<f64>>();
  332. let positive_reports_f64 = &positive_reports
  333. .iter()
  334. .map(|n| *n as f64)
  335. .collect::<Vec<f64>>();
  336. // Evaluate based on negative reports. It is better to compute
  337. // negative reports test first because the positive test may be
  338. // expensive.
  339. let negative_reports_mean = negative_reports_f64.mean();
  340. let negative_reports_sd = negative_reports_f64.std_dev();
  341. // Only use CCDF test if today's numbers are worse than average
  342. if (negative_reports_today as f64) > negative_reports_mean {
  343. let nr_normal = Normal::new(negative_reports_mean, negative_reports_sd);
  344. if negative_reports_sd > 0.0 {
  345. // We use CCDF because more negative reports is worse.
  346. if (1.0 - nr_normal.unwrap().cdf(negative_reports_today as f64)) < alpha {
  347. return true;
  348. }
  349. } else {
  350. // Consider the bridge blocked negative reports increase by
  351. // more than 1 after a long static period. (Note that the
  352. // mean is the exact value because we had no deviation.)
  353. if (negative_reports_today as f64) > negative_reports_mean + 1.0 {
  354. return true;
  355. }
  356. }
  357. }
  358. // Evaluate based on bridge stats and positive reports.
  359. let bridge_ips_mean = bridge_ips_f64.mean();
  360. let positive_reports_mean = positive_reports_f64.mean();
  361. let cov_mat = {
  362. let x = bridge_ips_f64;
  363. let y = positive_reports_f64;
  364. let xx = x.covariance(x);
  365. let xy = x.covariance(y);
  366. let yy = y.covariance(y);
  367. vec![xx, xy, xy, yy]
  368. };
  369. // Only use CDF test if today's numbers are worse than average
  370. if (bridge_ips_today as f64) < bridge_ips_mean
  371. || (positive_reports_today as f64) < positive_reports_mean
  372. {
  373. let mvn =
  374. MultivariateNormal::new(vec![bridge_ips_mean, positive_reports_mean], cov_mat);
  375. if mvn.is_ok() {
  376. let mvn = mvn.unwrap();
  377. // Start 3 standard deviations below the mean, based on
  378. // 68-95-99.7 rule, assuming the confidence will be high
  379. // enough that 99.7 is close enough to "the whole
  380. // distribution" to be reasonable
  381. let bip_start = (bridge_ips_mean - (3.0 * bridge_ips_f64.std_dev()).ceil()) as i32;
  382. let pr_start =
  383. (positive_reports_mean - (3.0 * positive_reports_f64.std_dev()).ceil()) as i32;
  384. // Estimate the CDF by integrating the PDF by hand with step
  385. // size 1
  386. let mut cdf = 0.0;
  387. for bip in bip_start..bridge_ips_today as i32 {
  388. for pr in pr_start..positive_reports_today as i32 {
  389. cdf += mvn.pdf(&DVector::from_vec(vec![bip as f64, pr as f64]));
  390. }
  391. }
  392. if cdf < alpha {
  393. return true;
  394. }
  395. } else {
  396. // If we have 0 standard deviation or a covariance matrix
  397. // that is not positive definite, we need another way to
  398. // evaluate each variable. Ignore positive reports and
  399. // compute as in stage 2
  400. if self.stage_two(
  401. age,
  402. confidence,
  403. bridge_ips,
  404. bridge_ips_today,
  405. negative_reports,
  406. negative_reports_today,
  407. ) {
  408. return true;
  409. }
  410. }
  411. }
  412. // If none of the tests concluded that the bridge is blocked,
  413. // return false
  414. false
  415. }
  416. }