analysis.rs 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419
  1. use crate::{BridgeInfo, BridgeInfoType};
  2. use lox_library::proto::trust_promotion::UNTRUSTED_INTERVAL;
  3. use nalgebra::DVector;
  4. use statrs::distribution::{Continuous, ContinuousCDF, MultivariateNormal, Normal};
  5. use statrs::statistics::Statistics;
  6. use std::{
  7. cmp::min,
  8. collections::{BTreeMap, HashSet},
  9. };
  10. /// Provides a function for predicting which countries block this bridge
  11. pub trait Analyzer {
  12. /// Evaluate open-entry bridge. Returns true if blocked, false otherwise.
  13. fn stage_one(
  14. &self,
  15. confidence: f64,
  16. bridge_ips: &[u32],
  17. bridge_ips_today: u32,
  18. negative_reports: &[u32],
  19. negative_reports_today: u32,
  20. ) -> bool;
  21. /// Evaluate invite-only bridge without positive reports. Return true if
  22. /// blocked, false otherwise.
  23. fn stage_two(
  24. &self,
  25. confidence: f64,
  26. bridge_ips: &[u32],
  27. bridge_ips_today: u32,
  28. negative_reports: &[u32],
  29. negative_reports_today: u32,
  30. ) -> bool;
  31. /// Evaluate invite-only bridge with positive reports. Return true if
  32. /// blocked, false otherwise.
  33. fn stage_three(
  34. &self,
  35. confidence: f64,
  36. bridge_ips: &[u32],
  37. bridge_ips_today: u32,
  38. negative_reports: &[u32],
  39. negative_reports_today: u32,
  40. positive_reports: &[u32],
  41. positive_reports_today: u32,
  42. ) -> bool;
  43. }
  44. /// Accepts an analyzer, information about a bridge, and a confidence value.
  45. /// Returns a set of country codes where the bridge is believed to be blocked.
  46. pub fn blocked_in(
  47. analyzer: &dyn Analyzer,
  48. bridge_info: &BridgeInfo,
  49. confidence: f64,
  50. date: u32,
  51. min_historical_days: u32,
  52. max_historical_days: u32,
  53. ) -> HashSet<String> {
  54. let mut blocked_in = HashSet::<String>::new();
  55. let today = date;
  56. for (country, info) in &bridge_info.info_by_country {
  57. let age = today - info.first_seen;
  58. if info.blocked {
  59. // Assume bridges never become unblocked
  60. blocked_in.insert(country.to_string());
  61. } else {
  62. // Get today's values
  63. let new_map_binding = BTreeMap::<BridgeInfoType, u32>::new();
  64. // TODO: Evaluate on yesterday if we don't have data for today?
  65. let today_info = match info.info_by_day.get(&today) {
  66. Some(v) => v,
  67. None => &new_map_binding,
  68. };
  69. let bridge_ips_today = match today_info.get(&BridgeInfoType::BridgeIps) {
  70. Some(&v) => v,
  71. None => 0,
  72. };
  73. let negative_reports_today = match today_info.get(&BridgeInfoType::NegativeReports) {
  74. Some(&v) => v,
  75. None => 0,
  76. };
  77. let positive_reports_today = match today_info.get(&BridgeInfoType::PositiveReports) {
  78. Some(&v) => v,
  79. None => 0,
  80. };
  81. let num_days = min(age, max_historical_days);
  82. // Get time series for last num_days
  83. let mut bridge_ips = vec![0; num_days as usize];
  84. let mut negative_reports = vec![0; num_days as usize];
  85. let mut positive_reports = vec![0; num_days as usize];
  86. for i in 0..num_days {
  87. let date = today - num_days + i - 1;
  88. let new_map_binding = BTreeMap::<BridgeInfoType, u32>::new();
  89. let day_info = match info.info_by_day.get(&date) {
  90. Some(v) => v,
  91. None => &new_map_binding,
  92. };
  93. bridge_ips[i as usize] = match day_info.get(&BridgeInfoType::BridgeIps) {
  94. Some(&v) => v,
  95. None => 0,
  96. };
  97. negative_reports[i as usize] = match day_info.get(&BridgeInfoType::NegativeReports)
  98. {
  99. Some(&v) => v,
  100. None => 0,
  101. };
  102. positive_reports[i as usize] = match day_info.get(&BridgeInfoType::PositiveReports)
  103. {
  104. Some(&v) => v,
  105. None => 0,
  106. };
  107. }
  108. // Evaluate using appropriate stage based on age of the bridge
  109. if age < UNTRUSTED_INTERVAL || age < min_historical_days {
  110. // open-entry bridge and/or not enough days of
  111. // historical days for stages 2 and 3
  112. if analyzer.stage_one(
  113. confidence,
  114. &bridge_ips,
  115. bridge_ips_today,
  116. &negative_reports,
  117. negative_reports_today,
  118. ) {
  119. blocked_in.insert(country.to_string());
  120. }
  121. } else if info.first_pr.is_none()
  122. || today < info.first_pr.unwrap() + min_historical_days
  123. {
  124. // invite-only bridge without min_historical_days of
  125. // historical data on positive reports
  126. if analyzer.stage_two(
  127. confidence,
  128. &bridge_ips,
  129. bridge_ips_today,
  130. &negative_reports,
  131. negative_reports_today,
  132. ) {
  133. blocked_in.insert(country.to_string());
  134. }
  135. } else {
  136. // invite-only bridge that has min_historical_days or
  137. // more of historical data since the first positive report
  138. if analyzer.stage_three(
  139. confidence,
  140. &bridge_ips,
  141. bridge_ips_today,
  142. &negative_reports,
  143. negative_reports_today,
  144. &positive_reports,
  145. positive_reports_today,
  146. ) {
  147. blocked_in.insert(country.to_string());
  148. }
  149. }
  150. }
  151. }
  152. blocked_in
  153. }
  154. // Analyzer implementations
  155. /// Dummy example that never thinks bridges are blocked
  156. pub struct ExampleAnalyzer {}
  157. impl Analyzer for ExampleAnalyzer {
  158. fn stage_one(
  159. &self,
  160. _confidence: f64,
  161. _bridge_ips: &[u32],
  162. _bridge_ips_today: u32,
  163. _negative_reports: &[u32],
  164. _negative_reports_today: u32,
  165. ) -> bool {
  166. false
  167. }
  168. fn stage_two(
  169. &self,
  170. _confidence: f64,
  171. _bridge_ips: &[u32],
  172. _bridge_ips_today: u32,
  173. _negative_reports: &[u32],
  174. _negative_reports_today: u32,
  175. ) -> bool {
  176. false
  177. }
  178. fn stage_three(
  179. &self,
  180. _confidence: f64,
  181. _bridge_ips: &[u32],
  182. _bridge_ips_today: u32,
  183. _negative_reports: &[u32],
  184. _negative_reports_today: u32,
  185. _positive_reports: &[u32],
  186. _positive_reports_today: u32,
  187. ) -> bool {
  188. false
  189. }
  190. }
  191. /// Model data as multivariate normal distribution
  192. pub struct NormalAnalyzer {
  193. max_threshold: u32,
  194. scaling_factor: f64,
  195. }
  196. impl NormalAnalyzer {
  197. pub fn new(max_threshold: u32, scaling_factor: f64) -> Self {
  198. Self {
  199. max_threshold,
  200. scaling_factor,
  201. }
  202. }
  203. }
  204. impl Analyzer for NormalAnalyzer {
  205. /// Evaluate open-entry bridge based on only today's data
  206. fn stage_one(
  207. &self,
  208. _confidence: f64,
  209. _bridge_ips: &[u32],
  210. bridge_ips_today: u32,
  211. _negative_reports: &[u32],
  212. negative_reports_today: u32,
  213. ) -> bool {
  214. negative_reports_today > self.max_threshold
  215. || f64::from(negative_reports_today) > self.scaling_factor * f64::from(bridge_ips_today)
  216. }
  217. /// Evaluate invite-only bridge based on historical data
  218. fn stage_two(
  219. &self,
  220. confidence: f64,
  221. bridge_ips: &[u32],
  222. bridge_ips_today: u32,
  223. negative_reports: &[u32],
  224. negative_reports_today: u32,
  225. ) -> bool {
  226. assert!(bridge_ips.len() >= UNTRUSTED_INTERVAL as usize);
  227. assert_eq!(bridge_ips.len(), negative_reports.len());
  228. let alpha = 1.0 - confidence;
  229. // Convert to f64 for stats
  230. let bridge_ips_f64 = &bridge_ips.iter().map(|n| *n as f64).collect::<Vec<f64>>();
  231. let negative_reports_f64 = &negative_reports
  232. .iter()
  233. .map(|n| *n as f64)
  234. .collect::<Vec<f64>>();
  235. // Evaluate based on negative reports
  236. let negative_reports_mean = negative_reports_f64.mean();
  237. let negative_reports_sd = negative_reports_f64.std_dev();
  238. // Only use CCDF test if today's numbers are worse than average
  239. if (negative_reports_today as f64) > negative_reports_mean {
  240. let nr_normal = Normal::new(negative_reports_mean, negative_reports_sd);
  241. if negative_reports_sd > 0.0 {
  242. // We use CCDF because more negative reports is worse.
  243. if (1.0 - nr_normal.unwrap().cdf(negative_reports_today as f64)) < alpha {
  244. return true;
  245. }
  246. } else {
  247. // If the standard deviation is 0, we need another option.
  248. // Consider the bridge blocked negative reports increase by
  249. // more than 1 after a long static period. (Note that the
  250. // mean is the exact value because we had no deviation.)
  251. if (negative_reports_today as f64) > negative_reports_mean + 1.0 {
  252. return true;
  253. }
  254. }
  255. }
  256. // Evaluate based on bridge stats
  257. let bridge_ips_mean = bridge_ips_f64.mean();
  258. let bridge_ips_sd = bridge_ips_f64.std_dev();
  259. // Only use CDF test if today's numbers are worse than average
  260. if (bridge_ips_today as f64) < bridge_ips_mean {
  261. let bip_normal = Normal::new(bridge_ips_mean, bridge_ips_sd);
  262. if bridge_ips_sd > 0.0 {
  263. if bip_normal.unwrap().cdf(bridge_ips_today as f64) < alpha {
  264. return true;
  265. }
  266. } else {
  267. // If the standard deviation is 0, we need another option.
  268. // Consider the bridge blocked if its usage dropped by more
  269. // than 1 bin. (Note that the mean is the exact value
  270. // because we had no deviation.)
  271. if (bridge_ips_today as f64) < bridge_ips_mean - 8.0 {
  272. return true;
  273. }
  274. }
  275. }
  276. // If none of the tests concluded that the bridge is blocked,
  277. // return false
  278. false
  279. }
  280. /// Evaluate invite-only bridge with lv3+ users submitting positive reports
  281. fn stage_three(
  282. &self,
  283. confidence: f64,
  284. bridge_ips: &[u32],
  285. bridge_ips_today: u32,
  286. negative_reports: &[u32],
  287. negative_reports_today: u32,
  288. positive_reports: &[u32],
  289. positive_reports_today: u32,
  290. ) -> bool {
  291. assert!(bridge_ips.len() >= UNTRUSTED_INTERVAL as usize);
  292. assert_eq!(bridge_ips.len(), negative_reports.len());
  293. assert_eq!(bridge_ips.len(), positive_reports.len());
  294. let alpha = 1.0 - confidence;
  295. // Convert to f64 for stats
  296. let bridge_ips_f64 = &bridge_ips.iter().map(|n| *n as f64).collect::<Vec<f64>>();
  297. let negative_reports_f64 = &negative_reports
  298. .iter()
  299. .map(|n| *n as f64)
  300. .collect::<Vec<f64>>();
  301. let positive_reports_f64 = &positive_reports
  302. .iter()
  303. .map(|n| *n as f64)
  304. .collect::<Vec<f64>>();
  305. // Evaluate based on negative reports. It is better to compute
  306. // negative reports test first because the positive test may be
  307. // expensive.
  308. let negative_reports_mean = negative_reports_f64.mean();
  309. let negative_reports_sd = negative_reports_f64.std_dev();
  310. // Only use CCDF test if today's numbers are worse than average
  311. if (negative_reports_today as f64) > negative_reports_mean {
  312. let nr_normal = Normal::new(negative_reports_mean, negative_reports_sd);
  313. if negative_reports_sd > 0.0 {
  314. // We use CCDF because more negative reports is worse.
  315. if (1.0 - nr_normal.unwrap().cdf(negative_reports_today as f64)) < alpha {
  316. return true;
  317. }
  318. } else {
  319. // Consider the bridge blocked negative reports increase by
  320. // more than 1 after a long static period. (Note that the
  321. // mean is the exact value because we had no deviation.)
  322. if (negative_reports_today as f64) > negative_reports_mean + 1.0 {
  323. return true;
  324. }
  325. }
  326. }
  327. // Evaluate based on bridge stats and positive reports.
  328. let bridge_ips_mean = bridge_ips_f64.mean();
  329. let positive_reports_mean = positive_reports_f64.mean();
  330. let cov_mat = {
  331. let x = bridge_ips_f64;
  332. let y = positive_reports_f64;
  333. let xx = x.covariance(x);
  334. let xy = x.covariance(y);
  335. let yy = y.covariance(y);
  336. vec![xx, xy, xy, yy]
  337. };
  338. // Only use CDF test if today's numbers are worse than average
  339. if (bridge_ips_today as f64) < bridge_ips_mean
  340. || (positive_reports_today as f64) < positive_reports_mean
  341. {
  342. let mvn =
  343. MultivariateNormal::new(vec![bridge_ips_mean, positive_reports_mean], cov_mat);
  344. if mvn.is_ok() {
  345. let mvn = mvn.unwrap();
  346. // Start 3 standard deviations below the mean, based on
  347. // 68-95-99.7 rule, assuming the confidence will be high
  348. // enough that 99.7 is close enough to "the whole
  349. // distribution" to be reasonable
  350. let bip_start = (bridge_ips_mean - (3.0 * bridge_ips_f64.std_dev()).ceil()) as i32;
  351. let pr_start =
  352. (positive_reports_mean - (3.0 * positive_reports_f64.std_dev()).ceil()) as i32;
  353. // Estimate the CDF by integrating the PDF by hand with step
  354. // size 1
  355. let mut cdf = 0.0;
  356. for bip in bip_start..bridge_ips_today as i32 {
  357. for pr in pr_start..positive_reports_today as i32 {
  358. cdf += mvn.pdf(&DVector::from_vec(vec![bip as f64, pr as f64]));
  359. }
  360. }
  361. if cdf < alpha {
  362. return true;
  363. }
  364. } else {
  365. // If we have 0 standard deviation or a covariance matrix
  366. // that is not positive definite, we need another way to
  367. // evaluate each variable. Ignore positive reports and
  368. // compute as in stage 2
  369. if self.stage_two(
  370. confidence,
  371. bridge_ips,
  372. bridge_ips_today,
  373. negative_reports,
  374. negative_reports_today,
  375. ) {
  376. return true;
  377. }
  378. }
  379. }
  380. // If none of the tests concluded that the bridge is blocked,
  381. // return false
  382. false
  383. }
  384. }