Parcourir la source

Stage 2: Model as two distributions, handle 0 standard deviation

Vecna il y a 9 mois
Parent
commit
3512adc425
1 fichier modifié avec 52 ajouts et 25 suppressions
  1. 52 25
      src/analysis.rs

+ 52 - 25
src/analysis.rs

@@ -216,6 +216,28 @@ impl NormalAnalyzer {
         }
     }
 
+    fn mean(data: &[u32]) -> f64 {
+        let mut sum = 0.0;
+        for count in data {
+            sum += *count as f64;
+        }
+        sum / data.len() as f64
+    }
+
+    fn std_dev(data: &[u32], mean: f64) -> f64 {
+        let mut sum = 0.0;
+        for count in data {
+            sum += (*count as f64 - mean).powi(2);
+        }
+        (sum / data.len() as f64).sqrt()
+    }
+
+    fn mean_and_std_dev(data: &[u32]) -> (f64, f64) {
+        let mean = Self::mean(data);
+        let std = Self::std_dev(data, mean);
+        (mean, std)
+    }
+
     // Returns the mean vector, vector of individual standard deviations, and
     // covariance matrix. If the standard deviation for a variable is 0 and/or
     // the covariance matrix is not positive definite, add some noise to the
@@ -318,7 +340,7 @@ impl Analyzer for NormalAnalyzer {
             || f64::from(negative_reports_today) > self.scaling_factor * f64::from(bridge_ips_today)
     }
 
-    /// Evaluate invite-only bridge based on last 30 days
+    /// Evaluate invite-only bridge based on historical data
     fn stage_two(
         &self,
         confidence: f64,
@@ -332,30 +354,35 @@ impl Analyzer for NormalAnalyzer {
 
         let alpha = 1.0 - confidence;
 
-        let (mean_vec, sd_vec, cov_mat) = Self::stats(&[bridge_ips, negative_reports]);
-        let bridge_ips_mean = mean_vec[0];
-        let negative_reports_mean = mean_vec[1];
-        let bridge_ips_sd = sd_vec[0];
-        let negative_reports_sd = sd_vec[1];
+        let (bridge_ips_mean, bridge_ips_sd) = Self::mean_and_std_dev(bridge_ips);
+        let (negative_reports_mean, negative_reports_sd) = Self::mean_and_std_dev(negative_reports);
 
-        /*
-                let mvn = MultivariateNormal::new(mean_vec, cov_mat).unwrap();
-                let pdf = mvn.pdf(&DVector::from_vec(vec![
-                    bridge_ips_today as f64,
-                    negative_reports_today as f64,
-                ]));
-        */
+        // Model each variable with a normal distribution.
+        let bip_normal = Normal::new(bridge_ips_mean, bridge_ips_sd);
+        let nr_normal = Normal::new(negative_reports_mean, negative_reports_sd);
 
-        // Model each variable in isolation. We use 1 - the CDF for
-        // negative reports because more negative reports is worse.
-        let bip_normal = Normal::new(bridge_ips_mean, bridge_ips_sd).unwrap();
-        let bip_cdf = bip_normal.cdf(bridge_ips_today as f64);
-        let nr_normal = Normal::new(negative_reports_mean, negative_reports_sd).unwrap();
-        let nr_cdf = 1.0 - nr_normal.cdf(negative_reports_today as f64);
+        // If we have 0 standard deviation, we need another way to
+        // evaluate each variable
+        let bip_test = if bridge_ips_sd > 0.0 {
+            bip_normal.unwrap().cdf(bridge_ips_today as f64) < alpha
+        } else {
+            // Consider the bridge blocked if its usage dropped by more
+            // than 1 bin. (Note that the mean is the exact value
+            // because we had no deviation.)
+            (bridge_ips_today as f64) < bridge_ips_mean - 8.0
+        };
+        let nr_test = if negative_reports_sd > 0.0 {
+            // We use CCDF because more negative reports is worse.
+            (1.0 - nr_normal.unwrap().cdf(negative_reports_today as f64)) < alpha
+        } else {
+            // Consider the bridge blocked if negative reports increase by
+            // more than 1 after a long static period. (Note that the
+            // mean is the exact value because we had no deviation.)
+            (negative_reports_today as f64) > negative_reports_mean + 1.0
+        };
 
-        // For now, just look at each variable in isolation
-        // TODO: How do we do a multivariate normal CDF?
-        bip_cdf < alpha || nr_cdf < alpha
+        // Return true if any test concluded the bridge is blocked
+        bip_test || nr_test
     }
 
     /// Evaluate invite-only bridge with lv3+ users submitting positive reports
@@ -393,17 +420,17 @@ impl Analyzer for NormalAnalyzer {
                 ]));
         */
 
-        // Model each variable in isolation. We use 1 - the CDF for
+        // Model each variable in isolation. We use the CCDF for
         // negative reports because more negative reports is worse.
         let bip_normal = Normal::new(bridge_ips_mean, bridge_ips_sd).unwrap();
         let bip_cdf = bip_normal.cdf(bridge_ips_today as f64);
         let nr_normal = Normal::new(negative_reports_mean, negative_reports_sd).unwrap();
-        let nr_cdf = 1.0 - nr_normal.cdf(negative_reports_today as f64);
+        let nr_ccdf = 1.0 - nr_normal.cdf(negative_reports_today as f64);
         let pr_normal = Normal::new(positive_reports_mean, positive_reports_sd).unwrap();
         let pr_cdf = pr_normal.cdf(positive_reports_today as f64);
 
         // For now, just look at each variable in isolation
         // TODO: How do we do a multivariate normal CDF?
-        bip_cdf < alpha || nr_cdf < alpha || pr_cdf < alpha
+        bip_cdf < alpha || nr_ccdf < alpha || pr_cdf < alpha
     }
 }