Pārlūkot izejas kodu

Use estimated multivariate CDF when we have positive reports

Vecna 9 mēneši atpakaļ
vecāks
revīzija
d9aa616d77
1 mainītis faili ar 58 papildinājumiem un 88 dzēšanām
  1. 58 88
      src/analysis.rs

+ 58 - 88
src/analysis.rs

@@ -1,10 +1,9 @@
 use crate::{BridgeInfo, BridgeInfoType};
 use lox_library::proto::trust_promotion::UNTRUSTED_INTERVAL;
-use nalgebra::{Cholesky, DMatrix, DVector};
-use rand::Rng;
-use statrs::distribution::{ContinuousCDF, MultivariateNormal, Normal};
+use nalgebra::DVector;
+use statrs::distribution::{Continuous, ContinuousCDF, MultivariateNormal, Normal};
 use std::{
-    cmp::{max, min},
+    cmp::min,
     collections::{BTreeMap, HashSet},
 };
 
@@ -234,39 +233,21 @@ impl NormalAnalyzer {
 
     fn mean_and_std_dev(data: &[u32]) -> (f64, f64) {
         let mean = Self::mean(data);
-        let std = Self::std_dev(data, mean);
-        (mean, std)
+        let std_dev = Self::std_dev(data, mean);
+        (mean, std_dev)
     }
 
-    // Returns the mean vector, vector of individual standard deviations, and
-    // covariance matrix. If the standard deviation for a variable is 0 and/or
-    // the covariance matrix is not positive definite, add some noise to the
-    // data and recompute.
-    fn stats(data: &[&[u32]]) -> (Vec<f64>, Vec<f64>, Vec<f64>) {
+    // Returns the mean vector and covariance matrix
+    fn stats(data: &[&[u32]]) -> (Vec<f64>, Vec<f64>) {
         let n = data.len();
 
-        // Compute mean and standard deviation vectors
-        let (mean_vec, sd_vec) = {
+        // Compute mean vector
+        let mean_vec = {
             let mut mean_vec = Vec::<f64>::new();
-            let mut sd_vec = Vec::<f64>::new();
             for var in data {
-                // Compute mean
-                let mut sum = 0.0;
-                for count in *var {
-                    sum += *count as f64;
-                }
-                let mean = sum / var.len() as f64;
-
-                // Compute standard deviation
-                let mut sum = 0.0;
-                for count in *var {
-                    sum += (*count as f64 - mean).powi(2);
-                }
-                let sd = (sum / var.len() as f64).sqrt();
-                mean_vec.push(mean);
-                sd_vec.push(sd);
+                mean_vec.push(Self::mean(var));
             }
-            (mean_vec, sd_vec)
+            mean_vec
         };
 
         // Compute covariance matrix
@@ -296,33 +277,7 @@ impl NormalAnalyzer {
             cov_mat
         };
 
-        // If any standard deviation is 0 or the covariance matrix is not
-        // positive definite, add some noise and recompute.
-        let mut recompute = false;
-        for sd in &sd_vec {
-            if *sd <= 0.0 {
-                recompute = true;
-            }
-        }
-        if Cholesky::new(DMatrix::from_vec(n, n, cov_mat.clone())).is_none() {
-            recompute = true;
-        }
-
-        if !recompute {
-            (mean_vec, sd_vec, cov_mat)
-        } else {
-            // Add random noise and recompute
-            let mut new_data = vec![vec![0; data[0].len()]; n];
-            let mut rng = rand::thread_rng();
-            for i in 0..n {
-                for j in 0..data[i].len() {
-                    // Add 1 to some randomly selected values
-                    new_data[i][j] = data[i][j] + rng.gen_range(0..=1);
-                }
-            }
-            // Compute stats on modified data
-            Self::stats(&new_data.iter().map(Vec::as_slice).collect::<Vec<&[u32]>>())
-        }
+        (mean_vec, cov_mat)
     }
 }
 
@@ -357,7 +312,7 @@ impl Analyzer for NormalAnalyzer {
         let (bridge_ips_mean, bridge_ips_sd) = Self::mean_and_std_dev(bridge_ips);
         let (negative_reports_mean, negative_reports_sd) = Self::mean_and_std_dev(negative_reports);
 
-        // Model each variable with a normal distribution.
+        // Model negative reports separately
         let bip_normal = Normal::new(bridge_ips_mean, bridge_ips_sd);
         let nr_normal = Normal::new(negative_reports_mean, negative_reports_sd);
 
@@ -402,35 +357,50 @@ impl Analyzer for NormalAnalyzer {
 
         let alpha = 1.0 - confidence;
 
-        let (mean_vec, sd_vec, cov_mat) =
-            Self::stats(&[bridge_ips, negative_reports, positive_reports]);
-        let bridge_ips_mean = mean_vec[0];
-        let negative_reports_mean = mean_vec[1];
-        let positive_reports_mean = mean_vec[2];
-        let bridge_ips_sd = sd_vec[0];
-        let negative_reports_sd = sd_vec[1];
-        let positive_reports_sd = sd_vec[2];
-
-        /*
-                let mvn = MultivariateNormal::new(mean_vec, cov_mat).unwrap();
-                let pdf = mvn.pdf(&DVector::from_vec(vec![
-                    bridge_ips_today as f64,
-                    negative_reports_today as f64,
-                    positive_reports_today as f64,
-                ]));
-        */
-
-        // Model each variable in isolation. We use the CCDF for
-        // negative reports because more negative reports is worse.
-        let bip_normal = Normal::new(bridge_ips_mean, bridge_ips_sd).unwrap();
-        let bip_cdf = bip_normal.cdf(bridge_ips_today as f64);
-        let nr_normal = Normal::new(negative_reports_mean, negative_reports_sd).unwrap();
-        let nr_ccdf = 1.0 - nr_normal.cdf(negative_reports_today as f64);
-        let pr_normal = Normal::new(positive_reports_mean, positive_reports_sd).unwrap();
-        let pr_cdf = pr_normal.cdf(positive_reports_today as f64);
-
-        // For now, just look at each variable in isolation
-        // TODO: How do we do a multivariate normal CDF?
-        bip_cdf < alpha || nr_ccdf < alpha || pr_cdf < alpha
+        // Model bridge IPs and positive reports with multivariate
+        // normal distribution
+        let (mean_vec, cov_mat) = Self::stats(&[bridge_ips, positive_reports]);
+        let mvn = MultivariateNormal::new(mean_vec, cov_mat);
+
+        // Model negative reports separately
+        let (negative_reports_mean, negative_reports_sd) = Self::mean_and_std_dev(negative_reports);
+        let nr_normal = Normal::new(negative_reports_mean, negative_reports_sd);
+
+        // If we have 0 standard deviation or a covariance matrix that
+        // is not positive definite, we need another way to evaluate
+        // each variable
+        let positive_test = if mvn.is_ok() {
+            let mvn = mvn.unwrap();
+
+            // Estimate the CDF by integrating the PDF by hand with step
+            // size 1
+            let mut cdf = 0.0;
+            for bip in 0..bridge_ips_today {
+                for pr in 0..positive_reports_today {
+                    cdf += mvn.pdf(&DVector::from_vec(vec![bip as f64, pr as f64]));
+                }
+            }
+            cdf < alpha
+        } else {
+            // Ignore positive reports and compute as in stage 2
+            self.stage_two(
+                confidence,
+                bridge_ips,
+                bridge_ips_today,
+                negative_reports,
+                negative_reports_today,
+            )
+        };
+        let nr_test = if negative_reports_sd > 0.0 {
+            // We use CCDF because more negative reports is worse.
+            (1.0 - nr_normal.unwrap().cdf(negative_reports_today as f64)) < alpha
+        } else {
+            // Consider the bridge blocked negative reports increase by
+            // more than 1 after a long static period. (Note that the
+            // mean is the exact value because we had no deviation.)
+            (negative_reports_today as f64) > negative_reports_mean + 1.0
+        };
+
+        positive_test || nr_test
     }
 }