ソースを参照

Code for processing extra-infos files

Vecna 4 ヶ月 前
コミット
08cfacbf85
3 ファイル変更241 行追加19 行削除
  1. 11 0
      Cargo.toml
  2. 195 0
      src/extra_info.rs
  3. 35 19
      src/lib.rs

+ 11 - 0
Cargo.toml

@@ -8,16 +8,27 @@ edition = "2021"
 [dependencies]
 array-bytes = "6.2.0"
 bincode = "1"
+chrono = "0.4"
+clap = { version = "4.4.14", features = ["derive"] }
 curve25519-dalek = { version = "4", default-features = false, features = ["serde", "rand_core", "digest"] }
 ed25519-dalek = { version = "2", features = ["serde", "rand_core"] }
+http-body-util = "0.1"
+hyper = { version = "1", features = ["full"] }
+hyper-rustls = "0.26.0"
+hyper-util = { version = "0.1", features = ["full"] }
+julianday = "1.2.0"
 lazy_static = "1"
 lox-library = { git = "https://gitlab.torproject.org/vecna/lox.git", version = "0.1.0" }
+#scraper = "0.18"
+select = "0.6.0"
 serde = "1.0.195"
 serde_json = "1.0"
 serde_with = {version = "3.4.0", features = ["json"]}
 sha1 = "0.10"
 sha3 = "0.10"
+sled = "0.34.7"
 time = "0.3.30"
+tokio = { version = "1", features = ["full"] }
 
 # probably not needed once I can query an API
 rand = { version = "0.8", features = ["std_rng"]}

+ 195 - 0
src/extra_info.rs

@@ -0,0 +1,195 @@
+/*! Fields we need from the extra-info documents for bridges...
+Note, this is NOT a complete implementation of the document format.
+(https://spec.torproject.org/dir-spec/extra-info-document-format.html) */
+
+use chrono::DateTime;
+use http_body_util::{BodyExt, Empty};
+use hyper::body::Bytes;
+use hyper_util::{client::legacy::Client, rt::TokioExecutor};
+use julianday::JulianDay;
+use select::{document::Document, predicate::Name};
+use serde::{Deserialize, Serialize};
+use std::{
+    collections::{BTreeMap, HashMap, HashSet},
+    fs::File,
+    io::{prelude::*, BufReader, Write},
+    path::Path,
+};
+
+/// Directory where we store these files. This is a relative path: it is
+/// joined with a filename when reading an extra-infos file and is created
+/// on demand before new files are downloaded.
+pub const DIRECTORY: &str = "extra_infos";
+
+/// Fields we need from an extra-info document.
+///
+/// Only the subset of the document that this crate uses is kept; this is not
+/// a full representation of the extra-info format.
+// NOTE(review): this type derives Serialize/Deserialize; if it is ever encoded
+// with a positional format (bincode is a dependency of this crate), the field
+// order below is part of the encoding -- confirm before reordering fields.
+#[derive(Eq, PartialEq, Hash, Serialize, Deserialize)]
+pub struct ExtraInfo {
+    /// Bridge nickname, probably unused
+    pub nickname: String,
+    /// Bridge fingerprint, a SHA-1 hash of the bridge ID (20 raw bytes,
+    /// decoded from the 40-character hex string in the document).
+    pub fingerprint: [u8; 20],
+    /// Date (in UTC) that this document was published, stored as a Julian
+    /// date because we don't need to know more precisely than the day.
+    pub published: u32,
+    /// Map of country codes and how many users (rounded up to a multiple of
+    /// 8) have connected to that bridge during the day.
+    /// Uses BTreeMap instead of HashMap so ExtraInfo can implement Hash.
+    pub bridge_ips: BTreeMap<String, u32>, // TODO: What size for count?
+}
+
+/// Build an [`ExtraInfo`] from the key/value pairs collected for a single
+/// extra-info document.
+///
+/// `entry` must contain the keys "nickname", "fingerprint", "published" and
+/// "bridge-ips". The fingerprint is expected as 40 hex characters, the
+/// publication time as `YYYY-MM-DD HH:MM:SS` (interpreted as UTC), and
+/// `bridge-ips` as a comma-separated list of `cc=count` items (which may be
+/// empty).
+///
+/// # Errors
+///
+/// Returns a human-readable message, rather than panicking, when a required
+/// key is missing or when the fingerprint, the publication time, or any
+/// `bridge-ips` item is malformed. This lets the caller report one bad
+/// document and carry on with the rest of the file.
+fn get_extra_info_or_error(entry: &HashMap<String, String>) -> Result<ExtraInfo, String> {
+    let (nickname, fingerprint_str) = match (entry.get("nickname"), entry.get("fingerprint")) {
+        (Some(nickname), Some(fingerprint)) => (nickname, fingerprint),
+        // How did we get here??
+        _ => return Err("Cannot parse extra-info: Missing nickname or fingerprint".to_string()),
+    };
+    let (published_str, bridge_ips_str) = match (entry.get("published"), entry.get("bridge-ips")) {
+        (Some(published), Some(bridge_ips)) => (published, bridge_ips),
+        // Some extra-infos are missing data on connecting IPs...
+        // But we can't do anything in that case.
+        _ => {
+            return Err(format!(
+                "Failed to parse extra-info for {} {}",
+                nickname, fingerprint_str
+            ))
+        }
+    };
+
+    if fingerprint_str.len() != 40 {
+        return Err("Fingerprint must be 20 bytes".to_string());
+    }
+    // A 40-character string is not necessarily valid hex, so this can fail.
+    let fingerprint: [u8; 20] = array_bytes::hex2array(fingerprint_str).map_err(|_| {
+        format!(
+            "Failed to parse extra-info for {}: fingerprint {:?} is not valid hex",
+            nickname, fingerprint_str
+        )
+    })?;
+
+    // The document gives no timezone; append an explicit UTC offset so the
+    // timestamp can be parsed as a timezone-aware value.
+    let published_date = DateTime::parse_from_str(
+        &format!("{} +0000", published_str),
+        "%F %T %z",
+    )
+    .map_err(|e| {
+        format!(
+            "Failed to parse extra-info for {} {}: invalid published time {:?}: {}",
+            nickname, fingerprint_str, published_str, e
+        )
+    })?
+    .date_naive();
+    // The Julian day is signed; refuse dates that do not fit the unsigned
+    // field instead of panicking.
+    let published = u32::try_from(JulianDay::from(published_date).inner()).map_err(|_| {
+        format!(
+            "Failed to parse extra-info for {} {}: published date {:?} out of range",
+            nickname, fingerprint_str, published_str
+        )
+    })?;
+
+    let mut bridge_ips: BTreeMap<String, u32> = BTreeMap::new();
+    // bridge-ips may be empty, in which case the split yields one empty item.
+    for country in bridge_ips_str.split(',').filter(|item| !item.is_empty()) {
+        let (cc, count) = country.split_once('=').ok_or_else(|| {
+            format!(
+                "Failed to parse extra-info for {} {}: invalid bridge-ips item {:?}",
+                nickname, fingerprint_str, country
+            )
+        })?;
+        let count = count.parse::<u32>().map_err(|e| {
+            format!(
+                "Failed to parse extra-info for {} {}: invalid bridge-ips count {:?}: {}",
+                nickname, fingerprint_str, count, e
+            )
+        })?;
+        bridge_ips.insert(cc.to_string(), count);
+    }
+
+    Ok(ExtraInfo {
+        nickname: nickname.to_string(),
+        fingerprint,
+        published,
+        bridge_ips,
+    })
+}
+
+/// Parse the extra-infos file `filename` (looked up inside [`DIRECTORY`]) and
+/// insert every successfully parsed document into `set`.
+///
+/// A line starting with `@type bridge-extra-info ` marks the start of a new
+/// document. The `extra-info <nickname> <fingerprint>` line supplies the
+/// nickname and fingerprint; every other line is stored as a keyword mapped
+/// to the rest of the line. Documents that fail to parse are reported on
+/// stdout and skipped.
+///
+/// # Panics
+///
+/// Panics if the file cannot be opened or a line cannot be read.
+pub fn add_extra_infos<'a>(filename: &str, set: &mut HashSet<ExtraInfo>) {
+    let path = format!("{}/{}", DIRECTORY, filename);
+    let infile =
+        File::open(&path).unwrap_or_else(|e| panic!("Failed to open {}: {}", path, e));
+    let reader = BufReader::new(infile);
+
+    let mut entry = HashMap::<String, String>::new();
+    for line in reader.lines() {
+        let line = line.unwrap_or_else(|e| panic!("Failed to read {}: {}", path, e));
+        if line.starts_with("@type bridge-extra-info ") {
+            // A new document begins: finish the previous one, if any.
+            if !entry.is_empty() {
+                insert_or_report(&entry, set);
+                entry.clear();
+            }
+        } else if line.starts_with("extra-info ") {
+            // extra-info line has format:
+            // extra-info <nickname> <fingerprint>
+            let line_split: Vec<&str> = line.split(' ').collect();
+            match line_split.as_slice() {
+                [_, nickname, fingerprint] => {
+                    entry.insert("nickname".to_string(), nickname.to_string());
+                    entry.insert("fingerprint".to_string(), fingerprint.to_string());
+                }
+                _ => println!("Misformed extra-info line"),
+            }
+        } else {
+            // Keyword-only lines are stored with an empty value.
+            let (key, value) = line.split_once(' ').unwrap_or((line.as_str(), ""));
+            entry.insert(key.to_string(), value.to_string());
+        }
+    }
+    // Do for the last one, but only if there is one: an empty file (or a file
+    // ending right after an `@type` line) must not produce a spurious error.
+    if !entry.is_empty() {
+        insert_or_report(&entry, set);
+    }
+}
+
+/// Parse one collected document and add it to `set`, or print the parse
+/// error and continue.
+fn insert_or_report(entry: &HashMap<String, String>, set: &mut HashSet<ExtraInfo>) {
+    match get_extra_info_or_error(entry) {
+        Ok(extra_info) => {
+            set.insert(extra_info);
+        }
+        // Just print the error and continue.
+        Err(e) => println!("{}", e),
+    }
+}
+
+/// Download new extra-infos files and save them in [`DIRECTORY`]. This
+/// function returns the set of newly downloaded filenames.
+///
+/// The index page at the CollecTor "recent bridge extra-infos" URL is fetched
+/// and every linked file whose name ends in `-extra-infos` and that does not
+/// already exist locally is downloaded. Each file is first written under a
+/// temporary `.part` name and renamed once complete, so an interrupted
+/// download is retried on the next call instead of being mistaken for a
+/// finished file.
+///
+/// # Errors
+///
+/// Returns an error if a request fails, a response has a status other than
+/// 200 OK, the index page is not valid UTF-8, or a local file operation
+/// fails.
+///
+/// # Panics
+///
+/// Panics if no native root CA certificates can be loaded.
+pub async fn download_extra_infos(
+) -> Result<HashSet<String>, Box<dyn std::error::Error + Send + Sync>> {
+    // Download directory of recent extra-infos
+    let base_url = "https://collector.torproject.org/recent/bridge-descriptors/extra-infos/";
+    let url: hyper::Uri = base_url.parse()?;
+    let https = hyper_rustls::HttpsConnectorBuilder::new()
+        .with_native_roots() // TODO: Pin certificate? Is this data signed/verifiable?
+        .expect("no native root CA certificates found")
+        .https_only()
+        .enable_http1()
+        .build();
+
+    let client: Client<_, Empty<Bytes>> = Client::builder(TokioExecutor::new()).build(https);
+
+    println!("Downloading {}", base_url);
+    let mut res = client.get(url).await?;
+
+    if res.status() != hyper::StatusCode::OK {
+        return Err(format!(
+            "Unexpected status {} when downloading {}",
+            res.status(),
+            base_url
+        )
+        .into());
+    }
+    // Collect the raw bytes first and decode once: a multi-byte UTF-8
+    // character may be split across two frames, so decoding each frame on its
+    // own could fail on valid input.
+    let mut body = Vec::new();
+    while let Some(next) = res.frame().await {
+        let frame = next?;
+        if let Some(chunk) = frame.data_ref() {
+            body.extend_from_slice(chunk);
+        }
+    }
+    let body_str = String::from_utf8(body)?;
+
+    let doc = Document::from(body_str.as_str());
+
+    // Create extra-infos directory if it doesn't exist
+    std::fs::create_dir_all(DIRECTORY)?;
+
+    let mut new_files = HashSet::<String>::new();
+
+    // Go through all the links in the page and download new files
+    let links = doc.find(Name("a")).filter_map(|n| n.attr("href"));
+    for link in links {
+        if !link.ends_with("-extra-infos") {
+            continue;
+        }
+        // The link comes from a remote page and is used as a local filename;
+        // skip anything with a path separator so nothing is written outside
+        // DIRECTORY.
+        if link.contains('/') || link.contains('\\') {
+            println!("Skipping unexpected link {}", link);
+            continue;
+        }
+        let filename = format!("{}/{}", DIRECTORY, link);
+
+        // Download file if it's not already downloaded
+        if Path::new(&filename).exists() {
+            continue;
+        }
+        let extra_infos_url = format!("{}{}", base_url, link);
+        println!("Downloading {}", extra_infos_url);
+        let mut res = client.get(extra_infos_url.parse::<hyper::Uri>()?).await?;
+        if res.status() != hyper::StatusCode::OK {
+            return Err(format!(
+                "Unexpected status {} when downloading {}",
+                res.status(),
+                extra_infos_url
+            )
+            .into());
+        }
+
+        // Write to a temporary name so a failed download never leaves a
+        // truncated file under the final name.
+        let partial_filename = format!("{}.part", filename);
+        {
+            let mut file = std::fs::File::create(&partial_filename)?;
+            while let Some(next) = res.frame().await {
+                let frame = next?;
+                if let Some(chunk) = frame.data_ref() {
+                    file.write_all(chunk)?;
+                }
+            }
+            // The file handle is closed at the end of this scope, before the
+            // rename.
+        }
+        std::fs::rename(&partial_filename, &filename)?;
+        new_files.insert(link.to_string());
+    }
+
+    Ok(new_files)
+}

ファイルの差分が大きいため隠しています
+ 35 - 19
src/lib.rs


この差分においてかなりの量のファイルが変更されているため、一部のファイルを表示していません