| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209 |
/*! Fields we need from the extra-info documents for bridges.

Note: this is NOT a complete implementation of the document format
(<https://spec.torproject.org/dir-spec/extra-info-document-format.html>). */
- use chrono::DateTime;
- use http::status::StatusCode;
- use http_body_util::{BodyExt, Empty};
- use hyper::body::Bytes;
- use hyper_util::{client::legacy::Client, rt::TokioExecutor};
- use julianday::JulianDay;
- use select::{document::Document, predicate::Name};
- use serde::{Deserialize, Serialize};
- use std::{
- collections::{BTreeMap, HashMap, HashSet},
- fs::File,
- io::{prelude::*, BufReader, Write},
- path::Path,
- };
- /// Directory where we store these files
- pub const DIRECTORY: &str = "extra_infos";
- /// Fields we need from extra-info document
- #[derive(Eq, PartialEq, Hash, Serialize, Deserialize)]
- pub struct ExtraInfo {
- /// Bridge nickname, probably unused
- pub nickname: String,
- /// Bridge fingerprint, a SHA-1 hash of the bridge ID
- pub fingerprint: [u8; 20],
- /// Date (in UTC) that this document covered (bridge-stats-end if
- /// available) or that the document was published (published), stored
- /// as a Julian date because we don't need to know more precisely than
- /// the day.
- pub date: u32,
- /// Map of country codes and how many users (rounded up to a multiple of
- /// 8) have connected to that bridge during the day.
- /// Uses BTreeMap instead of HashMap so ExtraInfo can implement Hash.
- pub bridge_ips: BTreeMap<String, u32>, // TODO: What size for count?
- }
- fn get_extra_info_or_error(entry: &HashMap<String, String>) -> Result<ExtraInfo, String> {
- if !entry.contains_key("nickname") || !entry.contains_key("fingerprint") {
- // How did we get here??
- return Err("Cannot parse extra-info: Missing nickname or fingerprint".to_string());
- }
- if !(entry.contains_key("bridge-stats-end") || entry.contains_key("published"))
- || !entry.contains_key("bridge-ips")
- {
- // Some extra-infos are missing data on connecting IPs...
- // But we can't do anything in that case.
- return Err(format!(
- "Failed to parse extra-info for {} {}",
- entry.get("nickname").unwrap(),
- entry.get("fingerprint").unwrap()
- ));
- }
- let nickname = entry.get("nickname").unwrap().to_string();
- let fingerprint_str = entry.get("fingerprint").unwrap();
- if fingerprint_str.len() != 40 {
- return Err("Fingerprint must be 20 bytes".to_string());
- }
- let fingerprint = array_bytes::hex2array(fingerprint_str).unwrap();
- let date: u32 = {
- let date_str = if entry.contains_key("bridge-stats-end") {
- let line = entry.get("bridge-stats-end").unwrap();
- // Parse out (86400 s) from end of line
- &line[..line.find("(").unwrap() - 1]
- } else {
- entry.get("published").unwrap().as_str()
- };
- JulianDay::from(
- DateTime::parse_from_str(&(date_str.to_owned() + " +0000"), "%F %T %z")
- .unwrap()
- .date_naive(),
- )
- .inner()
- .try_into()
- .unwrap()
- };
- let bridge_ips_str = entry.get("bridge-ips").unwrap();
- let mut bridge_ips: BTreeMap<String, u32> = BTreeMap::new();
- let countries: Vec<&str> = bridge_ips_str.split(',').collect();
- for country in countries {
- if country != "" {
- // bridge-ips may be empty
- let (cc, count) = country.split_once('=').unwrap();
- bridge_ips.insert(cc.to_string(), count.parse::<u32>().unwrap());
- }
- }
- Ok(ExtraInfo {
- nickname,
- fingerprint,
- date,
- bridge_ips,
- })
- }
- pub fn add_extra_infos<'a>(filename: &str, set: &mut HashSet<ExtraInfo>) {
- let infile = File::open(format!("{}/{}", DIRECTORY, filename)).unwrap();
- let reader = BufReader::new(infile);
- let mut entry = HashMap::<String, String>::new();
- for line in reader.lines() {
- let line = line.unwrap();
- if line.starts_with("@type bridge-extra-info ") {
- if !entry.is_empty() {
- let extra_info = get_extra_info_or_error(&entry);
- if extra_info.is_ok() {
- set.insert(extra_info.unwrap());
- } else {
- // Just print the error and continue.
- println!("{}", extra_info.err().unwrap());
- }
- entry = HashMap::<String, String>::new();
- }
- } else {
- if line.starts_with("extra-info ") {
- // extra-info line has format:
- // extra-info <nickname> <fingerprint>
- let line_split: Vec<&str> = line.split(' ').collect();
- if line_split.len() != 3 {
- println!("Misformed extra-info line");
- } else {
- entry.insert("nickname".to_string(), line_split[1].to_string());
- entry.insert("fingerprint".to_string(), line_split[2].to_string());
- }
- } else {
- let (key, value) = match line.split_once(' ') {
- Some((k, v)) => (k, v),
- None => (line.as_str(), ""),
- };
- entry.insert(key.to_string(), value.to_string());
- }
- }
- }
- // Do for the last one
- let extra_info = get_extra_info_or_error(&entry);
- if extra_info.is_ok() {
- set.insert(extra_info.unwrap());
- } else {
- println!("{}", extra_info.err().unwrap());
- }
- }
- /// Download new extra-infos files and save them in DIRECTORY. This function
- /// returns the set of newly downloaded filenames.
- pub async fn download_extra_infos(
- ) -> Result<HashSet<String>, Box<dyn std::error::Error + Send + Sync>> {
- // Download directory of recent extra-infos
- let base_url = "https://collector.torproject.org/recent/bridge-descriptors/extra-infos/";
- let url = base_url.parse().unwrap();
- let https = hyper_rustls::HttpsConnectorBuilder::new()
- .with_native_roots() // TODO: Pin certificate? Is this data signed/verifiable?
- .expect("no native root CA certificates found")
- .https_only()
- .enable_http1()
- .build();
- let client: Client<_, Empty<Bytes>> = Client::builder(TokioExecutor::new()).build(https);
- println!("Downloading {}", base_url);
- let mut res = client.get(url).await?;
- assert_eq!(res.status(), StatusCode::OK);
- let mut body_str = String::from("");
- while let Some(next) = res.frame().await {
- let frame = next?;
- if let Some(chunk) = frame.data_ref() {
- body_str.push_str(&String::from_utf8(chunk.to_vec())?);
- }
- }
- let doc = Document::from(body_str.as_str());
- // Create extra-infos directory if it doesn't exist
- std::fs::create_dir_all(&DIRECTORY)?;
- let mut new_files = HashSet::<String>::new();
- // Go through all the links in the page and download new files
- let links = doc.find(Name("a")).filter_map(|n| n.attr("href"));
- for link in links {
- if link.ends_with("-extra-infos") {
- let filename = format!("{}/{}", DIRECTORY, link);
- // Download file if it's not already downloaded
- if !Path::new(&filename).exists() {
- let extra_infos_url = format!("{}{}", base_url, link);
- println!("Downloading {}", extra_infos_url);
- let mut res = client.get(extra_infos_url.parse().unwrap()).await?;
- assert_eq!(res.status(), StatusCode::OK);
- let mut file = std::fs::File::create(filename).unwrap();
- while let Some(next) = res.frame().await {
- let frame = next?;
- if let Some(chunk) = frame.data_ref() {
- file.write_all(&chunk)?;
- }
- }
- new_files.insert(link.to_string());
- }
- }
- }
- Ok(new_files)
- }
|