extra_info.rs 7.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209
  1. /*! Fields we need from the extra-info documents for bridges...
  2. Note, this is NOT a complete implementation of the document format.
  3. (https://spec.torproject.org/dir-spec/extra-info-document-format.html) */
  4. use chrono::DateTime;
  5. use http::status::StatusCode;
  6. use http_body_util::{BodyExt, Empty};
  7. use hyper::body::Bytes;
  8. use hyper_util::{client::legacy::Client, rt::TokioExecutor};
  9. use julianday::JulianDay;
  10. use select::{document::Document, predicate::Name};
  11. use serde::{Deserialize, Serialize};
  12. use std::{
  13. collections::{BTreeMap, HashMap, HashSet},
  14. fs::File,
  15. io::{prelude::*, BufReader, Write},
  16. path::Path,
  17. };
  18. /// Directory (a path relative to the working directory) where extra-infos files are stored and read from
  19. pub const DIRECTORY: &str = "extra_infos";
  20. /// Fields we need from extra-info document
  21. #[derive(Eq, PartialEq, Hash, Serialize, Deserialize)]
  22. pub struct ExtraInfo {
  23. /// Bridge nickname, probably unused
  24. pub nickname: String,
  25. /// Bridge fingerprint, a SHA-1 hash of the bridge ID
  26. pub fingerprint: [u8; 20],
  27. /// Date (in UTC) that this document covered (bridge-stats-end if
  28. /// available) or that the document was published (published), stored
  29. /// as a Julian date because we don't need to know more precisely than
  30. /// the day.
  31. pub date: u32,
  32. /// Map of country codes and how many users (rounded up to a multiple of
  33. /// 8) have connected to that bridge during the day.
  34. /// Uses BTreeMap instead of HashMap so ExtraInfo can implement Hash.
  35. pub bridge_ips: BTreeMap<String, u32>, // TODO: What size for count?
  36. }
  37. fn get_extra_info_or_error(entry: &HashMap<String, String>) -> Result<ExtraInfo, String> {
  38. if !entry.contains_key("nickname") || !entry.contains_key("fingerprint") {
  39. // How did we get here??
  40. return Err("Cannot parse extra-info: Missing nickname or fingerprint".to_string());
  41. }
  42. if !(entry.contains_key("bridge-stats-end") || entry.contains_key("published"))
  43. || !entry.contains_key("bridge-ips")
  44. {
  45. // Some extra-infos are missing data on connecting IPs...
  46. // But we can't do anything in that case.
  47. return Err(format!(
  48. "Failed to parse extra-info for {} {}",
  49. entry.get("nickname").unwrap(),
  50. entry.get("fingerprint").unwrap()
  51. ));
  52. }
  53. let nickname = entry.get("nickname").unwrap().to_string();
  54. let fingerprint_str = entry.get("fingerprint").unwrap();
  55. if fingerprint_str.len() != 40 {
  56. return Err("Fingerprint must be 20 bytes".to_string());
  57. }
  58. let fingerprint = array_bytes::hex2array(fingerprint_str).unwrap();
  59. let date: u32 = {
  60. let date_str = if entry.contains_key("bridge-stats-end") {
  61. let line = entry.get("bridge-stats-end").unwrap();
  62. // Parse out (86400 s) from end of line
  63. &line[..line.find("(").unwrap() - 1]
  64. } else {
  65. entry.get("published").unwrap().as_str()
  66. };
  67. JulianDay::from(
  68. DateTime::parse_from_str(&(date_str.to_owned() + " +0000"), "%F %T %z")
  69. .unwrap()
  70. .date_naive(),
  71. )
  72. .inner()
  73. .try_into()
  74. .unwrap()
  75. };
  76. let bridge_ips_str = entry.get("bridge-ips").unwrap();
  77. let mut bridge_ips: BTreeMap<String, u32> = BTreeMap::new();
  78. let countries: Vec<&str> = bridge_ips_str.split(',').collect();
  79. for country in countries {
  80. if country != "" {
  81. // bridge-ips may be empty
  82. let (cc, count) = country.split_once('=').unwrap();
  83. bridge_ips.insert(cc.to_string(), count.parse::<u32>().unwrap());
  84. }
  85. }
  86. Ok(ExtraInfo {
  87. nickname,
  88. fingerprint,
  89. date,
  90. bridge_ips,
  91. })
  92. }
  93. pub fn add_extra_infos<'a>(filename: &str, set: &mut HashSet<ExtraInfo>) {
  94. let infile = File::open(format!("{}/{}", DIRECTORY, filename)).unwrap();
  95. let reader = BufReader::new(infile);
  96. let mut entry = HashMap::<String, String>::new();
  97. for line in reader.lines() {
  98. let line = line.unwrap();
  99. if line.starts_with("@type bridge-extra-info ") {
  100. if !entry.is_empty() {
  101. let extra_info = get_extra_info_or_error(&entry);
  102. if extra_info.is_ok() {
  103. set.insert(extra_info.unwrap());
  104. } else {
  105. // Just print the error and continue.
  106. println!("{}", extra_info.err().unwrap());
  107. }
  108. entry = HashMap::<String, String>::new();
  109. }
  110. } else {
  111. if line.starts_with("extra-info ") {
  112. // extra-info line has format:
  113. // extra-info <nickname> <fingerprint>
  114. let line_split: Vec<&str> = line.split(' ').collect();
  115. if line_split.len() != 3 {
  116. println!("Misformed extra-info line");
  117. } else {
  118. entry.insert("nickname".to_string(), line_split[1].to_string());
  119. entry.insert("fingerprint".to_string(), line_split[2].to_string());
  120. }
  121. } else {
  122. let (key, value) = match line.split_once(' ') {
  123. Some((k, v)) => (k, v),
  124. None => (line.as_str(), ""),
  125. };
  126. entry.insert(key.to_string(), value.to_string());
  127. }
  128. }
  129. }
  130. // Do for the last one
  131. let extra_info = get_extra_info_or_error(&entry);
  132. if extra_info.is_ok() {
  133. set.insert(extra_info.unwrap());
  134. } else {
  135. println!("{}", extra_info.err().unwrap());
  136. }
  137. }
  138. /// Download new extra-infos files and save them in DIRECTORY. This function
  139. /// returns the set of newly downloaded filenames.
  140. pub async fn download_extra_infos(
  141. ) -> Result<HashSet<String>, Box<dyn std::error::Error + Send + Sync>> {
  142. // Download directory of recent extra-infos
  143. let base_url = "https://collector.torproject.org/recent/bridge-descriptors/extra-infos/";
  144. let url = base_url.parse().unwrap();
  145. let https = hyper_rustls::HttpsConnectorBuilder::new()
  146. .with_native_roots() // TODO: Pin certificate? Is this data signed/verifiable?
  147. .expect("no native root CA certificates found")
  148. .https_only()
  149. .enable_http1()
  150. .build();
  151. let client: Client<_, Empty<Bytes>> = Client::builder(TokioExecutor::new()).build(https);
  152. println!("Downloading {}", base_url);
  153. let mut res = client.get(url).await?;
  154. assert_eq!(res.status(), StatusCode::OK);
  155. let mut body_str = String::from("");
  156. while let Some(next) = res.frame().await {
  157. let frame = next?;
  158. if let Some(chunk) = frame.data_ref() {
  159. body_str.push_str(&String::from_utf8(chunk.to_vec())?);
  160. }
  161. }
  162. let doc = Document::from(body_str.as_str());
  163. // Create extra-infos directory if it doesn't exist
  164. std::fs::create_dir_all(&DIRECTORY)?;
  165. let mut new_files = HashSet::<String>::new();
  166. // Go through all the links in the page and download new files
  167. let links = doc.find(Name("a")).filter_map(|n| n.attr("href"));
  168. for link in links {
  169. if link.ends_with("-extra-infos") {
  170. let filename = format!("{}/{}", DIRECTORY, link);
  171. // Download file if it's not already downloaded
  172. if !Path::new(&filename).exists() {
  173. let extra_infos_url = format!("{}{}", base_url, link);
  174. println!("Downloading {}", extra_infos_url);
  175. let mut res = client.get(extra_infos_url.parse().unwrap()).await?;
  176. assert_eq!(res.status(), StatusCode::OK);
  177. let mut file = std::fs::File::create(filename).unwrap();
  178. while let Some(next) = res.frame().await {
  179. let frame = next?;
  180. if let Some(chunk) = frame.data_ref() {
  181. file.write_all(&chunk)?;
  182. }
  183. }
  184. new_files.insert(link.to_string());
  185. }
  186. }
  187. }
  188. Ok(new_files)
  189. }