#!/bin/bash
# get-bridge-data.sh
#
# Download the monthly bridge extra-info archives (2020-07 through 2021-04)
# from collector.torproject.org, verify them against
# data/bridge-extra-infos.sha256, and reduce them to one small file per
# bridge fingerprint under data/bridge_data/.
#
# Run from the directory that contains data/. Each output file
# data/bridge_data/<fingerprint> receives one "<julian_day>,<count>" line per
# descriptor, where <count> is the value following "by=" on the descriptor's
# bridge-ips line (0 when there is none).
#
# Requires GNU grep (-P), GNU date (-d), curl, tar with xz support, sha256sum.

readonly ARCHIVE_URL='https://collector.torproject.org/archive/bridge-descriptors/extra-infos'

#######################################
# Append the record for a single descriptor file to the per-bridge output.
# Arguments:
#   $1 - path of the descriptor file to read
#   $2 - directory that holds the per-fingerprint output files
# Outputs:
#   Appends "<julian_day>,<count>" to $2/<fingerprint>. If the fingerprint or
#   the publication date cannot be determined, writes a diagnostic to stderr
#   and appends nothing.
#######################################
process_descriptor() {
  local file=$1
  local out_dir=$2
  local fingerprint published epoch date_julian count

  # First grep: everything after "extra-info " on that line.
  # Second grep: everything after the first space of that remainder.
  fingerprint=$(grep -Po '(?<=^extra-info )(.*)(?=$)' "${file}" \
    | grep -Po '(?<= )(.*)(?=$)')

  # Greedy match up to the last space of the "published" line, i.e. the
  # leading date part when the line has the form "published <date> <time>".
  published=$(grep -Po '(?<=^published )(.*)(?= )' "${file}")

  # Convert to Julian date, thanks to https://stackoverflow.com/a/43318209
  # Only convert a non-empty string: `date -d ""` would silently resolve to
  # the current day. -u pins the interpretation to UTC so the integer day
  # number does not depend on the machine's timezone.
  date_julian=
  if [[ -n "${published}" ]]; then
    if epoch=$(date -u +%s -d "${published}"); then
      date_julian=$(( epoch / 86400 + 2440587 ))
    fi
  fi

  count=$(grep -Po '(?<=^bridge-ips )(.*)(?=$)' "${file}" \
    | grep -Po '(?<=by=)(.*?)(?=(,|$))')
  count=${count:-0}

  if [[ -n "${date_julian}" && -n "${fingerprint}" ]]; then
    printf '%s\n' "${date_julian},${count}" >> "${out_dir}/${fingerprint}"
  else
    {
      echo "Error for ${file}"
      echo " fingerprint: ${fingerprint}"
      echo " date: ${date_julian}"
      echo " count: ${count}"
    } >&2
  fi
}

# Populate array of months we care about
months=()
# 2020
for n in 7 8 9 10 11 12; do
  printf -v month '2020-%02d' "${n}"
  months+=("${month}")
done
# 2021
for n in 1 2 3 4; do
  printf -v month '2021-%02d' "${n}"
  months+=("${month}")
done

# Download the archives if we don't have them already
for month in "${months[@]}"; do
  archive="bridge-extra-infos-${month}.tar.xz"
  if [[ ! -f "data/${archive}" ]]; then
    # -f makes curl fail on an HTTP error instead of saving the error page as
    # the archive. Downloading to a .part name and renaming afterwards keeps
    # an interrupted transfer from being mistaken for a complete archive.
    curl -fLo "data/${archive}.part" "${ARCHIVE_URL}/${archive}" || exit 1
    mv -- "data/${archive}.part" "data/${archive}" || exit 1
  fi
done

# Check that we have the right archives
sha256sum -c data/bridge-extra-infos.sha256 || exit 1

# If we haven't already extracted the bridge data, then do so. This will
# take a long time (around 12.5 hours on my device) because it needs to
# process around 3 million small files, and it will require a few GB of
# free space while running. In the end, this results in about 91 MB of
# bridge data that we care about.
if [[ ! -d data/bridge_data ]]; then
  cd data || exit 1

  # Build the result in a staging directory and rename it only once every
  # month has been processed. Otherwise an interrupted run would leave a
  # partial bridge_data behind and the next run would skip processing.
  # Records are appended, so a stale staging directory must be discarded.
  staging=bridge_data.partial
  rm -rf -- "${staging}"
  mkdir -- "${staging}" || exit 1

  # This is around 20 GB of data uncompressed, so don't extract it all
  # at once. Instead, extract and process one month at a time.
  for month in "${months[@]}"; do
    month_dir="bridge-extra-infos-${month}"
    if [[ ! -d "${month_dir}" ]]; then
      echo "Extracting ${month_dir}.tar.xz"
      tar xf "${month_dir}.tar.xz" || exit 1
    fi

    echo "Processing ${month_dir}"
    # Descriptor files sit three directory levels below the month directory.
    for descriptor in "${month_dir}"/*/*/*; do
      # Skip empty files; this also skips the literal pattern that the glob
      # leaves behind when it matches nothing.
      [[ -s "${descriptor}" ]] || continue
      process_descriptor "${descriptor}" "${staging}"
    done

    echo "Removing ${month_dir} directory to free up space"
    rm -r -- "${month_dir}"
  done

  mv -- "${staging}" bridge_data || exit 1
  cd ..
fi