get-bridge-data.sh 2.8 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374
  1. #!/bin/bash
  # Populate the "months" array with the YYYY-MM labels of the archives we
  # care about: July 2020 through April 2021. The later download and
  # extraction steps iterate over this array.
  months=()
  # 2020
  for ((i = 7; i <= 12; i++)); do
    # printf -v formats straight into a variable, so no subshell is forked.
    printf -v month '2020-%02d' "$i"
    months+=("$month")
  done
  # 2021
  for ((i = 1; i <= 4; i++)); do
    printf -v month '2021-%02d' "$i"
    months+=("$month")
  done
  # Download the archives if we don't have them already
  for i in "${months[@]}"; do
    archive="data/bridge-extra-infos-${i}.tar.xz"
    if [[ ! -f "$archive" ]]; then
      # --fail makes curl return non-zero on an HTTP error (e.g. 404) instead
      # of saving the server's error page as the archive. The download goes to
      # a temporary ".part" name and is renamed only on success, so that an
      # interrupted transfer is not mistaken for a complete archive next run.
      if ! curl -fLo "${archive}.part" "https://collector.torproject.org/archive/bridge-descriptors/extra-infos/bridge-extra-infos-${i}.tar.xz"; then
        rm -f -- "${archive}.part"
        exit 1
      fi
      mv -- "${archive}.part" "$archive" || exit 1
    fi
  done
  # Check that we have the right archives
  sha256sum -c data/bridge-extra-infos.sha256 || exit 1
  # If we haven't already extracted the bridge data, then do so. This will
  # take a long time (around 12.5 hours on my device) because it needs to
  # process around 3 million small files, and it will require a few GB of
  # free space while running. In the end, this results in about 91 MB of
  # bridge data that we care about.
  #
  # Output: one file per bridge, data/bridge_data/<fingerprint>, with one
  # "<julian_day>,<count>" line appended per descriptor, where <count> is the
  # "by=" entry of the descriptor's bridge-ips line (0 if absent).
  #
  # NOTE: the presence of data/bridge_data is the only "already done" marker,
  # so if a run is interrupted, remove data/bridge_data before re-running.
  if [[ ! -d data/bridge_data ]]; then
    cd data || exit 1
    # The per-bridge files are appended to below, so their directory must
    # exist before the first write.
    mkdir -p bridge_data || exit 1
    # This is around 20 GB of data uncompressed, so don't extract it all
    # at once. Instead, extract and process one month at a time.
    for i in "${months[@]}"; do
      month_dir="bridge-extra-infos-${i}"
      if [[ ! -d "$month_dir" ]]; then
        echo "Extracting ${month_dir}.tar.xz"
        tar xf "${month_dir}.tar.xz" || exit 1
      fi
      echo "Processing ${month_dir}"
      for j in "$month_dir"/*; do
        for k in "$j"/*; do
          for l in "$k"/*; do
            # Skip empty files. This also skips the literal pattern that is
            # left behind when a glob above matches nothing.
            [[ -s "$l" ]] || continue

            # "extra-info <nickname> <fingerprint>": keep the last field.
            fingerprint=$(grep -Po '(?<=^extra-info )(.*)(?=$)' "$l" | grep -Po '(?<= )(.*)(?=$)')
            # "published <date> <time>": keep only the date part.
            date=$(grep -Po '(?<=^published )(.*)(?= )' "$l")

            # Convert to Julian date, thanks to
            # https://stackoverflow.com/a/43318209
            # Parse in UTC (-u) so the day number does not depend on the
            # local time zone, and only when a date was actually found:
            # GNU date treats an empty -d string as "today".
            date_julian=
            if [[ -n "$date" ]]; then
              epoch=$(date -u -d "$date" +%s) \
                && date_julian=$(( epoch / 86400 + 2440587 ))
            fi

            # Value of the "by=" entry on the bridge-ips line, default 0.
            count=$(grep -Po '(?<=^bridge-ips )(.*)(?=$)' "$l" | grep -Po '(?<=by=)(.*?)(?=(,|$))')
            count=${count:-0}

            if [[ -n "$date_julian" && -n "$fingerprint" ]]; then
              echo "${date_julian},${count}" >> "bridge_data/${fingerprint}"
            else
              {
                echo "Error for ${l}"
                echo " fingerprint: ${fingerprint}"
                echo " date: ${date_julian}"
                echo " count: ${count}"
              } >&2
            fi
          done
        done
      done
      echo "Removing ${month_dir} directory to free up space"
      rm -r -- "$month_dir"
    done
    cd .. || exit 1
  fi