| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576 |
#!/bin/bash
# Populate the array of months we care about, formatted as YYYY-MM:
# July 2020 through April 2021.
months=()
# 2020
for i in {7..12}; do
  # printf -v formats straight into a variable, so no subshell is needed.
  printf -v entry '2020-%02d' "${i}"
  months+=("${entry}")
done
# 2021
for i in {1..4}; do
  printf -v entry '2021-%02d' "${i}"
  months+=("${entry}")
done
unset -v entry
#######################################
# Download the archives if we don't have them already.
# Globals:
#   months (read) - list of YYYY-MM strings naming the archives to fetch
# Outputs:
#   Writes data/bridge-extra-infos-<month>.tar.xz for every month whose
#   archive is not present yet.
# Returns:
#   0 if every archive is present afterwards, 1 on the first failed download.
#######################################
download_archives() {
  local base_url='https://collector.torproject.org/archive/bridge-descriptors/extra-infos'
  local month archive

  for month in "${months[@]}"; do
    archive="data/bridge-extra-infos-${month}.tar.xz"
    if [[ -f "${archive}" ]]; then
      continue
    fi
    # -f makes curl exit non-zero on an HTTP error instead of saving the
    # error page as if it were the archive. The transfer goes to a ".part"
    # file that is renamed only on success; otherwise an interrupted download
    # would leave a file that the existence test above treats as complete.
    if ! curl -fLo "${archive}.part" "${base_url}/bridge-extra-infos-${month}.tar.xz"; then
      rm -f -- "${archive}.part"
      return 1
    fi
    mv -- "${archive}.part" "${archive}" || return 1
  done
}

download_archives || exit 1
# Verify the archives against the checksum list shipped in
# data/bridge-extra-infos.sha256; stop here if sha256sum reports a failure.
if ! sha256sum -c data/bridge-extra-infos.sha256; then
  exit 1
fi
# If we haven't already extracted the bridge data, then do so. This will
# take a long time (the original author reported around 12.5 hours) because
# it needs to process around 3 million small files, and it will require a
# few GB of free space while running. In the end, this results in about
# 91 MB of bridge data that we care about.

#######################################
# Parse one descriptor file and append a "<julian day>,<count>" record to
# the output file named after the descriptor's fingerprint.
# Arguments:
#   $1 - path of the descriptor file
#   $2 - directory that holds the per-fingerprint output files
# Outputs:
#   Appends one line to "$2/<fingerprint>". Writes a diagnostic to stderr
#   when the fingerprint or the publication date cannot be determined.
# Returns:
#   0 if a record was written or the file was skipped (missing or empty),
#   1 if the descriptor could not be parsed.
#######################################
process_descriptor() {
  local descriptor=$1
  local out_dir=$2
  local fingerprint published epoch count
  local date_julian=''

  # Skip empty files, and also the literal pattern that an unmatched glob
  # leaves behind in the caller's loops.
  [[ -f "${descriptor}" && -s "${descriptor}" ]] || return 0

  # On the "extra-info" line, keep everything after the first word that
  # follows the keyword.
  fingerprint=$(grep -Po '(?<=^extra-info )(.*)(?=$)' -- "${descriptor}" \
    | grep -Po '(?<= )(.*)(?=$)')
  # On the "published" line the greedy match stops at the last space, which
  # drops the trailing time field and keeps the date.
  published=$(grep -Po '(?<=^published )(.*)(?= )' -- "${descriptor}")
  # On the "bridge-ips" line, keep the value of the "by=" entry, if any.
  count=$(grep -Po '(?<=^bridge-ips )(.*)(?=$)' -- "${descriptor}" \
    | grep -Po '(?<=by=)(.*?)(?=(,|$))')
  if [[ -z "${count}" ]]; then
    count=0
  fi

  # Convert to Julian date, thanks to
  # https://stackoverflow.com/a/43318209
  # Only a non-empty date is converted: GNU date reads an empty string as
  # "start of today", which would silently produce a wrong record. -u keeps
  # the day number independent of the local time zone; a date-only string
  # parsed in a zone east of UTC lands in the previous UTC day.
  if [[ -n "${published}" ]] \
    && epoch=$(date -u -d "${published}" +%s 2>/dev/null); then
    date_julian=$(( epoch / 86400 + 2440587 ))
  fi

  # The fingerprint becomes a file name, so it must be a single path
  # component on a single line.
  if [[ -z "${date_julian}" || -z "${fingerprint}" \
    || "${fingerprint}" == */* || "${fingerprint}" == *$'\n'* ]]; then
    {
      echo "Error for ${descriptor}"
      echo " fingerprint: ${fingerprint}"
      echo " date: ${date_julian}"
      echo " count: ${count}"
    } >&2
    return 1
  fi

  echo "${date_julian},${count}" >> "${out_dir}/${fingerprint}"
}

#######################################
# Build <data dir>/bridge_data from the monthly archives.
# Globals:
#   months (read) - list of YYYY-MM strings naming the archives to process
# Arguments:
#   $1 - directory that contains the bridge-extra-infos-<month>.tar.xz files
# Outputs:
#   Progress messages on stdout, parse diagnostics on stderr.
# Returns:
#   0 on success, 1 if the staging directory cannot be prepared, an archive
#   cannot be extracted, or the final rename fails.
#######################################
extract_bridge_data() {
  local data_dir=$1
  local final_dir="${data_dir}/bridge_data"
  local staging_dir="${data_dir}/bridge_data.partial"
  local month month_name month_dir level1 level2 descriptor

  # Records are collected in a staging directory that is renamed only after
  # every month has been processed. An interrupted run therefore never leaves
  # a partial bridge_data directory that later runs would take as complete.
  rm -rf -- "${staging_dir}" || return 1
  mkdir -p -- "${staging_dir}" || return 1

  # This is around 20 GB of data uncompressed, so don't extract it all
  # at once. Instead, extract and process one month at a time.
  for month in "${months[@]}"; do
    month_name="bridge-extra-infos-${month}"
    month_dir="${data_dir}/${month_name}"
    if [[ ! -d "${month_dir}" ]]; then
      echo "Extracting ${month_name}.tar.xz"
      tar -xf "${month_dir}.tar.xz" -C "${data_dir}" || return 1
    fi
    echo "Processing ${month_name}"
    for level1 in "${month_dir}"/*; do
      for level2 in "${level1}"/*; do
        for descriptor in "${level2}"/*; do
          # Best effort: a descriptor that cannot be parsed is reported by
          # process_descriptor and skipped, so its status is ignored here.
          process_descriptor "${descriptor}" "${staging_dir}"
        done
      done
    done
    echo "Removing ${month_name} directory to free up space"
    rm -r -- "${month_dir}"
  done

  mv -- "${staging_dir}" "${final_dir}" || return 1
}

if [[ ! -d data/bridge_data ]]; then
  extract_bridge_data data || exit 1
fi
|