#!/bin/bash
#
# Download the Tor bridge extra-info descriptor archives for the months of
# interest, verify them against data/bridge-extra-infos.sha256, and reduce
# them to one file per bridge fingerprint under data/bridge_data. Each line
# of such a file is "<julian_date>,<count>", where <count> is the "by="
# value taken from the descriptor's bridge-ips line (0 when there is none).
#
# All paths are relative, so run this from the directory that contains data/.
#
# NOTE(review): data/bridge_data is created before any month is processed and
# its existence is what skips the extraction step, so after an interrupted
# run it has to be removed by hand before retrying.

# Populate array of months we care about
months=()
# 2020
for i in {7..12}; do
  months+=("2020-$(printf '%02d' "${i}")")
done
# 2021
for i in {1..4}; do
  months+=("2021-$(printf '%02d' "${i}")")
done

# Download the archives if we don't have them already
for i in "${months[@]}"; do
  archive="data/bridge-extra-infos-${i}.tar.xz"
  if [[ ! -f "${archive}" ]]; then
    # -f makes curl fail on HTTP errors instead of saving the error page.
    # On any failure, remove what was written so that the next run downloads
    # the archive again instead of tripping over a partial file.
    if ! curl -fLo "${archive}" \
      "https://collector.torproject.org/archive/bridge-descriptors/extra-infos/bridge-extra-infos-${i}.tar.xz"; then
      rm -f -- "${archive}"
      exit 1
    fi
  fi
done

# Check that we have the right archives
sha256sum -c data/bridge-extra-infos.sha256 || exit 1

# If we haven't already extracted the bridge data, then do so. This will
# take a long time (around 12.5 hours on my device) because it needs to
# process around 3 million small files, and it will require a few GB of
# free space while running. In the end, this results in about 91 MB of
# bridge data that we care about.
if [[ ! -d data/bridge_data ]]; then
  cd data || exit 1
  mkdir bridge_data || exit 1

  # This is around 20 GB of data uncompressed, so don't extract it all
  # at once. Instead, extract and process one month at a time.
  for i in "${months[@]}"; do
    month_dir="bridge-extra-infos-${i}"

    if [[ ! -d "${month_dir}" ]]; then
      echo "Extracting ${month_dir}.tar.xz"
      tar xf "${month_dir}.tar.xz" || exit 1
    fi

    echo "Processing ${month_dir}"
    # Descriptor files sit three directory levels below the month directory.
    for j in "${month_dir}"/*; do
      for k in "${j}"/*; do
        for l in "${k}"/*; do
          # Test the descriptor file itself: this skips empty files and the
          # literal "dir/*" word left behind when a glob matches nothing.
          [[ -s "${l}" ]] || continue

          # "extra-info <nickname> <fingerprint>": keep the last field.
          fingerprint=$(grep -Po '(?<=^extra-info )(.*)(?=$)' "${l}" \
            | grep -Po '(?<= )(.*)(?=$)')

          # "published <date> <time>": keep only the date part.
          published_date=$(grep -Po '(?<=^published )(.*)(?= )' "${l}")

          # Convert to Julian date, thanks to
          # https://stackoverflow.com/a/43318209
          # The date is parsed as UTC (-u): parsed at local midnight it
          # would land on the previous day after the integer division on
          # machines east of UTC. A missing or unparsable date leaves
          # date_julian empty so that it is reported below; GNU date would
          # otherwise read an empty string as "today", and a failed
          # conversion would leave the arithmetic without an operand.
          date_julian=
          if [[ -n "${published_date}" ]] \
            && epoch=$(date -u +%s -d "${published_date}" 2>/dev/null); then
            date_julian=$(( epoch / 86400 + 2440587 ))
          fi

          # "bridge-ips by=8,ru=8,...": keep the value reported for "by".
          count=$(grep -Po '(?<=^bridge-ips )(.*)(?=$)' "${l}" \
            | grep -Po '(?<=by=)(.*?)(?=(,|$))')
          if [[ -z "${count}" ]]; then
            count=0
          fi

          if [[ -n "${date_julian}" && -n "${fingerprint}" ]]; then
            echo "${date_julian},${count}" >> "bridge_data/${fingerprint}"
          else
            echo "Error for ${l}"
            echo " fingerprint: ${fingerprint}"
            echo " date: ${date_julian}"
            echo " count: ${count}"
          fi
        done
      done
    done

    echo "Removing ${month_dir} directory to free up space"
    rm -r -- "${month_dir}"
  done

  cd .. || exit 1
fi