#!/bin/bash
#
# Prepare the per-bridge data files for the bridges distributed in
# 2021 February.
#
# Steps (each one is skipped when its output already exists):
#   1. If data/all-bridges is missing, run ./scripts/get-email-bridges.sh.
#   2. If data/bridge_data_cleaned/ is missing:
#        a. extract every data/bridge_data/*_processed.tar.xz in place;
#        b. for every fingerprint listed in data/all-bridges, merge all
#           data/bridge_data/*/bridge_data/<FINGERPRINT> files, keep one
#           line per day, and write the sorted result to
#           data/bridge_data_cleaned/<FINGERPRINT>.
#
# Must be run from the directory that contains data/ and scripts/,
# because every path below is relative.

#######################################
# Filter stdin, printing a line only when it does not start with the
# first seven characters of the most recently printed line.
# Fed with input that is grouped by that 7-character key, this keeps
# exactly the first line of every group.
# NOTE(review): the 7-character key is presumably the day stamp of a
# record -- confirm against the bridge data format.
# Outputs: the retained lines on stdout.
#######################################
keep_first_per_day() {
  local line
  # Sentinel that no real record is expected to start with, so the
  # very first line is always printed.
  local prev="placeholder"
  while read -r line; do
    if [[ "$line" != "$prev"* ]]; then
      printf '%s\n' "$line"
      prev="${line:0:7}"
    fi
  done
}

# If we haven't already extracted our set of bridges distributed in
# 2021 February, do that now.
if [[ ! -f data/all-bridges ]]; then
  echo "Getting list of bridges distributed in 2021 February"
  ./scripts/get-email-bridges.sh
fi

# Clean bridge data (sort, remove duplicates).
if [[ ! -d data/bridge_data_cleaned ]]; then
  echo "Extracting checkpoints"
  # Abort rather than extract archives into the wrong directory.
  cd data/bridge_data || exit 1
  for archive in *_processed.tar.xz; do
    echo "$archive"
    tar xf "$archive" || exit 1
  done
  cd ../.. || exit 1

  echo "Cleaning data for bridges distributed in 2021 February"

  # Check the input list *before* creating the output directory: an
  # empty data/bridge_data_cleaned/ left behind by a failed run would
  # make every later run skip the cleaning step.
  if [[ ! -r data/all-bridges ]]; then
    echo "Cannot read data/all-bridges" >&2
    exit 1
  fi
  mkdir data/bridge_data_cleaned || exit 1

  # `|| [[ -n "$fpr" ]]` also processes a final line that lacks a
  # trailing newline.
  while read -r fpr || [[ -n "$fpr" ]]; do
    # Data files are looked up by upper-case fingerprint.
    fpr=$(printf '%s' "$fpr" | tr '[:lower:]' '[:upper:]')
    if [[ -z "$fpr" ]]; then
      continue
    fi

    # Every extracted checkpoint that has a file for this bridge. When
    # nothing matches, the pattern stays literal and the -e test fails.
    sources=(data/bridge_data/*/bridge_data/"$fpr")

    # If there's any data on this bridge...
    if [[ -e "${sources[0]}" ]]; then
      # Get only the highest number of observed connections for each
      # day: reverse-numeric sort on characters 1-7 (the day key), then
      # on everything from character 9, so the largest value of each
      # day comes first; keep that first line per day and restore
      # ascending order.
      cat -- "${sources[@]}" \
        | sort -r -n -k1.1,1.7 -k1.9 \
        | keep_first_per_day \
        | sort > "data/bridge_data_cleaned/${fpr}"
    else
      echo "No data/bridge_data/${fpr}"
    fi
  done < data/all-bridges
fi