clean-bridge-data.sh 1.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142
  1. #!/bin/bash
  2. # If we haven't already extracted our set of bridges distributed in 2021
  3. # Feb, do that now.
  4. if [ ! -f data/all-bridges ]; then
  5. echo "Getting list of bridges distributed in 2021 February"
  6. ./scripts/get-email-bridges.sh
  7. fi
  8. # Clean bridge data (sort, remove duplicates)
  9. if [ ! -d data/bridge_data_cleaned ]; then
  10. echo "Extracting checkpoints"
  11. cd data/bridge_data
  12. for i in *_processed.tar.xz; do
  13. echo "$i"
  14. tar xf "$i" || exit 1
  15. done
  16. cd ../..
  17. echo "Cleaning data for bridges distributed in 2021 February"
  18. mkdir data/bridge_data_cleaned
  19. while read fpr; do
  20. fpr=$(echo -n "$fpr" | tr '[:lower:]' '[:upper:]')
  21. if [ -n "$fpr" ]; then
  22. # If there's any data on this bridge...
  23. if $(find data/bridge_data/ | grep -q "$fpr"); then
  24. # Get only highest number of observed connections for each day
  25. pref="placeholder"
  26. cat data/bridge_data/*/bridge_data/${fpr} | \
  27. sort -r -n -k1.1,1.7 -k1.9 | \
  28. while read line; do
  29. if [[ "$line" != "$pref"* ]]; then
  30. echo "$line"
  31. pref="${line:0:7}"
  32. fi
  33. done | sort > data/bridge_data_cleaned/${fpr}
  34. else
  35. echo "No data/bridge_data/${fpr}"
  36. fi
  37. fi
  38. done < data/all-bridges
  39. fi