#!/bin/bash
# clean-bridge-data.sh
#
# Builds data/bridge_data_cleaned/ from data/bridge_data/: for every bridge
# fingerprint listed in data/all-bridges, the matching per-bridge file is
# reduced to one line per 7-character line prefix and written out sorted.
#
# All paths are relative, so the script presumably has to be run from the
# project root (the directory containing data/ and scripts/) -- TODO confirm.
# The work is skipped entirely when data/bridge_data_cleaned already exists.

# Print the de-duplicated, sorted contents of the per-bridge data file $1.
#
# Lines are ordered by descending numeric value of characters 1-7, then by
# descending numeric value of the text from character 9 onward; only the
# first line seen for each distinct 7-character prefix is kept, and the
# survivors are re-sorted in ascending order on stdout.
# NOTE(review): per the original comment this keeps "only highest number of
# observed connections for each day", i.e. the prefix is presumably a day key
# and column 9+ a connection count -- confirm against the data format.
keep_daily_max() {
  local prev="placeholder" line key
  sort -r -n -k1.1,1.7 -k1.9 "$1" \
    | while read -r line; do
      key="${line:0:7}"
      # Compare keys for equality rather than prefix-matching the whole
      # line: a line shorter than 7 characters (e.g. an empty one) must not
      # make every following line look like a duplicate.
      if [[ "$key" != "$prev" ]]; then
        printf '%s\n' "$line"
        prev="$key"
      fi
    done \
    | sort
}

# Entry point. Returns 0 on success (or when there is nothing to do) and
# non-zero when the bridge list is unavailable or the output directory
# cannot be created.
main() {
  local bridge_list="data/all-bridges"
  local raw_dir="data/bridge_data"
  local cleaned_dir="data/bridge_data_cleaned"
  # Results are built here and renamed into place only once complete, so an
  # interrupted or failed run is never mistaken for a finished one.
  local staging_dir="${cleaned_dir}.partial"
  local fpr

  # If we haven't already extracted our set of bridges distributed in 2021
  # Feb, do that now.
  if [ ! -f "$bridge_list" ]; then
    echo "Getting list of bridges distributed in 2021 February"
    ./scripts/get-email-bridges.sh
  fi

  # Clean bridge data (sort, remove duplicates)
  if [ ! -d "$cleaned_dir" ]; then
    if [ ! -f "$bridge_list" ]; then
      echo "Missing ${bridge_list}; cannot clean bridge data" >&2
      return 1
    fi

    echo "Cleaning data for bridges distributed in 2021 February"
    rm -rf -- "$staging_dir"   # discard leftovers from an aborted run
    mkdir "$staging_dir" || return 1

    # '|| [ -n "$fpr" ]' also processes a final line lacking a newline.
    while read -r fpr || [ -n "$fpr" ]; do
      # Data files are looked up by the upper-cased fingerprint.
      fpr=$(printf '%s' "$fpr" | tr '[:lower:]' '[:upper:]')
      if [ -n "$fpr" ]; then
        if [ -f "${raw_dir}/${fpr}" ]; then
          keep_daily_max "${raw_dir}/${fpr}" > "${staging_dir}/${fpr}"
        else
          echo "No data/bridge_data/${fpr}"
        fi
      fi
    done < "$bridge_list"

    mv -- "$staging_dir" "$cleaned_dir" || return 1
  fi
}

main "$@"