|
|
@@ -1,5 +1,10 @@
|
|
|
#!/bin/bash
|
|
|
|
|
|
+case "$1" in # Only the first argument is inspected
|
|
|
+ -p) parallel=true ;; # -p requests parallel extraction
|
|
|
+ *) parallel=false ;; # Anything else (or nothing) means sequential
|
|
|
+esac
|
|
|
+
|
|
|
# Populate array of months we care about
|
|
|
months=()
|
|
|
# 2020
|
|
|
@@ -21,56 +26,24 @@ done
|
|
|
# Check that we have the right archives
|
|
|
sha256sum -c data/bridge-extra-infos.sha256 || exit 1
|
|
|
|
|
|
-# If we haven't already extracted the bridge data, then do so. This will
|
|
|
-# take a long time (around 12.5 hours on my device) because it needs to
|
|
|
-# process around 3 million small files, and it will require a few GB of
|
|
|
-# free space while running. In the end, this results in about 91 MB of
|
|
|
-# bridge data that we care about.
|
|
|
-if [ ! -d data/bridge_data ]; then
|
|
|
-
|
|
|
- cd data
|
|
|
-
|
|
|
- mkdir bridge_data
|
|
|
-
|
|
|
- # This is around 20 GB of data uncompressed, so don't extract it all
|
|
|
- # at once. Instead, extract and process one month at a time.
|
|
|
+# Extract the data for each month, stopping if any extraction fails
|
|
|
+if [ "$parallel" == "true" ]; then
|
|
|
+ # Do it in parallel, remembering the PID of every background job
|
|
|
+ pids=()
|
|
|
for i in ${months[@]}; do
|
|
|
- if [ ! -d bridge-extra-infos-${i} ]; then
|
|
|
- echo "Extracting bridge-extras-infos-${i}.tar.xz"
|
|
|
- tar xf bridge-extra-infos-${i}.tar.xz || exit 1
|
|
|
- fi
|
|
|
-
|
|
|
- echo "Processing bridge-extra-infos-${i}"
|
|
|
- for j in bridge-extra-infos-${i}/*; do
|
|
|
- for k in ${j}/*; do
|
|
|
- for l in ${k}/*; do
|
|
|
- if [[ -s "${k}" ]]; then
|
|
|
- fingerprint=$(grep -Po '(?<=^extra-info )(.*)(?=$)' "${l}" | grep -Po '(?<= )(.*)(?=$)')
|
|
|
- date=$(grep -Po '(?<=^published )(.*)(?= )' "${l}")
|
|
|
- # Convert to Julian date, thanks to
|
|
|
- # https://stackoverflow.com/a/43318209
|
|
|
- date_julian=$(( $(date +%s -d "${date}") / 86400 + 2440587 ))
|
|
|
- count=$(grep -Po '(?<=^bridge-ips )(.*)(?=$)' "${l}" | grep -Po '(?<=by=)(.*?)(?=(,|$))')
|
|
|
- if [ -z "$count" ]; then
|
|
|
- count=0
|
|
|
- fi
|
|
|
-
|
|
|
- if [[ -n "${date_julian}" && -n "${fingerprint}" ]]; then
|
|
|
- echo "${date_julian},${count}" >> bridge_data/${fingerprint}
|
|
|
- else
|
|
|
- echo "Error for ${l}"
|
|
|
- echo " fingerprint: ${fingerprint}"
|
|
|
- echo " date: ${date_julian}"
|
|
|
- echo " count: ${count}"
|
|
|
- fi
|
|
|
- fi
|
|
|
- done
|
|
|
- done
|
|
|
- done
|
|
|
-
|
|
|
- echo "Removing bridge-extra-infos-${i} directory to free up space"
|
|
|
- rm -r bridge-extra-infos-${i}
|
|
|
+ ./scripts/extract-extra-infos-archive.sh "bridge-extra-infos-${i}.tar.xz" &
|
|
|
+ pids+=("$!")
|
|
|
done
|
|
|
|
|
|
|
- cd ..
|
|
|
+ # Wait until we're done extracting everything. A bare "wait" always
|
|
|
+ # returns 0, so wait on each PID to notice a failed extraction.
|
|
|
+ failed=0
|
|
|
+ for pid in "${pids[@]}"; do
|
|
|
+ wait "$pid" || failed=1
|
|
|
+ done
|
|
|
+ [ "$failed" -eq 0 ] || exit 1
|
|
|
+else
|
|
|
+ for i in "${months[@]}"; do
|
|
|
+ ./scripts/extract-extra-infos-archive.sh "bridge-extra-infos-${i}.tar.xz" || exit 1
|
|
|
+ done
|
|
|
fi
|