Просмотр исходного кода

Checkpoint extractions and support extracting in parallel

Vecna 1 неделя назад
Родитель
Коммит
6527208c2a
Изменено 4 файла: 121 добавление и 59 удалений
  1. 36 7
      run.sh
  2. 13 3
      scripts/clean-bridge-data.sh
  3. 57 0
      scripts/extract-extra-infos-archive.sh
  4. 15 49
      scripts/get-bridge-data.sh

+ 36 - 7
run.sh

@@ -15,15 +15,44 @@ if [ "$?" != 0 ]; then
     exit 1
 fi
 
+# Command-line switches. --fast repacks the bundled pre-processed data
+# instead of downloading; -p asks step 1 to extract in parallel.
+# Anything else is ignored, and scanning stops at the first empty
+# argument.
+fast=false
+parallel=false
+
+while [ -n "$1" ]; do
+    case "$1" in
+        --fast) fast=true ;;
+        -p) parallel=true ;;
+    esac
+    shift
+done
+
+
 # Get bridge data
-if [ "$1" == "--fast" ]; then
-    echo "Extracting some pre-processed data..."
-    cd data && tar xf bridge_data.tar.gz && cd ..
+if [ "$fast" == "true" ]; then
+    echo "Repacking some pre-processed data for the next step..."
+    # Turn the bundled data/bridge_data.tar.gz into a checkpoint archive,
+    # data/bridge_data/pre_processed.tar.xz, whose contents unpack to
+    # processed/bridge_data/. Despite its .tar.xz name the archive is
+    # gzip-compressed (tar czf).
+    #
+    # The work runs in a subshell so that a failure part-way through
+    # cannot leave the rest of this script running from inside data/.
+    # A stale processed/ directory from an interrupted run is removed
+    # up front because it would make the mkdir fail.
+    (
+        cd data &&
+            rm -rf bridge_data processed &&
+            tar xzf bridge_data.tar.gz &&
+            mkdir processed &&
+            mv bridge_data processed &&
+            tar czf pre_processed.tar.xz processed/* &&
+            rm -r processed &&
+            mkdir -p bridge_data &&
+            mv pre_processed.tar.xz bridge_data
+    ) || {
+        echo "Failed to repack the pre-processed data" >&2
+        exit 1
+    }
 else
-    echo "Downloading and processing data from step 1..."
-    echo "This will take quite a long time (around 12.5 hours on my device)"
-    echo "and require a few GB of free space while running."
-    ./scripts/get-bridge-data.sh
+    # Step 1 either extracts the monthly archives in parallel (-p) or one
+    # after another. Stop here if it reports failure rather than carrying
+    # on to the later steps with missing or incomplete data.
+    if [ "$parallel" == "true" ]; then
+        echo "Downloading and processing data from step 1 in 10 parallel steps..."
+        echo "This will take a while (around an hour on my device)"
+        echo "and require around 20 GB of free space while running."
+        ./scripts/get-bridge-data.sh -p || exit 1
+    else
+        echo "Downloading and processing data from step 1 sequentially..."
+        echo "This will take quite a long time (around 12.5 hours on my device)"
+        echo "and require a few GB of free space while running."
+        ./scripts/get-bridge-data.sh || exit 1
+    fi
 fi
 
 # Get list of email-distributed bridges

+ 13 - 3
scripts/clean-bridge-data.sh

@@ -9,16 +9,26 @@ fi
 
 # Clean bridge data (sort, remove duplicates)
 if [ ! -d data/bridge_data_cleaned ]; then
+    echo "Extracting checkpoints"
+    # Unpack every checkpoint archive (*_processed.tar.xz) in place inside
+    # data/bridge_data. Give up if that directory cannot be entered:
+    # otherwise the loop would run in the wrong directory and the
+    # cd ../.. below would leave the script two levels above where it
+    # started.
+    cd data/bridge_data || exit 1
+    for i in *_processed.tar.xz; do
+        echo "$i"
+        tar xf "$i" || exit 1
+    done
+    cd ../..
+
     echo "Cleaning data for bridges distributed in 2021 February"
     mkdir data/bridge_data_cleaned
     while read fpr; do
         fpr=$(echo -n "$fpr" | tr '[:lower:]' '[:upper:]')
         if [ -n "$fpr" ]; then
-            if [ -f data/bridge_data/${fpr} ]; then
+            # If there's any data on this bridge... Test for exactly the
+            # per-checkpoint files that are read just below, instead of
+            # walking the whole tree with find for every fingerprint and
+            # accepting a match anywhere in any path. compgen -G succeeds
+            # only when the glob matches at least one existing path.
+            if compgen -G "data/bridge_data/*/bridge_data/${fpr}" > /dev/null; then
                 # Get only highest number of observed connections for each day
                 pref="placeholder"
-                sort -r -n -k1.1,1.7 -k1.9 data/bridge_data/${fpr} \
-                    | while read line; do
+                cat data/bridge_data/*/bridge_data/${fpr} | \
+                    sort -r -n -k1.1,1.7 -k1.9 | \
+                    while read line; do
                     if [[ "$line" != "$pref"* ]]; then
                         echo "$line"
                         pref="${line:0:7}"

+ 57 - 0
scripts/extract-extra-infos-archive.sh

@@ -0,0 +1,57 @@
+#!/bin/bash
+
+filename="${1%.tar.xz}"
+
+if [ ! -f "data/bridge_data/${filename}_processed.tar.xz" ]; then
+    # Clean up any files from past runs
+    rm -rf "data/bridge_data/${filename}"
+
+    echo "Extracting ${filename}.tar.xz"
+    mkdir -p data/bridge_data/"$filename"/bridge_data && \
+        cp data/"${filename}.tar.xz" data/bridge_data/"$filename"/ && \
+        cd data/bridge_data/"$filename"/
+
+    tar xf "${filename}.tar.xz" || exit 1
+
+    echo "Processing ${filename}"
+    for i in "${filename}"/*; do
+        for j in "${i}"/*; do
+            for k in "${j}"/*; do
+                if [[ -s "${j}" ]]; then
+                    fingerprint=$(grep -Po '(?<=^extra-info )(.*)(?=$)' "${k}" | grep -Po '(?<= )(.*)(?=$)')
+                    date=$(grep -Po '(?<=^published )(.*)(?= )' "${k}")
+                    # Convert to Julian date, thanks to
+                    # https://stackoverflow.com/a/43318209
+                    date_julian=$(( $(date +%s -d "${date}") / 86400 + 2440587 ))
+                    count=$(grep -Po '(?<=^bridge-ips )(.*)(?=$)' "${k}" | grep -Po '(?<=by=)(.*?)(?=(,|$))')
+                    if [ -z "$count" ]; then
+                        count=0
+                    fi
+
+                    if [[ -n "${date_julian}" && -n "${fingerprint}" ]]; then
+                        echo "${date_julian},${count}" >> bridge_data/"${fingerprint}"
+                    else
+                        echo "Error for ${l}"
+                        echo "    fingerprint: ${fingerprint}"
+                        echo "    date:        ${date_julian}"
+                        echo "    count:       ${count}"
+                    fi
+                fi
+            done
+        done
+    done
+
+    echo "Finished processing ${filename}; saving progress"
+    rm "${filename}.tar.xz"
+
+    # Return to data/bridge_data/
+    cd ..
+    tar czf "${filename}_processed.tar.xz" "${filename}"/bridge_data/* || exit 1
+    echo "Removing ${filename} directory to free up space"
+    rm -r "${filename}"
+
+    # Return to original directory
+    cd ../..
+else
+    echo "Already processed ${filename}.tar.xz"
+fi

+ 15 - 49
scripts/get-bridge-data.sh

@@ -1,5 +1,10 @@
 #!/bin/bash
 
+parallel=false
+if [ "$1" == "-p" ]; then
+    parallel=true
+fi
+
 # Populate array of months we care about
 months=()
 # 2020
@@ -21,56 +26,17 @@ done
 # Check that we have the right archives
 sha256sum -c data/bridge-extra-infos.sha256 || exit 1
 
-# If we haven't already extracted the bridge data, then do so. This will
-# take a long time (around 12.5 hours on my device) because it needs to
-# process around 3 million small files, and it will require a few GB of
-# free space while running. In the end, this results in about 91 MB of
-# bridge data that we care about.
-if [ ! -d data/bridge_data ]; then
-
-    cd data
-
-    mkdir bridge_data
-
-    # This is around 20 GB of data uncompressed, so don't extract it all
-    # at once. Instead, extract and process one month at a time.
+# Extract the data for each month
+if [ "$parallel" == "true" ]; then
+    # Do it in parallel
     for i in ${months[@]}; do
-        if [ ! -d bridge-extra-infos-${i} ]; then
-            echo "Extracting bridge-extras-infos-${i}.tar.xz"
-            tar xf bridge-extra-infos-${i}.tar.xz || exit 1
-        fi
-
-        echo "Processing bridge-extra-infos-${i}"
-        for j in bridge-extra-infos-${i}/*; do
-            for k in ${j}/*; do
-                for l in ${k}/*; do
-                    if [[ -s "${k}" ]]; then
-                        fingerprint=$(grep -Po '(?<=^extra-info )(.*)(?=$)' "${l}" | grep -Po '(?<= )(.*)(?=$)')
-                        date=$(grep -Po '(?<=^published )(.*)(?= )' "${l}")
-                        # Convert to Julian date, thanks to
-                        # https://stackoverflow.com/a/43318209
-                        date_julian=$(( $(date +%s -d "${date}") / 86400 + 2440587 ))
-                        count=$(grep -Po '(?<=^bridge-ips )(.*)(?=$)' "${l}" | grep -Po '(?<=by=)(.*?)(?=(,|$))')
-                        if [ -z "$count" ]; then
-                            count=0
-                        fi
-
-                        if [[ -n "${date_julian}" && -n "${fingerprint}" ]]; then
-                            echo "${date_julian},${count}" >> bridge_data/${fingerprint}
-                        else
-                            echo "Error for ${l}"
-                            echo "    fingerprint: ${fingerprint}"
-                            echo "    date:        ${date_julian}"
-                            echo "    count:       ${count}"
-                        fi
-                    fi
-                done
-            done
-        done
-
-        echo "Removing bridge-extra-infos-${i} directory to free up space"
-        rm -r bridge-extra-infos-${i}
+        ./scripts/extract-extra-infos-archive.sh bridge-extra-infos-${i}.tar.xz &
     done
 
-    cd ..
+    # Wait until we're done extracting everything
+    wait
+else
+    for i in "${months[@]}"; do
+        ./scripts/extract-extra-infos-archive.sh "bridge-extra-infos-${i}.tar.xz"
+    done
+fi
+
+# A bare wait returns success whatever the background jobs returned, and
+# the sequential loop does not stop on a failed month either. So confirm
+# from the results that every month actually produced its checkpoint,
+# and report failure to the caller if one is missing.
+for i in "${months[@]}"; do
+    if [ ! -f "data/bridge_data/bridge-extra-infos-${i}_processed.tar.xz" ]; then
+        echo "No checkpoint for bridge-extra-infos-${i}.tar.xz; extraction failed" >&2
+        exit 1
+    fi
+done