Explorar o código

Extract and process; don't save all the extra-info files to disk

Vecna hai 1 semana
pai
achega
7be7f3f721
Modificáronse 2 ficheiros con 20 adicións e 31 borrados
  1. 3 31
      scripts/extract-extra-infos-archive.sh
  2. 17 0
      scripts/parse-data-from-extra-info.sh

+ 3 - 31
scripts/extract-extra-infos-archive.sh

@@ -6,43 +6,15 @@ if [ ! -f "data/bridge_data/${filename}_processed.tar.xz" ]; then
     # Clean up any files from past runs
     rm -rf "data/bridge_data/${filename}"
 
-    echo "Extracting ${filename}.tar.xz"
     mkdir -p data/bridge_data/"$filename"/bridge_data && \
-        cp data/"${filename}.tar.xz" data/bridge_data/"$filename"/ && \
+        ln -s ../..//"${filename}.tar.xz" data/bridge_data/"$filename"/ && \
         cd data/bridge_data/"$filename"/
 
-    tar xf "${filename}.tar.xz" || exit 1
-
     echo "Processing ${filename}"
-    for i in "${filename}"/*; do
-        for j in "${i}"/*; do
-            for k in "${j}"/*; do
-                if [[ -s "${j}" ]]; then
-                    fingerprint=$(grep -Po '(?<=^extra-info )(.*)(?=$)' "${k}" | grep -Po '(?<= )(.*)(?=$)')
-                    date=$(grep -Po '(?<=^published )(.*)(?= )' "${k}")
-                    # Convert to Julian date, thanks to
-                    # https://stackoverflow.com/a/43318209
-                    date_julian=$(( $(date +%s -d "${date}") / 86400 + 2440587 ))
-                    count=$(grep -Po '(?<=^bridge-ips )(.*)(?=$)' "${k}" | grep -Po '(?<=by=)(.*?)(?=(,|$))')
-                    if [ -z "$count" ]; then
-                        count=0
-                    fi
-
-                    if [[ -n "${date_julian}" && -n "${fingerprint}" ]]; then
-                        echo "${date_julian},${count}" >> bridge_data/"${fingerprint}"
-                    else
-                        echo "Error for ${l}"
-                        echo "    fingerprint: ${fingerprint}"
-                        echo "    date:        ${date_julian}"
-                        echo "    count:       ${count}"
-                    fi
-                fi
-            done
-        done
-    done
+
+    tar xf "${filename}.tar.xz" --to-command=../../../scripts/parse-data-from-extra-info.sh
 
     echo "Finished processing ${filename}; saving progress"
-    rm "${filename}.tar.xz"
 
     # Return to data/bridge_data/
     cd ..

+ 17 - 0
scripts/parse-data-from-extra-info.sh

@@ -0,0 +1,17 @@
+#!/bin/bash
+
+data=$(</dev/stdin)
+
+fingerprint=$(echo "$data" | grep -Po '(?<=^extra-info )(.*)(?=$)' | grep -Po '(?<= )(.*)(?=$)')
+date=$(echo "$data" | grep -Po '(?<=^published )(.*)(?= )')
+# Convert to Julian date, thanks to
+# https://stackoverflow.com/a/43318209
+date_julian=$(( $(date +%s -d "${date}") / 86400 + 2440587 ))
+count=$(echo "$data" | grep -Po '(?<=^bridge-ips )(.*)(?=$)' | grep -Po '(?<=by=)(.*?)(?=(,|$))')
+if [ -z "$count" ]; then
+    count=0
+fi
+
+if [[ -n "${date_julian}" && -n "${fingerprint}" ]]; then
+    echo "${date_julian},${count}" >> bridge_data/"${fingerprint}"
+fi