2 Commits 4b84f3685a ... 3fc10e1441

Autore SHA1 Messaggio Data
  Justin Tracey 3fc10e1441 add better docs 6 mesi fa
  Justin Tracey 338aead827 hmm: fix bug in parallel_run.sh preventing waiting 6 mesi fa
2 ha cambiato i file con 15 aggiunte e 2 eliminazioni
  1. 13 0
      README.md
  2. 2 2
      hmm/parallel_run.sh

+ 13 - 0
README.md

@@ -3,6 +3,19 @@ This repo contains tools to extract empirical distributions from the ["Share and
 More thorough documentation is coming soon, but the gist is:
  - Download the `json_files.zip` file they provide, and extract it somewhere.
  - Run the `extract` tool to pare and serialize the SaM data.
+   (Using `chat*.json` in any of the following commands means using all available chats; you can use a subset for faster processing, so long as you're consistent.)
+   ``cargo run --bin extract stats/ json_files/chat*.json``
  - Use the tools in `hmm` to label messages as "active" or "idle".
+   - Install the dependencies via `pip install -r requirements.txt`.
+   - Run the shell script, which invokes the Python script in parallel:
+     ``./parallel_run.sh ../stats/ stats2/``
  - Run the `process` tool to generate all empirical distributions other than message sizes.
+   ``cargo run --bin process dists/ hmm/stats2/ json_files/chat*.json``
  - Run the `message-lens` tool to generate distributions for message sizes.
+   This takes an optional argument for file sizes (if provided, it must be the first argument; sorry for the jank).
+   If you have a source for file sizes, you can provide it here.
+   If you don't want to simulate sending files, you can omit it.
+   If you don't have a source, you can use the one we provide based on public WhatsApp groups in 2023.
+   ``cargo run --bin message-lens -- -s data/file_sizes.dat dists/ json_files/chat*.json``
+
+At this point, `dists/` will contain distributions ready for use in MGen, organized by the user being simulated.

+ 2 - 2
hmm/parallel_run.sh

@@ -11,10 +11,10 @@ stats_dir_out="$2"
 n_files=$(ls "$stats_dir_in" | wc -l)
 N=$(( $n_files / $(nproc) ))
 
-ls "$stats_dir_in" | while mapfile -n $N files_per_proc && [ ${#files_per_proc[@]} -gt 0 ]; do
+while mapfile -n $N files_per_proc && [ ${#files_per_proc[@]} -gt 0 ]; do
         files="$(printf "$stats_dir_in/%s" "${files_per_proc[@]}")"
         python3 get_w.py "$stats_dir_out" $files &
-done
+done < <(ls "$stats_dir_in")
 wait
 
 echo "all done"