Browse Source

Port code for oblivious compaction, shuffle, sort

from "Fast Fully Oblivious Compaction and Shuffling" and
"Waks-On/Waks-Off: Fast Oblivious Offline/Online Shuffling and Sorting
with Waksman Networks" by Sajin Sasy, Aaron Johnson, and Ian Goldberg:

https://crysp.uwaterloo.ca/software/obliv/

with some additional enhancements, such as a multithreaded
implementation of WaksShuffle + Sort and WaksSort.
Ian Goldberg 1 year ago
parent
commit
0a23f8293c

+ 3 - 0
Enclave/Enclave.edl

@@ -4,6 +4,9 @@ enclave {
     include "sgx_tseal.h"
     include "../Enclave/enclave_api.h"
 
+    from "sgx_pthread.edl" import *;
+    from "sgx_tstdc.edl" import sgx_thread_wait_untrusted_event_ocall, sgx_thread_set_untrusted_event_ocall, sgx_thread_setwait_untrusted_events_ocall, sgx_thread_set_multiple_untrusted_events_ocall;
+
     trusted {
         public void ecall_identity_key_new(
             [out] sgx_ec256_public_t *outpub,

+ 85 - 0
Enclave/OblivAlgs/CONFIG.h

@@ -0,0 +1,85 @@
+
+// ************************************************************************** //
+// Global parameters:
+
+  // Use Pseudorandom bytes instead of Random bytes
+  // (Instead of sgx_read_rand we pull random bytes from a sgx_read_rand seeded AES_CTR
+  //  of buffer size PRB_BUFFER_SIZE)
+  #define USE_PRB
+  #define PRB_BUFFER_SIZE 100000
+
+  // Debugging flag to use inputs [1,N]:
+  #define RANDOMIZE_INPUTS 1
+  //#define SHOW_INPUT_KEYS 1
+
+// ************************************************************************** //
+// Bucket Oblivious Random Permutation (BORP) / Bucket Oblivious Sort (BOS) parameters:
+
+  // Output verbose timings
+  //#define VERBOSE_TIMINGS_BORP
+
+  // Store the real packets buffer after completing the ORP phase
+  // in memory outside the PRM
+  // TODO 1: This is not really useful. Remove BOS_OUTSIDE_PRM_STORAGE completely from the rest of the source
+  //       and then take it out of here
+  // #define BOS_OUTSIDE_PRM_STORAGE 1
+
+  // Multi-Threading Flags
+  // TODO 2: BORP/BOS used to have a multi-threaded implementation. Since we no longer maintain it we should
+  //       remove it all out of the rest of the source.
+  #define MULTITHREADED
+
+  // When running Single threaded, set NUM_THREADS to 1
+  // TODO 3: When handling TODO 2, this NUM_THREADS should go away as well.
+
+  #define COUNT_OSWAPS
+
+  // To print BRN configuration
+  // #define PRINT_BRN_CONFIGURATION 
+  
+  // To time all the individual components of BORP/BOS,
+  // namely ProcessPacketsThroughBRN, FlushBuffers, and RemoveFakes_TC:
+  // #define DETAILED_BOS_TIMING 1
+
+  // Useful for debugging real packets in removeFakes_TC of BORP
+  // #define DEBUG_RFTC
+
+  // To print a log line whenever BORP evicts incorrectly
+  #define DEBUG_BORP_FAILURE
+
+// ************************************************************************** //
+
+// Sorting network parameters:
+
+//data/packet size in bytes
+//#define SN_DATA_SIZE 8
+#define SN_KEY_SIZE 8
+//#define SN_PACKET_SIZE (SN_DATA_SIZE + SN_KEY_SIZE)
+
+
+// ************************************************************************** //
+
+// Waksman network options:
+
+  // #define TEST_WN_DJB 1 
+  // #define TEST_WN_OA 1 
+
+// ************************************************************************** //
+
+// Recursive shuffle parameters:
+
+  //data/packet size in bytes
+  #define RS_PACKET_SIZE 16
+  #define RS_INTERNAL 1
+
+  // RS_M2
+  #define RS_M2_MEM_OPT1 1
+  // #define RS_RB_BUFFER_SIZE 1000000 
+
+// ************************************************************************** //
+// Tight compaction parameters
+
+  #define TC_PRECOMPUTE_COUNTS 1
+  #define TC_OPT_SWAP_FLAG 1
+
+// ************************************************************************** //

+ 9 - 0
Enclave/OblivAlgs/README

@@ -0,0 +1,9 @@
+This directory contains code ported from "Fast Fully Oblivious
+Compaction and Shuffling" and "Waks-On/Waks-Off: Fast Oblivious
+Offline/Online Shuffling and Sorting with Waksman Networks" by
+Sajin Sasy, Aaron Johnson, and Ian Goldberg:
+
+https://crysp.uwaterloo.ca/software/obliv/
+
+with some additional enhancements, such as a multithreaded
+implementation of WaksShuffle + Sort and WaksSort.

+ 267 - 0
Enclave/OblivAlgs/RecursiveShuffle.cpp

@@ -0,0 +1,267 @@
+
+#ifndef BEFTS_MODE
+  #include <array>
+  #include <sgx_tcrypto.h>
+  #include "oasm_lib.h"
+  #include "utils.hpp"
+  #include "RecursiveShuffle.hpp"
+#endif
+
+size_t RS_RB_BUFFER_SIZE;
+unsigned char *random_bytes_buffer = NULL;
+uint32_t *random_bytes_buffer_ptr;
+uint32_t *random_bytes_buffer_ptr_end;
+
+/*
+  MarkHalf: randomly marks N/2 (rounded down) of the N entries of selected_list.
+  Pass in a bool array of size N; on return selected_list[i] is 1 at the indexes
+  which were marked by MarkHalf and 0 elsewhere.
+  NOTE: the original comment asks for selected_list to be initialized to all 0's
+  before it is passed to MarkHalf; the loop below assigns every entry regardless.
+  NOTE(review): the "exactly N/2 marked" reading assumes oge_set_flag(a, b)
+  returns 1 when a >= b and 0 otherwise, and that N fits in 32 bits (the scaled
+  coin and the threshold are both stored in uint32_t) -- confirm.
+*/
+
+void MarkHalf(uint64_t N, bool *selected_list) {
+  
+  uint64_t left_to_mark = N/2;
+  uint64_t total_left = N;
+  // Random-byte pool entry selected by this thread's g_thread_id
+  PRB_buffer *randpool = PRB_pool + g_thread_id;
+  // Local batch of 32-bit coins; refilled when empty and consumed from the back
+  uint32_t coins[RS_MARKHALF_MAX_COINS];
+  size_t coinsleft=0;
+  
+  FOAV_SAFE_CNTXT(MarkHalf_marking_half, N)
+  for(uint64_t i=0; i<N; i++){
+  FOAV_SAFE2_CNTXT(MarkHalf_marking_half, i, coinsleft)
+    if (coinsleft == 0) {
+        // Refill: one coin per element still to visit, capped at the batch size
+        size_t numcoins = (N-i);
+        FOAV_SAFE_CNTXT(MarkHalf_marking_half, numcoins)
+        if (numcoins > RS_MARKHALF_MAX_COINS) {
+            numcoins = RS_MARKHALF_MAX_COINS;
+        }
+        randpool->getRandomBytes((unsigned char *) coins,
+            sizeof(coins[0])*numcoins);
+        coinsleft = numcoins;
+    }
+    //Mark with probability left_to_mark/total_left;
+    // The 32-bit coin is scaled by total_left with a multiply and a 32-bit shift
+    uint32_t random_coin;
+    random_coin = (total_left * coins[--coinsleft]) >> 32;
+    // The scaled coin is compared against this threshold to decide the mark
+    uint32_t mark_threshold = total_left - left_to_mark;
+    uint8_t mark_element = oge_set_flag(random_coin, mark_threshold);
+
+    //If mark_element, obliviously set selected_list[i] to 1
+    FOAV_SAFE_CNTXT(MarkHalf_marking_half, i)
+    selected_list[i] = mark_element;
+    // Branch-free bookkeeping: subtract the flag itself instead of testing it
+    left_to_mark-= mark_element;
+    total_left--;
+    FOAV_SAFE2_CNTXT(MarkHalf_marking_half, i, N)
+  }
+  
+}
+
+#if 0
+#ifndef BEFTS_MODE
+  // RecursiveShuffle_M1 (currently compiled out by the enclosing "#if 0").
+  // Fills random_bytes_buffer with bulk random bytes (capped at
+  // RS_RB_BUFFER_LIMIT when RS_M2_MEM_OPT1 is defined), dispatches to
+  // RecursiveShuffle_M1_inner with the OSwap style chosen from block_size,
+  // and finally deletes random_bytes_buffer and selected_list.
+  // NOTE(review): selected_list is assigned and used here but is not declared
+  // in this function or among the visible file-scope variables -- confirm
+  // before re-enabling.
+  // NOTE(review): on std::bad_alloc the handler only logs; execution then
+  // continues with whatever the pointers held before the failed allocation.
+  void RecursiveShuffle_M1(unsigned char *buf, uint64_t N, size_t block_size) {
+    FOAV_SAFE2_CNTXT(RS_M1, N, block_size)
+    size_t num_random_bytes = calculatelog2(N) * N * sizeof(uint32_t);
+    #ifdef RS_M2_MEM_OPT1
+      FOAV_SAFE2_CNTXT(RS_M1, num_random_bytes, RS_RB_BUFFER_LIMIT)
+      if(num_random_bytes > RS_RB_BUFFER_LIMIT) {
+        RS_RB_BUFFER_SIZE = RS_RB_BUFFER_LIMIT;
+      }
+      else{
+        RS_RB_BUFFER_SIZE = num_random_bytes;
+      }
+      try {
+        random_bytes_buffer = new unsigned char[RS_RB_BUFFER_SIZE];
+        //FOAV_SAFE_CNTXT(RS_M1_initializing_selected_list, N)
+        selected_list = new bool[N]{};
+      } catch (std::bad_alloc&){
+        printf("Allocating memory failed in RS_M2\n");
+      }
+      getBulkRandomBytes((unsigned char*)random_bytes_buffer, RS_RB_BUFFER_SIZE);
+      random_bytes_buffer_ptr_end = (uint32_t*)(random_bytes_buffer + RS_RB_BUFFER_SIZE);
+    #else
+      try {
+        random_bytes_buffer = new unsigned char[num_random_bytes];
+        selected_list = new bool[N]{};
+      } catch (std::bad_alloc&){
+        printf("Allocating memory failed in RS_M2\n");
+      }
+
+      getBulkRandomBytes((unsigned char*)random_bytes_buffer, num_random_bytes);
+    #endif
+
+    // Read cursor into the random buffer, shared through the file-scope pointer
+    random_bytes_buffer_ptr = (uint32_t*) random_bytes_buffer;
+    FOAV_SAFE_CNTXT(RS_M1_branching_on_block_size_for_OSwap_Style_templates, block_size)
+    if(block_size==4){
+      FOAV_SAFE_CNTXT(RS_M1_branching_on_block_size_for_OSwap_Style_templates, block_size)
+      RecursiveShuffle_M1_inner<OSWAP_4>(buf, N, block_size, selected_list);
+      FOAV_SAFE_CNTXT(RS_M1_branching_on_block_size_for_OSwap_Style_templates, block_size)
+    } else if(block_size==8){
+      FOAV_SAFE_CNTXT(RS_M1_branching_on_block_size_for_OSwap_Style_templates, block_size)
+      RecursiveShuffle_M1_inner<OSWAP_8>(buf, N, block_size, selected_list);
+      FOAV_SAFE_CNTXT(RS_M1_branching_on_block_size_for_OSwap_Style_templates, block_size)
+    } else if(block_size%16==0) {
+      FOAV_SAFE_CNTXT(RS_M1_branching_on_block_size_for_OSwap_Style_templates, block_size)
+      RecursiveShuffle_M1_inner<OSWAP_16X>(buf, N, block_size, selected_list);
+      FOAV_SAFE_CNTXT(RS_M1_branching_on_block_size_for_OSwap_Style_templates, block_size)
+    } else {
+      FOAV_SAFE_CNTXT(RS_M1_branching_on_block_size_for_OSwap_Style_templates, block_size)
+      RecursiveShuffle_M1_inner<OSWAP_8_16X>(buf, N, block_size, selected_list);
+      FOAV_SAFE_CNTXT(RS_M1_branching_on_block_size_for_OSwap_Style_templates, block_size)
+    }
+
+    FOAV_SAFE_CNTXT(RecursiveShuffle_M1_delete, random_bytes_buffer)
+    delete []random_bytes_buffer;
+    FOAV_SAFE_CNTXT(RecursiveShuffle_M1_delete, selected_list)
+    delete []selected_list;
+  }
+#endif
+#endif
+
+// Single-threaded entry point: forwards to RecursiveShuffle_M2_parallel
+// asking for exactly one thread.
+void RecursiveShuffle_M2(unsigned char *buf, uint64_t N, size_t block_size){
+    const size_t single_thread = 1;
+    RecursiveShuffle_M2_parallel(buf, N, block_size, single_thread);
+}
+
+/*
+  RecursiveShuffle_M2_parallel: shuffles, in place, the N blocks of block_size
+  bytes stored in buf, using up to nthreads threads.
+
+  It allocates one zero-initialized bool flag per block as scratch space,
+  starts the thread pool, dispatches to RecursiveShuffle_M2_inner_parallel
+  with the OSwap style selected from block_size (4, 8, a multiple of 16, or
+  anything else), then shuts the pool down and frees the scratch array.
+
+  If the scratch array cannot be allocated, the failure is logged and the
+  std::bad_alloc is rethrown.  Previously the handler only logged, so the
+  code went on to use (and later delete[]) an uninitialized pointer.
+*/
+void RecursiveShuffle_M2_parallel(unsigned char *buf, uint64_t N, size_t block_size, size_t nthreads){
+  FOAV_SAFE2_CNTXT(RS_M2, N, block_size)
+  bool *selected_list = NULL;
+  try {
+    selected_list = new bool[N]{};
+  } catch (std::bad_alloc&){
+    printf("Allocating memory failed in RS_M2\n");
+    // Fail loudly: returning here would leave buf unshuffled without the
+    // caller being able to tell, and continuing would dereference garbage.
+    throw;
+  }
+
+  threadpool_init(nthreads);
+
+  FOAV_SAFE_CNTXT(RS_M2_branching_on_block_size_for_OSwap_Style_templates, block_size)
+  if(block_size==4){
+    RecursiveShuffle_M2_inner_parallel<OSWAP_4>(buf, N, block_size, selected_list, nthreads);
+  } else if(block_size==8){
+    RecursiveShuffle_M2_inner_parallel<OSWAP_8>(buf, N, block_size, selected_list, nthreads);
+  } else if(block_size%16==0) {
+    RecursiveShuffle_M2_inner_parallel<OSWAP_16X>(buf, N, block_size, selected_list, nthreads);
+  } else {
+    RecursiveShuffle_M2_inner_parallel<OSWAP_8_16X>(buf, N, block_size, selected_list, nthreads);
+  }
+
+  threadpool_shutdown();
+
+  FOAV_SAFE_CNTXT(RecursiveShuffle_M2_delete, selected_list)
+  delete []selected_list;
+}
+
+#if 0
+// We maintain a double type return version of RecursiveShuffle_M2,
+// to time strictly the RS_M2 component when using it without any encryption or decryption
+// We need this only for the BOS optimizer!!
+// (Currently compiled out by the enclosing "#if 0".)
+// Returns the difference of the two ocall_clock readings divided by 1000.0.
+// NOTE(review): selected_list is assigned and used here but is not declared in
+// this function or among the visible file-scope variables -- confirm before
+// re-enabling.  On std::bad_alloc the handler only logs and execution continues.
+double RecursiveShuffle_M2_opt(unsigned char *buf, uint64_t N, size_t block_size){
+  FOAV_SAFE2_CNTXT(RS_M2_opt, N, block_size)
+  //In a single call allocate all the randomness we need here!
+  size_t num_random_bytes = calculatelog2(N) * N * sizeof(uint32_t);
+  long t0, t1;
+  ocall_clock(&t0);
+
+  #ifdef RS_M2_MEM_OPT1
+    // Cap the random buffer at RS_RB_BUFFER_LIMIT bytes
+    if(num_random_bytes > RS_RB_BUFFER_LIMIT) {
+      RS_RB_BUFFER_SIZE = RS_RB_BUFFER_LIMIT;
+    }
+    else{
+      RS_RB_BUFFER_SIZE = num_random_bytes;
+    }
+    try {
+      random_bytes_buffer = new unsigned char[RS_RB_BUFFER_SIZE];
+      selected_list = new bool[N]{};
+    } catch (std::bad_alloc&){
+      printf("Allocating memory failed in RS_M2\n");
+    }
+    getBulkRandomBytes((unsigned char*)random_bytes_buffer, RS_RB_BUFFER_SIZE);
+    random_bytes_buffer_ptr_end = (uint32_t*)(random_bytes_buffer + RS_RB_BUFFER_SIZE);
+  #else
+    try {
+      random_bytes_buffer = new unsigned char[num_random_bytes];
+      selected_list = new bool[N]{};
+    } catch (std::bad_alloc&){
+      printf("Allocating memory failed in RS_M2\n");
+    }
+
+    getBulkRandomBytes((unsigned char*)random_bytes_buffer, num_random_bytes);
+  #endif
+
+  random_bytes_buffer_ptr = (uint32_t*) random_bytes_buffer;
+  FOAV_SAFE_CNTXT(RS_M2_opt, num_random_bytes)
+  FOAV_SAFE2_CNTXT(RS_M2_opt, N, block_size)
+
+  FOAV_SAFE_CNTXT(RS_M2_opt, block_size)
+  // Select the OSwap style from the block size
+  if(block_size==4){
+    RecursiveShuffle_M2_inner<OSWAP_4>(buf, N, block_size, selected_list);
+  } else if(block_size==8){
+    RecursiveShuffle_M2_inner<OSWAP_8>(buf, N, block_size, selected_list);
+  } else if(block_size%16==0) {
+    RecursiveShuffle_M2_inner<OSWAP_16X>(buf, N, block_size, selected_list);
+  } else {
+    RecursiveShuffle_M2_inner<OSWAP_8_16X>(buf, N, block_size, selected_list);
+  }
+
+  delete []random_bytes_buffer;
+  delete []selected_list;
+
+  ocall_clock(&t1);
+  double ptime = ((double)(t1-t0))/1000.0;
+  return ptime;
+}
+
+#ifndef BEFTS_MODE
+// DecryptAndShuffleM1 (currently compiled out by the enclosing "#if 0").
+// Decrypts the N blocks of encrypted_buffer, runs RecursiveShuffle_M1 on the
+// decrypted copy, and encrypts the result into result_buffer.
+// Returns the difference of the two ocall_clock readings taken around
+// PRB_pool_init + RecursiveShuffle_M1, divided by 1000.0; the same value is
+// stored in ret->ptime and OSWAP_COUNTER is stored in ret->OSWAP_count.
+double DecryptAndShuffleM1(unsigned char *encrypted_buffer, size_t N, size_t encrypted_block_size, unsigned char *result_buffer, enc_ret *ret) {
+ 
+  // Decrypt buffer to decrypted_buffer
+  unsigned char *decrypted_buffer = NULL;
+  size_t decrypted_block_size = decryptBuffer(encrypted_buffer, N, encrypted_block_size, &decrypted_buffer);
+
+  long t0, t1;
+  ocall_clock(&t0);
+
+  // ShuffleM1 on decrypted_buffer
+  PRB_pool_init(1);
+  RecursiveShuffle_M1(decrypted_buffer, N, decrypted_block_size);
+
+  ocall_clock(&t1);
+  // Encrypt buffer to result_buffer
+  encryptBuffer(decrypted_buffer, N, decrypted_block_size, result_buffer);
+  PRB_pool_shutdown();
+
+  free(decrypted_buffer); 
+  double ptime = ((double)(t1-t0))/1000.0;
+  ret->OSWAP_count = OSWAP_COUNTER;
+  ret->ptime = ptime;
+  return(ptime);
+}
+#endif
+
+// DecryptAndShuffleM2 (currently compiled out by the enclosing "#if 0").
+// Decrypts the N blocks of encrypted_buffer, runs RecursiveShuffle_M2_parallel
+// with nthreads threads on the decrypted copy, and encrypts the result into
+// result_buffer.
+// Returns the difference of the two ocall_clock readings taken around
+// PRB_pool_init + the shuffle, divided by 1000.0; the same value is stored in
+// ret->ptime and OSWAP_COUNTER is stored in ret->OSWAP_count.
+double DecryptAndShuffleM2(unsigned char *encrypted_buffer, size_t N, size_t encrypted_block_size, size_t nthreads, unsigned char *result_buffer, enc_ret *ret) {
+ 
+  // Decrypt buffer to decrypted_buffer
+  unsigned char *decrypted_buffer = NULL;
+  size_t decrypted_block_size = decryptBuffer(encrypted_buffer, N, encrypted_block_size, &decrypted_buffer);
+
+  long t0, t1;
+  ocall_clock(&t0);
+
+  // ShuffleM2 on decrypted_buffer
+  PRB_pool_init(nthreads);
+  RecursiveShuffle_M2_parallel(decrypted_buffer, N, decrypted_block_size, nthreads);
+
+  ocall_clock(&t1);
+  // Encrypt buffer to result_buffer
+  encryptBuffer(decrypted_buffer, N, decrypted_block_size, result_buffer);
+  PRB_pool_shutdown();
+
+  #ifdef TIME_MARKHALF
+    printf("Time taken in MarkHalf calls = %f\n", MARKHALF_TIME);
+  #endif
+
+  free(decrypted_buffer); 
+  double ptime = ((double)(t1-t0))/1000.0;
+  ret->OSWAP_count = OSWAP_COUNTER;
+  ret->ptime = ptime;
+  return(ptime);
+}
+#endif

+ 38 - 0
Enclave/OblivAlgs/RecursiveShuffle.hpp

@@ -0,0 +1,38 @@
+#ifndef __RECURSIVESHUFFLE_HPP__
+#define __RECURSIVESHUFFLE_HPP__
+
+// Declarations for the recursive shuffle routines defined in
+// RecursiveShuffle.cpp.  The templated workers are pulled in from
+// RecursiveShuffle.tcc at the bottom of this header.
+
+#include "TightCompaction_v2.hpp"
+#ifndef BEFTS_MODE
+  #include "foav.h"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Upper bound, in bytes, applied to the bulk random buffer by the (currently
+// disabled) RecursiveShuffle_M1 / RecursiveShuffle_M2_opt code paths
+#define RS_RB_BUFFER_LIMIT 819200
+// Number of 32-bit coins MarkHalf fetches from the random pool at a time
+#define RS_MARKHALF_MAX_COINS 2048
+
+
+// Randomly marks N/2 of the N entries of selected_list
+void MarkHalf(uint64_t N, bool *selected_list);
+
+#if 0
+void RecursiveShuffle_M1(unsigned char *buf, uint64_t N, size_t block_size);
+#endif
+// Shuffles N blocks of block_size bytes in buf; RecursiveShuffle_M2 calls
+// RecursiveShuffle_M2_parallel with nthreads == 1
+void RecursiveShuffle_M2(unsigned char *buf, uint64_t N, size_t block_size);
+void RecursiveShuffle_M2_parallel(unsigned char *buf, uint64_t N, size_t block_size, size_t nthreads);
+
+#if 0
+void RecursiveShuffle_M1_inner_16x(unsigned char *buf, uint64_t N, size_t block_size);
+
+double DecryptAndShuffleM1(unsigned char *encrypted_buffer, size_t N, size_t encrypted_block_size, unsigned char *result_buffer, enc_ret *ret);
+double DecryptAndShuffleM2(unsigned char *encrypted_buffer, size_t N, size_t encrypted_block_size, size_t nthreads, unsigned char *result_buffer, enc_ret *ret);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+// Template definitions (RecursiveShuffle_M2_inner and friends)
+#include "RecursiveShuffle.tcc"
+
+#endif

+ 161 - 0
Enclave/OblivAlgs/RecursiveShuffle.tcc

@@ -0,0 +1,161 @@
+#ifndef __RECURSIVESHUFFLE_TCC__
+#define __RECURSIVESHUFFLE_TCC__
+
+// RecursiveShuffle_M2_inner: recursive worker operating in place on the N
+// blocks of block_size bytes starting at buf.
+//   - N <= 1: nothing to do.
+//   - N == 2: swap the two blocks according to one random bit.
+//   - otherwise: MarkHalf fills selected_list, TightCompact rearranges buf
+//     according to those flags, and each half (N/2 and N - N/2 blocks) is
+//     processed recursively.
+// selected_list must have room for N flags; its two halves are reused as the
+// flag arrays of the recursive calls.
+template<OSwap_Style oswap_style>
+void RecursiveShuffle_M2_inner(unsigned char *buf, uint64_t N, size_t block_size, bool *selected_list) {
+  // Base cases
+  FOAV_SAFE_CNTXT(RecursiveShuffle_M2_inner, N)
+  if (N <= 1) {
+      return;
+  }
+  FOAV_SAFE_CNTXT(RecursiveShuffle_M2_inner, N)
+  if (N == 2) {
+      //Flip a coin and swap the two
+      unsigned char *packet_1 = buf;
+      unsigned char *packet_2 = buf + block_size;
+      bool swap_flag = getRandomBit();
+      oswap_buffer<oswap_style>(packet_1, packet_2, block_size, swap_flag);
+      return;
+  }
+
+  // MarkHalf the elements
+  MarkHalf(N, selected_list);
+
+  //TightCompact
+  TightCompact<oswap_style>(buf, N, block_size, selected_list);
+
+  // Recursively shuffle each half
+  size_t lsize = N/2;
+  size_t rsize = N - lsize;
+  // The flag array is split at the same point as the data
+  bool *selected_L = selected_list;
+  bool *selected_R = selected_list + lsize; 
+
+  RecursiveShuffle_M2_inner<oswap_style>(buf, lsize, block_size, selected_L);
+  RecursiveShuffle_M2_inner<oswap_style>(buf+ lsize*block_size, rsize, block_size, selected_R);
+}
+
+// Argument bundle for running RecursiveShuffle_M2_inner_parallel through a
+// void*-taking thread entry point; the fields mirror that function's
+// parameters one to one.
+struct RecursiveShuffle_M2_inner_parallel_args {
+    unsigned char *buf;      // first block of the sub-array to process
+    uint64_t N;              // number of blocks in the sub-array
+    size_t block_size;       // size of each block in bytes
+    bool *selected_list;     // flag array slice matching the sub-array
+    size_t nthreads;         // number of threads granted to this call
+};
+
+// Forward declaration: the launch helper and the worker refer to each other.
+template<OSwap_Style oswap_style>
+void RecursiveShuffle_M2_inner_parallel(unsigned char *buf, uint64_t N, size_t block_size, bool *selected_list, size_t nthreads);
+
+// Thread entry point handed to threadpool_dispatch: unpacks the argument
+// bundle behind voidargs and runs RecursiveShuffle_M2_inner_parallel on it.
+// Always returns NULL.
+template<OSwap_Style oswap_style>
+static void* RecursiveShuffle_M2_inner_parallel_launch(void *voidargs) {
+    RecursiveShuffle_M2_inner_parallel_args *job =
+        static_cast<RecursiveShuffle_M2_inner_parallel_args*>(voidargs);
+    RecursiveShuffle_M2_inner_parallel<oswap_style>(
+        job->buf, job->N, job->block_size, job->selected_list, job->nthreads);
+    return NULL;
+}
+
+/*
+  RecursiveShuffle_M2_inner_parallel: multithreaded counterpart of
+  RecursiveShuffle_M2_inner, operating in place on the N blocks of block_size
+  bytes starting at buf with nthreads threads at its disposal.
+
+  - nthreads <= 1: falls back to the sequential RecursiveShuffle_M2_inner.
+  - N <= 1 / N == 2: same base cases as the sequential version.
+  - otherwise: MarkHalf + TightCompact_parallel, then the left half is
+    dispatched to another pool thread with nthreads/2 threads while this
+    thread processes the right half with the remaining threads, and finally
+    joins the left worker.
+
+  selected_list must have room for N flags.  The "Before/After MarkHalf"
+  trace lines used to be printed unconditionally at every parallel recursion
+  level; they are now emitted only when VERBOSE_TIMINGS_RECSHUFFLE is defined,
+  like the other diagnostics in this function.
+*/
+template<OSwap_Style oswap_style>
+void RecursiveShuffle_M2_inner_parallel(unsigned char *buf, uint64_t N, size_t block_size, bool *selected_list, size_t nthreads) {
+  FOAV_SAFE2_CNTXT(RS_M2_inner_parallel, nthreads, N)
+  if (nthreads <= 1) {
+#ifdef VERBOSE_TIMINGS_RECSHUFFLE
+    unsigned long start = printf_with_rtclock("Thread %u starting RecursiveShuffle_M2_inner(N=%lu)\n", g_thread_id, N);
+#endif
+    RecursiveShuffle_M2_inner<oswap_style>(buf, N, block_size, selected_list);
+#ifdef VERBOSE_TIMINGS_RECSHUFFLE
+    printf_with_rtclock_diff(start, "Thread %u ending RecursiveShuffle_M2_inner(N=%lu)\n", g_thread_id, N);
+#endif
+    return;
+  }
+  // Base cases
+  FOAV_SAFE_CNTXT(RecursiveShuffle_M2_inner, N)
+  if (N <= 1) {
+      return;
+  }
+  FOAV_SAFE_CNTXT(RecursiveShuffle_M2_inner, N)
+  if (N == 2) {
+      //Flip a coin and swap the two
+      unsigned char *packet_1 = buf;
+      unsigned char *packet_2 = buf + block_size;
+      bool swap_flag = getRandomBit();
+      oswap_buffer<oswap_style>(packet_1, packet_2, block_size, swap_flag);
+      return;
+  }
+#ifdef VERBOSE_TIMINGS_RECSHUFFLE
+  unsigned long start = printf_with_rtclock("Thread %u starting RecursiveShuffle_M2_inner_parallel(N=%lu, nthreads=%lu)\n", g_thread_id, N, nthreads);
+#endif
+
+  // MarkHalf the elements
+#ifdef VERBOSE_TIMINGS_RECSHUFFLE
+  printf("Before MarkHalf\n");
+#endif
+  MarkHalf(N, selected_list);
+#ifdef VERBOSE_TIMINGS_RECSHUFFLE
+  printf("After MarkHalf\n");
+#endif
+
+  //TightCompact
+  TightCompact_parallel<oswap_style>(buf, N, block_size, selected_list, nthreads);
+
+  // Recursively shuffle each half
+  size_t lsize = N/2;
+  size_t rsize = N - lsize;
+  bool *selected_L = selected_list;
+  bool *selected_R = selected_list + lsize;
+#if 1
+  size_t lthreads = nthreads/2;
+  size_t rthreads = nthreads - lthreads;
+
+  /* The left half will be processed by thread g_thread_id + rthreads, which will
+   * inherit threads g_thread_id+rthreads .. g_thread_id + nthreads-1. */
+  threadid_t leftthreadid = g_thread_id + rthreads;
+  // leftargs lives on this stack frame; the join below happens before return,
+  // so the dispatched worker never outlives it.
+  RecursiveShuffle_M2_inner_parallel_args leftargs = {
+    buf, lsize, block_size, selected_L, lthreads
+  };
+  threadpool_dispatch(leftthreadid,
+    RecursiveShuffle_M2_inner_parallel_launch<oswap_style>,
+    &leftargs);
+  /* We will do the right half ourselves, on threads g_thread_id .. g_thread_id + rthreads-1 */
+  RecursiveShuffle_M2_inner_parallel<oswap_style>(buf+ lsize*block_size, rsize, block_size, selected_R, rthreads);
+  threadpool_join(leftthreadid, NULL);
+#else
+  RecursiveShuffle_M2_inner_parallel<oswap_style>(buf, lsize, block_size, selected_L, nthreads);
+  RecursiveShuffle_M2_inner_parallel<oswap_style>(buf+ lsize*block_size, rsize, block_size, selected_R, nthreads);
+#endif
+#ifdef VERBOSE_TIMINGS_RECSHUFFLE
+  printf_with_rtclock_diff(start, "Thread %u ending RecursiveShuffle_M2_inner_parallel(N=%lu, nthreads=%lu)\n", g_thread_id, N, nthreads);
+#endif
+}
+
+
+#ifndef BEFTS_MODE
+  // RecursiveShuffle_M1_inner: recursive worker operating in place on the N
+  // blocks of block_size bytes starting at buf.  Same structure as
+  // RecursiveShuffle_M2_inner, but compaction is done with OP_TightCompact.
+  // selected_list must have room for N flags; its two halves are reused by
+  // the recursive calls.
+  template<OSwap_Style oswap_style>
+  void RecursiveShuffle_M1_inner(unsigned char *buf, uint64_t N, size_t block_size, bool *selected_list) {
+    FOAV_SAFE2_CNTXT(RS_M1_inner, N, block_size)
+    // Base cases.  N == 0 is treated like N == 1 (as RecursiveShuffle_M2_inner
+    // does): with the previous "N == 1" test an empty input recursed on two
+    // empty halves without ever terminating.
+    if (N <= 1) {
+        return;
+    }
+    FOAV_SAFE2_CNTXT(RS_M1_inner, N, block_size)
+    if (N == 2) {
+        //Flip a coin and swap the two
+        unsigned char *packet_1 = buf;
+        unsigned char *packet_2 = buf + block_size;
+        bool swap_flag = getRandomBit();
+        oswap_buffer<oswap_style>(packet_1, packet_2, block_size, swap_flag);
+        return;
+    }
+
+    // MarkHalf the elements
+    MarkHalf(N, selected_list);
+
+    //TightCompact
+    OP_TightCompact<oswap_style>(buf, N, block_size, selected_list);
+
+    // Recursively shuffle each half
+    size_t l_size = N/2;
+    size_t r_size = N-l_size;
+    bool *selected_L = selected_list;
+    bool *selected_R = selected_list + l_size;
+    RecursiveShuffle_M1_inner<oswap_style>(buf, l_size, block_size, selected_L);
+    RecursiveShuffle_M1_inner<oswap_style>(buf+(l_size*block_size), r_size, block_size, selected_R);
+
+  }
+#endif
+#endif

+ 367 - 0
Enclave/OblivAlgs/SortingNetwork.cpp

@@ -0,0 +1,367 @@
+#include <array>
+#include <sgx_tcrypto.h>
+#include "SortingNetwork.hpp"
+
+// Closed-form comparison-count estimate for an odd-even merge sort over N
+// items: (N/4)*L*L - (N/4)*L + N - 1, with L = calculatelog2(N).
+int num_oddevenmerge_comps(uint64_t N) {
+    const int log_n = calculatelog2(N);
+    const uint64_t quarter = N/4;
+    return quarter * log_n * log_n - quarter * log_n + N - 1;
+}
+
+// OddEvenMergeSort: run-time dispatcher for the templated OddEvenMergeSort.
+// buf is the input-output buffer and must hold N decrypted blocks of
+// block_size bytes each; every block is a SN_KEY_SIZE-byte key followed by
+// (block_size-SN_KEY_SIZE) bytes of data, and the blocks are sorted in place,
+// ascending by key.
+// The OSwap style is picked from block_size: 4, 8 and 12 have dedicated
+// styles, multiples of 16 use OSWAP_16X, and every other size uses OSWAP_8_16X.
+// NOTE(review): the earlier comment required block_size to be a multiple of
+// 16 although other sizes are dispatched here -- confirm which are supported.
+void OddEvenMergeSort(unsigned char *buf, uint64_t N, size_t block_size) {
+  switch (block_size) {
+    case 4:
+      OddEvenMergeSort<OSWAP_4>(buf, N, block_size);
+      break;
+    case 8:
+      OddEvenMergeSort<OSWAP_8>(buf, N, block_size);
+      break;
+    case 12:
+      OddEvenMergeSort<OSWAP_12>(buf, N, block_size);
+      break;
+    default:
+      if (block_size % 16 == 0) {
+        OddEvenMergeSort<OSWAP_16X>(buf, N, block_size);
+      } else {
+        OddEvenMergeSort<OSWAP_8_16X>(buf, N, block_size);
+      }
+      break;
+  }
+}
+
+#if 0
+// DecryptAndOddEvenMergeSort (currently compiled out by the enclosing "#if 0").
+// Decrypts the N blocks of encrypted_buffer, sorts the decrypted copy with
+// OddEvenMergeSort, and encrypts the result into result_buffer.
+// Returns (t2-t1)/1000.0, where t1/t2 are the ocall_clock readings taken
+// right after decryption and right after the sort.
+double DecryptAndOddEvenMergeSort(unsigned char *encrypted_buffer, uint64_t N, 
+        size_t encrypted_block_size, unsigned char *result_buffer) {
+
+  long t1, t2;
+  //ocall_clock(&t0);
+
+  // Decrypt buffer to decrypted_buffer
+  unsigned char *decrypted_buffer = NULL;
+  size_t decrypted_block_size = decryptBuffer(encrypted_buffer, N, encrypted_block_size,
+    &decrypted_buffer);
+  ocall_clock(&t1);
+
+  // Apply odd-even mergesort network
+  PRB_pool_init(1);
+  OddEvenMergeSort(decrypted_buffer, N, decrypted_block_size);
+  ocall_clock(&t2);
+
+  // Encrypt buffer to result_buffer
+  encryptBuffer(decrypted_buffer, N, decrypted_block_size, result_buffer);
+  PRB_pool_shutdown();
+  //ocall_clock(&t3);
+
+  // CLOCKS_PER_SEC == 1000000, so CLOCKS_PER_MS == 1000
+  //double decryption_ms = ((double)(t1-t0))/1000.0;
+  double compare_ms = ((double)(t2-t1))/1000.0;
+  //double encryption_ms = ((double)(t3-t2))/1000.0;
+
+  // Only referenced by the commented-out diagnostics below
+  int num_comparisons = num_oddevenmerge_comps(N);
+  //printf("Estimated comparisons for %d items: %d\n", N, num_comparisons);
+  //printf("Counted comparisons for %d items: %d\n", N, OSWAP_COUNTER);
+
+  //printf("Buffer decryption time: %lf ms\n", decryption_ms);
+  //printf("Compare-and-swaps time: %lf ms\n", compare_ms);
+  //printf("Buffer encryption time: %lf ms\n", encryption_ms);
+
+  free(decrypted_buffer);
+  return(compare_ms);
+}
+
+
+// NOTE: We don't need a _timed and non-timed one. If we don't keep SN Application, we can
+// just make _timed the only one and remove _timed from the name.
+// (Currently compiled out by the enclosing "#if 0".)
+// Same steps as DecryptAndOddEvenMergeSort; additionally stores OSWAP_COUNTER
+// in ret->OSWAP_count and the returned time in ret->ptime.
+double DecryptAndOddEvenMergeSort_timed(unsigned char *encrypted_buffer, uint64_t N, 
+        size_t encrypted_block_size, unsigned char *result_buffer, enc_ret *ret) {
+
+  long t1, t2;
+  //ocall_clock(&t0);
+
+  // Decrypt buffer to decrypted_buffer
+  unsigned char *decrypted_buffer = NULL;
+  size_t decrypted_block_size = decryptBuffer(encrypted_buffer, N, encrypted_block_size,
+    &decrypted_buffer);
+  ocall_clock(&t1);
+
+  // Apply odd-even mergesort network
+  PRB_pool_init(1);
+  OddEvenMergeSort(decrypted_buffer, N, decrypted_block_size);
+  ocall_clock(&t2);
+
+  // Encrypt buffer to result_buffer
+  encryptBuffer(decrypted_buffer, N, decrypted_block_size, result_buffer);
+  PRB_pool_shutdown();
+  //ocall_clock(&t3);
+
+  // CLOCKS_PER_SEC == 1000000, so CLOCKS_PER_MS == 1000
+  //double decryption_ms = ((double)(t1-t0))/1000.0;
+  double compare_ms = ((double)(t2-t1))/1000.0;
+  //double encryption_ms = ((double)(t3-t2))/1000.0;
+
+  // Only referenced by the commented-out diagnostics below
+  int num_comparisons = num_oddevenmerge_comps(N);
+  //printf("Estimated comparisons for %d items: %d\n", N, num_comparisons);
+  //printf("Counted comparisons for %d items: %d\n", N, OSWAP_COUNTER);
+
+  //printf("Buffer decryption time: %lf ms\n", decryption_ms);
+  //printf("Compare-and-swaps time: %lf ms\n", compare_ms);
+  //printf("Buffer encryption time: %lf ms\n", encryption_ms);
+
+  free(decrypted_buffer);
+  ret->OSWAP_count = OSWAP_COUNTER;
+  ret->ptime = compare_ms;
+  return(compare_ms);
+}
+#endif
+
+/*
+  BitonicSort (keys + associated data): run-time dispatcher for the templated
+  BitonicSort that sorts a key array and swaps the entries of two associated
+  data arrays with the same flag as the corresponding keys.
+
+  ascend: 1 = ascending
+          0 = descending
+
+  NOTE: 1) Keys are assumed to be limited to 8 byte values!
+        2) associated_data1 and associated_data2 are assumed to have the same
+           data_size, which is what selects the OSwap style: 4 and 8 have
+           dedicated styles, multiples of 16 use OSWAP_16X, and every other
+           size uses OSWAP_8_16X.
+*/
+void BitonicSort(unsigned char *keys, size_t N, unsigned char *associated_data1, unsigned char *associated_data2, size_t data_size, bool ascend) {
+  switch (data_size) {
+    case 4:
+      BitonicSort<OSWAP_4>(keys, N, associated_data1, associated_data2, data_size, ascend);
+      break;
+    case 8:
+      BitonicSort<OSWAP_8>(keys, N, associated_data1, associated_data2, data_size, ascend);
+      break;
+    default:
+      if (data_size % 16 == 0) {
+        BitonicSort<OSWAP_16X>(keys, N, associated_data1, associated_data2, data_size, ascend);
+      } else {
+        BitonicSort<OSWAP_8_16X>(keys, N, associated_data1, associated_data2, data_size, ascend);
+      }
+      break;
+  }
+}
+
+// BitonicSort (single buffer): run-time dispatcher for the templated
+// BitonicSort over N blocks of block_size bytes in buffer.  ascend is
+// forwarded unchanged (1 = ascending, 0 = descending).  The OSwap style is
+// picked from block_size: 4, 8 and 12 have dedicated styles, multiples of 16
+// use OSWAP_16X, and every other size uses OSWAP_8_16X.
+void BitonicSort(unsigned char *buffer, size_t N, size_t block_size, bool ascend) {
+  switch (block_size) {
+    case 4:
+      BitonicSort<OSWAP_4>(buffer, N, block_size, ascend);
+      break;
+    case 8:
+      BitonicSort<OSWAP_8>(buffer, N, block_size, ascend);
+      break;
+    case 12:
+      BitonicSort<OSWAP_12>(buffer, N, block_size, ascend);
+      break;
+    default:
+      if (block_size % 16 == 0) {
+        BitonicSort<OSWAP_16X>(buffer, N, block_size, ascend);
+      } else {
+        BitonicSort<OSWAP_8_16X>(buffer, N, block_size, ascend);
+      }
+      break;
+  }
+}
+
+#if 0
+//TODO: Take this off, if we no longer plan to support SN_App!
+// (Currently compiled out by the enclosing "#if 0".)
+// Decrypts the N blocks of encrypted_buffer, sorts the decrypted copy
+// ascending with BitonicSort, and encrypts the result into result_buffer.
+// Returns (t2-t1)/1000.0, where t1/t2 are the ocall_clock readings taken
+// around the sort.
+double DecryptAndBitonicSort(unsigned char *encrypted_buffer, uint64_t N, size_t encrypted_block_size,
+  unsigned char *result_buffer) {
+
+  long t1, t2;
+
+  // Decrypt buffer to decrypted_buffer
+  unsigned char *decrypted_buffer = NULL;
+  
+  size_t decrypted_block_size = decryptBuffer(encrypted_buffer, N, encrypted_block_size,
+    &decrypted_buffer);
+
+  ocall_clock(&t1);
+
+  // Apply the bitonic sorting network
+  //PRB_pool_init(1);
+  BitonicSort(decrypted_buffer, N, decrypted_block_size, true);
+  ocall_clock(&t2);
+
+  // Encrypt buffer to result_buffer
+  encryptBuffer(decrypted_buffer, N, decrypted_block_size, result_buffer);
+  //PRB_pool_shutdown();
+
+  // CLOCKS_PER_SEC == 1000000, so CLOCKS_PER_MS == 1000
+  double compare_ms = ((double)(t2-t1))/1000.0;
+
+  free(decrypted_buffer);
+  return(compare_ms);
+}
+
+// DecryptAndBitonicSort, enc_ret overload (currently compiled out by the
+// enclosing "#if 0").
+// Decrypts the N blocks of encrypted_buffer, sorts the decrypted copy
+// ascending with BitonicSort, and encrypts the result into result_buffer.
+// Returns (t2-t1)/1000.0 and also stores it in ret->ptime, with OSWAP_COUNTER
+// stored in ret->OSWAP_count.
+double DecryptAndBitonicSort(unsigned char *encrypted_buffer, uint64_t N, size_t encrypted_block_size,
+  unsigned char *result_buffer, enc_ret *ret) {
+
+  long t1, t2;
+
+  // Decrypt buffer to decrypted_buffer
+  unsigned char *decrypted_buffer = NULL;
+  
+  size_t decrypted_block_size = decryptBuffer(encrypted_buffer, N, encrypted_block_size,
+    &decrypted_buffer);
+
+  ocall_clock(&t1);
+
+  // NOTE(review): this copies the decrypted blocks into result_buffer before
+  // sorting; encryptBuffer below is later given the same result_buffer.  It
+  // looks like a leftover of the commented-out associated-data call -- confirm
+  // whether the copy (which falls inside the timed region) is still wanted.
+  memcpy(result_buffer, decrypted_buffer, N*decrypted_block_size);
+  // Apply the bitonic sorting network
+  PRB_pool_init(1);
+  BitonicSort(decrypted_buffer, N, decrypted_block_size, true);
+  //BitonicSort(decrypted_buffer, N, result_buffer, NULL, decrypted_block_size, true);
+  ocall_clock(&t2);
+
+  // Encrypt buffer to result_buffer
+  encryptBuffer(decrypted_buffer, N, decrypted_block_size, result_buffer);
+  PRB_pool_shutdown();
+
+  // CLOCKS_PER_SEC == 1000000, so CLOCKS_PER_MS == 1000
+  double compare_ms = ((double)(t2-t1))/1000.0;
+
+  free(decrypted_buffer);
+  ret->OSWAP_count = OSWAP_COUNTER;
+  ret->ptime = compare_ms;
+
+  return(compare_ms);
+}
+
+// DecryptAndOddEvenMergeSortShuffle (currently compiled out by the enclosing
+// "#if 0").
+// Draws 8*N random bytes, decrypts the N blocks while attaching those bytes
+// as tags (decryptBuffer_attachRTags), runs OddEvenMergeSort on the tagged
+// blocks, then strips the tags and encrypts into result_buffer
+// (encryptBuffer_removeRTags).
+// Returns (t2-t1)/1000.0 and also stores it in ret->ptime, with OSWAP_COUNTER
+// stored in ret->OSWAP_count.
+double DecryptAndOddEvenMergeSortShuffle(unsigned char *encrypted_buffer, uint64_t N, size_t encrypted_block_size,
+  unsigned char *result_buffer, enc_ret *ret) {
+
+  long t1, t2;
+  //ocall_clock(&t0);
+
+  // Decrypt buffer to decrypted_buffer
+  PRB_pool_init(1);
+  unsigned char *decrypted_buffer = NULL;
+  // 8 random bytes per block
+  unsigned char *random_bytes = new unsigned char[8*N];
+  getBulkRandomBytes(random_bytes, 8*N);
+
+  size_t decrypted_block_size = decryptBuffer_attachRTags(encrypted_buffer, N, encrypted_block_size,
+    random_bytes, &decrypted_buffer);
+  ocall_clock(&t1);
+
+  // Apply odd-even mergesort network
+  OddEvenMergeSort(decrypted_buffer, N, decrypted_block_size);
+  ocall_clock(&t2);
+
+  // Encrypt buffer to result_buffer
+  encryptBuffer_removeRTags(decrypted_buffer, N, decrypted_block_size, result_buffer);
+  PRB_pool_shutdown();
+
+  // CLOCKS_PER_SEC == 1000000, so CLOCKS_PER_MS == 1000
+  double compare_ms = ((double)(t2-t1))/1000.0;
+
+  free(decrypted_buffer);
+  delete []random_bytes;
+  ret->OSWAP_count = OSWAP_COUNTER;
+  ret->ptime = compare_ms;
+  
+  return(compare_ms);
+}
+
+
+
+// DecryptAndBitonicSortShuffle (currently compiled out by the enclosing
+// "#if 0").
+// Draws 8*N random bytes, decrypts the N blocks while attaching those bytes
+// as tags (decryptBuffer_attachRTags), runs BitonicSort (ascending) on the
+// tagged blocks, then strips the tags and encrypts into result_buffer
+// (encryptBuffer_removeRTags).
+// Returns (t2-t1)/1000.0 and also stores it in ret->ptime, with OSWAP_COUNTER
+// stored in ret->OSWAP_count.
+double DecryptAndBitonicSortShuffle(unsigned char *encrypted_buffer, uint64_t N, 
+        size_t encrypted_block_size, unsigned char *result_buffer, enc_ret *ret) {
+  long t1, t2;
+
+  // Decrypt buffer to decrypted_buffer
+  PRB_pool_init(1);
+  unsigned char *decrypted_buffer = NULL;
+  // 8 random bytes per block
+  size_t rsize = 8 * (size_t) N;
+  unsigned char *random_bytes = (unsigned char*) malloc(rsize);
+  // NOTE(review): a failed malloc is only logged; the NULL pointer is still
+  // passed to getBulkRandomBytes below.  No error-return convention is visible
+  // here -- decide on one before re-enabling this code.
+  if(random_bytes == NULL)
+    printf("Failed to allocate random_bytes in D&BSS\n");
+  getBulkRandomBytes(random_bytes, rsize);
+  
+  size_t decrypted_block_size = decryptBuffer_attachRTags(encrypted_buffer, N, 
+          encrypted_block_size, random_bytes, &decrypted_buffer);
+
+  ocall_clock(&t1);
+
+  // Apply the bitonic sorting network
+  // NOTE: We will never have decrypted_block_size==8, since attaching rTag will add 8 bytes.
+  // So minimum block_size here is 16
+  BitonicSort(decrypted_buffer, N, decrypted_block_size, true);
+
+  ocall_clock(&t2);
+
+  // Encrypt buffer to result_buffer
+  encryptBuffer_removeRTags(decrypted_buffer, N, decrypted_block_size, result_buffer);
+  PRB_pool_shutdown();
+
+  // CLOCKS_PER_SEC == 1000000, so CLOCKS_PER_MS == 1000
+  double compare_ms = ((double)(t2-t1))/1000.0;
+
+  free(decrypted_buffer);
+  free(random_bytes);
+
+  ret->OSWAP_count = OSWAP_COUNTER;
+  ret->ptime = compare_ms;
+  return(compare_ms);
+}
+
+// testBitonicSort (currently compiled out by the enclosing "#if 0").
+// Prints a small data set with random 16-byte keys before and after sorting,
+// once with the single-buffer BitonicSort and once with the key +
+// associated-data BitonicSort, so the output can be inspected by hand.
+// The 64-bit payload values are printed with %lu (after an explicit cast to
+// unsigned long); they used to be passed to %d, which does not match the
+// argument type.
+void testBitonicSort(){
+  size_t N = 10;
+  // Test the normal version of bitonic sort; each data item is a 16-byte key and two 8-byte integers
+  unsigned char *data = new unsigned char[N*(16+8+8)];
+  PRB_pool_init(1);
+  for(size_t i=0; i<N; i++){
+    unsigned char *item = data+(i*(16+8+8));
+    getRandomBytes((unsigned char*)item, 16);
+    *(uint64_t*)(item+16) = i;
+    *(uint64_t*)(item+24) = N-i;
+  }
+  PRB_pool_shutdown();
+
+  printf("Before BitonicSort\n");
+  for(size_t i=0; i<N; i++){
+    unsigned char *item = data+(i*(16+8+8));
+    printf("(");
+    // Key bytes are printed from the highest offset down
+    for (size_t j=0; j<16; ++j) { printf("%02x", item[15-j]); }
+    printf(", %lu, %lu)\n", (unsigned long) *(uint64_t*)(item+16), (unsigned long) *(uint64_t*)(item+24));
+  }
+  printf("\n");
+
+  BitonicSort<OSWAP_16X, __uint128_t>((unsigned char*) data, N, 16+8+8, true);
+
+  printf("After BitonicSort\n");
+  for(size_t i=0; i<N; i++){
+    unsigned char *item = data+(i*(16+8+8));
+    printf("(");
+    for (size_t j=0; j<16; ++j) { printf("%02x", item[15-j]); }
+    printf(", %lu, %lu)\n", (unsigned long) *(uint64_t*)(item+16), (unsigned long) *(uint64_t*)(item+24));
+  }
+  printf("\n");
+
+  printf("\n\n\n");
+
+  delete []data;
+
+  // Test the associated data version of bitonic sort
+  __uint128_t *key_array = new __uint128_t[N];
+  uint64_t *ass_data1 = new uint64_t[N];
+  uint64_t *ass_data2 = new uint64_t[N];
+
+  PRB_pool_init(1);
+  for(size_t i=0; i<N; i++){
+    getRandomBytes((unsigned char*) (key_array+i), 16);
+    ass_data1[i] = i;
+    ass_data2[i] = N-i;
+  }
+  PRB_pool_shutdown();
+
+
+  printf("Before BitonicSort (with associated data)\n");
+  for(size_t i=0; i<N; i++){
+    printf("(");
+    for (size_t j=0; j<16; ++j) { printf("%02x", ((unsigned char*)(key_array+i))[15-j]); }
+    printf(", %lu, %lu)\n", (unsigned long) ass_data1[i], (unsigned long) ass_data2[i]);
+  }
+  printf("\n");
+
+  BitonicSort<OSWAP_8, __uint128_t>((unsigned char*) key_array, N, (unsigned char*)ass_data1,
+        (unsigned char*) ass_data2, 8, true);
+
+  printf("After BitonicSort (with associated data)\n");
+  for(size_t i=0; i<N; i++){
+    printf("(");
+    for (size_t j=0; j<16; ++j) { printf("%02x", ((unsigned char*)(key_array+i))[15-j]); }
+    printf(", %lu, %lu)\n", (unsigned long) ass_data1[i], (unsigned long) ass_data2[i]);
+  }
+
+  printf("\n\n\n");
+
+  delete []key_array;
+  delete []ass_data1;
+  delete []ass_data2;
+}
+#endif

+ 37 - 0
Enclave/OblivAlgs/SortingNetwork.hpp

@@ -0,0 +1,37 @@
+#ifndef __SORTINGNETWORK_HPP__
+#define __SORTINGNETWORK_HPP__
+
+#include "oasm_lib.h"
+#include "utils.hpp"
+
+// Names the data layout handed to a sorting routine.
+// NOTE(review): judging by the enumerator names only, BUFFERS is a single
+// array of whole blocks and KEY_DATA / KEY_DATAX2 are a key array plus one /
+// two associated-data arrays -- confirm against the code that uses this enum.
+enum OSort_Style{BUFFERS, KEY_DATA, KEY_DATAX2};
+
+// Odd-even merge sort over N blocks of block_size bytes stored in buf.
+void OddEvenMergeSort(unsigned char *buf, uint64_t N, size_t block_size);
+
+// Disabled declaration (compiled out).
+#if 0
+double DecryptAndOddEvenMergeSort(unsigned char *encrypted_buffer, uint64_t N, size_t block_size,
+  unsigned char *result_buffer);
+#endif
+
+// Non-template overload: bitonic sort of N blocks of block_size bytes.
+void BitonicSort(unsigned char *buffer, size_t N, size_t block_size, bool ascend);
+
+// Non-template overload: bitonic sort of N keys with up to two
+// associated-data arrays of data_size-byte items.
+void BitonicSort(unsigned char *keys, size_t N, unsigned char *associated_data1,
+      unsigned char *associated_data2, size_t data_size, bool ascend);
+
+// Templated bitonic sort of a buffer of N blocks; oswap_style selects the
+// oblivious-swap variant and KeyType (default uint64_t) the type of the key
+// compared at the start of each block.  Defined in SortingNetwork.tcc.
+template<OSwap_Style oswap_style, typename KeyType = uint64_t>
+void BitonicSort(unsigned char *buffer, size_t N, size_t block_size, bool ascend);
+
+// Templated bitonic sort of a key array plus up to two associated-data
+// arrays (either may be NULL).  Defined in SortingNetwork.tcc.
+template<OSwap_Style oswap_style, typename KeyType = uint64_t>
+inline void BitonicSort(unsigned char *keys, size_t N, unsigned char *associated_data1,
+      unsigned char *associated_data2, size_t data_size, bool ascend);
+
+// Merge step used by the key/associated-data BitonicSort above.
+// Defined in SortingNetwork.tcc.
+template<OSwap_Style oswap_style, typename KeyType = uint64_t>
+void BitonicMerge(unsigned char *keys, size_t N, unsigned char *associated_data1,
+      unsigned char *associated_data2, size_t data_size, bool ascend);
+
+// Disabled declaration of the test driver (compiled out).
+#if 0
+void testBitonicSort();
+#endif
+
+#include "SortingNetwork.tcc"
+#endif

+ 338 - 0
Enclave/OblivAlgs/SortingNetwork.tcc

@@ -0,0 +1,338 @@
+#ifndef __SORTINGNETWORK_TCC__
+#define __SORTINGNETWORK_TCC__
+
+
+// Merge operation for odd-even mergesort, working on a strided view of buf.
+//
+//   buf         first item of the sequence to merge
+//   skip        number of blocks to skip between consecutive items, i.e.
+//               consecutive items are (skip+1)*block_size bytes apart
+//   N           number of items in the sequence
+//   block_size  size of one item in bytes
+//
+// Left and right parts must be sorted, with left size a power of two and
+// right size smaller.  Merges them in place into a sorted result.
+// Items are ordered by the uint64_t stored in their first 8 bytes.
+template<OSwap_Style oswap_style>
+void OddEvenMerge(unsigned char *buf, uint64_t skip, uint64_t N, size_t block_size) {
+  unsigned char *block1;
+  unsigned char *block2;
+
+  FOAV_SAFE_CNTXT(OddEvenMerge, N)
+  if (N < 2) {
+    return;
+  }
+  FOAV_SAFE_CNTXT(OddEvenMerge, N)
+  if (N == 2) {
+    // Two items left: a single compare-and-swap finishes the merge.
+    block1 = buf;
+    block2 = buf + block_size + (block_size*skip);
+    oswap_buffer<oswap_style>(block1, block2, block_size, 
+          ogt_set_flag(*((uint64_t *) block1), *((uint64_t *) block2)));
+    return;
+  }
+
+  // Merge the 1st, 3rd, 5th, ... items (stride doubles, start unchanged)
+  OddEvenMerge<oswap_style>(buf, 2*skip+1, (N/2)+(N%2), block_size);
+  // Merge the 2nd, 4th, 6th, ... items (stride doubles, start one item in)
+  OddEvenMerge<oswap_style>(buf+block_size+(block_size*skip), 2*skip+1, N/2, block_size);
+
+  // Compare-and-swap subsequent item pairs, skipping first item
+  block2 = buf;
+
+  FOAV_SAFE_CNTXT(OddEvenMerge, N)
+  // The index has the same unsigned 64-bit type as N, so the comparison
+  // against (N-1)/2 involves no signed/unsigned conversion and the counter
+  // cannot overflow for large N.
+  for (uint64_t i=0; i<(N-1)/2; i++) {
+    FOAV_SAFE_CNTXT(OddEvenMerge, i)
+    block1 = block2 + block_size + (block_size*skip);
+    block2 = block1 + block_size + (block_size*skip);
+    oswap_buffer<oswap_style>(block1, block2, block_size,
+        ogt_set_flag(*((uint64_t *) block1), *((uint64_t *) block2)));    
+  }
+}
+
+
+// Odd-even merge sort of N blocks of block_size bytes, in place in buf.
+// Blocks are ordered by the uint64_t stored in their first 8 bytes.
+template<OSwap_Style oswap_style>
+void OddEvenMergeSort(unsigned char *buf, uint64_t N, size_t block_size) {
+  FOAV_SAFE_CNTXT(OddEvenMerge, N)
+  if (N < 2) {
+    return;
+  }
+  FOAV_SAFE_CNTXT(OddEvenMerge, N)
+  if (N == 2) {
+    // Base case: one compare-and-swap of the two adjacent blocks.
+    unsigned char *first = buf;
+    unsigned char *second = buf + block_size;
+    bool out_of_order = ogt_set_flag(*((uint64_t *) first), *((uint64_t *) second));
+    oswap_buffer<oswap_style>(first, second, block_size, out_of_order);
+    return;
+  }
+
+  // Left part: pow2_lt(N) blocks; right part: whatever remains.
+  uint64_t left_count = pow2_lt(N);
+  uint64_t right_count = N - left_count;
+  unsigned char *right_start = buf + block_size*left_count;
+
+  // Sort both parts independently ...
+  OddEvenMergeSort<oswap_style>(buf, left_count, block_size);
+  OddEvenMergeSort<oswap_style>(right_start, right_count, block_size);
+
+  // ... then merge them over the whole range with no skipping.
+  OddEvenMerge<oswap_style>(buf, 0, N, block_size);
+}
+
+// Bitonic merge over a contiguous buffer of N blocks of block_size bytes.
+// Each compare-exchange compares the KeyType at the start of two blocks with
+// ogt<KeyType> and conditionally swaps the whole blocks with oswap_buffer;
+// when ascend is false the swap condition is negated.
+// NOTE(review): the input is presumably expected to be bitonic (as produced
+// by the two recursive BitonicSort calls) -- confirm against callers.
+template<OSwap_Style oswap_style, typename KeyType>
+void BitonicMerge(unsigned char *buffer, size_t N, size_t block_size, bool ascend) {
+  FOAV_SAFE2_CNTXT(BitonicMerge, N, block_size)
+  if(N<2){
+    return;
+  }
+  // N & -N isolates the lowest set bit of N; it equals N only when N is a
+  // power of two, so this branch handles every other size.
+  else if((N & (N * -1))!=N) {
+    // Left part has M = pow2_lt(N) blocks, right part the remaining N-M.
+    size_t M = pow2_lt(N);
+    unsigned char *block1 = buffer;
+    unsigned char *block2 = buffer + (M * block_size); 
+    // Only the first N-M blocks of the left part have a partner on the right.
+    size_t feasible_swaps = N - M;
+
+    FOAV_SAFE2_CNTXT(BitonicMerge, feasible_swaps, M)
+    for(size_t i=0; i<feasible_swaps; i++) {
+      FOAV_SAFE2_CNTXT(BitonicMerge, feasible_swaps, i)
+      uint8_t swap_flag = ogt<KeyType>((KeyType*)block1, (KeyType*)block2);
+      FOAV_SAFE_CNTXT(BitonicMerge, ascend)
+      if(ascend){
+        oswap_buffer<oswap_style>(block1, block2, block_size, swap_flag);
+      } else {
+        oswap_buffer<oswap_style>(block1, block2, block_size, !swap_flag);
+      }
+      block1+=block_size;
+      block2+=block_size; 
+      FOAV_SAFE2_CNTXT(BitonicMerge, feasible_swaps, i)
+    }
+ 
+    // Merge the two parts independently, in the same direction.
+    BitonicMerge<oswap_style, KeyType>(buffer, M, block_size, ascend);
+    BitonicMerge<oswap_style, KeyType>(buffer + (M * block_size), N-M, block_size, ascend);
+  } 
+  else{ //Power of 2 case
+    // Compare-exchange block i of the first half with block i of the second.
+    size_t split = N/2;
+    unsigned char *block1 = buffer;
+    unsigned char *block2 = buffer + (split * block_size); 
+    
+    FOAV_SAFE_CNTXT(BitonicSort, split)
+    for(size_t i=0; i<split; i++) {
+    FOAV_SAFE_CNTXT(BitonicSort, i)
+    FOAV_SAFE_CNTXT(BitonicSort, split)
+      uint8_t swap_flag = ogt<KeyType>((KeyType*)block1, (KeyType*)block2);
+      FOAV_SAFE_CNTXT(BitonicSort, ascend)
+      if(ascend){
+        oswap_buffer<oswap_style>(block1, block2, block_size, swap_flag);
+        //ogt_comp_swap((uint64_t *) block1, (uint64_t *) block2, block1, block2, block_size);
+      } else {
+        oswap_buffer<oswap_style>(block1, block2, block_size, !swap_flag);
+        //ogt_comp_swap((uint64_t *) block2, (uint64_t *) block1, block2, block1, block_size);
+      }
+      block1+=block_size;
+      block2+=block_size; 
+    } 
+
+    // Merge each half independently, in the same direction.
+    BitonicMerge<oswap_style, KeyType>(buffer, split, block_size, ascend);
+    BitonicMerge<oswap_style, KeyType>(buffer + (split * block_size), split, block_size, ascend);
+  }
+}
+
+
+
+// Bitonic sort of a contiguous buffer of N blocks of block_size bytes each.
+//
+//   buffer      start of the N blocks; sorted in place
+//   N           number of blocks (any value, not only powers of two)
+//   block_size  size of one block in bytes
+//   ascend      sort into ascending order if true, descending otherwise
+//
+// Blocks are compared by BitonicMerge on the KeyType at their start.
+// The default for KeyType (uint64_t) comes from the declaration in
+// SortingNetwork.hpp; a default template argument may be given by only one
+// declaration, so it is not repeated on this definition.
+template<OSwap_Style oswap_style, typename KeyType>
+void BitonicSort(unsigned char *buffer, size_t N, size_t block_size, bool ascend) {
+  FOAV_SAFE_CNTXT(BitonicSort, N)
+  if(N < 2){
+    return;
+  }
+  else {  // Handle non-power of 2 case:
+    // Sort the first half in the opposite direction and the second half in
+    // the requested direction, then merge the two halves.
+    size_t N1 = N/2; 
+    BitonicSort<oswap_style, KeyType>(buffer, N1, block_size, !ascend);
+    BitonicSort<oswap_style, KeyType>(buffer + (block_size * N1), N-N1, block_size, ascend);
+    BitonicMerge<oswap_style, KeyType>(buffer, N, block_size, ascend);
+  }
+}
+
+
+// Bitonic merge of N keys together with up to two parallel arrays of
+// associated data.
+//
+//   keys              N keys of type KeyType, stored contiguously
+//   associated_data1  N items of data_size bytes each, or NULL
+//   associated_data2  N items of data_size bytes each, or NULL
+//   data_size         size in bytes of one associated-data item
+//   ascend            merge into ascending order if true, descending otherwise
+//
+// If associated_data1 is NULL, keys is treated as one buffer of N blocks of
+// data_size bytes each (the layout BitonicSort uses when it splits keys in
+// that case) and the work is delegated to the buffer-based BitonicMerge, so
+// that the right half is located with the same data_size stride that is used
+// to step through and swap the blocks.
+//
+// Otherwise every compare-exchange compares two keys with ogt<KeyType> and
+// applies the same conditional swap to the keys and to the matching items of
+// each non-NULL associated-data array.
+template<OSwap_Style oswap_style, typename KeyType>
+void BitonicMerge(unsigned char *keys, size_t N, unsigned char *associated_data1,
+      unsigned char *associated_data2, size_t data_size, bool ascend) {
+  if(associated_data1==NULL) {
+    // Whole-block layout: blocks are data_size bytes apart, not sizeof(KeyType).
+    BitonicMerge<oswap_style, KeyType>(keys, N, data_size, ascend);
+    return;
+  }
+  if(N<2){
+    return;
+  }
+
+  // Size of the left part.  N & -N equals N only for powers of two; those
+  // split in the middle, every other size splits at pow2_lt(N).
+  size_t split;
+  if((N & (N * -1))!=N) {
+    split = pow2_lt(N);
+  } else {
+    split = N/2;
+  }
+  // One compare-exchange per element of the right part (N/2 for powers of 2).
+  size_t num_swaps = N - split;
+
+  unsigned char *block1 = keys;
+  unsigned char *block2 = keys + (split * sizeof(KeyType));
+  unsigned char *adata1_l = associated_data1;
+  unsigned char *adata1_r = associated_data1 + (split * data_size);
+  unsigned char *adata2_l = associated_data2;
+  // Start of the right part of the second data array; stays NULL if absent.
+  unsigned char *adata2_right_start = associated_data2;
+  if(associated_data2!=NULL) {
+    adata2_right_start = associated_data2 + (split * data_size);
+  }
+  unsigned char *adata2_r = adata2_right_start;
+
+  for(size_t i=0; i<num_swaps; i++) {
+    uint8_t swap_flag = ogt<KeyType>((KeyType*)block1, (KeyType*)block2);
+    if(ascend){
+      oswap_key<KeyType>(block1, block2, swap_flag);
+      oswap_buffer<oswap_style>(adata1_l, adata1_r, data_size, swap_flag);
+      if(associated_data2!=NULL){
+        oswap_buffer<oswap_style>(adata2_l, adata2_r, data_size, swap_flag);
+      }
+    } else {
+      oswap_key<KeyType>(block1, block2, !swap_flag);
+      oswap_buffer<oswap_style>(adata1_l, adata1_r, data_size, !swap_flag);
+      if(associated_data2!=NULL){
+        oswap_buffer<oswap_style>(adata2_l, adata2_r, data_size, !swap_flag);
+      }
+    }
+    block1+=sizeof(KeyType);
+    block2+=sizeof(KeyType);
+    adata1_l+=data_size;
+    adata1_r+=data_size;
+    if(associated_data2!=NULL){
+      adata2_l+=data_size;
+      adata2_r+=data_size;
+    }
+  }
+
+  // Merge the left and right parts independently, in the same direction.
+  BitonicMerge<oswap_style, KeyType>(keys, split, associated_data1, associated_data2, data_size, ascend);
+  BitonicMerge<oswap_style, KeyType>(keys + (split * sizeof(KeyType)), N-split,
+      associated_data1 + (split*data_size), adata2_right_start, data_size, ascend);
+}
+
+
+// Bitonic sort of N keys together with up to two parallel arrays of
+// associated data.
+//
+//   keys              N keys of type KeyType; if associated_data1 is NULL,
+//                     instead a single buffer of N blocks of data_size bytes
+//   associated_data1  N items of data_size bytes each, or NULL
+//   associated_data2  N items of data_size bytes each, or NULL
+//   data_size         size in bytes of one associated-data item (or of one
+//                     block when associated_data1 is NULL)
+//   ascend            sort into ascending order if true, descending otherwise
+//
+// The default for KeyType (uint64_t) comes from the declaration in
+// SortingNetwork.hpp; a default template argument may be given by only one
+// declaration, so it is not repeated on this definition.
+template<OSwap_Style oswap_style, typename KeyType>
+void BitonicSort(unsigned char *keys, size_t N, unsigned char *associated_data1, 
+      unsigned char *associated_data2, size_t data_size, bool ascend) {
+  FOAV_SAFE_CNTXT(BitonicSort, N)
+  if(N < 2){
+    return;
+  }
+  else {
+    // In every case: sort the first N1 elements in the opposite direction,
+    // the remaining N-N1 in the requested direction, then merge.  The cases
+    // differ only in how the start of the second half is located.
+    size_t N1 = N/2;
+    FOAV_SAFE_CNTXT(BitonicSort, associated_data1)
+    FOAV_SAFE_CNTXT(BitonicSort, associated_data2)
+    if(associated_data1==NULL){
+      BitonicSort<oswap_style, KeyType>(keys, N1, associated_data1, associated_data2,
+                    data_size, !ascend);
+      // Increment keys by N1 data_size blocks, since keys holds the entire buffer to sort.
+      BitonicSort<oswap_style, KeyType>(keys + (N1*data_size), N-N1, associated_data1,
+                    associated_data2, data_size, ascend);
+      BitonicMerge<oswap_style, KeyType>(keys, N, associated_data1, associated_data2,
+                    data_size, ascend);
+    } else if(associated_data2==NULL){
+      //There is only one associated_data list.
+      BitonicSort<oswap_style, KeyType>(keys, N1, associated_data1, associated_data2,
+                    data_size, !ascend);
+      BitonicSort<oswap_style, KeyType>(keys + (N1*sizeof(KeyType)), N-N1, associated_data1 + (N1*data_size),
+                    associated_data2, data_size, ascend);
+      BitonicMerge<oswap_style, KeyType>(keys, N, associated_data1, associated_data2,
+                    data_size, ascend);
+      FOAV_SAFE_CNTXT(BitonicSort, associated_data1)
+      FOAV_SAFE_CNTXT(BitonicSort, associated_data2)
+    } else { 
+      //Both associated_data lists.
+      BitonicSort<oswap_style, KeyType>(keys, N1, associated_data1, associated_data2,
+                    data_size, !ascend);
+      BitonicSort<oswap_style, KeyType>(keys + (N1*sizeof(KeyType)), N-N1, associated_data1 + (N1*data_size),
+                    associated_data2 + (N1*data_size), data_size, ascend);
+      BitonicMerge<oswap_style, KeyType>(keys, N, associated_data1, associated_data2,
+                    data_size, ascend);
+    }
+
+  }
+  
+}
+
+
+
+
+#endif

+ 26 - 0
Enclave/OblivAlgs/TightCompaction_v2.cpp

@@ -0,0 +1,26 @@
+#include "TightCompaction_v2.hpp"
+
+
+// For every selected entry k (selected_list[k] == 1), stores in
+// LS_distance[k] how many positions the entry has to move left so that the
+// selected entries end up packed at the front: k minus the number of selected
+// entries before k.  The store goes through oset_value with the selection
+// flag as its condition.
+//
+//   N              number of entries
+//   buffer_start   start of the packet buffer (only walked, never read here)
+//   block_size     size of one packet in bytes
+//   selected_list  N selection flags
+//   LS_distance    N output distances
+void compute_LS_distances(uint64_t N, unsigned char *buffer_start,
+      size_t block_size, bool *selected_list, uint64_t *LS_distance){
+
+  // Count of selected entries seen so far, i.e. the index the next selected
+  // entry is mapped to.
+  uint64_t selected_seen = 0;
+  unsigned char *cursor = buffer_start;
+
+  // Single linear pass over the entries.
+  FOAV_SAFE2_CNTXT(TC_compute_LS_distances, N, block_size)
+  for(uint64_t idx=0; idx<N; ++idx) {
+
+    uint8_t is_selected = (selected_list[idx]==1);
+    uint64_t distance = idx-selected_seen;
+
+    // Conditional on is_selected: LS_distance[idx] = distance, and the
+    // running count advances by one.
+    oset_value(&(LS_distance[idx]), distance, is_selected); 
+    selected_seen+=is_selected;
+
+    cursor+=block_size;
+    FOAV_SAFE2_CNTXT(TC_compute_LS_distances, N, idx)
+  }
+}

+ 45 - 0
Enclave/OblivAlgs/TightCompaction_v2.hpp

@@ -0,0 +1,45 @@
+#ifndef __NOP_TIGHTCOMPACTION_V2_HPP__
+#define __NOP_TIGHTCOMPACTION_V2_HPP__
+
+#ifndef BEFTS_MODE
+  #include <sgx_tcrypto.h>
+  #include "oasm_lib.h"
+  #include "utils.hpp"
+  #include <vector>
+  #include "foav.h"
+#endif
+
+// Compacts N blocks (N a power of two) so that the selected blocks start at
+// index `offset`, wrapping around the end of the buffer.  Builds the
+// cumulative selected counts itself when TC_PRECOMPUTE_COUNTS is set.
+template<OSwap_Style oswap_style>
+void TightCompact_2power(unsigned char *buf, size_t N, size_t block_size,
+       size_t offset, bool *selected);
+
+// Recursive worker for TightCompact_2power; selected_count holds cumulative
+// counts of `selected` when TC_PRECOMPUTE_COUNTS is set.
+template<OSwap_Style oswap_style>
+void TightCompact_2power_inner(unsigned char *buf, size_t N, 
+      size_t block_size, size_t offset, bool *selected, uint32_t *selected_count);
+
+// Compacts N blocks (any N) so that the selected blocks come first.
+template<OSwap_Style oswap_style>
+void TightCompact(unsigned char *buf, size_t N, size_t block_size, bool *selected);
+
+// Recursive worker for TightCompact.
+template<OSwap_Style oswap_style>
+void TightCompact_inner(unsigned char *buf, size_t N, size_t block_size, bool *selected, uint32_t *selected_count);
+
+// Multithreaded variant of TightCompact using up to nthreads threads.
+template<OSwap_Style oswap_style>
+void TightCompact_parallel(unsigned char *buf, size_t N, size_t block_size, bool *selected, size_t nthreads);
+
+// Recursive worker for TightCompact_parallel.
+template<OSwap_Style oswap_style>
+void TightCompact_inner_parallel(unsigned char *buf, size_t N, size_t block_size, bool *selected, uint32_t *selected_count, size_t nthreads);
+
+// Multithreaded variant of TightCompact_2power_inner; always reads
+// selected_count.
+template<OSwap_Style oswap_style>
+void TightCompact_2power_inner_parallel(unsigned char *buf, size_t N,
+      size_t block_size, size_t offset, bool *selected, uint32_t *selected_count,
+      size_t nthreads);
+
+// NOTE(review): no definition visible in this part of the source; presumably
+// the order-preserving compaction variant -- confirm.
+template <OSwap_Style oswap_style>
+void OP_TightCompact_v2(unsigned char *buf, size_t block_size, bool *selected_list);
+
+// Computes, for each selected entry, its left-shift distance
+// (defined in TightCompaction_v2.cpp).
+void compute_LS_distances(uint64_t N, unsigned char *buffer_start, size_t block_size, 
+        bool *selected_list, uint64_t *LS_distance);
+
+#include "TightCompaction_v2.tcc"
+
+#endif

+ 487 - 0
Enclave/OblivAlgs/TightCompaction_v2.tcc

@@ -0,0 +1,487 @@
+#ifndef __NOP_TIGHTCOMPACTION_V2_TCC__
+#define __NOP_TIGHTCOMPACTION_V2_TCC__
+
+#include "pthread.h"
+
+/*
+   TightCompaction (Non-Order Preserving Tight Compaction):
+
+   Non-Order Preserving TightCompaction can take an input array of blocks of
+   block_size bytes each, and an array of "marked" elements with ones at the
+   indices corresponding to the blocks that need to be compacted.
+   It returns the input array TightCompact-ed, i.e. all the real blocks are
+   moved to the start of the array, or compacted to an input index
+   (with wraparound)
+
+ */
+
+// Entry point for compacting N blocks (N a power of two) of block_size bytes
+// so that the blocks flagged in `selected` start at index `offset`, with
+// wraparound.  When TC_PRECOMPUTE_COUNTS is set, the cumulative counts of
+// selected blocks are built once here and shared by the whole recursion;
+// otherwise the recursion is handed a NULL count array.
+template <OSwap_Style oswap_style>
+void TightCompact_2power(unsigned char *buf, size_t N, size_t block_size, size_t offset, bool *selected) {
+  FOAV_SAFE2_CNTXT(TC_2power_summing_selected_count, N, block_size)
+  uint32_t *prefix_counts = NULL;
+  FOAV_SAFE_CNTXT(TC_2power, TC_PRECOMPUTE_COUNTS)
+  if (TC_PRECOMPUTE_COUNTS) {
+    // prefix_counts[i] = number of selected blocks among the first i blocks
+    prefix_counts = new uint32_t[N+1];
+    prefix_counts[0] = 0;
+    for (size_t idx=0; idx<N; ++idx){
+      FOAV_SAFE2_CNTXT(TC_2power_summing_selected_count, idx, N)
+      prefix_counts[idx+1] = prefix_counts[idx] + selected[idx];
+    }
+  }
+  TightCompact_2power_inner<oswap_style>(buf, N, block_size, offset, selected, prefix_counts);
+  // No-op when no counts were allocated.
+  delete[] prefix_counts;
+}
+
+// Recursive worker: compacts N blocks (N a power of two) of block_size bytes
+// so that the blocks flagged in `selected` start at index `offset`, with
+// wraparound.
+//
+//   buf             start of the N blocks; rearranged in place
+//   offset          target index of the first selected block; reduced mod N
+//                   on entry, so offset == N is accepted and means 0
+//   selected        N selection flags
+//   selected_count  cumulative counts of `selected` (selected_count[i] -
+//                   selected_count[0] = selected blocks among the first i);
+//                   only read when TC_PRECOMPUTE_COUNTS is set
+template <OSwap_Style oswap_style>
+void TightCompact_2power_inner(unsigned char *buf, size_t N, size_t block_size, size_t offset, bool *selected, uint32_t *selected_count) {
+  FOAV_SAFE2_CNTXT(TC_inner_base_cases_of_recursion, N, block_size)
+  // N==0 is treated like N==1; otherwise it would recurse on N/2 == 0 forever.
+  if (N<=1) {
+    return;
+  }
+  // N is a power of two, so masking with N-1 reduces offset mod N.  Callers
+  // such as TightCompact_inner_parallel compute offsets that can equal N;
+  // without the reduction such an offset would count as lying in the right
+  // half (and as a nonzero flag in the N==2 case), inverting the swaps.
+  offset &= (N-1);
+  if (N==2) {
+    bool swap = (!selected[0] & selected[1]) ^ offset;
+    oswap_buffer<oswap_style>(buf, buf+block_size, block_size, swap);
+    return;
+  }
+
+  // Number of selected items in left half
+  size_t m1;
+  FOAV_SAFE_CNTXT(TC_2power, TC_PRECOMPUTE_COUNTS)
+  if (TC_PRECOMPUTE_COUNTS) {
+    m1 = selected_count[N/2] - selected_count[0];
+  } else {
+    m1=0;
+    FOAV_SAFE_CNTXT(TC_2power, N)
+    for(size_t i=0; i<N/2; i++){
+      FOAV_SAFE_CNTXT(TC_2power, i)
+      m1+=selected[i];
+    }
+  }
+
+  // Target offsets of the two halves, each reduced mod N/2
+  size_t offset_mod = (offset & ((N/2)-1));
+  size_t offset_m1_mod = (offset+m1) & ((N/2)-1);
+  // Does the target offset fall in the right half?
+  bool offset_right = (offset >= N/2);
+  // Do the left half's selected items run past the end of a half?
+  bool left_wrapped = ((offset_mod + m1) >= (N/2));
+
+  TightCompact_2power_inner<oswap_style>(buf, N/2, block_size, offset_mod, selected, selected_count);
+  TightCompact_2power_inner<oswap_style>(buf + ((N/2)*block_size), N/2, block_size, offset_m1_mod, (selected + (N/2)), selected_count + N/2);
+
+  // Conditionally swap block i of the left half with block i of the right
+  // half.  Both branches apply the same flag,
+  // (i >= offset_m1_mod) ^ left_wrapped ^ offset_right; the first keeps it
+  // as a running value toggled once at i == offset_m1_mod.
+  unsigned char *buf1_ptr = buf, *buf2_ptr = (buf + (N/2)*block_size);
+  FOAV_SAFE_CNTXT(TC_2power_inner, TC_OPT_SWAP_FLAG)
+  if (TC_OPT_SWAP_FLAG) {
+    bool swap_flag = left_wrapped ^ offset_right;
+    size_t num_swap = N/2;
+    FOAV_SAFE2_CNTXT(TC_2power_inner, num_swap, block_size)
+    for(size_t i=0; i<num_swap; i++){
+      FOAV_SAFE2_CNTXT(TC_2power_inner_N/2_swaps, i, num_swap)
+      swap_flag = swap_flag ^ (i == offset_m1_mod);
+      oswap_buffer<oswap_style>(buf1_ptr, buf2_ptr, block_size, swap_flag);
+      buf1_ptr+=block_size;
+      buf2_ptr+=block_size;
+      FOAV_SAFE2_CNTXT(TC_2power_inner, num_swap, block_size)
+    }
+  } else {
+    FOAV_SAFE_CNTXT(TC_2power_inner, N)
+    for(size_t i=0; i<N/2; i++){
+      FOAV_SAFE_CNTXT(TC_2power_inner, i)
+      bool swap_flag = (i>=offset_m1_mod) ^ left_wrapped ^ offset_right;
+      oswap_buffer<oswap_style>(buf1_ptr, buf2_ptr, block_size, swap_flag);
+      buf1_ptr+=block_size;
+      buf2_ptr+=block_size;
+      FOAV_SAFE2_CNTXT(TC_2power_inner, i, N)
+    }
+  }
+}
+
+// Arguments of TightCompact_2power_inner_parallel, bundled so that a call can
+// be handed to another thread through a single void pointer.  The members
+// are in the callee's parameter order; users initialise this struct
+// positionally, so the order must not change.
+struct TightCompact_2power_inner_parallel_args {
+    unsigned char *buf;        // start of the blocks to compact
+    size_t N;                  // number of blocks
+    size_t block_size;         // size of one block in bytes
+    size_t offset;             // target index of the first selected block
+    bool *selected;            // selection flags for the N blocks
+    uint32_t *selected_count;  // cumulative counts matching `selected`
+    size_t nthreads;           // number of threads the callee may use
+};
+
+// Thread entry point: unpacks a TightCompact_2power_inner_parallel_args
+// passed as void* and forwards its members to
+// TightCompact_2power_inner_parallel.  Always returns NULL.
+template <OSwap_Style oswap_style>
+static void* TightCompact_2power_inner_parallel_launch(void *voidargs) {
+    const TightCompact_2power_inner_parallel_args &job =
+        *static_cast<TightCompact_2power_inner_parallel_args *>(voidargs);
+    TightCompact_2power_inner_parallel<oswap_style>(
+        job.buf,
+        job.N,
+        job.block_size,
+        job.offset,
+        job.selected,
+        job.selected_count,
+        job.nthreads);
+    return NULL;
+}
+
+// Arguments of oswap_range, bundled so that a call can be handed to another
+// thread through a single void pointer.  Users initialise this struct
+// positionally, so the member order must not change.
+struct oswap_range_args {
+    size_t block_size;      // size of one block in bytes
+    size_t swap_start;      // first block index to process
+    size_t swap_end;        // one past the last block index to process
+    size_t offset_m1_mod;   // index from which the swap flag is inverted
+    unsigned char *buf1;    // block 0 of the first sequence
+    unsigned char *buf2;    // block 0 of the second sequence
+    bool swap_flag;         // swap flag used below offset_m1_mod
+};
+
+// Thread entry point: conditionally swaps blocks swap_start .. swap_end-1 of
+// the two sequences described by an oswap_range_args passed as void*.
+// Block i is swapped under the flag  swap_flag ^ (i >= offset_m1_mod).
+// Always returns NULL.
+template <OSwap_Style oswap_style>
+static void* oswap_range(void *voidargs) {
+    struct oswap_range_args *args = (oswap_range_args*)voidargs;
+    size_t block_size = args->block_size;
+    size_t swap_start = args->swap_start;
+    size_t swap_end = args->swap_end;
+    size_t offset_m1_mod = args->offset_m1_mod;
+    // buf1/buf2 in args point at block 0; advance to this range's first block
+    unsigned char *buf1 = args->buf1 + swap_start*block_size;
+    unsigned char *buf2 = args->buf2 + swap_start*block_size;
+    bool swap_flag = args->swap_flag;
+    FOAV_SAFE2_CNTXT(oswap_range, block_size, swap_start)
+    FOAV_SAFE_CNTXT(oswap_range, swap_end)
+    //FOAV_SAFE_CNTXT(oswap_range, &OSWAP_COUNTER)
+    //printf("start oswap range %p %lu %lu\n", buf1, swap_start, swap_end);
+    for(size_t i=swap_start; i<swap_end; i++){
+      FOAV_SAFE2_CNTXT(oswap_range, i, swap_end)
+      // The base flag is inverted for indices at or beyond offset_m1_mod
+      oswap_buffer<oswap_style>(buf1, buf2, block_size, swap_flag ^ (i >= offset_m1_mod));
+      buf1+=block_size;
+      buf2+=block_size;
+      //FOAV_SAFE_CNTXT(oswap_range, &OSWAP_COUNTER)
+      FOAV_SAFE2_CNTXT(oswap_range, swap_end, block_size)
+      FOAV_SAFE_CNTXT(oswap_range, i)
+    }
+    //printf("end oswap range %p %lu %lu\n", buf1, swap_start, swap_end);
+    return NULL;
+}
+
+
+// Multithreaded version of TightCompact_2power_inner: compacts N blocks (N a
+// power of two) of block_size bytes so that the blocks flagged in `selected`
+// start at index `offset`, with wraparound.
+//
+//   buf             start of the N blocks; rearranged in place
+//   offset          target index of the first selected block; reduced mod N
+//                   on entry, so offset == N is accepted and means 0
+//   selected        N selection flags
+//   selected_count  cumulative counts of `selected`; always read here, so it
+//                   must not be NULL when nthreads > 1 and N > 2
+//   nthreads        number of threads to use, with ids g_thread_id ..
+//                   g_thread_id + nthreads-1; with nthreads <= 1 the
+//                   single-threaded TightCompact_2power_inner does the work
+template <OSwap_Style oswap_style>
+void TightCompact_2power_inner_parallel(unsigned char *buf, size_t N, size_t block_size, size_t offset, bool *selected, uint32_t *selected_count, size_t nthreads) {
+  FOAV_SAFE_CNTXT(TC_inner_base_cases_of_recursion, g_thread_id)
+  FOAV_SAFE2_CNTXT(TC_inner_base_cases_of_recursion, N, block_size)
+  FOAV_SAFE_CNTXT(TC_inner_base_cases_of_recursion, nthreads)
+  // N is a power of two, so masking with N-1 reduces offset mod N (for N==0
+  // the mask is all ones and offset is left alone).
+  // TightCompact_inner_parallel passes n1 - n2 + m2, which equals N when
+  // every block of its left part is selected; without the reduction such an
+  // offset would count as lying in the right half and invert the swaps.
+  offset &= (N-1);
+  if (nthreads <= 1) {
+    FOAV_SAFE2_CNTXT(TC_inner_base_cases_of_recursion, N, block_size)
+    FOAV_SAFE_CNTXT(TC_inner_base_cases_of_recursion, nthreads)
+    unsigned long start = printf_with_rtclock("Thread %u starting TightCompact_2power_inner(buf=%p, N=%lu, offset=%lu, nthreads=%lu)\n", g_thread_id, buf, N, offset, nthreads);
+    TightCompact_2power_inner<oswap_style>(buf, N, block_size, offset, selected, selected_count);
+    printf_with_rtclock_diff(start, "Thread %u ending TightCompact_2power_inner(buf=%p, N=%lu, offset=%lu, nthreads=%lu)\n", g_thread_id, buf, N, offset, nthreads);
+    return;
+  }
+  FOAV_SAFE_CNTXT(TC_inner_base_cases_of_recursion, N)
+  // N==0 is treated like N==1; otherwise it would recurse on N/2 == 0 forever.
+  if (N<=1) {
+    return;
+  }
+  FOAV_SAFE_CNTXT(TC_inner_base_cases_of_recursion, N)
+  if (N==2) {
+    bool swap = (!selected[0] & selected[1]) ^ offset;
+    oswap_buffer<oswap_style>(buf, buf+block_size, block_size, swap);
+    return;
+  }
+
+  unsigned long start = printf_with_rtclock("Thread %u starting TightCompact_2power_inner_parallel(buf=%p, N=%lu, offset=%lu, nthreads=%lu)\n", g_thread_id, buf, N, offset, nthreads);
+  // Number of selected items in left half
+  size_t m1;
+  m1 = selected_count[N/2] - selected_count[0];
+
+  // Target offsets of the two halves, each reduced mod N/2
+  size_t offset_mod = (offset & ((N/2)-1));
+  size_t offset_m1_mod = (offset+m1) & ((N/2)-1);
+  // Does the target offset fall in the right half?
+  bool offset_right = (offset >= N/2);
+  // Do the left half's selected items run past the end of a half?
+  bool left_wrapped = ((offset_mod + m1) >= (N/2));
+  size_t lthreads = nthreads/2;
+  size_t rthreads = nthreads - lthreads;
+
+  threadid_t rightthreadid = g_thread_id + lthreads;
+  /* Dispatch the right half to thread g_thread_id + lthreads; it will inherit threads
+     g_thread_id + lthreads .. g_thread_id + nthreads-1. */
+  struct TightCompact_2power_inner_parallel_args rightargs = {
+    buf+ ((N/2)*block_size), N/2, block_size, offset_m1_mod, selected + N/2, selected_count + N/2, rthreads
+  };
+  threadpool_dispatch(rightthreadid,
+    TightCompact_2power_inner_parallel_launch<oswap_style>,
+    &rightargs);
+  /* Do the left half ourselves (threads g_thread_id .. g_thread_id + lthreads-1) */
+  TightCompact_2power_inner_parallel<oswap_style>(buf, N/2, block_size, offset_mod, selected, selected_count, lthreads);
+  threadpool_join(rightthreadid, NULL);
+
+  // Both halves are compacted; now conditionally swap block i of the left
+  // half with block i of the right half, with the N/2 swaps divided into
+  // nthreads contiguous ranges.
+  unsigned char *buf1_ptr = buf, *buf2_ptr = (buf + (N/2)*block_size);
+  bool swap_flag = left_wrapped ^ offset_right;
+  size_t num_swap = N/2;
+  FOAV_SAFE2_CNTXT(TC_2power_inner, num_swap, block_size)
+
+  oswap_range_args args[nthreads];
+  // Each range gets inc swaps; the first `extra` ranges get one more.
+  size_t inc = num_swap / nthreads;
+  size_t extra = num_swap % nthreads;
+  size_t last = 0;
+  for (size_t i=0; i<nthreads; ++i) {
+    size_t next = last + inc + (i < extra);
+    args[i] = { block_size, last, next, offset_m1_mod, buf1_ptr, buf2_ptr, swap_flag };
+    last = next;
+  }
+  // Ranges 0 .. nthreads-2 go to threads g_thread_id+1 .. g_thread_id+nthreads-1
+  for (size_t i=0; i<nthreads-1; ++i) {
+    threadpool_dispatch(g_thread_id+1+i, oswap_range<oswap_style>, args+i);
+  }
+  // Do the last section ourselves
+  oswap_range<oswap_style>((void*)(args+nthreads-1));
+  for (size_t i=0; i<nthreads-1; ++i) {
+    FOAV_SAFE2_CNTXT(TC_2power_inner_parallel, i, nthreads)
+    threadpool_join(g_thread_id+1+i, NULL);
+  }
+  printf_with_rtclock_diff(start, "Thread %u ending TightCompact_2power_inner_parallel(buf=%p, N=%lu, offset=%lu, nthreads=%lu)\n", g_thread_id, buf, N, offset, nthreads);
+}
+
+/*
+  TightCompact: moves the blocks flagged in `selected` to the front of buf.
+
+    buf         N blocks of block_size bytes each; rearranged in place
+    N           number of blocks (any value)
+    block_size  size of one block in bytes
+    selected    N selection flags
+
+  When TC_PRECOMPUTE_COUNTS is set, the cumulative counts of selected blocks
+  are built once here and shared by the whole recursion.  If that array
+  cannot be allocated, a message is printed and std::bad_alloc is rethrown.
+
+  NOTE: TightCompact can only be invoked with offset 0.
+  To invoke with a non-0 offset, use TightCompact_2power, with N = 2^x.
+*/
+
+template <OSwap_Style oswap_style>
+void TightCompact(unsigned char *buf, size_t N, size_t block_size,
+       bool *selected) {
+  FOAV_SAFE2_CNTXT(TC_inner_base_cases_of_recursion, N, block_size)
+  uint32_t *selected_count = NULL;
+  if (TC_PRECOMPUTE_COUNTS) {
+
+    // Allocate array to hold counts 
+    try {
+      selected_count = new uint32_t[N+1];
+    } catch (std::bad_alloc&){
+      printf("Allocating memory failed in TC\n");
+      // Nothing below can run without the counts; carrying on would write
+      // through a NULL pointer, so pass the failure on to the caller.
+      throw;
+    }
+    selected_count[0] = 0;
+
+    // Compute cumulative counts
+    for (size_t i=0; i<N; i++){
+      selected_count[i+1] = selected[i] + selected_count[i];
+    }
+    TightCompact_inner<oswap_style>(buf, N, block_size, selected, selected_count);
+    delete[] selected_count;
+  } else {
+    TightCompact_inner<oswap_style>(buf, N, block_size, selected, selected_count);
+  }
+}
+
+// Multithreaded TightCompact: moves the blocks flagged in `selected` to the
+// front of buf, using up to nthreads threads.
+//
+//   buf         N blocks of block_size bytes each; rearranged in place
+//   N           number of blocks (any value)
+//   block_size  size of one block in bytes
+//   selected    N selection flags
+//   nthreads    number of threads handed to TightCompact_inner_parallel
+//
+// The cumulative counts of selected blocks are always built here.  If that
+// array cannot be allocated, a message is printed and std::bad_alloc is
+// rethrown.
+template <OSwap_Style oswap_style>
+void TightCompact_parallel(unsigned char *buf, size_t N, size_t block_size,
+       bool *selected, size_t nthreads) {
+  FOAV_SAFE2_CNTXT(TC_inner_base_cases_of_recursion, N, block_size)
+  uint32_t *selected_count = NULL;
+  // Allocate array to hold counts
+  try {
+    selected_count = new uint32_t[N+1];
+  } catch (std::bad_alloc&){
+    printf("Allocating memory failed in TC\n");
+    // Nothing below can run without the counts; carrying on would write
+    // through a NULL pointer, so pass the failure on to the caller.
+    throw;
+  }
+  selected_count[0] = 0;
+
+  // Compute cumulative counts
+  for (size_t i=0; i<N; i++){
+    selected_count[i+1] = selected[i] + selected_count[i];
+  }
+  //printf("TightCompact_parallel(nthreads=%lu)\n", nthreads);
+  TightCompact_inner_parallel<oswap_style>(buf, N, block_size, selected, selected_count, nthreads);
+  delete[] selected_count;
+}
+
+// Recursive worker for TightCompact: moves the blocks flagged in `selected`
+// to the front of the N blocks in buf, for any N.
+// selected_count holds cumulative counts of `selected` and is only read when
+// TC_PRECOMPUTE_COUNTS is set.
+template <OSwap_Style oswap_style>
+void TightCompact_inner(unsigned char *buf, size_t N, size_t block_size, bool *selected, uint32_t *selected_count){
+  FOAV_SAFE2_CNTXT(TC_inner_base_cases_of_recursion, N, block_size)
+  if(N==0){
+    return;
+  }
+  else if(N==1){
+    return;
+  }
+  else if(N==2){
+    // Swap only when the first block is unselected and the second selected
+    bool swap = (!selected[0] & selected[1]);
+    oswap_buffer<oswap_style>(buf, buf+block_size, block_size, swap);
+    return;
+  }
+
+  size_t gt_pow2;
+  size_t split_index;
+
+  // Find largest power of 2 < N 
+  gt_pow2 = pow2_lt(N);
+
+  // The last gt_pow2 blocks form the right part (R) of the recursion; the
+  // first split_index = N - gt_pow2 blocks form the left part (L).
+  split_index = N - gt_pow2;
+
+  // Number of selected items in the non-power of 2 side (left)
+  size_t mL;
+  if (TC_PRECOMPUTE_COUNTS) {
+    mL = selected_count[split_index] - selected_count[0];
+  } else {
+    mL = 0;
+    for(size_t i=0; i<split_index; i++){
+      mL+=selected[i];
+    }
+  }
+
+  unsigned char *L_ptr = buf;
+  unsigned char *R_ptr = buf + (split_index * block_size);
+
+  // Compact L to its front, and R (a power-of-2 size) to offset
+  // (gt_pow2 - split_index + mL) mod gt_pow2 with wraparound.
+  //printf("Lsize = %ld, Rsize = %ld, Rside offset = %ld\n", split_index, gt_pow2, (gt_pow2 - split_index + mL));
+  TightCompact_inner<oswap_style>(L_ptr, split_index, block_size, selected, selected_count);
+  TightCompact_2power_inner<oswap_style>(R_ptr, gt_pow2, block_size, (gt_pow2 - split_index + mL) % gt_pow2, selected+split_index, selected_count+split_index);
+
+  // Pair the first split_index blocks of the buffer (all of L) with its last
+  // split_index blocks, which start at block index gt_pow2.
+  R_ptr = buf + (gt_pow2 * block_size); 
+
+  // Perform split_index oblivious swaps for this level
+  FOAV_SAFE_CNTXT(TC_inner_oswap_loop, split_index)
+  for (size_t i=0; i<split_index; i++){
+    FOAV_SAFE2_CNTXT(TC_inner_oswap_loop, i, split_index)
+    // Swap pair i exactly when i is at or beyond the number of selected
+    // blocks in L
+    bool swap_flag = i>=mL;
+    oswap_buffer<oswap_style>(L_ptr, R_ptr, block_size, swap_flag);
+    L_ptr+=block_size;
+    R_ptr+=block_size;
+    FOAV_SAFE2_CNTXT(TC_inner_oswap_loop, i, split_index)
+  }
+}
+
+
+// Multithreaded variant of TightCompact_inner.
+//
+//   buf            buffer of N blocks, each block_size bytes
+//   N              number of blocks
+//   block_size     size of one block in bytes
+//   selected       one flag per block
+//   selected_count running counts over selected; the number of selected
+//                  blocks in [a,b) is read as
+//                  selected_count[b] - selected_count[a]
+//   nthreads       number of threads available to this call; workers are
+//                  addressed as g_thread_id+1 .. g_thread_id+nthreads-1
+//
+// The buffer is split into a left part of n2 = N - n1 blocks and a right
+// part of n1 blocks, where n1 is a power of 2.  The right part is handed
+// to another thread (TightCompact_2power_inner_parallel), the left part
+// is handled recursively on this thread, and finally the n2 blocks of the
+// left part are obliviously swapped against the last n2 blocks of the
+// right part, with that range split across all nthreads threads.
+template <OSwap_Style oswap_style>
+void TightCompact_inner_parallel(unsigned char *buf, size_t N, size_t block_size, bool *selected, uint32_t *selected_count, size_t nthreads){
+  FOAV_SAFE2_CNTXT(TC_inner_base_cases_of_recursion, N, block_size)
+  FOAV_SAFE_CNTXT(TC_inner_base_cases_of_recursion, nthreads)
+  // With a single thread, or an input too small to be worth splitting,
+  // use the sequential algorithm.  Every N < 16 (in particular N = 0, 1
+  // and 2) is handled here, so no further base cases are needed below.
+  if (nthreads <= 1 || N < 16) {
+    unsigned long start = printf_with_rtclock("Thread %u starting TightCompact_inner(N=%lu)\n", g_thread_id, N);
+    TightCompact_inner<oswap_style>(buf, N, block_size, selected, selected_count);
+    printf_with_rtclock_diff(start, "Thread %u ending TightCompact_inner(N=%lu)\n", g_thread_id, N);
+    return;
+  }
+
+  unsigned long start = printf_with_rtclock("Thread %u starting TightCompact_inner_parallel(N=%lu, nthreads=%lu)\n", g_thread_id, N, nthreads);
+
+  size_t n1, n2;
+
+  // Find largest power of 2 < N
+  // This will be right (R) n1 of the recursion, and the leftover left (L) n2
+  n1 = pow2_lt(N);
+  n2 = N - n1;
+
+  // Number of selected items in left
+  size_t m2;
+  m2 = selected_count[n2] - selected_count[0];
+
+  unsigned char *L_ptr = buf;
+  unsigned char *R_ptr = buf + (n2 * block_size);
+
+  size_t lthreads = nthreads/2;
+  size_t rthreads = nthreads - lthreads;
+
+  // The offset passed for the right part is reduced mod n1, exactly as
+  // the sequential TightCompact_inner does for its power-of-2 part.
+  // Without the reduction the offset would be n1 (rather than 0) whenever
+  // every block of the left part is selected (m2 == n2).
+  struct TightCompact_2power_inner_parallel_args rightargs = {
+    R_ptr, n1, block_size, (n1 - n2 + m2) % n1, selected + n2, selected_count + n2,
+    rthreads
+  };
+  // Right part runs on thread g_thread_id+lthreads while we do the left
+  // part ourselves; rightargs lives on our stack, so join before leaving.
+  threadpool_dispatch(g_thread_id+lthreads,
+    TightCompact_2power_inner_parallel_launch<oswap_style>,
+    &rightargs);
+  TightCompact_inner_parallel<oswap_style>(L_ptr, n2, block_size, selected, selected_count, lthreads);
+  threadpool_join(g_thread_id+lthreads, NULL);
+
+  size_t num_swap = N-n1;
+  FOAV_SAFE2_CNTXT(TC_inner_parallel, nthreads, num_swap)
+  oswap_range_args args[nthreads];
+  size_t inc = num_swap / nthreads;
+  size_t extra = num_swap % nthreads;
+  size_t last = 0;
+
+  // Move R_ptr so that the n2 blocks of L are swapped against the last
+  // n2 blocks of R
+  R_ptr = buf + (n1 * block_size);
+
+  // Split [0, num_swap) into nthreads contiguous ranges; the first
+  // `extra` ranges are one element longer
+  for (size_t i=0; i<nthreads; ++i) {
+    size_t next = last + inc + (i < extra);
+    args[i] = { block_size, last, next, m2, L_ptr, R_ptr, false };
+    last = next;
+    FOAV_SAFE2_CNTXT(TC_inner_parallel, i, nthreads)
+  }
+  for (size_t i=0; i<nthreads-1; ++i) {
+    threadpool_dispatch(g_thread_id+1+i, oswap_range<oswap_style>, args+i);
+  }
+  // Do the last section ourselves
+  oswap_range<oswap_style>((void*)(args+nthreads-1));
+  FOAV_SAFE_CNTXT(TC_inner_parallel, nthreads)
+  for (size_t i=0; i<nthreads-1; ++i) {
+    threadpool_join(g_thread_id+1+i, NULL);
+    FOAV_SAFE2_CNTXT(TC_inner_parallel, i, nthreads)
+  }
+
+  printf_with_rtclock_diff(start, "Thread %u ending TightCompact_inner_parallel(N=%lu, nthreads=%lu)\n", g_thread_id, N, nthreads);
+}
+
+  #ifndef BEFTS_MODE 
+
+  // Perform the oswaps for one level of the OP_Tight Compaction Network.
+  //   N            number of blocks in the buffer
+  //   level        network level; blocks are compared at distance 2^level
+  //   bfr_ptr      start of the buffer of N blocks
+  //   block_size   size of one block in bytes
+  //   LS_distance  one remaining-distance value per block, updated in place
+  template <OSwap_Style oswap_style>
+  void process_TCN(uint64_t N, uint8_t level, unsigned char *bfr_ptr, size_t block_size,
+          uint64_t *LS_distance) {
+    FOAV_SAFE2_CNTXT(process_TCN, N, level)
+    FOAV_SAFE_CNTXT(process_TCN, block_size)
+    // Shift a 64-bit one: a plain (1<<level) is an int shift, which is
+    // only valid for level < 31 even though N is 64 bits wide.
+    const uint64_t comparator_dist = uint64_t(1) << level;
+    // bfr_fop = bfr_first_operand_pointer, bfr_sop = bfr_second_operand_pointer
+    unsigned char *bfr_fop = bfr_ptr;
+    unsigned char *bfr_sop = bfr_ptr + (comparator_dist * block_size);
+
+    // Number of oblivious swaps
+    uint64_t num_oswaps = N - comparator_dist;
+    uint64_t sop_index = comparator_dist;
+    uint64_t fop_index = 0;
+
+    FOAV_SAFE_CNTXT(process_TCN, num_oswaps)
+    for(uint64_t i=0; i<num_oswaps; i++){
+      // At this level only bit `level` of the distance matters: move_dist
+      // is either 0 or 2^level (== comparator_dist).
+      uint64_t move_dist = LS_distance[sop_index] & comparator_dist;
+      // Obliviously set dist_flag iff move_dist != 0
+      uint8_t dist_flag = ogt_set_flag(move_dist, 0);
+      oswap_buffer<oswap_style>(bfr_fop, bfr_sop, block_size, dist_flag);
+
+      // Adjust LS_distance to follow the block that (possibly) moved:
+      // remove the distance just covered, then, obliviously if dist_flag,
+      // copy the remainder to LS_distance[fop_index] and clear
+      // LS_distance[sop_index].
+      LS_distance[sop_index]-= move_dist;
+      oset_value(&(LS_distance[fop_index]), LS_distance[sop_index], dist_flag);
+      oset_value(&(LS_distance[sop_index]), 0, dist_flag);
+
+      bfr_fop+=block_size;
+      bfr_sop+=block_size;
+      sop_index++;
+      fop_index++;
+      FOAV_SAFE2_CNTXT(process_TCN, i, num_oswaps)
+    }
+  }
+
+
+  // Run the OP tight compaction network over the N blocks in buf.
+  //   buf            buffer of N blocks, each block_size bytes
+  //   selected_list  one flag per block
+  // A temporary array of N distances is filled by compute_LS_distances()
+  // and then consumed level by level by process_TCN().
+  template <OSwap_Style oswap_style>
+  void OP_TightCompact(unsigned char *buf, uint64_t N, size_t block_size, bool *selected_list){
+    FOAV_SAFE2_CNTXT(OP_TightCompact, N, block_size)
+    FOAV_SAFE2_CNTXT(OP_TightCompact, buf, selected_list)
+
+    // Scratch array, released before returning
+    uint64_t *distances = new uint64_t[N];
+
+    const int num_levels = calculatelog2(N);
+    compute_LS_distances(N, buf, block_size, selected_list, distances);
+
+    FOAV_SAFE_CNTXT(OP_TightCompact, num_levels)
+    int level = 0;
+    while (level < num_levels) {
+      process_TCN<oswap_style>(N, level, buf, block_size, distances);
+      FOAV_SAFE2_CNTXT(OP_TightCompact, level, num_levels)
+      ++level;
+    }
+
+    delete[] distances;
+  }
+
+  #endif
+
+#endif

+ 1511 - 0
Enclave/OblivAlgs/WaksmanNetwork.cpp

@@ -0,0 +1,1511 @@
+#include "SortingNetwork.hpp"
+#include "WaksmanNetwork.hpp"
+#include "oasm_lib.h"
+
+// Tally the input switches, output switches and WaksmanSubnetworks
+// needed to handle N items.  The tallies are *added* to numInSwitches,
+// numOutSwitches and numSubnetworks, so callers should zero them first.
+static void countSwitches(uint32_t N, size_t &numInSwitches,
+    size_t &numOutSwitches, size_t &numSubnetworks)
+{
+    // This call itself accounts for one subnetwork, whatever its size
+    numSubnetworks += 1;
+
+    FOAV_SAFE_CNTXT(countswitches, N)
+    if (N < 2) {
+        // Zero or one item: nothing to switch
+        return;
+    }
+    if (N == 2) {
+        // Two items: a single output switch and no recursion
+        numOutSwitches += 1;
+        return;
+    }
+
+    // The left child gets the extra item when N is odd
+    const uint32_t leftSize = (N+1)/2;
+    const uint32_t rightSize = N/2;
+
+    // Switches used at this level:
+    //   N even: (N/2)-1 input and N/2 output switches
+    //   N odd:  (N-1)/2 input and (N-1)/2 output switches
+    // which is leftSize-1 input and rightSize output switches in both
+    // cases.
+    numInSwitches += leftSize - 1;
+    numOutSwitches += rightSize;
+
+    // Add in the two children
+    countSwitches(leftSize, numInSwitches, numOutSwitches, numSubnetworks);
+    countSwitches(rightSize, numInSwitches, numOutSwitches, numSubnetworks);
+}
+
+// Construct a network for N items: remember N in Ntotal and size the
+// input/output switch vectors to the totals reported by countSwitches().
+WaksmanNetwork::WaksmanNetwork(uint32_t N) : Ntotal(N) {
+    size_t inCount = 0;
+    size_t outCount = 0;
+    size_t subnetCount = 0; // required by countSwitches(); not used here
+    countSwitches(N, inCount, outCount, subnetCount);
+
+    inSwitchVec.resize(inCount);
+    outSwitchVec.resize(outCount);
+}
+
+/* Initialize the data structure that counts unselected permutation mappings for fast
+  random selection.  For a range of num_vals entries, the count for the whole range is
+  stored at the range's last index; the range is then split into a left part of
+  (num_vals+1)/2 entries and a right part holding the rest.  The right part shares its
+  last index with its parent, so it does not store a count of its own.
+  Call initially with empty=true - argument only needed for recursive calls.
+*/
+static inline void initUnselectedCnt(uint32_t *unselected_cnt, uint32_t num_vals, bool empty = true) {
+  FOAV_SAFE2_CNTXT(countswitches, num_vals, empty)
+  // Check just in case - this should never be called with num_vals == 0.
+  if (num_vals == 0) {
+    return;
+  }
+  if (empty) {
+    unselected_cnt[num_vals-1] = num_vals;
+  }
+  if (num_vals > 1) {
+    const uint32_t left_size = (num_vals+1)/2;
+    const uint32_t right_size = num_vals - left_size;
+    initUnselectedCnt(unselected_cnt, left_size, true);
+    initUnselectedCnt(unselected_cnt+left_size, right_size, false);
+  }
+}
+
+
+/* Modifies unselected_cnt to indicate that the item at index has been selected, by
+  decrementing every stored count whose range contains index.  Walks the same
+  left/right split as initUnselectedCnt, descending one level per loop iteration.
+  Call initially with unadjusted=true - the argument only changes while descending.
+*/
+static inline void updateUnselectedCnt(uint32_t *unselected_cnt, uint32_t num_vals, uint32_t index,
+  bool unadjusted = true) {
+  while (true) {
+    FOAV_SAFE2_CNTXT(updateUnselectedCnt, num_vals, unadjusted)
+    // Check just in case - this should never be reached with num_vals == 0.
+    if (num_vals == 0) {
+      return;
+    }
+    FOAV_SAFE2_CNTXT(updateUnselectedCnt, num_vals, unadjusted)
+    if (unadjusted) {
+      unselected_cnt[num_vals-1]--;
+    }
+    FOAV_SAFE2_CNTXT(updateUnselectedCnt, num_vals, unadjusted)
+    if (num_vals == 1) {
+      return;
+    }
+    const uint32_t left_size = (num_vals+1)/2;
+    FOAV_SAFE2_CNTXT(updateUnselectedCnt, index, left_size)
+    if (index < left_size) {
+      // Left part stores its own count at its last index
+      num_vals = left_size;
+      unadjusted = true;
+    } else {
+      // Right part shares its last index with the range just adjusted
+      unselected_cnt += left_size;
+      num_vals -= left_size;
+      index -= left_size;
+      unadjusted = false;
+    }
+  }
+}
+
+
+/* Pseudo-random permutation (PRP), __uint128_t -> __uint128_t.
+   The 128-bit input is assembled with in_high as its top 64 bits and
+   in_low as its lower 64 bits, encrypted with AES_ECB_encrypt under
+   aeskey, and the resulting block is returned as a 128-bit integer.
+*/
+static inline __uint128_t prp128(const AESkey &aeskey,
+    uint64_t in_high, uint64_t in_low) {
+  const __m128i plaintext = _mm_set_epi64x(in_high, in_low);
+  __m128i encrypted;
+  AES_ECB_encrypt(encrypted, plaintext, aeskey);
+  return reinterpret_cast<__uint128_t>(encrypted);
+}
+
+
+// Print x as 32 hex digits, walking its bytes from the highest address
+// down to the lowest.
+void print_u128(__uint128_t x) {
+  const unsigned char *bytes = (const unsigned char *) &x;
+  for (size_t remaining = sizeof(__uint128_t); remaining > 0; --remaining) {
+    printf("%.2hhx", bytes[remaining-1]);
+  }
+}
+
+
+/* Look up either (1) permutation mapping corresponding to hash, or (2) random unselected mapping.
+  Returns index into forward_perm pointing to the mapping looked up.
+
+  N:              number of entries of forward_perm to search (must be >= 1)
+  forward_perm:   table of entries of (sizeof(randkey_t) + 8) bytes each, with the
+                  entry's hash in its first sizeof(randkey_t) bytes; the search
+                  assumes the entries are sorted by that hash
+  hashval:        hash to search for (used when rand_flag == 0)
+  unselected_cnt: counts maintained by initUnselectedCnt / updateUnselectedCnt
+  rand_flag:      0 to look up hashval, 1 to pick among the unselected entries
+
+  Both searches descend the same binary split in lockstep, and the direction taken at
+  each step is chosen from the hash comparison or the random comparison by rand_flag.
+*/
+static inline uint32_t permOrRand(uint32_t N, unsigned char *forward_perm, randkey_t hashval, uint32_t *unselected_cnt,
+  uint8_t rand_flag) {
+  uint32_t rand_bytes;
+  uint32_t start = 0;
+  uint32_t end = N-1;
+  uint32_t mid;
+  uint8_t hash_dir;
+  uint8_t rand_dir;
+  uint32_t tot_unselected_cnt = unselected_cnt[end];
+  uint32_t left_unselected_cnt;
+  uint64_t rand_val;
+
+  // Scale 32 random bits into [0, tot_unselected_cnt).  The product must be
+  // computed in 64 bits: multiplying two uint32_t values wraps at 32 bits, and
+  // the shift below would then always leave 0.
+  getRandomBytes((unsigned char *) &rand_bytes, sizeof(uint32_t));
+  rand_val = (uint64_t) tot_unselected_cnt * (uint64_t) rand_bytes;
+  rand_val >>= 32;
+
+  while (true) {
+    FOAV_SAFE_CNTXT(permOrRand, start)
+    FOAV_SAFE_CNTXT(permOrRand, end)
+    if (start == end) {
+      return start;
+    }
+    mid = (start+end)/2;
+    // Compare desired hash value to hash value just after the current midpoint
+    hash_dir = ogt<randkey_t>((randkey_t *) (forward_perm + ((mid+1)*(sizeof(randkey_t) + 8))),
+      &hashval);
+    // Compare random unselected value to number unselected in left half
+    left_unselected_cnt = unselected_cnt[mid];
+    rand_dir = ogt_set_flag(left_unselected_cnt, rand_val);
+    // Pick between hash_dir and rand_dir based on rand_flag
+    bool f1 = ((1-rand_flag) & hash_dir);
+    bool f2 = (rand_flag & rand_dir);
+    FOAV_SAFE_CNTXT(permOrRand, f1)
+    FOAV_SAFE_CNTXT(permOrRand, f2)
+    if ((f1 | f2) == 1) {
+      // Continue in the left half [start, mid]
+      end = mid;
+      tot_unselected_cnt = left_unselected_cnt;
+    } else {
+      // Continue in the right half [mid+1, end]; make the counts and the
+      // random rank relative to that half
+      start = mid+1;
+      tot_unselected_cnt -= left_unselected_cnt;
+      rand_val -= left_unselected_cnt;
+    }
+  }
+}
+
+// If this is defined, set it to the smallest N you want to see
+// profiling data for
+// #define PROFILE_SETPERM_N 32768
+
+// Define this to show the intermediate states of setPermutation
+// #define SHOW_SETPERM
+
+// Return the partner of x: x+Nleft when x < Nleft, and x-Nleft when
+// x >= Nleft.  Computed arithmetically, without a branch on x.
+static inline uint32_t PARTNER(uint32_t x, uint32_t Nleft)
+{
+    // 1 iff x is in the upper half, in which case 2*Nleft is taken off
+    const uint32_t upper = (x >= Nleft);
+    return x + Nleft - upper * (Nleft<<1);
+}
+
+// The elements of the permutation array start off as just 32-bit
+// integers, where if j = permutation[i], then the item in position i
+// will move to position j.  This will sort the permutation; that is, it
+// will apply the inverse of the given permutation.  So if we want to
+// apply the given permutation, we will first use the permutation to set
+// the control bits of the Waksman network in a way that will sort it,
+// and then apply the inverse permutation by applying the Waksman
+// switches in reverse order.
+
+// The strategy of setPermutation is as follows. The invariant is that
+// we are given as input a permutation of 0..2k-1, and we will set the
+// Waksman network control bits to output the sorted list 0..2k-1.  (If
+// we are given an input of odd length, so a permutation of 0..2k-2, we
+// implicitly append an entry permutation[2k-1] = 2k-1 to it.)  We then
+// find a setting of the k-1 input switches (switch i OSWAPs
+// permutation[i] with permutation[i+k] for i=0..k-2; permutation[k-1]
+// never gets swapped, whether or not there was a permutation[2k-1] in
+// the original input) such that permutation[0..k-1] mod k ends up being
+// a permutation of 0..k-1, and permutation[k..2k-1] mod k ends up being
+// a permutation of 0..k-1.  (If we were given an odd input initially,
+// then it will necessarily be the case that permutation[2k-1] = 2k-1
+// and so permutation[2k-1] mod k = k-1, so permutation[k..2k-2] mod k
+// will be a permutation of 0..k-2.) We recurse on the left and the
+// right, which will set the input and output switches of the
+// subnetworks such that, after applying the switches, on the left,
+// permutation[0..k-1] mod k will be 0..k-1 (in order), and similarly on
+// the right either permutation[k..2k-2] mod k or permutation[k..2k-1]
+// mod k, depending on the input size, will be 0..k-2 or 0..k-1
+// respectively.
+
+// Then we set the k-1 or k output switches (depending on the length of
+// the right side), where switch i again OSWAPs permutation[i] with
+// permutation[i+k].  Note that both of these values will necessarily be
+// i mod k at this point, so the switch just has to be set to the "high
+// bit" of permutation[i]; that is, the bit that is 1 iff
+// (permutation[i] >= k).  This will yield the desired sorted list.
+
+// Note that when recursing, we only consider the permutation values mod
+// k, but we need to remember whether the value v represented the
+// original v or the original v+k, so that we can use that bit to set
+// output switch v correctly.  To keep track of this, when we recurse,
+// for each v = permutation[i] in the array, we attach to it a stack of
+// the "high bits" it's gone through so far (initially empty).  At
+// recursive depth d (the initial call is d=0), we have the values in
+// the permutation array being (v, [b_0, ..., b_{d-1}]) where v is an
+// integer 0 <= v <= 2k-1, and each b_i is a bit.  When we recurse to
+// depth d+1, we push the new high bit onto the stack (the top of the
+// stack is on the right in this notation), to yield (v mod k, [b_0,
+// ..., b_{d-1}, b_d]).  The recursive call then uses the v mod k values,
+// which, as above, will be a permutation of 0..k-1.  When the
+// recursions finish, the topmost high bit on the stack will be popped
+// off to yield (v mod k + b_d*k (= v), [b_0, ..., b_{d-1}]).
+
+// The way we actually internally represent the value at depth d
+// (v, [b_0, ..., b_{d-1}]) is by packing that into a single integer,
+// with v followed by the d bits: x = v<<d | b_0<<(d-1) | ... | b_{d-1}.
+
+// For example, suppose initially we have N=14, and v = permutation[i] =
+// 12.  Then the initial representation of v (with depth d=0) is just
+// x = v = 12.
+// At the first level, k=7, so when we recurse, (12, []) will become
+// x = (12 mod 7, [(12 >= 7)]) = (5, [1]) for d=1, which we represent as
+// [101][1] in binary (brackets for clarity only) = 11.  At the next
+// level, k=4 and d=2, so x = (5 mod 4, [1, (5 >= 4)]) = (1, [1,1]),
+// which we represent as [1][11] = 7.  At the next level (suppose this
+// entry ends up in the left recursion), k=2 and d=3, so x = (1 mod 2,
+// [1, 1, (1 >= 2)]) = (1, [1,1,0]), which we represent as [1][110] =
+// 14.  When k<4, there are no more recursive calls.  As each layer of
+// recursion ends, at k=2 and d=3, 14 = [1][110] becomes [(1+0*2)][11] =
+// [1][11] = 7. At k=4 and d=2, [1][11] becomes [(1+1*4)][1] = [101][1]
+// = 11. At k=7 and d=1, [101][1] becomes [(5+1*7)][] = 12.
+
+// The following functions manipulate this representation.  Note that
+// they must all be oblivious to x, but need not be to depth or k.
+
+// Extract the value v from its representation x at depth d, by
+// discarding the d saved bits stored below it
+static inline uint32_t GET(uint32_t x, uint32_t depth)
+{
+    const uint32_t v = x >> depth;
+    return v;
+}
+
+// Convert a representation x of a value v between 0 and 2k-1 at depth d
+// (so with d extra bits) into one at depth d+1 (with v between 0 and
+// k-1). Pass kd = k<<d. k will be Nleft.
+static inline uint32_t PUSH(uint32_t x, uint32_t kd)
+{
+    // Write x = v<<d | s, where s holds the d extra bits.  The target is
+    // ((v%k) << (d+1)) | (s<<1) | b, where b = floor(v/k).
+    // Since v < 2*k, b is just the bit saying whether v >= k, which is
+    // the same as x >= (k<<d).
+    const uint32_t high_bit = (x >= kd);
+    // v%k = v - b*k (no potentially non-oblivious mod needed), so
+    // ((v%k) << d) | s = x - b*(k<<d) = x - b*kd.
+    const uint32_t reduced = x - high_bit*kd;
+    // Shift up by one to make room for b as the newest saved bit
+    return (reduced<<1) | high_bit;
+}
+
+// Convert a representation x of a value v between 0 and k-1 at depth d+1
+// (so with d+1 extra bits) into one at depth d (with v between 0 and
+// 2*k-1). It should always be that POP(PUSH(x, kd), kd) = x
+// whenever 0 <= x < (k<<(d+1)). Pass kd = k<<d.  k will be Nleft.
+static inline uint32_t POP(uint32_t x, uint32_t kd)
+{
+    // The newest saved bit says whether k has to be added back to v
+    const uint32_t high_bit = x & 1;
+    return (x>>1) + high_bit*kd;
+}
+
+
+/* Entry point: set the switches of the whole network from a permutation.
+  Input:
+    permutation: points to array of integers 0, ..., Ntotal-1 in some order, indicating i->permutation[i]
+  Note: This function modifies the input permutation (and actually sorts it).
+  Networks of fewer than two items have no switches, so nothing is done for them.
+*/
+void WaksmanNetwork::setPermutation(uint32_t *permutation) {
+    FOAV_SAFE_CNTXT(WN_SetPerm, Ntotal)
+    if (Ntotal <= 1) {
+        return;
+    }
+    // Fresh traversal state and scratch memory for the recursive worker,
+    // which starts at depth 0 on all Ntotal items
+    WNTraversal traversal(*this);
+    WNMem mem(*this);
+    setPermutation(permutation, Ntotal, 0, traversal, mem);
+}
+
+/* Recursive worker: sets the switches of the subnetwork handling N items.
+  Input:
+    permutation: points to N entries which, decoded with GET(entry, depth), are the
+      integers 0, ..., N-1 in some order, indicating i->permutation[i]
+    N: number of items handled by this subnetwork
+    depth: recursion depth (0 for the initial call); each entry carries this many
+      saved bits below its value (see PUSH / POP)
+    traversal: cursor into the network's switch storage; subnetNumber, inSwitches and
+      outSwitches are advanced as this subnetwork and its children use them
+    mem: scratch memory and keys (forward_perm, unselected_cnt, reverse_perm,
+      forward_key, reverse_key) shared by all levels of the recursion
+  Steps: N < 2 does nothing; N == 2 sets a single output switch; otherwise the
+  Nleft-1 input switches are chosen (by forward/reverse lookups for N > 4, directly
+  for N == 3 or 4) and applied, both halves are handled recursively, and finally the
+  Nright output switches are set and applied.
+  Note: This function modifies the input permutation (and actually sorts it).
+  NOTE(review): the forward lookup table is filled with Neven entries but sorted and
+  searched with N, so for odd N the extra entry stays at index N - confirm intended.
+*/
+void WaksmanNetwork::setPermutation(uint32_t *permutation, uint32_t N,
+  uint32_t depth, WNTraversal &traversal, const WNMem &mem) {
+  //printf("Start setPermutation(): N=%d\n", N);
+
+#ifdef SHOW_SETPERM
+  printf("S");
+  for(uint32_t i=0;i<N;++i) {
+    printf(" %2d", permutation[i]);
+  }
+  printf("\n ");
+  for(uint32_t i=0;i<N;++i) {
+    printf(" %2d", GET(permutation[i], depth));
+  }
+  printf("\n");
+#endif
+
+  // Handle N<2 and N==2 as special cases
+
+  FOAV_SAFE_CNTXT(setPermutation, N)
+  if (N < 2) return;
+
+  traversal.subnetNumber += 1;
+
+  FOAV_SAFE_CNTXT(setPermutation, N)
+  if (N == 2) {
+    // Store output switch value
+    traversal.outSwitches[0] = GET(permutation[0], depth);
+    //printf("Set outSwitches[0] to %d\n", outSwitches[0]);
+    // Apply output switch
+    oswap_buffer<OSWAP_4>((unsigned char *) permutation,
+      (unsigned char *) (permutation + 1), 4, traversal.outSwitches[0]);
+#ifdef SHOW_SETPERM
+    printf("O");
+    for(uint32_t i=0;i<N/2;++i) {
+      printf(" %s", traversal.outSwitches[i] ? " X" : "||");
+    }
+    printf("\n");
+
+    printf("E");
+    for(uint32_t i=0;i<N;++i) {
+      printf(" %2d", permutation[i]);
+    }
+    printf("\n ");
+    for(uint32_t i=0;i<N;++i) {
+      printf(" %2d", GET(permutation[i],depth));
+    }
+    printf("\n");
+#endif
+    traversal.outSwitches += 1;
+    return;
+  }
+
+#ifdef PROFILE_SETPERM_N
+  unsigned long prof_all, prof_before, prof_flt, prof_sflt, prof_unsel, prof_rlt,
+    prof_setsw, prof_srtsw, prof_appsw, prof_rec1, prof_rec2, prof_outsw;
+
+  if (N >= PROFILE_SETPERM_N) {
+    prof_all = printf_with_rtclock("begin setPermutation N=%u\n", N);
+    prof_before = printf_with_rtclock("begin before recursion N=%u\n", N);
+  }
+#endif
+
+  // The size of the left recursive half.  If N is odd, this is the
+  // larger half
+  const uint32_t Nleft = (N+1)/2;
+  // The size of the right recursive half.  This is also the number of
+  // output switches.
+  const uint32_t Nright = N/2;
+
+  // N, rounded up to an even number
+  const uint32_t Neven = (Nleft<<1);
+
+  if (N > 4) {
+#ifdef PROFILE_SETPERM_N
+  if (N >= PROFILE_SETPERM_N) {
+    prof_flt = printf_with_rtclock("begin forward lookup table N=%u\n", N);
+  }
+#endif
+
+  const uint64_t snNum = traversal.subnetNumber;
+
+  // Create forward lookup using pseudorandom permutation (PRP)
+  // Produced as PRP(i)->(i, GET(permutation[i])) sorted by PRP(i)
+  // Note: i and permutation[i] are represented as uint32_t values to pack into one uint64_t
+  // Each table entry is sizeof(randkey_t) bytes of hash followed by those 8 bytes
+  unsigned char *cur_forward_hash = mem.forward_perm;
+  uint32_t *cur_forward_map = (uint32_t *) (mem.forward_perm + sizeof(randkey_t));
+  // Holds the forward-lookup PRP output for the current item
+  __uint128_t forward_perm_hash;
+  //printf("Creating forward lookup table\n");
+  for (uint32_t i=0; i<Neven; i++) {
+    forward_perm_hash = prp128(mem.forward_key, snNum, (uint64_t) i);
+    FOAV_SAFE_CNTXT(setPermutation, snNum)
+    FOAV_SAFE_CNTXT(setPermutation, forward_perm_hash)
+    memcpy(cur_forward_hash, &forward_perm_hash, sizeof(__uint128_t));
+    cur_forward_hash += sizeof(randkey_t) + 8;
+    *cur_forward_map = i;
+    cur_forward_map += 1;
+    *cur_forward_map = i < N ? GET(permutation[i], depth) : N;
+    cur_forward_map = (uint32_t *) (cur_forward_hash + sizeof(randkey_t));
+  }
+
+#ifdef PROFILE_SETPERM_N
+  if (N >= PROFILE_SETPERM_N) {
+    printf_with_rtclock_diff(prof_flt, "end forward lookup table N=%u\n", N);
+    prof_sflt = printf_with_rtclock("begin sort forward lookup table N=%u\n", N);
+  }
+#endif
+  BitonicSort<FPERM_OSWAP_STYLE, randkey_t>(mem.forward_perm, (size_t) N, sizeof(randkey_t) + 8, true);
+  // Print forward lookup table
+  /*
+  unsigned char *tmp_cur_forward_hash = forward_perm;
+  uint32_t *tmp_cur_forward_map = (uint32_t *) (forward_perm + sizeof(randkey_t));
+  __uint128_t tmp_forward_perm_hash;
+  for (uint32_t i=0; i<N; i++) {
+    memcpy(&tmp_forward_perm_hash, tmp_cur_forward_hash, sizeof(__uint128_t));
+    printf("\t (");
+    print_u128(tmp_forward_perm_hash);
+    printf(") %d -> %d\n", *tmp_cur_forward_map, *(tmp_cur_forward_map+1));
+    tmp_cur_forward_hash += sizeof(randkey_t) + 8;
+    tmp_cur_forward_map = (uint32_t *) (tmp_cur_forward_hash + sizeof(randkey_t));
+  }
+  */
+
+#ifdef PROFILE_SETPERM_N
+  if (N >= PROFILE_SETPERM_N) {
+    printf_with_rtclock_diff(prof_sflt, "end sort forward lookup table N=%u\n", N);
+    prof_unsel = printf_with_rtclock("begin unselected count N=%u\n", N);
+  }
+#endif
+  // Create cumulative count of unselected items
+  initUnselectedCnt(mem.unselected_cnt, N);
+#ifdef PROFILE_SETPERM_N
+  if (N >= PROFILE_SETPERM_N) {
+    printf_with_rtclock_diff(prof_unsel, "end unselected count N=%u\n", N);
+    prof_rlt = printf_with_rtclock("begin reverse lookup table N=%u\n", N);
+  }
+#endif
+
+  // Create reverse lookup using hash table
+  // Maps \pi(i) to i and index of i->\pi(i) in forward_perm
+  mem.reverse_perm->reserve(N);
+
+  // Lookup done on keyed hash of \pi(i) with reverse key
+  cur_forward_hash = mem.forward_perm;
+  cur_forward_map = (uint32_t *) (mem.forward_perm + sizeof(randkey_t));
+  randkey_t reverse_perm_hash;
+  //printf("Creating reverse-permutation hash table\n");
+  FOAV_SAFE_CNTXT(setPermutation, Neven)
+  for (uint32_t i=0; i<Neven; i++) {
+    FOAV_SAFE_CNTXT(setPermutation, i)
+    reverse_perm_hash = prp128(mem.reverse_key, snNum, (uint64_t) *(cur_forward_map+1));
+    FOAV_SAFE_CNTXT(setPermutation, snNum)
+    FOAV_SAFE_CNTXT(setPermutation, reverse_perm_hash)
+    std::pair<uint32_t, uint32_t> reverse_val(*cur_forward_map, i);
+    //printf("Inserting prp128(%d) = ", *(cur_forward_map+1));
+    //print_u128(reverse_perm_hash);
+    //printf(" -> (%d, %d)\n", reverse_val.first, reverse_val.second);
+    mem.reverse_perm->insert(std::make_pair(reverse_perm_hash, reverse_val));
+    cur_forward_hash += sizeof(randkey_t) + 8;
+    cur_forward_map = (uint32_t *) (cur_forward_hash + sizeof(randkey_t));
+  }
+#ifdef PROFILE_SETPERM_N
+  if (N >= PROFILE_SETPERM_N) {
+    printf_with_rtclock_diff(prof_rlt, "end reverse lookup table N=%u\n", N);
+    prof_setsw = printf_with_rtclock("begin set switches N=%u\n", N);
+  }
+#endif
+
+  // Set input switch values
+  uint32_t cycle_start = Neven-1; // start of current permutation cycle
+  uint32_t forward = 0; // item defining switch to set
+  uint32_t forward_partner; // forward permutation "partner" (i.e. same input switch)
+  randkey_t forward_partner_hash;
+  uint32_t perm_idx;
+  uint32_t forward_partner_map; // permutation map applied to forward partner
+  uint32_t switch_num;
+  uint32_t switch_val;
+  uint32_t forward_partner_map_partner; // "partner" (i.e. same residue class) of forward_partner_map
+  randkey_t forward_partner_map_partner_hash;
+  //const uint32_t input_switch_bit = N >> 1; // bit pattern determining input switch partners
+  //const uint32_t switch_mask = (N-1) >> 1; // mask to compute input switch number via AND
+  //const uint32_t crp_xor = N >> 1; // bit pattern to compute composite residue partner via XOR
+  uint32_t *cur_switch = traversal.inSwitches;
+  uint8_t rand_flag = 0; // Indicate if next forward lookup should be random (due to cycle end)
+
+  // Perform first back-and-forth lookups on items Neven-1 and Nleft-1, which have no input switch
+  //printf("forward = %d\n", Neven-1);
+  //printf("forward partner = %d\n", Nleft-1);
+  forward_partner_hash = (randkey_t) prp128(mem.forward_key, snNum, Nleft-1);
+  FOAV_SAFE_CNTXT(setPermutation, snNum)
+  FOAV_SAFE_CNTXT(setPermutation, forward_partner_hash)
+  perm_idx = permOrRand(N, mem.forward_perm, forward_partner_hash, mem.unselected_cnt, 0);
+  cur_forward_map = (uint32_t *) (mem.forward_perm + perm_idx*(sizeof(randkey_t) + 8) +
+    sizeof(randkey_t));
+  forward_partner_map = *(cur_forward_map+1);
+  //printf("forward_partner_map = %d\n", forward_partner_map);
+  updateUnselectedCnt(mem.unselected_cnt, N, perm_idx);
+  forward_partner_map_partner = PARTNER(forward_partner_map, Nleft);
+  //printf("forward_partner_map_partner = %d\n", forward_partner_map_partner);
+  forward_partner_map_partner_hash = prp128(mem.reverse_key, snNum, (uint64_t) forward_partner_map_partner);
+  FOAV_SAFE_CNTXT(setPermutation, snNum)
+  FOAV_SAFE_CNTXT(setPermutation, forward_partner_map_partner_hash)
+
+  //printf("looking up ");
+  //print_u128(forward_partner_map_partner_hash);
+  //printf("\n");
+  std::pair<uint32_t, uint32_t>& reverse_perm_ret = mem.reverse_perm->at(forward_partner_map_partner_hash);
+  forward = reverse_perm_ret.first;
+  perm_idx = reverse_perm_ret.second;
+  updateUnselectedCnt(mem.unselected_cnt, N, perm_idx);
+  rand_flag = oe_set_flag(forward, cycle_start);
+
+  // Perform remaining back-and-forth lookups and input switch settings
+  for (uint32_t i=0; i<Nleft-1; i++) {
+    //printf("forward = %d\n", forward);
+    // Forward map partner (ignored if random lookup)
+    forward_partner = PARTNER(forward, Nleft);
+    //printf("forward partner = %d\n", forward_partner);
+    // Either map forward_partner under permutation or perform random lookup
+    forward_partner_hash = (randkey_t) prp128(mem.forward_key, snNum, (uint64_t) forward_partner);
+    FOAV_SAFE_CNTXT(setPermutation, snNum)
+    FOAV_SAFE_CNTXT(setPermutation, forward_partner_hash)
+    perm_idx = permOrRand(N, mem.forward_perm, forward_partner_hash, mem.unselected_cnt, rand_flag);
+    cur_forward_map = (uint32_t *) (mem.forward_perm + perm_idx*(sizeof(randkey_t) + 8) +
+      sizeof(randkey_t));
+    forward_partner_map = *(cur_forward_map+1);
+    //printf("forward_partner_map = %d\n", forward_partner_map);
+    // update unselected_cnt with forward lookup
+    updateUnselectedCnt(mem.unselected_cnt, N, perm_idx);
+    // Write out current switch setting (need to do after potentially random permOrRand lookup)
+    switch_val = ((*cur_forward_map) >= Nleft); // value of current input switch
+    //printf("switch_val = %d\n", switch_val);
+    switch_num = (*cur_forward_map) - switch_val * Nleft; // number of current input switch
+    //printf("switch_num = %d\n", switch_num);
+    *cur_switch = (switch_num<<1) | switch_val;
+    cur_switch++;
+    // If random, update cycle_start
+    oset_value_uint32_t(&cycle_start, PARTNER((*cur_forward_map),Nleft), rand_flag);
+    // Reverse map the residue-class partner
+    forward_partner_map_partner = PARTNER(forward_partner_map, Nleft);
+    //printf("forward_partner_map_partner = %d\n", forward_partner_map_partner);
+    forward_partner_map_partner_hash = prp128(mem.reverse_key, snNum, (uint64_t) forward_partner_map_partner);
+    FOAV_SAFE_CNTXT(setPermutation, snNum)
+    FOAV_SAFE_CNTXT(setPermutation, forward_partner_map_partner_hash)
+    std::pair<uint32_t, uint32_t>& reverse_perm_ret = mem.reverse_perm->at(forward_partner_map_partner_hash);
+    forward = reverse_perm_ret.first;
+    perm_idx = reverse_perm_ret.second;
+    //printf("forward = %d, perm_idx = %d\n", forward, perm_idx);
+    // Update unselected_cnt with reverse lookup
+    updateUnselectedCnt(mem.unselected_cnt, N, perm_idx);
+    // Indicate random lookup needed if cycle start has been reached
+    rand_flag = 0; // Needed because oe_set_flag() only sets (i.e. doesn't unset)
+    rand_flag = oe_set_flag(forward, cycle_start);
+    //printf("rand_flag = %d\n", rand_flag);
+  }
+  // Clear reverse lookup for use by any recursive call
+  mem.reverse_perm->clear();
+
+#ifdef PROFILE_SETPERM_N
+  if (N >= PROFILE_SETPERM_N) {
+    printf_with_rtclock_diff(prof_setsw, "end set switches N=%u\n", N);
+    prof_srtsw = printf_with_rtclock("begin sort switches N=%u\n", N);
+  }
+#endif
+  // Put switches in order (each is (switch_num<<1) | switch_val, so this orders by switch number)
+  BitonicSort<OSWAP_4, uint32_t>((unsigned char *) traversal.inSwitches,
+    (size_t) Nleft-1, 4, true);
+#ifdef PROFILE_SETPERM_N
+  if (N >= PROFILE_SETPERM_N) {
+    printf_with_rtclock_diff(prof_srtsw, "end sort switches N=%u\n", N);
+  }
+#endif
+  // Print switches
+  /*
+  printf("Switch\tVal\n");
+  cur_switch = (uint32_t *) inSwitches.data();
+  for (uint64_t i : inSwitches) {
+    printf("%d\t%d\n", (*cur_switch)>>1, *(cur_switch)&1);
+    cur_switch += 1;
+  }
+  */
+  } else {
+    // N == 3 or N == 4
+    // If (GET(permutation[0]) & 1) == (GET(permutation[1]) & 1), set
+    // the switch to 1 (so that permutation[0] and permutation[2] get
+    // swapped), otherwise 0.  The switch setting is actually stored in
+    // the low bit of inSwitches[0].
+    traversal.inSwitches[0] = uint64_t((GET(permutation[0],depth) ^
+        GET(permutation[1],depth) ^ 1) & 1);
+  }
+#ifdef PROFILE_SETPERM_N
+  if (N >= PROFILE_SETPERM_N) {
+    prof_appsw = printf_with_rtclock("begin apply switches N=%u\n", N);
+  }
+#endif
+#ifdef SHOW_SETPERM
+  printf("I");
+  for(uint32_t i=0;i<Nleft-1;++i) {
+    printf(" %s", (traversal.inSwitches[i]&1) ? " X" : "||");
+  }
+  printf("\n");
+#endif
+
+  // Apply input switches to permutation, moving every entry to depth+1 as we go
+  uint32_t *cur_switch_val = traversal.inSwitches;
+  uint32_t kd = Nleft << depth;
+  FOAV_SAFE_CNTXT(setPermutation, Nleft)
+  for (uint32_t i=0; i<Nleft-1; i++) {
+    FOAV_SAFE2_CNTXT(setPermutation, i, Nleft)
+    permutation[i] = PUSH(permutation[i], kd);
+    permutation[i+Nleft] = PUSH(permutation[i+Nleft], kd);
+    oswap_buffer<OSWAP_4>((unsigned char *) (permutation+i),
+      (unsigned char *) (permutation+Nleft+i), 4, (*cur_switch_val)&1);
+    cur_switch_val += 1;
+  }
+  permutation[Nleft-1] = PUSH(permutation[Nleft-1], kd);
+  if (N == Neven) {
+      permutation[2*Nleft-1] = PUSH(permutation[2*Nleft-1], kd);
+  }
+#ifdef PROFILE_SETPERM_N
+  if (N >= PROFILE_SETPERM_N) {
+    printf_with_rtclock_diff(prof_appsw, "end apply switches N=%u\n", N);
+    printf_with_rtclock_diff(prof_before, "end before recursion N=%u\n", N);
+    prof_rec1 = printf_with_rtclock("begin recursion1 N=%u\n", N);
+  }
+#endif
+#ifdef SHOW_SETPERM
+  printf(" ");
+  for(uint32_t i=0;i<N;++i) {
+    printf(" %2d", permutation[i]);
+  }
+  printf("\n ");
+  for(uint32_t i=0;i<N;++i) {
+    printf(" %2d", GET(permutation[i], depth+1));
+  }
+  printf("\n");
+#endif
+
+  traversal.inSwitches += (Nleft-1);
+  uint8_t *outSwitch = traversal.outSwitches;
+  traversal.outSwitches += Nright;
+
+  // Recursively set switches of subnetworks and propagate permutation through network
+  setPermutation(permutation, Nleft, depth+1, traversal, mem);
+#ifdef PROFILE_SETPERM_N
+  if (N >= PROFILE_SETPERM_N) {
+    printf_with_rtclock_diff(prof_rec1, "end recursion1 N=%u\n", N);
+    prof_rec2 = printf_with_rtclock("begin recursion2 N=%u\n", N);
+  }
+#endif
+  setPermutation(permutation + Nleft, Nright, depth+1, traversal, mem);
+
+#ifdef SHOW_SETPERM
+  printf("R");
+  for(uint32_t i=0;i<N;++i) {
+    printf(" %2d", permutation[i]);
+  }
+  printf("\n ");
+  for(uint32_t i=0;i<N;++i) {
+    printf(" %2d", GET(permutation[i],depth+1));
+  }
+  printf("\n");
+#endif
+#ifdef PROFILE_SETPERM_N
+  if (N >= PROFILE_SETPERM_N) {
+    printf_with_rtclock_diff(prof_rec2, "end recursion2 N=%u\n", N);
+    prof_outsw = printf_with_rtclock("begin output switches N=%u\n", N);
+  }
+#endif
+  // Store output switch values and apply to permutation values
+  //printf("Setting output switches\n");
+  for (uint32_t i=0; i<Nright; i++) {
+    outSwitch[i] = permutation[i] & 1;
+    permutation[i] = POP(permutation[i], kd);
+    permutation[i+Nleft] = POP(permutation[i+Nleft], kd);
+    //printf("\toutSwitch[%d] = %d\n", i, outSwitch[i]);
+    oswap_buffer<OSWAP_4>((unsigned char *) (permutation + i),
+      (unsigned char *) (permutation + Nleft + i), 4, outSwitch[i]);
+  }
+  if (N != Neven) {
+    permutation[Nright] = POP(permutation[Nright], kd);
+  }
+#ifdef PROFILE_SETPERM_N
+  if (N >= PROFILE_SETPERM_N) {
+    printf_with_rtclock_diff(prof_outsw, "end output switches N=%u\n", N);
+    printf_with_rtclock_diff(prof_all, "end setPermutation N=%u\n", N);
+  }
+#endif
+#ifdef SHOW_SETPERM
+  printf("O");
+  for(uint32_t i=0;i<Nright;++i) {
+    printf(" %s", outSwitch[i] ? " X" : "||");
+  }
+  printf("\n");
+
+  printf("E");
+  for(uint32_t i=0;i<N;++i) {
+    printf(" %2d", permutation[i]);
+  }
+  printf("\n ");
+  for(uint32_t i=0;i<N;++i) {
+    printf(" %2d", GET(permutation[i],depth));
+  }
+  printf("\n");
+#endif
+
+}
+
+/*
+void generateRandomPermutation(uint32_t N, uint32_t *random_permutation){
+  //Initialize random permutation as the identity 0,...,N-1
+  for(uint32_t i=0; i<N; i++) {
+    random_permutation[i]=i;
+  }
+
+  //Convert it to a random permutation of [0,N-1]
+  RecursiveShuffle_M2((unsigned char *) random_permutation, (uint32_t) N, sizeof(uint32_t));
+  // To parallelize: RecursiveShuffle_M2_parallel(buf, N, block_size, 1);
+}
+*/
+
+#if 0
+// Shuffle the N blocks of block_size bytes stored in buffer, in place,
+// by programming a WaksmanNetwork with a freshly generated random
+// permutation and applying it (single-threaded).  The wall-clock times
+// of the three phases (permutation generation, control-bit setting,
+// application) are stored in ret; with COUNT_OSWAPS defined, the
+// per-phase OSWAP_COUNTER values are stored as well.  If the
+// permutation array cannot be allocated, a message is printed and the
+// function returns without touching buffer or ret.
+void OblivWaksmanShuffle(unsigned char *buffer, uint32_t N, size_t block_size, enc_ret *ret) {
+  uint32_t *random_permutation;
+  try {
+    random_permutation = new uint32_t[N];
+  } catch (std::bad_alloc&) {
+    printf("Allocating memory failed in OblivWaksmanShuffle\n");
+    // Bail out: carrying on would use an uninitialized pointer
+    return;
+  }
+  // Generate random permutation
+  double wt1, wt2;
+  ocall_wallclock(&wt1, 1);
+  generateRandomPermutation(N, random_permutation);
+
+  ocall_wallclock(&wt2, 1);
+  ret->gen_perm_time = wt2-wt1;
+  #ifdef COUNT_OSWAPS
+    ret->OSWAP_gp = OSWAP_COUNTER;
+    OSWAP_COUNTER=0;
+  #endif
+
+  #ifdef TEST_WN_OA
+    // Record the expected result (block random_permutation[i] ends up
+    // at position i) so the output can be checked below
+    uint32_t *correct_permuted_keys = new uint32_t[N];
+    printf("perm    =");
+    for(size_t i=0; i<N; i++) {
+        printf(" %2d", random_permutation[i]);
+    }
+    printf("\norig    =");
+    for(size_t i=0; i<N; i++) {
+      printf(" %2d", *((uint32_t*)(buffer + (block_size * i))));
+    }
+    printf("\ncorrect =");
+    for(size_t i=0; i<N; i++) {
+      uint32_t buffer_key = *((uint32_t*)(buffer + (block_size * random_permutation[i])));
+      correct_permuted_keys[i] = buffer_key;
+      printf(" %2d", buffer_key);
+    }
+    printf("\n");
+  #endif
+
+  // Set control bits to implement randomly generated permutation
+  ocall_wallclock(&wt1, 1);
+  FOAV_SAFE_CNTXT(OWShuffle, N)
+  WaksmanNetwork wnet((uint32_t) N);
+  //printf("\nSetting control bits\n");
+  wnet.setPermutation(random_permutation);
+  ocall_wallclock(&wt2, 1);
+  ret->control_bits_time = wt2-wt1;
+  #ifdef COUNT_OSWAPS
+    ret->OSWAP_cb=OSWAP_COUNTER;
+    OSWAP_COUNTER=0;
+  #endif
+
+  // Apply the permutation, picking the OSWAP variant from block_size
+  //printf("\n Applying permutation\n");
+  ocall_wallclock(&wt1, 1);
+  if (block_size == 4) {
+    wnet.applyInversePermutation<OSWAP_4>(buffer, block_size);
+  } else if (block_size == 8) {
+    wnet.applyInversePermutation<OSWAP_8>(buffer, block_size);
+  } else if (block_size == 12) {
+    wnet.applyInversePermutation<OSWAP_12>(buffer, block_size);
+  } else if (block_size%16 == 0) {
+    wnet.applyInversePermutation<OSWAP_16X>(buffer, block_size);
+  } else {
+    wnet.applyInversePermutation<OSWAP_8_16X>(buffer, block_size);
+  }
+  ocall_wallclock(&wt2, 1);
+  ret->apply_perm_time = wt2-wt1;
+  #ifdef COUNT_OSWAPS
+    ret->OSWAP_ap = OSWAP_COUNTER;
+  #endif
+
+  #ifdef TEST_WN_OA
+    printf("output  =");
+    for(size_t i=0; i<N; i++) {
+      printf(" %2d", *((uint32_t*)(buffer + (block_size * i))));
+    }
+    printf("\n");
+    unsigned char *buffer_ptr = buffer;
+    for(size_t i=0; i<N; i++) {
+      uint32_t buffer_key = *((uint32_t*)(buffer_ptr));
+      if(correct_permuted_keys[i]!=buffer_key) {
+        printf("TEST_WN_OA: Shuffle Correctness Failed\n");
+        break;
+      }
+      buffer_ptr+=block_size;
+    }
+    delete []correct_permuted_keys;
+  #endif
+
+  delete[] random_permutation;
+}
+
+// Multithreaded variant of OblivWaksmanShuffle: shuffle the N blocks of
+// block_size bytes stored in buffer, in place, by programming a
+// WaksmanNetwork with a freshly generated random permutation and
+// applying it through a WNEvalPlan built for nthreads threads.  The
+// wall-clock times of the three phases are stored in ret (building the
+// WNEvalPlan is counted in control_bits_time); with COUNT_OSWAPS
+// defined, the per-phase OSWAP_COUNTER values are stored as well.  If
+// the permutation array cannot be allocated, a message is printed and
+// the function returns without touching buffer or ret.
+void OblivWaksmanShuffle(unsigned char *buffer, uint32_t N,
+    size_t block_size, uint32_t nthreads, enc_ret *ret) {
+  uint32_t *random_permutation;
+  try {
+    random_permutation = new uint32_t[N];
+  } catch (std::bad_alloc&) {
+    printf("Allocating memory failed in OblivWaksmanShuffle\n");
+    // Bail out: carrying on would use an uninitialized pointer
+    return;
+  }
+  // Generate random permutation
+  double wt1, wt2;
+  ocall_wallclock(&wt1, 1);
+  generateRandomPermutation(N, random_permutation);
+
+  ocall_wallclock(&wt2, 1);
+  ret->gen_perm_time = wt2-wt1;
+  #ifdef COUNT_OSWAPS
+    ret->OSWAP_gp = OSWAP_COUNTER;
+    OSWAP_COUNTER=0;
+  #endif
+
+  #ifdef TEST_WN_OA
+    // Record the expected result (block random_permutation[i] ends up
+    // at position i) so the output can be checked below
+    uint32_t *correct_permuted_keys = new uint32_t[N];
+    printf("perm    =");
+    for(size_t i=0; i<N; i++) {
+        printf(" %2d", random_permutation[i]);
+    }
+    printf("\norig    =");
+    for(size_t i=0; i<N; i++) {
+      printf(" %2d", *((uint32_t*)(buffer + (block_size * i))));
+    }
+    printf("\ncorrect =");
+    for(size_t i=0; i<N; i++) {
+      uint32_t buffer_key = *((uint32_t*)(buffer + (block_size * random_permutation[i])));
+      correct_permuted_keys[i] = buffer_key;
+      printf(" %2d", buffer_key);
+    }
+    printf("\n");
+  #endif
+
+  // Set control bits to implement randomly generated permutation
+  ocall_wallclock(&wt1, 1);
+  FOAV_SAFE_CNTXT(OWShuffle, N)
+  WaksmanNetwork wnet((uint32_t) N);
+  //printf("\nSetting control bits\n");
+  wnet.setPermutation(random_permutation);
+  WNEvalPlan evalplan(N, nthreads);
+  ocall_wallclock(&wt2, 1);
+  ret->control_bits_time = wt2-wt1;
+  #ifdef COUNT_OSWAPS
+    ret->OSWAP_cb=OSWAP_COUNTER;
+    OSWAP_COUNTER=0;
+  #endif
+
+  // Apply the permutation, picking the OSWAP variant from block_size
+  //printf("\n Applying permutation\n");
+  ocall_wallclock(&wt1, 1);
+  if (block_size == 4) {
+    wnet.applyInversePermutation<OSWAP_4>(buffer, block_size, evalplan);
+  } else if (block_size == 8) {
+    wnet.applyInversePermutation<OSWAP_8>(buffer, block_size, evalplan);
+  } else if (block_size == 12) {
+    wnet.applyInversePermutation<OSWAP_12>(buffer, block_size, evalplan);
+  } else if (block_size%16 == 0) {
+    wnet.applyInversePermutation<OSWAP_16X>(buffer, block_size, evalplan);
+  } else {
+    wnet.applyInversePermutation<OSWAP_8_16X>(buffer, block_size, evalplan);
+  }
+  ocall_wallclock(&wt2, 1);
+  ret->apply_perm_time = wt2-wt1;
+  #ifdef COUNT_OSWAPS
+    ret->OSWAP_ap = OSWAP_COUNTER;
+  #endif
+
+  #ifdef TEST_WN_OA
+    printf("output  =");
+    for(size_t i=0; i<N; i++) {
+      printf(" %2d", *((uint32_t*)(buffer + (block_size * i))));
+    }
+    printf("\n");
+    unsigned char *buffer_ptr = buffer;
+    for(size_t i=0; i<N; i++) {
+      uint32_t buffer_key = *((uint32_t*)(buffer_ptr));
+      if(correct_permuted_keys[i]!=buffer_key) {
+        printf("TEST_WN_OA: Shuffle Correctness Failed\n");
+        break;
+      }
+      buffer_ptr+=block_size;
+    }
+    delete []correct_permuted_keys;
+  #endif
+
+  delete[] random_permutation;
+}
+
+
+// Decrypt the N input blocks, shuffle the plaintext with the
+// single-threaded OblivWaksmanShuffle, and encrypt the outcome into
+// result_buffer.  The elapsed time around the shuffle is stored in
+// ret->ptime.
+void DecryptAndOblivWaksmanShuffle(unsigned char *encrypted_buffer, uint32_t N,
+  size_t encrypted_block_size, unsigned char *result_buffer, enc_ret *ret) {
+  // decryptBuffer fills in the plaintext pointer and returns the size
+  // of one plaintext block
+  unsigned char *plain = NULL;
+  const size_t plain_block_size = decryptBuffer(encrypted_buffer,
+    (uint64_t) N, encrypted_block_size, &plain);
+
+  // Timed section: random pool setup plus the shuffle itself
+  double t_start, t_stop;
+  ocall_wallclock(&t_start, 0);
+  ocall_wallclock(&t_start, 1);
+  PRB_pool_init(1);
+  OblivWaksmanShuffle(plain, N, plain_block_size, ret);
+  ocall_wallclock(&t_stop, 1);
+  ret->ptime = t_stop - t_start;
+  #ifdef COUNT_OSWAPS
+    ret->OSWAP_count = OSWAP_COUNTER;
+  #endif
+
+  // Re-encrypt the shuffled plaintext, then release everything
+  encryptBuffer(plain, (uint64_t) N, plain_block_size, result_buffer);
+  PRB_pool_shutdown();
+  free(plain);
+}
+
+// Sort the N blocks of block_size bytes stored in buffer, in place
+// (single-threaded): compute the sorting permutation with
+// generateSortPermutation_OA, program a WaksmanNetwork with it, and
+// apply the network to buffer.  The wall-clock times of the three
+// phases are stored in ret; with COUNT_OSWAPS defined, the per-phase
+// OSWAP_COUNTER values are stored as well.  If the permutation array
+// cannot be allocated, a message is printed and the function returns
+// without touching buffer or ret.
+void OblivWaksmanSort(unsigned char *buffer, uint32_t N, size_t block_size, enc_ret *ret) {
+  uint32_t *sort_permutation;
+  try {
+    FOAV_SAFE_CNTXT(OWSort, N)
+    sort_permutation = new uint32_t[N];
+  } catch (std::bad_alloc&) {
+    printf("Allocating memory failed in OblivWaksmanSort\n");
+    // Bail out: carrying on would use an uninitialized pointer
+    return;
+  }
+  // Generate sort permutation
+  double wt1, wt2;
+  ocall_wallclock(&wt1, 1);
+  generateSortPermutation_OA(N, buffer, block_size, sort_permutation);
+
+  ocall_wallclock(&wt2, 1);
+  ret->gen_perm_time = wt2-wt1;
+  #ifdef COUNT_OSWAPS
+    ret->OSWAP_gp = OSWAP_COUNTER;
+    OSWAP_COUNTER=0;
+  #endif
+
+  // Set control bits to implement the sort permutation
+  ocall_wallclock(&wt1, 1);
+#ifdef PROFILE_SETPERM_N
+  unsigned long x = printf_with_rtclock("Creating network\n");
+#endif
+  FOAV_SAFE_CNTXT(OblivWaksmanSort, N)
+  WaksmanNetwork wnet = WaksmanNetwork((uint32_t) N);
+  FOAV_SAFE_CNTXT(OblivWaksmanSort, wnet)
+#ifdef PROFILE_SETPERM_N
+  printf_with_rtclock_diff(x, "Created network\n");
+#endif
+  //printf("\nSetting control bits\n");
+  wnet.setPermutation(sort_permutation);
+  ocall_wallclock(&wt2, 1);
+  ret->control_bits_time = wt2-wt1;
+  #ifdef COUNT_OSWAPS
+    ret->OSWAP_cb=OSWAP_COUNTER;
+    OSWAP_COUNTER=0;
+  #endif
+
+  // Apply the permutation, picking the OSWAP variant from block_size
+  //printf("\nApplying permutation\n");
+  ocall_wallclock(&wt1, 1);
+  FOAV_SAFE_CNTXT(AP, block_size)
+  if (block_size == 4) {
+    wnet.applyInversePermutation<OSWAP_4>(buffer, block_size);
+  } else if (block_size == 8) {
+    wnet.applyInversePermutation<OSWAP_8>(buffer, block_size);
+  } else if (block_size == 12) {
+    wnet.applyInversePermutation<OSWAP_12>(buffer, block_size);
+  } else if (block_size%16 == 0) {
+    wnet.applyInversePermutation<OSWAP_16X>(buffer, block_size);
+  } else {
+    wnet.applyInversePermutation<OSWAP_8_16X>(buffer, block_size);
+  }
+  ocall_wallclock(&wt2, 1);
+  ret->apply_perm_time = wt2-wt1;
+  #ifdef COUNT_OSWAPS
+    ret->OSWAP_ap = OSWAP_COUNTER;
+  #endif
+
+  delete[] sort_permutation;
+}
+
+// Multithreaded variant of OblivWaksmanSort: sort the N blocks of
+// block_size bytes stored in buffer, in place.  The sorting permutation
+// comes from generateSortPermutation_OA, a WaksmanNetwork is programmed
+// with it, and the network is applied through a WNEvalPlan built for
+// nthreads threads.  The wall-clock times of the three phases are
+// stored in ret (building the WNEvalPlan is counted in
+// control_bits_time); with COUNT_OSWAPS defined, the per-phase
+// OSWAP_COUNTER values are stored as well.  If the permutation array
+// cannot be allocated, a message is printed and the function returns
+// without touching buffer or ret.
+void OblivWaksmanSort(unsigned char *buffer, uint32_t N, size_t block_size, uint32_t nthreads, enc_ret *ret) {
+  uint32_t *sort_permutation;
+  try {
+    FOAV_SAFE_CNTXT(OWSort, N)
+    sort_permutation = new uint32_t[N];
+  } catch (std::bad_alloc&) {
+    printf("Allocating memory failed in OblivWaksmanSort\n");
+    // Bail out: carrying on would use an uninitialized pointer
+    return;
+  }
+  // Generate sort permutation
+  double wt1, wt2;
+  ocall_wallclock(&wt1, 1);
+  generateSortPermutation_OA(N, buffer, block_size, sort_permutation);
+
+  ocall_wallclock(&wt2, 1);
+  ret->gen_perm_time = wt2-wt1;
+  #ifdef COUNT_OSWAPS
+    ret->OSWAP_gp = OSWAP_COUNTER;
+    OSWAP_COUNTER=0;
+  #endif
+
+  // Set control bits to implement the sort permutation
+  ocall_wallclock(&wt1, 1);
+#ifdef PROFILE_SETPERM_N
+  unsigned long x = printf_with_rtclock("Creating network\n");
+#endif
+  FOAV_SAFE_CNTXT(OblivWaksmanSort, N)
+  WaksmanNetwork wnet = WaksmanNetwork((uint32_t) N);
+  FOAV_SAFE_CNTXT(OblivWaksmanSort, wnet)
+#ifdef PROFILE_SETPERM_N
+  printf_with_rtclock_diff(x, "Created network\n");
+#endif
+  //printf("\nSetting control bits\n");
+  wnet.setPermutation(sort_permutation);
+  WNEvalPlan evalplan(N, nthreads);
+  ocall_wallclock(&wt2, 1);
+  ret->control_bits_time = wt2-wt1;
+  #ifdef COUNT_OSWAPS
+    ret->OSWAP_cb=OSWAP_COUNTER;
+    OSWAP_COUNTER=0;
+  #endif
+
+  // Apply the permutation, picking the OSWAP variant from block_size
+  //printf("\nApplying permutation\n");
+  ocall_wallclock(&wt1, 1);
+  FOAV_SAFE_CNTXT(AP, block_size)
+  if (block_size == 4) {
+    wnet.applyInversePermutation<OSWAP_4>(buffer, block_size, evalplan);
+  } else if (block_size == 8) {
+    wnet.applyInversePermutation<OSWAP_8>(buffer, block_size, evalplan);
+  } else if (block_size == 12) {
+    wnet.applyInversePermutation<OSWAP_12>(buffer, block_size, evalplan);
+  } else if (block_size%16 == 0) {
+    wnet.applyInversePermutation<OSWAP_16X>(buffer, block_size, evalplan);
+  } else {
+    wnet.applyInversePermutation<OSWAP_8_16X>(buffer, block_size, evalplan);
+  }
+  ocall_wallclock(&wt2, 1);
+  ret->apply_perm_time = wt2-wt1;
+  #ifdef COUNT_OSWAPS
+    ret->OSWAP_ap = OSWAP_COUNTER;
+  #endif
+
+  delete[] sort_permutation;
+}
+
+// Decrypt the N input blocks, sort the plaintext with the nthreads
+// overload of OblivWaksmanSort, and encrypt the outcome into
+// result_buffer.  A thread pool of nthreads threads is brought up for
+// the duration of the call.  The elapsed time around the sort is stored
+// in ret->ptime.
+void DecryptAndOblivWaksmanSort(unsigned char *encrypted_buffer, uint32_t N,
+  size_t encrypted_block_size, uint32_t nthreads, unsigned char *result_buffer, enc_ret *ret) {
+  // decryptBuffer fills in the plaintext pointer and returns the size
+  // of one plaintext block
+  unsigned char *plain = NULL;
+  const size_t plain_block_size = decryptBuffer(encrypted_buffer,
+    (uint64_t) N, encrypted_block_size, &plain);
+
+  // The thread pool is started before the clock, so its startup is not
+  // part of ptime
+  threadpool_init(nthreads);
+  double t_start, t_stop;
+  ocall_wallclock(&t_start, 0);
+  ocall_wallclock(&t_start, 1);
+  PRB_pool_init(nthreads);
+  OblivWaksmanSort(plain, N, plain_block_size, nthreads, ret);
+  ocall_wallclock(&t_stop, 1);
+  ret->ptime = t_stop - t_start;
+  #ifdef COUNT_OSWAPS
+    ret->OSWAP_count = OSWAP_COUNTER;
+  #endif
+
+  // Re-encrypt the sorted plaintext, then release everything
+  encryptBuffer(plain, (uint64_t) N, plain_block_size, result_buffer);
+  PRB_pool_shutdown();
+  threadpool_shutdown();
+  free(plain);
+}
+
+// Decrypt the N input blocks, sort the plaintext with the
+// single-threaded OblivWaksmanSort, and encrypt the outcome into
+// result_buffer.  The elapsed time around the sort is stored in
+// ret->ptime.
+void DecryptAndOblivWaksmanSort(unsigned char *encrypted_buffer, uint32_t N,
+  size_t encrypted_block_size, unsigned char *result_buffer, enc_ret *ret) {
+  // decryptBuffer fills in the plaintext pointer and returns the size
+  // of one plaintext block
+  unsigned char *plain = NULL;
+  const size_t plain_block_size = decryptBuffer(encrypted_buffer,
+    (uint64_t) N, encrypted_block_size, &plain);
+
+  // Timed section: random pool setup plus the sort itself
+  double t_start, t_stop;
+  ocall_wallclock(&t_start, 0);
+  ocall_wallclock(&t_start, 1);
+  PRB_pool_init(1);
+  OblivWaksmanSort(plain, N, plain_block_size, ret);
+  ocall_wallclock(&t_stop, 1);
+  ret->ptime = t_stop - t_start;
+  #ifdef COUNT_OSWAPS
+    ret->OSWAP_count = OSWAP_COUNTER;
+  #endif
+
+  // Re-encrypt the sorted plaintext, then release everything
+  encryptBuffer(plain, (uint64_t) N, plain_block_size, result_buffer);
+  PRB_pool_shutdown();
+  free(plain);
+}
+
+// Shuffle-then-sort: decrypt the N input blocks, shuffle the plaintext
+// with the single-threaded OblivWaksmanShuffle, sort the shuffled
+// blocks with qsort, and encrypt the outcome into result_buffer.
+// ret->qsort_time receives the time spent in qsort alone and
+// ret->ptime the time for shuffle plus sort.
+void DecryptAndOWSS(unsigned char *encrypted_buffer, uint32_t N,
+  size_t encrypted_block_size, unsigned char *result_buffer, enc_ret *ret) {
+  // decryptBuffer fills in the plaintext pointer and returns the size
+  // of one plaintext block
+  unsigned char *plain = NULL;
+  const size_t plain_block_size = decryptBuffer(encrypted_buffer,
+    (uint64_t) N, encrypted_block_size, &plain);
+
+  // Phase 1: shuffle
+  double t_start, t_shuffled, t_sorted;
+  ocall_wallclock(&t_start, 0);
+  ocall_wallclock(&t_start, 1);
+  PRB_pool_init(1);
+  OblivWaksmanShuffle(plain, N, plain_block_size, ret);
+  #ifdef COUNT_OSWAPS
+    ret->OSWAP_count = OSWAP_COUNTER;
+  #endif
+
+  // Phase 2: ordinary sort of the shuffled blocks
+  ocall_wallclock(&t_shuffled, 1);
+  qsort(plain, N, plain_block_size, compare);
+  ocall_wallclock(&t_sorted, 1);
+  ret->qsort_time = t_sorted - t_shuffled;
+  ret->ptime = t_sorted - t_start;
+
+  // Re-encrypt the sorted plaintext, then release everything
+  encryptBuffer(plain, (uint64_t) N, plain_block_size, result_buffer);
+  PRB_pool_shutdown();
+  free(plain);
+}
+#endif
+
+// #define PROFILE_MTMERGESORT
+
+// qsort-style three-way comparison of two elements of type T, passed by
+// address.  Only the specializations below are defined.
+template<typename T> static int compare(const void *a, const void *b);
+
+// Compare two uint64_t values as unsigned 64-bit integers.  Returns a
+// negative value if *a < *b, zero if they are equal, and a positive
+// value if *a > *b.
+//
+// The values are compared whole rather than by subtracting their
+// 32-bit halves: a 32-bit difference narrowed to int has the wrong sign
+// whenever the two halves differ by 2^31 or more, which would make the
+// ordering inconsistent for qsort and merge.
+template<>
+int compare<uint64_t>(const void *a, const void *b)
+{
+    const uint64_t x = *(const uint64_t*)a;
+    const uint64_t y = *(const uint64_t*)b;
+    // Each comparison yields 0 or 1, so the result is -1, 0 or 1
+    return (x > y) - (x < y);
+}
+
+template<typename T>
+struct MergeArgs {  // Argument bundle handed to merge<T> through its void* parameter
+    T* dst;  // start of the output; merge writes Nleft+Nright elements here
+    const T* leftsrc;  // first sorted input run
+    size_t Nleft;  // number of elements in leftsrc
+    const T* rightsrc;  // second sorted input run
+    size_t Nright;  // number of elements in rightsrc
+};
+
+// Merge two sorted arrays into one, on the calling thread.  voidargs
+// points to a MergeArgs<T>: the (sorted) source arrays are leftsrc and
+// rightsrc of lengths Nleft and Nright respectively, and the merged
+// sorted array is put into dst[0..Nleft+Nright-1].  The void*
+// parameter and NULL return value match what mtmerge passes to
+// threadpool_dispatch.
+template<typename T>
+static void* merge(void *voidargs)
+{
+    const MergeArgs<T>* args = (const MergeArgs<T>*)voidargs;
+#ifdef PROFILE_MTMERGESORT
+unsigned long start = printf_with_rtclock("begin merge(dst=%p, leftsrc=%p, Nleft=%lu, rightsrc=%p, Nright=%lu)\n", args->dst, args->leftsrc, args->Nleft, args->rightsrc, args->Nright);
+#endif
+    T* dst = args->dst;
+    const T* left = args->leftsrc;
+    const T* right = args->rightsrc;
+    const T* leftend = args->leftsrc + args->Nleft;
+    const T* rightend = args->rightsrc + args->Nright;
+
+    // Repeatedly move the smaller head element to the output until one
+    // of the inputs runs out
+    while (left != leftend && right != rightend) {
+        if (compare<T>(left, right) < 0) {
+            *dst = *left;
+            ++dst;
+            ++left;
+        } else {
+            *dst = *right;
+            ++dst;
+            ++right;
+        }
+    }
+
+    // At most one input still has elements; append them in one go
+    if (left != leftend) {
+        memmove(dst, left, (leftend-left)*sizeof(T));
+    }
+    if (right != rightend) {
+        memmove(dst, right, (rightend-right)*sizeof(T));
+    }
+#ifdef PROFILE_MTMERGESORT
+printf_with_rtclock_diff(start, "end merge(dst=%p, leftsrc=%p, Nleft=%lu, rightsrc=%p, Nright=%lu)\n", args->dst, args->leftsrc, args->Nleft, args->rightsrc, args->Nright);
+#endif
+
+    return NULL;
+}
+
+// Binary search in the sorted subarray src[0 .. len-1] for the first
+// element that is larger than *target, and return its index within the
+// subarray.  The result is 0 when src[0] is already larger than the
+// target (or the subarray is empty) and len when every element is
+// smaller than the target.  Elements are expected to be pairwise
+// different, so comparisons are not expected to report equality.
+template<typename T>
+static size_t binsearch(const T* src, size_t len, const T* target)
+{
+    // Dispose of the cases where the answer lies at either end
+    if (len == 0) {
+        return 0;
+    }
+    if (compare<T>(src, target) > 0) {
+        return 0;
+    }
+    if (compare<T>(src + (len - 1), target) < 0) {
+        return len;
+    }
+
+    // Keep src[lo] below the target and src[hi] above it (treating
+    // src[len] as larger than anything) while halving the gap
+    size_t lo = 0;
+    size_t hi = len;
+    while (hi - lo > 1) {
+        const size_t probe = lo + (hi - lo)/2;
+        const bool probe_is_larger = compare<T>(src + probe, target) > 0;
+        if (probe_is_larger) {
+            hi = probe;
+        } else {
+            lo = probe;
+        }
+    }
+
+    return hi;
+}
+
+// Merge two sorted arrays into one.  The (sorted) source arrays are
+// leftsrc and rightsrc of lengths Nleft and Nright respectively.  Put
+// the merged sorted array into dst[0..Nleft+Nright-1].  Use up to the
+// given number of threads: the calling thread merges the first piece
+// and pieces 1.. are dispatched to threadpool slots g_thread_id+1...
+// A single thread is used when either input has fewer than 500
+// elements or when nthreads is less than 1.
+template<typename T>
+static void mtmerge(T* dst, const T* leftsrc, size_t Nleft,
+    const T* rightsrc, size_t Nright, threadid_t nthreads)
+{
+#ifdef PROFILE_MTMERGESORT
+unsigned long start = printf_with_rtclock("begin mtmerge(dst=%p, leftsrc=%p, Nleft=%lu, rightsrc=%p, Nright=%lu, nthreads=%lu)\n", dst, leftsrc, Nleft, rightsrc, Nright, nthreads);
+#endif
+
+    threadid_t threads_to_use = nthreads;
+    if (Nleft < 500 || Nright < 500 || nthreads < 1) {
+        // Too little work to split, or no usable thread count (which
+        // would otherwise divide by zero below)
+        threads_to_use = 1;
+    } else if ((size_t)threads_to_use > Nleft) {
+        // Every piece needs at least one left element; otherwise the
+        // split target below could be leftsrc[Nleft], past the end
+        threads_to_use = (threadid_t)Nleft;
+    }
+
+    // Break the left array into threads_to_use approximately
+    // equal-sized pieces
+
+    MergeArgs<T> margs[threads_to_use];
+    size_t leftinc = Nleft / threads_to_use;
+    size_t leftextra = Nleft % threads_to_use;
+    size_t leftlast = 0;
+    size_t rightlast = 0;
+
+    for (threadid_t t=0; t<threads_to_use; ++t) {
+        size_t leftlen = leftinc + (t < leftextra);
+        // Find the segment in the right array corresponding to this
+        // segment in the left array.  If this is the last segment of
+        // the left array, that's just the whole remaining right array.
+        size_t rightlen;
+        if (t == threads_to_use - 1) {
+            rightlen = Nright - rightlast;
+        } else {
+            // The first element of the next left segment
+            const T* target = leftsrc + leftlast + leftlen;
+            // In the sorted subarray rightsrc[rightlast .. Nright-1],
+            // binary search for the first element that's larger than
+            // the target.  The return value is the index into that
+            // subarray, so it's 0 if rightsrc[rightlast] > target, and
+            // it's Nright-rightlast if all the elements are less than
+            // the target.
+            rightlen = binsearch<T>(rightsrc + rightlast,
+                Nright-rightlast, target);
+        }
+        margs[t] = { dst + leftlast + rightlast,
+            leftsrc + leftlast, leftlen,
+            rightsrc + rightlast, rightlen };
+        leftlast += leftlen;
+        rightlast += rightlen;
+        if (t > 0) {
+            threadpool_dispatch(g_thread_id+t, merge<T>, &margs[t]);
+        }
+    }
+    // Do the first block ourselves
+    merge<T>(&margs[0]);
+    // margs lives on this stack frame, so wait for every worker before
+    // returning
+    for (size_t t=1; t<threads_to_use; ++t) {
+        threadpool_join(g_thread_id+t, NULL);
+    }
+
+#ifdef PROFILE_MTMERGESORT
+printf_with_rtclock_diff(start, "end mtmerge(dst=%p, leftsrc=%p, Nleft=%lu, rightsrc=%p, Nright=%lu, nthreads=%lu)\n", dst, leftsrc, Nleft, rightsrc, Nright, nthreads);
+#endif
+}
+
+template<typename T>
+struct MTMergesortArgs {  // Argument bundle handed to mtmergesort_launch<T> through its void* parameter
+    T* buf;  // data to sort
+    size_t N;  // number of elements in buf
+    T* backing;  // backing store passed on to mtmergesort
+    threadid_t nthreads;  // thread budget passed on to mtmergesort
+    bool ret;  // output: set by mtmergesort_launch to mtmergesort's return value
+};
+
+// Declared ahead of its definition because mtmergesort_launch calls it
+template<typename T>
+static bool mtmergesort(T* buf, size_t N, T* backing, threadid_t nthreads);
+
+// Thread entry point for mtmergesort: unpack the MTMergesortArgs<T>
+// that voidargs points to, run the sort, and store its boolean result
+// in the ret field of that same struct.  Always returns NULL.
+template<typename T>
+static void *mtmergesort_launch(void *voidargs)
+{
+    MTMergesortArgs<T>* const job =
+        static_cast<MTMergesortArgs<T>*>(voidargs);
+    const bool sorted_in_backing = mtmergesort<T>(job->buf, job->N,
+        job->backing, job->nthreads);
+    job->ret = sorted_in_backing;
+    return NULL;
+}
+
+// Multithreaded mergesort.  Pass the data of type T to sort, as a
+// pointer and number of elements.  Also pass a backing store of the
+// same size.  The sorted data will end up in either the original data
+// array or the backing store; this function will return false if it's
+// in the original data and true if it's in the backing store.  Use up
+// to the given number of threads.  With at most one thread (including
+// a thread count of 0) or fewer than 1000 elements, the data is simply
+// sorted in place with qsort.
+template<typename T>
+static bool mtmergesort(T* buf, size_t N, T* backing, threadid_t nthreads)
+{
+    // "<= 1" rather than "== 1": a thread count of 0 would otherwise
+    // recurse with 0 threads and dispatch onto our own slot
+    // g_thread_id+0
+    if (nthreads <= 1 || N < 1000) {
+        // Just sort naively
+#ifdef PROFILE_MTMERGESORT
+unsigned long start = printf_with_rtclock("begin qsort(buf=%p, N=%lu)\n", buf, N);
+#endif
+        qsort(buf, N, sizeof(T), compare<T>);
+#ifdef PROFILE_MTMERGESORT
+printf_with_rtclock_diff(start, "end qsort(buf=%p, N=%lu)\n", buf, N);
+#endif
+        return false;
+    }
+#ifdef PROFILE_MTMERGESORT
+unsigned long start = printf_with_rtclock("begin mtmergesort(buf=%p, N=%lu, backing=%p, nthreads=%lu)\n", buf, N, backing, nthreads);
+#endif
+    // Split both the data and the thread budget in half, giving the
+    // left side the extra one when the count is odd
+    size_t Nleft = (N+1)/2;
+    size_t Nright = N/2;
+    threadid_t threads_left = (nthreads+1)/2;
+    threadid_t threads_right = nthreads/2;
+
+    // Sort the right half on another thread while this thread sorts
+    // the left half
+    MTMergesortArgs<T> ms_right_args =
+        { buf + Nleft, Nright, backing + Nleft, threads_right, false };
+    threadpool_dispatch(g_thread_id+threads_left, mtmergesort_launch<T>,
+        &ms_right_args);
+    bool leftret = mtmergesort<T>(buf, Nleft, backing, threads_left);
+    threadpool_join(g_thread_id+threads_left, NULL);
+    bool rightret = ms_right_args.ret;
+
+    // If the left and right sorts put their answers in different
+    // places, move the right answer to match the left
+    if (leftret != rightret) {
+        if (leftret) {
+            // The left is in backing, and the right is in buf
+            memmove(backing + Nleft, buf + Nleft, Nright * sizeof(T));
+        } else {
+            // The left is in buf, and the right is in backing
+            memmove(buf + Nleft, backing + Nleft, Nright * sizeof(T));
+        }
+    }
+
+    // Merge the two halves
+    if (leftret) {
+        // The recursive outputs are in backing; merge them into buf
+        mtmerge<T>(buf, backing, Nleft, backing+Nleft, Nright, nthreads);
+    } else {
+        // The recursive outputs are in buf; merge them into backing
+        mtmerge<T>(backing, buf, Nleft, buf+Nleft, Nright, nthreads);
+    }
+#ifdef PROFILE_MTMERGESORT
+printf_with_rtclock_diff(start, "end mtmergesort(buf=%p, N=%lu, backing=%p, nthreads=%lu)\n", buf, N, backing, nthreads);
+#endif
+    // The merge wrote to the array the halves were NOT in
+    return !leftret;
+}
+
+struct datacopy_args {  // Argument bundle handed to datacopy_range through its void* parameter
+    const unsigned char *inbuf;  // source elements, each sz bytes
+    const uint64_t *idx;  // per output slot, the low 32 bits give the source element index
+    unsigned char *outbuf;  // destination elements, each sz bytes
+    size_t start, end, sz;  // output slots [start, end) to fill; sz is the element size in bytes
+};
+
+// Fill output slots [start, end) described by the datacopy_args that
+// voidargs points to: slot i receives the sz-byte input element whose
+// index is the low 32 bits of idx[i].  Always returns NULL.
+static void* datacopy_range(void *voidargs)
+{
+    const datacopy_args *job = (datacopy_args*)voidargs;
+    const size_t stride = job->sz;
+    unsigned char *out = job->outbuf + job->start*stride;
+    for (size_t slot = job->start; slot < job->end; ++slot) {
+        // Drop the upper 32 bits of the index entry; only the lower
+        // half names the source element
+        const uint64_t source = job->idx[slot] & 0xffffffff;
+        memmove(out, job->inbuf + source*stride, stride);
+        out += stride;
+    }
+    return NULL;
+}
+
+// Sort the given array of N elements, each of size sz, using up to
+// nthreads threads (a thread count below 1 is treated as 1). The output
+// is put into the same memory as the input array.  The first 4 bytes of
+// each element is its key.  Elements with equal keys keep their input
+// order, because the input position is the low half of the sort value.
+static void mtsort(void *buffer, size_t N, size_t sz, threadid_t nthreads)
+{
+    // Guard against a zero thread count, which would divide by zero
+    // when the copy work is split below
+    threadid_t threads_to_use = nthreads;
+    if (threads_to_use < 1) {
+        threads_to_use = 1;
+    }
+
+    uint64_t *idx = new uint64_t[N];
+    unsigned char *inbuf = (unsigned char *)buffer;
+    unsigned char *outbuf = new unsigned char[N*sz];
+    // Build one 64-bit value per element: key in the upper 32 bits,
+    // original position in the lower 32 bits
+    for (size_t i=0; i<N; ++i) {
+        // memcpy rather than a pointer cast: inbuf+sz*i need not be
+        // suitably aligned for a uint32_t
+        uint32_t key32;
+        memcpy(&key32, inbuf+sz*i, sizeof(key32));
+        uint64_t key = key32;
+        idx[i] = (key<<32) + i;
+    }
+    // Sort the keys and indices
+    uint64_t *backingidx = new uint64_t[N];
+    bool whichbuf = mtmergesort<uint64_t>(idx, N, backingidx,
+        threads_to_use);
+    uint64_t *sortedidx = whichbuf ? backingidx : idx;
+
+    // Copy the data using the sorted indices, potentially using
+    // multiple threads
+    datacopy_args dcargs[threads_to_use];
+    size_t inc = N / threads_to_use;
+    size_t extra = N % threads_to_use;
+    size_t last = 0;
+
+    for (size_t t=0; t<threads_to_use; ++t) {
+        size_t next = last + inc + (t < extra);
+        dcargs[t] = { inbuf, sortedidx, outbuf, last, next, sz };
+        last = next;
+        if (t > 0) {
+            threadpool_dispatch(g_thread_id+t, datacopy_range,
+                &dcargs[t]);
+        }
+    }
+    // Do the first block ourselves
+    datacopy_range(&dcargs[0]);
+    for (size_t t=1; t<threads_to_use; ++t) {
+        threadpool_join(g_thread_id+t, NULL);
+    }
+
+    delete[] idx;
+    delete[] backingidx;
+    // Move the sorted copy back over the caller's buffer
+    memmove(inbuf, outbuf, N*sz);
+    delete[] outbuf;
+}
+
+#if 0
+// Multithreaded shuffle-then-sort: decrypt the N input blocks, shuffle
+// the plaintext with the nthreads overload of OblivWaksmanShuffle, sort
+// the shuffled blocks with mtsort, and encrypt the outcome into
+// result_buffer.  A thread pool of nthreads threads is brought up for
+// the duration of the call.  ret->qsort_time receives the time spent in
+// mtsort alone and ret->ptime the time for shuffle plus sort.
+void DecryptAndMTSS(unsigned char *encrypted_buffer, uint32_t N,
+  size_t encrypted_block_size, threadid_t nthreads,
+  unsigned char *result_buffer, enc_ret *ret) {
+  // decryptBuffer fills in the plaintext pointer and returns the size
+  // of one plaintext block
+  unsigned char *plain = NULL;
+  const size_t plain_block_size = decryptBuffer(encrypted_buffer,
+    (uint64_t) N, encrypted_block_size, &plain);
+
+  // Phase 1: shuffle.  The thread pool is started before the clock, so
+  // its startup is not part of ptime
+  threadpool_init(nthreads);
+  double t_start, t_shuffled, t_sorted;
+  ocall_wallclock(&t_start, 0);
+  ocall_wallclock(&t_start, 1);
+  PRB_pool_init(nthreads);
+  OblivWaksmanShuffle(plain, N, plain_block_size, nthreads, ret);
+  #ifdef COUNT_OSWAPS
+    ret->OSWAP_count = OSWAP_COUNTER;
+  #endif
+
+  // Phase 2: multithreaded sort of the shuffled blocks
+  ocall_wallclock(&t_shuffled, 1);
+  mtsort(plain, N, plain_block_size, nthreads);
+  ocall_wallclock(&t_sorted, 1);
+  ret->qsort_time = t_sorted - t_shuffled;
+  ret->ptime = t_sorted - t_start;
+
+  // Re-encrypt the sorted plaintext, then release everything
+  encryptBuffer(plain, (uint64_t) N, plain_block_size, result_buffer);
+  PRB_pool_shutdown();
+  threadpool_shutdown();
+  free(plain);
+}
+#endif

+ 687 - 0
Enclave/OblivAlgs/WaksmanNetwork.hpp

@@ -0,0 +1,687 @@
+#ifndef __WAKSMANNETWORK_HPP__
+#define __WAKSMANNETWORK_HPP__
+
+#include <unordered_map>
+#include <vector>
+#include "oasm_lib.h"
+#include <sgx_tcrypto.h>
+#include "utils.hpp"
+#include "RecursiveShuffle.hpp"
+#include "aes.hpp"
+
+// #define PROFILE_MTAPPLYPERM
+
+typedef __uint128_t randkey_t;
+#define FPERM_OSWAP_STYLE OSWAP_8_16X // OSwap_Style for forward perm (consistent w/ randkey_t)
+
+// A struct to hold a multi-thread evaluation plan for a
+// WaksmanNetwork of a given size and number of threads
+
+// If you have a WaksmanNetwork with N>2 items, and you want to apply
+// it using nthreads>1 threads, this WaksmanNetwork will itself use
+// the first (N-1)/2 input switches and N/2 output switches (in
+// addition to those used by its subnetworks); it will recurse into
+// the left subnetwork containing (N+1)/2 items and (nthreads+1)/2
+// threads, and into the right subnetwork containing N/2 items and
+// nthreads/2 threads.  If N=2, then there is just a single output
+// switch and no subnetworks.  If N<2, there are no switches and no
+// subnetworks.  If nthreads=1, we compute the total number of input
+// and output switches used by this WaksmanNetwork and its
+// subnetworks, but just store the totals in this WNEvalPlan with no
+// WNEvalPlans for the subnetworks.
+
+struct WNEvalPlan {
+  // The number of items for the represented WaksmanNetwork
+  uint32_t N;
+  // The number of threads to use to evaluate it
+  uint32_t nthreads;
+  // The total number of input and output switches used by this
+  // WaksmanNetwork and its subnetworks
+  size_t subtree_num_inswitches, subtree_num_outswitches;
+  // If N>2 and nthreads>1, these are the evaluation plans for the
+  // subnetworks.  This vector will contain 0 (if N<=2 or nthreads<=1)
+  // or 2 (otherwise) items.
+  std::vector<WNEvalPlan> subplans;
+
+  // Build the plan for a WaksmanNetwork of N items evaluated by
+  // nthreads threads.  Plans for the two subnetworks are built
+  // recursively only when the network can actually be split between
+  // threads (N>2 and nthreads>1); otherwise this is a leaf plan that
+  // just records the switch totals for its whole subtree.
+  WNEvalPlan(uint32_t N, uint32_t nthreads) : N(N), nthreads(nthreads),
+      subtree_num_inswitches(0), subtree_num_outswitches(0) {
+    if (N <= 2 || nthreads <= 1) {
+      // count_switches yields 0/0 for N<2 and 0/1 for N==2, so it
+      // covers every leaf case
+      count_switches(N);
+      return;
+    }
+
+    const uint32_t Nleft = (N+1)/2;
+    const uint32_t Nright = N/2;
+    const uint32_t numInSwitches = (N-1)/2;
+    const uint32_t numOutSwitches = N/2;
+    const uint32_t nthr_left = (nthreads+1)/2;
+    const uint32_t nthr_right = nthreads/2;
+    subplans.reserve(2);
+    subplans.emplace_back(Nleft, nthr_left);
+    subplans.emplace_back(Nright, nthr_right);
+    subtree_num_inswitches = numInSwitches +
+        subplans[0].subtree_num_inswitches +
+        subplans[1].subtree_num_inswitches;
+    subtree_num_outswitches = numOutSwitches +
+        subplans[0].subtree_num_outswitches +
+        subplans[1].subtree_num_outswitches;
+  }
+
+  // Count the number of input and output switches used by a
+  // WaksmanNetwork with N items.  Add those values to
+  // subtree_num_inswitches and subtree_num_outswitches.
+  void count_switches(uint32_t N) {
+    if (N<2) {
+      return;
+    }
+    if (N == 2) {
+      subtree_num_outswitches += 1;
+      return;
+    }
+    const uint32_t Nleft = (N+1)/2;
+    const uint32_t Nright = N/2;
+    const uint32_t numInSwitches = (N-1)/2;
+    const uint32_t numOutSwitches = N/2;
+
+    subtree_num_inswitches += numInSwitches;
+    subtree_num_outswitches += numOutSwitches;
+    count_switches(Nleft);
+    count_switches(Nright);
+  }
+
+  // Print this plan and, indented by two more columns per level, its
+  // subplans.  Every value is cast to unsigned long so that it matches
+  // the %lu conversions (N and nthreads are uint32_t, which must not be
+  // passed to %lu directly).
+  void dump(int indent = 0) const {
+    printf("%*sN = %lu, nthreads = %lu, inswitches = %lu, outswitches = %lu\n",
+        indent, "", (unsigned long)N, (unsigned long)nthreads,
+        (unsigned long)subtree_num_inswitches,
+        (unsigned long)subtree_num_outswitches);
+    if (subplans.size() > 0) {
+      subplans[0].dump(indent+2);
+      subplans[1].dump(indent+2);
+    }
+  }
+};
+
+/*
+  WaksmanNetwork Class: Contains a Waksman permutation network and can apply it to an input array.
+    setPermutation(uint32_t *permutation): Takes permutation as an array of N index values
+      (i.e. values in [N]) and sets the Waksman network switches to that permutation. The
+      private recursive overload additionally receives the preallocated working memory (WNMem).
+    applyPermutation<oswap_style>(unsigned char *buf, size_t block_size): Takes buffer of N items
+      of block_size bytes each and applies stored permutation in-place (i.e. modifying input
+      buffer).
+    applyInversePermutation<oswap_style>(unsigned char *buf, size_t block_size): Takes buffer of
+      N items of block_size bytes each and applies inverse of stored permutation in-place.
+    applyInversePermutation<oswap_style>(unsigned char *buf, size_t block_size,
+      const WNEvalPlan &plan): As above, but evaluated with multiple threads according to plan.
+*/
+
+class WaksmanNetwork { // a Waksman permutation network over Ntotal items, stored as two flat switch arrays
+  uint32_t Ntotal; // number of items to permute
+  std::vector<uint32_t> inSwitchVec; // input layer of (numbered) switches; the apply routines use only bit 0 as the swap flag
+  std::vector<uint8_t> outSwitchVec; // output layer of switches
+
+  // A struct to keep track of the current subnet number, and input and
+  // output switches, for each subnet as we traverse the network.
+  struct WNTraversal {
+    uint64_t subnetNumber; // incremented once per subnetwork by the single-threaded apply routines
+    uint32_t *inSwitches; // first input switch not yet consumed
+    uint8_t *outSwitches; // first output switch not yet consumed
+
+    WNTraversal(WaksmanNetwork &wn) : subnetNumber(0),
+        inSwitches(wn.inSwitchVec.data()),
+        outSwitches(wn.outSwitchVec.data()) {} // start at the first switch of each array
+  };
+
+  struct appInvPermArgs { // argument bundle passed as void* to applyInversePermutation_launch
+      WaksmanNetwork &wn;
+      unsigned char *buf;
+      size_t block_size;
+      const WNEvalPlan &plan;
+      WNTraversal &traversal;
+
+      appInvPermArgs(WaksmanNetwork &wn, unsigned char *buf,
+          size_t block_size, const WNEvalPlan &plan, WNTraversal &traversal)
+          : wn(wn), buf(buf), block_size(block_size), plan(plan),
+          traversal(traversal) {}
+  };
+
+  template <OSwap_Style oswap_style>
+  static void *applyInversePermutation_launch(void *voidarg) // thread entry point: unpacks an appInvPermArgs and runs the plan-based inverse permutation
+  {
+      appInvPermArgs *arg = (appInvPermArgs *)voidarg;
+      arg->wn.applyInversePermutation<oswap_style>(arg->buf, arg->block_size,
+          arg->plan, arg->traversal);
+      return NULL;
+  }
+
+  // A struct to hold pre-allocated memory (and AES keys) so that we
+  // only allocate memory once, before the recursive setPermutation is
+  // called.
+  struct WNMem { // NOTE(review): owns raw new/new[] storage but its copy operations are not deleted; a copy would double-free -- it is only taken by reference in the declarations visible here
+    unsigned char *forward_perm;
+    uint32_t *unselected_cnt;
+    std::unordered_map<randkey_t, std::pair<uint32_t, uint32_t>> *reverse_perm;
+    AESkey forward_key, reverse_key;
+
+    WNMem(WaksmanNetwork &wn) {
+        // Round Ntotal up to an even number
+        uint32_t Neven = wn.Ntotal + (wn.Ntotal&1);
+        forward_perm = new unsigned char[Neven * (sizeof(randkey_t) + 8)]; // sizeof(randkey_t)+8 bytes for each of Neven entries
+        unselected_cnt = new uint32_t[wn.Ntotal];
+        reverse_perm = new std::unordered_map<randkey_t, std::pair<uint32_t, uint32_t>>;
+        __m128i forward_rawkey, reverse_rawkey;
+        getRandomBytes((unsigned char *) &forward_rawkey, sizeof(forward_rawkey)); // fresh raw keys for every WNMem
+        getRandomBytes((unsigned char *) &reverse_rawkey, sizeof(reverse_rawkey));
+        AES_128_Key_Expansion(forward_key, forward_rawkey);
+        AES_128_Key_Expansion(reverse_key, reverse_rawkey);
+    }
+
+    ~WNMem() {
+        delete[] forward_perm;
+        delete[] unselected_cnt;
+        delete reverse_perm;
+    }
+  };
+
+  void setPermutation(uint32_t *permutation, uint32_t N,
+    uint32_t depth, WNTraversal &traversal, const WNMem &mem); // recursive worker; defined outside this header
+
+  template <OSwap_Style oswap_style>
+  void applyPermutation(unsigned char *buf, uint32_t N,
+    size_t block_size, WNTraversal &traversal); // recursive single-threaded worker
+
+  template <OSwap_Style oswap_style>
+  void applyInversePermutation(unsigned char *buf, uint32_t N,
+    size_t block_size, WNTraversal &traversal); // recursive single-threaded worker
+
+  template <OSwap_Style oswap_style>
+  void applyInversePermutation(unsigned char *buf, size_t block_size,
+    const WNEvalPlan &plan, WNTraversal &traversal); // recursive multithreaded worker driven by plan
+
+public:
+
+  // Set up the WaksmanNetwork for N items.  N need not be a power of 2.
+  // N <= 2^31
+  WaksmanNetwork(uint32_t N);
+
+  void setPermutation(uint32_t *permutation); // set the switches to realise the given permutation
+
+  template <OSwap_Style oswap_style>
+  void applyPermutation(unsigned char *buf, size_t block_size); // permute buf in place
+
+  template <OSwap_Style oswap_style>
+  void applyInversePermutation(unsigned char *buf, size_t block_size); // apply the inverse permutation to buf in place
+
+  template <OSwap_Style oswap_style>
+  void applyInversePermutation(unsigned char *buf, size_t block_size,
+    const WNEvalPlan &plan); // as above, using the threads described by plan
+
+};
+
+// Define this to show the intermediate states of applyPermutation
+// #define SHOW_APPLYPERM
+
+// Permute the Ntotal items in buf in place according to the stored
+// control bits; each item is block_size bytes.  A network of fewer than
+// two items is left untouched.
+template <OSwap_Style oswap_style>
+void WaksmanNetwork::applyPermutation(unsigned char *buf, size_t block_size) {
+    FOAV_SAFE_CNTXT(AP, Ntotal)
+    if (Ntotal <= 1) {
+        return;
+    }
+    WNTraversal cursor(*this);
+    applyPermutation<oswap_style>(buf, Ntotal, block_size, cursor);
+}
+
+// Recursive worker for applyPermutation.  Permutes the N items starting
+// at buf in place: input switches first, then the two subnetworks, then
+// the output switches.  The switches are taken from traversal, which is
+// advanced past everything this subnetwork (and its children) consumes.
+template <OSwap_Style oswap_style>
+void WaksmanNetwork::applyPermutation(unsigned char *buf, uint32_t N,
+    size_t block_size, WNTraversal &traversal) {
+
+  FOAV_SAFE_CNTXT(AP, Ntotal)
+  FOAV_SAFE_CNTXT(AP, N)
+  if (N < 2) return;
+
+  const uint32_t leftCount = (N+1)/2;
+  const uint32_t rightCount = N/2;
+  const uint32_t inCount = (N-1)/2;
+  const uint32_t outCount = N/2;
+  // The switches belonging to this level of the network
+  const uint32_t *const levelIn = traversal.inSwitches;
+  const uint8_t *const levelOut = traversal.outSwitches;
+
+  // Step the traversal over this level before recursing
+  traversal.subnetNumber += 1;
+  traversal.inSwitches += inCount;
+  traversal.outSwitches += outCount;
+
+#ifdef SHOW_APPLYPERM
+  printf("s");
+  for(uint32_t k=0;k<N;++k) {
+    printf(" %2d", *(uint32_t*)(buf+block_size*k));
+  }
+  printf("\n");
+#endif
+
+  if (N == 2) {
+    // A two-item network is a single output switch
+#ifdef SHOW_APPLYPERM
+    printf("o");
+    for(uint32_t k=0;k<outCount;++k) {
+      printf(" %s", levelOut[k] ? " X" : "||");
+    }
+    printf("\n");
+#endif
+    oswap_buffer<oswap_style>(buf, buf + block_size, (uint32_t) block_size, levelOut[0]);
+#ifdef SHOW_APPLYPERM
+    printf("e");
+    for(uint32_t k=0;k<N;++k) {
+      printf(" %2d", *(uint32_t*)(buf+block_size*k));
+    }
+    printf("\n");
+#endif
+    return;
+  }
+
+#ifdef SHOW_APPLYPERM
+  printf("i");
+  for(uint32_t k=0;k<inCount;++k) {
+    printf(" %s", (levelIn[k]&1) ? " X" : "||");
+  }
+  printf("\n");
+#endif
+  // Input layer: switch k conditionally exchanges items k and leftCount+k
+  for (uint32_t k=0; k<inCount; ++k) {
+    oswap_buffer<oswap_style>(buf + block_size*k, buf + block_size*(leftCount+k), block_size,
+      levelIn[k]&1);
+  }
+#ifdef SHOW_APPLYPERM
+  printf(" ");
+  for(uint32_t k=0;k<N;++k) {
+    printf(" %2d", *(uint32_t*)(buf+block_size*k));
+  }
+  printf("\n");
+#endif
+
+  // Left subnetwork on the first leftCount items, right subnetwork on
+  // the remaining rightCount items
+  applyPermutation<oswap_style>(buf, leftCount, block_size, traversal);
+  applyPermutation<oswap_style>(buf + block_size*leftCount, rightCount,
+      block_size, traversal);
+
+#ifdef SHOW_APPLYPERM
+  printf("r");
+  for(uint32_t k=0;k<N;++k) {
+    printf(" %2d", *(uint32_t*)(buf+block_size*k));
+  }
+  printf("\n");
+  printf("o");
+  for(uint32_t k=0;k<outCount;++k) {
+    printf(" %s", levelOut[k] ? " X" : "||");
+  }
+  printf("\n");
+#endif
+  // Output layer: same pairing as the input layer
+  for (uint32_t k=0; k<outCount; ++k) {
+    oswap_buffer<oswap_style>(buf + block_size*k, buf + block_size*(leftCount+k), block_size,
+      levelOut[k]);
+  }
+#ifdef SHOW_APPLYPERM
+  printf("e");
+  for(uint32_t k=0;k<N;++k) {
+    printf(" %2d", *(uint32_t*)(buf+block_size*k));
+  }
+  printf("\n");
+#endif
+}
+
+// Apply the inverse of the permutation encoded by the control bits to
+// the items in buf, in place, using the multithread evaluation plan
+// given in plan.  A network of fewer than two items is left untouched.
+template <OSwap_Style oswap_style>
+void WaksmanNetwork::applyInversePermutation(unsigned char *buf,
+        size_t block_size, const WNEvalPlan &plan) {
+    FOAV_SAFE_CNTXT(AP, Ntotal)
+    if (Ntotal <= 1) {
+        return;
+    }
+    WNTraversal cursor(*this);
+    applyInversePermutation<oswap_style>(buf, block_size, plan, cursor);
+}
+
+// Work description for one applySwitchesRange call.  CBT is the
+// control bit type of the switch array.
+template <typename CBT>
+struct ApplySwitchesArgs {
+    unsigned char *buf;     // start of the item buffer
+    size_t block_size;      // size of each item in bytes
+    const CBT* switches;    // switch array; entry i controls switch i
+    uint32_t swStart;       // first switch index to apply
+    uint32_t swEnd;         // one past the last switch index to apply
+    uint32_t stride;        // switch i pairs item i with item stride+i
+};
+
+// Thread-pool worker: applies switches [swStart, swEnd) of one input or
+// output layer.  voidargs points to an ApplySwitchesArgs<CBT>; CBT is
+// the control bit type (uint32_t for input switches or uint8_t for
+// output switches).  Only bit 0 of each switch value is used.  Always
+// returns NULL.
+template <OSwap_Style oswap_style, typename CBT>
+static void* applySwitchesRange(void *voidargs)
+{
+    const ApplySwitchesArgs<CBT> &job =
+        *(const ApplySwitchesArgs<CBT> *)voidargs;
+    unsigned char *const items = job.buf;
+    const size_t block_size = job.block_size;
+    const CBT *const control = job.switches;
+    const uint32_t stride = job.stride;
+    const uint32_t swEnd = job.swEnd;
+
+    FOAV_SAFE_CNTXT(applySwitchesRange, swEnd)
+    for (uint32_t i=job.swStart; i<swEnd; ++i) {
+        FOAV_SAFE2_CNTXT(applySwitchesRange, i, swEnd)
+        // Switch i conditionally exchanges items i and stride+i
+        oswap_buffer<oswap_style>(items + block_size*(i),
+            items + block_size*(stride+i), block_size,
+            (control[i])&1);
+    }
+
+    return NULL;
+}
+
+// Apply a consecutive sequence of input or output switches using
+// up to nthreads threads.  CBT is the control bit type (uint32_t for
+// input switches or uint8_t for output switches), but it will be
+// deduced automatically from the type of the switches argument.
+//
+// Switch i (0 <= i < numSwitches) conditionally exchanges items i and
+// stride+i of buf, each block_size bytes long.  The switches are split
+// into nthreads contiguous ranges whose sizes differ by at most one;
+// the calling thread handles the first range and pool threads
+// g_thread_id+1 .. g_thread_id+nthreads-1 handle the rest.  The call
+// returns once every range has been applied.  nthreads == 0 is treated
+// as 1 (it previously divided by zero).
+template <OSwap_Style oswap_style, typename CBT>
+static void applySwitches(unsigned char *buf, size_t block_size,
+    const CBT* switches, uint32_t numSwitches, uint32_t stride,
+    uint32_t nthreads)
+{
+    const uint32_t threads_to_use = nthreads > 0 ? nthreads : 1;
+    // One argument record per thread; they must stay alive until the
+    // joins below.  (A std::vector replaces the former variable-length
+    // array, which is not standard C++.)
+    std::vector<ApplySwitchesArgs<CBT>> asargs(threads_to_use);
+    const uint32_t inc = numSwitches / threads_to_use;
+    const uint32_t extra = numSwitches % threads_to_use;
+    uint32_t last = 0;
+
+    for (uint32_t t=0; t<threads_to_use; ++t) {
+        // The first `extra` ranges take one additional switch each
+        const uint32_t next = last + inc + (t < extra);
+        asargs[t] = { buf, block_size, switches, last, next, stride };
+        last = next;
+        if (t > 0) {
+            threadpool_dispatch(g_thread_id+t,
+                applySwitchesRange<oswap_style,CBT>, &asargs[t]);
+        }
+    }
+    // Do the first block ourselves
+    applySwitchesRange<oswap_style,CBT>(&asargs[0]);
+    for (uint32_t t=1; t<threads_to_use; ++t) {
+        threadpool_join(g_thread_id+t, NULL);
+    }
+}
+
+// Apply inverse of permutation encoded by control bits to data elements
+// in buffer using a multithread evaluation plan. Permutes in place.
+//
+// buf holds plan.N items of block_size bytes each.  traversal points at
+// the first input and output switch of this subnetwork; on return it
+// has been advanced past every switch of this subnetwork and of its
+// subnetworks.  The output switches are applied first, then the two
+// subnetworks (the right one on pool thread g_thread_id + nthr_left,
+// the left one on the calling thread), then the input switches.
+//
+// Leaf plans are evaluated by the single-threaded routine.  This
+// includes N == 2 with nthreads > 1: such a plan has no subplans, so
+// plan.subplans must not be indexed for it.
+template <OSwap_Style oswap_style>
+void WaksmanNetwork::applyInversePermutation(unsigned char *buf,
+    size_t block_size, const WNEvalPlan &plan, WNTraversal &traversal) {
+
+  const uint32_t N = plan.N;
+  const uint32_t nthreads = plan.nthreads;
+
+  FOAV_SAFE_CNTXT(AIP, N)
+  if (N < 2) return;
+  if (nthreads <= 1 || plan.subplans.size() < 2) {
+#ifdef PROFILE_MTAPPLYPERM
+unsigned long start = printf_with_rtclock("Thread %u starting single-threaded applyInversePermutation(N=%lu)\n", g_thread_id, N);
+#endif
+    applyInversePermutation<oswap_style>(buf, N, block_size, traversal);
+#ifdef PROFILE_MTAPPLYPERM
+printf_with_rtclock_diff(start, "Thread %u ending single-threaded applyInversePermutation(N=%lu)\n", g_thread_id, N);
+#endif
+    return;
+  }
+
+  // From here on N > 2 and the plan has both subplans.
+
+#ifdef PROFILE_MTAPPLYPERM
+unsigned long start = printf_with_rtclock("Thread %u starting applyInversePermutation(N=%lu, nthreads=%lu)\n", g_thread_id, N, nthreads);
+#endif
+
+  const uint32_t Nleft = (N+1)/2;
+  const uint32_t numInSwitches = (N-1)/2;
+  const uint32_t numOutSwitches = N/2;
+  const uint32_t *inSwitch = traversal.inSwitches;
+  const uint8_t *outSwitch = traversal.outSwitches;
+  const uint32_t nthr_left = (nthreads+1)/2;
+
+  // The left subnetwork's switches follow this level's switches; the
+  // right subnetwork's switches follow the whole left subtree.  The
+  // left subnetwork gets its own traversal copy so that both
+  // subnetworks can run concurrently.
+  WNTraversal lefttraversal = traversal;
+  lefttraversal.inSwitches += numInSwitches;
+  lefttraversal.outSwitches += numOutSwitches;
+  traversal.inSwitches +=
+    plan.subplans[0].subtree_num_inswitches + numInSwitches;
+  traversal.outSwitches +=
+    plan.subplans[0].subtree_num_outswitches + numOutSwitches;
+
+#ifdef SHOW_APPLYPERM
+  printf("s");
+  for(uint32_t i=0;i<N;++i) {
+    printf(" %2d", *(uint32_t*)(buf+block_size*i));
+  }
+  printf("\n");
+#endif
+
+  // Apply output switches to permutation
+#ifdef SHOW_APPLYPERM
+  printf("o");
+  for(uint32_t i=0;i<numOutSwitches;++i) {
+    printf(" %s", outSwitch[i] ? " X" : "||");
+  }
+  printf("\n");
+#endif
+#ifdef PROFILE_MTAPPLYPERM
+unsigned long outswstart = printf_with_rtclock("Thread %u starting output switches (N=%lu, nthreads=%lu)\n", g_thread_id, N, nthreads);
+#endif
+  applySwitches<oswap_style>(buf, block_size, outSwitch, numOutSwitches,
+      Nleft, nthreads);
+#ifdef PROFILE_MTAPPLYPERM
+printf_with_rtclock_diff(outswstart, "Thread %u ending output switches (N=%lu, nthreads=%lu)\n", g_thread_id, N, nthreads);
+#endif
+#ifdef SHOW_APPLYPERM
+  printf(" ");
+  for(uint32_t i=0;i<N;++i) {
+    printf(" %2d", *(uint32_t*)(buf+block_size*i));
+  }
+  printf("\n");
+#endif
+
+  // Apply subnetwork switches
+  threadid_t rightthreadid = g_thread_id + nthr_left;
+  appInvPermArgs rightargs(*this, buf + block_size*Nleft,
+      block_size, plan.subplans[1], traversal);
+  threadpool_dispatch(rightthreadid,
+      applyInversePermutation_launch<oswap_style>, &rightargs);
+  applyInversePermutation<oswap_style>(buf, block_size,
+      plan.subplans[0], lefttraversal);
+  threadpool_join(rightthreadid, NULL);
+
+  // Apply input switches to permutation
+#ifdef SHOW_APPLYPERM
+  printf("r");
+  for(uint32_t i=0;i<N;++i) {
+    printf(" %2d", *(uint32_t*)(buf+block_size*i));
+  }
+  printf("\n");
+  printf("i");
+  for(uint32_t i=0;i<numInSwitches;++i) {
+    printf(" %s", (inSwitch[i]&1) ? " X" : "||");
+  }
+  printf("\n");
+#endif
+#ifdef PROFILE_MTAPPLYPERM
+unsigned long inswstart = printf_with_rtclock("Thread %u starting input switches (N=%lu, nthreads=%lu)\n", g_thread_id, N, nthreads);
+#endif
+  applySwitches<oswap_style>(buf, block_size, inSwitch, numInSwitches,
+      Nleft, nthreads);
+#ifdef PROFILE_MTAPPLYPERM
+printf_with_rtclock_diff(inswstart, "Thread %u ending input switches (N=%lu, nthreads=%lu)\n", g_thread_id, N, nthreads);
+#endif
+#ifdef SHOW_APPLYPERM
+  printf("e");
+  for(uint32_t i=0;i<N;++i) {
+    printf(" %2d", *(uint32_t*)(buf+block_size*i));
+  }
+  printf("\n");
+#endif
+
+#ifdef PROFILE_MTAPPLYPERM
+printf_with_rtclock_diff(start, "Thread %u ending applyInversePermutation(N=%lu, nthreads=%lu)\n", g_thread_id, N, nthreads);
+#endif
+
+}
+
+
+// Apply the inverse of the stored permutation to the Ntotal items in
+// buf, in place and on the calling thread; each item is block_size
+// bytes.  A network of fewer than two items is left untouched.
+template <OSwap_Style oswap_style>
+void WaksmanNetwork::applyInversePermutation(unsigned char *buf, size_t block_size) {
+    FOAV_SAFE_CNTXT(AIP, Ntotal)
+    if (Ntotal <= 1) {
+        return;
+    }
+    WNTraversal cursor(*this);
+    applyInversePermutation<oswap_style>(buf, Ntotal, block_size,
+        cursor);
+}
+
+// Recursive single-threaded worker: applies the inverse of the permutation in the control bits to the N items at buf, in place, consuming switches from traversal.
+template <OSwap_Style oswap_style>
+void WaksmanNetwork::applyInversePermutation(unsigned char *buf,
+    uint32_t N, size_t block_size, WNTraversal &traversal) {
+  FOAV_SAFE_CNTXT(AIP, N)
+  if (N < 2) return; // no switches for fewer than two items
+
+  const uint32_t Nleft = (N+1)/2; // items handled by the left subnetwork
+  const uint32_t Nright = N/2; // items handled by the right subnetwork
+  const uint32_t numInSwitches = (N-1)/2;
+  const uint32_t numOutSwitches = N/2;
+  const uint32_t *inSwitch = traversal.inSwitches; // this level's input switches
+  const uint8_t *outSwitch = traversal.outSwitches; // this level's output switches
+
+  traversal.subnetNumber += 1;
+  traversal.inSwitches += numInSwitches; // step past this level so the subnetworks see their own switches
+  traversal.outSwitches += numOutSwitches;
+
+#ifdef SHOW_APPLYPERM
+  printf("s");
+  for(uint32_t i=0;i<N;++i) {
+    printf(" %2d", *(uint32_t*)(buf+block_size*i));
+  }
+  printf("\n");
+#endif
+
+  FOAV_SAFE_CNTXT(AIP, N) // same annotation as at the top of the function
+  if (N == 2) { // a two-item network is a single output switch
+#ifdef SHOW_APPLYPERM
+    printf("o");
+    for(uint32_t i=0;i<numOutSwitches;++i) {
+      printf(" %s", outSwitch[i] ? " X" : "||");
+    }
+    printf("\n");
+#endif
+    oswap_buffer<oswap_style>(buf, buf + block_size, (uint32_t) block_size,
+        outSwitch[0]);
+#ifdef SHOW_APPLYPERM
+    printf("e");
+    for(uint32_t i=0;i<N;++i) {
+      printf(" %2d", *(uint32_t*)(buf+block_size*i));
+    }
+    printf("\n");
+#endif
+  } else {
+    // Apply output switches to permutation
+#ifdef SHOW_APPLYPERM
+    printf("o");
+    for(uint32_t i=0;i<numOutSwitches;++i) {
+      printf(" %s", outSwitch[i] ? " X" : "||");
+    }
+    printf("\n");
+#endif
+    FOAV_SAFE_CNTXT(AIP, numOutSwitches)
+    for (uint32_t i=0; i<numOutSwitches; i++) { // switch i pairs item i with item Nleft+i
+    FOAV_SAFE2_CNTXT(AIP, i, numOutSwitches)
+      oswap_buffer<oswap_style>(buf + block_size*i, buf + block_size*(Nleft+i), block_size,
+        *outSwitch);
+      ++outSwitch;
+    }
+#ifdef SHOW_APPLYPERM
+    printf(" ");
+    for(uint32_t i=0;i<N;++i) {
+      printf(" %2d", *(uint32_t*)(buf+block_size*i));
+    }
+    printf("\n");
+#endif
+
+    // Apply subnetwork switches
+    applyInversePermutation<oswap_style>(buf, Nleft,
+        block_size, traversal); // left subnetwork: first Nleft items
+    applyInversePermutation<oswap_style>(buf + block_size*Nleft, Nright,
+        block_size, traversal); // right subnetwork: remaining Nright items
+
+    // Apply input switches to permutation
+#ifdef SHOW_APPLYPERM
+    printf("r");
+    for(uint32_t i=0;i<N;++i) {
+      printf(" %2d", *(uint32_t*)(buf+block_size*i));
+    }
+    printf("\n");
+    printf("i");
+    for(uint32_t i=0;i<numInSwitches;++i) {
+      printf(" %s", (inSwitch[i]&1) ? " X" : "||");
+    }
+    printf("\n");
+#endif
+    const uint32_t *curInSwitchVal = inSwitch;
+    FOAV_SAFE_CNTXT(AIP, numInSwitches)
+    for (uint32_t i=0; i<numInSwitches; i++) { // same pairing as the output layer
+    FOAV_SAFE2_CNTXT(AIP, i, numInSwitches)
+      oswap_buffer<oswap_style>(buf + block_size*(i), buf + block_size*(Nleft+i), block_size,
+        (*curInSwitchVal&1)); // only bit 0 of an input switch value is the swap flag
+      curInSwitchVal += 1;
+    }
+#ifdef SHOW_APPLYPERM
+    printf("e");
+    for(uint32_t i=0;i<N;++i) {
+      printf(" %2d", *(uint32_t*)(buf+block_size*i));
+    }
+    printf("\n");
+#endif
+  }
+}
+
+
+#if 0 /* Disabled prototypes for the shuffle/sort driver functions.
+       * NOTE(review): they do not match the disabled definition of
+       * DecryptAndMTSS, which takes nthreads as threadid_t and calls
+       * OblivWaksmanShuffle with an extra nthreads argument; reconcile
+       * the signatures before re-enabling. */
+void OblivWaksmanShuffle(unsigned char *buffer, uint32_t N, size_t block_size, enc_ret *ret);
+
+void DecryptAndOblivWaksmanShuffle(unsigned char *encrypted_buffer, uint32_t N,
+  size_t encrypted_block_size, unsigned char *result_buffer, enc_ret *ret);
+
+void DecryptAndOWSS(unsigned char *encrypted_buffer, uint32_t N,
+  size_t encrypted_block_size, unsigned char *result_buffer, enc_ret *ret);
+
+void DecryptAndMTSS(unsigned char *encrypted_buffer, uint32_t N,
+  size_t encrypted_block_size, size_t nthreads,
+  unsigned char *result_buffer, enc_ret *ret);
+#endif
+
+#endif

+ 127 - 0
Enclave/OblivAlgs/aes.hpp

@@ -0,0 +1,127 @@
+#ifndef __AES_HPP__
+#define __AES_HPP__
+
+/* Based on reference code from the Intel AES-NI whitepaper
+ * http://www.intel.com/content/dam/doc/white-paper/advanced-encryption-standard-new-instructions-set-paper.pdf
+ */
+
+/* Extracted from /usr/lib/gcc/x86_64-linux-gnu/11/include/emmintrin.h and
+   /usr/lib/gcc/x86_64-linux-gnu/11/include/wmmintrin.h */
+
+typedef int __v4si __attribute__ ((__vector_size__ (16))); // 16-byte vector of int elements
+typedef unsigned long long __v2du __attribute__ ((__vector_size__ (16))); // 16-byte vector of unsigned long long elements
+typedef long long __v2di __attribute__ ((__vector_size__ (16))); // 16-byte vector of long long elements
+typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__)); // general 128-bit integer vector; __may_alias__ permits access through it regardless of the underlying type
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shuffle_epi32 (__m128i __A, const int __mask)
+{ // rearranges the 32-bit elements of __A as selected by __mask (PSHUFD builtin)
+  return (__m128i)__builtin_ia32_pshufd ((__v4si)__A, __mask);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_slli_si128 (__m128i __A, const int __N)
+{ // shifts the whole 128-bit value left by __N bytes; the builtin is given the amount in bits, hence __N * 8
+  return (__m128i)__builtin_ia32_pslldqi128 (__A, __N * 8);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_xor_si128 (__m128i __A, __m128i __B)
+{ // bitwise XOR of the two 128-bit values
+  return (__m128i) ((__v2du)__A ^ (__v2du)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_aeskeygenassist_si128 (__m128i __X, const int __C)
+{ // AES key-generation assist on __X with round constant __C (AESKEYGENASSIST builtin)
+  return (__m128i) __builtin_ia32_aeskeygenassist128 ((__v2di)__X, __C);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_aesenc_si128 (__m128i __X, __m128i __Y)
+{ // one AES encryption round on state __X with round key __Y (AESENC builtin)
+  return (__m128i) __builtin_ia32_aesenc128 ((__v2di)__X, (__v2di)__Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_aesenclast_si128 (__m128i __X, __m128i __Y)
+{ // final AES encryption round on state __X with round key __Y (AESENCLAST builtin)
+  return (__m128i) __builtin_ia32_aesenclast128 ((__v2di)__X, (__v2di)__Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_epi64x (long long __q1, long long __q0)
+{ // builds a vector whose element 0 is __q0 and whose element 1 is __q1
+  return __extension__ (__m128i)(__v2di){ __q0, __q1 };
+}
+
+
+using AESkey = __m128i[11]; // expanded AES-128 key: the 11 round keys written by AES_128_Key_Expansion (indices 0..10)
+
+// Key-schedule step: combines the previous round key (temp1) with the
+// key-generation assist value (temp2) and returns the next round key.
+static inline __m128i AES_128_ASSIST (__m128i temp1, __m128i temp2)
+{
+    // Replicate the top 32-bit element of the assist value everywhere
+    const __m128i assist = _mm_shuffle_epi32 (temp2, 0xff);
+
+    // XOR the previous key with itself shifted left by 4, 8 and 12 bytes
+    __m128i shifted = temp1;
+    __m128i next = temp1;
+    for (int step = 0; step < 3; ++step) {
+        shifted = _mm_slli_si128 (shifted, 0x4);
+        next = _mm_xor_si128 (next, shifted);
+    }
+
+    return _mm_xor_si128 (next, assist);
+}
+
+// Derive the next round key from the previous one.  The round constant
+// is a template parameter so that it reaches
+// _mm_aeskeygenassist_si128 as a compile-time constant.
+template <int RCON>
+static inline __m128i AES_128_Next_Round_Key (__m128i prev)
+{
+    return AES_128_ASSIST(prev, _mm_aeskeygenassist_si128 (prev, RCON));
+}
+
+// Expand the 128-bit rawkey into the 11 round keys of key: key[0] is
+// rawkey itself and each later entry is derived from the one before it.
+static inline void AES_128_Key_Expansion (AESkey &key, __m128i rawkey)
+{
+    key[0]  = rawkey;
+    key[1]  = AES_128_Next_Round_Key<0x1>(key[0]);
+    key[2]  = AES_128_Next_Round_Key<0x2>(key[1]);
+    key[3]  = AES_128_Next_Round_Key<0x4>(key[2]);
+    key[4]  = AES_128_Next_Round_Key<0x8>(key[3]);
+    key[5]  = AES_128_Next_Round_Key<0x10>(key[4]);
+    key[6]  = AES_128_Next_Round_Key<0x20>(key[5]);
+    key[7]  = AES_128_Next_Round_Key<0x40>(key[6]);
+    key[8]  = AES_128_Next_Round_Key<0x80>(key[7]);
+    key[9]  = AES_128_Next_Round_Key<0x1b>(key[8]);
+    key[10] = AES_128_Next_Round_Key<0x36>(key[9]);
+}
+
+// Encrypt the single 128-bit block plaintext with the expanded key and
+// store the result in ciphertext.
+static inline void AES_ECB_encrypt(__m128i &ciphertext, __m128i plaintext,
+    const AESkey &key)
+{
+    // Initial round-key addition
+    __m128i state = _mm_xor_si128 (plaintext, key[0]);
+    // Rounds 1 through 9
+    for (int round = 1; round <= 9; ++round) {
+        state = _mm_aesenc_si128 (state, key[round]);
+    }
+    // Final round uses the last round key
+    ciphertext = _mm_aesenclast_si128 (state, key[10]);
+}
+
+#endif

+ 22 - 0
Enclave/OblivAlgs/foav.h

@@ -0,0 +1,22 @@
+#ifndef __FOAV_H__
+#define __FOAV_H__
+
+// -DFOAV_ENABLE=0 to disable, -DFOAV_ENABLE=1 (or just -DFOAV_ENABLE) to enable
+#ifndef FOAV_ENABLE
+// Defaults to enable
+#define FOAV_ENABLE 1
+#endif
+
+#if FOAV_ENABLE == 0 /* Annotations disabled: every macro expands to nothing. */
+#define FOAV_SAFE(var)
+#define FOAV_SAFE2(var1,var2)
+#define FOAV_SAFE_CNTXT(context, var)
+#define FOAV_SAFE2_CNTXT(context,var1,var2)
+#else /* Annotations enabled: each macro expands to an __asm__ statement whose
+       * template is only an assembler comment of the form
+       *   # FOAV [context] name (operand)
+       * with the named variable(s) passed as "X"-constrained input operands.
+       * The variable and context names are stringified into that comment.
+       * The expansions already end in ';', so uses are written without a
+       * trailing semicolon. */
+#define FOAV_SAFE(var) __asm__ ("# FOAV " #var " (%0)"::"X"(var):);
+#define FOAV_SAFE2(var1,var2) __asm__ ("# FOAV " #var1 " (%0)\n\t# FOAV " #var2 " (%1)"::"X"(var1),"X"(var2):);
+#define FOAV_SAFE_CNTXT(context, var) __asm__ ("# FOAV " #context " " #var " (%0)"::"X"(var):);
+#define FOAV_SAFE2_CNTXT(context,var1,var2) __asm__ ("# FOAV " #context " " #var1 " (%0)\n\t# FOAV " #context " " #var2 " (%1)"::"X"(var1),"X"(var2):);
+#endif
+
+#endif

+ 183 - 0
Enclave/OblivAlgs/oasm_lib.h

@@ -0,0 +1,183 @@
+/*
+*    ZeroTrace: Oblivious Memory Primitives from Intel SGX
+*    Copyright (C) 2018  Sajin (sshsshy)
+*
+*    This program is free software: you can redistribute it and/or modify
+*    it under the terms of the GNU General Public License as published by
+*    the Free Software Foundation, version 3 of the License.
+*
+*    This program is distributed in the hope that it will be useful,
+*    but WITHOUT ANY WARRANTY; without even the implied warranty of
+*    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+*    GNU General Public License for more details.
+*
+*    You should have received a copy of the GNU General Public License
+*    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+*/
+
+
+#ifndef __OASM_LIB__
+	#define __OASM_LIB__
+
+#include <cstdint>
+
+  #ifndef BEFTS_MODE
+    #include "CONFIG.h"
+  #endif
+
+  #ifdef COUNT_OSWAPS
+    extern thread_local uint64_t OSWAP_COUNTER;
+  #endif
+
+  // Oblivious Buffer move/swap functions:
+#if 0
+	extern "C" void oswap_buffer_16x(unsigned char *dest, unsigned char *source, uint32_t buffersize, uint8_t flag);
+	extern "C" void oswap_buffer_byte(unsigned char *dest, unsigned char *source, uint32_t buffersize, uint8_t flag);
+
+	extern "C" void oswap_buffer_byte_16x(unsigned char *dest, unsigned char *source, uint32_t buffersize, uint8_t flag);
+	extern "C" void oswap_buffer_byte_v2(unsigned char *dest, unsigned char *source, uint8_t flag);
+
+	extern "C" void ogt_comp_swap(uint64_t *key1, uint64_t *key2, unsigned char *buff1, unsigned char *buff2, uint32_t buffersize);
+#endif
+  /* OSwap_Style selects the specialization of oswap_buffer / omove_buffer
+   * (defined in oasm_lib.tcc, included at the end of this header):
+   *   OSWAP_4, OSWAP_8, OSWAP_12: exactly 4, 8 or 12 bytes; the buffersize
+   *                               argument is ignored by these specializations
+   *   OSWAP_16X:   (buffersize >> 4) chunks of 16 bytes
+   *   OSWAP_8_16X: 8 bytes followed by (buffersize >> 4) chunks of 16 bytes
+   * In all of them, flag != 0 means "perform the swap/move"; the memory is
+   * read and written either way and the choice is made with cmov.
+   */
+  enum OSwap_Style { OSWAP_4, OSWAP_8, OSWAP_12, OSWAP_16X, OSWAP_8_16X };
+  template<OSwap_Style oswap_style> inline void oswap_buffer(unsigned char *dest, unsigned char *source, uint32_t buffersize, uint8_t flag);  // conditionally exchange dest and source
+  template<typename KeyType> inline void oswap_key(unsigned char *dest, unsigned char *source, uint8_t flag);  // oswap_buffer for one key of type KeyType
+
+  template<OSwap_Style oswap_style> inline void omove_buffer(unsigned char *dest, unsigned char *source, uint32_t buffersize, uint8_t flag);  // conditionally copy source into dest
+  /* Returns 1 if key1 > key2 as unsigned 64-bit values, and 0 otherwise.
+   * The result is produced by cmp followed by seta, so the code contains no
+   * branch on the key values.
+   */
+  inline uint8_t ogt_set_flag(uint64_t key1, uint64_t key2)
+  {
+    uint8_t flag;
+    __asm__ (
+          "# inline ogt_set_flag\n"
+          "cmp %[key2], %[key1]\n"   // AT&T operand order: flags are set from key1 - key2
+          //"# FOAV ogt_set_flag key1 (%[key1]):\n"
+          "seta %[flag]\n"           // "above": unsigned key1 > key2
+          : [flag] "=r" (flag)
+          : [key1] "r" (key1), [key2] "r" (key2)
+          : "cc"
+      );
+    return flag;
+  }
+
+  // A size-templated version of oblivious greater than for wide-key Bitonic Sort
+  // Returns 1 if (*key1p) > (*key2p), and 0 otherwise, in a fully oblivious manner.
+  // The comparison is unsigned.
+  template<typename keytype>
+  inline uint8_t ogt(const keytype *key1p, const keytype *key2p);
+
+  // 32-bit specialization.
+  // The asm dereferences key1p and key2p itself, and the pointed-to objects
+  // are not listed as operands, so "memory" must be in the clobber list:
+  // otherwise the compiler is not told that the asm reads memory and is free
+  // to delay or drop stores to *key1p / *key2p that precede the call.
+  template<>
+  inline uint8_t ogt<uint32_t>(const uint32_t *key1p, const uint32_t *key2p)
+  {
+    uint8_t flag;
+    __asm__ (
+          "# inline ogt_uint32\n"
+          "movl (%[key1p]), %%eax\n"   // eax = *key1p
+          "cmpl (%[key2p]), %%eax\n"   // flags set from *key1p - *key2p
+          "seta %[flag]\n"             // flag = (*key1p > *key2p), unsigned
+          : [flag] "=r" (flag)
+          : [key1p] "r" (key1p), [key2p] "r" (key2p)
+          : "eax", "cc", "memory"
+      );
+    return flag;
+  }
+  /* 64-bit specialization of ogt: loads both keys in C++ and delegates the
+   * comparison to ogt_set_flag.  The first asm statement only emits an
+   * assembler comment marking the inlined site.
+   */
+  template<>
+  inline uint8_t ogt<uint64_t>(const uint64_t *key1p, const uint64_t *key2p) {
+    __asm__ ("# inline ogt_uint64\n");
+    return ogt_set_flag(*key1p, *key2p);
+  }
+
+  // 128-bit specialization of ogt: returns 1 if (*key1p) > (*key2p) as
+  // unsigned 128-bit values, and 0 otherwise.  The comparison is a 128-bit
+  // subtraction (*key2p) - (*key1p) done as cmp on the low qwords followed
+  // by sbb on the high qwords; the final borrow is set exactly when
+  // *key2p < *key1p.
+  // The asm dereferences both pointers itself and the pointed-to objects are
+  // not listed as operands, so "memory" is in the clobber list to make the
+  // compiler store any pending values of *key1p / *key2p before the asm.
+  template<>
+  inline uint8_t ogt<__uint128_t>(const __uint128_t *key1p, const __uint128_t *key2p) {
+    uint8_t flag;
+    __asm__ (
+          "# inline ogt_uint128\n"
+          "movq    8(%[key2p]), %%rcx\n"   // rcx = qword at offset 8 of *key2p (high half)
+          "movq    (%[key1p]), %%rax\n"    // rax = qword at offset 0 of *key1p (low half)
+          "cmpq    %%rax, (%[key2p])\n"    // CF = (low(*key2p) < low(*key1p))
+          "sbbq    8(%[key1p]), %%rcx\n"   // high(*key2p) - high(*key1p) - CF
+          "setc    %[flag]\n"              // flag = final borrow = (*key1p > *key2p)
+          : [flag] "=r" (flag)
+          : [key1p] "r" (key1p), [key2p] "r" (key2p)
+          : "rax", "rcx", "cc", "memory"
+      );
+    return flag;
+  }
+  /* Returns 1 if key1 >= key2 as unsigned 64-bit values, and 0 otherwise.
+   * Uses cmp followed by setae; there is no branch on the key values.
+   */
+  inline uint8_t oge_set_flag(uint64_t key1, uint64_t key2)
+  {
+    uint8_t flag;
+    __asm__ (
+          "# inline oge_set_flag\n"
+          "cmp %[key2], %[key1]\n"   // AT&T operand order: flags are set from key1 - key2
+          //"# FOAV oge_set_flag key1 (%[key1]):\n"
+          "setae %[flag]\n"          // "above or equal": unsigned key1 >= key2
+          : [flag] "=r" (flag)
+          : [key1] "r" (key1), [key2] "r" (key2)
+          : "cc"
+      );
+    return flag;
+  }
+  /* Returns 1 if key1 == key2 and 0 otherwise, for 32-bit keys.
+   * Uses cmp followed by sete; there is no branch on the key values.
+   */
+  inline uint8_t oe_set_flag(uint32_t key1, uint32_t key2)
+  {
+    uint8_t flag;
+    __asm__ (
+          "# inline oe_set_flag\n"
+          "cmp %[key2], %[key1]\n"   // sets ZF when the two keys are equal
+          //"# FOAV oe_set_flag key1 (%[key1]):\n"
+          "sete %[flag]\n"
+          : [flag] "=r" (flag)
+          : [key1] "r" (key1), [key2] "r" (key2)
+          : "cc"
+      );
+    return flag;
+  }
+  /* Conditionally stores value into *dest: if flag is nonzero *dest becomes
+   * value, otherwise *dest is rewritten with the value it already held.
+   * *dest is loaded and stored in both cases; the selection is a cmov.
+   */
+  inline void oset_value(uint64_t *dest, uint64_t value, uint32_t flag)
+  {
+    __asm__ (
+        "# inline oset_value\n"
+        "mov (%[dest]), %%r10\n"      // r10 = current *dest
+        "test %[flag], %[flag]\n"     // ZF = (flag == 0)
+        "cmovnz %[value], %%r10\n"    // r10 = value when flag != 0
+        "mov %%r10, (%[dest])\n"      // unconditional write-back
+        :
+        : [dest] "r" (dest), [value] "r" (value), [flag] "r" (flag)
+        : "cc", "memory", "r10"
+    );
+  }
+  /* 32-bit variant of oset_value: if flag is nonzero *dest becomes value,
+   * otherwise *dest is rewritten with the value it already held.  *dest is
+   * loaded and stored in both cases; the selection is a cmov.
+   */
+  inline void oset_value_uint32_t(uint32_t *dest, uint32_t value, uint8_t flag)
+  {
+    __asm__ (
+        "# inline oset_value_uint32_t\n"
+        "mov (%[dest]), %%r10d\n"      // r10d = current *dest
+        "test %[flag], %[flag]\n"      // ZF = (flag == 0)
+        "cmovnz %[value], %%r10d\n"    // r10d = value when flag != 0
+        "mov %%r10d, (%[dest])\n"      // unconditional write-back
+        :
+        : [dest] "r" (dest), [value] "r" (value), [flag] "r" (flag)
+        : "cc", "memory", "r10"
+    );
+  }
+
+  // Returns value_1 if flag is nonzero and value_0 otherwise, selected with
+  // cmov rather than a branch.
+  inline uint32_t oselect_uint32_t(uint32_t value_0, uint32_t value_1, uint8_t flag)
+  {
+    uint32_t out;
+    __asm__ (
+        "# inline oselect_uint32_t\n"
+        "mov %[value0], %[out]\n"      // out = value_0 (written before flag/value_1 are read)
+        "test %[flag], %[flag]\n"      // ZF = (flag == 0)
+        "cmovnz %[value1], %[out]\n"   // out = value_1 when flag != 0
+        // out is written by the first instruction while flag and value1 are
+        // still to be read, so it must be earlyclobber ("&"); otherwise the
+        // compiler may place it in the same register as one of those inputs.
+        : [out] "=&r" (out)
+        : [value0] "r" (value_0), [value1] "r" (value_1), [flag] "r" (flag)
+        : "cc"
+    );
+    return out;
+  }
+
+  #include "oasm_lib.tcc"
+
+#endif

+ 392 - 0
Enclave/OblivAlgs/oasm_lib.tcc

@@ -0,0 +1,392 @@
+#ifndef __OASM_LIB_TCC__
+#define __OASM_LIB_TCC__
+
+#include "foav.h"
+/* Conditionally exchanges the 4 bytes at dest with the 4 bytes at source.
+ * If flag is nonzero the two words are swapped; otherwise each location is
+ * rewritten with its own value.  Both locations are loaded and stored in
+ * either case and the selection is done with cmov.  The unnamed uint32_t
+ * parameter (buffer size) is ignored.  When COUNT_OSWAPS is defined, each
+ * call increments OSWAP_COUNTER.
+ */
+template<> inline void oswap_buffer<OSWAP_4>(unsigned char *dest, unsigned char *source, uint32_t , uint8_t flag)
+{
+    #ifdef COUNT_OSWAPS
+      OSWAP_COUNTER++;
+    #endif
+
+    #if 0
+    oswap_buffer_byte_v2(dest, source, flag);
+    #else
+    __asm__ (
+        "# inline oswap_buffer<OSWAP_4>\n"
+        "test %[flag], %[flag]\n"      // ZF = (flag == 0); the movs below do not alter flags
+        "movl (%[dest]), %%r10d\n"     // dest data (becomes the new dest value)
+        "movl (%[dest]), %%r11d\n"     // second copy of the original dest data
+        "movl (%[source]), %%ecx\n"    // source data (becomes the new source value)
+        "cmovnz %%ecx, %%r10d\n"       // flag != 0: new dest = source data
+        "cmovnz %%r11d, %%ecx\n"       // flag != 0: new source = original dest data
+        "movl %%r10d, (%[dest])\n"
+        "movl %%ecx, (%[source])\n"
+        :
+        : [dest] "r" (dest), [source] "r" (source), [flag] "r" (flag)
+        : "cc", "memory", "r10", "r11", "ecx"
+    );
+    #endif
+}
+/* Conditionally exchanges the 8 bytes at dest with the 8 bytes at source.
+ * If flag is nonzero the two qwords are swapped; otherwise each location is
+ * rewritten with its own value.  Both locations are loaded and stored in
+ * either case and the selection is done with cmov.  The unnamed uint32_t
+ * parameter (buffer size) is ignored.  When COUNT_OSWAPS is defined, each
+ * call increments OSWAP_COUNTER.
+ */
+template<> inline void oswap_buffer<OSWAP_8>(unsigned char *dest, unsigned char *source, uint32_t , uint8_t flag)
+{
+    #ifdef COUNT_OSWAPS
+      OSWAP_COUNTER++;
+    #endif
+
+    #if 0
+    oswap_buffer_byte_v2(dest, source, flag);
+    #else
+    __asm__ (
+        "# inline oswap_buffer<OSWAP_8>\n"
+        "test %[flag], %[flag]\n"     // ZF = (flag == 0); the movs below do not alter flags
+        "movq (%[dest]), %%r10\n"     // dest data (becomes the new dest value)
+        "movq (%[dest]), %%r11\n"     // second copy of the original dest data
+        "movq (%[source]), %%rcx\n"   // source data (becomes the new source value)
+        "cmovnz %%rcx, %%r10\n"       // flag != 0: new dest = source data
+        "cmovnz %%r11, %%rcx\n"       // flag != 0: new source = original dest data
+        "movq %%r10, (%[dest])\n"
+        "movq %%rcx, (%[source])\n"
+        :
+        : [dest] "r" (dest), [source] "r" (source), [flag] "r" (flag)
+        : "cc", "memory", "r10", "r11", "rcx"
+    );
+    #endif
+}
+/* Conditionally exchanges the 12 bytes at dest with the 12 bytes at source,
+ * handled as one qword (offset 0) plus one 32-bit word (offset 8).
+ * If flag is nonzero the buffers are swapped; otherwise each location is
+ * rewritten with its own value.  All loads and stores happen in either case
+ * and the selection is done with cmov.  The unnamed uint32_t parameter
+ * (buffer size) is ignored.  When COUNT_OSWAPS is defined, each call
+ * increments OSWAP_COUNTER.
+ */
+template<> inline void oswap_buffer<OSWAP_12>(unsigned char *dest, unsigned char *source, uint32_t , uint8_t flag)
+{
+    #ifdef COUNT_OSWAPS
+      OSWAP_COUNTER++;
+    #endif
+
+    #if 0
+    oswap_buffer_byte_v2(dest, source, flag);
+    #else
+    __asm__ (
+        "# inline oswap_buffer<OSWAP_12>\n"
+        "test %[flag], %[flag]\n"
+        "movq (%[dest]), %%r14\n"   // dest data
+        "movq (%[dest]), %%r12\n"   // dest data (second copy)
+        "movl 8(%[dest]), %%ebx\n"  // dest data (next word)
+        "movl 8(%[dest]), %%edx\n"  // dest data (next word, second copy)
+        "movq (%[source]), %%r15\n"   // source data
+        "movl 8(%[source]), %%r13d\n"  // source data (next word)
+
+        "cmovnz %%r15, %%r14\n"    // r14 <- r15 based on the flag (C1)
+        "cmovnz %%r13d, %%ebx\n"    // ebx <- r13d based on the flag (C1')
+        "cmovnz %%r12, %%r15\n"    // r15 <- r12 based on the flag (C2)
+        "cmovnz %%edx, %%r13d\n"    // r13d <- edx based on the flag (C2')
+
+        "movq %%r14, (%[dest])\n"   // dest gets back r14, which is source's data if flag is true from (C1)
+                                  // else it gets back the same dest data
+        "movl %%ebx, 8(%[dest])\n"  // dest+8 gets back ebx, which is source+8's data if flag is true from
+                                  // (C1'), else it gets back the same dest+8 data
+        "movq %%r15, (%[source])\n"   // source gets back r15, which is dest's original data if flag is true
+                                  // from (C2), else it gets back the same source data
+        "movl %%r13d, 8(%[source])\n"  // source+8 gets back r13d, which is dest+8's original data if flag is
+                                  // true from (C2'), else it gets back the same source+8 data
+        :
+        : [dest] "r" (dest), [source] "r" (source), [flag] "r" (flag)
+        : "cc", "memory", "rcx", "r12", "r13", "r14", "r15", "rbx", "rdx"
+    );
+    #endif
+}
+/* Conditionally exchanges dest and source in 16-byte chunks.
+ * The loop runs (buffersize >> 4) times; in each iteration two qwords of
+ * each buffer are loaded, selected with cmov according to flag, and stored
+ * back, so memory is read and written whether or not flag is set.
+ * When COUNT_OSWAPS is defined, each call increments OSWAP_COUNTER.
+ * NOTE(review): the loop counter is decremented and tested only after the
+ * body has run, so buffersize < 16 (count 0) makes the counter wrap instead
+ * of skipping the loop -- callers must pass buffersize >= 16.  Any remainder
+ * of buffersize modulo 16 is not processed.
+ */
+template<> inline void oswap_buffer<OSWAP_16X>(unsigned char *dest, unsigned char *source, uint32_t buffersize, uint8_t flag)
+{
+    #ifdef COUNT_OSWAPS
+      OSWAP_COUNTER++;
+    #endif
+
+    __asm__ (
+      "# inline oswap_buffer<OSWAP_16X>\n"
+
+      //Move ptr to dest and source buffers to r10 and r11
+      "movq %[dest], %%r10\n"
+      "movq %[source], %%r11\n"
+
+      //Set loop parameters
+      "movl %[buffersize], %%ecx\n"
+      "shr $4, %%ecx\n"
+
+      //Loop to fetch iter & res chunks till blk_size
+      "1:\n"
+        "test %[flag], %[flag]\n"   // re-tested every iteration: add/dec below overwrite the flags
+        "movq (%%r10), %%r14\n"   // dest data
+        "movq (%%r10), %%r12\n"   // dest data
+        "movq 8(%%r10), %%rbx\n"  // dest data (next qword)
+        "movq 8(%%r10), %%rdx\n"  // dest data (next qword)
+        "movq (%%r11), %%r15\n"   // source data
+        "movq 8(%%r11), %%r13\n"  // source data (next qword)
+        
+        "cmovnz %%r15, %%r14\n"    // r14 <- r15 based on the flag (C1)
+        "cmovnz %%r13, %%rbx\n"    // rbx <- r13 based on the flag (C1')
+        "cmovnz %%r12, %%r15\n"    // r15 <- r12 based on the flag (C2)
+        "cmovnz %%rdx, %%r13\n"    // r13 <- rdx based on the flag (C2')
+
+        "movq %%r14, (%%r10)\n"   // dest gets back r14, which is source's data if flag is true from (C1)
+                                  // else it gets back the same dest data
+        "movq %%rbx, 8(%%r10)\n"  // dest+8 gets back rbx, which is source+8's data if flag is true from
+                                  // (C1'), else it gets back the same dest+8 data
+        "movq %%r15, (%%r11)\n"   // source gets back r15, which is dest's original data if flag is true
+                                  // from (C2), else it gets back the same source data
+        "movq %%r13, 8(%%r11)\n"  // source+8 gets back r13, which is dest+8's original data if flag is
+                                  // true from (C2'), else it gets back the same source+8 data
+        "add $16, %%r10\n"
+        "add $16, %%r11\n"
+        "dec %%ecx\n"
+        "# FOAV oswap_buffer_16X ctr (%%ecx)\n"
+        "jnz 1b\n"
+        :
+        : [dest] "r" (dest), [source] "r" (source), [buffersize] "r" (buffersize), [flag] "r" (flag)
+        : "cc", "memory", "r10", "r11", "rcx", "r12", "r13", "r14", "r15", "rbx", "rdx"
+    );
+}
+/* Conditionally exchanges dest and source for buffers laid out as 8 bytes
+ * followed by 16-byte chunks.  The first qword is handled on its own, then
+ * the loop runs (buffersize >> 4) times over 16-byte chunks starting at
+ * offset 8.  Every load and store is performed whether or not flag is set;
+ * the selection is done with cmov.
+ * When COUNT_OSWAPS is defined, each call increments OSWAP_COUNTER.
+ * NOTE(review): the loop counter is decremented and tested only after the
+ * body has run, so buffersize < 16 (count 0) makes the counter wrap instead
+ * of skipping the loop -- callers must pass buffersize >= 24.
+ */
+template<> inline void oswap_buffer<OSWAP_8_16X>(unsigned char *dest, unsigned char *source, uint32_t buffersize, uint8_t flag)
+{
+    #ifdef COUNT_OSWAPS 
+      OSWAP_COUNTER++;
+    #endif
+
+    __asm__ (
+      "# inline oswap_buffer<OSWAP_8_16X>\n"
+
+      //Move ptr to dest and source buffers to r10 and r11
+      "movq %[dest], %%r10\n"
+      "movq %[source], %%r11\n"
+
+      // Swap first 8 bytes obliviously:
+      "test %[flag], %[flag]\n"
+      "movq (%%r10), %%r14\n"   // dest data
+      "movq (%%r10), %%r12\n"   // dest data
+      "movq (%%r11), %%r15\n"   // source data
+      
+      "cmovnz %%r15, %%r14\n"    // r14 <- r15 based on the flag (C1)
+      "cmovnz %%r12, %%r15\n"    // r15 <- r12 based on the flag (C2)
+
+      "movq %%r14, (%%r10)\n"   // dest gets back r14, which is source's data if flag is true from (C1)
+                                // else it gets back the same dest data
+      "movq %%r15, (%%r11)\n"   // source gets back r15, which is dest's original data if flag is true
+                                // from (C2), else it gets back the same source data
+      "add $8, %%r10\n"
+      "add $8, %%r11\n" 
+
+      //Set loop parameters
+      "movl %[buffersize], %%ecx\n"
+      "shr $4, %%ecx\n"
+
+      //Loop to fetch iter & res chunks till blk_size
+      "1:\n"
+        "test %[flag], %[flag]\n"   // re-tested every iteration: add/dec below overwrite the flags
+        "movq (%%r10), %%r14\n"   // dest data
+        "movq (%%r10), %%r12\n"   // dest data
+        "movq 8(%%r10), %%rbx\n"  // dest data (next qword)
+        "movq 8(%%r10), %%rdx\n"  // dest data (next qword)
+        "movq (%%r11), %%r15\n"   // source data
+        "movq 8(%%r11), %%r13\n"  // source data (next qword)
+        
+        "cmovnz %%r15, %%r14\n"    // r14 <- r15 based on the flag (C1)
+        "cmovnz %%r13, %%rbx\n"    // rbx <- r13 based on the flag (C1')
+        "cmovnz %%r12, %%r15\n"    // r15 <- r12 based on the flag (C2)
+        "cmovnz %%rdx, %%r13\n"    // r13 <- rdx based on the flag (C2')
+
+        "movq %%r14, (%%r10)\n"   // dest gets back r14, which is source's data if flag is true from (C1)
+                                  // else it gets back the same dest data
+        "movq %%rbx, 8(%%r10)\n"  // dest+8 gets back rbx, which is source+8's data if flag is true from
+                                  // (C1'), else it gets back the same dest+8 data
+        "movq %%r15, (%%r11)\n"   // source gets back r15, which is dest's original data if flag is true
+                                  // from (C2), else it gets back the same source data
+        "movq %%r13, 8(%%r11)\n"  // source+8 gets back r13, which is dest+8's original data if flag is
+                                  // true from (C2'), else it gets back the same source+8 data
+        "add $16, %%r10\n"
+        "add $16, %%r11\n"
+        "dec %%ecx\n"
+        " # FOAV oswap_buffer_16X ctr (%%ecx)\n"
+        "jnz 1b\n"
+        :
+        : [dest] "r" (dest), [source] "r" (source), [buffersize] "r" (buffersize), [flag] "r" (flag)
+        : "cc", "memory", "r10", "r11", "rcx", "r12", "r13", "r14", "r15", "rbx", "rdx" 
+    );
+
+
+}
+/* oswap_key<KeyType>: conditionally exchanges one key of the given type by
+ * forwarding to the oswap_buffer specialization of matching width
+ * (4 bytes -> OSWAP_4, 8 bytes -> OSWAP_8, 16 bytes -> OSWAP_16X with a
+ * single 16-byte chunk).  flag != 0 means swap.
+ */
+template<> inline void oswap_key<uint32_t>(unsigned char *dest, unsigned char *source, uint8_t flag)
+{
+    oswap_buffer<OSWAP_4>(dest, source, 4, flag);
+}
+
+template<> inline void oswap_key<uint64_t>(unsigned char *dest, unsigned char *source, uint8_t flag)
+{
+    oswap_buffer<OSWAP_8>(dest, source, 8, flag);
+}
+
+template<> inline void oswap_key<__uint128_t>(unsigned char *dest, unsigned char *source, uint8_t flag)
+{
+    oswap_buffer<OSWAP_16X>(dest, source, 16, flag);
+}
+/* Conditionally copies the 8 bytes at source into dest.  If flag is nonzero
+ * dest receives source's qword; otherwise dest is rewritten with its own
+ * value.  dest is loaded and stored in either case; source is only read.
+ * The unnamed uint32_t parameter (buffer size) is ignored.
+ */
+template<> inline void omove_buffer<OSWAP_8>(unsigned char *dest, unsigned char *source, uint32_t , uint8_t flag)
+{
+  __asm__ (
+      "# inline omove_buffer<OSWAP_8>\n"
+
+      "test %[flag], %[flag]\n"        // ZF = (flag == 0)
+      "movq (%[dest]), %%r10\n"        // r10 = current dest data
+      //"movq (%[source]), %%rcx\n"
+      "cmovnz (%[source]), %%r10\n"    // flag != 0: r10 = source data (memory-operand cmov)
+      "movq %%r10, (%[dest])\n"        // unconditional write-back
+      :
+      : [dest] "r" (dest), [source] "r" (source), [flag] "r" (flag)
+      : "cc", "memory", "r10"
+  );
+}
+
+/* Conditionally copies source into dest in 16-byte chunks.  The loop runs
+ * (buffersize >> 4) times; each iteration loads two qwords from both
+ * buffers, selects with cmov according to flag, and stores back to dest, so
+ * dest is written whether or not flag is set.  source is only read.
+ * NOTE(review): the loop counter is decremented and tested only after the
+ * body has run, so buffersize < 16 (count 0) makes the counter wrap instead
+ * of skipping the loop -- callers must pass buffersize >= 16.
+ */
+template<> inline void omove_buffer<OSWAP_16X>(unsigned char *dest, unsigned char *source, uint32_t buffersize, uint8_t flag)
+{
+    __asm__ (
+      "# inline omove_buffer<OSWAP_16X>\n"
+
+      //Move ptr to dest and source buffers to r10 and r11
+      "movq %[dest], %%r10\n"
+      "movq %[source], %%r11\n"
+
+      //Set loop parameters
+      "movl %[buffersize], %%ecx\n"
+      "shr $4, %%ecx\n"
+
+      //Loop to fetch iter & res chunks till blk_size
+      "1:\n"
+        "test %[flag], %[flag]\n"   // re-tested every iteration: add/dec below overwrite the flags
+        "movq (%%r10), %%r14\n"   // dest data
+        "movq 8(%%r10), %%rbx\n"  // dest data (next qword)
+        "movq (%%r11), %%r15\n"   // source data
+        "movq 8(%%r11), %%r13\n"  // source data (next qword)
+        
+        "cmovnz %%r15, %%r14\n"    // r14 <- r15 based on the flag (C1)
+        "cmovnz %%r13, %%rbx\n"    // rbx <- r13 based on the flag (C1')
+
+        "movq %%r14, (%%r10)\n"   // dest gets back r14, which is source's data if flag is true from (C1)
+                                  // else it gets back the same dest data
+        "movq %%rbx, 8(%%r10)\n"  // dest+8 gets back rbx, which is source+8's data if flag is true from
+                                  // (C1'), else it gets back the same dest+8 data
+        "add $16, %%r10\n"
+        "add $16, %%r11\n"
+        "dec %%ecx\n"
+        " # FOAV oswap_buffer_16X ctr (%%ecx)\n"
+        "jnz 1b\n"
+        :
+        : [dest] "r" (dest), [source] "r" (source), [buffersize] "r" (buffersize), [flag] "r" (flag)
+        : "cc", "memory", "r10", "r11", "rcx", "r13", "r14", "r15", "rbx"
+    );
+}
+
+/* Conditionally copies source into dest for buffers laid out as 8 bytes
+ * followed by 16-byte chunks.  The first qword is handled on its own, then
+ * the loop runs (buffersize >> 4) times over 16-byte chunks starting at
+ * offset 8.  dest is loaded and stored whether or not flag is set; source is
+ * only read.
+ * NOTE(review): the loop counter is decremented and tested only after the
+ * body has run, so buffersize < 16 (count 0) makes the counter wrap instead
+ * of skipping the loop -- callers must pass buffersize >= 24.
+ */
+template<> inline void omove_buffer<OSWAP_8_16X>(unsigned char *dest, unsigned char *source, uint32_t buffersize, uint8_t flag)
+{
+    __asm__ (
+      "# inline omove_buffer<OSWAP_8_16X>\n"
+
+      //Move ptr to dest and source buffers to r10 and r11
+      "movq %[dest], %%r10\n"
+      "movq %[source], %%r11\n"
+
+      // Move first 8 bytes obliviously:
+      "test %[flag], %[flag]\n"
+      "movq (%%r10), %%r14\n"   // dest data
+      "movq (%%r11), %%r15\n"   // source data
+      
+      "cmovnz %%r15, %%r14\n"    // r14 <- r15 based on the flag (C1)
+
+      "movq %%r14, (%%r10)\n"   // dest gets back r14, which is source's data if flag is true from (C1)
+                                // else it gets back the same dest data
+      "add $8, %%r10\n"
+      "add $8, %%r11\n" 
+
+      //Set loop parameters
+      "movl %[buffersize], %%ecx\n"
+      "shr $4, %%ecx\n"
+
+      //Loop to fetch iter & res chunks till blk_size
+      "1:\n"
+        "test %[flag], %[flag]\n"   // re-tested every iteration: add/dec below overwrite the flags
+        "movq (%%r10), %%r14\n"   // dest data
+        "movq 8(%%r10), %%rbx\n"  // dest data (next qword)
+        "movq (%%r11), %%r15\n"   // source data
+        "movq 8(%%r11), %%r13\n"  // source data (next qword)
+        
+        "cmovnz %%r15, %%r14\n"    // r14 <- r15 based on the flag (C1)
+        "cmovnz %%r13, %%rbx\n"    // rbx <- r13 based on the flag (C1')
+
+        "movq %%r14, (%%r10)\n"   // dest gets back r14, which is source's data if flag is true from (C1)
+                                  // else it gets back the same dest data
+        "movq %%rbx, 8(%%r10)\n"  // dest+8 gets back rbx, which is source+8's data if flag is true from
+                                  // (C1'), else it gets back the same dest+8 data
+        "add $16, %%r10\n"
+        "add $16, %%r11\n"
+        "dec %%ecx\n"
+        " # FOAV oswap_buffer_16X ctr (%%ecx)\n"
+        "jnz 1b\n"
+        :
+        : [dest] "r" (dest), [source] "r" (source), [buffersize] "r" (buffersize), [flag] "r" (flag)
+        : "cc", "memory", "r10", "r11", "rcx", "r13", "r14", "r15", "rbx"
+    );
+}
+
+/*
+omove_buffer:
+	; Take inputs,  1 ptr to dest_buffer, 2 ptr to source_buffer, 3 buffer_size, 4 flag
+	; Linux : 	rdi,rsi,rdx,rcx->rbp
+
+	; Callee-saved : RBP, RBX, and R12–R15
+
+	push rbx
+	push rbp
+	push r12
+	push r13
+	push r14
+	push r15
+
+	; Move ptr to data from serialized_dest_block and serialized_source_blk
+	mov r10, rdi
+	mov r11, rsi
+
+	;RCX will be lost for loop, store flag from rcx to rbp (1 byte , so bpl)
+	mov bpl, cl
+
+	; Oblivious evaluation of flag
+	cmp bpl, 1
+
+	;Set loop parameters
+	mov ax, dx
+	xor rdx, rdx
+	mov bx, 8
+	div bx
+	mov cx, ax
+
+	; Loop to fetch iter & res chunks till blk_size
+	loopstart_omb:
+		cmp bpl, 1
+		mov r14, qword [r10]
+		mov r15, qword [r11]
+		cmovz r14, r15 				;r14 / r15 based on the compare
+		mov qword [r10], r14
+		add r10, 8
+		add r11, 8
+		loop loopstart_omb
+
+	pop r15
+	pop r14
+	pop r13
+	pop r12
+	pop rbp
+	pop rbx
+
+	ret
+*/
+
+#endif

+ 1079 - 0
Enclave/OblivAlgs/utils.cpp

@@ -0,0 +1,1079 @@
+#include <pthread.h>
+#include "utils.hpp"
+
+#ifdef COUNT_OSWAPS
+thread_local uint64_t OSWAP_COUNTER=0;  // per-thread count, incremented by each oswap_buffer specialization
+#endif
+
+PRB_buffer* PRB_pool;  // NOTE(review): presumably the pseudorandom byte buffer(s) enabled by USE_PRB -- confirm where it is allocated
+thread_local uint64_t PRB_rand_bits = 0;  // per-thread; presumably a cache of random bits -- confirm
+thread_local uint32_t PRB_rand_bits_remaining = 0;  // per-thread; presumably the number of unused bits in PRB_rand_bits -- confirm
+
+bool bulk_initialized = false;  // NOTE(review): not thread_local, unlike the PRB_* state above
+sgx_aes_ctr_128bit_key_t bulk_random_seed[SGX_AESCTR_KEY_SIZE];  // NOTE(review): an array of SGX_AESCTR_KEY_SIZE key objects; if the key type is itself a byte array this holds more than one key -- confirm intended
+unsigned char bulk_counter[SGX_AESCTR_KEY_SIZE];  // presumably the AES-CTR counter block used with bulk_random_seed -- confirm
+
+
+// Three-way comparison of the unsigned 64-bit labels stored in the first
+// 8 bytes of buf1 and buf2 (qsort-style comparator).
+// Returns a negative value if label1 < label2, 0 if they are equal, and a
+// positive value if label1 > label2.
+int compare(const void *buf1, const void *buf2) {
+  uint64_t label1, label2;
+  memcpy(&label1, (const unsigned char*) buf1, 8);
+  memcpy(&label2, (const unsigned char*) buf2, 8);
+
+  // Compare explicitly: truncating (label1 - label2) to int can flip the
+  // sign, or yield 0 for labels that differ only in their upper 32 bits.
+  return (label1 > label2) - (label1 < label2);
+}
+
+// Three-way comparison of the unsigned 32-bit labels stored in the first
+// 4 bytes of buf1 and buf2 (qsort-style comparator).
+// Returns a negative value if label1 < label2, 0 if they are equal, and a
+// positive value if label1 > label2.
+int compare_32(const void *buf1, const void *buf2) {
+  uint32_t label1, label2;
+  memcpy(&label1, (const unsigned char*) buf1, 4);
+  memcpy(&label2, (const unsigned char*) buf2, 4);
+
+  // Compare explicitly: (int)(label1 - label2) has the wrong sign whenever
+  // the labels differ by 2^31 or more.
+  return (label1 > label2) - (label1 < label2);
+}
+/* generateSortPermutation_DJB (currently compiled out by the #if 0 below):
+ * copies the leading size_t of each of the N blocks of block_size bytes in
+ * buffer into a temporary key array, fills permutation with 0..N-1, and
+ * calls BitonicSort on the keys with permutation passed alongside.
+ * NOTE(review): if the allocation throws, the handler only prints a message
+ * and execution continues with keys uninitialized.
+ */
+#if 0
+void generateSortPermutation_DJB(size_t N, unsigned char *buffer, size_t block_size, size_t *permutation) {
+  size_t *keys;
+  try {
+    keys = new size_t[N];
+  } catch (std::bad_alloc&) {
+    printf("Allocating memory failed in generateSortPermutation_DJB\n");
+  }
+
+  unsigned char *buffer_ptr = buffer;
+  for(size_t i=0; i<N; i++){
+    keys[i] = *((size_t*)(buffer_ptr));  // key = first sizeof(size_t) bytes of block i
+    permutation[i] = i;
+    buffer_ptr+=block_size;
+  }
+
+  BitonicSort((unsigned char*) keys, N, (unsigned char*) permutation, NULL, 8, true); 
+  /* 
+  printf("\nSort Permutation:\n");
+  for(size_t i=0; i<N; i++)
+    printf("%ld, ", permutation[i]);
+  printf("\n");
+  */
+  delete[] keys;
+}
+/* generateSortPermutation_OA (inside the same disabled #if 0 region):
+ * 32-bit variant.  Copies the leading uint32_t of each of the N blocks of
+ * block_size bytes in buffer into a temporary key array, fills permutation
+ * with 0..N-1, and calls BitonicSort<OSWAP_4, uint32_t> on the keys with
+ * permutation passed alongside.
+ */
+void generateSortPermutation_OA(uint32_t N, unsigned char *buffer, size_t block_size, uint32_t *permutation) {
+  // Extract key list from buffer
+  uint32_t *keys = new uint32_t[N];
+  unsigned char *buffer_ptr = buffer;
+  for(size_t i=0; i<N; i++){
+    keys[i] = *((uint32_t*)(buffer_ptr));
+    permutation[i] = i;
+    buffer_ptr+=block_size;
+  }
+
+  BitonicSort<OSWAP_4, uint32_t> ((unsigned char*) keys, N, (unsigned char*) permutation, NULL, 4, true); 
+
+  /*
+  printf("\nSort Permutation:\n");
+  for(size_t i=0; i<N; i++)
+    printf("%ld, ", permutation[i]);
+  printf("\n");
+  */ 
+
+  delete []keys;
+}
+#endif
+
+
+/* Debug function to see keys in a buffer.
+ * Prints, one per line, the leading sizeof(size_t) bytes of each of the N
+ * blocks of block_size bytes in buffer, interpreted as a size_t.
+ */
+void displayKeysInBuffer(unsigned char *buffer, size_t N, size_t block_size){
+  const unsigned char *ptr = buffer;
+  printf("Keys in displayKeysInBuffer:\n");
+  for(size_t i=0; i<N; i++){
+    // Blocks are only byte-aligned in general (block_size is arbitrary), so
+    // copy the key out with memcpy, as compare() does, instead of
+    // dereferencing a cast pointer.
+    size_t key;
+    memcpy(&key, ptr, sizeof(key));
+    ptr+=block_size;
+    printf("%ld\n",key);
+  }
+  printf("\n\n");
+}
+
+#if 0
+#ifndef BEFTS_MODE
+/*
+  (Currently compiled out: this function sits inside an #if 0 region.)
+
+  Decrypts N AES-GCM encrypted blocks with enclave_decryption_key and lays them out
+  in B buckets of Z slots each.  Every encrypted block is read as
+  IV (SGX_AESGCM_IV_SIZE bytes) | ciphertext | MAC tag.  Every decrypted block is
+  prefixed with an 8-byte label equal to the next 8 bytes of random_bytes reduced
+  modulo B.  Each bucket receives N / B real blocks (the first N % B buckets get one
+  more) and is then padded up to Z slots with dummies whose label is UINT64_MAX.
+
+  The function assumes the provided encrypted buffer is initialized to the correct length.
+  If *decrypted_buffer is NULL it is allocated here with room for
+  2 * N_prime * block_size_with_tag bytes, where
+  block_size_with_tag = decrypted_block_size + 8.
+
+  The function returns the block_size_with_tag, or (size_t) -1 if a decryption fails.
+*/
+
+size_t decryptBuffer_attachRTags_addDummies(unsigned char *encrypted_buffer, uint64_t N, 
+        uint64_t N_prime, uint64_t B, uint64_t Z, size_t encrypted_block_size, 
+        unsigned char *random_bytes, unsigned char **decrypted_buffer) {
+
+  size_t decrypted_block_size = encrypted_block_size - SGX_AESGCM_IV_SIZE - SGX_AESGCM_MAC_SIZE;
+  size_t block_size_with_tag = decrypted_block_size + 8;
+
+  // If decrypted_buffer hasn't been allocated yet, allocate required memory to hold the decrypted
+  // buffer
+  if((*decrypted_buffer)==NULL){
+    size_t mem_to_malloc = 2 * N_prime * block_size_with_tag;
+    (*decrypted_buffer) = (unsigned char *) malloc(mem_to_malloc);
+    if(*decrypted_buffer==NULL) {
+      printf("Malloc failed in decryptBuffer_withAttachedRandomTags_interleaveDummies\n");  // NOTE(review): execution continues with a NULL buffer
+    }
+  }
+
+  unsigned char *dec_buf_ptr = *decrypted_buffer;
+  unsigned char *enc_buf_ptr = encrypted_buffer;
+  unsigned char *tag_ptr = enc_buf_ptr + SGX_AESGCM_IV_SIZE + decrypted_block_size;
+
+  uint64_t reals_per_bucket = N / B;
+  uint32_t num_buckets_with_extra_reals = N % B;
+  uint64_t packets_to_extract = reals_per_bucket;
+  for(size_t B_curr = 0; B_curr < B; B_curr++) {
+    packets_to_extract = (B_curr < num_buckets_with_extra_reals)? reals_per_bucket+1 : reals_per_bucket;
+    size_t num_dummies = Z - packets_to_extract;  // NOTE(review): wraps around if Z < packets_to_extract
+
+    for(size_t i=0; i<packets_to_extract; i++) {
+      uint64_t destination_bucket = (*((uint64_t*) random_bytes)) % B;
+      memcpy(dec_buf_ptr, (unsigned char*) &destination_bucket, 8);
+      random_bytes+=8;
+      dec_buf_ptr+=8; 
+
+      sgx_status_t aesret = sgx_rijndael128GCM_decrypt(
+          &enclave_decryption_key, enc_buf_ptr + SGX_AESGCM_IV_SIZE, decrypted_block_size,
+          dec_buf_ptr, enc_buf_ptr, SGX_AESGCM_IV_SIZE, NULL, 0,
+          (const sgx_aes_gcm_128bit_tag_t*)(tag_ptr));
+      if (aesret != SGX_SUCCESS) {
+        printf("sgx_rijndael128GCM_decrypt failure (%x)\n", aesret);
+        return -1;
+      }
+
+      dec_buf_ptr+=decrypted_block_size;
+      enc_buf_ptr+=encrypted_block_size;
+      tag_ptr+=encrypted_block_size;
+    }
+
+    for(size_t i=0; i<num_dummies; i++) {
+      // Set the destination label to UINT64_MAX to indicate it's a dummy.
+      // We don't care about the contents of the dummy, so whatever came from malloc is fine.
+      *((uint64_t*) dec_buf_ptr) = UINT64_MAX;
+      dec_buf_ptr+=block_size_with_tag;  
+    }
+  } 
+  return(block_size_with_tag);
+}
+
+
+/*
+  (Currently compiled out: this function sits inside an #if 0 region.)
+
+  Decrypts buffers passed to the enclave that are encrypted with keys from Enclave_LoadTestKeys
+  with AES_GCM. In addition, it gives each decrypted block an 8 byte random tag at the start,
+  copied from random_bytes.
+  Intended for using SN as a shuffler, by sorting the blocks based on the attached random tags.
+
+  The function assumes the provided encrypted buffer is initialized to the correct length.
+  If *decrypted_buffer is NULL it is allocated here with N * block_size_with_tag bytes,
+  where block_size_with_tag = decrypted_block_size + 8, and handed back to the caller
+  through decrypted_buffer.
+
+  The function returns the block_size_with_tag, or (size_t) -1 if a decryption fails.
+*/
+
+size_t decryptBuffer_attachRTags(unsigned char *encrypted_buffer, uint64_t N, size_t encrypted_block_size, unsigned char *random_bytes, unsigned char **decrypted_buffer) {
+
+  size_t decrypted_block_size = encrypted_block_size - SGX_AESGCM_IV_SIZE - SGX_AESGCM_MAC_SIZE;
+  size_t block_size_with_tag = decrypted_block_size + 8;
+
+  // If decrypted_buffer hasn't been allocated yet, allocate required memory to hold the decrypted
+  // buffer
+  if((*decrypted_buffer)==NULL){
+    (*decrypted_buffer) = (unsigned char *) malloc(N * block_size_with_tag);
+    if(*decrypted_buffer==NULL) {
+      printf("Malloc failed in decryptBuffer_withAttachedRandomTags\n");  // NOTE(review): execution continues with a NULL buffer
+    }
+  }
+
+  unsigned char *dec_buf_ptr = *decrypted_buffer;
+  unsigned char *enc_buf_ptr = encrypted_buffer;
+  unsigned char *tag_ptr = enc_buf_ptr + SGX_AESGCM_IV_SIZE + decrypted_block_size;
+
+  
+  for(size_t i =0; i<N; i++){
+    memcpy(dec_buf_ptr, random_bytes, 8);  // 8-byte random tag precedes the plaintext
+    random_bytes+=8;
+    dec_buf_ptr+=8; 
+
+    sgx_status_t aesret = sgx_rijndael128GCM_decrypt(
+        &enclave_decryption_key, enc_buf_ptr + SGX_AESGCM_IV_SIZE, decrypted_block_size,
+        dec_buf_ptr, enc_buf_ptr, SGX_AESGCM_IV_SIZE, NULL, 0,
+        (const sgx_aes_gcm_128bit_tag_t*)(tag_ptr));
+    if (aesret != SGX_SUCCESS) {
+      printf("sgx_rijndael128GCM_decrypt failure (%x)\n", aesret);
+      return -1;
+    }
+    
+    dec_buf_ptr+=decrypted_block_size;
+    enc_buf_ptr+=encrypted_block_size;
+    tag_ptr+=encrypted_block_size;
+  }
+  return(block_size_with_tag);
+}
+
+
+/*
+  (Currently compiled out: this function sits inside an #if 0 region.)
+
+  Decrypts buffers passed to the enclave that are encrypted with keys from Enclave_LoadTestKeys
+  with AES_GCM.
+  The function assumes the provided encrypted buffer is initialized to the correct length.
+  If *decrypted_buffer is NULL it is allocated here with N * decrypted_block_size bytes and
+  handed back to the caller through decrypted_buffer.
+
+  The function returns the decrypted_block_size, or (size_t) -1 if a decryption fails.
+*/
+
+size_t decryptBuffer(unsigned char *encrypted_buffer, uint64_t N, size_t encrypted_block_size,
+      unsigned char **decrypted_buffer) {
+  
+  
+  size_t decrypted_block_size = encrypted_block_size - SGX_AESGCM_IV_SIZE - SGX_AESGCM_MAC_SIZE;
+  // If decrypted_buffer hasn't been allocated yet, allocate required memory to hold the decrypted
+  // buffer
+  if((*decrypted_buffer)==NULL){
+    (*decrypted_buffer) = (unsigned char *) malloc(N * decrypted_block_size);
+    if(*decrypted_buffer==NULL) {
+      printf("Malloc failed in decryptBuffer for %ld bytes\n", (N*decrypted_block_size));  // NOTE(review): execution continues with a NULL buffer
+    }
+  }
+
+  unsigned char *dec_buf_ptr = *decrypted_buffer;
+  unsigned char *enc_buf_ptr = encrypted_buffer;
+  unsigned char *tag_ptr = enc_buf_ptr + SGX_AESGCM_IV_SIZE + decrypted_block_size;
+
+  
+  for(size_t i =0; i<N; i++){
+    sgx_status_t aesret = sgx_rijndael128GCM_decrypt(
+        &enclave_decryption_key, enc_buf_ptr + SGX_AESGCM_IV_SIZE, decrypted_block_size,
+        dec_buf_ptr, enc_buf_ptr, SGX_AESGCM_IV_SIZE, NULL, 0,
+        (const sgx_aes_gcm_128bit_tag_t*)(tag_ptr));
+    if (aesret != SGX_SUCCESS) {
+      printf("sgx_rijndael128GCM_decrypt failure (%x)\n", aesret);
+      return -1;
+    }
+    
+    dec_buf_ptr+=decrypted_block_size;
+    enc_buf_ptr+=encrypted_block_size;
+    tag_ptr+=encrypted_block_size;
+  }
+  return(decrypted_block_size);
+}
+
+/*
+  (Currently compiled out: this function sits inside an #if 0 region.)
+
+  Encrypts buffers going out of the Enclave using AESGCM with keys from Enclave_LoadTestKeys.
+  The function assumes the buffers are initialized with the correct length.
+
+  Unlike decryptBuffer, encryptBuffer expects the encrypted_buffer of correct size to be passed to it
+  and it populates it with encryptions of blocks from decrypted_buffer.
+  (This is done to avoid unnecessary additional copying of the encrypted buffer to a result buffer
+    passed by the outside application to the enclave)
+
+  Each output block is written as IV (SGX_AESGCM_IV_SIZE fresh random bytes) | ciphertext | MAC tag.
+  The function returns the encrypted_block_size, or (size_t) -1 if an encryption fails.
+*/
+
+size_t encryptBuffer(unsigned char *decrypted_buffer, uint64_t N, size_t decrypted_block_size,
+      unsigned char *encrypted_buffer) {
+
+  size_t encrypted_block_size = decrypted_block_size + SGX_AESGCM_IV_SIZE + SGX_AESGCM_MAC_SIZE;
+
+  unsigned char *dec_buf_ptr = decrypted_buffer;
+  unsigned char *enc_buf_ptr = encrypted_buffer;
+  unsigned char *tag_ptr = enc_buf_ptr + SGX_AESGCM_IV_SIZE + decrypted_block_size;
+
+  for(size_t i =0; i<N; i++){
+    getRandomBytes(enc_buf_ptr, SGX_AESGCM_IV_SIZE);  // fresh IV stored at the start of the output block
+    sgx_status_t aesret = sgx_rijndael128GCM_encrypt(
+        &enclave_encryption_key, dec_buf_ptr, decrypted_block_size,
+        enc_buf_ptr + SGX_AESGCM_IV_SIZE, enc_buf_ptr, SGX_AESGCM_IV_SIZE, NULL, 0,
+        (sgx_aes_gcm_128bit_tag_t*)(tag_ptr));
+    if (aesret != SGX_SUCCESS) {
+      printf("sgx_rijndael128GCM_encrypt failure (%x)\n", aesret);
+      return -1;
+    }
+    dec_buf_ptr+=decrypted_block_size;
+    enc_buf_ptr+=encrypted_block_size;
+    tag_ptr+=encrypted_block_size;
+  }
+
+  return(encrypted_block_size);
+}
+
+/*
+  (Currently compiled out: this function sits inside an #if 0 region.)
+
+  Removes the random tags attached by decryptBuffer_attachRTags before encrypting the buffer.
+  decrypted_block_size is the size of a block including its 8-byte tag; the tag is skipped
+  and the remaining bytes are encrypted into encrypted_buffer as IV | ciphertext | MAC tag.
+
+  The function returns the encrypted_block_size, or (size_t) -1 if an encryption fails.
+*/
+size_t encryptBuffer_removeRTags(unsigned char *decrypted_buffer, uint64_t N, 
+        size_t decrypted_block_size, unsigned char *encrypted_buffer) {
+
+  size_t real_block_size = decrypted_block_size - 8;
+  size_t encrypted_block_size = real_block_size + SGX_AESGCM_IV_SIZE + SGX_AESGCM_MAC_SIZE;
+
+  unsigned char *dec_buf_ptr = decrypted_buffer;
+  unsigned char *enc_buf_ptr = encrypted_buffer;
+  unsigned char *tag_ptr = enc_buf_ptr + SGX_AESGCM_IV_SIZE + real_block_size;
+
+  for(size_t i =0; i<N; i++){
+    //Skip the attached random tag
+    dec_buf_ptr+=8;
+    getRandomBytes(enc_buf_ptr, SGX_AESGCM_IV_SIZE);
+    sgx_status_t aesret = sgx_rijndael128GCM_encrypt(
+        &enclave_encryption_key, dec_buf_ptr, real_block_size,
+        enc_buf_ptr + SGX_AESGCM_IV_SIZE, enc_buf_ptr, SGX_AESGCM_IV_SIZE, NULL, 0,
+        (sgx_aes_gcm_128bit_tag_t*)(tag_ptr));
+    if (aesret != SGX_SUCCESS) {
+      printf("i = %d\n", i);  // NOTE(review): i is a size_t but is printed with %d
+      printf("sgx_rijndael128GCM_encrypt failure (%x)\n", aesret);
+      return -1;
+    }
+    dec_buf_ptr+=real_block_size;
+    enc_buf_ptr+=encrypted_block_size;
+    tag_ptr+=encrypted_block_size;
+  }
+  return(encrypted_block_size);
+}
+
+#endif
+#endif
+
+// Returns log2 rounded up: the smallest n such that 2^n >= value.
+// Returns 0 for value 0 or 1, and 64 for any value above 2^63.
+int calculatelog2(uint64_t value){
+  int log2v = 0;
+  uint64_t temp = 1;
+  // Bound the loop at 64 doublings: after that temp has wrapped to 0, and
+  // without the bound the loop would never end for value > 2^63.
+  while(temp<value && log2v<64){
+    temp=temp<<1;
+    log2v+=1;
+  }
+  return log2v;
+}
+
+// Returns log2 rounded down (the index of the highest set bit) for
+// value >= 1, and -1 for value == 0.
+int calculatelog2_floor(uint64_t value){
+  int log2v = -1;
+  // Shift value down to zero and count the steps.  Unlike doubling a probe
+  // up towards value, this cannot wrap around for value > 2^63.
+  while(value != 0){
+    value = value>>1;
+    log2v+=1;
+  }
+  return log2v;
+}
+
+// Returns largest power of two strictly less than N.
+// Returns 0 when no such power exists (N is 0 or 1).
+uint64_t pow2_lt(uint64_t N) {
+  uint64_t N1 = 1;
+  while (N1 < N) {
+    // N1 == 2^63 and still below N: doubling would wrap to 0 and loop
+    // forever, and 2^63 is already the answer.
+    if (N1 == ((uint64_t)1 << 63)) {
+      return N1;
+    }
+    N1 <<= 1;
+  }
+  // N1 is now the smallest power of two >= N; step back one power.
+  N1 >>= 1;
+  return N1;
+}
+
+
+// Returns the smallest power of two greater than or equal to N
+// (so 1 for N == 0 or N == 1).
+// Returns 0 if that power does not fit in 64 bits (N > 2^63).
+uint64_t pow2_gt(uint64_t N) {
+  uint64_t N1 = 1;
+  while (N1 < N) {
+    // N1 == 2^63 and still below N: the next power is unrepresentable, and
+    // doubling would wrap to 0 and loop forever.
+    if (N1 == ((uint64_t)1 << 63)) {
+      return 0;
+    }
+    N1 <<= 1;
+  }
+  return N1;
+}
+
+#ifndef BEFTS_MODE
+/*
+ * printf:
+ *   Formats the message into a zero-filled BUFSIZ-byte enclave buffer
+ *   (longer output is truncated by vsnprintf) and passes it to the
+ *   ocall_print_string OCALL for display outside the enclave.
+ */
+void printf(const char *fmt, ...)
+{
+    char message[BUFSIZ] = {'\0'};
+    va_list args;
+
+    va_start(args, fmt);
+    vsnprintf(message, sizeof(message), fmt, args);
+    va_end(args);
+
+    ocall_print_string(message);
+}
+
+/*
+ * printf_with_rtclock:
+ *   Formats the message into a zero-filled BUFSIZ-byte enclave buffer and
+ *   passes it to the ocall_print_string_with_rtclock OCALL, which displays
+ *   it with a timestamp.  Returns the timestamp the OCALL reports.
+ */
+unsigned long printf_with_rtclock(const char *fmt, ...)
+{
+    char message[BUFSIZ] = {'\0'};
+    va_list args;
+
+    va_start(args, fmt);
+    vsnprintf(message, sizeof(message), fmt, args);
+    va_end(args);
+
+    unsigned long timestamp;
+    ocall_print_string_with_rtclock(&timestamp, message);
+    return timestamp;
+}
+
+/*
+ * printf_with_rtclock_diff:
+ *   Formats the message into a zero-filled BUFSIZ-byte enclave buffer and
+ *   passes it, together with the earlier timestamp `before`, to the
+ *   ocall_print_string_with_rtclock_diff OCALL, which displays the message
+ *   with a timestamp and the difference from `before`.  Returns the
+ *   timestamp the OCALL reports.
+ */
+unsigned long printf_with_rtclock_diff(unsigned long before, const char *fmt, ...)
+{
+    char message[BUFSIZ] = {'\0'};
+    va_list args;
+
+    va_start(args, fmt);
+    vsnprintf(message, sizeof(message), fmt, args);
+    va_end(args);
+
+    unsigned long timestamp;
+    ocall_print_string_with_rtclock_diff(&timestamp, message, before);
+    return timestamp;
+}
+#endif
+
+#if 0
+// Debug helper: prints the three leading uint64_t header fields of a
+// serialized ORP packet laid out as
+// <evict_stream, ORP_label, key, data[block_size]>.
+// The data bytes are copied out and NUL-terminated but only printed by the
+// commented-out hex dump below.
+void displayORPPacket(unsigned char* packet_in, size_t block_size) {
+  unsigned char *packet_ptr = packet_in;
+  uint64_t evict_stream, ORP_label, key;
+  // One extra byte so that the terminating '\0' written below stays inside
+  // the array (writing data[block_size] into data[block_size] overflowed).
+  unsigned char data[block_size + 1];
+
+  memcpy(&evict_stream, packet_ptr, sizeof(uint64_t));
+  packet_ptr+=sizeof(uint64_t);
+  memcpy(&ORP_label, packet_ptr, sizeof(uint64_t));
+  packet_ptr+=sizeof(uint64_t);
+  memcpy(&key, packet_ptr, sizeof(uint64_t));
+  packet_ptr+=sizeof(uint64_t);
+  memcpy(data, packet_ptr, block_size);
+
+  data[block_size]='\0';
+  printf("(evict_stream = %ld, ORP_label = %ld, Key = %ld)\n",
+        evict_stream, ORP_label, key);
+  //printf("Hex of data is :");
+  //for(int i=0;i<DATA_SIZE;++i) printf("%02x", data[i]); printf("\n");
+}
+
+
+// Real packets are serialized as <Key, Data>.  A packet is reported as a
+// dummy when its leading uint64_t (the key) equals UINT64_MAX.
+bool isDummy(unsigned char *ptr_to_serialized_packet){
+  const uint64_t *fields = (uint64_t*) ptr_to_serialized_packet;
+  return fields[0] == UINT64_MAX;
+}
+
+// Marks a real packet <Key, Data> as a dummy by overwriting its leading
+// uint64_t (the key) with UINT64_MAX.  The data is left untouched.
+void setDummy(unsigned char *ptr_to_serialized_packet){
+  uint64_t *fields = (uint64_t*) ptr_to_serialized_packet;
+  fields[0] = UINT64_MAX;
+}
+
+
+// ORP packets are serialized as <Eviction_stream, ORP_label, Key, Data>.
+// A packet is reported as a dummy when its second uint64_t (the ORP_label)
+// equals UINT64_MAX.
+
+bool isORPDummy(unsigned char *ptr_to_serialized_packet){
+  const uint64_t *fields = (uint64_t*) ptr_to_serialized_packet;
+  return fields[1] == UINT64_MAX;
+}
+
+// Marks an ORP packet <Eviction_stream, ORP_label, Key, Data> as a dummy by
+// overwriting all three uint64_t header fields with UINT64_MAX.
+void setORPDummy(unsigned char *ptr_to_packet){
+  uint64_t *fields = (uint64_t*) ptr_to_packet;
+  for (int field = 0; field < 3; ++field) {
+    fields[field] = UINT64_MAX;
+  }
+}
+
+
+// Returns a packet count for the first msn_no MSNs, given that
+// msns_with_extra_packets of them carry packets_per_entry_msn+1 packets.
+// Returns 0 for a negative msn_no.
+// NOTE(review): the else branch evaluates to msn_no * packets_per_entry_msn,
+// i.e. it does not add the one extra packet per "extra" MSN that the first
+// branch counts, so the result is not continuous across
+// msn_no == msns_with_extra_packets.  Presumably the second term should use
+// (packets_per_entry_msn+1) -- confirm against the caller before enabling.
+size_t packetsConsumedUptoMSN(signed long msn_no, size_t msns_with_extra_packets, size_t packets_per_entry_msn) {
+  if(msn_no<0)
+    return 0;
+
+  // msn_no is non-negative here, so the signed/unsigned comparison is safe
+  if(msn_no<=msns_with_extra_packets){
+    return (msn_no * (packets_per_entry_msn+1));
+  }
+  else{
+    // Number of MSNs past the ones that carry an extra packet
+    size_t reg_msn = msn_no - msns_with_extra_packets;
+    return ((reg_msn * packets_per_entry_msn) + (msns_with_extra_packets * packets_per_entry_msn));
+  } 
+}
+#endif
+
+
+#ifdef USE_PRB
+  // Allocates the global PRB_pool as an array of nthreads default-constructed
+  // PRB_buffer objects (getRandomBytes indexes it by g_thread_id).
+  void PRB_pool_init(int nthreads) {
+    PRB_buffer *pool = new PRB_buffer[nthreads];
+    PRB_pool = pool;
+  }
+
+  // Frees the array allocated by PRB_pool_init.  The pointer is cleared so
+  // that a second shutdown is a harmless delete[] of NULL rather than a
+  // double free, and no dangling pointer is left behind.
+  void PRB_pool_shutdown() {
+    delete [] PRB_pool;
+    PRB_pool = NULL;
+  }
+
+  // The constructor does no work: seeding and filling the buffer happen in
+  // init_PRB_buffer, which getRandomBytes calls on first use.
+  PRB_buffer::PRB_buffer() {
+  }
+
+  // Nothing to release: the destructor body is empty.
+  PRB_buffer::~PRB_buffer() {
+  }
+
+  /*
+   * Seeds this PRB_buffer on first use and (re)fills random_bytes with
+   * AES-CTR output.
+   *
+   * - buffer_size: number of pseudorandom bytes to generate; values larger
+   *   than PRB_BUFFER_SIZE are clamped to the size of random_bytes.
+   *
+   * On first use the AES-CTR key (random_seed) and starting counter are
+   * drawn from sgx_read_rand.  Returns SGX_SUCCESS or the failing SGX
+   * status; on failure the buffer is left marked as empty.
+   */
+  sgx_status_t PRB_buffer::init_PRB_buffer(uint32_t buffer_size = PRB_BUFFER_SIZE) {
+    sgx_status_t rt = SGX_SUCCESS;
+    if(initialized==false) {
+      rt = sgx_read_rand((unsigned char*) random_seed, SGX_AESCTR_KEY_SIZE);
+      if(rt!=SGX_SUCCESS){
+        printf("Failed sgx_read_rand (%x)", rt);
+        return rt;
+      }
+      rt = sgx_read_rand((unsigned char*) counter, SGX_AESCTR_KEY_SIZE);
+      if(rt!=SGX_SUCCESS){
+        printf("Failed sgx_read_rand (%x)", rt);
+        return rt;
+      }
+      initialized=true;
+    }
+
+    // random_bytes holds exactly PRB_BUFFER_SIZE bytes; never generate more.
+    if(buffer_size > PRB_BUFFER_SIZE) {
+      buffer_size = PRB_BUFFER_SIZE;
+    }
+
+    // Mark the buffer empty up front so that a failure below never leaves
+    // stale or indeterminate bookkeeping behind.
+    random_bytes_left = 0;
+    random_bytes_ptr = random_bytes;
+
+    // Use AES CTR to populate random_bytes: encrypt zeroes in place, which
+    // yields the raw keystream without a separate PRB_BUFFER_SIZE-byte
+    // (uninitialized) plaintext array on the stack.
+    memset(random_bytes, 0, buffer_size);
+    rt = sgx_aes_ctr_encrypt(random_seed, (const uint8_t*) random_bytes, buffer_size,
+          (uint8_t*) counter, CTR_INC_BITS, random_bytes);
+    *(uint64_t*)counter += 1;
+    if(rt!=SGX_SUCCESS){
+      printf("Failed sgx_aes_ctr_encrypt (%x) in init_getRandomBytes\n", rt);
+      return rt;
+    }
+    // Only buffer_size bytes were generated, so only those may be handed out.
+    random_bytes_left = buffer_size;
+    random_bytes_ptr = random_bytes;
+    return rt;
+  }
+
+
+  /*
+   * Copies `size` pseudorandom bytes into `buffer`, drawing them from the
+   * pre-filled random_bytes pool and refilling the pool with AES-CTR output
+   * whenever it runs dry.  Requests larger than the pool are served across
+   * several refills.
+   *
+   * Returns SGX_SUCCESS, or the failing SGX status if seeding or a refill
+   * fails (in which case `buffer` may be only partially filled).
+   */
+  sgx_status_t PRB_buffer::getRandomBytes(unsigned char *buffer, size_t size) {
+    sgx_status_t rt = SGX_SUCCESS;
+    
+    if(initialized==false) {
+      rt = init_PRB_buffer();
+      // Without a seeded, filled pool there is nothing valid to hand out
+      if(rt!=SGX_SUCCESS)
+        return rt;
+    }
+
+    unsigned char *ptr = buffer;
+    size_t size_left_for_req = size;
+
+    // While the request needs at least everything the pool holds, drain the
+    // pool and refill it.
+    while(size_left_for_req >= (size_t) random_bytes_left) {
+      // Consume all the random bytes we have left
+      memcpy(ptr, random_bytes_ptr, random_bytes_left);
+      ptr+= random_bytes_left;
+      size_left_for_req-= random_bytes_left;
+
+      // The pool is now empty; record that before refilling so a failed
+      // refill cannot cause already-used bytes to be served again.
+      random_bytes_left = 0;
+      random_bytes_ptr = random_bytes;
+
+      // Use AES CTR to populate random_bytes
+      rt = sgx_aes_ctr_encrypt(random_seed, (const uint8_t*) random_bytes, PRB_BUFFER_SIZE,
+            (uint8_t*) counter, CTR_INC_BITS, random_bytes);
+      *(uint64_t*)counter += 1;
+      if(rt!=SGX_SUCCESS){
+        printf("Failed sgx_aes_ctr_encrypt (%x)", rt);
+        return rt;
+      }
+      random_bytes_left = PRB_BUFFER_SIZE;
+      random_bytes_ptr = random_bytes;
+    }
+
+    // Supply the rest of the request from the pool, which now holds
+    // strictly more bytes than are still needed.
+    memcpy(ptr, random_bytes_ptr, size_left_for_req);
+    random_bytes_ptr+=size_left_for_req;
+    random_bytes_left-=size_left_for_req;
+    return rt;
+  }
+
+  /*
+   * Fills `buffer` with `size` bytes of AES-CTR output by encrypting the
+   * buffer in place with this object's key and counter; the pre-filled
+   * random_bytes pool is not consumed.  The output is the keystream XORed
+   * with whatever `buffer` held on entry.
+   *
+   * Returns SGX_SUCCESS or the failing SGX status.
+   */
+  sgx_status_t PRB_buffer::getBulkRandomBytes(unsigned char *buffer, size_t size) {
+    sgx_status_t rt = SGX_SUCCESS;
+
+    // The key and counter are only drawn from sgx_read_rand inside
+    // init_PRB_buffer; make sure that has happened before using them.
+    if(initialized==false) {
+      rt = init_PRB_buffer();
+      if(rt!=SGX_SUCCESS)
+        return rt;
+    }
+
+    rt = sgx_aes_ctr_encrypt(random_seed, (const uint8_t*) buffer, size,
+          (uint8_t*) counter, CTR_INC_BITS, buffer);
+    *(uint64_t*)counter += 1;
+    
+    if(rt!=SGX_SUCCESS){
+      printf("Failed sgx_aes_ctr_encrypt (%x) in getBulkRandomBytes [%p %p %lu %p %d %p]\n", rt, random_seed, (const uint8_t*) buffer, size, (uint8_t*) counter, CTR_INC_BITS, buffer);
+      return rt;
+    }
+    return rt;
+  }
+
+
+  // Seeds the global bulk generator: draws the AES-CTR key
+  // (bulk_random_seed) and the starting counter (bulk_counter) from
+  // sgx_read_rand, then sets bulk_initialized.  Returns SGX_SUCCESS or the
+  // status of the sgx_read_rand call that failed.
+  sgx_status_t initialize_BRB() {
+    sgx_status_t status =
+        sgx_read_rand((unsigned char*) bulk_random_seed, SGX_AESCTR_KEY_SIZE);
+    if(status != SGX_SUCCESS){
+      printf("initialize_BRB(): Failed sgx_read_rand (%x)", status);
+      return status;
+    }
+
+    status = sgx_read_rand((unsigned char*) bulk_counter, SGX_AESCTR_KEY_SIZE);
+    if(status != SGX_SUCCESS){
+      printf("initialize_BRB(): Failed sgx_read_rand (%x)", status);
+      return status;
+    }
+
+    bulk_initialized = true;
+    return status;
+  }
+
+  /*
+   * Fills `buffer` with `size` bytes of AES-CTR output from the global bulk
+   * generator (bulk_random_seed / bulk_counter), seeding it on first use.
+   * The buffer is encrypted in place, so the output is the keystream XORed
+   * with whatever `buffer` held on entry.
+   *
+   * Returns SGX_SUCCESS or the failing SGX status.
+   */
+  sgx_status_t getBulkRandomBytes(unsigned char *buffer, size_t size) {
+
+    sgx_status_t rt = SGX_SUCCESS;
+    if(bulk_initialized == false){
+      rt = initialize_BRB();
+      // Never generate output from an unseeded key/counter
+      if(rt!=SGX_SUCCESS){
+        return rt;
+      }
+    }
+    rt = sgx_aes_ctr_encrypt(bulk_random_seed, (const uint8_t*) buffer, size,
+          (uint8_t*) bulk_counter, CTR_INC_BITS, buffer);
+    
+    if(rt!=SGX_SUCCESS){
+      printf("getBulkRandomBytes: Failed sgx_aes_ctr_encrypt (%x) in getBulkRandomBytes [%p %p %lu %p %d %p]\n", rt, bulk_random_seed, (const uint8_t*) buffer, size, (uint8_t*) bulk_counter, CTR_INC_BITS, buffer);
+      return rt;
+    }
+    return rt;
+  }
+#else
+  // Without USE_PRB, random bytes come straight from sgx_read_rand; its
+  // status is returned unchanged.
+  sgx_status_t getRandomBytes(unsigned char *random_bytes, size_t size) {
+    return sgx_read_rand((unsigned char*) random_bytes, size);
+  }
+#endif
+
+// Comparator on the leading uint64_t key of two packets: returns packet_1
+// when its key is strictly smaller, and packet_2 otherwise (including ties).
+unsigned char* compare_keys(unsigned char *packet_1, unsigned char *packet_2){
+  const uint64_t key_1 = *((uint64_t*)(packet_1));
+  const uint64_t key_2 = *((uint64_t*)(packet_2));
+  return (key_1 < key_2) ? packet_1 : packet_2;
+}
+
+/*
+ * Merges the two adjacent sorted runs data[l..m] and data[m+1..r] (element
+ * indices, inclusive; each element is data_size bytes) into one sorted run
+ * in place, using a temporary heap buffer.
+ *
+ * comparator(a, b) must return whichever of its two arguments sorts first.
+ * An element is taken from the left run only when the comparator returns
+ * the left pointer, so ties are resolved however the comparator decides.
+ *
+ * If the temporary buffer cannot be allocated, an error is printed and the
+ * program aborts instead of writing through a NULL pointer.
+ */
+void merge(unsigned char *data, size_t data_size, size_t l, size_t m, size_t r, unsigned char* (*comparator)(unsigned char*, unsigned char*)){
+  const size_t s1 = m+1;          // one past the last index of the left run
+  const size_t s2 = r+1;          // one past the last index of the right run
+  const size_t merged_bytes = (r-l+1)*data_size;
+
+  unsigned char *merged_array = (unsigned char*) malloc(merged_bytes);
+  if (merged_array == NULL && merged_bytes != 0) {
+    printf("merge: failed to allocate %lu bytes\n", (unsigned long) merged_bytes);
+    abort();
+  }
+
+  size_t i = l;      // next unread element of the left run
+  size_t j = m+1;    // next unread element of the right run
+  size_t k = 0;      // next free slot in merged_array
+
+  while (i < s1 && j < s2) {
+    unsigned char *left_pkt = data+(i*data_size);
+    unsigned char *smaller_pkt = comparator(left_pkt, data+(j*data_size));
+    memcpy(merged_array+(k*data_size), smaller_pkt, data_size);
+    // Advance whichever run the chosen element came from
+    if(smaller_pkt == left_pkt){
+      i++;
+    }
+    else{
+      j++;
+    }
+    k++;
+  }
+
+  // Copy whatever remains of the left run
+  while (i < s1) {
+    memcpy(merged_array + (k*data_size), data+(i*data_size), data_size);
+    i++;
+    k++;
+  }
+
+  // Copy whatever remains of the right run
+  while (j < s2) {
+    memcpy(merged_array + (k*data_size), data+(j*data_size), data_size);
+    j++;
+    k++;
+  }
+
+  memcpy(data+(l*data_size), merged_array, merged_bytes);
+  free(merged_array);
+}
+
+// Recursive top-down merge sort of the elements data[start_index..end_index]
+// (inclusive element indices, data_size bytes per element).  comparator
+// returns whichever of its two arguments sorts first.
+void mergeSort(unsigned char *data, size_t data_size, size_t start_index, size_t end_index, unsigned char* (*comparator)(unsigned char*, unsigned char*)){
+  // A range of zero or one element is already sorted
+  if(start_index >= end_index){
+    return;
+  }
+
+  const size_t mid = start_index + (end_index-start_index)/2;
+
+  // Sort each half, then merge the two sorted halves
+  mergeSort(data, data_size, start_index, mid, comparator);
+  mergeSort(data, data_size, mid+1, end_index, comparator);
+  merge(data, data_size, start_index, mid, end_index, comparator);
+}
+
+
+// Sorts data[start_index..end_index] exactly as mergeSort does: this
+// function splits the range at the same midpoint, sorts both halves with
+// mergeSort and merges them, which is precisely one step of mergeSort
+// itself, so it simply forwards to it.
+void mergeSort_OPRM(unsigned char *data, size_t data_size, size_t start_index, size_t end_index, unsigned char* (*comparator)(unsigned char*, unsigned char*)){
+  mergeSort(data, data_size, start_index, end_index, comparator);
+}
+
+#if 0
+//Tight Compaction and Expansion utility functions for testing if a Block is real/dummy
+
+// Returns 1 when the block's leading 16-bit label equals UINT16_MAX,
+// 0 otherwise.
+uint8_t isBlockReal_16(unsigned char *block_ptr) {
+  const uint16_t *label_ptr = (uint16_t *)(block_ptr);
+  return (*label_ptr == UINT16_MAX) ? 1 : 0;
+}
+
+// Returns 1 when the block's leading 32-bit label equals UINT32_MAX,
+// 0 otherwise.
+uint8_t isBlockReal_32(unsigned char *block_ptr) {
+  const uint32_t *label_ptr = (uint32_t *)(block_ptr);
+  return (*label_ptr == UINT32_MAX) ? 1 : 0;
+}
+
+// Returns 1 when the block's leading 64-bit label equals UINT64_MAX,
+// 0 otherwise.
+uint8_t isBlockReal_64(unsigned char *block_ptr) {
+  const uint64_t *label_ptr = (uint64_t *)(block_ptr);
+  return (*label_ptr == UINT64_MAX) ? 1 : 0;
+}
+
+// Dispatches a swap of two buffer_size-byte buffers, controlled by flag, to
+// the size-specific helpers oswap_buffer_16x and oswap_buffer_byte.
+// NOTE(review): the helpers are defined elsewhere; presumably they swap
+// obliviously when flag is set -- confirm against oasm_lib.
+void oswap_buffer(unsigned char *dest, unsigned char *source, uint32_t buffer_size, uint8_t flag){
+  #ifdef COUNT_OSWAPS
+    // Instrumentation: count this call in OSWAP_COUNTER
+    uint64_t *ltvp = &OSWAP_COUNTER;
+    FOAV_SAFE2_CNTXT(oswap_buffer, buffer_size, *ltvp)
+    OSWAP_COUNTER++;
+  #endif
+  if(buffer_size%16==0){
+    // Whole multiple of 16 bytes: handled entirely by the 16x helper
+    oswap_buffer_16x(dest, source, buffer_size, flag);
+  } else if(buffer_size==8){
+    oswap_buffer_byte(dest, source, buffer_size, flag);
+  }
+  else{
+    // First 8 bytes via the byte helper, the remainder via the 16x helper.
+    // NOTE(review): this assumes buffer_size-8 is a multiple of 16 and that
+    // buffer_size >= 8; other sizes are not checked here.
+    oswap_buffer_byte(dest, source, 8, flag);
+    oswap_buffer_16x(dest+8, source+8, buffer_size-8, flag);
+  }
+}
+
+
+// Self-test for oswap_buffer<OSWAP_16X> on two block_size-byte buffers:
+// with the flag clear both buffers must be unchanged, with the flag set
+// their contents must be exchanged.  Prints each failed check and returns
+// true (1) only if all four checks pass.
+uint8_t isCorrect16x(uint32_t block_size){
+  printf("Entered Correctness Tester!!!\n");
+  bool is_correct = true;
+  // b1/b2 are the buffers under test; b3/b4 keep pristine copies of them
+  unsigned char *b1 = new unsigned char[block_size];
+  unsigned char *b2 = new unsigned char[block_size];
+  unsigned char *b3 = new unsigned char[block_size];
+  unsigned char *b4 = new unsigned char[block_size];
+  
+  // The status returned by getBulkRandomBytes is not checked here
+  getBulkRandomBytes(b1, block_size);
+  getBulkRandomBytes(b2, block_size);
+  memcpy(b3, b1, block_size);
+  memcpy(b4, b2, block_size);
+
+  // Tests 1 and 2: flag clear, so neither buffer may change
+  bool swap_flag = false;
+ 
+  oswap_buffer<OSWAP_16X>(b1, b2, block_size, swap_flag);
+   
+  if(memcmp(b1, b3, block_size)){
+    is_correct=false;
+    printf("Failed Test 1\n");
+  }
+    
+  if(memcmp(b2, b4, block_size)){
+    is_correct=false;
+    printf("Failed Test 2\n");
+  }
+
+  // Restore the original contents before the second round
+  memcpy(b1, b3, block_size);
+  memcpy(b2, b4, block_size);
+
+  // Tests 3 and 4: flag set, so the buffers must have traded contents
+  swap_flag = true;
+  oswap_buffer<OSWAP_16X>(b1, b2, block_size, swap_flag);
+  if(memcmp(b1, b4, block_size)){
+    is_correct=false;
+    printf("Failed Test 3\n");
+  }
+    
+  if(memcmp(b2, b3, block_size)){
+    is_correct=false;
+    printf("Failed Test 4\n");
+  }
+  
+
+  delete []b1;
+  delete []b2; 
+  delete []b3;
+  delete []b4; 
+  if(is_correct){
+    printf("Correctness test SUCCESS! \n");
+    return true;
+  }
+  return false; 
+}
+
+
+// Self-test for oswap_buffer<OSWAP_8_16X> on two block_size-byte buffers:
+// with the flag clear both buffers must be unchanged, with the flag set
+// their contents must be exchanged.  Prints each failed check and returns
+// true (1) only if all four checks pass.
+uint8_t isCorrect8_16x(uint32_t block_size){
+  printf("Entered Correctness Tester!!!\n");
+  bool is_correct = true;
+  // b1/b2 are the buffers under test; b3/b4 keep pristine copies of them
+  unsigned char *b1 = new unsigned char[block_size];
+  unsigned char *b2 = new unsigned char[block_size];
+  unsigned char *b3 = new unsigned char[block_size];
+  unsigned char *b4 = new unsigned char[block_size];
+  
+  // The status returned by getBulkRandomBytes is not checked here
+  getBulkRandomBytes(b1, block_size);
+  getBulkRandomBytes(b2, block_size);
+  memcpy(b3, b1, block_size);
+  memcpy(b4, b2, block_size);
+
+  // Tests 1 and 2: flag clear, so neither buffer may change
+  bool swap_flag = false;
+ 
+  oswap_buffer<OSWAP_8_16X>(b1, b2, block_size, swap_flag);
+   
+  if(memcmp(b1, b3, block_size)){
+    is_correct=false;
+    printf("Failed Test 1\n");
+  }
+    
+  if(memcmp(b2, b4, block_size)){
+    is_correct=false;
+    printf("Failed Test 2\n");
+  }
+
+  // Restore the original contents before the second round
+  memcpy(b1, b3, block_size);
+  memcpy(b2, b4, block_size);
+
+  // Tests 3 and 4: flag set, so the buffers must have traded contents
+  swap_flag = true;
+  oswap_buffer<OSWAP_8_16X>(b1, b2, block_size, swap_flag);
+  if(memcmp(b1, b4, block_size)){
+    is_correct=false;
+    printf("Failed Test 3\n");
+  }
+    
+  if(memcmp(b2, b3, block_size)){
+    is_correct=false;
+    printf("Failed Test 4\n");
+  }
+  
+
+  delete []b1;
+  delete []b2; 
+  delete []b3;
+  delete []b4; 
+  if(is_correct){
+    printf("Correctness test SUCCESS! \n");
+    return true;
+  }
+  return false; 
+}
+
+
+// Exchanges the contents of two bucket_size-byte buckets with plain memcpy
+// calls, using the caller-provided temp_bucket (at least bucket_size bytes)
+// as scratch space.  On return temp_bucket holds the original bkt2.
+void swapBuckets(unsigned char *bkt1, unsigned char *bkt2, unsigned char *temp_bucket, size_t bucket_size) {
+  memcpy(temp_bucket, bkt2, bucket_size);
+  memcpy(bkt2, bkt1, bucket_size);
+  memcpy(bkt1, temp_bucket, bucket_size);
+}
+#endif
+
+/*** Thread pool implementation ***/
+
+/* Implements a restricted-model thread pool.  The restriction is that
+ * every thread is the "parent" of a number of other threads (and no
+ * thread has more than one parent).  Each thread can be dispatched and
+ * joined only by its parent, so there's no contention on the dispatch
+ * and join inter-thread communication.  A parent thread has to specify
+ * the exact thread id of the child thread it dispatches work to. */
+
+thread_local threadid_t g_thread_id = 0;
+
+/* Lifecycle state of one worker thread, stored in its control block. */
+enum threadstate_t {
+    /* Set by threadpool_init before launch, and by the worker on exit */
+    THREADSTATE_NONE,
+    /* Set by the worker just before it waits for a dispatch */
+    THREADSTATE_WAITING,
+    /* Set by threadpool_dispatch once a job has been handed over */
+    THREADSTATE_DISPATCHING,
+    /* Set by the worker just before it runs the dispatched function */
+    THREADSTATE_WORKING,
+    /* Set by threadpool_shutdown to ask a waiting worker to exit */
+    THREADSTATE_TERMINATE
+};
+
+/* Per-worker control block shared between a worker thread and its parent. */
+struct threadblock_t {
+    /* Pool-wide id of this worker (1-based; the main thread is 0) */
+    threadid_t threadid;
+    /* Current lifecycle state; changed while holding mutex */
+    threadstate_t state;
+    /* Handle from pthread_create; NULL if the thread was never launched */
+    pthread_t thread_handle;
+    /* Locked around accesses to state by the worker and its parent */
+    pthread_mutex_t mutex;
+    /* Signalled by dispatch/shutdown to wake the waiting worker */
+    pthread_cond_t dispatch_cond;
+    /* Job to run, and its argument, as set by threadpool_dispatch */
+    void *(*dispatch_func)(void *data);
+    void *dispatch_data;
+    /* Signalled by the worker when the job has finished */
+    pthread_cond_t join_cond;
+    /* Return value of the last job, handed out by threadpool_join */
+    void *ret_data;
+#ifdef COUNT_OSWAPS
+    /* Oswaps counted by the worker during the last job */
+    size_t num_oswaps;
+#endif
+};
+
+static threadblock_t *threadpool_control_blocks = NULL;
+static threadid_t threadpool_numthreads = 0;
+
+/* The main thread loop.
+ *
+ * Entry point of every worker thread; vdata is the worker's threadblock_t.
+ * The worker repeatedly marks itself WAITING, sleeps on dispatch_cond until
+ * its state is changed, runs the dispatched function (unless asked to
+ * terminate), stores the result in ret_data and signals join_cond.
+ * Always returns NULL. */
+static void* threadloop(void *vdata) {
+    threadblock_t *block = (threadblock_t *)vdata;
+
+    /* Initialize any per-thread state */
+    g_thread_id = block->threadid;
+    PRB_rand_bits = 0;
+    PRB_rand_bits_remaining = 0;
+
+    pthread_mutex_lock(&block->mutex);
+    while(1) {
+        /* Wait for work.  Condition variables may wake up spuriously, so
+         * keep waiting until dispatch or shutdown has really moved us out
+         * of the WAITING state; otherwise a stale or unset job would be
+         * run. */
+        block->state = THREADSTATE_WAITING;
+        while (block->state == THREADSTATE_WAITING) {
+            pthread_cond_wait(&block->dispatch_cond, &block->mutex);
+        }
+
+        if (block->state == THREADSTATE_TERMINATE) {
+            break;
+        }
+
+        /* Do the work (without holding the mutex, so the parent can
+         * inspect our state in the meantime) */
+        block->state = THREADSTATE_WORKING;
+        pthread_mutex_unlock(&block->mutex);
+        block->ret_data = (block->dispatch_func)(block->dispatch_data);
+
+#ifdef COUNT_OSWAPS
+        /* Account for the oswaps done in this thread */
+        block->num_oswaps = OSWAP_COUNTER;
+        OSWAP_COUNTER = 0;
+#endif
+
+        /* Signal the parent thread that we're done, and loop back to
+         * wait for more work. */
+        pthread_mutex_lock(&block->mutex);
+        pthread_cond_signal(&block->join_cond);
+    }
+    block->state = THREADSTATE_NONE;
+    pthread_mutex_unlock(&block->mutex);
+
+    return NULL;
+}
+
+/* Create the threadpool, with numthreads-1 additional threads (numbered
+ * 1 through numthreads-1) in addition to the current "main" thread
+ * (numbered 0). Returns 0 on success, -1 on failure. It is allowed, but
+ * not very useful, to pass 1 here.
+ *
+ * NOTE(review): this function returns as soon as the threads have been
+ * created; it does not wait for them to reach THREADSTATE_WAITING, while
+ * threadpool_dispatch rejects a thread that is not yet WAITING and
+ * threadpool_shutdown only terminates WAITING threads.  A dispatch or
+ * shutdown issued immediately afterwards may therefore race with thread
+ * start-up -- confirm. */
+int threadpool_init(threadid_t numthreads) {
+    /* Per-thread state of the calling ("main") thread */
+    g_thread_id = 0;
+    PRB_rand_bits = 0;
+    PRB_rand_bits_remaining = 0;
+
+    if (numthreads < 1) {
+        return -1;
+    } else if (numthreads == 1) {
+        /* Only the main thread: no control blocks are needed */
+        threadpool_numthreads = 1;
+        return 0;
+    }
+
+    /* We don't actually create a thread control block for the main
+     * thread 0, so the internal indexing into this array will be that
+     * thread i's control block lives at index i-1 in this array. */
+    threadpool_control_blocks = new threadblock_t[numthreads-1];
+    /* NOTE(review): a plain new-expression normally reports failure by
+     * throwing rather than returning NULL, so this check is presumably
+     * never taken -- confirm for the enclave runtime. */
+    if (threadpool_control_blocks == NULL) {
+        return -1;
+    }
+    threadpool_numthreads = numthreads;
+
+    /* Init each thread control block.  A failed launch does not stop the
+     * loop: every block still gets its mutex/cond variables initialized,
+     * which threadpool_shutdown below relies on. */
+    bool thread_create_failure = false;
+    for (threadid_t i = 0; i < numthreads-1; ++i) {
+        threadblock_t *block = threadpool_control_blocks + i;
+        block->threadid = i+1;
+        block->state = THREADSTATE_NONE;
+        pthread_mutex_init(&block->mutex, NULL);
+        pthread_cond_init(&block->dispatch_cond, NULL);
+        pthread_cond_init(&block->join_cond, NULL);
+        block->thread_handle = NULL;
+        int create_ret =
+                pthread_create(&block->thread_handle, NULL, threadloop, block);
+        if (create_ret) {
+            thread_create_failure = true;
+            printf("Failed to launch thread %lu; ret=%d\n", i+1, create_ret);
+        }
+    }
+
+    /* Tear the whole pool down again if any thread failed to launch */
+    if (thread_create_failure) {
+        threadpool_shutdown();
+        return -1;
+    }
+
+    return 0;
+}
+
+/* Ask all the threads to terminate, wait for that to happen, and clean
+ * up.  Safe to call when the pool was never created or has only the main
+ * thread. */
+void threadpool_shutdown() {
+    /* Note that this function may be called when some threads failed to
+     * launch at all in threadpool_init. In that case, the thread field
+     * in the thread's control block will be NULL.  The mutex/cond
+     * variables will still have been initialized, however, and need
+     * cleaning. */
+    if (threadpool_numthreads == 0) {
+        /* Nothing to do */
+        return;
+    }
+    if (threadpool_numthreads == 1) {
+        /* Almost nothing to do */
+        threadpool_numthreads = 0;
+        return;
+    }
+    for (threadid_t i=0;i<threadpool_numthreads-1; ++i) {
+        threadblock_t *block = threadpool_control_blocks + i;
+        pthread_mutex_lock(&block->mutex);
+        if (block->state == THREADSTATE_DISPATCHING ||
+                block->state == THREADSTATE_WORKING) {
+            /* There's a thread actively running (or about to pick up a
+             * dispatched job)?  Wait for it to finish. */
+            pthread_mutex_unlock(&block->mutex);
+            threadpool_join(i+1, NULL);
+            pthread_mutex_lock(&block->mutex);
+        }
+        threadstate_t final_state;
+        if (block->state == THREADSTATE_WAITING) {
+            /* Tell the thread to exit */
+            block->state = THREADSTATE_TERMINATE;
+            pthread_mutex_unlock(&block->mutex);
+            pthread_cond_signal(&block->dispatch_cond);
+            pthread_join(block->thread_handle, NULL);
+            block->thread_handle = NULL;
+            /* The thread has exited, so its state can be read freely */
+            final_state = block->state;
+        } else {
+            /* Nothing to terminate; the mutex must still be released
+             * before it can be destroyed below. */
+            final_state = block->state;
+            pthread_mutex_unlock(&block->mutex);
+        }
+        if (final_state != THREADSTATE_NONE) {
+            printf("Unexpected state on thread %lu during shutdown: %u\n", i+1, final_state);
+        }
+        /* Every block had its mutex/cond variables initialized in
+         * threadpool_init, so every block gets them destroyed -- not
+         * only the ones found in an unexpected state. */
+        pthread_cond_destroy(&block->dispatch_cond);
+        pthread_cond_destroy(&block->join_cond);
+        pthread_mutex_destroy(&block->mutex);
+    }
+    delete[] threadpool_control_blocks;
+    threadpool_control_blocks = NULL;
+    threadpool_numthreads = 0;
+}
+
+/* Dispatch some work to a particular thread in the thread pool.
+ *
+ * threadid must name a worker thread (1 through numthreads-1) that is
+ * currently waiting for work; func(data) is then run on that thread and
+ * its result can be collected with threadpool_join.  If threadid is out of
+ * range or the thread is not waiting, an error is printed and nothing is
+ * dispatched. */
+void threadpool_dispatch(threadid_t threadid, void *(*func)(void*),
+        void *data) {
+    /* Thread 0 is the main thread and has no control block; ids at or
+     * beyond threadpool_numthreads do not exist.  Either would index
+     * outside the control block array. */
+    if (threadid < 1 || threadid >= threadpool_numthreads) {
+        printf("Cannot dispatch to invalid thread id %lu\n", threadid);
+        return;
+    }
+    threadblock_t *block = threadpool_control_blocks + (threadid-1);
+    pthread_mutex_lock(&block->mutex);
+    if (block->state != THREADSTATE_WAITING) {
+        printf("Thread %lu not in expected WAITING state: %u\n",
+            threadid, block->state);
+        pthread_mutex_unlock(&block->mutex);
+        return;
+    }
+    block->dispatch_func = func;
+    block->dispatch_data = data;
+    block->state = THREADSTATE_DISPATCHING;
+    pthread_mutex_unlock(&block->mutex);
+    /* Tell the thread there's work to do */
+    pthread_cond_signal(&block->dispatch_cond);
+}
+
+/* Join a thread: block until the job dispatched to worker `threadid` has
+ * finished.  If resp is non-NULL, the job's return value is stored in
+ * *resp.  If threadid is out of range an error is printed and *resp is
+ * left untouched. */
+void threadpool_join(threadid_t threadid, void **resp) {
+    /* Thread 0 is the main thread and has no control block; ids at or
+     * beyond threadpool_numthreads do not exist. */
+    if (threadid < 1 || threadid >= threadpool_numthreads) {
+        printf("Cannot join invalid thread id %lu\n", threadid);
+        return;
+    }
+    threadblock_t *block = threadpool_control_blocks + (threadid-1);
+
+    pthread_mutex_lock(&block->mutex);
+    /* Did the thread finish already? */
+    if (block->state == THREADSTATE_DISPATCHING ||
+            block->state == THREADSTATE_WORKING) {
+        /* Wait until the thread completes.  The worker only releases the
+         * mutex again once it is back in the WAITING state, so re-check
+         * the state after every wakeup to ride out spurious ones. */
+        while (block->state == THREADSTATE_DISPATCHING ||
+                block->state == THREADSTATE_WORKING) {
+            pthread_cond_wait(&block->join_cond, &block->mutex);
+        }
+    } else if (block->state != THREADSTATE_WAITING) {
+        printf("Thread %lu in unexpected state (not WORKING or WAITING) on join: %u\n",
+            threadid, block->state);
+    }
+    if (resp) {
+        *resp = block->ret_data;
+    }
+#ifdef COUNT_OSWAPS
+    /* Fold the worker's oswap count into this thread's counter */
+    uint64_t *ltvp = &OSWAP_COUNTER;
+    FOAV_SAFE_CNTXT(oswap_buffer, *ltvp)
+    OSWAP_COUNTER += block->num_oswaps;
+    block->num_oswaps = 0;
+#endif
+    pthread_mutex_unlock(&block->mutex);
+}

+ 285 - 0
Enclave/OblivAlgs/utils.hpp

@@ -0,0 +1,285 @@
+#ifndef __UTILS_HPP__
+#define __UTILS_HPP__
+
+  #ifndef BEFTS_MODE
+    #include <string.h>
+    #include <vector>
+
+    #include <stdarg.h>
+    #include <stdio.h>      /* vsnprintf */
+    #include "Enclave_t.h"  /* print_string */
+    #include <stdlib.h>
+    #include <stdint.h>
+    #include <math.h>
+    #include "sgx_thread.h"
+    #include <sgx_tcrypto.h>
+    #include "sgx_trts.h"
+    #include <assert.h>
+
+#if 0
+    #include <openssl/ec.h>
+    #include <openssl/bn.h>
+    #include <openssl/rsa.h>
+    #include <openssl/evp.h>
+    #include <openssl/err.h>
+    #include <openssl/rand.h>
+#endif
+
+    #include "CONFIG.h"
+
+    #include "oasm_lib.h" 
+  #else
+    #include<pthread.h>
+  #endif
+
+  #define CTR_INC_BITS 128   
+
+  // Returns the smaller of two sizes (b when they are equal).
+  inline size_t min(size_t a, size_t b){
+    if (a < b) {
+      return a;
+    }
+    return b;
+  }
+
+  int compare(const void *buf1, const void *buf2);
+
+  // High-level oswap_buffer function that handles all buffer sizes internally
+  void oswap_buffer(unsigned char *dest, unsigned char *source, uint32_t buffer_size, uint8_t flag);
+
+
+#if 0
+  // Encrypt/Decrypt Buffers
+  size_t decryptBuffer(unsigned char *buffer, uint64_t N, size_t block_size,
+      unsigned char **decrypted_buffer);
+  size_t encryptBuffer(unsigned char *buffer, uint64_t N, size_t block_size,
+      unsigned char *encrypted_buffer);
+
+  size_t decryptBuffer_attachRTags(unsigned char *encrypted_buffer, uint64_t N, 
+        size_t encrypted_block_size, unsigned char *random_bytes, unsigned char **decrypted_buffer);
+  size_t encryptBuffer_removeRTags(unsigned char *decrypted_buffer, uint64_t N, 
+        size_t decrypted_block_size, unsigned char *encrypted_buffer);
+
+  size_t decryptBuffer_attachRTags_addDummies(unsigned char *encrypted_buffer, uint64_t N, 
+          uint64_t N_prime, uint64_t B, uint64_t Z, size_t encrypted_block_size, unsigned char *random_bytes,
+          unsigned char **decrypted_buffer);
+#endif
+
+  // Display/Debug functions
+  
+  #ifndef BEFTS_MODE
+    void printf(const char *fmt, ...);
+  #endif
+
+  // Prints the first N elements of `array` on one line, each followed by
+  // ", ", then a newline.  Every element is formatted with "%d", whatever
+  // the element type is.
+  template <typename t>
+  void print_array(t array, size_t N) {
+    size_t idx = 0;
+    while (idx < N) {
+      printf("%d, ", array[idx]);
+      ++idx;
+    }
+    printf("\n");
+  }
+
+  unsigned long printf_with_rtclock(const char *fmt, ...);
+  unsigned long printf_with_rtclock_diff(unsigned long before, const char *fmt, ...);
+#if 0
+  void displayPacket(unsigned char* packet_in);
+  void displayZeroEncryptedPacket(unsigned char* packet_in);
+  void displayEncryptedPacket(unsigned char* packet_in);
+  void displayORPPacket(unsigned char* packet_in, size_t block_size);
+  void displayKeysInBuffer(unsigned char *buffer, size_t N, size_t block_size);
+
+  // Packet processing functions
+  bool isDummy(unsigned char *ptr_to_serialized_packet);
+  void setDummy(unsigned char *ptr_to_serialized_packet);
+
+  // Test Packet Dummy
+  bool isORPDummy(unsigned char *ptr_to_serialized_packet);
+  void setORPDummy(unsigned char *ptr_to_serialized_packet);
+
+  // BORP utility
+  size_t packetsConsumedUptoMSN(signed long msn_no, size_t msns_with_extra_packets, size_t packets_per_entry_msn);
+#endif
+
+  // Other utility functions
+  int calculatelog2(uint64_t value);
+  int calculatelog2_floor(uint64_t value);
+  uint64_t pow2_lt(uint64_t N);
+  uint64_t pow2_gt(uint64_t N);
+
+  // Merges the sorted runs data[l..m] and data[m+1..r] of data_size-byte
+  // elements.  The parameter list matches the definition in the .cpp file,
+  // which takes data_size as its second argument.
+  void merge(unsigned char *data, size_t data_size, size_t l, size_t m, size_t r, unsigned char* (*comparator)(unsigned char*, unsigned char*));
+  void mergeSort(unsigned char *data, size_t data_size, size_t start_index, size_t end_index, unsigned char* (*comparator)(unsigned char*, unsigned char*));
+  unsigned char* compare_keys(unsigned char *packet_1, unsigned char *packet_2);
+
+#if 0
+  // For TightCompaction & Expansion:
+  uint8_t isBlockReal_16(unsigned char *block_ptr);
+  uint8_t isBlockReal_32(unsigned char *block_ptr);
+  uint8_t isBlockReal_64(unsigned char *block_ptr);
+
+  // Correctness test for new inline oswap functions
+  uint8_t isCorrect16x(uint32_t block_size);
+  uint8_t isCorrect8_16x(uint32_t block_size);
+
+  // For BOS_TC:
+  void swapBuckets(unsigned char *bkt1, unsigned char *bkt2, unsigned char *temp_bucket, size_t bucket_size);
+#endif
+
+
+  /*** Thread pool implementation ***/
+
+  /* Implements a restricted-model thread pool.  The restriction is that
+   * every thread is the "parent" of a number of other threads (and no
+   * thread has more than one parent).  Each thread can be dispatched and
+   * joined only by its parent, so there's no contention on the dispatch
+   * and join inter-thread communication.  A parent thread has to specify
+   * the exact thread id of the child thread it dispatches work to. */
+
+  typedef size_t threadid_t;
+  extern thread_local threadid_t g_thread_id;
+
+  /* Create the threadpool, with numthreads-1 additional threads (numbered
+   * 1 through numthreads-1) in addition to the current "main" thread
+   * (numbered 0). Returns 0 on success, -1 on failure. It is allowed, but
+   * not very useful, to pass 1 here. */
+  int threadpool_init(threadid_t numthreads);
+
+  /* Ask all the threads to terminate, wait for that to happen, and clean
+   * up. */
+  void threadpool_shutdown();
+
+  /* Dispatch some work to a particular thread in the thread pool. */
+  void threadpool_dispatch(threadid_t threadid, void *(*func)(void*), void *data);
+
+  /* Join a thread */
+  void threadpool_join(threadid_t threadid, void **resp);
+
+  // PRB = PseudoRandomBytes
+  #ifdef USE_PRB
+    /* Per-thread pseudorandom byte source: an AES-CTR keystream, seeded
+     * from sgx_read_rand, that is buffered in random_bytes and handed out
+     * piecemeal. */
+    class PRB_buffer{ 
+      private:
+        // AES-CTR key material, filled from sgx_read_rand on first use.
+        // NOTE(review): this declares SGX_AESCTR_KEY_SIZE keys, but the
+        // visible code only fills and uses the first one -- confirm.
+        sgx_aes_ctr_128bit_key_t random_seed[SGX_AESCTR_KEY_SIZE];
+        // AES-CTR counter block
+        unsigned char counter[SGX_AESCTR_KEY_SIZE];
+        // Pool of pre-generated pseudorandom bytes
+        unsigned char random_bytes[PRB_BUFFER_SIZE];
+        // Next unused byte of the pool, and how many unused bytes remain.
+        // Both start out as "empty pool" so they are never read while
+        // indeterminate, even if seeding fails.
+        unsigned char *random_bytes_ptr = NULL;
+        int64_t random_bytes_left = 0;
+        uint64_t req_ctr = 0; 
+        // Set once the key and counter have been drawn
+        bool initialized = false;
+
+      public:
+        PRB_buffer();
+        ~PRB_buffer();
+        /* Seeds the generator on first use and fills the pool with
+         buffer_size pseudorandom bytes. */
+        sgx_status_t init_PRB_buffer(uint32_t buffer_size);
+        /*  Intended for getting random bytes of size << PRB_BUFFER_SIZE at a time.
+         Draws random bytes from the (typically) pre-filled random_bytes[PRB_BUFFER_SIZE] 
+         buffer, refilling random_bytes[PRB_BUFFER_SIZE] when the call uses up all the
+         PRB stored in the buffer. */
+        sgx_status_t getRandomBytes(unsigned char *random_bytes, size_t size);
+        /* Intended for getting random bytes of sizes > PRB_BUFFER_SIZE at a time.
+         Populates the random_bytes buffer directly with output of SGX_AES_CTR_ENCRYPT, without
+         touching the pre-filled random_bytes[PRB_BUFFER_SIZE].
+        */
+        sgx_status_t getBulkRandomBytes(unsigned char *random_bytes, size_t size);
+    };
+    extern PRB_buffer* PRB_pool;
+
+    // Spawn a PRB pool for each thread
+    void PRB_pool_init(int nthreads);
+    // Cleanup PRBPool
+    void PRB_pool_shutdown();
+    
+    // Fills random_bytes with `size` pseudorandom bytes taken from the
+    // calling thread's PRB_buffer, PRB_pool[g_thread_id], and returns that
+    // buffer's status.  The index is not range-checked here, so
+    // PRB_pool_init must have allocated an entry for this thread id.
+    inline sgx_status_t getRandomBytes(unsigned char *random_bytes, size_t size) {
+      FOAV_SAFE_CNTXT(PRB, size)
+      FOAV_SAFE_CNTXT(PRB, g_thread_id)
+      return((PRB_pool[g_thread_id]).getRandomBytes(random_bytes, size));
+    }
+
+    // Return a random bit
+    extern thread_local uint64_t PRB_rand_bits;
+    extern thread_local uint32_t PRB_rand_bits_remaining;
+    // Returns one pseudorandom bit.  Bits are served from the thread-local
+    // 64-bit word PRB_rand_bits, lowest bit first; when none remain, the
+    // word is refilled with 8 bytes from getRandomBytes (whose status is not
+    // checked here).
+    inline bool getRandomBit() {
+        FOAV_SAFE_CNTXT(getRandomBit, PRB_rand_bits_remaining)
+        if (PRB_rand_bits_remaining == 0) {
+            // Out of buffered bits: fetch a fresh 64-bit word
+            getRandomBytes((unsigned char *)&PRB_rand_bits,
+                sizeof(PRB_rand_bits));
+            PRB_rand_bits_remaining = 64;
+        }
+        // Hand out the lowest bit and shift it away
+        bool ret = PRB_rand_bits & 1;
+        PRB_rand_bits >>= 1;
+        PRB_rand_bits_remaining -= 1;
+        return ret;
+    }
+
+
+    sgx_status_t initialize_BRB();
+    sgx_status_t getBulkRandomBytes(unsigned char *buffer, size_t size);
+  #else
+    sgx_status_t getRandomBytes(unsigned char *random_bytes, size_t size);
+  #endif
+
+#if 0
+  #include "SortingNetwork/SortingNetwork.hpp"
+  
+  void generateSortPermutation_OA(uint32_t N, unsigned char *buffer, size_t block_size, uint32_t *permutation);
+  void generateSortPermutation_DJB(size_t N, unsigned char *buffer, size_t block_size, size_t *permutation);
+  /*
+    Generate a random permutation of range(N), and return it in
+  random_permutation
+      
+    - random_permutation: a pointer to a uint64_t array. The function expects
+      this array to have been initialized already and populates it with the
+      random permutation.
+    - N : the number of elements (and correspondingly the MAX+1 value) of the
+      returned array.
+  */
+
+  // Fills random_permutation[0..N-1] with the identity 0,...,N-1 and then
+  // shuffles it with RecursiveShuffle_M2.
+  template <typename T>
+  void generateRandomPermutation(size_t N, T *random_permutation){
+    //Initialize random permutation as 0,...,N-1
+    FOAV_SAFE_CNTXT(GRP, N)
+    for(T i=0; i<N; i++) {
+      FOAV_SAFE_CNTXT(i, N)
+      random_permutation[i]=i;
+    }
+
+    //Convert it to a random permutation of [0,N-1]
+    RecursiveShuffle_M2((unsigned char*) random_permutation, N, sizeof(T));
+    // Debugging aid (disabled): a fixed 32-element permutation
+    /*
+    random_permutation[0] = 8;
+    random_permutation[1] = 12;
+    random_permutation[2] = 6;
+    random_permutation[3] = 29;
+    random_permutation[4] = 22;
+    random_permutation[5] = 0;
+    random_permutation[6] = 15;
+    random_permutation[7] = 24;
+    random_permutation[8] = 30;
+    random_permutation[9] = 19;
+    random_permutation[10] = 13;
+    random_permutation[11] = 28;
+    random_permutation[12] = 7;
+    random_permutation[13] = 17;
+    random_permutation[14] = 14;
+    random_permutation[15] = 27;
+    random_permutation[16] = 18;
+    random_permutation[17] = 25;
+    random_permutation[18] = 5;
+    random_permutation[19] = 10;
+    random_permutation[20] = 23;
+    random_permutation[21] = 2;
+    random_permutation[22] = 4;
+    random_permutation[23] = 9;
+    random_permutation[24] = 26;
+    random_permutation[25] = 1;
+    random_permutation[26] = 21;
+    random_permutation[27] = 3;
+    random_permutation[28] = 20;
+    random_permutation[29] = 16;
+    random_permutation[30] = 11;
+    random_permutation[31] = 31;
+    */
+    // Debugging aid (disabled): dump the generated permutation
+    /*
+    printf("\nPermutation output\n");
+    for(T i=0; i<N; i++)
+      printf("%ld, ", random_permutation[i]);
+    printf("\n");
+    */
+  }
+#endif
+
+  #define __OSORT_UTILS__
+#endif

+ 0 - 53
Enclave/utils.cpp

@@ -1,53 +0,0 @@
-#include <stdio.h>
-
-#include "utils.hpp"
-
-/*
- * printf: formats into a BUFSIZ-byte local buffer (longer output is truncated)
- *   and invokes the ocall_print_string OCALL to display it on the terminal.
- */
-void printf(const char *fmt, ...)
-{
-    char buf[BUFSIZ] = {'\0'};
-    va_list ap;
-    va_start(ap, fmt);
-    vsnprintf(buf, BUFSIZ, fmt, ap); // bounded by BUFSIZ; anything beyond that is dropped
-    va_end(ap);
-    ocall_print_string(buf);
-}
-
-/*
- * printf_with_rtclock: formats into a BUFSIZ-byte local buffer (longer output
- *   is truncated), invokes the ocall_print_string_with_rtclock OCALL to display
- *   it on the terminal with a timestamp, and returns that timestamp.
- */
-unsigned long printf_with_rtclock(const char *fmt, ...)
-{
-    unsigned long ret; // filled in by the OCALL below through the pointer argument
-    char buf[BUFSIZ] = {'\0'};
-    va_list ap;
-    va_start(ap, fmt);
-    vsnprintf(buf, BUFSIZ, fmt, ap); // bounded by BUFSIZ; anything beyond that is dropped
-    va_end(ap);
-    ocall_print_string_with_rtclock(&ret, buf); // NOTE(review): call result ignored; ret is uninitialized if the OCALL never writes it
-    return ret;
-}
-
-/*
- * printf_with_rtclock_diff: formats into a BUFSIZ-byte local buffer (longer
- *   output is truncated) and invokes the ocall_print_string_with_rtclock_diff
- *   OCALL, passing `before`, to display it on the terminal with a timestamp and
- *   the difference from the before timestamp.  Returns the new timestamp.
- */
-unsigned long printf_with_rtclock_diff(unsigned long before, const char *fmt, ...)
-{
-    unsigned long ret; // filled in by the OCALL below through the pointer argument
-    char buf[BUFSIZ] = {'\0'};
-    va_list ap;
-    va_start(ap, fmt);
-    vsnprintf(buf, BUFSIZ, fmt, ap); // bounded by BUFSIZ; anything beyond that is dropped
-    va_end(ap);
-    ocall_print_string_with_rtclock_diff(&ret, buf, before); // NOTE(review): call result ignored; ret is uninitialized if the OCALL never writes it
-    return ret;
-}
-

+ 0 - 11
Enclave/utils.hpp

@@ -1,11 +0,0 @@
-#ifndef __UTILS_HPP__
-#define __UTILS_HPP__
-
-#include "Enclave_t.h"
-
-void printf(const char *fmt, ...);
-unsigned long printf_with_rtclock(const char *fmt, ...);
-unsigned long printf_with_rtclock_diff(unsigned long before,
-    const char *fmt, ...);
-
-#endif

+ 55 - 9
Makefile

@@ -50,7 +50,7 @@ ifeq ($(SGX_ARCH), x86)
 	SGX_ENCLAVE_SIGNER := $(SGX_SDK)/bin/x86/sgx_sign
 	SGX_EDGER8R := $(SGX_SDK)/bin/x86/sgx_edger8r
 else
-	SGX_COMMON_FLAGS := -m64
+	SGX_COMMON_FLAGS := -m64 -maes -msse2
 	SGX_LIBRARY_PATH := $(SGX_SDK)/lib64
 	SGX_ENCLAVE_SIGNER := $(SGX_SDK)/bin/x64/sgx_sign
 	SGX_EDGER8R := $(SGX_SDK)/bin/x64/sgx_edger8r
@@ -128,8 +128,8 @@ else
 endif
 Crypto_Library_Name := sgx_tcrypto
 
-Enclave_Cpp_Files := $(wildcard Enclave/*.cpp)
-Enclave_Include_Paths := -IEnclave -I$(SGX_SDK)/include -I$(SGX_SDK)/include/libcxx -I$(SGX_SDK)/include/tlibc
+Enclave_Cpp_Files := $(wildcard Enclave/*.cpp Enclave/OblivAlgs/*.cpp)
+Enclave_Include_Paths := -IEnclave -IEnclave/OblivAlgs -I$(SGX_SDK)/include -I$(SGX_SDK)/include/libcxx -I$(SGX_SDK)/include/tlibc
 
 Enclave_C_Flags := -nostdinc -fvisibility=hidden -fpie -fstack-protector -fno-builtin-printf $(Enclave_Include_Paths)
 Enclave_Cpp_Flags := $(Enclave_C_Flags) -nostdinc++
@@ -147,7 +147,7 @@ Enclave_Security_Link_Flags := -Wl,-z,relro,-z,now,-z,noexecstack
 Enclave_Link_Flags := $(Enclave_Security_Link_Flags) \
     -Wl,--no-undefined -nostdlib -nodefaultlibs -nostartfiles -L$(SGX_LIBRARY_PATH) \
 	-Wl,--whole-archive -l$(Trts_Library_Name) -Wl,--no-whole-archive \
-	-Wl,--start-group -lsgx_tstdc -lsgx_tcxx -l$(Crypto_Library_Name) -l$(Service_Library_Name) -Wl,--end-group \
+	-Wl,--start-group -lsgx_tstdc -lsgx_tcxx -l$(Crypto_Library_Name) -l$(Service_Library_Name) -lsgx_pthread -Wl,--end-group \
 	-Wl,-Bstatic -Wl,-Bsymbolic -Wl,--no-undefined \
 	-Wl,-pie,-eenclave_entry -Wl,--export-dynamic  \
 	-Wl,--defsym,__ImageBase=0 \
@@ -224,7 +224,7 @@ Untrusted/Enclave_u.h: $(SGX_EDGER8R) Enclave/Enclave.edl
 
 Untrusted/Enclave_u.c: Untrusted/Enclave_u.h
 
-Untrusted/Enclave_u.o: Untrusted/Enclave_u.c
+Untrusted/Enclave_u.o: Untrusted/Enclave_u.c Untrusted/Enclave_u.h
 	@echo "CC   <=  $<"
 	@$(CC) $(SGX_COMMON_CFLAGS) $(App_C_Flags) -c $< -o $@
 
@@ -248,7 +248,7 @@ Enclave/Enclave_t.h: $(SGX_EDGER8R) Enclave/Enclave.edl
 
 Enclave/Enclave_t.c: Enclave/Enclave_t.h
 
-Enclave/Enclave_t.o: Enclave/Enclave_t.c
+Enclave/Enclave_t.o: Enclave/Enclave_t.c Enclave/Enclave_t.h
 	@echo "CC   <=  $<"
 	@$(CC) $(SGX_COMMON_CFLAGS) $(Enclave_C_Flags) -c $< -o $@
 
@@ -293,8 +293,54 @@ App/teems.o: App/net.hpp App/start.hpp
 Untrusted/Untrusted.o: Untrusted/Untrusted.hpp Enclave/enclave_api.h
 Untrusted/Untrusted.o: Untrusted/Enclave_u.h
 
-Enclave/comms.o: Enclave/Enclave_t.h Enclave/enclave_api.h Enclave/utils.hpp
-Enclave/comms.o: Enclave/config.hpp Enclave/enclave_api.h
+Enclave/comms.o: Enclave/Enclave_t.h Enclave/enclave_api.h Enclave/config.hpp
+Enclave/comms.o: Enclave/enclave_api.h
 Enclave/config.o: Enclave/Enclave_t.h Enclave/enclave_api.h Enclave/comms.hpp
 Enclave/config.o: Enclave/enclave_api.h Enclave/config.hpp
-Enclave/utils.o: Enclave/utils.hpp Enclave/Enclave_t.h Enclave/enclave_api.h
+Enclave/OblivAlgs/RecursiveShuffle.o: Enclave/OblivAlgs/oasm_lib.h
+Enclave/OblivAlgs/RecursiveShuffle.o: Enclave/OblivAlgs/CONFIG.h
+Enclave/OblivAlgs/RecursiveShuffle.o: Enclave/OblivAlgs/oasm_lib.tcc
+Enclave/OblivAlgs/RecursiveShuffle.o: Enclave/OblivAlgs/foav.h
+Enclave/OblivAlgs/RecursiveShuffle.o: Enclave/OblivAlgs/utils.hpp
+Enclave/OblivAlgs/RecursiveShuffle.o: Enclave/Enclave_t.h
+Enclave/OblivAlgs/RecursiveShuffle.o: Enclave/enclave_api.h
+Enclave/OblivAlgs/RecursiveShuffle.o: Enclave/OblivAlgs/RecursiveShuffle.hpp
+Enclave/OblivAlgs/RecursiveShuffle.o: Enclave/OblivAlgs/TightCompaction_v2.hpp
+Enclave/OblivAlgs/RecursiveShuffle.o: Enclave/OblivAlgs/TightCompaction_v2.tcc
+Enclave/OblivAlgs/RecursiveShuffle.o: Enclave/OblivAlgs/RecursiveShuffle.tcc
+Enclave/OblivAlgs/SortingNetwork.o: Enclave/OblivAlgs/SortingNetwork.hpp
+Enclave/OblivAlgs/SortingNetwork.o: Enclave/OblivAlgs/oasm_lib.h
+Enclave/OblivAlgs/SortingNetwork.o: Enclave/OblivAlgs/CONFIG.h
+Enclave/OblivAlgs/SortingNetwork.o: Enclave/OblivAlgs/oasm_lib.tcc
+Enclave/OblivAlgs/SortingNetwork.o: Enclave/OblivAlgs/foav.h
+Enclave/OblivAlgs/SortingNetwork.o: Enclave/OblivAlgs/utils.hpp
+Enclave/OblivAlgs/SortingNetwork.o: Enclave/Enclave_t.h Enclave/enclave_api.h
+Enclave/OblivAlgs/SortingNetwork.o: Enclave/OblivAlgs/SortingNetwork.tcc
+Enclave/OblivAlgs/TightCompaction_v2.o: Enclave/OblivAlgs/TightCompaction_v2.hpp
+Enclave/OblivAlgs/TightCompaction_v2.o: Enclave/OblivAlgs/oasm_lib.h
+Enclave/OblivAlgs/TightCompaction_v2.o: Enclave/OblivAlgs/CONFIG.h
+Enclave/OblivAlgs/TightCompaction_v2.o: Enclave/OblivAlgs/oasm_lib.tcc
+Enclave/OblivAlgs/TightCompaction_v2.o: Enclave/OblivAlgs/foav.h
+Enclave/OblivAlgs/TightCompaction_v2.o: Enclave/OblivAlgs/utils.hpp
+Enclave/OblivAlgs/TightCompaction_v2.o: Enclave/Enclave_t.h
+Enclave/OblivAlgs/TightCompaction_v2.o: Enclave/enclave_api.h
+Enclave/OblivAlgs/TightCompaction_v2.o: Enclave/OblivAlgs/TightCompaction_v2.tcc
+Enclave/OblivAlgs/utils.o: Enclave/OblivAlgs/utils.hpp Enclave/Enclave_t.h
+Enclave/OblivAlgs/utils.o: Enclave/enclave_api.h Enclave/OblivAlgs/CONFIG.h
+Enclave/OblivAlgs/utils.o: Enclave/OblivAlgs/oasm_lib.h
+Enclave/OblivAlgs/utils.o: Enclave/OblivAlgs/oasm_lib.tcc
+Enclave/OblivAlgs/utils.o: Enclave/OblivAlgs/foav.h
+Enclave/OblivAlgs/WaksmanNetwork.o: Enclave/OblivAlgs/SortingNetwork.hpp
+Enclave/OblivAlgs/WaksmanNetwork.o: Enclave/OblivAlgs/oasm_lib.h
+Enclave/OblivAlgs/WaksmanNetwork.o: Enclave/OblivAlgs/CONFIG.h
+Enclave/OblivAlgs/WaksmanNetwork.o: Enclave/OblivAlgs/oasm_lib.tcc
+Enclave/OblivAlgs/WaksmanNetwork.o: Enclave/OblivAlgs/foav.h
+Enclave/OblivAlgs/WaksmanNetwork.o: Enclave/OblivAlgs/utils.hpp
+Enclave/OblivAlgs/WaksmanNetwork.o: Enclave/Enclave_t.h Enclave/enclave_api.h
+Enclave/OblivAlgs/WaksmanNetwork.o: Enclave/OblivAlgs/SortingNetwork.tcc
+Enclave/OblivAlgs/WaksmanNetwork.o: Enclave/OblivAlgs/WaksmanNetwork.hpp
+Enclave/OblivAlgs/WaksmanNetwork.o: Enclave/OblivAlgs/RecursiveShuffle.hpp
+Enclave/OblivAlgs/WaksmanNetwork.o: Enclave/OblivAlgs/TightCompaction_v2.hpp
+Enclave/OblivAlgs/WaksmanNetwork.o: Enclave/OblivAlgs/TightCompaction_v2.tcc
+Enclave/OblivAlgs/WaksmanNetwork.o: Enclave/OblivAlgs/RecursiveShuffle.tcc
+Enclave/OblivAlgs/WaksmanNetwork.o: Enclave/OblivAlgs/aes.hpp