2 年之前 · f7bdf834c3
--- a/2p-preprocessing/preprocessing.cpp
+++ b/2p-preprocessing/preprocessing.cpp
@@ -56,7 +56,9 @@ size_t bits_per_leaf = std::is_same<leaf_t, bool>::value ? 1 : sizeof(leaf_t) *
 
				 bool is_packed = (sizeof(leaf_t) < sizeof(node_t));

			
 
				 size_t leaves_per_node = is_packed ? sizeof(node_t) * CHAR_BIT / bits_per_leaf : 1;

			
 
				 

			
 
				-size_t input_bits(const size_t nitems) { return std::ceil(std::log2(nitems)); }

			
 
				+size_t input_bits(const size_t nitems) {

			
 
				+    return std::ceil(std::log2(nitems));

			
 
				+}

			
 
				 

			
 
				 leaf_t val;

			
 
				 

			
@@ -65,513 +67,515 @@ using namespace dpf;
 
				 #include "mpc.h"

			
 
				 void generate_random_targets(uint8_t **target_share_read, size_t n_threads, bool party, size_t expo)

			
 
				 {

			
 
				-	for (size_t i = 0; i < n_threads; i++)

			
 
				-	{

			
 
				-		target_share_read[i] = new uint8_t[64];

			
 
				-	}

			
 
				-

			
 
				-	for (size_t j = 0; j < 64; ++j)

			
 
				-	{

			
 
				-		for (size_t i = 0; i < n_threads; ++i)

			
 
				-		{

			
 
				-			uint8_t random_value;

			
 
				-			arc4random_buf(&random_value, sizeof(uint8_t));

			
 
				-			target_share_read[i][j] = random_value; // rand();

			
 
				-		}

			
 
				-	}

			
 
				+    for (size_t i = 0; i < n_threads; i++)

			
 
				+    {

			
 
				+        target_share_read[i] = new uint8_t[64];

			
 
				+    }

			
 
				+

			
 
				+    for (size_t j = 0; j < 64; ++j)

			
 
				+    {

			
 
				+        for (size_t i = 0; i < n_threads; ++i)

			
 
				+        {

			
 
				+            uint8_t random_value;

			
 
				+            arc4random_buf(&random_value, sizeof(uint8_t));

			
 
				+            target_share_read[i][j] = random_value; // rand();

			
 
				+        }

			
 
				+    }

			
 
				 }

			
 
				 

			
 
				 void compute_CW(bool party, tcp::socket &sout, __m128i L, __m128i R, uint8_t bit, __m128i &CW)

			
 
				 {

			
 
				 

			
 
				-	// struct cw_construction

			
 
				-	//{

			
 
				-	__m128i rand_b, gamma_b;

			
 
				-	uint8_t bit_b;

			
 
				-	//};

			
 
				-

			
 
				-	__m128i *X, *Y;

			
 
				-

			
 
				-	if (party)

			
 
				-	{

			
 
				-		std::string qfile = std::string("./gamma1");

			
 
				-		int qfd = open(qfile.c_str(), O_RDWR);

			
 
				-		X = (__m128i *)mmap(NULL, 8 * sizeof(__m128i),

			
 
				-							PROT_READ, MAP_PRIVATE, qfd, 0);

			
 
				-

			
 
				-		qfile = std::string("./x1");

			
 
				-		qfd = open(qfile.c_str(), O_RDWR);

			
 
				-		Y = (__m128i *)mmap(NULL, 8 * sizeof(__m128i),

			
 
				-							PROT_READ, MAP_PRIVATE, qfd, 0);

			
 
				-	}

			
 
				-

			
 
				-	if (!party)

			
 
				-	{

			
 
				-		std::string qfile = std::string("./gamma0");

			
 
				-		int qfd = open(qfile.c_str(), O_RDWR);

			
 
				-		X = (__m128i *)mmap(NULL, 8 * sizeof(__m128i),

			
 
				-							PROT_READ, MAP_PRIVATE, qfd, 0);

			
 
				-

			
 
				-		qfile = std::string("./x0");

			
 
				-		qfd = open(qfile.c_str(), O_RDWR);

			
 
				-		Y = (__m128i *)mmap(NULL, 8 * sizeof(__m128i),

			
 
				-							PROT_READ, MAP_PRIVATE, qfd, 0);

			
 
				-	}

			
 
				-

			
 
				-	// cw_construction computecw;

			
 
				-	//	read(sin, boost::asio::buffer(&computecw, sizeof(computecw)));

			
 
				-

			
 
				-	// computecw.rand_b;

			
 
				-	//__m128i gamma_b = computecw.gamma_b;

			
 
				-

			
 
				-	if (party)

			
 
				-	{

			
 
				-		rand_b = Y[0];	//_mm_set_epi32(0x6fef9434, 0x6768121e, 0x20942286, 0x1b59f7a7);

			
 
				-		gamma_b = X[0]; // _mm_set_epi32(0x6a499109 , 0x803067dd , 0xd1e2281b , 0xe71b6262);

			
 
				-		bit_b = 1;		// computecw.bit_b;

			
 
				-	}

			
 
				-	else

			
 
				-	{

			
 
				-		rand_b = Y[0];	// _mm_set_epi32(0xb29747df, 0xf7300f6d, 0x9476d971, 0xd5f75d98);

			
 
				-		gamma_b = X[0]; // _mm_set_epi32(0xb73142e2 , 0x10687aae , 0x06500d3ec , 0x29b5c85d);

			
 
				-		bit_b = 1;		// computecw.bit_b;

			
 
				-	}

			
 
				-

			
 
				-	uint8_t blinded_bit, blinded_bit_read;

			
 
				-	blinded_bit = bit ^ bit_b;

			
 
				-

			
 
				-	__m128i blinded_L = L ^ R ^ rand_b;

			
 
				-	__m128i blinded_L_read;

			
 
				-

			
 
				-	struct BlindsCW

			
 
				-	{

			
 
				-		__m128i blinded_message;

			
 
				-		uint8_t blinded_bit;

			
 
				-	};

			
 
				-

			
 
				-	BlindsCW blinds_sent, blinds_recv;

			
 
				-

			
 
				-	blinds_sent.blinded_bit = blinded_bit;

			
 
				-	blinds_sent.blinded_message = blinded_L;

			
 
				-

			
 
				-	boost::asio::write(sout, boost::asio::buffer(&blinds_sent, sizeof(blinds_sent)));

			
 
				-	boost::asio::read(sout, boost::asio::buffer(&blinds_recv, sizeof(blinds_recv)));

			
 
				- communication_cost += sizeof(blinds_recv);

			
 
				-	

			
 
				- blinded_bit_read = blinds_recv.blinded_bit;

			
 
				-	blinded_L_read = blinds_recv.blinded_message;

			
 
				-

			
 
				-	__m128i out_ = R ^ gamma_b; //_mm_setzero_si128;

			
 
				-

			
 
				-	if (bit)

			
 
				-	{

			
 
				-		out_ ^= (L ^ R ^ blinded_L_read);

			
 
				-	}

			
 
				-	if (blinded_bit_read)

			
 
				-	{

			
 
				-		out_ ^= rand_b;

			
 
				-	}

			
 
				-

			
 
				-	__m128i out_reconstruction;

			
 
				-	boost::asio::write(sout, boost::asio::buffer(&out_, sizeof(out_)));

			
 
				-	boost::asio::read(sout, boost::asio::buffer(&out_reconstruction, sizeof(out_reconstruction)));

			
 
				- communication_cost += sizeof(out_reconstruction);

			
 
				-	

			
 
				- out_reconstruction = out_ ^ out_reconstruction;

			
 
				-

			
 
				-	CW = out_reconstruction;

			
 
				-

			
 
				- #ifdef DEBUG

			
 
				-  uint8_t bit_reconstruction;

			
 
				-  boost::asio::write(sout, boost::asio::buffer(&bit, sizeof(bit)));

			
 
				-  boost::asio::read(sout, boost::asio::buffer(&bit_reconstruction, sizeof(bit_reconstruction)));

			
 
				-  bit_reconstruction = bit ^ bit_reconstruction;

			
 
				-

			
 
				-  __m128i L_reconstruction;

			
 
				-  boost::asio::write(sout, boost::asio::buffer(&L, sizeof(L)));

			
 
				-  boost::asio::read(sout, boost::asio::buffer(&L_reconstruction, sizeof(L_reconstruction)));

			
 
				-  L_reconstruction = L ^ L_reconstruction;

			
 
				-

			
 
				-  __m128i R_reconstruction;

			
 
				-  boost::asio::write(sout, boost::asio::buffer(&R, sizeof(R)));

			
 
				-  boost::asio::read(sout, boost::asio::buffer(&R_reconstruction, sizeof(R_reconstruction)));

			
 
				-  R_reconstruction = R ^ R_reconstruction;

			
 
				-

			
 
				-  __m128i CW_debug;

			
 
				-

			
 
				-  if (bit_reconstruction != 0)

			
 
				-  {

			
 
				-   CW_debug = L_reconstruction;

			
 
				-  }

			
 
				-  else

			
 
				-  {

			
 
				-   CW_debug = R_reconstruction;

			
 
				-  }

			
 
				-

			
 
				-  assert(CW_debug[0] == CW[0]);

			
 
				-  assert(CW_debug[1] == CW[1]);

			
 
				- #endif

			
 
				+    // struct cw_construction

			
 
				+    //{

			
 
				+    __m128i rand_b, gamma_b;

			
 
				+    uint8_t bit_b;

			
 
				+    //};

			
 
				+

			
 
				+    __m128i *X, *Y;

			
 
				+

			
 
				+    if (party)

			
 
				+    {

			
 
				+      std::string qfile = std::string("./gamma1");

			
 
				+      int qfd = open(qfile.c_str(), O_RDWR);

			
 
				+      X = (__m128i *)mmap(NULL, 8 * sizeof(__m128i),

			
 
				+                          PROT_READ, MAP_PRIVATE, qfd, 0);

			
 
				+

			
 
				+      qfile = std::string("./x1");

			
 
				+      qfd = open(qfile.c_str(), O_RDWR);

			
 
				+      Y = (__m128i *)mmap(NULL, 8 * sizeof(__m128i),

			
 
				+                          PROT_READ, MAP_PRIVATE, qfd, 0);

			
 
				+      close(qfd);

			
 
				+      munmap(X, 8 * sizeof(__m128i));

			
 
				+      munmap(Y, 8 * sizeof(__m128i));

			
 
				+    }

			
 
				+

			
 
				+    if (!party)

			
 
				+    {

			
 
				+      std::string qfile = std::string("./gamma0");

			
 
				+      int qfd = open(qfile.c_str(), O_RDWR);

			
 
				+      X = (__m128i *)mmap(NULL, 8 * sizeof(__m128i),

			
 
				+                          PROT_READ, MAP_PRIVATE, qfd, 0);

			
 
				+

			
 
				+      qfile = std::string("./x0");

			
 
				+      qfd = open(qfile.c_str(), O_RDWR);

			
 
				+      Y = (__m128i *)mmap(NULL, 8 * sizeof(__m128i),

			
 
				+                          PROT_READ, MAP_PRIVATE, qfd, 0);

			
 
				+      close(qfd);

			
 
				+      munmap(X, 8 * sizeof(__m128i));

			
 
				+      munmap(Y, 8 * sizeof(__m128i));

			
 
				+    }

			
 
				+

			
 
				+    // cw_construction computecw;

			
 
				+    //	read(sin, boost::asio::buffer(&computecw, sizeof(computecw)));

			
 
				+

			
 
				+    // computecw.rand_b;

			
 
				+    //__m128i gamma_b = computecw.gamma_b;

			
 
				+

			
 
				+    if (party)

			
 
				+    {

			
 
				+      rand_b = Y[0];	//_mm_set_epi32(0x6fef9434, 0x6768121e, 0x20942286, 0x1b59f7a7);

			
 
				+      gamma_b = X[0]; // _mm_set_epi32(0x6a499109 , 0x803067dd , 0xd1e2281b , 0xe71b6262);

			
 
				+      bit_b = 1;		// computecw.bit_b;

			
 
				+    }

			
 
				+    else

			
 
				+    {

			
 
				+      rand_b = Y[0];	// _mm_set_epi32(0xb29747df, 0xf7300f6d, 0x9476d971, 0xd5f75d98);

			
 
				+      gamma_b = X[0]; // _mm_set_epi32(0xb73142e2 , 0x10687aae , 0x06500d3ec , 0x29b5c85d);

			
 
				+      bit_b = 1;		// computecw.bit_b;

			
 
				+    }

			
 
				+

			
 
				+    uint8_t blinded_bit, blinded_bit_read;

			
 
				+    blinded_bit = bit ^ bit_b;

			
 
				+

			
 
				+    __m128i blinded_L = L ^ R ^ rand_b;

			
 
				+    __m128i blinded_L_read;

			
 
				+

			
 
				+    struct BlindsCW

			
 
				+    {

			
 
				+        __m128i blinded_message;

			
 
				+        uint8_t blinded_bit;

			
 
				+    };

			
 
				+

			
 
				+    BlindsCW blinds_sent, blinds_recv;

			
 
				+

			
 
				+    blinds_sent.blinded_bit = blinded_bit;

			
 
				+    blinds_sent.blinded_message = blinded_L;

			
 
				+

			
 
				+    boost::asio::write(sout, boost::asio::buffer(&blinds_sent, sizeof(blinds_sent)));

			
 
				+    boost::asio::read(sout, boost::asio::buffer(&blinds_recv, sizeof(blinds_recv)));

			
 
				+    communication_cost += sizeof(blinds_recv);

			
 
				+

			
 
				+    blinded_bit_read = blinds_recv.blinded_bit;

			
 
				+    blinded_L_read = blinds_recv.blinded_message;

			
 
				+

			
 
				+    __m128i out_ = R ^ gamma_b; //_mm_setzero_si128;

			
 
				+

			
 
				+    if (bit)

			
 
				+    {

			
 
				+        out_ ^= (L ^ R ^ blinded_L_read);

			
 
				+    }

			
 
				+    if (blinded_bit_read)

			
 
				+    {

			
 
				+        out_ ^= rand_b;

			
 
				+    }

			
 
				+

			
 
				+    __m128i out_reconstruction;

			
 
				+    boost::asio::write(sout, boost::asio::buffer(&out_, sizeof(out_)));

			
 
				+    boost::asio::read(sout, boost::asio::buffer(&out_reconstruction, sizeof(out_reconstruction)));

			
 
				+    communication_cost += sizeof(out_reconstruction);

			
 
				+

			
 
				+    out_reconstruction = out_ ^ out_reconstruction;

			
 
				+

			
 
				+    CW = out_reconstruction;

			
 
				+

			
 
				+		#ifdef DEBUG

			
 
				+		    uint8_t bit_reconstruction;

			
 
				+		    boost::asio::write(sout, boost::asio::buffer(&bit, sizeof(bit)));

			
 
				+		    boost::asio::read(sout, boost::asio::buffer(&bit_reconstruction, sizeof(bit_reconstruction)));

			
 
				+		    bit_reconstruction = bit ^ bit_reconstruction;

			
 
				+

			
 
				+		    __m128i L_reconstruction;

			
 
				+		    boost::asio::write(sout, boost::asio::buffer(&L, sizeof(L)));

			
 
				+		    boost::asio::read(sout, boost::asio::buffer(&L_reconstruction, sizeof(L_reconstruction)));

			
 
				+		    L_reconstruction = L ^ L_reconstruction;

			
 
				+

			
 
				+		    __m128i R_reconstruction;

			
 
				+		    boost::asio::write(sout, boost::asio::buffer(&R, sizeof(R)));

			
 
				+		    boost::asio::read(sout, boost::asio::buffer(&R_reconstruction, sizeof(R_reconstruction)));

			
 
				+		    R_reconstruction = R ^ R_reconstruction;

			
 
				+

			
 
				+		    __m128i CW_debug;

			
 
				+

			
 
				+		    if (bit_reconstruction != 0)

			
 
				+		    {

			
 
				+		        CW_debug = L_reconstruction;

			
 
				+		    }

			
 
				+		    else

			
 
				+		    {

			
 
				+		        CW_debug = R_reconstruction;

			
 
				+		    }

			
 
				+

			
 
				+		    assert(CW_debug[0] == CW[0]);

			
 
				+		    assert(CW_debug[1] == CW[1]);

			
 
				+		#endif

			
 
				 }

			
 
				 

			
 
				 __m128i bit_mask_avx2_msb(unsigned int n)

			
 
				 {

			
 
				-	__m128i ones = _mm_set1_epi32(-1);

			
 
				-	__m128i cnst32_128 = _mm_set_epi32(32, 64, 96, 128);

			
 
				+    __m128i ones = _mm_set1_epi32(-1);

			
 
				+    __m128i cnst32_128 = _mm_set_epi32(32, 64, 96, 128);

			
 
				 

			
 
				-	__m128i shift = _mm_set1_epi32(n);

			
 
				-	shift = _mm_subs_epu16(cnst32_128, shift);

			
 
				-	return _mm_sllv_epi32(ones, shift);

			
 
				+    __m128i shift = _mm_set1_epi32(n);

			
 
				+    shift = _mm_subs_epu16(cnst32_128, shift);

			
 
				+    return _mm_sllv_epi32(ones, shift);

			
 
				 }

			
 
				 

			
 
				 __m128i bit_mask_avx2_lsb(unsigned int n)

			
 
				 {

			
 
				-	__m128i ones = _mm_set1_epi32(-1);

			
 
				-	__m128i cnst32_128 = _mm_set_epi32(128, 96, 64, 32);

			
 
				+    __m128i ones = _mm_set1_epi32(-1);

			
 
				+    __m128i cnst32_128 = _mm_set_epi32(128, 96, 64, 32);

			
 
				 

			
 
				-	__m128i shift = _mm_set1_epi32(n);

			
 
				-	shift = _mm_subs_epu16(cnst32_128, shift);

			
 
				-	return _mm_srlv_epi32(ones, shift);

			
 
				+    __m128i shift = _mm_set1_epi32(n);

			
 
				+    shift = _mm_subs_epu16(cnst32_128, shift);

			
 
				+    return _mm_srlv_epi32(ones, shift);

			
 
				 }

			
 
				 

			
 
				 template <typename node_t, typename prgkey_t>

			
 
				 static inline void traverse(const prgkey_t &prgkey, const node_t &seed, node_t s[2])

			
 
				 {

			
 
				-	dpf::PRG(prgkey, clear_lsb(seed, 0b11), s, 2);

			
 
				+    dpf::PRG(prgkey, clear_lsb(seed, 0b11), s, 2);

			
 
				 } // dpf::expand

			
 
				 

			
 
				 inline void evalfull_mpc(const size_t &nodes_per_leaf, const size_t &depth, const size_t &nbits, const size_t &nodes_in_interval,

			
 
				-						 const AES_KEY &prgkey, uint8_t target_share[64], std::vector<socket_t> &socketsPb,

			
 
				-						 const size_t from, const size_t to, __m128i *output, int8_t *_t, __m128i &final_correction_word, bool party, size_t socket_no = 0)

			
 
				+                         const AES_KEY &prgkey, uint8_t target_share[64], std::vector<socket_t> &socketsPb,

			
 
				+                         const size_t from, const size_t to, __m128i *output, int8_t *_t, __m128i &final_correction_word, bool party, size_t socket_no = 0)

			
 
				 {

			
 
				 

			
 
				-	__m128i root;

			
 
				-

			
 
				-	arc4random_buf(&root, sizeof(root));

			
 
				+    __m128i root;

			
 
				 

			
 
				-	root = set_lsb(root, party);

			
 
				+    arc4random_buf(&root, sizeof(root));

			
 
				 

			
 
				-	const size_t from_node = std::floor(static_cast<double>(from) / nodes_per_leaf);

			
 
				+    root = set_lsb(root, party);

			
 
				 

			
 
				-	__m128i *s[2] = {

			
 
				-		reinterpret_cast<__m128i *>(output) + nodes_in_interval * (nodes_per_leaf - 1),

			
 
				-		s[0] + nodes_in_interval / 2};

			
 
				+    const size_t from_node = std::floor(static_cast<double>(from) / nodes_per_leaf);

			
 
				 

			
 
				-	int8_t *t[2] = {_t, _t + nodes_in_interval / 2};

			
 
				+    __m128i *s[2] = {

			
 
				+        reinterpret_cast<__m128i *>(output) + nodes_in_interval * (nodes_per_leaf - 1),

			
 
				+        s[0] + nodes_in_interval / 2

			
 
				+    };

			
 
				 

			
 
				-	int curlayer = depth % 2;

			
 
				+    int8_t *t[2] = {_t, _t + nodes_in_interval / 2};

			
 
				 

			
 
				-	s[curlayer][0] = root;

			
 
				-	t[curlayer][0] = get_lsb(root, 0b01);

			
 
				+    int curlayer = depth % 2;

			
 
				 

			
 
				-	__m128i *CW = (__m128i *)std::aligned_alloc(sizeof(__m256i), depth * sizeof(__m128i));

			
 
				+    s[curlayer][0] = root;

			
 
				+    t[curlayer][0] = get_lsb(root, 0b01);

			
 
				 

			
 
				-	for (size_t layer = 0; layer < depth; ++layer)

			
 
				-	{

			
 
				-   #ifdef VERBOSE

			
 
				-		  printf("layer = %zu\n", layer);

			
 
				-   #endif

			
 
				-		 curlayer = 1 - curlayer;

			
 
				+    __m128i *CW = (__m128i *)std::aligned_alloc(sizeof(__m256i), depth * sizeof(__m128i));

			
 
				 

			
 
				-		 size_t i = 0, j = 0;

			
 
				-		 auto nextbit = (from_node >> (nbits - layer - 1)) & 1;

			
 
				-		 size_t nodes_in_prev_layer = std::ceil(static_cast<double>(nodes_in_interval) / (1ULL << (depth - layer)));

			
 
				-		 size_t nodes_in_cur_layer = std::ceil(static_cast<double>(nodes_in_interval) / (1ULL << (depth - layer - 1)));

			
 
				+    for (size_t layer = 0; layer < depth; ++layer)

			
 
				+    {

			
 
				+#ifdef VERBOSE

			
 
				+        printf("layer = %zu\n", layer);

			
 
				+#endif

			
 
				+        curlayer = 1 - curlayer;

			
 
				+

			
 
				+        size_t i = 0, j = 0;

			
 
				+        auto nextbit = (from_node >> (nbits - layer - 1)) & 1;

			
 
				+        size_t nodes_in_prev_layer = std::ceil(static_cast<double>(nodes_in_interval) / (1ULL << (depth - layer)));

			
 
				+        size_t nodes_in_cur_layer = std::ceil(static_cast<double>(nodes_in_interval) / (1ULL << (depth - layer - 1)));

			
 
				+

			
 
				+        __m128i L = _mm_setzero_si128();

			
 
				+        __m128i R = _mm_setzero_si128();

			
 
				+

			
 
				+        for (i = nextbit, j = nextbit; j < nodes_in_prev_layer - 1; ++j, i += 2)

			
 
				+        {

			
 
				+            traverse(prgkey, s[1 - curlayer][j], &s[curlayer][i]);

			
 
				+            L ^= s[curlayer][i];

			
 
				+            R ^= s[curlayer][i + 1];

			
 
				+        }

			
 
				+

			
 
				+        if (nodes_in_prev_layer > j)

			
 
				+        {

			
 
				+            if (i < nodes_in_cur_layer - 1)

			
 
				+            {

			
 
				+                traverse(prgkey, s[1 - curlayer][j], &s[curlayer][i]);

			
 
				+                L ^= s[curlayer][i];

			
 
				+                R ^= s[curlayer][i + 1];

			
 
				+            }

			
 
				+        }

			
 
				+

			
 
				+        compute_CW(party, socketsPb[socket_no], L, R, target_share[layer], CW[layer]);

			
 
				+

			
 
				+        uint8_t advice_L = get_lsb(L) ^ target_share[layer];

			
 
				+        uint8_t advice_R = get_lsb(R) ^ target_share[layer];

			
 
				+

			
 
				+        uint8_t cwt_L, cwt_R;

			
 
				+        uint8_t advice[2];

			
 
				+        uint8_t cwts[2];

			
 
				+        advice[0] = advice_L;

			
 
				+        advice[1] = advice_R;

			
 
				+

			
 
				+        boost::asio::write(socketsPb[socket_no + 1], boost::asio::buffer(&advice, sizeof(advice)));

			
 
				+        boost::asio::read(socketsPb[socket_no + 1], boost::asio::buffer(&cwts, sizeof(cwts)));

			
 
				+

			
 
				+        cwt_L = cwts[0];

			
 
				+        cwt_R = cwts[1];

			
 
				+

			
 
				+        cwt_L = cwt_L ^ advice_L ^ 1;

			
 
				+        cwt_R = cwt_R ^ advice_R;

			
 
				+

			
 
				+        for (size_t j = 0; j < nodes_in_prev_layer; ++j)

			
 
				+        {

			
 
				+            t[curlayer][2 * j] = get_lsb(s[curlayer][2 * j]) ^ (cwt_L & t[1 - curlayer][j]);

			
 
				+            s[curlayer][2 * j] = clear_lsb(xor_if(s[curlayer][2 * j], CW[layer], !t[1 - curlayer][j]), 0b11);

			
 
				+            t[curlayer][(2 * j) + 1] = get_lsb(s[curlayer][(2 * j) + 1]) ^ (cwt_R & t[1 - curlayer][j]);

			
 
				+            s[curlayer][(2 * j) + 1] = clear_lsb(xor_if(s[curlayer][(2 * j) + 1], CW[layer], !t[1 - curlayer][j]), 0b11);

			
 
				+        }

			
 
				+    }

			
 
				 

			
 
				-		 __m128i L = _mm_setzero_si128();

			
 
				-		 __m128i R = _mm_setzero_si128();

			
 
				+    free(CW);

			
 
				+    __m128i Gamma = _mm_setzero_si128();

			
 
				 

			
 
				-   for (i = nextbit, j = nextbit; j < nodes_in_prev_layer - 1; ++j, i += 2)

			
 
				-   {

			
 
				-    traverse(prgkey, s[1 - curlayer][j], &s[curlayer][i]);

			
 
				-    L ^= s[curlayer][i];

			
 
				-    R ^= s[curlayer][i + 1];

			
 
				-   }

			
 
				+    for (size_t i = 0; i < to + 1; ++i)

			
 
				+    {

			
 
				+        Gamma[0] += output[i][0];

			
 
				+        Gamma[1] += output[i][1];

			
 
				+    }

			
 
				 

			
 
				-   if (nodes_in_prev_layer > j)

			
 
				-   {

			
 
				-    if (i < nodes_in_cur_layer - 1)

			
 
				+    if (party)

			
 
				     {

			
 
				-     traverse(prgkey, s[1 - curlayer][j], &s[curlayer][i]);

			
 
				-     L ^= s[curlayer][i];

			
 
				-     R ^= s[curlayer][i + 1];

			
 
				+        Gamma[0] = -Gamma[0];

			
 
				+        Gamma[1] = -Gamma[1];

			
 
				     }

			
 
				-   }

			
 
				 

			
 
				-		 compute_CW(party, socketsPb[socket_no], L, R, target_share[layer], CW[layer]);

			
 
				+    boost::asio::write(socketsPb[socket_no + 3], boost::asio::buffer(&Gamma, sizeof(Gamma)));

			
 
				+    boost::asio::read(socketsPb[socket_no + 3], boost::asio::buffer(&final_correction_word, sizeof(final_correction_word)));

			
 
				+    communication_cost += sizeof(Gamma);

			
 
				+    final_correction_word = Gamma; // final_correction_word + Gamma;

			
 
				 

			
 
				-		 uint8_t advice_L = get_lsb(L) ^ target_share[layer];

			
 
				-		 uint8_t advice_R = get_lsb(R) ^ target_share[layer];

			
 
				+} // dpf::__evalinterval

			
 
				 

			
 
				-		 uint8_t cwt_L, cwt_R;

			
 
				-		 uint8_t advice[2];

			
 
				-		 uint8_t cwts[2];

			
 
				-		 advice[0] = advice_L;

			
 
				-		 advice[1] = advice_R;

			
 
				+void convert_shares(__m128i **output, int8_t **flags, size_t n_threads, size_t db_nitems, __m128i *final_correction_word, tcp::socket &sb, bool party)

			
 
				+{

			
 
				 

			
 
				-		 boost::asio::write(socketsPb[socket_no + 1], boost::asio::buffer(&advice, sizeof(advice)));

			
 
				-		 boost::asio::read(socketsPb[socket_no + 1], boost::asio::buffer(&cwts, sizeof(cwts)));

			
 
				+    for (size_t j = 0; j < db_nitems; ++j)

			
 
				+    {

			
 
				+        for (size_t k = 0; k < n_threads; ++k)

			
 
				+        {

			
 
				+            if (party)

			
 
				+            {

			
 
				+                output[k][j] = -output[k][j];

			
 
				+                flags[k][j] = -flags[k][j];

			
 
				+            }

			
 
				+        }

			
 
				+

			
 
				+#ifdef DEBUG

			
 
				+        int8_t out = flags[0][j];

			
 
				+        int8_t out_rec;

			
 
				+

			
 
				+        boost::asio::write(sb, boost::asio::buffer(&out, sizeof(out)));

			
 
				+        boost::asio::read(sb, boost::asio::buffer(&out_rec, sizeof(out_rec)));

			
 
				+        out_rec = out_rec + out;

			
 
				+

			
 
				+

			
 
				+        if (out_rec != 0)

			
 
				+            std::cout << j << "(flags) --> " << (int)out_rec << std::endl

			
 
				+                      << std::endl;

			
 
				+

			
 
				+        __m128i out2 = output[0][j];

			
 
				+        __m128i out_rec2;

			
 
				+

			
 
				+        boost::asio::write(sb, boost::asio::buffer(&out2, sizeof(out2)));

			
 
				+        boost::asio::read(sb, boost::asio::buffer(&out_rec2, sizeof(out_rec2)));

			
 
				+        out_rec2 = out_rec2 + out2;

			
 
				+        if (out_rec2[0] != 0)

			
 
				+            std::cout << j << "--> " << out_rec2[0] << std::endl;

			
 
				+#endif

			
 
				+    }

			
 
				 

			
 
				-		 cwt_L = cwts[0];

			
 
				-		 cwt_R = cwts[1];

			
 
				+    for (size_t i = 0; i < n_threads; ++i)

			
 
				+    {

			
 
				 

			
 
				-		 cwt_L = cwt_L ^ advice_L ^ 1;

			
 
				-		 cwt_R = cwt_R ^ advice_R;

			
 
				+        int64_t pm = 0;

			
 
				+        int64_t rb;

			
 
				+

			
 
				+        arc4random_buf(&rb, sizeof(rb));

			
 
				+        for (size_t j = 0; j < db_nitems; ++j)

			
 
				+        {

			
 
				+            if (party)

			
 
				+            {

			
 
				+                if (flags[i][j] != 0)

			
 
				+                    pm -= 1;

			
 
				+            }

			
 
				+            if (!party)

			
 
				+            {

			
 
				+                if (flags[i][j] != 0)

			
 
				+                    pm += 1; // flags[0][j];

			
 
				+            }

			
 
				+        }

			
 
				+    }

			
 
				+}

			
 
				 

			
 
				-		 for (size_t j = 0; j < nodes_in_prev_layer; ++j)

			
 
				-		 {

			
 
				-			 t[curlayer][2 * j] = get_lsb(s[curlayer][2 * j]) ^ (cwt_L & t[1 - curlayer][j]);

			
 
				-			 s[curlayer][2 * j] = clear_lsb(xor_if(s[curlayer][2 * j], CW[layer], !t[1 - curlayer][j]), 0b11);

			
 
				-			 t[curlayer][(2 * j) + 1] = get_lsb(s[curlayer][(2 * j) + 1]) ^ (cwt_R & t[1 - curlayer][j]);

			
 
				-			 s[curlayer][(2 * j) + 1] = clear_lsb(xor_if(s[curlayer][(2 * j) + 1], CW[layer], !t[1 - curlayer][j]), 0b11);

			
 
				-		 }

			
 
				-	 }

			
 
				+void accept_conncections_from_Pb(boost::asio::io_context &io_context, std::vector<socket_t> &socketsPb, int port, size_t j)

			
 
				+{

			
 
				+    tcp::acceptor acceptor_a(io_context, tcp::endpoint(tcp::v4(), port));

			
 
				+    tcp::socket sb_a(acceptor_a.accept());

			
 
				+    socketsPb[j] = std::move(sb_a);

			
 
				+}

			
 
				 

			
 
				-	__m128i Gamma = _mm_setzero_si128();

			
 
				+int main(int argc, char *argv[])

			
 
				+{

			
 
				 

			
 
				-	for (size_t i = 0; i < to + 1; ++i)

			
 
				-	{

			
 
				-		Gamma[0] += output[i][0];

			
 
				-		Gamma[1] += output[i][1];

			
 
				-	}

			
 
				+    boost::asio::io_context io_context;

			
 
				+    tcp::resolver resolver(io_context);

			
 
				+    const std::string host1 = argv[1];

			
 
				 

			
 
				-	if (party)

			
 
				-	{

			
 
				-		Gamma[0] = -Gamma[0];

			
 
				-		Gamma[1] = -Gamma[1];

			
 
				-	}

			
 
				 

			
 
				-	boost::asio::write(socketsPb[socket_no + 3], boost::asio::buffer(&Gamma, sizeof(Gamma)));

			
 
				-	boost::asio::read(socketsPb[socket_no + 3], boost::asio::buffer(&final_correction_word, sizeof(final_correction_word)));

			
 
				- communication_cost += sizeof(Gamma);

			
 
				-	final_correction_word = Gamma; // final_correction_word + Gamma;

			
 
				+    const size_t n_threads = atoi(argv[2]);

			
 
				+    const size_t number_of_sockets = 5 * n_threads;

			
 
				+    const size_t expo = atoi(argv[3]);

			
 
				 

			
 
				-} // dpf::__evalinterval

			
 
				+    const size_t maxRAM = atoi(argv[4]);

			
 
				 

			
 
				-void convert_shares(__m128i **output, int8_t **flags, size_t n_threads, size_t db_nitems, __m128i *final_correction_word, tcp::socket &sb, bool party)

			
 
				-{

			
 
				+    const size_t db_nitems = 1ULL << expo;

			
 
				 

			
 
				-	for (size_t j = 0; j < db_nitems; ++j)

			
 
				-	{

			
 
				-		for (size_t k = 0; k < n_threads; ++k)

			
 
				-		{

			
 
				-			if (party)

			
 
				-			{

			
 
				-				output[k][j] = -output[k][j];

			
 
				-				flags[k][j] = -flags[k][j];

			
 
				-			}

			
 
				-		}

			
 
				-

			
 
				-		 #ifdef DEBUG

			
 
				-		int8_t out = flags[0][j];

			
 
				-		int8_t out_rec;

			
 
				-

			
 
				-		boost::asio::write(sb, boost::asio::buffer(&out, sizeof(out)));

			
 
				-		boost::asio::read(sb, boost::asio::buffer(&out_rec, sizeof(out_rec)));

			
 
				-		out_rec = out_rec + out;

			
 
				-

			
 
				-		

			
 
				-		if (out_rec != 0)

			
 
				-			std::cout << j << "(flags) --> " << (int)out_rec << std::endl

			
 
				-					  << std::endl;

			
 
				-

			
 
				-		__m128i out2 = output[0][j];

			
 
				-		__m128i out_rec2;

			
 
				-

			
 
				-		boost::asio::write(sb, boost::asio::buffer(&out2, sizeof(out2)));

			
 
				-		boost::asio::read(sb, boost::asio::buffer(&out_rec2, sizeof(out_rec2)));

			
 
				-		out_rec2 = out_rec2 + out2;

			
 
				-		if (out_rec2[0] != 0)

			
 
				-			std::cout << j << "--> " << out_rec2[0] << std::endl;

			
 
				-		 #endif

			
 
				-	}

			
 
				-

			
 
				-	for (size_t i = 0; i < n_threads; ++i)

			
 
				-	{

			
 
				-

			
 
				-		int64_t pm = 0;

			
 
				-		int64_t rb;

			
 
				-

			
 
				-		arc4random_buf(&rb, sizeof(rb));

			
 
				-		for (size_t j = 0; j < db_nitems; ++j)

			
 
				-		{

			
 
				-			if (party)

			
 
				-			{

			
 
				-				if (flags[i][j] != 0)

			
 
				-					pm -= 1;

			
 
				-			}

			
 
				-			if (!party)

			
 
				-			{

			
 
				-				if (flags[i][j] != 0)

			
 
				-					pm += 1; // flags[0][j];

			
 
				-			}

			
 
				-		}

			
 
				-	}

			
 
				-}

			
 
				+    size_t RAM_needed_per_thread = 164 * db_nitems;

			
 
				+    std::cout << "RAM needed = " << n_threads*RAM_needed_per_thread << " bytes = " << n_threads*RAM_needed_per_thread/1073741824 << " GiB" << std::endl;

			
 
				+    std::cout << "RAM needed per thread = " << RAM_needed_per_thread << " bytes = " << (RAM_needed_per_thread>>30) << " GiB" << std::endl;

			
 
				+    size_t thread_per_batch = std::floor(double(maxRAM<<30)/RAM_needed_per_thread);

			
 
				+    if (thread_per_batch > n_threads) {

			
 
				+        thread_per_batch = n_threads;

			
 
				+    }

			
 
				+    std::cout << "thread_per_batch = " << thread_per_batch << std::endl;

			
 
				+    if (thread_per_batch < 1) {

			
 
				+        std::cout << "You need more RAM" << std::endl;

			
 
				+        exit(0);

			
 
				+    }

			
 
				+    size_t n_batches = std::ceil(double(n_threads)/thread_per_batch);

			
 
				+    std::cout << "n_batches = " << n_batches << std::endl;

			
 
				 

			
 
				-void accept_conncections_from_Pb(boost::asio::io_context &io_context, std::vector<socket_t> &socketsPb, int port, size_t j)

			
 
				-{

			
 
				-	tcp::acceptor acceptor_a(io_context, tcp::endpoint(tcp::v4(), port));

			
 
				-	tcp::socket sb_a(acceptor_a.accept());

			
 
				-	socketsPb[j] = std::move(sb_a);

			
 
				-}

			
 
				+    std::vector<socket_t> socketsPb;

			
 
				+    for (size_t j = 0; j < number_of_sockets + 1; ++j)

			
 
				+    {

			
 
				+        tcp::socket emptysocket(io_context);

			
 
				+        socketsPb.emplace_back(std::move(emptysocket));

			
 
				+    }

			
 
				+    socketsPb.reserve(number_of_sockets + 1);

			
 
				 

			
 
				-int main(int argc, char *argv[])

			
 
				-{

			
 
				 

			
 
				-	boost::asio::io_context io_context;

			
 
				-	tcp::resolver resolver(io_context);

			
 
				-	const std::string host1 = argv[1];

			
 
				- 

			
 
				-

			
 
				-	const size_t n_threads = atoi(argv[2]);

			
 
				-	const size_t number_of_sockets = 5 * n_threads;

			
 
				-	const size_t expo = atoi(argv[3]);

			
 
				-

			
 
				-	const size_t maxRAM = atoi(argv[4]);

			
 
				-

			
 
				-	const size_t db_nitems = 1ULL << expo;

			
 
				-

			
 
				-      size_t RAM_needed_per_thread = 164 * db_nitems;

			
 
				-      std::cout << "RAM needed = " << n_threads*RAM_needed_per_thread << " bytes = " << n_threads*RAM_needed_per_thread/1073741824 << " GiB" << std::endl;

			
 
				-      std::cout << "RAM needed per thread = " << RAM_needed_per_thread << " bytes = " << (RAM_needed_per_thread>>30) << " GiB" << std::endl;

			
 
				-      size_t thread_per_batch = std::floor(double(maxRAM<<30)/RAM_needed_per_thread);

			
 
				-      if (thread_per_batch > n_threads) {

			
 
				-	thread_per_batch = n_threads;

			
 
				-      }

			
 
				-      std::cout << "thread_per_batch = " << thread_per_batch << std::endl;

			
 
				-      if (thread_per_batch < 1) {

			
 
				-       std::cout << "You need more RAM" << std::endl;

			
 
				-       exit(0);

			
 
				-      }

			
 
				-      size_t n_batches = std::ceil(double(n_threads)/thread_per_batch);

			
 
				-      std::cout << "n_batches = " << n_batches << std::endl;

			
 
				-

			
 
				-	std::vector<socket_t> socketsPb;

			
 
				-	for (size_t j = 0; j < number_of_sockets + 1; ++j)

			
 
				-	{

			
 
				-		tcp::socket emptysocket(io_context);

			
 
				-		socketsPb.emplace_back(std::move(emptysocket));

			
 
				-	}

			
 
				-	socketsPb.reserve(number_of_sockets + 1);

			
 
				-

			
 
				-

			
 
				-	std::vector<int> ports;

			
 
				-	for (size_t j = 0; j < number_of_sockets; ++j)

			
 
				-	{

			
 
				-		int port = 6000;

			
 
				-		ports.push_back(port + j);

			
 
				-	}

			
 
				-

			
 
				-	std::vector<int> ports2_0;

			
 
				-	for (size_t j = 0; j < number_of_sockets; ++j)

			
 
				-	{

			
 
				-		int port = 20000;

			
 
				-		ports2_0.push_back(port + j);

			
 
				-	}

			
 
				-

			
 
				-	std::vector<int> ports2_1;

			
 
				-	for (size_t j = 0; j < number_of_sockets; ++j)

			
 
				-	{

			
 
				-		int port = 40000;

			
 
				-		ports2_1.push_back(port + j);

			
 
				-	}

			
 
				-

			
 
				-bool party;

			
 
				+    std::vector<int> ports;

			
 
				+    for (size_t j = 0; j < number_of_sockets; ++j)

			
 
				+    {

			
 
				+        int port = 6000;

			
 
				+        ports.push_back(port + j);

			
 
				+    }

			
 
				+

			
 
				+    std::vector<int> ports2_0;

			
 
				+    for (size_t j = 0; j < number_of_sockets; ++j)

			
 
				+    {

			
 
				+        int port = 20000;

			
 
				+        ports2_0.push_back(port + j);

			
 
				+    }

			
 
				+

			
 
				+    std::vector<int> ports2_1;

			
 
				+    for (size_t j = 0; j < number_of_sockets; ++j)

			
 
				+    {

			
 
				+        int port = 40000;

			
 
				+        ports2_1.push_back(port + j);

			
 
				+    }

			
 
				+

			
 
				+    bool party;

			
 
				 

			
 
				 #if (PARTY == 0)

			
 
				-	party = false; 

			
 
				-	for (size_t j = 0; j < number_of_sockets; ++j)

			
 
				-	{

			
 
				-		tcp::socket sb_a(io_context);

			
 
				-		boost::asio::connect(sb_a, resolver.resolve({host1, std::to_string(ports[j])}));

			
 
				-		socketsPb[j] = std::move(sb_a);

			
 
				-	}

			
 
				+    party = false;

			
 
				+    for (size_t j = 0; j < number_of_sockets; ++j)

			
 
				+    {

			
 
				+        tcp::socket sb_a(io_context);

			
 
				+        boost::asio::connect(sb_a, resolver.resolve({host1, std::to_string(ports[j])}));

			
 
				+        socketsPb[j] = std::move(sb_a);

			
 
				+    }

			
 
				 #else

			
 
				-	party = true;	

			
 
				-	boost::asio::thread_pool pool2(number_of_sockets);

			
 
				-	for (size_t j = 0; j < number_of_sockets; ++j)

			
 
				-	{

			
 
				-		boost::asio::post(pool2, std::bind(accept_conncections_from_Pb, std::ref(io_context), std::ref(socketsPb), ports[j], j));

			
 
				-	}

			
 
				-

			
 
				-	pool2.join();

			
 
				+    party = true;

			
 
				+    boost::asio::thread_pool pool2(number_of_sockets);

			
 
				+    for (size_t j = 0; j < number_of_sockets; ++j)

			
 
				+    {

			
 
				+        boost::asio::post(pool2, std::bind(accept_conncections_from_Pb, std::ref(io_context), std::ref(socketsPb), ports[j], j));

			
 
				+    }

			
 
				+

			
 
				+    pool2.join();

			
 
				 #endif

			
 
				 

			
 
				- 

			
 
				-	__m128i *final_correction_word = (__m128i *)std::aligned_alloc(sizeof(__m256i), thread_per_batch * sizeof(__m128i));

			
 
				 

			
 
				-	AES_KEY aeskey;

			
 
				+    __m128i *final_correction_word = (__m128i *)std::aligned_alloc(sizeof(__m256i), thread_per_batch * sizeof(__m128i));

			
 
				 

			
 
				-	__m128i **output = (__m128i **)malloc(sizeof(__m128i *) * thread_per_batch);

			
 
				-	int8_t **flags = (int8_t **)malloc(sizeof(uint8_t *) * thread_per_batch);

			
 
				+    AES_KEY aeskey;

			
 
				 

			
 
				-	for (size_t j = 0; j < thread_per_batch; ++j)

			
 
				-	{

			
 
				-		output[j] = (__m128i *)std::aligned_alloc(sizeof(node_t), db_nitems * sizeof(__m128i));

			
 
				-		flags[j] = (int8_t *)std::aligned_alloc(sizeof(node_t), db_nitems * sizeof(uint8_t));

			
 
				-	}

			
 
				+    __m128i **output = (__m128i **)malloc(sizeof(__m128i *) * thread_per_batch);

			
 
				+    int8_t **flags = (int8_t **)malloc(sizeof(uint8_t *) * thread_per_batch);

			
 
				 

			
 
				-	const size_t bits_per_leaf = std::is_same<leaf_t, bool>::value ? 1 : sizeof(leaf_t) * CHAR_BIT;

			
 
				-	const bool is_packed = (sizeof(leaf_t) < sizeof(node_t));

			
 
				-	const size_t nodes_per_leaf = is_packed ? 1 : std::ceil(static_cast<double>(bits_per_leaf) / (sizeof(node_t) * CHAR_BIT));

			
 
				-	const size_t depth = std::ceil(std::log2(db_nitems));

			
 
				-	const size_t nbits = std::ceil(std::log2(db_nitems));

			
 
				-	const size_t nodes_in_interval = db_nitems - 1;

			
 
				-	auto start = std::chrono::steady_clock::now();

			
 
				+    for (size_t j = 0; j < thread_per_batch; ++j)

			
 
				+    {

			
 
				+        output[j] = (__m128i *)std::aligned_alloc(sizeof(node_t), db_nitems * sizeof(__m128i));

			
 
				+        flags[j] = (int8_t *)std::aligned_alloc(sizeof(node_t), db_nitems * sizeof(uint8_t));

			
 
				+    }

			
 
				+

			
 
				+    const size_t bits_per_leaf = std::is_same<leaf_t, bool>::value ? 1 : sizeof(leaf_t) * CHAR_BIT;

			
 
				+    const bool is_packed = (sizeof(leaf_t) < sizeof(node_t));

			
 
				+    const size_t nodes_per_leaf = is_packed ? 1 : std::ceil(static_cast<double>(bits_per_leaf) / (sizeof(node_t) * CHAR_BIT));

			
 
				+    const size_t depth = std::ceil(std::log2(db_nitems));

			
 
				+    const size_t nbits = std::ceil(std::log2(db_nitems));

			
 
				+    const size_t nodes_in_interval = db_nitems - 1;

			
 
				+    auto start = std::chrono::steady_clock::now();

			
 
				 

			
 
				 

			
 
				 #ifdef VERBOSE

			
 
				-		printf("n_threads = %zu\n\n", n_threads);

			
 
				+    printf("n_threads = %zu\n\n", n_threads);

			
 
				 #endif

			
 
				- 

			
 
				 

			
 
				 

			
 
				 

			
 
				 

			
 
				- for(size_t iters = 0; iters < n_batches; ++iters)

			
 
				-{

			
 
				-   if (n_batches > 1) {

			
 
				-    printf("Starting evalfull_mpc batch %lu / %lu\n", iters+1, n_batches);

			
 
				-   }

			
 
				-   uint8_t **target_share_read = new uint8_t *[thread_per_batch];

			
 
				-   generate_random_targets(target_share_read, thread_per_batch, party, expo);

			
 
				-   boost::asio::thread_pool pool(thread_per_batch);

			
 
				-   for (size_t j = 0; j < thread_per_batch; ++j)

			
 
				-   {

			
 
				-    boost::asio::post(pool, std::bind(evalfull_mpc, std::ref(nodes_per_leaf), std::ref(depth), std::ref(nbits), std::ref(nodes_in_interval),

			
 
				-              std::ref(aeskey), target_share_read[j], std::ref(socketsPb), 0, db_nitems - 1, output[j],

			
 
				-              flags[j], std::ref(final_correction_word[j]), party, 5 * j));

			
 
				-   }

			
 
				-

			
 
				-   pool.join();

			
 
				-

			
 
				-

			
 
				-   convert_shares(output, flags, thread_per_batch, db_nitems, final_correction_word, socketsPb[0], party);

			
 
				-}

			
 
				- auto end = std::chrono::steady_clock::now();

			
 
				-	std::chrono::duration<double> elapsed_seconds = end - start;

			
 
				-	std::cout << "WallClockTime: " << elapsed_seconds.count() << " s" << std::endl;

			
 
				- std::cout << "CommunicationCost: " << communication_cost << " bytes" << std::endl;

			
 
				- 

			
 
				-

			
 
				-	if(!party)

			
 
				-	{

			
 
				-		char const *p0_filename0;

			
 
				-		p0_filename0 = "../duoram-online/preprocflags/party0_read_flags_b";

			
 
				-		int w0 = open(p0_filename0, O_WRONLY | O_CREAT, S_IWRITE | S_IREAD);

			
 
				-		int written = write(w0, flags[0], db_nitems * sizeof(flags[0][0]));

			
 
				-  if (written < 0) perror("Write error");

			
 
				-		close(w0);

			
 
				-	}

			
 
				-	else

			
 
				-	{

			
 
				-		char const *p0_filename0;

			
 
				-		p0_filename0 = "../duoram-online/preprocflags/party1_read_flags_b";

			
 
				-		int w0 = open(p0_filename0, O_WRONLY | O_CREAT, S_IWRITE | S_IREAD);

			
 
				-		int written = write(w0, flags[0], db_nitems * sizeof(flags[0][0]));

			
 
				-		if (written < 0) perror("Write error"); 

			
 
				-		close(w0);

			
 
				-	}

			
 
				-

			
 
				-	return 0;

			
 
				+

			
 
				+    for(size_t iters = 0; iters < n_batches; ++iters)

			
 
				+    {

			
 
				+        if (n_batches > 1) {

			
 
				+            printf("Starting evalfull_mpc batch %lu / %lu\n", iters+1, n_batches);

			
 
				+        }

			
 
				+        uint8_t **target_share_read = new uint8_t *[thread_per_batch];

			
 
				+        generate_random_targets(target_share_read, thread_per_batch, party, expo);

			
 
				+        boost::asio::thread_pool pool(thread_per_batch);

			
 
				+        for (size_t j = 0; j < thread_per_batch; ++j)

			
 
				+        {

			
 
				+            boost::asio::post(pool, std::bind(evalfull_mpc, std::ref(nodes_per_leaf), std::ref(depth), std::ref(nbits), std::ref(nodes_in_interval),

			
 
				+                                              std::ref(aeskey), target_share_read[j], std::ref(socketsPb), 0, db_nitems - 1, output[j],

			
 
				+                                              flags[j], std::ref(final_correction_word[j]), party, 5 * j));

			
 
				+        }

			
 
				+

			
 
				+        pool.join();

			
 
				+        for(size_t j = 0; j < thread_per_batch; ++j)

			
 
				+        {

			
 
				+            delete[] target_share_read[j];

			
 
				+        }

			
 
				+        delete[] target_share_read;

			
 
				+        convert_shares(output, flags, thread_per_batch, db_nitems, final_correction_word, socketsPb[0], party);

			
 
				+    }

			
 
				+

			
 
				+    for(size_t j = 0; j < thread_per_batch; ++j)

			
 
				+    {

			
 
				+

			
 
				+        free(output[j]);

			
 
				+        free(flags[j]);

			
 
				+    }

			
 
				+    free(output);

			
 
				+    free(flags);

			
 
				+    free(final_correction_word);

			
 
				+

			
 
				+    auto end = std::chrono::steady_clock::now();

			
 
				+    std::chrono::duration<double> elapsed_seconds = end - start;

			
 
				+    std::cout << "WallClockTime: " << elapsed_seconds.count() << " s" << std::endl;

			
 
				+    std::cout << "CommunicationCost: " << communication_cost << " bytes" << std::endl;

			
 
				+    

			
 
				+    return 0;

			
 
				 }

			
--- a/duoram-online/duoram.cpp
+++ b/duoram-online/duoram.cpp
@@ -315,6 +315,7 @@ int main(const int argc, const char * argv[])
 
				     #endif

			
 
				     delete[] WritePb_;

			
 
				     delete[] WritePb_recv;

			
 
				+    delete[] where_to_write;

			
 
				 

			
 
				     for(size_t w = 0; w < number_of_writes; ++w)

			
 
				     {			

			
@@ -354,7 +355,7 @@ int main(const int argc, const char * argv[])
 
				       

			
 
				      

			
 
				     for(size_t r = 0; r < number_of_ind_reads; ++r) WritePb_ind_reads[r] = where_to_read_independent[r] -ri;

			
 
				-

			
 
				+    delete[] where_to_read_independent;

			
 
				     boost::asio::write(sockets_2[3], boost::asio::buffer(WritePb_ind_reads, number_of_ind_reads * sizeof(size_t)));

			
 
				     boost::asio::write(sockets_[3], boost::asio::buffer(WritePb_ind_reads, number_of_ind_reads * sizeof(size_t)));

			
 
				     boost::asio::read(sockets_[3], boost::asio::buffer(WritePb_ind_reads_recv, number_of_ind_reads * sizeof(size_t)));

			
@@ -381,7 +382,10 @@ int main(const int argc, const char * argv[])
 
				       std::cout << "---> [duoram independent reads] " <<  print_reconstruction(sockets_[0], read_out_independent_reads[r]) << std::endl;

			
 
				       #endif

			
 
				     }

			
 
				-

			
 
				+    delete[] rotate;

			
 
				+    delete[] Gamma_reads;

			
 
				+    delete[] WritePb_ind_reads_recv;

			
 
				+    delete[] WritePb_ind_reads;

			
 
				     auto end_ind_reads = std::chrono::steady_clock::now();

			
 
				     std::chrono::duration<double> elapsed_seconds_ind_reads = end_ind_reads - start_ind_reads;

			
 
				     //printf("elapsed_seconds_ind_reads = %f\n",elapsed_seconds_ind_reads.count());

			
@@ -406,7 +410,7 @@ int main(const int argc, const char * argv[])
 
				        std::cout << print_reconstruction(sockets_[0], read_out_dependent_reads[r]) << std::endl;

			
 
				       #endif

			
 
				     }

			
 
				-    

			
 
				+    delete[] where_to_read_dependent;

			
 
				     auto end_dep_reads = std::chrono::steady_clock::now();

			
 
				     std::chrono::duration<double> elapsed_seconds_dep_reads = end_dep_reads - start_dep_reads;

			
 
				     dependent_read_time = elapsed_seconds_dep_reads.count();

			
@@ -416,6 +420,29 @@ int main(const int argc, const char * argv[])
 
				      std::cout << std::endl << std::endl << "============== DEPENDENT READS END  ==============" << std::endl << std::endl;

			
 
				     #endif

			
 
				    #endif

			
 
				+    

			
 
				+    free(reading_b);

			
 
				+    free(reading_c);

			
 
				+    free(reading_d);

			
 
				+    free(writing_b);

			
 
				+    free(writing_c);

			
 
				+    free(writing_d);

			
 
				+    free(reading_temp);

			
 
				+    free(DB);

			
 
				+    free(updated_DB);

			
 
				+    free(blinded_DB);

			
 
				+    free(blinded_DB_recv);

			
 
				+    free(updated_blinded_DB_recv);

			
 
				+    free(b);

			
 
				+    free(c);

			
 
				+    free(d);

			
 
				+    free(blinds);

			
 
				+    free(updated_blinds);

			
 
				+    

			
 
				+    #ifdef ThreeParty

			
 
				+     delete[] read_out;

			
 
				+     delete[] Gamma;

			
 
				+    #endif

			
 
				   }  

			
 
				 

			
 
				 auto end_total = std::chrono::steady_clock::now();

			
--- a/duoram-online/readvectors.h
+++ b/duoram-online/readvectors.h
@@ -13,6 +13,7 @@ int read_final_correction_word(bool party, DB_t& FCW_read, int i = 0)
 
				    concatanate_index("../duoram-online/preprocflags/FCW0", tmp, i);

			
 
				 			int const in0 { open(tmp, O_RDONLY ) };

			
 
				 	 	size_t r = read(in0, &FCW_read,   sizeof(FCW_read));	

			
 
				+	 	close(in0);

			
 
				    if(r < 0) perror("Read error");

			
 
				 		}

			
 
				 		

			
@@ -21,7 +22,8 @@ int read_final_correction_word(bool party, DB_t& FCW_read, int i = 0)
 
				    char tmp[100];

			
 
				    concatanate_index("../duoram-online/preprocflags/FCW1", tmp, i);

			
 
				 		 int const in0 { open(tmp, O_RDONLY ) };

			
 
				-	 	size_t r = read(in0, &FCW_read,   sizeof(FCW_read));	

			
 
				+	 	size_t r = read(in0, &FCW_read,   sizeof(FCW_read));

			
 
				+	 		close(in0);	

			
 
				  		if(r < 0) perror("Read error");

			
 
				 		}

			
 
				 

			
@@ -35,7 +37,8 @@ int read_rand_indx(bool party, DB_t& R, int i = 0)
 
				    char tmp[100];

			
 
				    concatanate_index("../duoram-online/preprocflags/R0", tmp, i);

			
 
				 			int const in0 { open(tmp, O_RDONLY ) };

			
 
				-	 	size_t r = read(in0, &R,   sizeof(R));	

			
 
				+	 	size_t r = read(in0, &R,   sizeof(R));

			
 
				+	 	close(in0);	

			
 
				    if(r < 0) perror("Read error");

			
 
				 		}

			
 
				 		

			
@@ -45,6 +48,7 @@ int read_rand_indx(bool party, DB_t& R, int i = 0)
 
				    concatanate_index("../duoram-online/preprocflags/R1", tmp, i);

			
 
				 		 int const in0 { open(tmp, O_RDONLY ) };

			
 
				 	 	size_t r = read(in0, &R,   sizeof(R));	

			
 
				+	 	close(in0);

			
 
				  		if(r < 0) perror("Read error");

			
 
				 		}

			
 
				 

			
@@ -59,16 +63,19 @@ int read_flags_for_reading(bool party, size_t db_nitems, int i = 0)
 
				    concatanate_index("../duoram-online/preprocflags/party0_read_flags_b", tmp, i);

			
 
				 	 	int const in0 { open(tmp, O_RDONLY ) };

			
 
				 	 	size_t r = read(in0, reading_b,  sizeof(reading_b));	

			
 
				+	 	close(in0);

			
 
				    if(r < 0) perror("Read error");

			
 
				   

			
 
				    concatanate_index("../duoram-online/preprocflags/party0_read_flags_c", tmp, i);

			
 
				    int const in1 { open( tmp, O_RDONLY ) };

			
 
				 	 	r = read(in1, reading_c,  sizeof(reading_c));

			
 
				+	 	close(in1);

			
 
				    if(r < 0) perror("Read error");

			
 
				 

			
 
				    concatanate_index("../duoram-online/preprocflags/party0_read_flags_d", tmp, i);

			
 
				 	  int const in2 { open( tmp, O_RDONLY ) };

			
 
				-	  r = read(in2, reading_d,  sizeof(reading_d));	

			
 
				+	  r = read(in2, reading_d,  sizeof(reading_d));

			
 
				+	  close(in2);	

			
 
				    if(r < 0) perror("Read error");

			
 
				 	}

			
 
				 

			
@@ -79,16 +86,19 @@ int read_flags_for_reading(bool party, size_t db_nitems, int i = 0)
 
				    concatanate_index("../duoram-online/preprocflags/party1_read_flags_b", tmp, i);

			
 
				 		 int const in0 { open(tmp, O_RDONLY ) };

			
 
				 	 	size_t r = read(in0, reading_b,  sizeof(reading_b));	

			
 
				+	 	close(in0);

			
 
				 	 	if(r < 0) perror("Read error");

			
 
				 

			
 
				    concatanate_index("../duoram-online/preprocflags/party1_read_flags_c", tmp, i);

			
 
				  	 int const in1 { open(tmp, O_RDONLY ) };

			
 
				 		 r = read(in1, reading_c,  sizeof(reading_c));

			
 
				+		 close(in1);

			
 
				 		 if(r < 0) perror("Read error");

			
 
				     

			
 
				    concatanate_index("../duoram-online/preprocflags/party1_read_flags_d", tmp, i);

			
 
				 		 int const in2 { open(tmp, O_RDONLY ) };

			
 
				 		 r = read(in2, reading_d,  sizeof(reading_d));	

			
 
				+		 close(in2);

			
 
				    if(r < 0) perror("Read error");

			
 
				 }

			
 
				 

			
@@ -103,17 +113,20 @@ int read_flags_for_writing(bool party, size_t db_nitems, int i = 0)
 
				     char tmp[100];

			
 
				     concatanate_index("../duoram-online/preprocflags/party0_write_flags_b", tmp, i);

			
 
				 	   int const in0_w { open(tmp, O_RDONLY ) };

			
 
				-    size_t r = read(in0_w, writing_b,  sizeof(writing_b));	

			
 
				+    size_t r = read(in0_w, writing_b,  sizeof(writing_b));

			
 
				+    close(in0_w);	

			
 
				     if(r < 0) perror("Read error");

			
 
				 

			
 
				     concatanate_index("../duoram-online/preprocflags/party0_write_flags_c", tmp,i);	   

			
 
				 	   int const in1_w { open( tmp, O_RDONLY ) };

			
 
				     r = read(in1_w, writing_c,  sizeof(writing_c));

			
 
				+    close(in1_w);

			
 
				     if(r < 0) perror("Read error");

			
 
				 	

			
 
				     concatanate_index("../duoram-online/preprocflags/party0_write_flags_d", tmp,i);

			
 
				 	   int const in2_w { open( tmp, O_RDONLY ) };

			
 
				    	r = read(in2_w, writing_d,  sizeof(writing_d));	

			
 
				+   	close(in2_w);

			
 
				     if(r < 0) perror("Read error");

			
 
				 	}

			
 
				 

			
@@ -123,16 +136,19 @@ int read_flags_for_writing(bool party, size_t db_nitems, int i = 0)
 
				    concatanate_index("../duoram-online/preprocflags/party1_write_flags_b", tmp,i);

			
 
				  		int const in0_w { open( tmp, O_RDONLY ) };

			
 
				 	 	size_t r = read(in0_w, writing_b,  sizeof(writing_b));	

			
 
				+	 	close(in0_w);

			
 
				 	 	if(r < 0) perror("Read error");

			
 
				 

			
 
				    concatanate_index("../duoram-online/preprocflags/party1_write_flags_c", tmp, i);

			
 
				    int const in1_w { open(tmp, O_RDONLY ) };

			
 
				 		 r = read(in1_w, writing_c,  sizeof(writing_c));

			
 
				+		 close(in1_w);

			
 
				    if(r < 0) perror("Read error");

			
 
				 

			
 
				 		 concatanate_index("../duoram-online/preprocflags/party1_write_flags_d", tmp, i);

			
 
				 		 int const in2_w { open( tmp, O_RDONLY ) };

			
 
				 		 r = read(in2_w, writing_d,  sizeof(writing_d));	

			
 
				+		 close(in2_w);

			
 
				 	  if(r < 0) perror("Read error");

			
 
				 	}

			
 
				 

			
@@ -145,21 +161,25 @@ int read_flags_for_writing(bool party, size_t db_nitems, int i = 0)
 
				    concatanate_index("../duoram-online/preprocflags/P2_party0_write_flags_c", tmp, i);

			
 
				    int const in1_w { open(tmp, O_RDONLY ) };

			
 
				    size_t r = read(in1_w, writing_c,  sizeof(writing_c));

			
 
				+    close(in1_w);

			
 
				    if(r < 0) perror("Read error");

			
 
				    

			
 
				    concatanate_index("../duoram-online/preprocflags/P2_party1_write_flags_d", tmp, i);

			
 
				    int const in2_w { open(tmp, O_RDONLY ) };

			
 
				    r = read(in2_w, writing_d,  sizeof(writing_d)); 

			
 
				+   close(in2_w);

			
 
				    if(r < 0) perror("Read error");

			
 
				 

			
 
				    concatanate_index("../duoram-online/preprocflags/P2_party0_write_c", tmp, i);

			
 
				    int const in1_w_ { open(tmp, O_RDONLY ) };

			
 
				    r = read(in1_w_, c,  sizeof(c));

			
 
				+   close(in1_w_);

			
 
				    if(r < 0) perror("Read error");

			
 
				    

			
 
				    concatanate_index("../duoram-online/preprocflags/P2_party1_write_d", tmp, i);

			
 
				    int const in2_w_ { open(tmp, O_RDONLY ) };

			
 
				    r = read(in2_w_, d,  sizeof(d));  

			
 
				+   close(in2_w_);

			
 
				    if(r < 0) perror("Read error");

			
 
				 

			
 
				    return 0;

			
@@ -171,11 +191,13 @@ int read_flags_for_generating_cancellation_terms(size_t db_nitems, int i = 0)
 
				   concatanate_index("../duoram-online/preprocflags/P2_party1_read_flags_d", tmp, i);

			
 
				   int const in2 { open(tmp, O_RDONLY ) };

			
 
				   size_t r = read(in2, reading_d,  sizeof(reading_d)); 

			
 
				+  close(in2);

			
 
				   if(r < 0) perror("Read error");

			
 
				 

			
 
				   concatanate_index("../duoram-online/preprocflags/P2_party0_read_flags_c", tmp, i);

			
 
				   int const in2_ { open(tmp, O_RDONLY ) };

			
 
				   r = read(in2_, reading_c,  sizeof(reading_c)); 

			
 
				+  close(in2_);	

			
 
				   if(r < 0) perror("Read error");

			
 
				    

			
 
				   return 0;

			
@@ -190,16 +212,19 @@ int read_flags_for_updating(bool party, size_t db_nitems, int i = 0)
 
				   concatanate_index("../duoram-online/preprocflags/party0_write_b", tmp, i);

			
 
				 		int const in0_w_ { open(tmp, O_RDONLY ) };

			
 
				 		size_t r = read(in0_w_, b,  sizeof(b));	

			
 
				+		close(in0_w_);	

			
 
				 	 if(r < 0) perror("Read error");

			
 
				  	

			
 
				   concatanate_index("../duoram-online/preprocflags/party0_write_c", tmp, i);

			
 
				   int const in1_w_ { open(tmp, O_RDONLY ) };

			
 
				 		r = read(in1_w_, c,  sizeof(c));

			
 
				+		close(in1_w_);	

			
 
				   if(r < 0) perror("Read error");

			
 
				 

			
 
				   concatanate_index("../duoram-online/preprocflags/party0_write_d", tmp, i); 

			
 
				   int const in2_w_ { open(tmp, O_RDONLY ) };

			
 
				 	 r = read(in2_w_, d,  sizeof(d));	

			
 
				+	 close(in2_w_);	

			
 
				   if(r < 0) perror("Read error");

			
 
				 	}

			
 
				 

			
@@ -207,17 +232,20 @@ int read_flags_for_updating(bool party, size_t db_nitems, int i = 0)
 
				 	{

			
 
				   concatanate_index("../duoram-online/preprocflags/party1_write_b", tmp, i);

			
 
				  	int const in0_w_ { open( tmp, O_RDONLY ) };

			
 
				-	 size_t r = read(in0_w_, b,  sizeof(b));	

			
 
				+	 size_t r = read(in0_w_, b,  sizeof(b));

			
 
				+	 close(in0_w_);	

			
 
				   if(r < 0) perror("Read error");

			
 
				  	

			
 
				   concatanate_index("../duoram-online/preprocflags/party1_write_c", tmp, i);

			
 
				   int const in1_w_ { open(tmp, O_RDONLY ) };

			
 
				 		r = read(in1_w_, c,  sizeof(c));

			
 
				+	 close(in1_w_);	

			
 
				   if(r < 0) perror("Read error");

			
 
				 

			
 
				   concatanate_index("../duoram-online/preprocflags/party1_write_d", tmp, i); 	

			
 
				   int const in2_w_ { open(tmp, O_RDONLY ) };

			
 
				-		 r = read(in2_w_, d,  sizeof(d));	

			
 
				+		 r = read(in2_w_, d,  sizeof(d));

			
 
				+		 close(in2_w_);		

			
 
				   if(r < 0) perror("Read error");

			
 
				 	}

			
 
				 

			
--- a/preprocessing/preprocessing.cpp
+++ b/preprocessing/preprocessing.cpp
@@ -68,20 +68,20 @@ int main(int argc, char * argv[])
 
				     /* The function make_connections appears in network.h */

			
 
				    make_connections(party, host1, host2,  io_context, socketsPb, socketsP2, ports,  ports2_1, ports2_0, number_of_sockets);

			
 
				  

			
 
				-      size_t RAM_needed_per_thread = 164 * db_nitems;

			
 
				-      std::cout << "RAM needed = " << n_threads*RAM_needed_per_thread << " bytes = " << n_threads*RAM_needed_per_thread/1073741824 << " GiB" << std::endl;

			
 
				-      std::cout << "RAM needed per thread = " << RAM_needed_per_thread << " bytes = " << (RAM_needed_per_thread>>30) << " GiB" << std::endl;

			
 
				-      size_t thread_per_batch = std::floor(double(maxRAM<<30)/RAM_needed_per_thread);

			
 
				-      if (thread_per_batch > n_threads) {

			
 
				-	thread_per_batch = n_threads;

			
 
				-      }

			
 
				-      std::cout << "thread_per_batch = " << thread_per_batch << std::endl;

			
 
				-      if (thread_per_batch < 1) {

			
 
				-       std::cout << "You need more RAM" << std::endl;

			
 
				-       exit(0);

			
 
				-      }

			
 
				-      size_t n_batches = std::ceil(double(n_threads)/thread_per_batch);

			
 
				-      std::cout << "n_batches = " << n_batches << std::endl;

			
 
				+   size_t RAM_needed_per_thread = 164 * db_nitems;

			
 
				+   std::cout << "RAM needed = " << n_threads*RAM_needed_per_thread << " bytes = " << n_threads*RAM_needed_per_thread/1073741824 << " GiB" << std::endl;

			
 
				+   std::cout << "RAM needed per thread = " << RAM_needed_per_thread << " bytes = " << (RAM_needed_per_thread>>30) << " GiB" << std::endl;

			
 
				+   size_t thread_per_batch = std::floor(double(maxRAM<<30)/RAM_needed_per_thread);

			
 
				+   if (thread_per_batch > n_threads) {

			
 
				+     thread_per_batch = n_threads;

			
 
				+   }

			
 
				+   std::cout << "thread_per_batch = " << thread_per_batch << std::endl;

			
 
				+   if (thread_per_batch < 1) {

			
 
				+    std::cout << "You need more RAM" << std::endl;

			
 
				+    exit(0);

			
 
				+   }

			
 
				+   size_t n_batches = std::ceil(double(n_threads)/thread_per_batch);

			
 
				+   std::cout << "n_batches = " << n_batches << std::endl;

			
 
				 

			
 
				    uint8_t ** target_share_read = new uint8_t*[thread_per_batch];

			
 
				 

			
@@ -104,61 +104,49 @@ int main(int argc, char * argv[])
 
				    }

			
 
				      

			
 
				  

			
 
				-   boost::asio::thread_pool pool_share_conversion(thread_per_batch);

			
 
				+  boost::asio::thread_pool pool_share_conversion(thread_per_batch);

			
 
				     

			
 
				-

			
 
				-    

			
 
				-    // The following function call creates and evaluates DPFs at target_share_read[j] for j \in \{0, \ldots, n_threads}

			
 
				-    // the flag vectors are stored in flags

			
 
				-    // the leaves are stored in output

			
 
				-    // the final correctionword is stored in final_correction_word

			
 
				-

			
 
				   dpfP2 * dpf_instance = (dpfP2 * ) malloc (sizeof(dpfP2) * n_threads);

			
 
				 

			
 
				 

			
 
				   cw_construction computecw_array;

			
 
				 

			
 
				  

			
 
				-     boost::asio::read(socketsP2[0], boost::asio::buffer(&computecw_array, sizeof(computecw_array)));

			
 
				-     #ifdef VERBOSE

			
 
				-      std::cout << "computecw_array.rand_b: " << computecw_array.rand_b[0] << " " << computecw_array.rand_b[1] << std::endl;

			
 
				-     #endif

			
 
				-

			
 
				-      /* The function create_dpfs appears in dpf-gen.h*/

			
 
				-      bool reading = true;

			
 
				-

			
 
				-      

			
 
				-

			
 
				-     size_t *thread_communication_costs = new size_t[thread_per_batch];

			
 
				-     for(size_t iter = 0; iter < n_batches; ++iter)

			
 
				-     { 

			
 
				-        if (n_batches > 1) {

			
 
				-          printf("Starting create_dpfs batch %lu / %lu\n", iter+1, n_batches);

			
 
				-        }

			
 
				-        boost::asio::thread_pool pool(thread_per_batch);

			
 
				-        for(size_t j = 0; j < thread_per_batch; ++j)

			
 
				-        {

			
 
				-	  thread_communication_costs[j] = 0; 

			
 
				-	  boost::asio::post(pool,

			
 
				-	    std::bind(create_dpfs, reading,  db_nitems, std::ref(aeskey),

			
 
				-		target_share_read[j], std::ref(socketsPb), std::ref(socketsP2),

			
 
				-		0, db_nitems-1, output[j],  flags[j],

			
 
				-		std::ref(final_correction_word[j]), computecw_array,

			
 
				-		std::ref(dpf_instance), party, 5 * j, j,

			
 
				-		std::ref(thread_communication_costs[j])));

			
 
				-        }    

			
 
				-        pool.join();

			
 
				-        for(size_t j = 0; j < thread_per_batch; ++j) {

			
 
				-	  communication_cost += thread_communication_costs[j];

			
 
				-	}

			
 
				-     }

			
 
				-     delete[] thread_communication_costs;

			
 
				-      

			
 
				-     boost::asio::write(socketsP2[0], boost::asio::buffer(dpf_instance, n_threads * sizeof(dpfP2))); // do this in parallel.

			
 
				-     communication_cost += (n_threads * sizeof(dpfP2));

			
 
				+  boost::asio::read(socketsP2[0], boost::asio::buffer(&computecw_array, sizeof(computecw_array)));

			
 
				  

			
 
				-   #ifdef DEBUG

			
 
				-

			
 
				+  #ifdef VERBOSE

			
 
				+  std::cout << "computecw_array.rand_b: " << computecw_array.rand_b[0] << " " << computecw_array.rand_b[1] << std::endl;

			
 
				+  #endif

			
 
				+

			
 
				+    /* The function create_dpfs appears in dpf-gen.h*/

			
 
				+  bool reading = true;

			
 
				+  size_t *thread_communication_costs = new size_t[thread_per_batch];

			
 
				+  for(size_t iter = 0; iter < n_batches; ++iter)

			
 
				+  { 

			
 
				+      if (n_batches > 1) {

			
 
				+        printf("Starting create_dpfs batch %lu / %lu\n", iter+1, n_batches);

			
 
				+      }

			
 
				+      boost::asio::thread_pool pool(thread_per_batch);

			
 
				+      for(size_t j = 0; j < thread_per_batch; ++j)

			
 
				+      {

			
 
				+    	  thread_communication_costs[j] = 0; 

			
 
				+    	  boost::asio::post(pool,

			
 
				+    	  std::bind(create_dpfs, reading,  db_nitems, std::ref(aeskey), target_share_read[j], std::ref(socketsPb), std::ref(socketsP2), 0, db_nitems-1, output[j],  flags[j],

			
 
				+    		          std::ref(final_correction_word[j]), computecw_array, std::ref(dpf_instance), party, 5 * j, j, std::ref(thread_communication_costs[j])));

			
 
				+      }    

			
 
				+      pool.join();

			
 
				+      for(size_t j = 0; j < thread_per_batch; ++j) {

			
 
				+       communication_cost += thread_communication_costs[j];

			
 
				+      }

			
 
				+  }

			
 
				+  

			
 
				+  delete[] thread_communication_costs;

			
 
				+      

			
 
				+  boost::asio::write(socketsP2[0], boost::asio::buffer(dpf_instance, n_threads * sizeof(dpfP2))); // do this in parallel.

			
 
				+  communication_cost += (n_threads * sizeof(dpfP2));

			
 
				+  free(dpf_instance);

			
 
				+   

			
 
				+  #ifdef DEBUG

			
 
				     for(size_t j = 0; j < n_threads; ++j)

			
 
				     {

			
 
				       std::cout << "n_threads = " << j << std::endl;

			
@@ -183,19 +171,20 @@ int main(int argc, char * argv[])
 
				         final_correction_word_reconstruction = final_correction_word_reconstruction + final_correction_word[j][0];

			
 
				         std::cout << "final_correction_word_reconstruction = " << final_correction_word_reconstruction << std::endl << std::endl;

			
 
				      }

			
 
				-    #endif

			
 
				+  #endif

			
 
				  

			
 
				     /* 

			
 
				      leaves is a additive shares of the outputs (leaves of the DPF)

			
 
				      leafbits is the additive shares of flag bits of the DPFs

			
 
				     */

			
 
				-   int64_t ** leaves = (int64_t ** ) malloc(sizeof(int64_t *) * thread_per_batch);

			
 
				-   int64_t ** leafbits  = (int64_t ** ) malloc(sizeof(int64_t *) * thread_per_batch); 

			
 
				-   for(size_t j = 0; j < thread_per_batch; ++j)

			
 
				-   {

			
 
				-    leaves[j] = (int64_t *)std::aligned_alloc(sizeof(node_t), db_nitems * sizeof(int64_t));

			
 
				-    leafbits[j]  = (int64_t *)std::aligned_alloc(sizeof(node_t), db_nitems * sizeof(int64_t));

			
 
				-   }

			
 
				+  int64_t ** leaves = (int64_t ** ) malloc(sizeof(int64_t *) * thread_per_batch);

			
 
				+  int64_t ** leafbits  = (int64_t ** ) malloc(sizeof(int64_t *) * thread_per_batch); 

			
 
				+ 

			
 
				+  for(size_t j = 0; j < thread_per_batch; ++j)

			
 
				+  {

			
 
				+   leaves[j] = (int64_t *)std::aligned_alloc(sizeof(node_t), db_nitems * sizeof(int64_t));

			
 
				+   leafbits[j]  = (int64_t *)std::aligned_alloc(sizeof(node_t), db_nitems * sizeof(int64_t));

			
 
				+  }

			
 
				 

			
 
				 

			
 
				 

			
@@ -203,23 +192,36 @@ int main(int argc, char * argv[])
 
				    for(size_t j = 0; j < thread_per_batch; ++j)

			
 
				    {

			
 
				      boost::asio::post(pool_share_conversion,  std::bind(convert_shares, j, output, flags, n_threads, db_nitems, final_correction_word, 	leaves, leafbits, 

			
 
				-                                                          std::ref(socketsPb), std::ref(socketsP2), party));	 	

			
 
				+                                               std::ref(socketsPb), std::ref(socketsP2), party));	 	

			
 
				    }

			
 
				     

			
 
				-    pool_share_conversion.join();

			
 
				+   pool_share_conversion.join();

			
 
				 

			
 
				-    boost::asio::thread_pool pool_xor_to_additive(thread_per_batch); 

			
 
				+   boost::asio::thread_pool pool_xor_to_additive(thread_per_batch); 

			
 
				 

			
 
				-    int64_t *additve_shares = new int64_t[thread_per_batch]; 

			
 
				-    for(size_t j = 0; j < thread_per_batch; ++j)

			
 
				-    {

			
 
				+   int64_t *additve_shares = new int64_t[thread_per_batch]; 

			
 
				+   

			
 
				+   for(size_t j = 0; j < thread_per_batch; ++j)

			
 
				+   {

			
 
				      boost::asio::post(pool_xor_to_additive, std::bind(xor_to_additive, party, target_share_read[j], std::ref(socketsPb[j]), std::ref(socketsP2[j]), expo, std::ref(additve_shares[j])));

			
 
				-    }

			
 
				+   }

			
 
				 

			
 
				-    pool_xor_to_additive.join();

			
 
				+   pool_xor_to_additive.join();

			
 
				     

			
 
				- 

			
 
				+   for(size_t j = 0; j < thread_per_batch; ++j)

			
 
				+   {

			
 
				+    free(leaves[j]);

			
 
				+    free(leafbits[j]);

			
 
				+    free(output[j]);

			
 
				+    free(flags[j]);

			
 
				+    delete[] target_share_read[j];

			
 
				+   }

			
 
				     

			
 
				+    free(leaves);

			
 
				+    free(leafbits);

			
 
				+    free(output);

			
 
				+    free(flags);

			
 
				+    delete[] target_share_read;

			
 
				     /* For the artifact, don't actually write these in order to not use very

			
 
				      * large amounts of storage

			
 
				 

			
--- a/preprocessing/share-conversion.h
+++ b/preprocessing/share-conversion.h
@@ -315,7 +315,7 @@ void convert_shares(size_t i, __m128i ** output, int8_t ** flags, size_t n_threa
 
				 		leaves[i][j]  = output[i][j][0];

			
 
				 		flags_[j] = (flags[i][j] * pm) + (flags[i][j] * share_b_recv.PM) + (flags[i][j] * rb); 

			
 
				 		flags_[j] += output[i][j][1]; 

			
 
				-  flags_[j] -= (flags[i][j] * P2_shareconversion.FCWshare_reconstruction);		

			
 
				+    flags_[j] -= (flags[i][j] * P2_shareconversion.FCWshare_reconstruction);		

			
 
				 

			
 
				 	

			
 
				 		#ifdef DEBUG