
Create wide RDPFs

Ian Goldberg 1 year ago
commit ff2653d6ea
8 changed files with 578 additions and 360 deletions
  1. Makefile (+22 -21)
  2. bitutils.hpp (+19 -0)
  3. mpcops.hpp (+13 -0)
  4. mpcops.tcc (+16 -0)
  5. prg.hpp (+2 -2)
  6. rdpf.hpp (+2 -1)
  7. rdpf.tcc (+487 -336)
  8. types.hpp (+17 -0)

+ 22 - 21
Makefile

@@ -31,26 +31,27 @@ depend:
 
 # DO NOT DELETE THIS LINE -- make depend depends on it.
 
-prac.o: mpcio.hpp types.hpp corotypes.hpp mpcio.tcc preproc.hpp options.hpp
-prac.o: online.hpp
-mpcio.o: mpcio.hpp types.hpp corotypes.hpp mpcio.tcc rdpf.hpp coroutine.hpp
-mpcio.o: bitutils.hpp dpf.hpp prg.hpp aes.hpp rdpf.tcc mpcops.hpp cdpf.hpp
-mpcio.o: cdpf.tcc
-preproc.o: types.hpp coroutine.hpp corotypes.hpp mpcio.hpp mpcio.tcc
-preproc.o: preproc.hpp options.hpp rdpf.hpp bitutils.hpp dpf.hpp prg.hpp
-preproc.o: aes.hpp rdpf.tcc mpcops.hpp cdpf.hpp cdpf.tcc
-online.o: online.hpp mpcio.hpp types.hpp corotypes.hpp mpcio.tcc options.hpp
-online.o: mpcops.hpp coroutine.hpp rdpf.hpp bitutils.hpp dpf.hpp prg.hpp
-online.o: aes.hpp rdpf.tcc duoram.hpp duoram.tcc cdpf.hpp cdpf.tcc cell.hpp
-mpcops.o: mpcops.hpp types.hpp mpcio.hpp corotypes.hpp mpcio.tcc
-mpcops.o: coroutine.hpp bitutils.hpp
-rdpf.o: rdpf.hpp mpcio.hpp types.hpp corotypes.hpp mpcio.tcc coroutine.hpp
-rdpf.o: bitutils.hpp dpf.hpp prg.hpp aes.hpp rdpf.tcc mpcops.hpp
+prac.o: mpcio.hpp types.hpp bitutils.hpp corotypes.hpp mpcio.tcc preproc.hpp
+prac.o: options.hpp online.hpp
+mpcio.o: mpcio.hpp types.hpp bitutils.hpp corotypes.hpp mpcio.tcc rdpf.hpp
+mpcio.o: coroutine.hpp dpf.hpp prg.hpp aes.hpp rdpf.tcc mpcops.hpp mpcops.tcc
+mpcio.o: cdpf.hpp cdpf.tcc
+preproc.o: types.hpp bitutils.hpp coroutine.hpp corotypes.hpp mpcio.hpp
+preproc.o: mpcio.tcc preproc.hpp options.hpp rdpf.hpp dpf.hpp prg.hpp aes.hpp
+preproc.o: rdpf.tcc mpcops.hpp mpcops.tcc cdpf.hpp cdpf.tcc
+online.o: online.hpp mpcio.hpp types.hpp bitutils.hpp corotypes.hpp mpcio.tcc
+online.o: options.hpp mpcops.hpp coroutine.hpp mpcops.tcc rdpf.hpp dpf.hpp
+online.o: prg.hpp aes.hpp rdpf.tcc duoram.hpp duoram.tcc cdpf.hpp cdpf.tcc
+online.o: cell.hpp
+mpcops.o: mpcops.hpp types.hpp bitutils.hpp mpcio.hpp corotypes.hpp mpcio.tcc
+mpcops.o: coroutine.hpp mpcops.tcc
+rdpf.o: rdpf.hpp mpcio.hpp types.hpp bitutils.hpp corotypes.hpp mpcio.tcc
+rdpf.o: coroutine.hpp dpf.hpp prg.hpp aes.hpp rdpf.tcc mpcops.hpp mpcops.tcc
 cdpf.o: bitutils.hpp cdpf.hpp mpcio.hpp types.hpp corotypes.hpp mpcio.tcc
 cdpf.o: coroutine.hpp dpf.hpp prg.hpp aes.hpp cdpf.tcc
-duoram.o: duoram.hpp types.hpp mpcio.hpp corotypes.hpp mpcio.tcc
-duoram.o: coroutine.hpp duoram.tcc mpcops.hpp cdpf.hpp dpf.hpp prg.hpp
-duoram.o: bitutils.hpp aes.hpp cdpf.tcc rdpf.hpp rdpf.tcc
-cell.o: types.hpp duoram.hpp mpcio.hpp corotypes.hpp mpcio.tcc coroutine.hpp
-cell.o: duoram.tcc mpcops.hpp cdpf.hpp dpf.hpp prg.hpp bitutils.hpp aes.hpp
-cell.o: cdpf.tcc rdpf.hpp rdpf.tcc cell.hpp options.hpp
+duoram.o: duoram.hpp types.hpp bitutils.hpp mpcio.hpp corotypes.hpp mpcio.tcc
+duoram.o: coroutine.hpp duoram.tcc mpcops.hpp mpcops.tcc cdpf.hpp dpf.hpp
+duoram.o: prg.hpp aes.hpp cdpf.tcc rdpf.hpp rdpf.tcc
+cell.o: types.hpp bitutils.hpp duoram.hpp mpcio.hpp corotypes.hpp mpcio.tcc
+cell.o: coroutine.hpp duoram.tcc mpcops.hpp mpcops.tcc cdpf.hpp dpf.hpp
+cell.o: prg.hpp aes.hpp cdpf.tcc rdpf.hpp rdpf.tcc cell.hpp options.hpp

+ 19 - 0
bitutils.hpp

@@ -5,6 +5,7 @@
 #ifndef __BITUTILS_HPP__
 #define __BITUTILS_HPP__
 
+#include <array>
 #include <cstdint>
 #include <x86intrin.h>  // SSE and AVX intrinsics
 
@@ -42,12 +43,30 @@ inline __m128i xor_if(const __m128i & block1, const __m128i & block2, bool flag)
     return _mm_xor_si128(block1, _mm_and_si128(block2, if128_mask[flag ? 1 : 0]));
 }
 
+template <size_t LWIDTH>
+inline std::array<__m128i,LWIDTH> xor_if(
+    const std::array<__m128i,LWIDTH> & block1,
+    const std::array<__m128i,LWIDTH> & block2, bool flag)
+{
+    std::array<__m128i,LWIDTH> res;
+    for (size_t j=0;j<LWIDTH;++j) {
+        res[j] = xor_if(block1[j], block2[j], flag);
+    }
+    return res;
+}
+
 inline uint8_t get_lsb(const __m128i & block, uint8_t bits = 0b01)
 {
     __m128i vcmp = _mm_xor_si128(_mm_and_si128(block, lsb128_mask[bits]), lsb128_mask[bits]);
     return static_cast<uint8_t>(_mm_testz_si128(vcmp, vcmp));
 }
 
+template <size_t LWIDTH>
+inline uint8_t get_lsb(const std::array<__m128i,LWIDTH> & block)
+{
+    return get_lsb(block[0]);
+}
+
 inline __m128i clear_lsb(const __m128i & block, uint8_t bits = 0b01)
 {
     return _mm_and_si128(block, lsb128_mask_inv[bits]);

+ 13 - 0
mpcops.hpp

@@ -110,6 +110,17 @@ void mpc_xs_to_as(MPCTIO &tio, yield_t &yield,
 void mpc_reconstruct_choice(MPCTIO &tio, yield_t &yield,
     DPFnode &z, RegBS f, DPFnode x, DPFnode y);
 
+// As above, but for arrays of DPFnode
+//
+// Cost:
+// 6*LWIDTH 64-bit words sent in 2 messages
+// consumes LWIDTH AndTriples
+template <size_t LWIDTH>
+void mpc_reconstruct_choice(MPCTIO &tio, yield_t &yield,
+    std::array<DPFnode,LWIDTH> &z, RegBS f,
+    const std::array<DPFnode,LWIDTH> &x,
+    const std::array<DPFnode,LWIDTH> &y);
+
 // P0 and P1 hold bit shares of x and y.  Set z to bit shares of x & y.
 //
 // Cost:
@@ -126,4 +137,6 @@ void mpc_and(MPCTIO &tio, yield_t &yield,
 void mpc_or(MPCTIO &tio, yield_t &yield,
     RegBS &z, RegBS x, RegBS y);
 
+#include "mpcops.tcc"
+
 #endif

+ 16 - 0
mpcops.tcc

@@ -0,0 +1,16 @@
+template <size_t LWIDTH>
+void mpc_reconstruct_choice(MPCTIO &tio, yield_t &yield,
+    std::array<DPFnode,LWIDTH> &z, RegBS f,
+    const std::array<DPFnode,LWIDTH> &x,
+    const std::array<DPFnode,LWIDTH> &y)
+{
+    std::vector<coro_t> coroutines;
+    for (size_t j=0;j<LWIDTH;++j) {
+        coroutines.emplace_back(
+            [&tio, &z, f, &x, &y, j](yield_t &yield) {
+                mpc_reconstruct_choice(tio, yield, z[j],
+                    f, x[j], y[j]);
+            });
+    }
+    run_coroutines(yield, coroutines);
+}

+ 2 - 2
prg.hpp

@@ -50,7 +50,7 @@ static inline void prgboth(__m128i &left, __m128i &right, __m128i seed,
 // Compute one of the leaf children of node seed; whichchild=0 for
 // the left child, 1 for the right child
 template <size_t LWIDTH>
-static inline void prgleaf(std::array<__m128i,LWIDTH> &out,
+static inline void prg(std::array<__m128i,LWIDTH> &out,
     __m128i seed, bool whichchild, size_t &aes_ops)
 {
     __m128i in = set_lsb(seed, whichchild);
@@ -73,7 +73,7 @@ static inline void prgleaf(std::array<__m128i,LWIDTH> &out,
 
 // Compute both of the leaf children of node seed
 template <size_t LWIDTH>
-static inline void prgleafboth(std::array<__m128i,LWIDTH> &left,
+static inline void prgboth(std::array<__m128i,LWIDTH> &left,
     std::array<__m128i,LWIDTH> &right, __m128i seed, size_t &aes_ops)
 {
     __m128i inl = set_lsb(seed, 0);

+ 2 - 1
rdpf.hpp

@@ -45,6 +45,7 @@ struct RDPF : public DPF {
     // one leaf level (at the bottom), but incremental RDPFs have a leaf
     // level for each level of the DPF.
     struct LeafInfo {
+        static const nbits_t W = WIDTH;
         // The correction word for this leaf level
         LeafNode leaf_cw;
         // The amount we have to scale the low words of the leaf values by
@@ -153,7 +154,7 @@ struct RDPF : public DPF {
         value_t highword =
             value_t(_mm_cvtsi128_si64x(_mm_srli_si128(leaf[0],8)));
         x[j++].xshare = highword;
-        for (nbits_t i=1;i<WIDTH;++i) {
+        for (nbits_t i=1;i<LWIDTH;++i) {
             value_t lowword =
                 value_t(_mm_cvtsi128_si64x(leaf[i]));
             value_t highword =

+ 487 - 336
rdpf.tcc

@@ -191,7 +191,7 @@ inline typename RDPF<WIDTH>::LeafNode RDPF<WIDTH>::descend_to_leaf(
 {
     typename RDPF<WIDTH>::LeafNode prgout;
     bool flag = get_lsb(parent);
-    prgleaf(prgout, parent, whichchild, aes_ops);
+    prg(prgout, parent, whichchild, aes_ops);
     if (flag) {
         LeafNode CW = li[0].leaf_cw;
         LeafNode CWR = CW;
@@ -335,6 +335,463 @@ T& operator>>(T &is, RDPFPair<WIDTH> &rdpfpair)
     return is;
 }
 
+// Set a DPFnode to zero
+static inline void zero(DPFnode &z)
+{
+    z = _mm_setzero_si128();
+}
+
+// Set a LeafNode to zero
+template <size_t LWIDTH>
+static inline void zero(std::array<DPFnode,LWIDTH> &z)
+{
+    for (size_t j=0;j<LWIDTH;++j) {
+        zero(z[j]);
+    }
+}
+
+// Set an array of value_t to zero
+template <size_t WIDTH>
+static inline void zero(std::array<value_t,WIDTH> &z)
+{
+    for (size_t j=0;j<WIDTH;++j) {
+        z[j] = 0;
+    }
+}
+
+
+// Expand a level of the RDPF into the next level without threads. This
+// just computes the PRGs without computing or applying the correction
+// words.  L and R will be set to the XORs of the left children and the
+// XORs of the right children respectively. NT will be LeafNode if we
+// are expanding into a leaf level, DPFnode if not.
+template <typename NT>
+static inline void expand_level_nothreads(size_t start, size_t end,
+    const DPFnode *curlevel, NT *nextlevel, NT &L, NT &R,
+    size_t &aes_ops)
+{
+    // Only touch registers in the inner loop if possible
+    NT lL, lR;
+    zero(lL);
+    zero(lR);
+    size_t laes_ops = 0;
+    for(size_t i=start;i<end;++i) {
+        NT lchild, rchild;
+        prgboth(lchild, rchild, curlevel[i], laes_ops);
+        lL ^= lchild;
+        lR ^= rchild;
+        nextlevel[2*i] = lchild;
+        nextlevel[2*i+1] = rchild;
+    }
+    L = lL;
+    R = lR;
+    aes_ops += laes_ops;
+}
+
+// As above, but possibly use threads, based on the RDPF_MTGEN_TIMING_1
+// timing benchmarks
+template <typename NT>
+static inline void expand_level(int max_nthreads, nbits_t level,
+    const DPFnode *curlevel, NT *nextlevel, NT &L, NT &R,
+    size_t &aes_ops)
+{
+    size_t curlevel_size = (size_t(1)<<level);
+    if (max_nthreads == 1 || level < 19) {
+        // No threading
+        expand_level_nothreads(0, curlevel_size,
+            curlevel, nextlevel, L, R, aes_ops);
+    } else {
+        int nthreads =
+            int(ceil(sqrt(double(curlevel_size/6000))));
+        if (nthreads > max_nthreads) {
+            nthreads = max_nthreads;
+        }
+        NT tL[nthreads];
+        NT tR[nthreads];
+        size_t taes_ops[nthreads];
+        size_t threadstart = 0;
+        size_t threadchunk = curlevel_size / nthreads;
+        size_t threadextra = curlevel_size % nthreads;
+        boost::asio::thread_pool pool(nthreads);
+        for (int t=0;t<nthreads;++t) {
+            size_t threadsize = threadchunk + (size_t(t) < threadextra);
+            size_t threadend = threadstart + threadsize;
+            taes_ops[t] = 0;
+            boost::asio::post(pool,
+                [t, &tL, &tR, &taes_ops, threadstart, threadend,
+                &curlevel, &nextlevel] {
+                    expand_level_nothreads(threadstart, threadend,
+                        curlevel, nextlevel, tL[t], tR[t], taes_ops[t]);
+                });
+            threadstart = threadend;
+        }
+        pool.join();
+        // Again work on registers as much as possible
+        NT lL, lR;
+        zero(lL);
+        zero(lR);
+        size_t laes_ops = 0;
+        for (int t=0;t<nthreads;++t) {
+            lL ^= tL[t];
+            lR ^= tR[t];
+            laes_ops += taes_ops[t];
+        }
+        L = lL;
+        R = lR;
+        aes_ops += laes_ops;
+    }
+}
+
+// Apply the correction words to an expanded non-leaf level (nextlevel),
+// based on the flag bits in curlevel. This version does not use
+// threads.
+static inline void finalize_nonleaf_layer_nothreads(size_t start,
+    size_t end, const DPFnode *curlevel, DPFnode *nextlevel,
+    DPFnode CWL, DPFnode CWR)
+{
+    for(size_t i=start;i<end;++i) {
+        bool flag = get_lsb(curlevel[i]);
+        nextlevel[2*i] = xor_if(nextlevel[2*i], CWL, flag);
+        nextlevel[2*i+1] = xor_if(nextlevel[2*i+1], CWR, flag);
+    }
+}
+
+// As above, but possibly use threads, based on the RDPF_MTGEN_TIMING_1
+// timing benchmarks.  The timing of each iteration of the inner loop is
+// comparable to the above, so just use the same computations.  All of
+// this could be tuned, of course.
+static inline void finalize_nonleaf_layer(int max_nthreads, nbits_t level,
+    const DPFnode *curlevel, DPFnode *nextlevel, DPFnode CWL,
+    DPFnode CWR)
+{
+    size_t curlevel_size = (size_t(1)<<level);
+    if (max_nthreads == 1 || level < 19) {
+        // No threading
+        finalize_nonleaf_layer_nothreads(0, curlevel_size,
+            curlevel, nextlevel, CWL, CWR);
+    } else {
+        int nthreads =
+            int(ceil(sqrt(double(curlevel_size/6000))));
+        if (nthreads > max_nthreads) {
+            nthreads = max_nthreads;
+        }
+        size_t threadstart = 0;
+        size_t threadchunk = curlevel_size / nthreads;
+        size_t threadextra = curlevel_size % nthreads;
+        boost::asio::thread_pool pool(nthreads);
+        for (int t=0;t<nthreads;++t) {
+            size_t threadsize = threadchunk + (size_t(t) < threadextra);
+            size_t threadend = threadstart + threadsize;
+            boost::asio::post(pool,
+                [threadstart, threadend, CWL, CWR,
+                &curlevel, &nextlevel] {
+                    finalize_nonleaf_layer_nothreads(threadstart, threadend,
+                        curlevel, nextlevel, CWL, CWR);
+                });
+            threadstart = threadend;
+        }
+        pool.join();
+    }
+}
+
+// Finalize a leaf layer. This applies the correction words, and
+// computes the low and high sums and XORs.  This version does not use
+// threads.  You can pass save_expansion = false here if you don't need
+// to save the expansion.  LN is a LeafNode.
+template <size_t WIDTH, typename LN>
+static inline void finalize_leaf_layer_nothreads(size_t start,
+    size_t end, const DPFnode *curlevel, LN *nextlevel,
+    bool save_expansion, LN CWL, LN CWR, value_t &low_sum,
+    std::array<value_t,WIDTH> &high_sum,
+    std::array<value_t,WIDTH> &high_xor)
+{
+    value_t llow_sum = 0;
+    std::array<value_t,WIDTH> lhigh_sum;
+    std::array<value_t,WIDTH> lhigh_xor;
+    zero(lhigh_sum);
+    zero(lhigh_xor);
+    for(size_t i=start;i<end;++i) {
+        bool flag = get_lsb(curlevel[i]);
+        LN leftchild = xor_if(nextlevel[2*i], CWL, flag);
+        LN rightchild = xor_if(nextlevel[2*i+1], CWR, flag);
+        if (save_expansion) {
+            nextlevel[2*i] = leftchild;
+            nextlevel[2*i+1] = rightchild;
+        }
+        value_t leftlow = value_t(_mm_cvtsi128_si64x(leftchild[0]));
+        value_t rightlow = value_t(_mm_cvtsi128_si64x(rightchild[0]));
+        value_t lefthigh =
+            value_t(_mm_cvtsi128_si64x(_mm_srli_si128(leftchild[0],8)));
+        value_t righthigh =
+            value_t(_mm_cvtsi128_si64x(_mm_srli_si128(rightchild[0],8)));
+        llow_sum += (leftlow + rightlow);
+        lhigh_sum[0] += (lefthigh + righthigh);
+        lhigh_xor[0] ^= (lefthigh ^ righthigh);
+        size_t w = 0;
+        for (size_t j=1; j<WIDTH; j+=2) {
+            ++w;
+            value_t leftlow = value_t(_mm_cvtsi128_si64x(leftchild[w]));
+            value_t rightlow = value_t(_mm_cvtsi128_si64x(rightchild[w]));
+            value_t lefthigh =
+                value_t(_mm_cvtsi128_si64x(_mm_srli_si128(leftchild[w],8)));
+            value_t righthigh =
+                value_t(_mm_cvtsi128_si64x(_mm_srli_si128(rightchild[w],8)));
+            lhigh_sum[j] += (leftlow + rightlow);
+            lhigh_xor[j] ^= (leftlow ^ rightlow);
+            if (j+1 < WIDTH) {
+                lhigh_sum[j+1] += (lefthigh + righthigh);
+                lhigh_xor[j+1] ^= (lefthigh ^ righthigh);
+            }
+        }
+    }
+    low_sum = llow_sum;
+    high_sum = lhigh_sum;
+    high_xor = lhigh_xor;
+}
+
+// As above, but possibly use threads, based on the RDPF_MTGEN_TIMING_1
+// timing benchmarks.  The timing of each iteration of the inner loop is
+// comparable to the above, so just use the same computations.  All of
+// this could be tuned, of course.
+template <size_t WIDTH, typename LN>
+static inline void finalize_leaf_layer(int max_nthreads, nbits_t level,
+    const DPFnode *curlevel, LN *nextlevel, bool save_expansion,
+    LN CWL, LN CWR, value_t &low_sum,
+    std::array<value_t,WIDTH> &high_sum,
+    std::array<value_t,WIDTH> &high_xor)
+{
+    size_t curlevel_size = (size_t(1)<<level);
+    if (max_nthreads == 1 || level < 19) {
+        // No threading
+        finalize_leaf_layer_nothreads(0, curlevel_size,
+            curlevel, nextlevel, save_expansion, CWL, CWR,
+            low_sum, high_sum, high_xor);
+    } else {
+        int nthreads =
+            int(ceil(sqrt(double(curlevel_size/6000))));
+        if (nthreads > max_nthreads) {
+            nthreads = max_nthreads;
+        }
+        value_t tlow_sum[nthreads];
+        std::array<value_t,WIDTH> thigh_sum[nthreads];
+        std::array<value_t,WIDTH> thigh_xor[nthreads];
+        size_t threadstart = 0;
+        size_t threadchunk = curlevel_size / nthreads;
+        size_t threadextra = curlevel_size % nthreads;
+        boost::asio::thread_pool pool(nthreads);
+        for (int t=0;t<nthreads;++t) {
+            size_t threadsize = threadchunk + (size_t(t) < threadextra);
+            size_t threadend = threadstart + threadsize;
+            boost::asio::post(pool,
+                [t, &tlow_sum, &thigh_sum, &thigh_xor, threadstart, threadend,
+                &curlevel, &nextlevel, CWL, CWR, save_expansion] {
+                    finalize_leaf_layer_nothreads(threadstart, threadend,
+                        curlevel, nextlevel, save_expansion, CWL, CWR,
+                        tlow_sum[t], thigh_sum[t], thigh_xor[t]);
+                });
+            threadstart = threadend;
+        }
+        pool.join();
+        low_sum = 0;
+        zero(high_sum);
+        zero(high_xor);
+        for (int t=0;t<nthreads;++t) {
+            low_sum += tlow_sum[t];
+            high_sum += thigh_sum[t];
+            high_xor ^= thigh_xor[t];
+        }
+    }
+}
+
+
+
+// Create one level of the RDPF.  NT will be as above: LeafNode if we
+// are expanding into a leaf level, DPFnode if not.  LI will be LeafInfo
+// if we are expanding into a leaf level, and it is unused otherwise.
+template<typename NT, typename LI>
+static inline void create_level(MPCTIO &tio, yield_t &yield,
+    const DPFnode *curlevel, NT *nextlevel,
+    int player, nbits_t level, nbits_t depth, RegBS bs_choice, NT &CW,
+    bool &cfbit, bool save_expansion, LI &li, size_t &aes_ops)
+{
+    // tio.cpu_nthreads() is the maximum number of threads we
+    // have available.
+    int max_nthreads = tio.cpu_nthreads();
+
+    NT L, R;
+    zero(L);
+    zero(R);
+    // The server doesn't need to do this computation, but it does
+    // need to execute mpc_reconstruct_choice so that it sends
+    // the AndTriples at the appropriate time.
+    if (player < 2) {
+#ifdef RDPF_MTGEN_TIMING_1
+        if (player == 0) {
+            mtgen_timetest_1(level, 0, (1<<23)>>level, curlevel,
+                nextlevel, aes_ops);
+            size_t niters = 2048;
+            if (level > 8) niters = (1<<20)>>level;
+            for(int t=1;t<=8;++t) {
+                mtgen_timetest_1(level, t, niters, curlevel,
+                    nextlevel, aes_ops);
+            }
+            mtgen_timetest_1(level, 0, (1<<23)>>level, curlevel,
+                nextlevel, aes_ops);
+        }
+#endif
+        // Using the timing results gathered above, decide whether
+        // to multithread, and if so, how many threads to use.
+        expand_level(max_nthreads, level, curlevel, nextlevel,
+            L, R, aes_ops);
+    }
+
+    // If we're going left (bs_choice = 0), we want the correction
+    // word to be the XOR of our right side and our peer's right
+    // side; if bs_choice = 1, it should be the XOR of our left side
+    // and our peer's left side.
+
+    // We also have to ensure that the flag bits (the lsb) of the
+    // side that will end up the same be of course the same, but
+    // also that the flag bits (the lsb) of the side that will end
+    // up different _must_ be different.  That is, it's not enough
+    // for the nodes of the child selected by choice to be different
+    // as 128-bit values; they also have to be different in their
+    // lsb.
+
+    // This is where we make a small optimization over Appendix C of
+    // the Duoram paper: instead of keeping separate correction flag
+    // bits for the left and right children, we observe that the low
+    // bit of the overall correction word effectively serves as one
+    // of those bits, so we just need to store one extra bit per
+    // level, not two.  (We arbitrarily choose the one for the right
+    // child.)
+
+    // Note that the XOR of our left and right child before and
+    // after applying the correction word won't change, since the
+    // correction word is applied to either both children or
+    // neither, depending on the value of the parent's flag. So in
+    // particular, the XOR of the flag bits won't change, and if our
+    // children's flag's XOR equals our peer's children's flag's
+    // XOR, then we won't have different flag bits even for the
+    // children that have different 128-bit values.
+
+    // So we compute our_parity = lsb(L^R)^player, and we XOR that
+    // into the R value in the correction word computation.  At the
+    // same time, we exchange these parity values to compute the
+    // combined parity, which we store in the DPF.  Then when the
+    // DPF is evaluated, if the parent's flag is set, not only apply
+    // the correction word to both children, but also apply the
+    // (combined) parity bit to just the right child.  Then for
+    // unequal nodes (where the flag bit is different), exactly one
+    // of the four children (two for P0 and two for P1) will have
+    // the parity bit applied, which will set the XOR of the lsb of
+    // those four nodes to just L0^R0^L1^R1^our_parity^peer_parity
+    // = 1 because everything cancels out except player (for which
+    // one player is 0 and the other is 1).
+
+    bool our_parity_bit = get_lsb(L) ^ get_lsb(R) ^ !!player;
+    xor_lsb(R, our_parity_bit);
+
+    NT CWL;
+    bool peer_parity_bit;
+    // Exchange the parities and do mpc_reconstruct_choice at the
+    // same time (bundled into the same rounds)
+    run_coroutines(yield,
+        [&tio, &our_parity_bit, &peer_parity_bit](yield_t &yield) {
+            tio.queue_peer(&our_parity_bit, 1);
+            yield();
+            uint8_t peer_parity_byte;
+            tio.recv_peer(&peer_parity_byte, 1);
+            peer_parity_bit = peer_parity_byte & 1;
+        },
+        [&tio, &CWL, &L, &R, bs_choice](yield_t &yield) {
+            mpc_reconstruct_choice(tio, yield, CWL, bs_choice, R, L);
+        });
+    cfbit = our_parity_bit ^ peer_parity_bit;
+    CW = CWL;
+    NT CWR = CWL;
+    xor_lsb(CWR, cfbit);
+    if (player < 2) {
+        // The timing of each iteration of the inner loop is
+        // comparable to the above, so just use the same
+        // computations.  All of this could be tuned, of course.
+
+        if constexpr (std::is_same_v<NT, DPFnode>) {
+            finalize_nonleaf_layer(max_nthreads, level, curlevel,
+                nextlevel, CWL, CWR);
+        } else {
+            // Recall there are four potentially useful vectors that
+            // can come out of a DPF:
+            // - (single-bit) bitwise unit vector
+            // - additive-shared unit vector
+            // - XOR-shared scaled unit vector
+            // - additive-shared scaled unit vector
+            //
+            // (No single DPF should be used for both of the first
+            // two or both of the last two, though, since they're
+            // correlated; you _can_ use one of the first two and
+            // one of the last two.)
+            //
+            // For each 128-bit leaf, the low bit is the flag bit,
+            // and we're guaranteed that the flag bits (and indeed
+            // the whole 128-bit value) for P0 and P1 are the same
+            // for every leaf except the target, and that the flag
+            // bits definitely differ for the target (and the other
+            // 127 bits are independently random on each side).
+            //
+            // We divide the 128-bit leaf into a low 64-bit word and
+            // a high 64-bit word.  We use the low word for the unit
+            // vector and the high word for the scaled vector; this
+            // choice is not arbitrary: the flag bit in the low word
+            // means that the sum of all the low words (with P1's
+            // low words negated) across both P0 and P1 is
+            // definitely odd, so we can compute that sum's inverse
+            // mod 2^64, and store it now during precomputation.  At
+            // evaluation time for the additive-shared unit vector,
+            // we will output this global inverse times the low word
+            // of each leaf, which will make the sum of all of those
+            // values 1.  (This technique replaces the protocol in
+            // Appendix D of the Duoram paper.)
+            //
+            // For the scaled vector, we just have to compute shares
+            // of what the scaled vector is a sharing _of_, but
+            // that's just XORing or adding all of each party's
+            // local high words; no communication needed.
+
+            value_t low_sum;
+            const size_t WIDTH = LI::W;
+            std::array<value_t,WIDTH> high_sum;
+            std::array<value_t,WIDTH> high_xor;
+            finalize_leaf_layer(max_nthreads, level, curlevel,
+                nextlevel, save_expansion, CWL, CWR, low_sum, high_sum,
+                high_xor);
+
+            if (player == 1) {
+                low_sum = -low_sum;
+                for(size_t j=0; j<WIDTH; ++j) {
+                    high_sum[j] = -high_sum[j];
+                }
+            }
+            for(size_t j=0; j<WIDTH; ++j) {
+                li.scaled_sum[j].ashare = high_sum[j];
+                li.scaled_xor[j].xshare = high_xor[j];
+            }
+            // Exchange low_sum and add them up
+            tio.queue_peer(&low_sum, sizeof(low_sum));
+            yield();
+            value_t peer_low_sum;
+            tio.recv_peer(&peer_low_sum, sizeof(peer_low_sum));
+            low_sum += peer_low_sum;
+            // The low_sum had better be odd
+            assert(low_sum & 1);
+            li.unit_sum_inverse = inverse_value_t(low_sum);
+        }
+    } else if (level == depth-1) {
+        yield();
+    }
+}
+
+
 // Construct a DPF with the given (XOR-shared) target location, and
 // of the given depth, to be used for random-access memory reads and
 // writes.  The DPF is constructed collaboratively by P0 and P1,
@@ -369,362 +826,56 @@ RDPF<WIDTH>::RDPF(MPCTIO &tio, yield_t &yield,
 
     // Construct each intermediate level
     while(level < depth) {
+        LeafNode *leaflevel = NULL;
         if (player < 2) {
             delete[] curlevel;
             curlevel = nextlevel;
+            nextlevel = NULL;
             if (save_expansion && level == depth-1) {
                 expansion.resize(1<<depth);
-                nextlevel = (DPFnode *)expansion.data();
+                leaflevel = expansion.data();
+            } else if (level == depth-1) {
+                leaflevel = new LeafNode[1<<depth];
             } else {
                 nextlevel = new DPFnode[1<<(level+1)];
             }
         }
         // Invariant: curlevel has 2^level elements; nextlevel has
-        // 2^{level+1} elements
+        // 2^{level+1} DPFnode elements if we're not at the last level,
+        // and leaflevel has 2^{level+1} LeafNode elements if we are at
+        // a leaf level (the last level always, and all levels if we are
+        // making an incremental RDPF).
 
         // The bit-shared choice bit is bit (depth-level-1) of the
         // XOR-shared target index
         RegBS bs_choice = target.bit(depth-level-1);
-        size_t curlevel_size = (size_t(1)<<level);
-        DPFnode L = _mm_setzero_si128();
-        DPFnode R = _mm_setzero_si128();
-        // The server doesn't need to do this computation, but it does
-        // need to execute mpc_reconstruct_choice so that it sends
-        // the AndTriples at the appropriate time.
-        if (player < 2) {
-#ifdef RDPF_MTGEN_TIMING_1
-            if (player == 0) {
-                mtgen_timetest_1(level, 0, (1<<23)>>level, curlevel,
-                    nextlevel, aes_ops);
-                size_t niters = 2048;
-                if (level > 8) niters = (1<<20)>>level;
-                for(int t=1;t<=8;++t) {
-                    mtgen_timetest_1(level, t, niters, curlevel,
-                        nextlevel, aes_ops);
-                }
-                mtgen_timetest_1(level, 0, (1<<23)>>level, curlevel,
-                    nextlevel, aes_ops);
-            }
-#endif
-            // Using the timing results gathered above, decide whether
-            // to multithread, and if so, how many threads to use.
-            // tio.cpu_nthreads() is the maximum number we have
-            // available.
-            int max_nthreads = tio.cpu_nthreads();
-            if (max_nthreads == 1 || level < 19) {
-                // No threading
-                size_t laes_ops = 0;
-                for(size_t i=0;i<curlevel_size;++i) {
-                    DPFnode lchild, rchild;
-                    prgboth(lchild, rchild, curlevel[i], laes_ops);
-                    L = (L ^ lchild);
-                    R = (R ^ rchild);
-                    nextlevel[2*i] = lchild;
-                    nextlevel[2*i+1] = rchild;
-                }
-                aes_ops += laes_ops;
-            } else {
-                size_t curlevel_size = size_t(1)<<level;
-                int nthreads =
-                    int(ceil(sqrt(double(curlevel_size/6000))));
-                if (nthreads > max_nthreads) {
-                    nthreads = max_nthreads;
-                }
-                DPFnode tL[nthreads];
-                DPFnode tR[nthreads];
-                size_t taes_ops[nthreads];
-                size_t threadstart = 0;
-                size_t threadchunk = curlevel_size / nthreads;
-                size_t threadextra = curlevel_size % nthreads;
-                boost::asio::thread_pool pool(nthreads);
-                for (int t=0;t<nthreads;++t) {
-                    size_t threadsize = threadchunk + (size_t(t) < threadextra);
-                    size_t threadend = threadstart + threadsize;
-                    boost::asio::post(pool,
-                        [t, &tL, &tR, &taes_ops, threadstart, threadend,
-                        &curlevel, &nextlevel] {
-                            DPFnode L = _mm_setzero_si128();
-                            DPFnode R = _mm_setzero_si128();
-                            size_t aes_ops = 0;
-                            for(size_t i=threadstart;i<threadend;++i) {
-                                DPFnode lchild, rchild;
-                                prgboth(lchild, rchild, curlevel[i], aes_ops);
-                                L = (L ^ lchild);
-                                R = (R ^ rchild);
-                                nextlevel[2*i] = lchild;
-                                nextlevel[2*i+1] = rchild;
-                            }
-                            tL[t] = L;
-                            tR[t] = R;
-                            taes_ops[t] = aes_ops;
-                        });
-                    threadstart = threadend;
-                }
-                pool.join();
-                for (int t=0;t<nthreads;++t) {
-                    L ^= tL[t];
-                    R ^= tR[t];
-                    aes_ops += taes_ops[t];
-                }
-            }
-        }
-        // If we're going left (bs_choice = 0), we want the correction
-        // word to be the XOR of our right side and our peer's right
-        // side; if bs_choice = 1, it should be the XOR or our left side
-        // and our peer's left side.
-
-        // We also have to ensure that the flag bits (the lsb) of the
-        // side that will end up the same be of course the same, but
-        // also that the flag bits (the lsb) of the side that will end
-        // up different _must_ be different.  That is, it's not enough
-        // for the nodes of the child selected by choice to be different
-        // as 128-bit values; they also have to be different in their
-        // lsb.
-
-        // This is where we make a small optimization over Appendix C of
-        // the Duoram paper: instead of keeping separate correction flag
-        // bits for the left and right children, we observe that the low
-        // bit of the overall correction word effectively serves as one
-        // of those bits, so we just need to store one extra bit per
-        // level, not two.  (We arbitrarily choose the one for the right
-        // child.)
-
-        // Note that the XOR of our left and right child before and
-        // after applying the correction word won't change, since the
-        // correction word is applied to either both children or
-        // neither, depending on the value of the parent's flag. So in
-        // particular, the XOR of the flag bits won't change, and if our
-        // children's flag's XOR equals our peer's children's flag's
-        // XOR, then we won't have different flag bits even for the
-        // children that have different 128-bit values.
-
-        // So we compute our_parity = lsb(L^R)^player, and we XOR that
-        // into the R value in the correction word computation.  At the
-        // same time, we exchange these parity values to compute the
-        // combined parity, which we store in the DPF.  Then when the
-        // DPF is evaluated, if the parent's flag is set, not only apply
-        // the correction work to both children, but also apply the
-        // (combined) parity bit to just the right child.  Then for
-        // unequal nodes (where the flag bit is different), exactly one
-        // of the four children (two for P0 and two for P1) will have
-        // the parity bit applied, which will set the XOR of the lsb of
-        // those four nodes to just L0^R0^L1^R1^our_parity^peer_parity
-        // = 1 because everything cancels out except player (for which
-        // one player is 0 and the other is 1).
-
-        bool our_parity_bit = get_lsb(L ^ R) ^ !!player;
-        DPFnode our_parity = lsb128_mask[our_parity_bit];
-
-        DPFnode CW;
-        bool peer_parity_bit;
-        // Exchange the parities and do mpc_reconstruct_choice at the
-        // same time (bundled into the same rounds)
-        run_coroutines(yield,
-            [this, &tio, &our_parity_bit, &peer_parity_bit](yield_t &yield) {
-                tio.queue_peer(&our_parity_bit, 1);
-                yield();
-                uint8_t peer_parity_byte;
-                tio.recv_peer(&peer_parity_byte, 1);
-                peer_parity_bit = peer_parity_byte & 1;
-            },
-            [this, &tio, &CW, &L, &R, &bs_choice, &our_parity](yield_t &yield) {
-                mpc_reconstruct_choice(tio, yield, CW, bs_choice,
-                    (R ^ our_parity), L);
-            });
-        bool parity_bit = our_parity_bit ^ peer_parity_bit;
-        cfbits |= (value_t(parity_bit)<<level);
-        DPFnode CWR = CW ^ lsb128_mask[parity_bit];
-        if (player < 2) {
-            // The timing of each iteration of the inner loop is
-            // comparable to the above, so just use the same
-            // computations.  All of this could be tuned, of course.
-
-            if (level < depth-1) {
-                // Using the timing results gathered above, decide whether
-                // to multithread, and if so, how many threads to use.
-                // tio.cpu_nthreads() is the maximum number we have
-                // available.
-                int max_nthreads = tio.cpu_nthreads();
-                if (max_nthreads == 1 || level < 19) {
-                    // No threading
-                    for(size_t i=0;i<curlevel_size;++i) {
-                        bool flag = get_lsb(curlevel[i]);
-                        nextlevel[2*i] = xor_if(nextlevel[2*i], CW, flag);
-                        nextlevel[2*i+1] = xor_if(nextlevel[2*i+1], CWR, flag);
-                    }
-                } else {
-                    int nthreads =
-                        int(ceil(sqrt(double(curlevel_size/6000))));
-                    if (nthreads > max_nthreads) {
-                        nthreads = max_nthreads;
-                    }
-                    size_t threadstart = 0;
-                    size_t threadchunk = curlevel_size / nthreads;
-                    size_t threadextra = curlevel_size % nthreads;
-                    boost::asio::thread_pool pool(nthreads);
-                    for (int t=0;t<nthreads;++t) {
-                        size_t threadsize = threadchunk + (size_t(t) < threadextra);
-                        size_t threadend = threadstart + threadsize;
-                        boost::asio::post(pool, [CW, CWR, threadstart, threadend,
-                            &curlevel, &nextlevel] {
-                                for(size_t i=threadstart;i<threadend;++i) {
-                                    bool flag = get_lsb(curlevel[i]);
-                                    nextlevel[2*i] = xor_if(nextlevel[2*i], CW, flag);
-                                    nextlevel[2*i+1] = xor_if(nextlevel[2*i+1], CWR, flag);
-                                }
-                        });
-                        threadstart = threadend;
-                    }
-                    pool.join();
-                }
-            } else {
-                // Recall there are four potentially useful vectors that
-                // can come out of a DPF:
-                // - (single-bit) bitwise unit vector
-                // - additive-shared unit vector
-                // - XOR-shared scaled unit vector
-                // - additive-shared scaled unit vector
-                //
-                // (No single DPF should be used for both of the first
-                // two or both of the last two, though, since they're
-                // correlated; you _can_ use one of the first two and
-                // one of the last two.)
-                //
-                // For each 128-bit leaf, the low bit is the flag bit,
-                // and we're guaranteed that the flag bits (and indeed
-                // the whole 128-bit value) for P0 and P1 are the same
-                // for every leaf except the target, and that the flag
-                // bits definitely differ for the target (and the other
-                // 127 bits are independently random on each side).
-                //
-                // We divide the 128-bit leaf into a low 64-bit word and
-                // a high 64-bit word.  We use the low word for the unit
-                // vector and the high word for the scaled vector; this
-                // choice is not arbitrary: the flag bit in the low word
-                // means that the sum of all the low words (with P1's
-                // low words negated) across both P0 and P1 is
-                // definitely odd, so we can compute that sum's inverse
-                // mod 2^64, and store it now during precomputation.  At
-                // evaluation time for the additive-shared unit vector,
-                // we will output this global inverse times the low word
-                // of each leaf, which will make the sum of all of those
-                // values 1.  (This technique replaces the protocol in
-                // Appendix D of the Duoram paper.)
-                //
-                // For the scaled vector, we just have to compute shares
-                // of what the scaled vector is a sharing _of_, but
-                // that's just XORing or adding all of each party's
-                // local high words; no communication needed.
-
-                value_t low_sum = 0;
-                value_t high_sum = 0;
-                value_t high_xor = 0;
-                // Using the timing results gathered above, decide whether
-                // to multithread, and if so, how many threads to use.
-                // tio.cpu_nthreads() is the maximum number we have
-                // available.
-                int max_nthreads = tio.cpu_nthreads();
-                if (max_nthreads == 1 || level < 19) {
-                    // No threading
-                    for(size_t i=0;i<curlevel_size;++i) {
-                        bool flag = get_lsb(curlevel[i]);
-                        DPFnode leftchild = xor_if(nextlevel[2*i], CW, flag);
-                        DPFnode rightchild = xor_if(nextlevel[2*i+1], CWR, flag);
-                        if (save_expansion) {
-                            nextlevel[2*i] = leftchild;
-                            nextlevel[2*i+1] = rightchild;
-                        }
-                        value_t leftlow = value_t(_mm_cvtsi128_si64x(leftchild));
-                        value_t rightlow = value_t(_mm_cvtsi128_si64x(rightchild));
-                        value_t lefthigh =
-                            value_t(_mm_cvtsi128_si64x(_mm_srli_si128(leftchild,8)));
-                        value_t righthigh =
-                            value_t(_mm_cvtsi128_si64x(_mm_srli_si128(rightchild,8)));
-                        low_sum += (leftlow + rightlow);
-                        high_sum += (lefthigh + righthigh);
-                        high_xor ^= (lefthigh ^ righthigh);
-                    }
-                } else {
-                    int nthreads =
-                        int(ceil(sqrt(double(curlevel_size/6000))));
-                    if (nthreads > max_nthreads) {
-                        nthreads = max_nthreads;
-                    }
-                    value_t tlow_sum[nthreads];
-                    value_t thigh_sum[nthreads];
-                    value_t thigh_xor[nthreads];
-                    size_t threadstart = 0;
-                    size_t threadchunk = curlevel_size / nthreads;
-                    size_t threadextra = curlevel_size % nthreads;
-                    boost::asio::thread_pool pool(nthreads);
-                    for (int t=0;t<nthreads;++t) {
-                        size_t threadsize = threadchunk + (size_t(t) < threadextra);
-                        size_t threadend = threadstart + threadsize;
-                        boost::asio::post(pool,
-                            [t, &tlow_sum, &thigh_sum, &thigh_xor, threadstart, threadend,
-                            &curlevel, &nextlevel, CW, CWR, save_expansion] {
-                                value_t low_sum = 0;
-                                value_t high_sum = 0;
-                                value_t high_xor = 0;
-                                for(size_t i=threadstart;i<threadend;++i) {
-                                    bool flag = get_lsb(curlevel[i]);
-                                    DPFnode leftchild = xor_if(nextlevel[2*i], CW, flag);
-                                    DPFnode rightchild = xor_if(nextlevel[2*i+1], CWR, flag);
-                                    if (save_expansion) {
-                                        nextlevel[2*i] = leftchild;
-                                        nextlevel[2*i+1] = rightchild;
-                                    }
-                                    value_t leftlow = value_t(_mm_cvtsi128_si64x(leftchild));
-                                    value_t rightlow = value_t(_mm_cvtsi128_si64x(rightchild));
-                                    value_t lefthigh =
-                                        value_t(_mm_cvtsi128_si64x(_mm_srli_si128(leftchild,8)));
-                                    value_t righthigh =
-                                        value_t(_mm_cvtsi128_si64x(_mm_srli_si128(rightchild,8)));
-                                    low_sum += (leftlow + rightlow);
-                                    high_sum += (lefthigh + righthigh);
-                                    high_xor ^= (lefthigh ^ righthigh);
-                                }
-                                tlow_sum[t] = low_sum;
-                                thigh_sum[t] = high_sum;
-                                thigh_xor[t] = high_xor;
-                            });
-                        threadstart = threadend;
-                    }
-                    pool.join();
-                    for (int t=0;t<nthreads;++t) {
-                        low_sum += tlow_sum[t];
-                        high_sum += thigh_sum[t];
-                        high_xor ^= thigh_xor[t];
-                    }
-                }
-                if (player == 1) {
-                    low_sum = -low_sum;
-                    high_sum = -high_sum;
-                }
-                li[0].scaled_sum[0].ashare = high_sum;
-                li[0].scaled_xor[0].xshare = high_xor;
-                // Exchange low_sum and add them up
-                tio.queue_peer(&low_sum, sizeof(low_sum));
-                yield();
-                value_t peer_low_sum;
-                tio.recv_peer(&peer_low_sum, sizeof(peer_low_sum));
-                low_sum += peer_low_sum;
-                // The low_sum had better be odd
-                assert(low_sum & 1);
-                li[0].unit_sum_inverse = inverse_value_t(low_sum);
+        bool cfbit;
+
+        if (level < depth-1) {
+            DPFnode CW;
+            create_level(tio, yield, curlevel, nextlevel, player, level,
+                depth, bs_choice, CW, cfbit, save_expansion, li[0],
+                aes_ops);
+            cfbits |= (value_t(cfbit)<<level);
+            if (player < 2) {
+                cw.push_back(CW);
             }
-            cw.push_back(CW);
-        } else if (level == depth-1) {
-            yield();
+        } else {
+            LeafNode CW;
+            create_level(tio, yield, curlevel, leaflevel, player, level,
+                depth, bs_choice, CW, cfbit, save_expansion, li[0],
+                aes_ops);
+            li[0].leaf_cw = CW;
         }
 
+        if (!save_expansion) {
+            delete[] leaflevel;
+        }
         ++level;
     }
 
     delete[] curlevel;
-    if (!save_expansion || player == 2) {
-        delete[] nextlevel;
-    }
+    delete[] nextlevel;
 }
 
 // Get the leaf node for the given input

+ 17 - 0
types.hpp

@@ -8,6 +8,8 @@
 #include <x86intrin.h>  // SSE and AVX intrinsics
 #include <bsd/stdlib.h> // arc4random_buf
 
+#include "bitutils.hpp"
+
 // The number of bits in an MPC secret-shared memory word
 
 #ifndef VALUE_BITS
@@ -648,6 +650,14 @@ inline std::array<S,N> &operator^=(std::array<S,N> &A, const std::array<S,N> &B)
     return A;
 }
 
+// XOR the bit B into the low bit of A
+template <typename S, size_t N>
+inline std::array<S,N> &xor_lsb(std::array<S,N> &A, bit_t B)
+{
+    A[0] ^= lsb128_mask[B];
+    return A;
+}
+
 template <typename S, size_t N>
 inline std::tuple<std::array<value_t,N>,std::array<value_t,N>,std::array<value_t,N>>
     combine(
@@ -713,6 +723,13 @@ struct AndTripleName { static constexpr const char *name = "a"; };
 
 using DPFnode = __m128i;
 
+// XOR the bit B into the low bit of A
+inline DPFnode &xor_lsb(DPFnode &A, bit_t B)
+{
+    A ^= lsb128_mask[B];
+    return A;
+}
+
 // A Select triple for type V (V is DPFnode, value_t, or bit_t) is a
 // triple of (X0,Y0,Z0) where X0 is a bit and Y0 and Z0 are Vs held by
 // P0 (and correspondingly (X1,Y1,Z1) held by P1), with all values