1 年之前 · ff2653d6ea
--- a/Makefile
+++ b/Makefile
@@ -31,26 +31,27 @@ depend:
 
															 # DO NOT DELETE THIS LINE -- make depend depends on it.
														
 
															-prac.o: mpcio.hpp types.hpp corotypes.hpp mpcio.tcc preproc.hpp options.hpp
														
 
															-prac.o: online.hpp
														
 
															-mpcio.o: mpcio.hpp types.hpp corotypes.hpp mpcio.tcc rdpf.hpp coroutine.hpp
														
 
															-mpcio.o: bitutils.hpp dpf.hpp prg.hpp aes.hpp rdpf.tcc mpcops.hpp cdpf.hpp
														
 
															-mpcio.o: cdpf.tcc
														
 
															-preproc.o: types.hpp coroutine.hpp corotypes.hpp mpcio.hpp mpcio.tcc
														
 
															-preproc.o: preproc.hpp options.hpp rdpf.hpp bitutils.hpp dpf.hpp prg.hpp
														
 
															-preproc.o: aes.hpp rdpf.tcc mpcops.hpp cdpf.hpp cdpf.tcc
														
 
															-online.o: online.hpp mpcio.hpp types.hpp corotypes.hpp mpcio.tcc options.hpp
														
 
															-online.o: mpcops.hpp coroutine.hpp rdpf.hpp bitutils.hpp dpf.hpp prg.hpp
														
 
															-online.o: aes.hpp rdpf.tcc duoram.hpp duoram.tcc cdpf.hpp cdpf.tcc cell.hpp
														
 
															-mpcops.o: mpcops.hpp types.hpp mpcio.hpp corotypes.hpp mpcio.tcc
														
 
															-mpcops.o: coroutine.hpp bitutils.hpp
														
 
															-rdpf.o: rdpf.hpp mpcio.hpp types.hpp corotypes.hpp mpcio.tcc coroutine.hpp
														
 
															-rdpf.o: bitutils.hpp dpf.hpp prg.hpp aes.hpp rdpf.tcc mpcops.hpp
														
 
															+prac.o: mpcio.hpp types.hpp bitutils.hpp corotypes.hpp mpcio.tcc preproc.hpp
														
 
															+prac.o: options.hpp online.hpp
														
 
															+mpcio.o: mpcio.hpp types.hpp bitutils.hpp corotypes.hpp mpcio.tcc rdpf.hpp
														
 
															+mpcio.o: coroutine.hpp dpf.hpp prg.hpp aes.hpp rdpf.tcc mpcops.hpp mpcops.tcc
														
 
															+mpcio.o: cdpf.hpp cdpf.tcc
														
 
															+preproc.o: types.hpp bitutils.hpp coroutine.hpp corotypes.hpp mpcio.hpp
														
 
															+preproc.o: mpcio.tcc preproc.hpp options.hpp rdpf.hpp dpf.hpp prg.hpp aes.hpp
														
 
															+preproc.o: rdpf.tcc mpcops.hpp mpcops.tcc cdpf.hpp cdpf.tcc
														
 
															+online.o: online.hpp mpcio.hpp types.hpp bitutils.hpp corotypes.hpp mpcio.tcc
														
 
															+online.o: options.hpp mpcops.hpp coroutine.hpp mpcops.tcc rdpf.hpp dpf.hpp
														
 
															+online.o: prg.hpp aes.hpp rdpf.tcc duoram.hpp duoram.tcc cdpf.hpp cdpf.tcc
														
 
															+online.o: cell.hpp
														
 
															+mpcops.o: mpcops.hpp types.hpp bitutils.hpp mpcio.hpp corotypes.hpp mpcio.tcc
														
 
															+mpcops.o: coroutine.hpp mpcops.tcc
														
 
															+rdpf.o: rdpf.hpp mpcio.hpp types.hpp bitutils.hpp corotypes.hpp mpcio.tcc
														
 
															+rdpf.o: coroutine.hpp dpf.hpp prg.hpp aes.hpp rdpf.tcc mpcops.hpp mpcops.tcc
														
 
															 cdpf.o: bitutils.hpp cdpf.hpp mpcio.hpp types.hpp corotypes.hpp mpcio.tcc
														
 
															 cdpf.o: coroutine.hpp dpf.hpp prg.hpp aes.hpp cdpf.tcc
														
 
															-duoram.o: duoram.hpp types.hpp mpcio.hpp corotypes.hpp mpcio.tcc
														
 
															-duoram.o: coroutine.hpp duoram.tcc mpcops.hpp cdpf.hpp dpf.hpp prg.hpp
														
 
															-duoram.o: bitutils.hpp aes.hpp cdpf.tcc rdpf.hpp rdpf.tcc
														
 
															-cell.o: types.hpp duoram.hpp mpcio.hpp corotypes.hpp mpcio.tcc coroutine.hpp
														
 
															-cell.o: duoram.tcc mpcops.hpp cdpf.hpp dpf.hpp prg.hpp bitutils.hpp aes.hpp
														
 
															-cell.o: cdpf.tcc rdpf.hpp rdpf.tcc cell.hpp options.hpp
														
 
															+duoram.o: duoram.hpp types.hpp bitutils.hpp mpcio.hpp corotypes.hpp mpcio.tcc
														
 
															+duoram.o: coroutine.hpp duoram.tcc mpcops.hpp mpcops.tcc cdpf.hpp dpf.hpp
														
 
															+duoram.o: prg.hpp aes.hpp cdpf.tcc rdpf.hpp rdpf.tcc
														
 
															+cell.o: types.hpp bitutils.hpp duoram.hpp mpcio.hpp corotypes.hpp mpcio.tcc
														
 
															+cell.o: coroutine.hpp duoram.tcc mpcops.hpp mpcops.tcc cdpf.hpp dpf.hpp
														
 
															+cell.o: prg.hpp aes.hpp cdpf.tcc rdpf.hpp rdpf.tcc cell.hpp options.hpp
														
--- a/bitutils.hpp
+++ b/bitutils.hpp
@@ -5,6 +5,7 @@
 
															 #ifndef __BITUTILS_HPP__
														
 
															 #define __BITUTILS_HPP__
														
 
															+#include <array>
														
 
															 #include <cstdint>
														
 
															 #include <x86intrin.h>  // SSE and AVX intrinsics
														
@@ -42,12 +43,30 @@ inline __m128i xor_if(const __m128i & block1, const __m128i & block2, bool flag)
 
															     return _mm_xor_si128(block1, _mm_and_si128(block2, if128_mask[flag ? 1 : 0]));
														
 
															 }
														
 
															+template <size_t LWIDTH>
														
 
															+inline std::array<__m128i,LWIDTH> xor_if(
														
 
															+    const std::array<__m128i,LWIDTH> & block1,
														
 
															+    const std::array<__m128i,LWIDTH> & block2, bool flag)
														
 
															+{
														
 
															+    std::array<__m128i,LWIDTH> res;
														
 
															+    for (size_t j=0;j<LWIDTH;++j) {
														
 
															+        res[j] = xor_if(block1[j], block2[j], flag);
														
 
															+    }
														
 
															+    return res;
														
 
															+}
														
 
															+
														
 
															 inline uint8_t get_lsb(const __m128i & block, uint8_t bits = 0b01)
														
 
															 {
														
 
															     __m128i vcmp = _mm_xor_si128(_mm_and_si128(block, lsb128_mask[bits]), lsb128_mask[bits]);
														
 
															     return static_cast<uint8_t>(_mm_testz_si128(vcmp, vcmp));
														
 
															 }
														
 
															+template <size_t LWIDTH>
														
 
															+inline uint8_t get_lsb(const std::array<__m128i,LWIDTH> & block)
														
 
															+{
														
 
															+    return get_lsb(block[0]);
														
 
															+}
														
 
															+
														
 
															 inline __m128i clear_lsb(const __m128i & block, uint8_t bits = 0b01)
														
 
															 {
														
 
															     return _mm_and_si128(block, lsb128_mask_inv[bits]);
														
--- a/mpcops.hpp
+++ b/mpcops.hpp
@@ -110,6 +110,17 @@ void mpc_xs_to_as(MPCTIO &tio, yield_t &yield,
 
															 void mpc_reconstruct_choice(MPCTIO &tio, yield_t &yield,
														
 
															     DPFnode &z, RegBS f, DPFnode x, DPFnode y);
														
 
															+// As above, but for arrays of DPFnode
														
 
															+//
														
 
															+// Cost:
														
 
															+// 6*LWIDTH 64-bit words sent in 2 messages
														
 
															+// consumes LWIDTH AndTriples
														
 
															+template <size_t LWIDTH>
														
 
															+void mpc_reconstruct_choice(MPCTIO &tio, yield_t &yield,
														
 
															+    std::array<DPFnode,LWIDTH> &z, RegBS f,
														
 
															+    const std::array<DPFnode,LWIDTH> &x,
														
 
															+    const std::array<DPFnode,LWIDTH> &y);
														
 
															+
														
 
															 // P0 and P1 hold bit shares of x and y.  Set z to bit shares of x & y.
														
 
															 //
														
 
															 // Cost:
														
@@ -126,4 +137,6 @@ void mpc_and(MPCTIO &tio, yield_t &yield,
 
															 void mpc_or(MPCTIO &tio, yield_t &yield,
														
 
															     RegBS &z, RegBS x, RegBS y);
														
 
															+#include "mpcops.tcc"
														
 
															+
														
 
															 #endif
														
--- a/mpcops.tcc
+++ b/mpcops.tcc
@@ -0,0 +1,16 @@
 
															+template <size_t LWIDTH>
														
 
															+void mpc_reconstruct_choice(MPCTIO &tio, yield_t &yield,
														
 
															+    std::array<DPFnode,LWIDTH> &z, RegBS f,
														
 
															+    const std::array<DPFnode,LWIDTH> &x,
														
 
															+    const std::array<DPFnode,LWIDTH> &y)
														
 
															+{
														
 
															+    std::vector<coro_t> coroutines;
														
 
															+    for (size_t j=0;j<LWIDTH;++j) {
														
 
															+        coroutines.emplace_back(
														
 
															+            [&tio, &z, f, &x, &y, j](yield_t &yield) {
														
 
															+                mpc_reconstruct_choice(tio, yield, z[j],
														
 
															+                    f, x[j], y[j]);
														
 
															+            });
														
 
															+    }
														
 
															+    run_coroutines(yield, coroutines);
														
 
															+}
														
--- a/prg.hpp
+++ b/prg.hpp
@@ -50,7 +50,7 @@ static inline void prgboth(__m128i &left, __m128i &right, __m128i seed,
 
															 // Compute one of the leaf children of node seed; whichchild=0 for
														
 
															 // the left child, 1 for the right child
														
 
															 template <size_t LWIDTH>
														
 
															-static inline void prgleaf(std::array<__m128i,LWIDTH> &out,
														
 
															+static inline void prg(std::array<__m128i,LWIDTH> &out,
														
 
															     __m128i seed, bool whichchild, size_t &aes_ops)
														
 
															 {
														
 
															     __m128i in = set_lsb(seed, whichchild);
														
@@ -73,7 +73,7 @@ static inline void prgleaf(std::array<__m128i,LWIDTH> &out,
 
															 // Compute both of the leaf children of node seed
														
 
															 template <size_t LWIDTH>
														
 
															-static inline void prgleafboth(std::array<__m128i,LWIDTH> &left,
														
 
															+static inline void prgboth(std::array<__m128i,LWIDTH> &left,
														
 
															     std::array<__m128i,LWIDTH> &right, __m128i seed, size_t &aes_ops)
														
 
															 {
														
 
															     __m128i inl = set_lsb(seed, 0);
														
--- a/rdpf.hpp
+++ b/rdpf.hpp
@@ -45,6 +45,7 @@ struct RDPF : public DPF {
 
															     // one leaf level (at the bottom), but incremental RDPFs have a leaf
														
 
															     // level for each level of the DPF.
														
 
															     struct LeafInfo {
														
 
															+        static const nbits_t W = WIDTH;
														
 
															         // The correction word for this leaf level
														
 
															         LeafNode leaf_cw;
														
 
															         // The amount we have to scale the low words of the leaf values by
														
@@ -153,7 +154,7 @@ struct RDPF : public DPF {
 
															         value_t highword =
														
 
															             value_t(_mm_cvtsi128_si64x(_mm_srli_si128(leaf[0],8)));
														
 
															         x[j++].xshare = highword;
														
 
															-        for (nbits_t i=1;i<WIDTH;++i) {
														
 
															+        for (nbits_t i=1;i<LWIDTH;++i) {
														
 
															             value_t lowword =
														
 
															                 value_t(_mm_cvtsi128_si64x(leaf[i]));
														
 
															             value_t highword =
														
--- a/rdpf.tcc
+++ b/rdpf.tcc
@@ -191,7 +191,7 @@ inline typename RDPF<WIDTH>::LeafNode RDPF<WIDTH>::descend_to_leaf(
 
															 {
														
 
															     typename RDPF<WIDTH>::LeafNode prgout;
														
 
															     bool flag = get_lsb(parent);
														
 
															-    prgleaf(prgout, parent, whichchild, aes_ops);
														
 
															+    prg(prgout, parent, whichchild, aes_ops);
														
 
															     if (flag) {
														
 
															         LeafNode CW = li[0].leaf_cw;
														
 
															         LeafNode CWR = CW;
														
@@ -335,6 +335,463 @@ T& operator>>(T &is, RDPFPair<WIDTH> &rdpfpair)
 
															     return is;
														
 
															 }
														
 
															+// Set a DPFnode to zero
														
 
															+static inline void zero(DPFnode &z)
														
 
															+{
														
 
															+    z = _mm_setzero_si128();
														
 
															+}
														
 
															+
														
 
															+// Set a LeafNode to zero
														
 
															+template <size_t LWIDTH>
														
 
															+static inline void zero(std::array<DPFnode,LWIDTH> &z)
														
 
															+{
														
 
															+    for (size_t j=0;j<LWIDTH;++j) {
														
 
															+        zero(z[j]);
														
 
															+    }
														
 
															+}
														
 
															+
														
 
															+// Set an array of value_t to zero
														
 
															+template <size_t WIDTH>
														
 
															+static inline void zero(std::array<value_t,WIDTH> &z)
														
 
															+{
														
 
															+    for (size_t j=0;j<WIDTH;++j) {
														
 
															+        z[j] = 0;
														
 
															+    }
														
 
															+}
														
 
															+
														
 
															+
														
 
															+// Expand a level of the RDPF into the next level without threads. This
														
 
															+// just computes the PRGs without computing or applying the correction
														
 
															+// words.  L and R will be set to the XORs of the left children and the
														
 
															+// XORs of the right children respectively. NT will be LeafNode if we
														
 
															+// are expanding into a leaf level, DPFnode if not.
														
 
															+template <typename NT>
														
 
															+static inline void expand_level_nothreads(size_t start, size_t end,
														
 
															+    const DPFnode *curlevel, NT *nextlevel, NT &L, NT &R,
														
 
															+    size_t &aes_ops)
														
 
															+{
														
 
															+    // Only touch registers in the inner loop if possible
														
 
															+    NT lL, lR;
														
 
															+    zero(lL);
														
 
															+    zero(lR);
														
 
															+    size_t laes_ops = 0;
														
 
															+    for(size_t i=start;i<end;++i) {
														
 
															+        NT lchild, rchild;
														
 
															+        prgboth(lchild, rchild, curlevel[i], laes_ops);
														
 
															+        lL ^= lchild;
														
 
															+        lR ^= rchild;
														
 
															+        nextlevel[2*i] = lchild;
														
 
															+        nextlevel[2*i+1] = rchild;
														
 
															+    }
														
 
															+    L = lL;
														
 
															+    R = lR;
														
 
															+    aes_ops += laes_ops;
														
 
															+}
														
 
															+
														
 
															+// As above, but possibly use threads, based on the RDPF_MTGEN_TIMING_1
														
 
															+// timing benchmarks
														
 
															+template <typename NT>
														
 
															+static inline void expand_level(int max_nthreads, nbits_t level,
														
 
															+    const DPFnode *curlevel, NT *nextlevel, NT &L, NT &R,
														
 
															+    size_t &aes_ops)
														
 
															+{
														
 
															+    size_t curlevel_size = (size_t(1)<<level);
														
 
															+    if (max_nthreads == 1 || level < 19) {
														
 
															+        // No threading
														
 
															+        expand_level_nothreads(0, curlevel_size,
														
 
															+            curlevel, nextlevel, L, R, aes_ops);
														
 
															+    } else {
														
 
															+        int nthreads =
														
 
															+            int(ceil(sqrt(double(curlevel_size/6000))));
														
 
															+        if (nthreads > max_nthreads) {
														
 
															+            nthreads = max_nthreads;
														
 
															+        }
														
 
															+        NT tL[nthreads];
														
 
															+        NT tR[nthreads];
														
 
															+        size_t taes_ops[nthreads];
														
 
															+        size_t threadstart = 0;
														
 
															+        size_t threadchunk = curlevel_size / nthreads;
														
 
															+        size_t threadextra = curlevel_size % nthreads;
														
 
															+        boost::asio::thread_pool pool(nthreads);
														
 
															+        for (int t=0;t<nthreads;++t) {
														
 
															+            size_t threadsize = threadchunk + (size_t(t) < threadextra);
														
 
															+            size_t threadend = threadstart + threadsize;
														
 
															+            taes_ops[t] = 0;
														
 
															+            boost::asio::post(pool,
														
 
															+                [t, &tL, &tR, &taes_ops, threadstart, threadend,
														
 
															+                &curlevel, &nextlevel] {
														
 
															+                    expand_level_nothreads(threadstart, threadend,
														
 
															+                        curlevel, nextlevel, tL[t], tR[t], taes_ops[t]);
														
 
															+                });
														
 
															+            threadstart = threadend;
														
 
															+        }
														
 
															+        pool.join();
														
 
															+        // Again work on registers as much as possible
														
 
															+        NT lL, lR;
														
 
															+        zero(lL);
														
 
															+        zero(lR);
														
 
															+        size_t laes_ops = 0;
														
 
															+        for (int t=0;t<nthreads;++t) {
														
 
															+            lL ^= tL[t];
														
 
															+            lR ^= tR[t];
														
 
															+            laes_ops += taes_ops[t];
														
 
															+        }
														
 
															+        L = lL;
														
 
															+        R = lR;
														
 
															+        aes_ops += laes_ops;
														
 
															+    }
														
 
															+}
														
 
															+
														
 
															+// Apply the correction words to an expanded non-leaf level (nextlevel),
														
 
															+// based on the flag bits in curlevel. This version does not use
														
 
															+// threads.
														
 
															+static inline void finalize_nonleaf_layer_nothreads(size_t start,
														
 
															+    size_t end, const DPFnode *curlevel, DPFnode *nextlevel,
														
 
															+    DPFnode CWL, DPFnode CWR)
														
 
															+{
														
 
															+    for(size_t i=start;i<end;++i) {
														
 
															+        bool flag = get_lsb(curlevel[i]);
														
 
															+        nextlevel[2*i] = xor_if(nextlevel[2*i], CWL, flag);
														
 
															+        nextlevel[2*i+1] = xor_if(nextlevel[2*i+1], CWR, flag);
														
 
															+    }
														
 
															+}
														
 
															+
														
 
															+// As above, but possibly use threads, based on the RDPF_MTGEN_TIMING_1
														
 
															+// timing benchmarks.  The timing of each iteration of the inner loop is
														
 
															+// comparable to the above, so just use the same computations.  All of
														
 
															+// this could be tuned, of course.
														
 
															+static inline void finalize_nonleaf_layer(int max_nthreads, nbits_t level,
														
 
															+    const DPFnode *curlevel, DPFnode *nextlevel, DPFnode CWL,
														
 
															+    DPFnode CWR)
														
 
															+{
														
 
															+    size_t curlevel_size = (size_t(1)<<level);
														
 
															+    if (max_nthreads == 1 || level < 19) {
														
 
															+        // No threading
														
 
															+        finalize_nonleaf_layer_nothreads(0, curlevel_size,
														
 
															+            curlevel, nextlevel, CWL, CWR);
														
 
															+    } else {
														
 
															+        int nthreads =
														
 
															+            int(ceil(sqrt(double(curlevel_size/6000))));
														
 
															+        if (nthreads > max_nthreads) {
														
 
															+            nthreads = max_nthreads;
														
 
															+        }
														
 
															+        size_t threadstart = 0;
														
 
															+        size_t threadchunk = curlevel_size / nthreads;
														
 
															+        size_t threadextra = curlevel_size % nthreads;
														
 
															+        boost::asio::thread_pool pool(nthreads);
														
 
															+        for (int t=0;t<nthreads;++t) {
														
 
															+            size_t threadsize = threadchunk + (size_t(t) < threadextra);
														
 
															+            size_t threadend = threadstart + threadsize;
														
 
															+            boost::asio::post(pool,
														
 
															+                [threadstart, threadend, CWL, CWR,
														
 
															+                &curlevel, &nextlevel] {
														
 
															+                    finalize_nonleaf_layer_nothreads(threadstart, threadend,
														
 
															+                        curlevel, nextlevel, CWL, CWR);
														
 
															+                });
														
 
															+            threadstart = threadend;
														
 
															+        }
														
 
															+        pool.join();
														
 
															+    }
														
 
															+}
														
 
															+
														
 
															+// Finalize a leaf layer. This applies the correction words, and
														
 
															+// computes the low and high sums and XORs.  This version does not use
														
 
															+// threads.  You can pass save_expansion = false here if you don't need
														
 
															+// to save the expansion.  LN is a LeafNode.
														
 
															+template <size_t WIDTH, typename LN>
														
 
															+static inline void finalize_leaf_layer_nothreads(size_t start,
														
 
															+    size_t end, const DPFnode *curlevel, LN *nextlevel,
														
 
															+    bool save_expansion, LN CWL, LN CWR, value_t &low_sum,
														
 
															+    std::array<value_t,WIDTH> &high_sum,
														
 
															+    std::array<value_t,WIDTH> &high_xor)
														
 
															+{
														
 
															+    value_t llow_sum = 0;
														
 
															+    std::array<value_t,WIDTH> lhigh_sum;
														
 
															+    std::array<value_t,WIDTH> lhigh_xor;
														
 
															+    zero(lhigh_sum);
														
 
															+    zero(lhigh_xor);
														
 
															+    for(size_t i=start;i<end;++i) {
														
 
															+        bool flag = get_lsb(curlevel[i]);
														
 
															+        LN leftchild = xor_if(nextlevel[2*i], CWL, flag);
														
 
															+        LN rightchild = xor_if(nextlevel[2*i+1], CWR, flag);
														
 
															+        if (save_expansion) {
														
 
															+            nextlevel[2*i] = leftchild;
														
 
															+            nextlevel[2*i+1] = rightchild;
														
 
															+        }
														
 
															+        value_t leftlow = value_t(_mm_cvtsi128_si64x(leftchild[0]));
														
 
															+        value_t rightlow = value_t(_mm_cvtsi128_si64x(rightchild[0]));
														
 
															+        value_t lefthigh =
														
 
															+            value_t(_mm_cvtsi128_si64x(_mm_srli_si128(leftchild[0],8)));
														
 
															+        value_t righthigh =
														
 
															+            value_t(_mm_cvtsi128_si64x(_mm_srli_si128(rightchild[0],8)));
														
 
															+        llow_sum += (leftlow + rightlow);
														
 
															+        lhigh_sum[0] += (lefthigh + righthigh);
														
 
															+        lhigh_xor[0] ^= (lefthigh ^ righthigh);
														
 
															+        size_t w = 0;
														
 
															+        for (size_t j=1; j<WIDTH; j+=2) {
														
 
															+            ++w;
														
 
															+            value_t leftlow = value_t(_mm_cvtsi128_si64x(leftchild[w]));
														
 
															+            value_t rightlow = value_t(_mm_cvtsi128_si64x(rightchild[w]));
														
 
															+            value_t lefthigh =
														
 
															+                value_t(_mm_cvtsi128_si64x(_mm_srli_si128(leftchild[w],8)));
														
 
															+            value_t righthigh =
														
 
															+                value_t(_mm_cvtsi128_si64x(_mm_srli_si128(rightchild[w],8)));
														
 
															+            lhigh_sum[j] += (leftlow + rightlow);
														
 
															+            lhigh_xor[j] ^= (leftlow ^ rightlow);
														
 
															+            if (j+1 < WIDTH) {
														
 
															+                lhigh_sum[j+1] += (lefthigh + righthigh);
														
 
															+                lhigh_xor[j+1] ^= (lefthigh ^ righthigh);
														
 
															+            }
														
 
															+        }
														
 
															+    }
														
 
															+    low_sum = llow_sum;
														
 
															+    high_sum = lhigh_sum;
														
 
															+    high_xor = lhigh_xor;
														
 
															+}
														
 
															+
														
 
															+// As above, but possibly use threads, based on the RDPF_MTGEN_TIMING_1
														
 
															+// timing benchmarks.  The timing of each iteration of the inner loop is
														
 
															+// comparable to the above, so just use the same computations.  All of
														
 
															+// this could be tuned, of course.
														
 
															+template <size_t WIDTH, typename LN>
														
 
															+static inline void finalize_leaf_layer(int max_nthreads, nbits_t level,
														
 
															+    const DPFnode *curlevel, LN *nextlevel, bool save_expansion,
														
 
															+    LN CWL, LN CWR, value_t &low_sum,
														
 
															+    std::array<value_t,WIDTH> &high_sum,
														
 
															+    std::array<value_t,WIDTH> &high_xor)
														
 
															+{
														
 
															+    size_t curlevel_size = (size_t(1)<<level);
														
 
															+    if (max_nthreads == 1 || level < 19) {
														
 
															+        // No threading
														
 
															+        finalize_leaf_layer_nothreads(0, curlevel_size,
														
 
															+            curlevel, nextlevel, save_expansion, CWL, CWR,
														
 
															+            low_sum, high_sum, high_xor);
														
 
															+    } else {
														
 
															+        int nthreads =
														
 
															+            int(ceil(sqrt(double(curlevel_size/6000))));
														
 
															+        if (nthreads > max_nthreads) {
														
 
															+            nthreads = max_nthreads;
														
 
															+        }
														
 
															+        value_t tlow_sum[nthreads];
														
 
															+        std::array<value_t,WIDTH> thigh_sum[nthreads];
														
 
															+        std::array<value_t,WIDTH> thigh_xor[nthreads];
														
 
															+        size_t threadstart = 0;
														
 
															+        size_t threadchunk = curlevel_size / nthreads;
														
 
															+        size_t threadextra = curlevel_size % nthreads;
														
 
															+        boost::asio::thread_pool pool(nthreads);
														
 
															+        for (int t=0;t<nthreads;++t) {
														
 
															+            size_t threadsize = threadchunk + (size_t(t) < threadextra);
														
 
															+            size_t threadend = threadstart + threadsize;
														
 
															+            boost::asio::post(pool,
														
 
															+                [t, &tlow_sum, &thigh_sum, &thigh_xor, threadstart, threadend,
														
 
															+                &curlevel, &nextlevel, CWL, CWR, save_expansion] {
														
 
															+                    finalize_leaf_layer_nothreads(threadstart, threadend,
														
 
															+                        curlevel, nextlevel, save_expansion, CWL, CWR,
														
 
															+                        tlow_sum[t], thigh_sum[t], thigh_xor[t]);
														
 
															+                });
														
 
															+            threadstart = threadend;
														
 
															+        }
														
 
															+        pool.join();
														
 
															+        low_sum = 0;
														
 
															+        zero(high_sum);
														
 
															+        zero(high_xor);
														
 
															+        for (int t=0;t<nthreads;++t) {
														
 
															+            low_sum += tlow_sum[t];
														
 
															+            high_sum += thigh_sum[t];
														
 
															+            high_xor ^= thigh_xor[t];
														
 
															+        }
														
 
															+    }
														
 
															+}
														
 
															+
														
 
															+
														
 
															+
														
 
															+// Create one level of the RDPF.  NT will be as above: LeafNode if we
														
 
															+// are expanding into a leaf level, DPFnode if not.  LI will be LeafInfo
														
 
															+// if we are expanding into a leaf level, and it is unused otherwise.
														
 
															+template<typename NT, typename LI>
														
 
															+static inline void create_level(MPCTIO &tio, yield_t &yield,
														
 
															+    const DPFnode *curlevel, NT *nextlevel,
														
 
															+    int player, nbits_t level, nbits_t depth, RegBS bs_choice, NT &CW,
														
 
															+    bool &cfbit, bool save_expansion, LI &li, size_t &aes_ops)
														
 
															+{
														
 
															+    // tio.cpu_nthreads() is the maximum number of threads we
														
 
															+    // have available.
														
 
															+    int max_nthreads = tio.cpu_nthreads();
														
 
															+
														
 
															+    NT L, R;
														
 
															+    zero(L);
														
 
															+    zero(R);
														
 
															+    // The server doesn't need to do this computation, but it does
														
 
															+    // need to execute mpc_reconstruct_choice so that it sends
														
 
															+    // the AndTriples at the appropriate time.
														
 
															+    if (player < 2) {
														
 
															+#ifdef RDPF_MTGEN_TIMING_1
														
 
															+        if (player == 0) {
														
 
															+            mtgen_timetest_1(level, 0, (1<<23)>>level, curlevel,
														
 
															+                nextlevel, aes_ops);
														
 
															+            size_t niters = 2048;
														
 
															+            if (level > 8) niters = (1<<20)>>level;
														
 
															+            for(int t=1;t<=8;++t) {
														
 
															+                mtgen_timetest_1(level, t, niters, curlevel,
														
 
															+                    nextlevel, aes_ops);
														
 
															+            }
														
 
															+            mtgen_timetest_1(level, 0, (1<<23)>>level, curlevel,
														
 
															+                nextlevel, aes_ops);
														
 
															+        }
														
 
															+#endif
														
 
															+        // Using the timing results gathered above, decide whether
														
 
															+        // to multithread, and if so, how many threads to use.
														
 
															+        expand_level(max_nthreads, level, curlevel, nextlevel,
														
 
															+            L, R, aes_ops);
														
 
															+    }
														
 
															+
														
 
															+    // If we're going left (bs_choice = 0), we want the correction
														
 
															+    // word to be the XOR of our right side and our peer's right
														
 
															+    // side; if bs_choice = 1, it should be the XOR of our left side
														
 
															+    // and our peer's left side.
														
 
															+
														
 
															+    // We also have to ensure that the flag bits (the lsb) of the
														
 
															+    // side that will end up the same be of course the same, but
														
 
															+    // also that the flag bits (the lsb) of the side that will end
														
 
															+    // up different _must_ be different.  That is, it's not enough
														
 
															+    // for the nodes of the child selected by choice to be different
														
 
															+    // as 128-bit values; they also have to be different in their
														
 
															+    // lsb.
														
 
															+
														
 
															+    // This is where we make a small optimization over Appendix C of
														
 
															+    // the Duoram paper: instead of keeping separate correction flag
														
 
															+    // bits for the left and right children, we observe that the low
														
 
															+    // bit of the overall correction word effectively serves as one
														
 
															+    // of those bits, so we just need to store one extra bit per
														
 
															+    // level, not two.  (We arbitrarily choose the one for the right
														
 
															+    // child.)
														
 
															+
														
 
															+    // Note that the XOR of our left and right child before and
														
 
															+    // after applying the correction word won't change, since the
														
 
															+    // correction word is applied to either both children or
														
 
															+    // neither, depending on the value of the parent's flag. So in
														
 
															+    // particular, the XOR of the flag bits won't change, and if our
														
 
															+    // children's flag's XOR equals our peer's children's flag's
														
 
															+    // XOR, then we won't have different flag bits even for the
														
 
															+    // children that have different 128-bit values.
														
 
															+
														
 
															+    // So we compute our_parity = lsb(L^R)^player, and we XOR that
														
 
															+    // into the R value in the correction word computation.  At the
														
 
															+    // same time, we exchange these parity values to compute the
														
 
															+    // combined parity, which we store in the DPF.  Then when the
														
 
															+    // DPF is evaluated, if the parent's flag is set, not only apply
														
 
															+    // the correction word to both children, but also apply the
														
 
															+    // (combined) parity bit to just the right child.  Then for
														
 
															+    // unequal nodes (where the flag bit is different), exactly one
														
 
															+    // of the four children (two for P0 and two for P1) will have
														
 
															+    // the parity bit applied, which will set the XOR of the lsb of
														
 
															+    // those four nodes to just L0^R0^L1^R1^our_parity^peer_parity
														
 
															+    // = 1 because everything cancels out except player (for which
														
 
															+    // one player is 0 and the other is 1).
														
 
															+
														
 
															+    bool our_parity_bit = get_lsb(L) ^ get_lsb(R) ^ !!player;
														
 
															+    xor_lsb(R, our_parity_bit);
														
 
															+
														
 
															+    NT CWL;
														
 
															+    bool peer_parity_bit;
														
 
															+    // Exchange the parities and do mpc_reconstruct_choice at the
														
 
															+    // same time (bundled into the same rounds)
														
 
															+    run_coroutines(yield,
														
 
															+        [&tio, &our_parity_bit, &peer_parity_bit](yield_t &yield) {
														
 
															+            tio.queue_peer(&our_parity_bit, 1);
														
 
															+            yield();
														
 
															+            uint8_t peer_parity_byte;
														
 
															+            tio.recv_peer(&peer_parity_byte, 1);
														
 
															+            peer_parity_bit = peer_parity_byte & 1;
														
 
															+        },
														
 
															+        [&tio, &CWL, &L, &R, bs_choice](yield_t &yield) {
														
 
															+            mpc_reconstruct_choice(tio, yield, CWL, bs_choice, R, L);
														
 
															+        });
														
 
															+    cfbit = our_parity_bit ^ peer_parity_bit;
														
 
															+    CW = CWL;
														
 
															+    NT CWR = CWL;
														
 
															+    xor_lsb(CWR, cfbit);
														
 
															+    if (player < 2) {
														
 
															+        // The timing of each iteration of the inner loop is
														
 
															+        // comparable to the above, so just use the same
														
 
															+        // computations.  All of this could be tuned, of course.
														
 
															+
														
 
															+        if constexpr (std::is_same_v<NT, DPFnode>) {
														
 
															+            finalize_nonleaf_layer(max_nthreads, level, curlevel,
														
 
															+                nextlevel, CWL, CWR);
														
 
															+        } else {
														
 
															+            // Recall there are four potentially useful vectors that
														
 
															+            // can come out of a DPF:
														
 
															+            // - (single-bit) bitwise unit vector
														
 
															+            // - additive-shared unit vector
														
 
															+            // - XOR-shared scaled unit vector
														
 
															+            // - additive-shared scaled unit vector
														
 
															+            //
														
 
															+            // (No single DPF should be used for both of the first
														
 
															+            // two or both of the last two, though, since they're
														
 
															+            // correlated; you _can_ use one of the first two and
														
 
															+            // one of the last two.)
														
 
															+            //
														
 
															+            // For each 128-bit leaf, the low bit is the flag bit,
														
 
															+            // and we're guaranteed that the flag bits (and indeed
														
 
															+            // the whole 128-bit value) for P0 and P1 are the same
														
 
															+            // for every leaf except the target, and that the flag
														
 
															+            // bits definitely differ for the target (and the other
														
 
															+            // 127 bits are independently random on each side).
														
 
															+            //
														
 
															+            // We divide the 128-bit leaf into a low 64-bit word and
														
 
															+            // a high 64-bit word.  We use the low word for the unit
														
 
															+            // vector and the high word for the scaled vector; this
														
 
															+            // choice is not arbitrary: the flag bit in the low word
														
 
															+            // means that the sum of all the low words (with P1's
														
 
															+            // low words negated) across both P0 and P1 is
														
 
															+            // definitely odd, so we can compute that sum's inverse
														
 
															+            // mod 2^64, and store it now during precomputation.  At
														
 
															+            // evaluation time for the additive-shared unit vector,
														
 
															+            // we will output this global inverse times the low word
														
 
															+            // of each leaf, which will make the sum of all of those
														
 
															+            // values 1.  (This technique replaces the protocol in
														
 
															+            // Appendix D of the Duoram paper.)
														
 
															+            //
														
 
															+            // For the scaled vector, we just have to compute shares
														
 
															+            // of what the scaled vector is a sharing _of_, but
														
 
															+            // that's just XORing or adding all of each party's
														
 
															+            // local high words; no communication needed.
														
 
															+
														
 
															+            value_t low_sum;
														
 
															+            const size_t WIDTH = LI::W;
														
 
															+            std::array<value_t,WIDTH> high_sum;
														
 
															+            std::array<value_t,WIDTH> high_xor;
														
 
															+            finalize_leaf_layer(max_nthreads, level, curlevel,
														
 
															+                nextlevel, save_expansion, CWL, CWR, low_sum, high_sum,
														
 
															+                high_xor);
														
 
															+
														
 
															+            if (player == 1) {
														
 
															+                low_sum = -low_sum;
														
 
															+                for(size_t j=0; j<WIDTH; ++j) {
														
 
															+                    high_sum[j] = -high_sum[j];
														
 
															+                }
														
 
															+            }
														
 
															+            for(size_t j=0; j<WIDTH; ++j) {
														
 
															+                li.scaled_sum[j].ashare = high_sum[j];
														
 
															+                li.scaled_xor[j].xshare = high_xor[j];
														
 
															+            }
														
 
															+            // Exchange low_sum and add them up
														
 
															+            tio.queue_peer(&low_sum, sizeof(low_sum));
														
 
															+            yield();
														
 
															+            value_t peer_low_sum;
														
 
															+            tio.recv_peer(&peer_low_sum, sizeof(peer_low_sum));
														
 
															+            low_sum += peer_low_sum;
														
 
															+            // The low_sum had better be odd
														
 
															+            assert(low_sum & 1);
														
 
															+            li.unit_sum_inverse = inverse_value_t(low_sum);
														
 
															+        }
														
 
															+    } else if (level == depth-1) {
														
 
															+        yield();
														
 
															+    }
														
 
															+}
														
 
															+
														
 
															+
														
 
															 // Construct a DPF with the given (XOR-shared) target location, and
														
 
															 // of the given depth, to be used for random-access memory reads and
														
 
															 // writes.  The DPF is construction collaboratively by P0 and P1,
														
@@ -369,362 +826,56 @@ RDPF<WIDTH>::RDPF(MPCTIO &tio, yield_t &yield,
 
															     // Construct each intermediate level
														
 
															     while(level < depth) {
														
 
															+        LeafNode *leaflevel = NULL;
														
 
															         if (player < 2) {
														
 
															             delete[] curlevel;
														
 
															             curlevel = nextlevel;
														
 
															+            nextlevel = NULL;
														
 
															             if (save_expansion && level == depth-1) {
														
 
															                 expansion.resize(1<<depth);
														
 
															-                nextlevel = (DPFnode *)expansion.data();
														
 
															+                leaflevel = expansion.data();
														
 
															+            } else if (level == depth-1) {
														
 
															+                leaflevel = new LeafNode[1<<depth];
														
 
															             } else {
														
 
															                 nextlevel = new DPFnode[1<<(level+1)];
														
 
															             }
														
 
															         }
														
 
															         // Invariant: curlevel has 2^level elements; nextlevel has
														
 
															-        // 2^{level+1} elements
														
 
															+        // 2^{level+1} DPFnode elements if we're not at the last level,
														
 
															+        // and leaflevel has 2^{level+1} LeafNode elements if we are at
														
 
															+        // a leaf level (the last level always, and all levels if we are
														
 
															+        // making an incremental RDPF).
														
 
															         // The bit-shared choice bit is bit (depth-level-1) of the
														
 
															         // XOR-shared target index
														
 
															         RegBS bs_choice = target.bit(depth-level-1);
														
 
															-        size_t curlevel_size = (size_t(1)<<level);
														
 
															-        DPFnode L = _mm_setzero_si128();
														
 
															-        DPFnode R = _mm_setzero_si128();
														
 
															-        // The server doesn't need to do this computation, but it does
														
 
															-        // need to execute mpc_reconstruct_choice so that it sends
														
 
															-        // the AndTriples at the appropriate time.
														
 
															-        if (player < 2) {
														
 
															-#ifdef RDPF_MTGEN_TIMING_1
														
 
															-            if (player == 0) {
														
 
															-                mtgen_timetest_1(level, 0, (1<<23)>>level, curlevel,
														
 
															-                    nextlevel, aes_ops);
														
 
															-                size_t niters = 2048;
														
 
															-                if (level > 8) niters = (1<<20)>>level;
														
 
															-                for(int t=1;t<=8;++t) {
														
 
															-                    mtgen_timetest_1(level, t, niters, curlevel,
														
 
															-                        nextlevel, aes_ops);
														
 
															-                }
														
 
															-                mtgen_timetest_1(level, 0, (1<<23)>>level, curlevel,
														
 
															-                    nextlevel, aes_ops);
														
 
															-            }
														
 
															-#endif
														
 
															-            // Using the timing results gathered above, decide whether
														
 
															-            // to multithread, and if so, how many threads to use.
														
 
															-            // tio.cpu_nthreads() is the maximum number we have
														
 
															-            // available.
														
 
															-            int max_nthreads = tio.cpu_nthreads();
														
 
															-            if (max_nthreads == 1 || level < 19) {
														
 
															-                // No threading
														
 
															-                size_t laes_ops = 0;
														
 
															-                for(size_t i=0;i<curlevel_size;++i) {
														
 
															-                    DPFnode lchild, rchild;
														
 
															-                    prgboth(lchild, rchild, curlevel[i], laes_ops);
														
 
															-                    L = (L ^ lchild);
														
 
															-                    R = (R ^ rchild);
														
 
															-                    nextlevel[2*i] = lchild;
														
 
															-                    nextlevel[2*i+1] = rchild;
														
 
															-                }
														
 
															-                aes_ops += laes_ops;
														
 
															-            } else {
														
 
															-                size_t curlevel_size = size_t(1)<<level;
														
 
															-                int nthreads =
														
 
															-                    int(ceil(sqrt(double(curlevel_size/6000))));
														
 
															-                if (nthreads > max_nthreads) {
														
 
															-                    nthreads = max_nthreads;
														
 
															-                }
														
 
															-                DPFnode tL[nthreads];
														
 
															-                DPFnode tR[nthreads];
														
 
															-                size_t taes_ops[nthreads];
														
 
															-                size_t threadstart = 0;
														
 
															-                size_t threadchunk = curlevel_size / nthreads;
														
 
															-                size_t threadextra = curlevel_size % nthreads;
														
 
															-                boost::asio::thread_pool pool(nthreads);
														
 
															-                for (int t=0;t<nthreads;++t) {
														
 
															-                    size_t threadsize = threadchunk + (size_t(t) < threadextra);
														
 
															-                    size_t threadend = threadstart + threadsize;
														
 
															-                    boost::asio::post(pool,
														
 
															-                        [t, &tL, &tR, &taes_ops, threadstart, threadend,
														
 
															-                        &curlevel, &nextlevel] {
														
 
															-                            DPFnode L = _mm_setzero_si128();
														
 
															-                            DPFnode R = _mm_setzero_si128();
														
 
															-                            size_t aes_ops = 0;
														
 
															-                            for(size_t i=threadstart;i<threadend;++i) {
														
 
															-                                DPFnode lchild, rchild;
														
 
															-                                prgboth(lchild, rchild, curlevel[i], aes_ops);
														
 
															-                                L = (L ^ lchild);
														
 
															-                                R = (R ^ rchild);
														
 
															-                                nextlevel[2*i] = lchild;
														
 
															-                                nextlevel[2*i+1] = rchild;
														
 
															-                            }
														
 
															-                            tL[t] = L;
														
 
															-                            tR[t] = R;
														
 
															-                            taes_ops[t] = aes_ops;
														
 
															-                        });
														
 
															-                    threadstart = threadend;
														
 
															-                }
														
 
															-                pool.join();
														
 
															-                for (int t=0;t<nthreads;++t) {
														
 
															-                    L ^= tL[t];
														
 
															-                    R ^= tR[t];
														
 
															-                    aes_ops += taes_ops[t];
														
 
															-                }
														
 
															-            }
														
 
															-        }
														
 
															-        // If we're going left (bs_choice = 0), we want the correction
														
 
															-        // word to be the XOR of our right side and our peer's right
														
 
															-        // side; if bs_choice = 1, it should be the XOR or our left side
														
 
															-        // and our peer's left side.
														
 
															-
														
 
															-        // We also have to ensure that the flag bits (the lsb) of the
														
 
															-        // side that will end up the same be of course the same, but
														
 
															-        // also that the flag bits (the lsb) of the side that will end
														
 
															-        // up different _must_ be different.  That is, it's not enough
														
 
															-        // for the nodes of the child selected by choice to be different
														
 
															-        // as 128-bit values; they also have to be different in their
														
 
															-        // lsb.
														
 
															-
														
 
															-        // This is where we make a small optimization over Appendix C of
														
 
															-        // the Duoram paper: instead of keeping separate correction flag
														
 
															-        // bits for the left and right children, we observe that the low
														
 
															-        // bit of the overall correction word effectively serves as one
														
 
															-        // of those bits, so we just need to store one extra bit per
														
 
															-        // level, not two.  (We arbitrarily choose the one for the right
														
 
															-        // child.)
														
 
															-
														
 
															-        // Note that the XOR of our left and right child before and
														
 
															-        // after applying the correction word won't change, since the
														
 
															-        // correction word is applied to either both children or
														
 
															-        // neither, depending on the value of the parent's flag. So in
														
 
															-        // particular, the XOR of the flag bits won't change, and if our
														
 
															-        // children's flag's XOR equals our peer's children's flag's
														
 
															-        // XOR, then we won't have different flag bits even for the
														
 
															-        // children that have different 128-bit values.
														
 
															-
														
 
															-        // So we compute our_parity = lsb(L^R)^player, and we XOR that
														
 
															-        // into the R value in the correction word computation.  At the
														
 
															-        // same time, we exchange these parity values to compute the
														
 
															-        // combined parity, which we store in the DPF.  Then when the
														
 
															-        // DPF is evaluated, if the parent's flag is set, not only apply
														
 
															-        // the correction work to both children, but also apply the
														
 
															-        // (combined) parity bit to just the right child.  Then for
														
 
															-        // unequal nodes (where the flag bit is different), exactly one
														
 
															-        // of the four children (two for P0 and two for P1) will have
														
 
															-        // the parity bit applied, which will set the XOR of the lsb of
														
 
															-        // those four nodes to just L0^R0^L1^R1^our_parity^peer_parity
														
 
															-        // = 1 because everything cancels out except player (for which
														
 
															-        // one player is 0 and the other is 1).
														
 
															-
														
 
															-        bool our_parity_bit = get_lsb(L ^ R) ^ !!player;
														
 
															-        DPFnode our_parity = lsb128_mask[our_parity_bit];
														
 
															-
														
 
															-        DPFnode CW;
														
 
															-        bool peer_parity_bit;
														
 
															-        // Exchange the parities and do mpc_reconstruct_choice at the
														
 
															-        // same time (bundled into the same rounds)
														
 
															-        run_coroutines(yield,
														
 
															-            [this, &tio, &our_parity_bit, &peer_parity_bit](yield_t &yield) {
														
 
															-                tio.queue_peer(&our_parity_bit, 1);
														
 
															-                yield();
														
 
															-                uint8_t peer_parity_byte;
														
 
															-                tio.recv_peer(&peer_parity_byte, 1);
														
 
															-                peer_parity_bit = peer_parity_byte & 1;
														
 
															-            },
														
 
															-            [this, &tio, &CW, &L, &R, &bs_choice, &our_parity](yield_t &yield) {
														
 
															-                mpc_reconstruct_choice(tio, yield, CW, bs_choice,
														
 
															-                    (R ^ our_parity), L);
														
 
															-            });
														
 
															-        bool parity_bit = our_parity_bit ^ peer_parity_bit;
														
 
															-        cfbits |= (value_t(parity_bit)<<level);
														
 
															-        DPFnode CWR = CW ^ lsb128_mask[parity_bit];
														
 
															-        if (player < 2) {
														
 
															-            // The timing of each iteration of the inner loop is
														
 
															-            // comparable to the above, so just use the same
														
 
															-            // computations.  All of this could be tuned, of course.
														
 
															-
														
 
															-            if (level < depth-1) {
														
 
															-                // Using the timing results gathered above, decide whether
														
 
															-                // to multithread, and if so, how many threads to use.
														
 
															-                // tio.cpu_nthreads() is the maximum number we have
														
 
															-                // available.
														
 
															-                int max_nthreads = tio.cpu_nthreads();
														
 
															-                if (max_nthreads == 1 || level < 19) {
														
 
															-                    // No threading
														
 
															-                    for(size_t i=0;i<curlevel_size;++i) {
														
 
															-                        bool flag = get_lsb(curlevel[i]);
														
 
															-                        nextlevel[2*i] = xor_if(nextlevel[2*i], CW, flag);
														
 
															-                        nextlevel[2*i+1] = xor_if(nextlevel[2*i+1], CWR, flag);
														
 
															-                    }
														
 
															-                } else {
														
 
															-                    int nthreads =
														
 
															-                        int(ceil(sqrt(double(curlevel_size/6000))));
														
 
															-                    if (nthreads > max_nthreads) {
														
 
															-                        nthreads = max_nthreads;
														
 
															-                    }
														
 
															-                    size_t threadstart = 0;
														
 
															-                    size_t threadchunk = curlevel_size / nthreads;
														
 
															-                    size_t threadextra = curlevel_size % nthreads;
														
 
															-                    boost::asio::thread_pool pool(nthreads);
														
 
															-                    for (int t=0;t<nthreads;++t) {
														
 
															-                        size_t threadsize = threadchunk + (size_t(t) < threadextra);
														
 
															-                        size_t threadend = threadstart + threadsize;
														
 
															-                        boost::asio::post(pool, [CW, CWR, threadstart, threadend,
														
 
															-                            &curlevel, &nextlevel] {
														
 
															-                                for(size_t i=threadstart;i<threadend;++i) {
														
 
															-                                    bool flag = get_lsb(curlevel[i]);
														
 
															-                                    nextlevel[2*i] = xor_if(nextlevel[2*i], CW, flag);
														
 
															-                                    nextlevel[2*i+1] = xor_if(nextlevel[2*i+1], CWR, flag);
														
 
															-                                }
														
 
															-                        });
														
 
															-                        threadstart = threadend;
														
 
															-                    }
														
 
															-                    pool.join();
														
 
															-                }
														
 
															-            } else {
														
 
															-                // Recall there are four potentially useful vectors that
														
 
															-                // can come out of a DPF:
														
 
															-                // - (single-bit) bitwise unit vector
														
 
															-                // - additive-shared unit vector
														
 
															-                // - XOR-shared scaled unit vector
														
 
															-                // - additive-shared scaled unit vector
														
 
															-                //
														
 
															-                // (No single DPF should be used for both of the first
														
 
															-                // two or both of the last two, though, since they're
														
 
															-                // correlated; you _can_ use one of the first two and
														
 
															-                // one of the last two.)
														
 
															-                //
														
 
															-                // For each 128-bit leaf, the low bit is the flag bit,
														
 
															-                // and we're guaranteed that the flag bits (and indeed
														
 
															-                // the whole 128-bit value) for P0 and P1 are the same
														
 
															-                // for every leaf except the target, and that the flag
														
 
															-                // bits definitely differ for the target (and the other
														
 
															-                // 127 bits are independently random on each side).
														
 
															-                //
														
 
															-                // We divide the 128-bit leaf into a low 64-bit word and
														
 
															-                // a high 64-bit word.  We use the low word for the unit
														
 
															-                // vector and the high word for the scaled vector; this
														
 
															-                // choice is not arbitrary: the flag bit in the low word
														
 
															-                // means that the sum of all the low words (with P1's
														
 
															-                // low words negated) across both P0 and P1 is
														
 
															-                // definitely odd, so we can compute that sum's inverse
														
 
															-                // mod 2^64, and store it now during precomputation.  At
														
 
															-                // evaluation time for the additive-shared unit vector,
														
 
															-                // we will output this global inverse times the low word
														
 
															-                // of each leaf, which will make the sum of all of those
														
 
															-                // values 1.  (This technique replaces the protocol in
														
 
															-                // Appendix D of the Duoram paper.)
														
 
															-                //
														
 
															-                // For the scaled vector, we just have to compute shares
														
 
															-                // of what the scaled vector is a sharing _of_, but
														
 
															-                // that's just XORing or adding all of each party's
														
 
															-                // local high words; no communication needed.
														
 
															-
														
 
															-                value_t low_sum = 0;
														
 
															-                value_t high_sum = 0;
														
 
															-                value_t high_xor = 0;
														
 
															-                // Using the timing results gathered above, decide whether
														
 
															-                // to multithread, and if so, how many threads to use.
														
 
															-                // tio.cpu_nthreads() is the maximum number we have
														
 
															-                // available.
														
 
															-                int max_nthreads = tio.cpu_nthreads();
														
 
															-                if (max_nthreads == 1 || level < 19) {
														
 
															-                    // No threading
														
 
															-                    for(size_t i=0;i<curlevel_size;++i) {
														
 
															-                        bool flag = get_lsb(curlevel[i]);
														
 
															-                        DPFnode leftchild = xor_if(nextlevel[2*i], CW, flag);
														
 
															-                        DPFnode rightchild = xor_if(nextlevel[2*i+1], CWR, flag);
														
 
															-                        if (save_expansion) {
														
 
															-                            nextlevel[2*i] = leftchild;
														
 
															-                            nextlevel[2*i+1] = rightchild;
														
 
															-                        }
														
 
															-                        value_t leftlow = value_t(_mm_cvtsi128_si64x(leftchild));
														
 
															-                        value_t rightlow = value_t(_mm_cvtsi128_si64x(rightchild));
														
 
															-                        value_t lefthigh =
														
 
															-                            value_t(_mm_cvtsi128_si64x(_mm_srli_si128(leftchild,8)));
														
 
															-                        value_t righthigh =
														
 
															-                            value_t(_mm_cvtsi128_si64x(_mm_srli_si128(rightchild,8)));
														
 
															-                        low_sum += (leftlow + rightlow);
														
 
															-                        high_sum += (lefthigh + righthigh);
														
 
															-                        high_xor ^= (lefthigh ^ righthigh);
														
 
															-                    }
														
 
															-                } else {
														
 
															-                    int nthreads =
														
 
															-                        int(ceil(sqrt(double(curlevel_size/6000))));
														
 
															-                    if (nthreads > max_nthreads) {
														
 
															-                        nthreads = max_nthreads;
														
 
															-                    }
														
 
															-                    value_t tlow_sum[nthreads];
														
 
															-                    value_t thigh_sum[nthreads];
														
 
															-                    value_t thigh_xor[nthreads];
														
 
															-                    size_t threadstart = 0;
														
 
															-                    size_t threadchunk = curlevel_size / nthreads;
														
 
															-                    size_t threadextra = curlevel_size % nthreads;
														
 
															-                    boost::asio::thread_pool pool(nthreads);
														
 
															-                    for (int t=0;t<nthreads;++t) {
														
 
															-                        size_t threadsize = threadchunk + (size_t(t) < threadextra);
														
 
															-                        size_t threadend = threadstart + threadsize;
														
 
															-                        boost::asio::post(pool,
														
 
															-                            [t, &tlow_sum, &thigh_sum, &thigh_xor, threadstart, threadend,
														
 
															-                            &curlevel, &nextlevel, CW, CWR, save_expansion] {
														
 
															-                                value_t low_sum = 0;
														
 
															-                                value_t high_sum = 0;
														
 
															-                                value_t high_xor = 0;
														
 
															-                                for(size_t i=threadstart;i<threadend;++i) {
														
 
															-                                    bool flag = get_lsb(curlevel[i]);
														
 
															-                                    DPFnode leftchild = xor_if(nextlevel[2*i], CW, flag);
														
 
															-                                    DPFnode rightchild = xor_if(nextlevel[2*i+1], CWR, flag);
														
 
															-                                    if (save_expansion) {
														
 
															-                                        nextlevel[2*i] = leftchild;
														
 
															-                                        nextlevel[2*i+1] = rightchild;
														
 
															-                                    }
														
 
															-                                    value_t leftlow = value_t(_mm_cvtsi128_si64x(leftchild));
														
 
															-                                    value_t rightlow = value_t(_mm_cvtsi128_si64x(rightchild));
														
 
															-                                    value_t lefthigh =
														
 
															-                                        value_t(_mm_cvtsi128_si64x(_mm_srli_si128(leftchild,8)));
														
 
															-                                    value_t righthigh =
														
 
															-                                        value_t(_mm_cvtsi128_si64x(_mm_srli_si128(rightchild,8)));
														
 
															-                                    low_sum += (leftlow + rightlow);
														
 
															-                                    high_sum += (lefthigh + righthigh);
														
 
															-                                    high_xor ^= (lefthigh ^ righthigh);
														
 
															-                                }
														
 
															-                                tlow_sum[t] = low_sum;
														
 
															-                                thigh_sum[t] = high_sum;
														
 
															-                                thigh_xor[t] = high_xor;
														
 
															-                            });
														
 
															-                        threadstart = threadend;
														
 
															-                    }
														
 
															-                    pool.join();
														
 
															-                    for (int t=0;t<nthreads;++t) {
														
 
															-                        low_sum += tlow_sum[t];
														
 
															-                        high_sum += thigh_sum[t];
														
 
															-                        high_xor ^= thigh_xor[t];
														
 
															-                    }
														
 
															-                }
														
 
															-                if (player == 1) {
														
 
															-                    low_sum = -low_sum;
														
 
															-                    high_sum = -high_sum;
														
 
															-                }
														
 
															-                li[0].scaled_sum[0].ashare = high_sum;
														
 
															-                li[0].scaled_xor[0].xshare = high_xor;
														
 
															-                // Exchange low_sum and add them up
														
 
															-                tio.queue_peer(&low_sum, sizeof(low_sum));
														
 
															-                yield();
														
 
															-                value_t peer_low_sum;
														
 
															-                tio.recv_peer(&peer_low_sum, sizeof(peer_low_sum));
														
 
															-                low_sum += peer_low_sum;
														
 
															-                // The low_sum had better be odd
														
 
															-                assert(low_sum & 1);
														
 
															-                li[0].unit_sum_inverse = inverse_value_t(low_sum);
														
 
															+        bool cfbit;
														
 
															+
														
 
															+        if (level < depth-1) {
														
 
															+            DPFnode CW;
														
 
															+            create_level(tio, yield, curlevel, nextlevel, player, level,
														
 
															+                depth, bs_choice, CW, cfbit, save_expansion, li[0],
														
 
															+                aes_ops);
														
 
															+            cfbits |= (value_t(cfbit)<<level);
														
 
															+            if (player < 2) {
														
 
															+                cw.push_back(CW);
														
 
															             }
														
 
															-            cw.push_back(CW);
														
 
															-        } else if (level == depth-1) {
														
 
															-            yield();
														
 
															+        } else {
														
 
															+            LeafNode CW;
														
 
															+            create_level(tio, yield, curlevel, leaflevel, player, level,
														
 
															+                depth, bs_choice, CW, cfbit, save_expansion, li[0],
														
 
															+                aes_ops);
														
 
															+            li[0].leaf_cw = CW;
														
 
															         }
														
 
															+        if (!save_expansion) {
														
 
															+            delete[] leaflevel;
														
 
															+        }
														
 
															         ++level;
														
 
															     }
														
 
															     delete[] curlevel;
														
 
															-    if (!save_expansion || player == 2) {
														
 
															-        delete[] nextlevel;
														
 
															-    }
														
 
															+    delete[] nextlevel;
														
 
															 }
														
 
															 // Get the leaf node for the given input
														
--- a/types.hpp
+++ b/types.hpp
@@ -8,6 +8,8 @@
 
															 #include <x86intrin.h>  // SSE and AVX intrinsics
														
 
															 #include <bsd/stdlib.h> // arc4random_buf
														
 
															+#include "bitutils.hpp"
														
 
															+
														
 
															 // The number of bits in an MPC secret-shared memory word
														
 
															 #ifndef VALUE_BITS
														
@@ -648,6 +650,14 @@ inline std::array<S,N> &operator^=(std::array<S,N> &A, const std::array<S,N> &B)
 
															     return A;
														
 
															 }
														
 
															+// XOR the bit B into the low bit of A
														
 
															+template <typename S, size_t N>
														
 
															+inline std::array<S,N> &xor_lsb(std::array<S,N> &A, bit_t B)
														
 
															+{
														
 
															+    A[0] ^= lsb128_mask[B];
														
 
															+    return A;
														
 
															+}
														
 
															+
														
 
															 template <typename S, size_t N>
														
 
															 inline std::tuple<std::array<value_t,N>,std::array<value_t,N>,std::array<value_t,N>>
														
 
															     combine(
														
@@ -713,6 +723,13 @@ struct AndTripleName { static constexpr const char *name = "a"; };
 
															 using DPFnode = __m128i;
														
 
															+// XOR the bit B into the low bit of A
														
 
															+inline DPFnode &xor_lsb(DPFnode &A, bit_t B)
														
 
															+{
														
 
															+    A ^= lsb128_mask[B];
														
 
															+    return A;
														
 
															+}
														
 
															+
														
 
															 // A Select triple for type V (V is DPFnode, value_t, or bit_t) is a
														
 
															 // triple of (X0,Y0,Z0) where X0 is a bit and Y0 and Z0 are Vs held by
														
 
															 // P0 (and correspondingly (X1,Y1,Z1) held by P1), with all values