
Add a template parameter to RDPF, RDPFPair, RDPFTriple for the leaf width

The parameter exists, but is not yet used; the width is still always 1
Ian Goldberg, 1 year ago
commit a9e39d265e
9 changed files with 591 additions and 578 deletions
  1. cell.cpp     +2 -1
  2. duoram.tcc   +9 -9
  3. mpcio.cpp    +6 -6
  4. mpcio.hpp    +4 -4
  5. online.cpp   +25 -25
  6. rdpf.cpp     +0 -514
  7. rdpf.hpp     +6 -3
  8. rdpf.tcc     +537 -14
  9. types.hpp    +2 -2
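
The shape of the change: each type in the RDPF family gains a leaf-width template parameter WIDTH, and every existing call site pins it to 1. A minimal sketch of the new declarations and a typical call site, simplified from rdpf.hpp and mpcio.hpp (member lists elided):

    template <nbits_t WIDTH> struct RDPF : public DPF { /* ... */ };
    template <nbits_t WIDTH> struct RDPFTriple { RDPF<WIDTH> dpf[3]; /* ... */ };
    template <nbits_t WIDTH> struct RDPFPair   { RDPF<WIDTH> dpf[2]; /* ... */ };

    // All current instantiations use WIDTH = 1, for example:
    RDPFTriple<1> dt = tio.rdpftriple(yield, depth);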

+ 2 - 1
cell.cpp

@@ -91,7 +91,8 @@ struct Cell {
     // Note that RegXS will extend a RegBS of 1 to the all-1s word, not
     // the word with value 1.  This is used for ORAM reads, where the
     // same DPF is used for all the fields.
-    inline void unit(const RDPF &dpf, DPFnode leaf) {
+    template <nbits_t WIDTH>
+    inline void unit(const RDPF<WIDTH> &dpf, DPFnode leaf) {
         key = dpf.unit_as(leaf);
         pointers = dpf.unit_bs(leaf);
         value = dpf.unit_bs(leaf);

+ 9 - 9
duoram.tcc

@@ -273,7 +273,7 @@ Duoram<T>::Shape::MemRefS<U,FT,FST,Sh>::operator FT()
     if (player < 2) {
         // Computational players do this
 
-        RDPFTriple dt = shape.tio.rdpftriple(shape.yield, shape.addr_size);
+        RDPFTriple<1> dt = shape.tio.rdpftriple(shape.yield, shape.addr_size);
 
         // Compute the index offset
         U indoffset;
@@ -281,7 +281,7 @@ Duoram<T>::Shape::MemRefS<U,FT,FST,Sh>::operator FT()
         indoffset -= idx;
 
         // We only need two of the DPFs for reading
-        RDPFPair dp(std::move(dt), 0, player == 0 ? 2 : 1);
+        RDPFPair<1> dp(std::move(dt), 0, player == 0 ? 2 : 1);
         // The RDPFTriple dt is now broken, since we've moved things out
         // of it.
 
@@ -304,7 +304,7 @@ Duoram<T>::Shape::MemRefS<U,FT,FST,Sh>::operator FT()
             shape.tio.aes_ops());
         FT init;
         res = pe.reduce(init, [this, &dp, &shape] (int thread_num,
-                address_t i, const RDPFPair::node &leaf) {
+                address_t i, const RDPFPair<1>::node &leaf) {
             // The values from the two DPFs, which will each be of type T
             std::tuple<FT,FT> V;
             dp.unit(V, leaf);
@@ -324,7 +324,7 @@ Duoram<T>::Shape::MemRefS<U,FT,FST,Sh>::operator FT()
     } else {
         // The server does this
 
-        RDPFPair dp = shape.tio.rdpfpair(shape.yield, shape.addr_size);
+        RDPFPair<1> dp = shape.tio.rdpfpair(shape.yield, shape.addr_size);
         U p0indoffset, p1indoffset;
 
         shape.yield();
@@ -341,7 +341,7 @@ Duoram<T>::Shape::MemRefS<U,FT,FST,Sh>::operator FT()
             shape.shape_size, shape.tio.cpu_nthreads(),
             shape.tio.aes_ops());
         gamma = pe.reduce(init, [this, &dp, &shape] (int thread_num,
-                address_t i, const RDPFPair::node &leaf) {
+                address_t i, const RDPFPair<1>::node &leaf) {
             // The values from the two DPFs, each of type FT
             std::tuple<FT,FT> V;
             dp.unit(V, leaf);
@@ -383,7 +383,7 @@ typename Duoram<T>::Shape::template MemRefS<U,FT,FST,Sh>
     if (player < 2) {
         // Computational players do this
 
-        RDPFTriple dt = shape.tio.rdpftriple(shape.yield, shape.addr_size);
+        RDPFTriple<1> dt = shape.tio.rdpftriple(shape.yield, shape.addr_size);
 
         // Compute the index and message offsets
         U indoffset;
@@ -420,7 +420,7 @@ typename Duoram<T>::Shape::template MemRefS<U,FT,FST,Sh>
             shape.tio.aes_ops());
         int init = 0;
         pe.reduce(init, [this, &dt, &shape, &Mshift, player] (int thread_num,
-                address_t i, const RDPFTriple::node &leaf) {
+                address_t i, const RDPFTriple<1>::node &leaf) {
             // The values from the three DPFs
             std::tuple<FT,FT,FT> scaled, unit;
             dt.scaled(scaled, leaf);
@@ -442,7 +442,7 @@ typename Duoram<T>::Shape::template MemRefS<U,FT,FST,Sh>
     } else {
         // The server does this
 
-        RDPFPair dp = shape.tio.rdpfpair(shape.yield, shape.addr_size);
+        RDPFPair<1> dp = shape.tio.rdpfpair(shape.yield, shape.addr_size);
         U p0indoffset, p1indoffset;
         std::tuple<FT,FT> p0Moffset, p1Moffset;
 
@@ -463,7 +463,7 @@ typename Duoram<T>::Shape::template MemRefS<U,FT,FST,Sh>
             shape.tio.aes_ops());
         int init = 0;
         pe.reduce(init, [this, &dp, &shape, &Mshift] (int thread_num,
-                address_t i, const RDPFPair::node &leaf) {
+                address_t i, const RDPFPair<1>::node &leaf) {
             // The values from the two DPFs
             std::tuple<FT,FT> scaled, unit;
             dp.scaled(scaled, leaf);

+ 6 - 6
mpcio.cpp

@@ -802,17 +802,17 @@ SelectTriple<bit_t> MPCTIO::bitselecttriple(yield_t &yield)
 
 // Only computational peers call this; the server should be calling
 // rdpfpair() at the same time
-RDPFTriple MPCTIO::rdpftriple(yield_t &yield, nbits_t depth,
+RDPFTriple<1> MPCTIO::rdpftriple(yield_t &yield, nbits_t depth,
     bool keep_expansion)
 {
     assert(mpcio.player < 2);
-    RDPFTriple val;
+    RDPFTriple<1> val;
 
     MPCPeerIO &mpcpio = static_cast<MPCPeerIO&>(mpcio);
     if (mpcio.mode == MODE_ONLINE) {
         mpcpio.rdpftriples[thread_num][depth-1].get(val);
     } else {
-        val = RDPFTriple(*this, yield, depth,
+        val = RDPFTriple<1>(*this, yield, depth,
             keep_expansion);
         iostream_server() <<
             val.dpf[(mpcio.player == 0) ? 1 : 2];
@@ -824,16 +824,16 @@ RDPFTriple MPCTIO::rdpftriple(yield_t &yield, nbits_t depth,
 
 // Only the server calls this; the computational peers should be calling
 // rdpftriple() at the same time
-RDPFPair MPCTIO::rdpfpair(yield_t &yield, nbits_t depth)
+RDPFPair<1> MPCTIO::rdpfpair(yield_t &yield, nbits_t depth)
 {
     assert(mpcio.player == 2);
-    RDPFPair val;
+    RDPFPair<1> val;
 
     MPCServerIO &mpcsrvio = static_cast<MPCServerIO&>(mpcio);
     if (mpcio.mode == MODE_ONLINE) {
         mpcsrvio.rdpfpairs[thread_num][depth-1].get(val);
     } else {
-        RDPFTriple trip(*this, yield, depth, true);
+        RDPFTriple<1> trip(*this, yield, depth, true);
         yield();
         iostream_p0() >> val.dpf[0];
         iostream_p1() >> val.dpf[1];
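
The two calls above are meant to be made in lockstep: the computational peers each request a triple while the server requests the matching pair. A minimal sketch of the calling pattern, following the usage in online.cpp below (inside a coroutine, for a chosen depth):

    if (tio.player() == 2) {
        // Server: gets the width-1 pair
        RDPFPair<1> dp = tio.rdpfpair(yield, depth);
    } else {
        // Peers: each gets a width-1 triple (in online mode this is
        // pulled from the rdpftriples[thread_num][depth-1] storage)
        RDPFTriple<1> dt = tio.rdpftriple(yield, depth);
    }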

+ 4 - 4
mpcio.hpp

@@ -215,7 +215,7 @@ struct MPCPeerIO : public MPCIO {
     std::vector<PreCompStorage<CDPF, CDPFName>> cdpfs;
     // The outer vector is (like above) one item per thread
     // The inner array is indexed by DPF depth (depth d is at entry d-1)
-    std::vector<std::array<PreCompStorage<RDPFTriple, RDPFTripleName>,ADDRESS_MAX_BITS>> rdpftriples;
+    std::vector<std::array<PreCompStorage<RDPFTriple<1>, RDPFTripleName>,ADDRESS_MAX_BITS>> rdpftriples;
 
     MPCPeerIO(unsigned player, ProcessingMode mode,
             std::deque<tcp::socket> &peersocks,
@@ -236,7 +236,7 @@ struct MPCServerIO : public MPCIO {
     std::deque<MPCSingleIO> p1ios;
     // The outer vector is (like above) one item per thread
     // The inner array is indexed by DPF depth (depth d is at entry d-1)
-    std::vector<std::array<PreCompStorage<RDPFPair, RDPFPairName>,ADDRESS_MAX_BITS>> rdpfpairs;
+    std::vector<std::array<PreCompStorage<RDPFPair<1>, RDPFPairName>,ADDRESS_MAX_BITS>> rdpfpairs;
 
     MPCServerIO(ProcessingMode mode,
             std::deque<tcp::socket> &p0socks,
@@ -387,10 +387,10 @@ public:
 
     // These ones only work during the online phase
     // Computational peers call:
-    RDPFTriple rdpftriple(yield_t &yield, nbits_t depth,
+    RDPFTriple<1> rdpftriple(yield_t &yield, nbits_t depth,
         bool keep_expansion = true);
     // The server calls:
-    RDPFPair rdpfpair(yield_t &yield, nbits_t depth);
+    RDPFPair<1> rdpfpair(yield_t &yield, nbits_t depth);
     // Anyone can call:
     CDPF cdpf(yield_t &yield);
 

+ 25 - 25
online.cpp

@@ -216,9 +216,9 @@ static void rdpf_test(MPCIO &mpcio,
                 size_t &aes_ops = tio.aes_ops();
                 for (size_t iter=0; iter < num_iters; ++iter) {
                     if (tio.player() == 2) {
-                        RDPFPair dp = tio.rdpfpair(yield, depth);
+                        RDPFPair<1> dp = tio.rdpfpair(yield, depth);
                         for (int i=0;i<2;++i) {
-                            const RDPF &dpf = dp.dpf[i];
+                            const RDPF<1> &dpf = dp.dpf[i];
                             for (address_t x=0;x<(address_t(1)<<depth);++x) {
                                 DPFnode leaf = dpf.leaf(x, aes_ops);
                                 RegBS ub = dpf.unit_bs(leaf);
@@ -231,9 +231,9 @@ static void rdpf_test(MPCIO &mpcio,
                             printf("\n");
                         }
                     } else {
-                        RDPFTriple dt = tio.rdpftriple(yield, depth);
+                        RDPFTriple<1> dt = tio.rdpftriple(yield, depth);
                         for (int i=0;i<3;++i) {
-                            const RDPF &dpf = dt.dpf[i];
+                            const RDPF<1> &dpf = dt.dpf[i];
                             RegXS peer_scaled_xor;
                             RegAS peer_scaled_sum;
                             if (tio.player() == 1) {
@@ -300,9 +300,9 @@ static void rdpf_timing(MPCIO &mpcio,
             run_coroutines(tio, [&tio, depth] (yield_t &yield) {
                 size_t &aes_ops = tio.aes_ops();
                 if (tio.player() == 2) {
-                    RDPFPair dp = tio.rdpfpair(yield, depth);
+                    RDPFPair<1> dp = tio.rdpfpair(yield, depth);
                     for (int i=0;i<2;++i) {
-                        RDPF &dpf = dp.dpf[i];
+                        RDPF<1> &dpf = dp.dpf[i];
                         dpf.expand(aes_ops);
                         RegXS scaled_xor;
                         for (address_t x=0;x<(address_t(1)<<depth);++x) {
@@ -315,9 +315,9 @@ static void rdpf_timing(MPCIO &mpcio,
                         printf("\n");
                     }
                 } else {
-                    RDPFTriple dt = tio.rdpftriple(yield, depth);
+                    RDPFTriple<1> dt = tio.rdpftriple(yield, depth);
                     for (int i=0;i<3;++i) {
-                        RDPF &dpf = dt.dpf[i];
+                        RDPF<1> &dpf = dt.dpf[i];
                         dpf.expand(aes_ops);
                         RegXS scaled_xor;
                         for (address_t x=0;x<(address_t(1)<<depth);++x) {
@@ -336,7 +336,7 @@ static void rdpf_timing(MPCIO &mpcio,
     pool.join();
 }
 
-static value_t parallel_streameval_rdpf(MPCIO &mpcio, const RDPF &dpf,
+static value_t parallel_streameval_rdpf(MPCIO &mpcio, const RDPF<1> &dpf,
     address_t start, int num_threads)
 {
     RegXS scaled_xor[num_threads];
@@ -392,9 +392,9 @@ static void rdpfeval_timing(MPCIO &mpcio,
     MPCTIO tio(mpcio, 0, num_threads);
     run_coroutines(tio, [&mpcio, &tio, depth, start, num_threads] (yield_t &yield) {
         if (tio.player() == 2) {
-            RDPFPair dp = tio.rdpfpair(yield, depth);
+            RDPFPair<1> dp = tio.rdpfpair(yield, depth);
             for (int i=0;i<2;++i) {
-                RDPF &dpf = dp.dpf[i];
+                RDPF<1> &dpf = dp.dpf[i];
                 value_t scaled_xor =
                     parallel_streameval_rdpf(mpcio, dpf, start, num_threads);
                 printf("%016lx\n%016lx\n", scaled_xor,
@@ -402,9 +402,9 @@ static void rdpfeval_timing(MPCIO &mpcio,
                 printf("\n");
             }
         } else {
-            RDPFTriple dt = tio.rdpftriple(yield, depth);
+            RDPFTriple<1> dt = tio.rdpftriple(yield, depth);
             for (int i=0;i<3;++i) {
-                RDPF &dpf = dt.dpf[i];
+                RDPF<1> &dpf = dt.dpf[i];
                 value_t scaled_xor =
                     parallel_streameval_rdpf(mpcio, dpf, start, num_threads);
                 printf("%016lx\n%016lx\n", scaled_xor,
@@ -434,15 +434,15 @@ static void par_rdpfeval_timing(MPCIO &mpcio,
     MPCTIO tio(mpcio, 0, num_threads);
     run_coroutines(tio, [&tio, depth, start, num_threads] (yield_t &yield) {
         if (tio.player() == 2) {
-            RDPFPair dp = tio.rdpfpair(yield, depth);
+            RDPFPair<1> dp = tio.rdpfpair(yield, depth);
             for (int i=0;i<2;++i) {
-                RDPF &dpf = dp.dpf[i];
+                RDPF<1> &dpf = dp.dpf[i];
                 nbits_t depth = dpf.depth();
                 auto pe = ParallelEval(dpf, start, 0,
                     address_t(1)<<depth, num_threads, tio.aes_ops());
                 RegXS result, init;
                 result = pe.reduce(init, [&dpf] (int thread_num,
-                        address_t i, const RDPF::node &leaf) {
+                        address_t i, const RDPF<1>::node &leaf) {
                     return dpf.scaled_xs(leaf);
                 });
                 printf("%016lx\n%016lx\n", result.xshare,
@@ -450,15 +450,15 @@ static void par_rdpfeval_timing(MPCIO &mpcio,
                 printf("\n");
             }
         } else {
-            RDPFTriple dt = tio.rdpftriple(yield, depth);
+            RDPFTriple<1> dt = tio.rdpftriple(yield, depth);
             for (int i=0;i<3;++i) {
-                RDPF &dpf = dt.dpf[i];
+                RDPF<1> &dpf = dt.dpf[i];
                 nbits_t depth = dpf.depth();
                 auto pe = ParallelEval(dpf, start, 0,
                     address_t(1)<<depth, num_threads, tio.aes_ops());
                 RegXS result, init;
                 result = pe.reduce(init, [&dpf] (int thread_num,
-                        address_t i, const RDPF::node &leaf) {
+                        address_t i, const RDPF<1>::node &leaf) {
                     return dpf.scaled_xs(leaf);
                 });
                 printf("%016lx\n%016lx\n", result.xshare,
@@ -489,7 +489,7 @@ static void tupleeval_timing(MPCIO &mpcio,
     run_coroutines(tio, [&tio, depth, start] (yield_t &yield) {
         size_t &aes_ops = tio.aes_ops();
         if (tio.player() == 2) {
-            RDPFPair dp = tio.rdpfpair(yield, depth);
+            RDPFPair<1> dp = tio.rdpfpair(yield, depth);
             RegXS scaled_xor0, scaled_xor1;
             auto ev = StreamEval(dp, start, 0, aes_ops, false);
             for (address_t x=0;x<(address_t(1)<<depth);++x) {
@@ -506,7 +506,7 @@ static void tupleeval_timing(MPCIO &mpcio,
                 dp.dpf[1].scaled_xor.xshare);
             printf("\n");
         } else {
-            RDPFTriple dt = tio.rdpftriple(yield, depth);
+            RDPFTriple<1> dt = tio.rdpftriple(yield, depth);
             RegXS scaled_xor0, scaled_xor1, scaled_xor2;
             auto ev = StreamEval(dt, start, 0, aes_ops, false);
             for (address_t x=0;x<(address_t(1)<<depth);++x) {
@@ -551,13 +551,13 @@ static void par_tupleeval_timing(MPCIO &mpcio,
     run_coroutines(tio, [&tio, depth, start, num_threads] (yield_t &yield) {
         size_t &aes_ops = tio.aes_ops();
         if (tio.player() == 2) {
-            RDPFPair dp = tio.rdpfpair(yield, depth);
+            RDPFPair<1> dp = tio.rdpfpair(yield, depth);
             auto pe = ParallelEval(dp, start, 0, address_t(1)<<depth,
                 num_threads, aes_ops);
             using V = std::tuple<RegXS,RegXS>;
             V result, init;
             result = pe.reduce(init, [&dp] (int thread_num, address_t i,
-                    const RDPFPair::node &leaf) {
+                    const RDPFPair<1>::node &leaf) {
                 std::tuple<RegXS,RegXS> scaled;
                 dp.scaled(scaled, leaf);
                 return scaled;
@@ -569,13 +569,13 @@ static void par_tupleeval_timing(MPCIO &mpcio,
                 dp.dpf[1].scaled_xor.xshare);
             printf("\n");
         } else {
-            RDPFTriple dt = tio.rdpftriple(yield, depth);
+            RDPFTriple<1> dt = tio.rdpftriple(yield, depth);
             auto pe = ParallelEval(dt, start, 0, address_t(1)<<depth,
                 num_threads, aes_ops);
             using V = std::tuple<RegXS,RegXS,RegXS>;
             V result, init;
             result = pe.reduce(init, [&dt] (int thread_num, address_t i,
-                    const RDPFTriple::node &leaf) {
+                    const RDPFTriple<1>::node &leaf) {
                 std::tuple<RegXS,RegXS,RegXS> scaled;
                 dt.scaled(scaled, leaf);
                 return scaled;

+ 0 - 514
rdpf.cpp

@@ -2,23 +2,6 @@
 
 #include "rdpf.hpp"
 #include "bitutils.hpp"
-#include "mpcops.hpp"
-
-// Compute the multiplicative inverse of x mod 2^{VALUE_BITS}
-// This is the same as computing x to the power of
-// 2^{VALUE_BITS-1}-1.
-static value_t inverse_value_t(value_t x)
-{
-    int expon = 1;
-    value_t xe = x;
-    // Invariant: xe = x^(2^expon - 1) mod 2^{VALUE_BITS}
-    // Goal: compute x^(2^{VALUE_BITS-1} - 1)
-    while (expon < VALUE_BITS-1) {
-        xe = xe * xe * x;
-        ++expon;
-    }
-    return xe;
-}
 
 #undef RDPF_MTGEN_TIMING_1
 
@@ -114,500 +97,3 @@ static void mtgen_timetest_1(nbits_t level, int nthreads,
 }
 
 #endif
-
-// Construct a DPF with the given (XOR-shared) target location, and
-// of the given depth, to be used for random-access memory reads and
-// writes.  The DPF is construction collaboratively by P0 and P1,
-// with the server P2 helping by providing various kinds of
-// correlated randomness, such as MultTriples and AndTriples.
-//
-// This algorithm is based on Appendix C from the Duoram paper, with a
-// small optimization noted below.
-RDPF::RDPF(MPCTIO &tio, yield_t &yield,
-    RegXS target, nbits_t depth, bool save_expansion)
-{
-    int player = tio.player();
-    size_t &aes_ops = tio.aes_ops();
-
-    // Choose a random seed
-    arc4random_buf(&seed, sizeof(seed));
-    // Ensure the flag bits (the lsb of each node) are different
-    seed = set_lsb(seed, !!player);
-    cfbits = 0;
-    whichhalf = (player == 1);
-
-    // The root level is just the seed
-    nbits_t level = 0;
-    DPFnode *curlevel = NULL;
-    DPFnode *nextlevel = new DPFnode[1];
-    nextlevel[0] = seed;
-
-    // Construct each intermediate level
-    while(level < depth) {
-        if (player < 2) {
-            delete[] curlevel;
-            curlevel = nextlevel;
-            if (save_expansion && level == depth-1) {
-                expansion.resize(1<<depth);
-                nextlevel = expansion.data();
-            } else {
-                nextlevel = new DPFnode[1<<(level+1)];
-            }
-        }
-        // Invariant: curlevel has 2^level elements; nextlevel has
-        // 2^{level+1} elements
-
-        // The bit-shared choice bit is bit (depth-level-1) of the
-        // XOR-shared target index
-        RegBS bs_choice = target.bit(depth-level-1);
-        size_t curlevel_size = (size_t(1)<<level);
-        DPFnode L = _mm_setzero_si128();
-        DPFnode R = _mm_setzero_si128();
-        // The server doesn't need to do this computation, but it does
-        // need to execute mpc_reconstruct_choice so that it sends
-        // the AndTriples at the appropriate time.
-        if (player < 2) {
-#ifdef RDPF_MTGEN_TIMING_1
-            if (player == 0) {
-                mtgen_timetest_1(level, 0, (1<<23)>>level, curlevel,
-                    nextlevel, aes_ops);
-                size_t niters = 2048;
-                if (level > 8) niters = (1<<20)>>level;
-                for(int t=1;t<=8;++t) {
-                    mtgen_timetest_1(level, t, niters, curlevel,
-                        nextlevel, aes_ops);
-                }
-                mtgen_timetest_1(level, 0, (1<<23)>>level, curlevel,
-                    nextlevel, aes_ops);
-            }
-#endif
-            // Using the timing results gathered above, decide whether
-            // to multithread, and if so, how many threads to use.
-            // tio.cpu_nthreads() is the maximum number we have
-            // available.
-            int max_nthreads = tio.cpu_nthreads();
-            if (max_nthreads == 1 || level < 19) {
-                // No threading
-                size_t laes_ops = 0;
-                for(size_t i=0;i<curlevel_size;++i) {
-                    DPFnode lchild, rchild;
-                    prgboth(lchild, rchild, curlevel[i], laes_ops);
-                    L = (L ^ lchild);
-                    R = (R ^ rchild);
-                    nextlevel[2*i] = lchild;
-                    nextlevel[2*i+1] = rchild;
-                }
-                aes_ops += laes_ops;
-            } else {
-                size_t curlevel_size = size_t(1)<<level;
-                int nthreads =
-                    int(ceil(sqrt(double(curlevel_size/6000))));
-                if (nthreads > max_nthreads) {
-                    nthreads = max_nthreads;
-                }
-                DPFnode tL[nthreads];
-                DPFnode tR[nthreads];
-                size_t taes_ops[nthreads];
-                size_t threadstart = 0;
-                size_t threadchunk = curlevel_size / nthreads;
-                size_t threadextra = curlevel_size % nthreads;
-                boost::asio::thread_pool pool(nthreads);
-                for (int t=0;t<nthreads;++t) {
-                    size_t threadsize = threadchunk + (size_t(t) < threadextra);
-                    size_t threadend = threadstart + threadsize;
-                    boost::asio::post(pool,
-                        [t, &tL, &tR, &taes_ops, threadstart, threadend,
-                        &curlevel, &nextlevel] {
-                            DPFnode L = _mm_setzero_si128();
-                            DPFnode R = _mm_setzero_si128();
-                            size_t aes_ops = 0;
-                            for(size_t i=threadstart;i<threadend;++i) {
-                                DPFnode lchild, rchild;
-                                prgboth(lchild, rchild, curlevel[i], aes_ops);
-                                L = (L ^ lchild);
-                                R = (R ^ rchild);
-                                nextlevel[2*i] = lchild;
-                                nextlevel[2*i+1] = rchild;
-                            }
-                            tL[t] = L;
-                            tR[t] = R;
-                            taes_ops[t] = aes_ops;
-                        });
-                    threadstart = threadend;
-                }
-                pool.join();
-                for (int t=0;t<nthreads;++t) {
-                    L ^= tL[t];
-                    R ^= tR[t];
-                    aes_ops += taes_ops[t];
-                }
-            }
-        }
-        // If we're going left (bs_choice = 0), we want the correction
-        // word to be the XOR of our right side and our peer's right
-        // side; if bs_choice = 1, it should be the XOR of our left side
-        // and our peer's left side.
-
-        // We also have to ensure that the flag bits (the lsb) of the
-        // side that will end up the same be of course the same, but
-        // also that the flag bits (the lsb) of the side that will end
-        // up different _must_ be different.  That is, it's not enough
-        // for the nodes of the child selected by choice to be different
-        // as 128-bit values; they also have to be different in their
-        // lsb.
-
-        // This is where we make a small optimization over Appendix C of
-        // the Duoram paper: instead of keeping separate correction flag
-        // bits for the left and right children, we observe that the low
-        // bit of the overall correction word effectively serves as one
-        // of those bits, so we just need to store one extra bit per
-        // level, not two.  (We arbitrarily choose the one for the right
-        // child.)
-
-        // Note that the XOR of our left and right child before and
-        // after applying the correction word won't change, since the
-        // correction word is applied to either both children or
-        // neither, depending on the value of the parent's flag. So in
-        // particular, the XOR of the flag bits won't change, and if our
-        // children's flag's XOR equals our peer's children's flag's
-        // XOR, then we won't have different flag bits even for the
-        // children that have different 128-bit values.
-
-        // So we compute our_parity = lsb(L^R)^player, and we XOR that
-        // into the R value in the correction word computation.  At the
-        // same time, we exchange these parity values to compute the
-        // combined parity, which we store in the DPF.  Then when the
-        // DPF is evaluated, if the parent's flag is set, we not only apply
-        // the correction word to both children, but also apply the
-        // (combined) parity bit to just the right child.  Then for
-        // unequal nodes (where the flag bit is different), exactly one
-        // of the four children (two for P0 and two for P1) will have
-        // the parity bit applied, which will set the XOR of the lsb of
-        // those four nodes to just L0^R0^L1^R1^our_parity^peer_parity
-        // = 1 because everything cancels out except player (for which
-        // one player is 0 and the other is 1).
-
-        bool our_parity_bit = get_lsb(L ^ R) ^ !!player;
-        DPFnode our_parity = lsb128_mask[our_parity_bit];
-
-        DPFnode CW;
-        bool peer_parity_bit;
-        // Exchange the parities and do mpc_reconstruct_choice at the
-        // same time (bundled into the same rounds)
-        run_coroutines(yield,
-            [this, &tio, &our_parity_bit, &peer_parity_bit](yield_t &yield) {
-                tio.queue_peer(&our_parity_bit, 1);
-                yield();
-                uint8_t peer_parity_byte;
-                tio.recv_peer(&peer_parity_byte, 1);
-                peer_parity_bit = peer_parity_byte & 1;
-            },
-            [this, &tio, &CW, &L, &R, &bs_choice, &our_parity](yield_t &yield) {
-                mpc_reconstruct_choice(tio, yield, CW, bs_choice,
-                    (R ^ our_parity), L);
-            });
-        bool parity_bit = our_parity_bit ^ peer_parity_bit;
-        cfbits |= (value_t(parity_bit)<<level);
-        DPFnode CWR = CW ^ lsb128_mask[parity_bit];
-        if (player < 2) {
-            // The timing of each iteration of the inner loop is
-            // comparable to the above, so just use the same
-            // computations.  All of this could be tuned, of course.
-
-            if (level < depth-1) {
-                // Using the timing results gathered above, decide whether
-                // to multithread, and if so, how many threads to use.
-                // tio.cpu_nthreads() is the maximum number we have
-                // available.
-                int max_nthreads = tio.cpu_nthreads();
-                if (max_nthreads == 1 || level < 19) {
-                    // No threading
-                    for(size_t i=0;i<curlevel_size;++i) {
-                        bool flag = get_lsb(curlevel[i]);
-                        nextlevel[2*i] = xor_if(nextlevel[2*i], CW, flag);
-                        nextlevel[2*i+1] = xor_if(nextlevel[2*i+1], CWR, flag);
-                    }
-                } else {
-                    int nthreads =
-                        int(ceil(sqrt(double(curlevel_size/6000))));
-                    if (nthreads > max_nthreads) {
-                        nthreads = max_nthreads;
-                    }
-                    size_t threadstart = 0;
-                    size_t threadchunk = curlevel_size / nthreads;
-                    size_t threadextra = curlevel_size % nthreads;
-                    boost::asio::thread_pool pool(nthreads);
-                    for (int t=0;t<nthreads;++t) {
-                        size_t threadsize = threadchunk + (size_t(t) < threadextra);
-                        size_t threadend = threadstart + threadsize;
-                        boost::asio::post(pool, [CW, CWR, threadstart, threadend,
-                            &curlevel, &nextlevel] {
-                                for(size_t i=threadstart;i<threadend;++i) {
-                                    bool flag = get_lsb(curlevel[i]);
-                                    nextlevel[2*i] = xor_if(nextlevel[2*i], CW, flag);
-                                    nextlevel[2*i+1] = xor_if(nextlevel[2*i+1], CWR, flag);
-                                }
-                        });
-                        threadstart = threadend;
-                    }
-                    pool.join();
-                }
-            } else {
-                // Recall there are four potentially useful vectors that
-                // can come out of a DPF:
-                // - (single-bit) bitwise unit vector
-                // - additive-shared unit vector
-                // - XOR-shared scaled unit vector
-                // - additive-shared scaled unit vector
-                //
-                // (No single DPF should be used for both of the first
-                // two or both of the last two, though, since they're
-                // correlated; you _can_ use one of the first two and
-                // one of the last two.)
-                //
-                // For each 128-bit leaf, the low bit is the flag bit,
-                // and we're guaranteed that the flag bits (and indeed
-                // the whole 128-bit value) for P0 and P1 are the same
-                // for every leaf except the target, and that the flag
-                // bits definitely differ for the target (and the other
-                // 127 bits are independently random on each side).
-                //
-                // We divide the 128-bit leaf into a low 64-bit word and
-                // a high 64-bit word.  We use the low word for the unit
-                // vector and the high word for the scaled vector; this
-                // choice is not arbitrary: the flag bit in the low word
-                // means that the sum of all the low words (with P1's
-                // low words negated) across both P0 and P1 is
-                // definitely odd, so we can compute that sum's inverse
-                // mod 2^64, and store it now during precomputation.  At
-                // evaluation time for the additive-shared unit vector,
-                // we will output this global inverse times the low word
-                // of each leaf, which will make the sum of all of those
-                // values 1.  (This technique replaces the protocol in
-                // Appendix D of the Duoram paper.)
-                //
-                // For the scaled vector, we just have to compute shares
-                // of what the scaled vector is a sharing _of_, but
-                // that's just XORing or adding all of each party's
-                // local high words; no communication needed.
-
-                value_t low_sum = 0;
-                value_t high_sum = 0;
-                value_t high_xor = 0;
-                // Using the timing results gathered above, decide whether
-                // to multithread, and if so, how many threads to use.
-                // tio.cpu_nthreads() is the maximum number we have
-                // available.
-                int max_nthreads = tio.cpu_nthreads();
-                if (max_nthreads == 1 || level < 19) {
-                    // No threading
-                    for(size_t i=0;i<curlevel_size;++i) {
-                        bool flag = get_lsb(curlevel[i]);
-                        DPFnode leftchild = xor_if(nextlevel[2*i], CW, flag);
-                        DPFnode rightchild = xor_if(nextlevel[2*i+1], CWR, flag);
-                        if (save_expansion) {
-                            nextlevel[2*i] = leftchild;
-                            nextlevel[2*i+1] = rightchild;
-                        }
-                        value_t leftlow = value_t(_mm_cvtsi128_si64x(leftchild));
-                        value_t rightlow = value_t(_mm_cvtsi128_si64x(rightchild));
-                        value_t lefthigh =
-                            value_t(_mm_cvtsi128_si64x(_mm_srli_si128(leftchild,8)));
-                        value_t righthigh =
-                            value_t(_mm_cvtsi128_si64x(_mm_srli_si128(rightchild,8)));
-                        low_sum += (leftlow + rightlow);
-                        high_sum += (lefthigh + righthigh);
-                        high_xor ^= (lefthigh ^ righthigh);
-                    }
-                } else {
-                    int nthreads =
-                        int(ceil(sqrt(double(curlevel_size/6000))));
-                    if (nthreads > max_nthreads) {
-                        nthreads = max_nthreads;
-                    }
-                    value_t tlow_sum[nthreads];
-                    value_t thigh_sum[nthreads];
-                    value_t thigh_xor[nthreads];
-                    size_t threadstart = 0;
-                    size_t threadchunk = curlevel_size / nthreads;
-                    size_t threadextra = curlevel_size % nthreads;
-                    boost::asio::thread_pool pool(nthreads);
-                    for (int t=0;t<nthreads;++t) {
-                        size_t threadsize = threadchunk + (size_t(t) < threadextra);
-                        size_t threadend = threadstart + threadsize;
-                        boost::asio::post(pool,
-                            [t, &tlow_sum, &thigh_sum, &thigh_xor, threadstart, threadend,
-                            &curlevel, &nextlevel, CW, CWR, save_expansion] {
-                                value_t low_sum = 0;
-                                value_t high_sum = 0;
-                                value_t high_xor = 0;
-                                for(size_t i=threadstart;i<threadend;++i) {
-                                    bool flag = get_lsb(curlevel[i]);
-                                    DPFnode leftchild = xor_if(nextlevel[2*i], CW, flag);
-                                    DPFnode rightchild = xor_if(nextlevel[2*i+1], CWR, flag);
-                                    if (save_expansion) {
-                                        nextlevel[2*i] = leftchild;
-                                        nextlevel[2*i+1] = rightchild;
-                                    }
-                                    value_t leftlow = value_t(_mm_cvtsi128_si64x(leftchild));
-                                    value_t rightlow = value_t(_mm_cvtsi128_si64x(rightchild));
-                                    value_t lefthigh =
-                                        value_t(_mm_cvtsi128_si64x(_mm_srli_si128(leftchild,8)));
-                                    value_t righthigh =
-                                        value_t(_mm_cvtsi128_si64x(_mm_srli_si128(rightchild,8)));
-                                    low_sum += (leftlow + rightlow);
-                                    high_sum += (lefthigh + righthigh);
-                                    high_xor ^= (lefthigh ^ righthigh);
-                                }
-                                tlow_sum[t] = low_sum;
-                                thigh_sum[t] = high_sum;
-                                thigh_xor[t] = high_xor;
-                            });
-                        threadstart = threadend;
-                    }
-                    pool.join();
-                    for (int t=0;t<nthreads;++t) {
-                        low_sum += tlow_sum[t];
-                        high_sum += thigh_sum[t];
-                        high_xor ^= thigh_xor[t];
-                    }
-                }
-                if (player == 1) {
-                    low_sum = -low_sum;
-                    high_sum = -high_sum;
-                }
-                scaled_sum.ashare = high_sum;
-                scaled_xor.xshare = high_xor;
-                // Exchange low_sum and add them up
-                tio.queue_peer(&low_sum, sizeof(low_sum));
-                yield();
-                value_t peer_low_sum;
-                tio.recv_peer(&peer_low_sum, sizeof(peer_low_sum));
-                low_sum += peer_low_sum;
-                // The low_sum had better be odd
-                assert(low_sum & 1);
-                unit_sum_inverse = inverse_value_t(low_sum);
-            }
-            cw.push_back(CW);
-        } else if (level == depth-1) {
-            yield();
-        }
-
-        ++level;
-    }
-
-    delete[] curlevel;
-    if (!save_expansion || player == 2) {
-        delete[] nextlevel;
-    }
-}
-
-// Get the leaf node for the given input
-DPFnode RDPF::leaf(address_t input, size_t &aes_ops) const
-{
-    // If we have a precomputed expansion, just use it
-    if (expansion.size()) {
-        return expansion[input];
-    }
-
-    nbits_t totdepth = depth();
-    DPFnode node = seed;
-    for (nbits_t d=0;d<totdepth;++d) {
-        bit_t dir = !!(input & (address_t(1)<<(totdepth-d-1)));
-        node = descend(node, d, dir, aes_ops);
-    }
-    return node;
-}
-
-// Expand the DPF if it's not already expanded
-//
-// This routine is slightly more efficient than repeatedly calling
-// StreamEval::next(), but it uses a lot more memory.
-void RDPF::expand(size_t &aes_ops)
-{
-    nbits_t depth = this->depth();
-    size_t num_leaves = size_t(1)<<depth;
-    if (expansion.size() == num_leaves) return;
-    expansion.resize(num_leaves);
-    address_t index = 0;
-    address_t lastindex = 0;
-    DPFnode *path = new DPFnode[depth];
-    path[0] = seed;
-    for (nbits_t i=1;i<depth;++i) {
-        path[i] = descend(path[i-1], i-1, 0, aes_ops);
-    }
-    expansion[index++] = descend(path[depth-1], depth-1, 0, aes_ops);
-    expansion[index++] = descend(path[depth-1], depth-1, 1, aes_ops);
-    while(index < num_leaves) {
-        // Invariant: lastindex and index will both be even, and
-        // index=lastindex+2
-        uint64_t index_xor = index ^ lastindex;
-        nbits_t how_many_1_bits = __builtin_popcount(index_xor);
-        // If lastindex -> index goes for example from (in binary)
-        // 010010110 -> 010011000, then index_xor will be
-        // 000001110 and how_many_1_bits will be 3.
-        // That indicates that path[depth-3] was a left child, and now
-        // we need to change it to a right child by descending right
-        // from path[depth-4], and then filling the path after that with
-        // left children.
-        path[depth-how_many_1_bits] =
-            descend(path[depth-how_many_1_bits-1],
-                depth-how_many_1_bits-1, 1, aes_ops);
-        for (nbits_t i = depth-how_many_1_bits; i < depth-1; ++i) {
-            path[i+1] = descend(path[i], i, 0, aes_ops);
-        }
-        lastindex = index;
-        expansion[index++] = descend(path[depth-1], depth-1, 0, aes_ops);
-        expansion[index++] = descend(path[depth-1], depth-1, 1, aes_ops);
-    }
-
-    delete[] path;
-}
-
-// Construct three RDPFs of the given depth all with the same randomly
-// generated target index.
-RDPFTriple::RDPFTriple(MPCTIO &tio, yield_t &yield,
-    nbits_t depth, bool save_expansion)
-{
-    // Pick a random XOR share of the target
-    xs_target.randomize(depth);
-
-    // Now create three RDPFs with that target, and also convert the XOR
-    // shares of the target to additive shares
-    std::vector<coro_t> coroutines;
-    for (int i=0;i<3;++i) {
-        coroutines.emplace_back(
-            [this, &tio, depth, i, save_expansion](yield_t &yield) {
-                dpf[i] = RDPF(tio, yield, xs_target, depth,
-                    save_expansion);
-            });
-    }
-    coroutines.emplace_back(
-        [this, &tio, depth](yield_t &yield) {
-            mpc_xs_to_as(tio, yield, as_target, xs_target, depth, false);
-        });
-    run_coroutines(yield, coroutines);
-}
-
-RDPFTriple::node RDPFTriple::descend(const RDPFTriple::node &parent,
-    nbits_t parentdepth, bit_t whichchild,
-    size_t &aes_ops) const
-{
-    auto [P0, P1, P2] = parent;
-    DPFnode C0, C1, C2;
-    C0 = dpf[0].descend(P0, parentdepth, whichchild, aes_ops);
-    C1 = dpf[1].descend(P1, parentdepth, whichchild, aes_ops);
-    C2 = dpf[2].descend(P2, parentdepth, whichchild, aes_ops);
-    return std::make_tuple(C0,C1,C2);
-}
-
-RDPFPair::node RDPFPair::descend(const RDPFPair::node &parent,
-    nbits_t parentdepth, bit_t whichchild,
-    size_t &aes_ops) const
-{
-    auto [P0, P1] = parent;
-    DPFnode C0, C1;
-    C0 = dpf[0].descend(P0, parentdepth, whichchild, aes_ops);
-    C1 = dpf[1].descend(P1, parentdepth, whichchild, aes_ops);
-    return std::make_tuple(C0,C1);
-}

+ 6 - 3
rdpf.hpp

@@ -13,6 +13,7 @@
 // DPFs for oblivious random accesses to memory.  See dpf.hpp for the
 // differences between the different kinds of DPFs.
 
+template <nbits_t WIDTH>
 struct RDPF : public DPF {
     // The amount we have to scale the low words of the leaf values by
     // to get additive shares of a unit vector
@@ -109,13 +110,14 @@ struct RDPF : public DPF {
 // target index), while the server will hold a RDPFPair (which does
 // not).
 
+template <nbits_t WIDTH>
 struct RDPFTriple {
     // The type of node triples
     using node = std::tuple<DPFnode, DPFnode, DPFnode>;
 
     RegAS as_target;
     RegXS xs_target;
-    RDPF dpf[3];
+    RDPF<WIDTH> dpf[3];
 
     // The depth
     inline nbits_t depth() const { return dpf[0].depth(); }
@@ -209,11 +211,12 @@ struct RDPFTriple {
     }
 };
 
+template <nbits_t WIDTH>
 struct RDPFPair {
     // The type of node pairs
     using node = std::tuple<DPFnode, DPFnode>;
 
-    RDPF dpf[2];
+    RDPF<WIDTH> dpf[2];
 
     RDPFPair() {}
 
@@ -221,7 +224,7 @@ struct RDPFPair {
     // and dropping one.  This _moves_ the dpfs from the triple to the
     // pair, so the triple will no longer be valid after using this.
     // which0 and which1 indicate which of the dpfs to keep.
-    RDPFPair(RDPFTriple &&trip, int which0, int which1) {
+    RDPFPair(RDPFTriple<WIDTH> &&trip, int which0, int which1) {
         dpf[0] = std::move(trip.dpf[which0]);
         dpf[1] = std::move(trip.dpf[which1]);
     }
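
A usage sketch of the pair-from-triple move constructor above, following the read path in duoram.tcc: fetch a triple, then move two of its three DPFs into a pair (which two depends on the player), after which the triple must not be used:

    RDPFTriple<1> dt = shape.tio.rdpftriple(shape.yield, shape.addr_size);
    // Keep dpf[0] plus dpf[2] (player 0) or dpf[1] (player 1)
    RDPFPair<1> dp(std::move(dt), 0, player == 0 ? 2 : 1);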

+ 537 - 14
rdpf.tcc

@@ -1,5 +1,23 @@
 // Templated method implementations for rdpf.hpp
 
+#include "mpcops.hpp"
+
+// Compute the multiplicative inverse of x mod 2^{VALUE_BITS}
+// This is the same as computing x to the power of
+// 2^{VALUE_BITS-1}-1.
+static value_t inverse_value_t(value_t x)
+{
+    int expon = 1;
+    value_t xe = x;
+    // Invariant: xe = x^(2^expon - 1) mod 2^{VALUE_BITS}
+    // Goal: compute x^(2^{VALUE_BITS-1} - 1)
+    while (expon < VALUE_BITS-1) {
+        xe = xe * xe * x;
+        ++expon;
+    }
+    return xe;
+}
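
Why this exponent gives the inverse: for odd x, x^{2^{VALUE_BITS-2}} = 1 mod 2^{VALUE_BITS}, so x^{2^{VALUE_BITS-1}-1} = (x^{2^{VALUE_BITS-2}})^2 * x^{-1} = x^{-1}. A minimal self-contained check of the routine above, assuming (as hypothetical stand-ins for the typedefs in types.hpp) that value_t is uint64_t and VALUE_BITS is 64:

    #include <cassert>
    #include <cstdint>

    using value_t = uint64_t;          // hypothetical stand-in
    static const int VALUE_BITS = 64;  // hypothetical stand-in

    // Same loop as above: invariant xe = x^(2^expon - 1) mod 2^{VALUE_BITS}
    static value_t inverse_value_t(value_t x)
    {
        int expon = 1;
        value_t xe = x;
        while (expon < VALUE_BITS-1) {
            xe = xe * xe * x;
            ++expon;
        }
        return xe;
    }

    int main()
    {
        value_t x = 0x123456789abcdef1;      // any odd value
        assert(x * inverse_value_t(x) == 1); // x times its inverse is 1 mod 2^64
        return 0;
    }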
+
 // Create a StreamEval object that will start its output at index start.
 // It will wrap around to 0 when it hits 2^depth.  If use_expansion
 // is true, then if the DPF has been expanded, just output values
@@ -163,8 +181,8 @@ inline V ParallelEval<T>::reduce(V init, W process)
 
 // I/O for RDPFs
 
-template <typename T>
-T& operator>>(T &is, RDPF &rdpf)
+template <typename T, nbits_t WIDTH>
+T& operator>>(T &is, RDPF<WIDTH> &rdpf)
 {
     is.read((char *)&rdpf.seed, sizeof(rdpf.seed));
     rdpf.whichhalf = get_lsb(rdpf.seed);
@@ -201,8 +219,8 @@ T& operator>>(T &is, RDPF &rdpf)
 // Write the DPF to the output stream.  If expanded=true, then include
 // the expansion _if_ the DPF is itself already expanded.  You can use
 // this to write DPFs to files.
-template <typename T>
-T& write_maybe_expanded(T &os, const RDPF &rdpf,
+template <typename T, nbits_t WIDTH>
+T& write_maybe_expanded(T &os, const RDPF<WIDTH> &rdpf,
     bool expanded = true)
 {
     os.write((const char *)&rdpf.seed, sizeof(rdpf.seed));
@@ -233,8 +251,8 @@ T& write_maybe_expanded(T &os, const RDPF &rdpf,
 
 // The ordinary << version never writes the expansion, since this is
 // what we use to send DPFs over the network.
-template <typename T>
-T& operator<<(T &os, const RDPF &rdpf)
+template <typename T, nbits_t WIDTH>
+T& operator<<(T &os, const RDPF<WIDTH> &rdpf)
 {
     return write_maybe_expanded(os, rdpf, false);
 }
@@ -243,8 +261,8 @@ T& operator<<(T &os, const RDPF &rdpf)
 
 // We never write RDPFTriples over the network, so always write
 // the DPF expansions if they're available.
-template <typename T>
-T& operator<<(T &os, const RDPFTriple &rdpftrip)
+template <typename T, nbits_t WIDTH>
+T& operator<<(T &os, const RDPFTriple<WIDTH> &rdpftrip)
 {
     write_maybe_expanded(os, rdpftrip.dpf[0], true);
     write_maybe_expanded(os, rdpftrip.dpf[1], true);
@@ -255,8 +273,8 @@ T& operator<<(T &os, const RDPFTriple &rdpftrip)
     return os;
 }
 
-template <typename T>
-T& operator>>(T &is, RDPFTriple &rdpftrip)
+template <typename T, nbits_t WIDTH>
+T& operator>>(T &is, RDPFTriple<WIDTH> &rdpftrip)
 {
     is >> rdpftrip.dpf[0] >> rdpftrip.dpf[1] >> rdpftrip.dpf[2];
     nbits_t depth = rdpftrip.dpf[0].depth();
@@ -271,17 +289,522 @@ T& operator>>(T &is, RDPFTriple &rdpftrip)
 
 // We never write RDPFPairs over the network, so always write
 // the DPF expansions if they're available.
-template <typename T>
-T& operator<<(T &os, const RDPFPair &rdpfpair)
+template <typename T, nbits_t WIDTH>
+T& operator<<(T &os, const RDPFPair<WIDTH> &rdpfpair)
 {
     write_maybe_expanded(os, rdpfpair.dpf[0], true);
     write_maybe_expanded(os, rdpfpair.dpf[1], true);
     return os;
 }
 
-template <typename T>
-T& operator>>(T &is, RDPFPair &rdpfpair)
+template <typename T, nbits_t WIDTH>
+T& operator>>(T &is, RDPFPair<WIDTH> &rdpfpair)
 {
     is >> rdpfpair.dpf[0] >> rdpfpair.dpf[1];
     return is;
 }
+
+// Construct a DPF with the given (XOR-shared) target location, and
+// of the given depth, to be used for random-access memory reads and
+// writes.  The DPF is constructed collaboratively by P0 and P1,
+// with the server P2 helping by providing various kinds of
+// correlated randomness, such as MultTriples and AndTriples.
+//
+// This algorithm is based on Appendix C from the Duoram paper, with a
+// small optimization noted below.
+template <nbits_t WIDTH>
+RDPF<WIDTH>::RDPF(MPCTIO &tio, yield_t &yield,
+    RegXS target, nbits_t depth, bool save_expansion)
+{
+    int player = tio.player();
+    size_t &aes_ops = tio.aes_ops();
+
+    // Choose a random seed
+    arc4random_buf(&seed, sizeof(seed));
+    // Ensure the flag bits (the lsb of each node) are different
+    seed = set_lsb(seed, !!player);
+    cfbits = 0;
+    whichhalf = (player == 1);
+
+    // The root level is just the seed
+    nbits_t level = 0;
+    DPFnode *curlevel = NULL;
+    DPFnode *nextlevel = new DPFnode[1];
+    nextlevel[0] = seed;
+
+    // Construct each intermediate level
+    while(level < depth) {
+        if (player < 2) {
+            delete[] curlevel;
+            curlevel = nextlevel;
+            if (save_expansion && level == depth-1) {
+                expansion.resize(1<<depth);
+                nextlevel = expansion.data();
+            } else {
+                nextlevel = new DPFnode[1<<(level+1)];
+            }
+        }
+        // Invariant: curlevel has 2^level elements; nextlevel has
+        // 2^{level+1} elements
+
+        // The bit-shared choice bit is bit (depth-level-1) of the
+        // XOR-shared target index
+        RegBS bs_choice = target.bit(depth-level-1);
+        size_t curlevel_size = (size_t(1)<<level);
+        DPFnode L = _mm_setzero_si128();
+        DPFnode R = _mm_setzero_si128();
+        // The server doesn't need to do this computation, but it does
+        // need to execute mpc_reconstruct_choice so that it sends
+        // the AndTriples at the appropriate time.
+        if (player < 2) {
+#ifdef RDPF_MTGEN_TIMING_1
+            if (player == 0) {
+                mtgen_timetest_1(level, 0, (1<<23)>>level, curlevel,
+                    nextlevel, aes_ops);
+                size_t niters = 2048;
+                if (level > 8) niters = (1<<20)>>level;
+                for(int t=1;t<=8;++t) {
+                    mtgen_timetest_1(level, t, niters, curlevel,
+                        nextlevel, aes_ops);
+                }
+                mtgen_timetest_1(level, 0, (1<<23)>>level, curlevel,
+                    nextlevel, aes_ops);
+            }
+#endif
+            // Using the timing results gathered above, decide whether
+            // to multithread, and if so, how many threads to use.
+            // tio.cpu_nthreads() is the maximum number we have
+            // available.
+            int max_nthreads = tio.cpu_nthreads();
+            if (max_nthreads == 1 || level < 19) {
+                // No threading
+                size_t laes_ops = 0;
+                for(size_t i=0;i<curlevel_size;++i) {
+                    DPFnode lchild, rchild;
+                    prgboth(lchild, rchild, curlevel[i], laes_ops);
+                    L = (L ^ lchild);
+                    R = (R ^ rchild);
+                    nextlevel[2*i] = lchild;
+                    nextlevel[2*i+1] = rchild;
+                }
+                aes_ops += laes_ops;
+            } else {
+                size_t curlevel_size = size_t(1)<<level;
+                int nthreads =
+                    int(ceil(sqrt(double(curlevel_size/6000))));
+                if (nthreads > max_nthreads) {
+                    nthreads = max_nthreads;
+                }
+                DPFnode tL[nthreads];
+                DPFnode tR[nthreads];
+                size_t taes_ops[nthreads];
+                size_t threadstart = 0;
+                size_t threadchunk = curlevel_size / nthreads;
+                size_t threadextra = curlevel_size % nthreads;
+                boost::asio::thread_pool pool(nthreads);
+                for (int t=0;t<nthreads;++t) {
+                    size_t threadsize = threadchunk + (size_t(t) < threadextra);
+                    size_t threadend = threadstart + threadsize;
+                    boost::asio::post(pool,
+                        [t, &tL, &tR, &taes_ops, threadstart, threadend,
+                        &curlevel, &nextlevel] {
+                            DPFnode L = _mm_setzero_si128();
+                            DPFnode R = _mm_setzero_si128();
+                            size_t aes_ops = 0;
+                            for(size_t i=threadstart;i<threadend;++i) {
+                                DPFnode lchild, rchild;
+                                prgboth(lchild, rchild, curlevel[i], aes_ops);
+                                L = (L ^ lchild);
+                                R = (R ^ rchild);
+                                nextlevel[2*i] = lchild;
+                                nextlevel[2*i+1] = rchild;
+                            }
+                            tL[t] = L;
+                            tR[t] = R;
+                            taes_ops[t] = aes_ops;
+                        });
+                    threadstart = threadend;
+                }
+                pool.join();
+                for (int t=0;t<nthreads;++t) {
+                    L ^= tL[t];
+                    R ^= tR[t];
+                    aes_ops += taes_ops[t];
+                }
+            }
+        }
+        // If we're going left (bs_choice = 0), we want the correction
+        // word to be the XOR of our right side and our peer's right
+        // side; if bs_choice = 1, it should be the XOR of our left side
+        // and our peer's left side.
+
+        // We also have to ensure that the flag bits (the lsb) of the
+        // side that will end up the same be of course the same, but
+        // also that the flag bits (the lsb) of the side that will end
+        // up different _must_ be different.  That is, it's not enough
+        // for the nodes of the child selected by choice to be different
+        // as 128-bit values; they also have to be different in their
+        // lsb.
+
+        // This is where we make a small optimization over Appendix C of
+        // the Duoram paper: instead of keeping separate correction flag
+        // bits for the left and right children, we observe that the low
+        // bit of the overall correction word effectively serves as one
+        // of those bits, so we just need to store one extra bit per
+        // level, not two.  (We arbitrarily choose the one for the right
+        // child.)
+
+        // Note that the XOR of our left and right child before and
+        // after applying the correction word won't change, since the
+        // correction word is applied to either both children or
+        // neither, depending on the value of the parent's flag. So in
+        // particular, the XOR of the flag bits won't change, and if our
+        // children's flag's XOR equals our peer's children's flag's
+        // XOR, then we won't have different flag bits even for the
+        // children that have different 128-bit values.
+
+        // So we compute our_parity = lsb(L^R)^player, and we XOR that
+        // into the R value in the correction word computation.  At the
+        // same time, we exchange these parity values to compute the
+        // combined parity, which we store in the DPF.  Then when the
+        // DPF is evaluated, if the parent's flag is set, we not only apply
+        // the correction word to both children, but also apply the
+        // (combined) parity bit to just the right child.  Then for
+        // unequal nodes (where the flag bit is different), exactly one
+        // of the four children (two for P0 and two for P1) will have
+        // the parity bit applied, which will set the XOR of the lsb of
+        // those four nodes to just L0^R0^L1^R1^our_parity^peer_parity
+        // = 1 because everything cancels out except player (for which
+        // one player is 0 and the other is 1).
+
+        bool our_parity_bit = get_lsb(L ^ R) ^ !!player;
+        DPFnode our_parity = lsb128_mask[our_parity_bit];
+
+        DPFnode CW;
+        bool peer_parity_bit;
+        // Exchange the parities and do mpc_reconstruct_choice at the
+        // same time (bundled into the same rounds)
+        run_coroutines(yield,
+            [this, &tio, &our_parity_bit, &peer_parity_bit](yield_t &yield) {
+                tio.queue_peer(&our_parity_bit, 1);
+                yield();
+                uint8_t peer_parity_byte;
+                tio.recv_peer(&peer_parity_byte, 1);
+                peer_parity_bit = peer_parity_byte & 1;
+            },
+            [this, &tio, &CW, &L, &R, &bs_choice, &our_parity](yield_t &yield) {
+                mpc_reconstruct_choice(tio, yield, CW, bs_choice,
+                    (R ^ our_parity), L);
+            });
+        bool parity_bit = our_parity_bit ^ peer_parity_bit;
+        cfbits |= (value_t(parity_bit)<<level);
+        DPFnode CWR = CW ^ lsb128_mask[parity_bit];
+        if (player < 2) {
+            // The timing of each iteration of the inner loop is
+            // comparable to the above, so just use the same
+            // computations.  All of this could be tuned, of course.
+
+            if (level < depth-1) {
+                // Using the timing results gathered above, decide whether
+                // to multithread, and if so, how many threads to use.
+                // tio.cpu_nthreads() is the maximum number we have
+                // available.
+                int max_nthreads = tio.cpu_nthreads();
+                if (max_nthreads == 1 || level < 19) {
+                    // No threading
+                    for(size_t i=0;i<curlevel_size;++i) {
+                        bool flag = get_lsb(curlevel[i]);
+                        nextlevel[2*i] = xor_if(nextlevel[2*i], CW, flag);
+                        nextlevel[2*i+1] = xor_if(nextlevel[2*i+1], CWR, flag);
+                    }
+                } else {
+                    int nthreads =
+                        int(ceil(sqrt(double(curlevel_size/6000))));
+                    if (nthreads > max_nthreads) {
+                        nthreads = max_nthreads;
+                    }
+                    size_t threadstart = 0;
+                    size_t threadchunk = curlevel_size / nthreads;
+                    size_t threadextra = curlevel_size % nthreads;
+                    boost::asio::thread_pool pool(nthreads);
+                    for (int t=0;t<nthreads;++t) {
+                        size_t threadsize = threadchunk + (size_t(t) < threadextra);
+                        size_t threadend = threadstart + threadsize;
+                        boost::asio::post(pool, [CW, CWR, threadstart, threadend,
+                            &curlevel, &nextlevel] {
+                                for(size_t i=threadstart;i<threadend;++i) {
+                                    bool flag = get_lsb(curlevel[i]);
+                                    nextlevel[2*i] = xor_if(nextlevel[2*i], CW, flag);
+                                    nextlevel[2*i+1] = xor_if(nextlevel[2*i+1], CWR, flag);
+                                }
+                        });
+                        threadstart = threadend;
+                    }
+                    pool.join();
+                }
+            } else {
+                // Recall there are four potentially useful vectors that
+                // can come out of a DPF:
+                // - (single-bit) bitwise unit vector
+                // - additive-shared unit vector
+                // - XOR-shared scaled unit vector
+                // - additive-shared scaled unit vector
+                //
+                // (No single DPF should be used for both of the first
+                // two or both of the last two, though, since they're
+                // correlated; you _can_ use one of the first two and
+                // one of the last two.)
+                //
+                // For each 128-bit leaf, the low bit is the flag bit,
+                // and we're guaranteed that the flag bits (and indeed
+                // the whole 128-bit value) for P0 and P1 are the same
+                // for every leaf except the target, and that the flag
+                // bits definitely differ for the target (and the other
+                // 127 bits are independently random on each side).
+                //
+                // We divide the 128-bit leaf into a low 64-bit word and
+                // a high 64-bit word.  We use the low word for the unit
+                // vector and the high word for the scaled vector; this
+                // choice is not arbitrary: the flag bit in the low word
+                // means that the sum of all the low words (with P1's
+                // low words negated) across both P0 and P1 is
+                // definitely odd, so we can compute that sum's inverse
+                // mod 2^64, and store it now during precomputation.  At
+                // evaluation time for the additive-shared unit vector,
+                // we will output this global inverse times the low word
+                // of each leaf, which will make the sum of all of those
+                // values 1.  (This technique replaces the protocol in
+                // Appendix D of the Duoram paper.)
+                //
+                // For the scaled vector, we just have to compute shares
+                // of what the scaled vector is a sharing _of_, but
+                // that's just XORing or adding all of each party's
+                // local high words; no communication needed.
+
+                value_t low_sum = 0;
+                value_t high_sum = 0;
+                value_t high_xor = 0;
+                // Using the timing results gathered above, decide whether
+                // to multithread, and if so, how many threads to use.
+                // tio.cpu_nthreads() is the maximum number we have
+                // available.
+                int max_nthreads = tio.cpu_nthreads();
+                if (max_nthreads == 1 || level < 19) {
+                    // No threading
+                    for(size_t i=0;i<curlevel_size;++i) {
+                        bool flag = get_lsb(curlevel[i]);
+                        DPFnode leftchild = xor_if(nextlevel[2*i], CW, flag);
+                        DPFnode rightchild = xor_if(nextlevel[2*i+1], CWR, flag);
+                        if (save_expansion) {
+                            nextlevel[2*i] = leftchild;
+                            nextlevel[2*i+1] = rightchild;
+                        }
+                        value_t leftlow = value_t(_mm_cvtsi128_si64x(leftchild));
+                        value_t rightlow = value_t(_mm_cvtsi128_si64x(rightchild));
+                        value_t lefthigh =
+                            value_t(_mm_cvtsi128_si64x(_mm_srli_si128(leftchild,8)));
+                        value_t righthigh =
+                            value_t(_mm_cvtsi128_si64x(_mm_srli_si128(rightchild,8)));
+                        low_sum += (leftlow + rightlow);
+                        high_sum += (lefthigh + righthigh);
+                        high_xor ^= (lefthigh ^ righthigh);
+                    }
+                } else {
+                    int nthreads =
+                        int(ceil(sqrt(double(curlevel_size/6000))));
+                    if (nthreads > max_nthreads) {
+                        nthreads = max_nthreads;
+                    }
+                    value_t tlow_sum[nthreads];
+                    value_t thigh_sum[nthreads];
+                    value_t thigh_xor[nthreads];
+                    size_t threadstart = 0;
+                    size_t threadchunk = curlevel_size / nthreads;
+                    size_t threadextra = curlevel_size % nthreads;
+                    boost::asio::thread_pool pool(nthreads);
+                    for (int t=0;t<nthreads;++t) {
+                        size_t threadsize = threadchunk + (size_t(t) < threadextra);
+                        size_t threadend = threadstart + threadsize;
+                        boost::asio::post(pool,
+                            [t, &tlow_sum, &thigh_sum, &thigh_xor, threadstart, threadend,
+                            &curlevel, &nextlevel, CW, CWR, save_expansion] {
+                                value_t low_sum = 0;
+                                value_t high_sum = 0;
+                                value_t high_xor = 0;
+                                for(size_t i=threadstart;i<threadend;++i) {
+                                    bool flag = get_lsb(curlevel[i]);
+                                    DPFnode leftchild = xor_if(nextlevel[2*i], CW, flag);
+                                    DPFnode rightchild = xor_if(nextlevel[2*i+1], CWR, flag);
+                                    if (save_expansion) {
+                                        nextlevel[2*i] = leftchild;
+                                        nextlevel[2*i+1] = rightchild;
+                                    }
+                                    value_t leftlow = value_t(_mm_cvtsi128_si64x(leftchild));
+                                    value_t rightlow = value_t(_mm_cvtsi128_si64x(rightchild));
+                                    value_t lefthigh =
+                                        value_t(_mm_cvtsi128_si64x(_mm_srli_si128(leftchild,8)));
+                                    value_t righthigh =
+                                        value_t(_mm_cvtsi128_si64x(_mm_srli_si128(rightchild,8)));
+                                    low_sum += (leftlow + rightlow);
+                                    high_sum += (lefthigh + righthigh);
+                                    high_xor ^= (lefthigh ^ righthigh);
+                                }
+                                tlow_sum[t] = low_sum;
+                                thigh_sum[t] = high_sum;
+                                thigh_xor[t] = high_xor;
+                            });
+                        threadstart = threadend;
+                    }
+                    pool.join();
+                    for (int t=0;t<nthreads;++t) {
+                        low_sum += tlow_sum[t];
+                        high_sum += thigh_sum[t];
+                        high_xor ^= thigh_xor[t];
+                    }
+                }
+                if (player == 1) {
+                    low_sum = -low_sum;
+                    high_sum = -high_sum;
+                }
+                scaled_sum.ashare = high_sum;
+                scaled_xor.xshare = high_xor;
+                // Exchange low_sum and add them up
+                tio.queue_peer(&low_sum, sizeof(low_sum));
+                yield();
+                value_t peer_low_sum;
+                tio.recv_peer(&peer_low_sum, sizeof(peer_low_sum));
+                low_sum += peer_low_sum;
+                // The low_sum had better be odd
+                assert(low_sum & 1);
+                unit_sum_inverse = inverse_value_t(low_sum);
+            }
+            cw.push_back(CW);
+        } else if (level == depth-1) {
+            yield();
+        }
+
+        ++level;
+    }
+
+    delete[] curlevel;
+    if (!save_expansion || player == 2) {
+        delete[] nextlevel;
+    }
+}
+
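The low-word/inverse technique described in the comment above can be checked in isolation. Below is a minimal standalone sketch (illustrative only, not part of this commit): `modinv64` is a made-up stand-in for the role `inverse_value_t` plays in the real code, computing the inverse of an odd value modulo 2^64, and the final assertion shows why scaling every party's low word by that inverse makes the reconstructed unit value exactly 1.

    #include <cassert>
    #include <cstdint>

    // Inverse of an odd value modulo 2^64 by Newton iteration; each step
    // doubles the number of correct low-order bits (3 -> 6 -> ... -> 96).
    static uint64_t modinv64(uint64_t a) {
        assert(a & 1);          // only odd values are invertible mod 2^64
        uint64_t x = a;         // a*a == 1 (mod 8), so x is correct to 3 bits
        for (int i = 0; i < 5; ++i) {
            x *= 2 - a * x;
        }
        return x;
    }

    int main() {
        // Stand-ins for the two players' low-word sums; P1 negates its sum,
        // and the flag bit of the single differing leaf makes the total odd.
        uint64_t p0_low_sum = 0x123456789abcdef5ULL;
        uint64_t p1_low_sum = -uint64_t(0x123456789abcdef0ULL);
        uint64_t low_sum = p0_low_sum + p1_low_sum;   // odd (here, 5)
        uint64_t unit_sum_inverse = modinv64(low_sum);
        // Scaling every low word by unit_sum_inverse scales the grand total
        // to exactly 1 mod 2^64, giving an additive sharing of a unit vector.
        assert(unit_sum_inverse * low_sum == 1);
        return 0;
    }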
+// Get the leaf node for the given input
+template <nbits_t WIDTH>
+DPFnode RDPF<WIDTH>::leaf(address_t input, size_t &aes_ops) const
+{
+    // If we have a precomputed expansion, just use it
+    if (expansion.size()) {
+        return expansion[input];
+    }
+
+    nbits_t totdepth = depth();
+    DPFnode node = seed;
+    for (nbits_t d=0;d<totdepth;++d) {
+        bit_t dir = !!(input & (address_t(1)<<(totdepth-d-1)));
+        node = descend(node, d, dir, aes_ops);
+    }
+    return node;
+}
+
+// Expand the DPF if it's not already expanded
+//
+// This routine is slightly more efficient than repeatedly calling
+// StreamEval::next(), but it uses a lot more memory.
+template <nbits_t WIDTH>
+void RDPF<WIDTH>::expand(size_t &aes_ops)
+{
+    nbits_t depth = this->depth();
+    size_t num_leaves = size_t(1)<<depth;
+    if (expansion.size() == num_leaves) return;
+    expansion.resize(num_leaves);
+    address_t index = 0;
+    address_t lastindex = 0;
+    DPFnode *path = new DPFnode[depth];
+    path[0] = seed;
+    for (nbits_t i=1;i<depth;++i) {
+        path[i] = descend(path[i-1], i-1, 0, aes_ops);
+    }
+    expansion[index++] = descend(path[depth-1], depth-1, 0, aes_ops);
+    expansion[index++] = descend(path[depth-1], depth-1, 1, aes_ops);
+    while(index < num_leaves) {
+        // Invariant: lastindex and index will both be even, and
+        // index=lastindex+2
+        uint64_t index_xor = index ^ lastindex;
+        nbits_t how_many_1_bits = __builtin_popcountll(index_xor);
+        // If lastindex -> index goes for example from (in binary)
+        // 010010110 -> 010011000, then index_xor will be
+        // 000001110 and how_many_1_bits will be 3.
+        // That indicates that path[depth-3] was a left child, and now
+        // we need to change it to a right child by descending right
+        // from path[depth-4], and then filling the path after that with
+        // left children.
+        path[depth-how_many_1_bits] =
+            descend(path[depth-how_many_1_bits-1],
+                depth-how_many_1_bits-1, 1, aes_ops);
+        for (nbits_t i = depth-how_many_1_bits; i < depth-1; ++i) {
+            path[i+1] = descend(path[i], i, 0, aes_ops);
+        }
+        lastindex = index;
+        expansion[index++] = descend(path[depth-1], depth-1, 0, aes_ops);
+        expansion[index++] = descend(path[depth-1], depth-1, 1, aes_ops);
+    }
+
+    delete[] path;
+}
+
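The path-reuse step above depends on the shape of index ^ lastindex when an even counter is stepped by 2. A small standalone check of that invariant (illustrative only, not part of this commit):

    #include <cassert>
    #include <cstdint>

    int main() {
        const int depth = 9;
        uint64_t lastindex = 0;
        for (uint64_t index = 2; index < (uint64_t(1) << depth); index += 2) {
            uint64_t index_xor = index ^ lastindex;
            int how_many_1_bits = __builtin_popcountll(index_xor);
            // The changed bits form a contiguous run of ones just above the
            // (always zero) low bit, so only the deepest how_many_1_bits
            // entries of path[] need recomputing: one right child, then all
            // left children below it.
            assert(index_xor == (uint64_t(1) << (how_many_1_bits + 1)) - 2);
            assert(how_many_1_bits <= depth - 1);
            lastindex = index;
        }
        return 0;
    }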
+// Construct three RDPFs of the given depth all with the same randomly
+// generated target index.
+template <nbits_t WIDTH>
+RDPFTriple<WIDTH>::RDPFTriple(MPCTIO &tio, yield_t &yield,
+    nbits_t depth, bool save_expansion)
+{
+    // Pick a random XOR share of the target
+    xs_target.randomize(depth);
+
+    // Now create three RDPFs with that target, and also convert the XOR
+    // shares of the target to additive shares
+    std::vector<coro_t> coroutines;
+    for (int i=0;i<3;++i) {
+        coroutines.emplace_back(
+            [this, &tio, depth, i, save_expansion](yield_t &yield) {
+                dpf[i] = RDPF<WIDTH>(tio, yield, xs_target, depth,
+                    save_expansion);
+            });
+    }
+    coroutines.emplace_back(
+        [this, &tio, depth](yield_t &yield) {
+            mpc_xs_to_as(tio, yield, as_target, xs_target, depth, false);
+        });
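+    // Run the three DPF generations and the share conversion
+    // concurrently, so that their communication is bundled into the
+    // same rounds.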
+    run_coroutines(yield, coroutines);
+}
+
+template <nbits_t WIDTH>
+typename RDPFTriple<WIDTH>::node RDPFTriple<WIDTH>::descend(
+    const RDPFTriple<WIDTH>::node &parent,
+    nbits_t parentdepth, bit_t whichchild,
+    size_t &aes_ops) const
+{
+    auto [P0, P1, P2] = parent;
+    DPFnode C0, C1, C2;
+    C0 = dpf[0].descend(P0, parentdepth, whichchild, aes_ops);
+    C1 = dpf[1].descend(P1, parentdepth, whichchild, aes_ops);
+    C2 = dpf[2].descend(P2, parentdepth, whichchild, aes_ops);
+    return std::make_tuple(C0,C1,C2);
+}
+
+template <nbits_t WIDTH>
+typename RDPFPair<WIDTH>::node RDPFPair<WIDTH>::descend(
+    const RDPFPair<WIDTH>::node &parent,
+    nbits_t parentdepth, bit_t whichchild,
+    size_t &aes_ops) const
+{
+    auto [P0, P1] = parent;
+    DPFnode C0, C1;
+    C0 = dpf[0].descend(P0, parentdepth, whichchild, aes_ops);
+    C1 = dpf[1].descend(P1, parentdepth, whichchild, aes_ops);
+    return std::make_tuple(C0,C1);
+}
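For evaluating at a single point, these descend() routines compose exactly like RDPF<WIDTH>::leaf() above. A minimal sketch (illustrative only, not part of this commit), assuming the declarations from rdpf.hpp and a hypothetical pair_seed argument standing in for however the caller obtains the pair's root node, which is not shown in this hunk:

    // Evaluate an RDPFPair at one input by walking the bits of the input
    // from most to least significant, descending both DPFs in lockstep.
    template <nbits_t WIDTH>
    typename RDPFPair<WIDTH>::node pair_eval_point(const RDPFPair<WIDTH> &dp,
        const typename RDPFPair<WIDTH>::node &pair_seed, nbits_t depth,
        address_t input, size_t &aes_ops)
    {
        typename RDPFPair<WIDTH>::node node = pair_seed;
        for (nbits_t d = 0; d < depth; ++d) {
            bit_t dir = !!(input & (address_t(1) << (depth-d-1)));
            node = dp.descend(node, d, dir, aes_ops);
        }
        return node;
    }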

+ 2 - 2
types.hpp

@@ -643,9 +643,9 @@ struct ValSelectTripleName { static constexpr const char *name = "s"; };
 
 // These are defined in rdpf.hpp, but declared here to avoid cyclic
 // header dependencies.
-struct RDPFPair;
+template <nbits_t WIDTH> struct RDPFPair;
 struct RDPFPairName { static constexpr const char *name = "r"; };
-struct RDPFTriple;
+template <nbits_t WIDTH> struct RDPFTriple;
 struct RDPFTripleName { static constexpr const char *name = "r"; };
 struct CDPF;
 struct CDPFName { static constexpr const char *name = "c"; };