
Add a template parameter to RDPF, RDPFPair, RDPFTriple for the leaf width

The parameter exists, but is not yet used; the width is still always 1
Ian Goldberg, 1 year ago
commit a9e39d265e
9 changed files with 591 additions and 578 deletions
  1. cell.cpp     +2 -1
  2. duoram.tcc   +9 -9
  3. mpcio.cpp    +6 -6
  4. mpcio.hpp    +4 -4
  5. online.cpp   +25 -25
  6. rdpf.cpp     +0 -514
  7. rdpf.hpp     +6 -3
  8. rdpf.tcc     +537 -14
  9. types.hpp    +2 -2
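
The shape of the change: each type in the RDPF family gains a leaf-width template parameter WIDTH, and every existing call site pins it to 1. A minimal sketch of the new declarations and a typical call site, simplified from rdpf.hpp and mpcio.hpp (member lists elided):

    template <nbits_t WIDTH> struct RDPF : public DPF { /* ... */ };
    template <nbits_t WIDTH> struct RDPFTriple { RDPF<WIDTH> dpf[3]; /* ... */ };
    template <nbits_t WIDTH> struct RDPFPair   { RDPF<WIDTH> dpf[2]; /* ... */ };

    // All current instantiations use WIDTH = 1, for example:
    RDPFTriple<1> dt = tio.rdpftriple(yield, depth);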

+ 2 - 1
cell.cpp

@@ -91,7 +91,8 @@ struct Cell {
     // Note that RegXS will extend a RegBS of 1 to the all-1s word, not
     // the word with value 1.  This is used for ORAM reads, where the
     // same DPF is used for all the fields.
-    inline void unit(const RDPF &dpf, DPFnode leaf) {
+    template <nbits_t WIDTH>
+    inline void unit(const RDPF<WIDTH> &dpf, DPFnode leaf) {
         key = dpf.unit_as(leaf);
         pointers = dpf.unit_bs(leaf);
         value = dpf.unit_bs(leaf);

+ 9 - 9
duoram.tcc

@@ -273,7 +273,7 @@ Duoram<T>::Shape::MemRefS<U,FT,FST,Sh>::operator FT()
     if (player < 2) {
         // Computational players do this
 
-        RDPFTriple dt = shape.tio.rdpftriple(shape.yield, shape.addr_size);
+        RDPFTriple<1> dt = shape.tio.rdpftriple(shape.yield, shape.addr_size);
 
         // Compute the index offset
         U indoffset;
@@ -281,7 +281,7 @@ Duoram<T>::Shape::MemRefS<U,FT,FST,Sh>::operator FT()
         indoffset -= idx;
 
         // We only need two of the DPFs for reading
-        RDPFPair dp(std::move(dt), 0, player == 0 ? 2 : 1);
+        RDPFPair<1> dp(std::move(dt), 0, player == 0 ? 2 : 1);
         // The RDPFTriple dt is now broken, since we've moved things out
         // of it.
 
@@ -304,7 +304,7 @@ Duoram<T>::Shape::MemRefS<U,FT,FST,Sh>::operator FT()
             shape.tio.aes_ops());
         FT init;
         res = pe.reduce(init, [this, &dp, &shape] (int thread_num,
-                address_t i, const RDPFPair::node &leaf) {
+                address_t i, const RDPFPair<1>::node &leaf) {
             // The values from the two DPFs, which will each be of type T
             std::tuple<FT,FT> V;
             dp.unit(V, leaf);
@@ -324,7 +324,7 @@ Duoram<T>::Shape::MemRefS<U,FT,FST,Sh>::operator FT()
     } else {
         // The server does this
 
-        RDPFPair dp = shape.tio.rdpfpair(shape.yield, shape.addr_size);
+        RDPFPair<1> dp = shape.tio.rdpfpair(shape.yield, shape.addr_size);
         U p0indoffset, p1indoffset;
 
         shape.yield();
@@ -341,7 +341,7 @@ Duoram<T>::Shape::MemRefS<U,FT,FST,Sh>::operator FT()
             shape.shape_size, shape.tio.cpu_nthreads(),
             shape.tio.aes_ops());
         gamma = pe.reduce(init, [this, &dp, &shape] (int thread_num,
-                address_t i, const RDPFPair::node &leaf) {
+                address_t i, const RDPFPair<1>::node &leaf) {
             // The values from the two DPFs, each of type FT
             std::tuple<FT,FT> V;
             dp.unit(V, leaf);
@@ -383,7 +383,7 @@ typename Duoram<T>::Shape::template MemRefS<U,FT,FST,Sh>
     if (player < 2) {
         // Computational players do this
 
-        RDPFTriple dt = shape.tio.rdpftriple(shape.yield, shape.addr_size);
+        RDPFTriple<1> dt = shape.tio.rdpftriple(shape.yield, shape.addr_size);
 
         // Compute the index and message offsets
         U indoffset;
@@ -420,7 +420,7 @@ typename Duoram<T>::Shape::template MemRefS<U,FT,FST,Sh>
             shape.tio.aes_ops());
         int init = 0;
         pe.reduce(init, [this, &dt, &shape, &Mshift, player] (int thread_num,
-                address_t i, const RDPFTriple::node &leaf) {
+                address_t i, const RDPFTriple<1>::node &leaf) {
             // The values from the three DPFs
             std::tuple<FT,FT,FT> scaled, unit;
             dt.scaled(scaled, leaf);
@@ -442,7 +442,7 @@ typename Duoram<T>::Shape::template MemRefS<U,FT,FST,Sh>
     } else {
         // The server does this
 
-        RDPFPair dp = shape.tio.rdpfpair(shape.yield, shape.addr_size);
+        RDPFPair<1> dp = shape.tio.rdpfpair(shape.yield, shape.addr_size);
         U p0indoffset, p1indoffset;
         std::tuple<FT,FT> p0Moffset, p1Moffset;
 
@@ -463,7 +463,7 @@ typename Duoram<T>::Shape::template MemRefS<U,FT,FST,Sh>
             shape.tio.aes_ops());
         int init = 0;
         pe.reduce(init, [this, &dp, &shape, &Mshift] (int thread_num,
-                address_t i, const RDPFPair::node &leaf) {
+                address_t i, const RDPFPair<1>::node &leaf) {
             // The values from the two DPFs
             std::tuple<FT,FT> scaled, unit;
             dp.scaled(scaled, leaf);

+ 6 - 6
mpcio.cpp

@@ -802,17 +802,17 @@ SelectTriple<bit_t> MPCTIO::bitselecttriple(yield_t &yield)
 
 // Only computational peers call this; the server should be calling
 // rdpfpair() at the same time
-RDPFTriple MPCTIO::rdpftriple(yield_t &yield, nbits_t depth,
+RDPFTriple<1> MPCTIO::rdpftriple(yield_t &yield, nbits_t depth,
     bool keep_expansion)
 {
     assert(mpcio.player < 2);
-    RDPFTriple val;
+    RDPFTriple<1> val;
 
     MPCPeerIO &mpcpio = static_cast<MPCPeerIO&>(mpcio);
     if (mpcio.mode == MODE_ONLINE) {
         mpcpio.rdpftriples[thread_num][depth-1].get(val);
     } else {
-        val = RDPFTriple(*this, yield, depth,
+        val = RDPFTriple<1>(*this, yield, depth,
             keep_expansion);
         iostream_server() <<
             val.dpf[(mpcio.player == 0) ? 1 : 2];
@@ -824,16 +824,16 @@ RDPFTriple MPCTIO::rdpftriple(yield_t &yield, nbits_t depth,
 
 // Only the server calls this; the computational peers should be calling
 // rdpftriple() at the same time
-RDPFPair MPCTIO::rdpfpair(yield_t &yield, nbits_t depth)
+RDPFPair<1> MPCTIO::rdpfpair(yield_t &yield, nbits_t depth)
 {
     assert(mpcio.player == 2);
-    RDPFPair val;
+    RDPFPair<1> val;
 
     MPCServerIO &mpcsrvio = static_cast<MPCServerIO&>(mpcio);
     if (mpcio.mode == MODE_ONLINE) {
         mpcsrvio.rdpfpairs[thread_num][depth-1].get(val);
     } else {
-        RDPFTriple trip(*this, yield, depth, true);
+        RDPFTriple<1> trip(*this, yield, depth, true);
         yield();
         iostream_p0() >> val.dpf[0];
         iostream_p1() >> val.dpf[1];
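
The two calls above are meant to be made in lockstep: the computational peers each request a triple while the server requests the matching pair. A minimal sketch of the calling pattern, following the usage in online.cpp below (inside a coroutine, for a chosen depth):

    if (tio.player() == 2) {
        // Server: gets the width-1 pair
        RDPFPair<1> dp = tio.rdpfpair(yield, depth);
    } else {
        // Peers: each gets a width-1 triple (in online mode this is
        // pulled from the rdpftriples[thread_num][depth-1] storage)
        RDPFTriple<1> dt = tio.rdpftriple(yield, depth);
    }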

+ 4 - 4
mpcio.hpp

@@ -215,7 +215,7 @@ struct MPCPeerIO : public MPCIO {
     std::vector<PreCompStorage<CDPF, CDPFName>> cdpfs;
     // The outer vector is (like above) one item per thread
     // The inner array is indexed by DPF depth (depth d is at entry d-1)
-    std::vector<std::array<PreCompStorage<RDPFTriple, RDPFTripleName>,ADDRESS_MAX_BITS>> rdpftriples;
+    std::vector<std::array<PreCompStorage<RDPFTriple<1>, RDPFTripleName>,ADDRESS_MAX_BITS>> rdpftriples;
 
     MPCPeerIO(unsigned player, ProcessingMode mode,
             std::deque<tcp::socket> &peersocks,
@@ -236,7 +236,7 @@ struct MPCServerIO : public MPCIO {
     std::deque<MPCSingleIO> p1ios;
     // The outer vector is (like above) one item per thread
     // The inner array is indexed by DPF depth (depth d is at entry d-1)
-    std::vector<std::array<PreCompStorage<RDPFPair, RDPFPairName>,ADDRESS_MAX_BITS>> rdpfpairs;
+    std::vector<std::array<PreCompStorage<RDPFPair<1>, RDPFPairName>,ADDRESS_MAX_BITS>> rdpfpairs;
 
     MPCServerIO(ProcessingMode mode,
             std::deque<tcp::socket> &p0socks,
@@ -387,10 +387,10 @@ public:
 
     // These ones only work during the online phase
     // Computational peers call:
-    RDPFTriple rdpftriple(yield_t &yield, nbits_t depth,
+    RDPFTriple<1> rdpftriple(yield_t &yield, nbits_t depth,
         bool keep_expansion = true);
     // The server calls:
-    RDPFPair rdpfpair(yield_t &yield, nbits_t depth);
+    RDPFPair<1> rdpfpair(yield_t &yield, nbits_t depth);
     // Anyone can call:
     CDPF cdpf(yield_t &yield);
 

+ 25 - 25
online.cpp

@@ -216,9 +216,9 @@ static void rdpf_test(MPCIO &mpcio,
                 size_t &aes_ops = tio.aes_ops();
                 for (size_t iter=0; iter < num_iters; ++iter) {
                     if (tio.player() == 2) {
-                        RDPFPair dp = tio.rdpfpair(yield, depth);
+                        RDPFPair<1> dp = tio.rdpfpair(yield, depth);
                         for (int i=0;i<2;++i) {
-                            const RDPF &dpf = dp.dpf[i];
+                            const RDPF<1> &dpf = dp.dpf[i];
                             for (address_t x=0;x<(address_t(1)<<depth);++x) {
                                 DPFnode leaf = dpf.leaf(x, aes_ops);
                                 RegBS ub = dpf.unit_bs(leaf);
@@ -231,9 +231,9 @@ static void rdpf_test(MPCIO &mpcio,
                             printf("\n");
                         }
                     } else {
-                        RDPFTriple dt = tio.rdpftriple(yield, depth);
+                        RDPFTriple<1> dt = tio.rdpftriple(yield, depth);
                         for (int i=0;i<3;++i) {
-                            const RDPF &dpf = dt.dpf[i];
+                            const RDPF<1> &dpf = dt.dpf[i];
                             RegXS peer_scaled_xor;
                             RegAS peer_scaled_sum;
                             if (tio.player() == 1) {
@@ -300,9 +300,9 @@ static void rdpf_timing(MPCIO &mpcio,
             run_coroutines(tio, [&tio, depth] (yield_t &yield) {
                 size_t &aes_ops = tio.aes_ops();
                 if (tio.player() == 2) {
-                    RDPFPair dp = tio.rdpfpair(yield, depth);
+                    RDPFPair<1> dp = tio.rdpfpair(yield, depth);
                     for (int i=0;i<2;++i) {
-                        RDPF &dpf = dp.dpf[i];
+                        RDPF<1> &dpf = dp.dpf[i];
                         dpf.expand(aes_ops);
                         RegXS scaled_xor;
                         for (address_t x=0;x<(address_t(1)<<depth);++x) {
@@ -315,9 +315,9 @@ static void rdpf_timing(MPCIO &mpcio,
                         printf("\n");
                     }
                 } else {
-                    RDPFTriple dt = tio.rdpftriple(yield, depth);
+                    RDPFTriple<1> dt = tio.rdpftriple(yield, depth);
                     for (int i=0;i<3;++i) {
-                        RDPF &dpf = dt.dpf[i];
+                        RDPF<1> &dpf = dt.dpf[i];
                         dpf.expand(aes_ops);
                         RegXS scaled_xor;
                         for (address_t x=0;x<(address_t(1)<<depth);++x) {
@@ -336,7 +336,7 @@ static void rdpf_timing(MPCIO &mpcio,
     pool.join();
 }
 
-static value_t parallel_streameval_rdpf(MPCIO &mpcio, const RDPF &dpf,
+static value_t parallel_streameval_rdpf(MPCIO &mpcio, const RDPF<1> &dpf,
     address_t start, int num_threads)
 {
     RegXS scaled_xor[num_threads];
@@ -392,9 +392,9 @@ static void rdpfeval_timing(MPCIO &mpcio,
     MPCTIO tio(mpcio, 0, num_threads);
     run_coroutines(tio, [&mpcio, &tio, depth, start, num_threads] (yield_t &yield) {
         if (tio.player() == 2) {
-            RDPFPair dp = tio.rdpfpair(yield, depth);
+            RDPFPair<1> dp = tio.rdpfpair(yield, depth);
             for (int i=0;i<2;++i) {
-                RDPF &dpf = dp.dpf[i];
+                RDPF<1> &dpf = dp.dpf[i];
                 value_t scaled_xor =
                     parallel_streameval_rdpf(mpcio, dpf, start, num_threads);
                 printf("%016lx\n%016lx\n", scaled_xor,
@@ -402,9 +402,9 @@ static void rdpfeval_timing(MPCIO &mpcio,
                 printf("\n");
             }
         } else {
-            RDPFTriple dt = tio.rdpftriple(yield, depth);
+            RDPFTriple<1> dt = tio.rdpftriple(yield, depth);
             for (int i=0;i<3;++i) {
-                RDPF &dpf = dt.dpf[i];
+                RDPF<1> &dpf = dt.dpf[i];
                 value_t scaled_xor =
                     parallel_streameval_rdpf(mpcio, dpf, start, num_threads);
                 printf("%016lx\n%016lx\n", scaled_xor,
@@ -434,15 +434,15 @@ static void par_rdpfeval_timing(MPCIO &mpcio,
     MPCTIO tio(mpcio, 0, num_threads);
     run_coroutines(tio, [&tio, depth, start, num_threads] (yield_t &yield) {
         if (tio.player() == 2) {
-            RDPFPair dp = tio.rdpfpair(yield, depth);
+            RDPFPair<1> dp = tio.rdpfpair(yield, depth);
             for (int i=0;i<2;++i) {
-                RDPF &dpf = dp.dpf[i];
+                RDPF<1> &dpf = dp.dpf[i];
                 nbits_t depth = dpf.depth();
                 auto pe = ParallelEval(dpf, start, 0,
                     address_t(1)<<depth, num_threads, tio.aes_ops());
                 RegXS result, init;
                 result = pe.reduce(init, [&dpf] (int thread_num,
-                        address_t i, const RDPF::node &leaf) {
+                        address_t i, const RDPF<1>::node &leaf) {
                     return dpf.scaled_xs(leaf);
                 });
                 printf("%016lx\n%016lx\n", result.xshare,
@@ -450,15 +450,15 @@ static void par_rdpfeval_timing(MPCIO &mpcio,
                 printf("\n");
             }
         } else {
-            RDPFTriple dt = tio.rdpftriple(yield, depth);
+            RDPFTriple<1> dt = tio.rdpftriple(yield, depth);
             for (int i=0;i<3;++i) {
-                RDPF &dpf = dt.dpf[i];
+                RDPF<1> &dpf = dt.dpf[i];
                 nbits_t depth = dpf.depth();
                 auto pe = ParallelEval(dpf, start, 0,
                     address_t(1)<<depth, num_threads, tio.aes_ops());
                 RegXS result, init;
                 result = pe.reduce(init, [&dpf] (int thread_num,
-                        address_t i, const RDPF::node &leaf) {
+                        address_t i, const RDPF<1>::node &leaf) {
                     return dpf.scaled_xs(leaf);
                 });
                 printf("%016lx\n%016lx\n", result.xshare,
@@ -489,7 +489,7 @@ static void tupleeval_timing(MPCIO &mpcio,
     run_coroutines(tio, [&tio, depth, start] (yield_t &yield) {
         size_t &aes_ops = tio.aes_ops();
         if (tio.player() == 2) {
-            RDPFPair dp = tio.rdpfpair(yield, depth);
+            RDPFPair<1> dp = tio.rdpfpair(yield, depth);
             RegXS scaled_xor0, scaled_xor1;
             auto ev = StreamEval(dp, start, 0, aes_ops, false);
             for (address_t x=0;x<(address_t(1)<<depth);++x) {
@@ -506,7 +506,7 @@ static void tupleeval_timing(MPCIO &mpcio,
                 dp.dpf[1].scaled_xor.xshare);
             printf("\n");
         } else {
-            RDPFTriple dt = tio.rdpftriple(yield, depth);
+            RDPFTriple<1> dt = tio.rdpftriple(yield, depth);
             RegXS scaled_xor0, scaled_xor1, scaled_xor2;
             auto ev = StreamEval(dt, start, 0, aes_ops, false);
             for (address_t x=0;x<(address_t(1)<<depth);++x) {
@@ -551,13 +551,13 @@ static void par_tupleeval_timing(MPCIO &mpcio,
     run_coroutines(tio, [&tio, depth, start, num_threads] (yield_t &yield) {
         size_t &aes_ops = tio.aes_ops();
         if (tio.player() == 2) {
-            RDPFPair dp = tio.rdpfpair(yield, depth);
+            RDPFPair<1> dp = tio.rdpfpair(yield, depth);
             auto pe = ParallelEval(dp, start, 0, address_t(1)<<depth,
                 num_threads, aes_ops);
             using V = std::tuple<RegXS,RegXS>;
             V result, init;
             result = pe.reduce(init, [&dp] (int thread_num, address_t i,
-                    const RDPFPair::node &leaf) {
+                    const RDPFPair<1>::node &leaf) {
                 std::tuple<RegXS,RegXS> scaled;
                 dp.scaled(scaled, leaf);
                 return scaled;
@@ -569,13 +569,13 @@ static void par_tupleeval_timing(MPCIO &mpcio,
                 dp.dpf[1].scaled_xor.xshare);
             printf("\n");
         } else {
-            RDPFTriple dt = tio.rdpftriple(yield, depth);
+            RDPFTriple<1> dt = tio.rdpftriple(yield, depth);
             auto pe = ParallelEval(dt, start, 0, address_t(1)<<depth,
                 num_threads, aes_ops);
             using V = std::tuple<RegXS,RegXS,RegXS>;
             V result, init;
             result = pe.reduce(init, [&dt] (int thread_num, address_t i,
-                    const RDPFTriple::node &leaf) {
+                    const RDPFTriple<1>::node &leaf) {
                 std::tuple<RegXS,RegXS,RegXS> scaled;
                 dt.scaled(scaled, leaf);
                 return scaled;

+ 0 - 514
rdpf.cpp

@@ -2,23 +2,6 @@
 
 #include "rdpf.hpp"
 #include "bitutils.hpp"
-#include "mpcops.hpp"
-
-// Compute the multiplicative inverse of x mod 2^{VALUE_BITS}
-// This is the same as computing x to the power of
-// 2^{VALUE_BITS-1}-1.
-static value_t inverse_value_t(value_t x)
-{
-    int expon = 1;
-    value_t xe = x;
-    // Invariant: xe = x^(2^expon - 1) mod 2^{VALUE_BITS}
-    // Goal: compute x^(2^{VALUE_BITS-1} - 1)
-    while (expon < VALUE_BITS-1) {
-        xe = xe * xe * x;
-        ++expon;
-    }
-    return xe;
-}
 
 #undef RDPF_MTGEN_TIMING_1
 
@@ -114,500 +97,3 @@ static void mtgen_timetest_1(nbits_t level, int nthreads,
 }
 
 #endif
-
-// Construct a DPF with the given (XOR-shared) target location, and
-// of the given depth, to be used for random-access memory reads and
-// writes.  The DPF is construction collaboratively by P0 and P1,
-// with the server P2 helping by providing various kinds of
-// correlated randomness, such as MultTriples and AndTriples.
-//
-// This algorithm is based on Appendix C from the Duoram paper, with a
-// small optimization noted below.
-RDPF::RDPF(MPCTIO &tio, yield_t &yield,
-    RegXS target, nbits_t depth, bool save_expansion)
-{
-    int player = tio.player();
-    size_t &aes_ops = tio.aes_ops();
-
-    // Choose a random seed
-    arc4random_buf(&seed, sizeof(seed));
-    // Ensure the flag bits (the lsb of each node) are different
-    seed = set_lsb(seed, !!player);
-    cfbits = 0;
-    whichhalf = (player == 1);
-
-    // The root level is just the seed
-    nbits_t level = 0;
-    DPFnode *curlevel = NULL;
-    DPFnode *nextlevel = new DPFnode[1];
-    nextlevel[0] = seed;
-
-    // Construct each intermediate level
-    while(level < depth) {
-        if (player < 2) {
-            delete[] curlevel;
-            curlevel = nextlevel;
-            if (save_expansion && level == depth-1) {
-                expansion.resize(1<<depth);
-                nextlevel = expansion.data();
-            } else {
-                nextlevel = new DPFnode[1<<(level+1)];
-            }
-        }
-        // Invariant: curlevel has 2^level elements; nextlevel has
-        // 2^{level+1} elements
-
-        // The bit-shared choice bit is bit (depth-level-1) of the
-        // XOR-shared target index
-        RegBS bs_choice = target.bit(depth-level-1);
-        size_t curlevel_size = (size_t(1)<<level);
-        DPFnode L = _mm_setzero_si128();
-        DPFnode R = _mm_setzero_si128();
-        // The server doesn't need to do this computation, but it does
-        // need to execute mpc_reconstruct_choice so that it sends
-        // the AndTriples at the appropriate time.
-        if (player < 2) {
-#ifdef RDPF_MTGEN_TIMING_1
-            if (player == 0) {
-                mtgen_timetest_1(level, 0, (1<<23)>>level, curlevel,
-                    nextlevel, aes_ops);
-                size_t niters = 2048;
-                if (level > 8) niters = (1<<20)>>level;
-                for(int t=1;t<=8;++t) {
-                    mtgen_timetest_1(level, t, niters, curlevel,
-                        nextlevel, aes_ops);
-                }
-                mtgen_timetest_1(level, 0, (1<<23)>>level, curlevel,
-                    nextlevel, aes_ops);
-            }
-#endif
-            // Using the timing results gathered above, decide whether
-            // to multithread, and if so, how many threads to use.
-            // tio.cpu_nthreads() is the maximum number we have
-            // available.
-            int max_nthreads = tio.cpu_nthreads();
-            if (max_nthreads == 1 || level < 19) {
-                // No threading
-                size_t laes_ops = 0;
-                for(size_t i=0;i<curlevel_size;++i) {
-                    DPFnode lchild, rchild;
-                    prgboth(lchild, rchild, curlevel[i], laes_ops);
-                    L = (L ^ lchild);
-                    R = (R ^ rchild);
-                    nextlevel[2*i] = lchild;
-                    nextlevel[2*i+1] = rchild;
-                }
-                aes_ops += laes_ops;
-            } else {
-                size_t curlevel_size = size_t(1)<<level;
-                int nthreads =
-                    int(ceil(sqrt(double(curlevel_size/6000))));
-                if (nthreads > max_nthreads) {
-                    nthreads = max_nthreads;
-                }
-                DPFnode tL[nthreads];
-                DPFnode tR[nthreads];
-                size_t taes_ops[nthreads];
-                size_t threadstart = 0;
-                size_t threadchunk = curlevel_size / nthreads;
-                size_t threadextra = curlevel_size % nthreads;
-                boost::asio::thread_pool pool(nthreads);
-                for (int t=0;t<nthreads;++t) {
-                    size_t threadsize = threadchunk + (size_t(t) < threadextra);
-                    size_t threadend = threadstart + threadsize;
-                    boost::asio::post(pool,
-                        [t, &tL, &tR, &taes_ops, threadstart, threadend,
-                        &curlevel, &nextlevel] {
-                            DPFnode L = _mm_setzero_si128();
-                            DPFnode R = _mm_setzero_si128();
-                            size_t aes_ops = 0;
-                            for(size_t i=threadstart;i<threadend;++i) {
-                                DPFnode lchild, rchild;
-                                prgboth(lchild, rchild, curlevel[i], aes_ops);
-                                L = (L ^ lchild);
-                                R = (R ^ rchild);
-                                nextlevel[2*i] = lchild;
-                                nextlevel[2*i+1] = rchild;
-                            }
-                            tL[t] = L;
-                            tR[t] = R;
-                            taes_ops[t] = aes_ops;
-                        });
-                    threadstart = threadend;
-                }
-                pool.join();
-                for (int t=0;t<nthreads;++t) {
-                    L ^= tL[t];
-                    R ^= tR[t];
-                    aes_ops += taes_ops[t];
-                }
-            }
-        }
-        // If we're going left (bs_choice = 0), we want the correction
-        // word to be the XOR of our right side and our peer's right
-        // side; if bs_choice = 1, it should be the XOR of our left side
-        // and our peer's left side.
-
-        // We also have to ensure that the flag bits (the lsb) of the
-        // side that will end up the same be of course the same, but
-        // also that the flag bits (the lsb) of the side that will end
-        // up different _must_ be different.  That is, it's not enough
-        // for the nodes of the child selected by choice to be different
-        // as 128-bit values; they also have to be different in their
-        // lsb.
-
-        // This is where we make a small optimization over Appendix C of
-        // the Duoram paper: instead of keeping separate correction flag
-        // bits for the left and right children, we observe that the low
-        // bit of the overall correction word effectively serves as one
-        // of those bits, so we just need to store one extra bit per
-        // level, not two.  (We arbitrarily choose the one for the right
-        // child.)
-
-        // Note that the XOR of our left and right child before and
-        // after applying the correction word won't change, since the
-        // correction word is applied to either both children or
-        // neither, depending on the value of the parent's flag. So in
-        // particular, the XOR of the flag bits won't change, and if our
-        // children's flag's XOR equals our peer's children's flag's
-        // XOR, then we won't have different flag bits even for the
-        // children that have different 128-bit values.
-
-        // So we compute our_parity = lsb(L^R)^player, and we XOR that
-        // into the R value in the correction word computation.  At the
-        // same time, we exchange these parity values to compute the
-        // combined parity, which we store in the DPF.  Then when the
-        // DPF is evaluated, if the parent's flag is set, we not only apply
-        // the correction word to both children, but also apply the
-        // (combined) parity bit to just the right child.  Then for
-        // unequal nodes (where the flag bit is different), exactly one
-        // of the four children (two for P0 and two for P1) will have
-        // the parity bit applied, which will set the XOR of the lsb of
-        // those four nodes to just L0^R0^L1^R1^our_parity^peer_parity
-        // = 1 because everything cancels out except player (for which
-        // one player is 0 and the other is 1).
-
-        bool our_parity_bit = get_lsb(L ^ R) ^ !!player;
-        DPFnode our_parity = lsb128_mask[our_parity_bit];
-
-        DPFnode CW;
-        bool peer_parity_bit;
-        // Exchange the parities and do mpc_reconstruct_choice at the
-        // same time (bundled into the same rounds)
-        run_coroutines(yield,
-            [this, &tio, &our_parity_bit, &peer_parity_bit](yield_t &yield) {
-                tio.queue_peer(&our_parity_bit, 1);
-                yield();
-                uint8_t peer_parity_byte;
-                tio.recv_peer(&peer_parity_byte, 1);
-                peer_parity_bit = peer_parity_byte & 1;
-            },
-            [this, &tio, &CW, &L, &R, &bs_choice, &our_parity](yield_t &yield) {
-                mpc_reconstruct_choice(tio, yield, CW, bs_choice,
-                    (R ^ our_parity), L);
-            });
-        bool parity_bit = our_parity_bit ^ peer_parity_bit;
-        cfbits |= (value_t(parity_bit)<<level);
-        DPFnode CWR = CW ^ lsb128_mask[parity_bit];
-        if (player < 2) {
-            // The timing of each iteration of the inner loop is
-            // comparable to the above, so just use the same
-            // computations.  All of this could be tuned, of course.
-
-            if (level < depth-1) {
-                // Using the timing results gathered above, decide whether
-                // to multithread, and if so, how many threads to use.
-                // tio.cpu_nthreads() is the maximum number we have
-                // available.
-                int max_nthreads = tio.cpu_nthreads();
-                if (max_nthreads == 1 || level < 19) {
-                    // No threading
-                    for(size_t i=0;i<curlevel_size;++i) {
-                        bool flag = get_lsb(curlevel[i]);
-                        nextlevel[2*i] = xor_if(nextlevel[2*i], CW, flag);
-                        nextlevel[2*i+1] = xor_if(nextlevel[2*i+1], CWR, flag);
-                    }
-                } else {
-                    int nthreads =
-                        int(ceil(sqrt(double(curlevel_size/6000))));
-                    if (nthreads > max_nthreads) {
-                        nthreads = max_nthreads;
-                    }
-                    size_t threadstart = 0;
-                    size_t threadchunk = curlevel_size / nthreads;
-                    size_t threadextra = curlevel_size % nthreads;
-                    boost::asio::thread_pool pool(nthreads);
-                    for (int t=0;t<nthreads;++t) {
-                        size_t threadsize = threadchunk + (size_t(t) < threadextra);
-                        size_t threadend = threadstart + threadsize;
-                        boost::asio::post(pool, [CW, CWR, threadstart, threadend,
-                            &curlevel, &nextlevel] {
-                                for(size_t i=threadstart;i<threadend;++i) {
-                                    bool flag = get_lsb(curlevel[i]);
-                                    nextlevel[2*i] = xor_if(nextlevel[2*i], CW, flag);
-                                    nextlevel[2*i+1] = xor_if(nextlevel[2*i+1], CWR, flag);
-                                }
-                        });
-                        threadstart = threadend;
-                    }
-                    pool.join();
-                }
-            } else {
-                // Recall there are four potentially useful vectors that
-                // can come out of a DPF:
-                // - (single-bit) bitwise unit vector
-                // - additive-shared unit vector
-                // - XOR-shared scaled unit vector
-                // - additive-shared scaled unit vector
-                //
-                // (No single DPF should be used for both of the first
-                // two or both of the last two, though, since they're
-                // correlated; you _can_ use one of the first two and
-                // one of the last two.)
-                //
-                // For each 128-bit leaf, the low bit is the flag bit,
-                // and we're guaranteed that the flag bits (and indeed
-                // the whole 128-bit value) for P0 and P1 are the same
-                // for every leaf except the target, and that the flag
-                // bits definitely differ for the target (and the other
-                // 127 bits are independently random on each side).
-                //
-                // We divide the 128-bit leaf into a low 64-bit word and
-                // a high 64-bit word.  We use the low word for the unit
-                // vector and the high word for the scaled vector; this
-                // choice is not arbitrary: the flag bit in the low word
-                // means that the sum of all the low words (with P1's
-                // low words negated) across both P0 and P1 is
-                // definitely odd, so we can compute that sum's inverse
-                // mod 2^64, and store it now during precomputation.  At
-                // evaluation time for the additive-shared unit vector,
-                // we will output this global inverse times the low word
-                // of each leaf, which will make the sum of all of those
-                // values 1.  (This technique replaces the protocol in
-                // Appendix D of the Duoram paper.)
-                //
-                // For the scaled vector, we just have to compute shares
-                // of what the scaled vector is a sharing _of_, but
-                // that's just XORing or adding all of each party's
-                // local high words; no communication needed.
-
-                value_t low_sum = 0;
-                value_t high_sum = 0;
-                value_t high_xor = 0;
-                // Using the timing results gathered above, decide whether
-                // to multithread, and if so, how many threads to use.
-                // tio.cpu_nthreads() is the maximum number we have
-                // available.
-                int max_nthreads = tio.cpu_nthreads();
-                if (max_nthreads == 1 || level < 19) {
-                    // No threading
-                    for(size_t i=0;i<curlevel_size;++i) {
-                        bool flag = get_lsb(curlevel[i]);
-                        DPFnode leftchild = xor_if(nextlevel[2*i], CW, flag);
-                        DPFnode rightchild = xor_if(nextlevel[2*i+1], CWR, flag);
-                        if (save_expansion) {
-                            nextlevel[2*i] = leftchild;
-                            nextlevel[2*i+1] = rightchild;
-                        }
-                        value_t leftlow = value_t(_mm_cvtsi128_si64x(leftchild));
-                        value_t rightlow = value_t(_mm_cvtsi128_si64x(rightchild));
-                        value_t lefthigh =
-                            value_t(_mm_cvtsi128_si64x(_mm_srli_si128(leftchild,8)));
-                        value_t righthigh =
-                            value_t(_mm_cvtsi128_si64x(_mm_srli_si128(rightchild,8)));
-                        low_sum += (leftlow + rightlow);
-                        high_sum += (lefthigh + righthigh);
-                        high_xor ^= (lefthigh ^ righthigh);
-                    }
-                } else {
-                    int nthreads =
-                        int(ceil(sqrt(double(curlevel_size/6000))));
-                    if (nthreads > max_nthreads) {
-                        nthreads = max_nthreads;
-                    }
-                    value_t tlow_sum[nthreads];
-                    value_t thigh_sum[nthreads];
-                    value_t thigh_xor[nthreads];
-                    size_t threadstart = 0;
-                    size_t threadchunk = curlevel_size / nthreads;
-                    size_t threadextra = curlevel_size % nthreads;
-                    boost::asio::thread_pool pool(nthreads);
-                    for (int t=0;t<nthreads;++t) {
-                        size_t threadsize = threadchunk + (size_t(t) < threadextra);
-                        size_t threadend = threadstart + threadsize;
-                        boost::asio::post(pool,
-                            [t, &tlow_sum, &thigh_sum, &thigh_xor, threadstart, threadend,
-                            &curlevel, &nextlevel, CW, CWR, save_expansion] {
-                                value_t low_sum = 0;
-                                value_t high_sum = 0;
-                                value_t high_xor = 0;
-                                for(size_t i=threadstart;i<threadend;++i) {
-                                    bool flag = get_lsb(curlevel[i]);
-                                    DPFnode leftchild = xor_if(nextlevel[2*i], CW, flag);
-                                    DPFnode rightchild = xor_if(nextlevel[2*i+1], CWR, flag);
-                                    if (save_expansion) {
-                                        nextlevel[2*i] = leftchild;
-                                        nextlevel[2*i+1] = rightchild;
-                                    }
-                                    value_t leftlow = value_t(_mm_cvtsi128_si64x(leftchild));
-                                    value_t rightlow = value_t(_mm_cvtsi128_si64x(rightchild));
-                                    value_t lefthigh =
-                                        value_t(_mm_cvtsi128_si64x(_mm_srli_si128(leftchild,8)));
-                                    value_t righthigh =
-                                        value_t(_mm_cvtsi128_si64x(_mm_srli_si128(rightchild,8)));
-                                    low_sum += (leftlow + rightlow);
-                                    high_sum += (lefthigh + righthigh);
-                                    high_xor ^= (lefthigh ^ righthigh);
-                                }
-                                tlow_sum[t] = low_sum;
-                                thigh_sum[t] = high_sum;
-                                thigh_xor[t] = high_xor;
-                            });
-                        threadstart = threadend;
-                    }
-                    pool.join();
-                    for (int t=0;t<nthreads;++t) {
-                        low_sum += tlow_sum[t];
-                        high_sum += thigh_sum[t];
-                        high_xor ^= thigh_xor[t];
-                    }
-                }
-                if (player == 1) {
-                    low_sum = -low_sum;
-                    high_sum = -high_sum;
-                }
-                scaled_sum.ashare = high_sum;
-                scaled_xor.xshare = high_xor;
-                // Exchange low_sum and add them up
-                tio.queue_peer(&low_sum, sizeof(low_sum));
-                yield();
-                value_t peer_low_sum;
-                tio.recv_peer(&peer_low_sum, sizeof(peer_low_sum));
-                low_sum += peer_low_sum;
-                // The low_sum had better be odd
-                assert(low_sum & 1);
-                unit_sum_inverse = inverse_value_t(low_sum);
-            }
-            cw.push_back(CW);
-        } else if (level == depth-1) {
-            yield();
-        }
-
-        ++level;
-    }
-
-    delete[] curlevel;
-    if (!save_expansion || player == 2) {
-        delete[] nextlevel;
-    }
-}
-
-// Get the leaf node for the given input
-DPFnode RDPF::leaf(address_t input, size_t &aes_ops) const
-{
-    // If we have a precomputed expansion, just use it
-    if (expansion.size()) {
-        return expansion[input];
-    }
-
-    nbits_t totdepth = depth();
-    DPFnode node = seed;
-    for (nbits_t d=0;d<totdepth;++d) {
-        bit_t dir = !!(input & (address_t(1)<<(totdepth-d-1)));
-        node = descend(node, d, dir, aes_ops);
-    }
-    return node;
-}
-
-// Expand the DPF if it's not already expanded
-//
-// This routine is slightly more efficient than repeatedly calling
-// StreamEval::next(), but it uses a lot more memory.
-void RDPF::expand(size_t &aes_ops)
-{
-    nbits_t depth = this->depth();
-    size_t num_leaves = size_t(1)<<depth;
-    if (expansion.size() == num_leaves) return;
-    expansion.resize(num_leaves);
-    address_t index = 0;
-    address_t lastindex = 0;
-    DPFnode *path = new DPFnode[depth];
-    path[0] = seed;
-    for (nbits_t i=1;i<depth;++i) {
-        path[i] = descend(path[i-1], i-1, 0, aes_ops);
-    }
-    expansion[index++] = descend(path[depth-1], depth-1, 0, aes_ops);
-    expansion[index++] = descend(path[depth-1], depth-1, 1, aes_ops);
-    while(index < num_leaves) {
-        // Invariant: lastindex and index will both be even, and
-        // index=lastindex+2
-        uint64_t index_xor = index ^ lastindex;
-        nbits_t how_many_1_bits = __builtin_popcount(index_xor);
-        // If lastindex -> index goes for example from (in binary)
-        // 010010110 -> 010011000, then index_xor will be
-        // 000001110 and how_many_1_bits will be 3.
-        // That indicates that path[depth-3] was a left child, and now
-        // we need to change it to a right child by descending right
-        // from path[depth-4], and then filling the path after that with
-        // left children.
-        path[depth-how_many_1_bits] =
-            descend(path[depth-how_many_1_bits-1],
-                depth-how_many_1_bits-1, 1, aes_ops);
-        for (nbits_t i = depth-how_many_1_bits; i < depth-1; ++i) {
-            path[i+1] = descend(path[i], i, 0, aes_ops);
-        }
-        lastindex = index;
-        expansion[index++] = descend(path[depth-1], depth-1, 0, aes_ops);
-        expansion[index++] = descend(path[depth-1], depth-1, 1, aes_ops);
-    }
-
-    delete[] path;
-}
-
-// Construct three RDPFs of the given depth all with the same randomly
-// generated target index.
-RDPFTriple::RDPFTriple(MPCTIO &tio, yield_t &yield,
-    nbits_t depth, bool save_expansion)
-{
-    // Pick a random XOR share of the target
-    xs_target.randomize(depth);
-
-    // Now create three RDPFs with that target, and also convert the XOR
-    // shares of the target to additive shares
-    std::vector<coro_t> coroutines;
-    for (int i=0;i<3;++i) {
-        coroutines.emplace_back(
-            [this, &tio, depth, i, save_expansion](yield_t &yield) {
-                dpf[i] = RDPF(tio, yield, xs_target, depth,
-                    save_expansion);
-            });
-    }
-    coroutines.emplace_back(
-        [this, &tio, depth](yield_t &yield) {
-            mpc_xs_to_as(tio, yield, as_target, xs_target, depth, false);
-        });
-    run_coroutines(yield, coroutines);
-}
-
-RDPFTriple::node RDPFTriple::descend(const RDPFTriple::node &parent,
-    nbits_t parentdepth, bit_t whichchild,
-    size_t &aes_ops) const
-{
-    auto [P0, P1, P2] = parent;
-    DPFnode C0, C1, C2;
-    C0 = dpf[0].descend(P0, parentdepth, whichchild, aes_ops);
-    C1 = dpf[1].descend(P1, parentdepth, whichchild, aes_ops);
-    C2 = dpf[2].descend(P2, parentdepth, whichchild, aes_ops);
-    return std::make_tuple(C0,C1,C2);
-}
-
-RDPFPair::node RDPFPair::descend(const RDPFPair::node &parent,
-    nbits_t parentdepth, bit_t whichchild,
-    size_t &aes_ops) const
-{
-    auto [P0, P1] = parent;
-    DPFnode C0, C1;
-    C0 = dpf[0].descend(P0, parentdepth, whichchild, aes_ops);
-    C1 = dpf[1].descend(P1, parentdepth, whichchild, aes_ops);
-    return std::make_tuple(C0,C1);
-}

+ 6 - 3
rdpf.hpp

@@ -13,6 +13,7 @@
 // DPFs for oblivious random accesses to memory.  See dpf.hpp for the
 // differences between the different kinds of DPFs.
 
+template <nbits_t WIDTH>
 struct RDPF : public DPF {
     // The amount we have to scale the low words of the leaf values by
     // to get additive shares of a unit vector
@@ -109,13 +110,14 @@ struct RDPF : public DPF {
 // target index), while the server will hold a RDPFPair (which does
 // not).
 
+template <nbits_t WIDTH>
 struct RDPFTriple {
     // The type of node triples
     using node = std::tuple<DPFnode, DPFnode, DPFnode>;
 
     RegAS as_target;
     RegXS xs_target;
-    RDPF dpf[3];
+    RDPF<WIDTH> dpf[3];
 
     // The depth
     inline nbits_t depth() const { return dpf[0].depth(); }
@@ -209,11 +211,12 @@ struct RDPFTriple {
     }
 };
 
+template <nbits_t WIDTH>
 struct RDPFPair {
     // The type of node pairs
     using node = std::tuple<DPFnode, DPFnode>;
 
-    RDPF dpf[2];
+    RDPF<WIDTH> dpf[2];
 
     RDPFPair() {}
 
@@ -221,7 +224,7 @@ struct RDPFPair {
     // and dropping one.  This _moves_ the dpfs from the triple to the
     // pair, so the triple will no longer be valid after using this.
     // which0 and which1 indicate which of the dpfs to keep.
-    RDPFPair(RDPFTriple &&trip, int which0, int which1) {
+    RDPFPair(RDPFTriple<WIDTH> &&trip, int which0, int which1) {
         dpf[0] = std::move(trip.dpf[which0]);
         dpf[1] = std::move(trip.dpf[which1]);
     }
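
A usage sketch of the pair-from-triple move constructor above, following the read path in duoram.tcc: fetch a triple, then move two of its three DPFs into a pair (which two depends on the player), after which the triple must not be used:

    RDPFTriple<1> dt = shape.tio.rdpftriple(shape.yield, shape.addr_size);
    // Keep dpf[0] plus dpf[2] (player 0) or dpf[1] (player 1)
    RDPFPair<1> dp(std::move(dt), 0, player == 0 ? 2 : 1);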

+ 537 - 14
rdpf.tcc

@@ -1,5 +1,23 @@
 // Templated method implementations for rdpf.hpp
 
+#include "mpcops.hpp"
+
+// Compute the multiplicative inverse of x mod 2^{VALUE_BITS}
+// This is the same as computing x to the power of
+// 2^{VALUE_BITS-1}-1.
+static value_t inverse_value_t(value_t x)
+{
+    int expon = 1;
+    value_t xe = x;
+    // Invariant: xe = x^(2^expon - 1) mod 2^{VALUE_BITS}
+    // Goal: compute x^(2^{VALUE_BITS-1} - 1)
+    while (expon < VALUE_BITS-1) {
+        xe = xe * xe * x;
+        ++expon;
+    }
+    return xe;
+}
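
Why this exponent gives the inverse: for odd x, x^{2^{VALUE_BITS-2}} = 1 mod 2^{VALUE_BITS}, so x^{2^{VALUE_BITS-1}-1} = (x^{2^{VALUE_BITS-2}})^2 * x^{-1} = x^{-1}. A minimal self-contained check of the routine above, assuming (as hypothetical stand-ins for the typedefs in types.hpp) that value_t is uint64_t and VALUE_BITS is 64:

    #include <cassert>
    #include <cstdint>

    using value_t = uint64_t;          // hypothetical stand-in
    static const int VALUE_BITS = 64;  // hypothetical stand-in

    // Same loop as above: invariant xe = x^(2^expon - 1) mod 2^{VALUE_BITS}
    static value_t inverse_value_t(value_t x)
    {
        int expon = 1;
        value_t xe = x;
        while (expon < VALUE_BITS-1) {
            xe = xe * xe * x;
            ++expon;
        }
        return xe;
    }

    int main()
    {
        value_t x = 0x123456789abcdef1;      // any odd value
        assert(x * inverse_value_t(x) == 1); // x times its inverse is 1 mod 2^64
        return 0;
    }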
+
 // Create a StreamEval object that will start its output at index start.
 // It will wrap around to 0 when it hits 2^depth.  If use_expansion
 // is true, then if the DPF has been expanded, just output values
@@ -163,8 +181,8 @@ inline V ParallelEval<T>::reduce(V init, W process)
 
 // I/O for RDPFs
 
-template <typename T>
-T& operator>>(T &is, RDPF &rdpf)
+template <typename T, nbits_t WIDTH>
+T& operator>>(T &is, RDPF<WIDTH> &rdpf)
 {
     is.read((char *)&rdpf.seed, sizeof(rdpf.seed));
     rdpf.whichhalf = get_lsb(rdpf.seed);
@@ -201,8 +219,8 @@ T& operator>>(T &is, RDPF &rdpf)
 // Write the DPF to the output stream.  If expanded=true, then include
 // the expansion _if_ the DPF is itself already expanded.  You can use
 // this to write DPFs to files.
-template <typename T>
-T& write_maybe_expanded(T &os, const RDPF &rdpf,
+template <typename T, nbits_t WIDTH>
+T& write_maybe_expanded(T &os, const RDPF<WIDTH> &rdpf,
     bool expanded = true)
 {
     os.write((const char *)&rdpf.seed, sizeof(rdpf.seed));
@@ -233,8 +251,8 @@ T& write_maybe_expanded(T &os, const RDPF &rdpf,
 
 // The ordinary << version never writes the expansion, since this is
 // what we use to send DPFs over the network.
-template <typename T>
-T& operator<<(T &os, const RDPF &rdpf)
+template <typename T, nbits_t WIDTH>
+T& operator<<(T &os, const RDPF<WIDTH> &rdpf)
 {
     return write_maybe_expanded(os, rdpf, false);
 }
@@ -243,8 +261,8 @@ T& operator<<(T &os, const RDPF &rdpf)
 
 // We never write RDPFTriples over the network, so always write
 // the DPF expansions if they're available.
-template <typename T>
-T& operator<<(T &os, const RDPFTriple &rdpftrip)
+template <typename T, nbits_t WIDTH>
+T& operator<<(T &os, const RDPFTriple<WIDTH> &rdpftrip)
 {
     write_maybe_expanded(os, rdpftrip.dpf[0], true);
     write_maybe_expanded(os, rdpftrip.dpf[1], true);
@@ -255,8 +273,8 @@ T& operator<<(T &os, const RDPFTriple &rdpftrip)
     return os;
 }
 
-template <typename T>
-T& operator>>(T &is, RDPFTriple &rdpftrip)
+template <typename T, nbits_t WIDTH>
+T& operator>>(T &is, RDPFTriple<WIDTH> &rdpftrip)
 {
     is >> rdpftrip.dpf[0] >> rdpftrip.dpf[1] >> rdpftrip.dpf[2];
     nbits_t depth = rdpftrip.dpf[0].depth();
@@ -271,17 +289,522 @@ T& operator>>(T &is, RDPFTriple &rdpftrip)
 
 // We never write RDPFPairs over the network, so always write
 // the DPF expansions if they're available.
-template <typename T>
-T& operator<<(T &os, const RDPFPair &rdpfpair)
+template <typename T, nbits_t WIDTH>
+T& operator<<(T &os, const RDPFPair<WIDTH> &rdpfpair)
 {
     write_maybe_expanded(os, rdpfpair.dpf[0], true);
     write_maybe_expanded(os, rdpfpair.dpf[1], true);
     return os;
 }
 
-template <typename T>
-T& operator>>(T &is, RDPFPair &rdpfpair)
+template <typename T, nbits_t WIDTH>
+T& operator>>(T &is, RDPFPair<WIDTH> &rdpfpair)
 {
     is >> rdpfpair.dpf[0] >> rdpfpair.dpf[1];
     return is;
 }
+
+// Construct a DPF with the given (XOR-shared) target location, and
+// of the given depth, to be used for random-access memory reads and
+// writes.  The DPF is constructed collaboratively by P0 and P1,
+// with the server P2 helping by providing various kinds of
+// correlated randomness, such as MultTriples and AndTriples.
+//
+// This algorithm is based on Appendix C from the Duoram paper, with a
+// small optimization noted below.
+template <nbits_t WIDTH>
+RDPF<WIDTH>::RDPF(MPCTIO &tio, yield_t &yield,
+    RegXS target, nbits_t depth, bool save_expansion)
+{
+    int player = tio.player();
+    size_t &aes_ops = tio.aes_ops();
+
+    // Choose a random seed
+    arc4random_buf(&seed, sizeof(seed));
+    // Ensure the flag bits (the lsb of each node) are different
+    seed = set_lsb(seed, !!player);
+    cfbits = 0;
+    whichhalf = (player == 1);
+
+    // The root level is just the seed
+    nbits_t level = 0;
+    DPFnode *curlevel = NULL;
+    DPFnode *nextlevel = new DPFnode[1];
+    nextlevel[0] = seed;
+
+    // Construct each intermediate level
+    while(level < depth) {
+        if (player < 2) {
+            delete[] curlevel;
+            curlevel = nextlevel;
+            if (save_expansion && level == depth-1) {
+                expansion.resize(1<<depth);
+                nextlevel = expansion.data();
+            } else {
+                nextlevel = new DPFnode[1<<(level+1)];
+            }
+        }
+        // Invariant: curlevel has 2^level elements; nextlevel has
+        // 2^{level+1} elements
+
+        // The bit-shared choice bit is bit (depth-level-1) of the
+        // XOR-shared target index
+        RegBS bs_choice = target.bit(depth-level-1);
+        size_t curlevel_size = (size_t(1)<<level);
+        DPFnode L = _mm_setzero_si128();
+        DPFnode R = _mm_setzero_si128();
+        // The server doesn't need to do this computation, but it does
+        // need to execute mpc_reconstruct_choice so that it sends
+        // the AndTriples at the appropriate time.
+        if (player < 2) {
+#ifdef RDPF_MTGEN_TIMING_1
+            if (player == 0) {
+                mtgen_timetest_1(level, 0, (1<<23)>>level, curlevel,
+                    nextlevel, aes_ops);
+                size_t niters = 2048;
+                if (level > 8) niters = (1<<20)>>level;
+                for(int t=1;t<=8;++t) {
+                    mtgen_timetest_1(level, t, niters, curlevel,
+                        nextlevel, aes_ops);
+                }
+                mtgen_timetest_1(level, 0, (1<<23)>>level, curlevel,
+                    nextlevel, aes_ops);
+            }
+#endif
+            // Using the timing results gathered above, decide whether
+            // to multithread, and if so, how many threads to use.
+            // tio.cpu_nthreads() is the maximum number we have
+            // available.
+            int max_nthreads = tio.cpu_nthreads();
+            if (max_nthreads == 1 || level < 19) {
+                // No threading
+                size_t laes_ops = 0;
+                for(size_t i=0;i<curlevel_size;++i) {
+                    DPFnode lchild, rchild;
+                    prgboth(lchild, rchild, curlevel[i], laes_ops);
+                    L = (L ^ lchild);
+                    R = (R ^ rchild);
+                    nextlevel[2*i] = lchild;
+                    nextlevel[2*i+1] = rchild;
+                }
+                aes_ops += laes_ops;
+            } else {
+                size_t curlevel_size = size_t(1)<<level;
+                int nthreads =
+                    int(ceil(sqrt(double(curlevel_size/6000))));
+                if (nthreads > max_nthreads) {
+                    nthreads = max_nthreads;
+                }
+                DPFnode tL[nthreads];
+                DPFnode tR[nthreads];
+                size_t taes_ops[nthreads];
+                size_t threadstart = 0;
+                size_t threadchunk = curlevel_size / nthreads;
+                size_t threadextra = curlevel_size % nthreads;
+                boost::asio::thread_pool pool(nthreads);
+                for (int t=0;t<nthreads;++t) {
+                    size_t threadsize = threadchunk + (size_t(t) < threadextra);
+                    size_t threadend = threadstart + threadsize;
+                    boost::asio::post(pool,
+                        [t, &tL, &tR, &taes_ops, threadstart, threadend,
+                        &curlevel, &nextlevel] {
+                            DPFnode L = _mm_setzero_si128();
+                            DPFnode R = _mm_setzero_si128();
+                            size_t aes_ops = 0;
+                            for(size_t i=threadstart;i<threadend;++i) {
+                                DPFnode lchild, rchild;
+                                prgboth(lchild, rchild, curlevel[i], aes_ops);
+                                L = (L ^ lchild);
+                                R = (R ^ rchild);
+                                nextlevel[2*i] = lchild;
+                                nextlevel[2*i+1] = rchild;
+                            }
+                            tL[t] = L;
+                            tR[t] = R;
+                            taes_ops[t] = aes_ops;
+                        });
+                    threadstart = threadend;
+                }
+                pool.join();
+                for (int t=0;t<nthreads;++t) {
+                    L ^= tL[t];
+                    R ^= tR[t];
+                    aes_ops += taes_ops[t];
+                }
+            }
+        }
+        // If we're going left (bs_choice = 0), we want the correction
+        // word to be the XOR of our right side and our peer's right
+        // side; if bs_choice = 1, it should be the XOR of our left side
+        // and our peer's left side.
+
+        // We also have to ensure that the flag bits (the lsb) of the
+        // side that will end up the same be of course the same, but
+        // also that the flag bits (the lsb) of the side that will end
+        // up different _must_ be different.  That is, it's not enough
+        // for the nodes of the child selected by choice to be different
+        // as 128-bit values; they also have to be different in their
+        // lsb.
+
+        // This is where we make a small optimization over Appendix C of
+        // the Duoram paper: instead of keeping separate correction flag
+        // bits for the left and right children, we observe that the low
+        // bit of the overall correction word effectively serves as one
+        // of those bits, so we just need to store one extra bit per
+        // level, not two.  (We arbitrarily choose the one for the right
+        // child.)
+
+        // Note that the XOR of our left and right child before and
+        // after applying the correction word won't change, since the
+        // correction word is applied to either both children or
+        // neither, depending on the value of the parent's flag. So in
+        // particular, the XOR of the flag bits won't change, and if our
+        // children's flag's XOR equals our peer's children's flag's
+        // XOR, then we won't have different flag bits even for the
+        // children that have different 128-bit values.
+
+        // So we compute our_parity = lsb(L^R)^player, and we XOR that
+        // into the R value in the correction word computation.  At the
+        // same time, we exchange these parity values to compute the
+        // combined parity, which we store in the DPF.  Then when the
+        // DPF is evaluated, if the parent's flag is set, we not only apply
+        // the correction word to both children, but also apply the
+        // (combined) parity bit to just the right child.  Then for
+        // unequal nodes (where the flag bit is different), exactly one
+        // of the four children (two for P0 and two for P1) will have
+        // the parity bit applied, which will set the XOR of the lsb of
+        // those four nodes to just L0^R0^L1^R1^our_parity^peer_parity
+        // = 1 because everything cancels out except player (for which
+        // one player is 0 and the other is 1).
+
+        bool our_parity_bit = get_lsb(L ^ R) ^ !!player;
+        DPFnode our_parity = lsb128_mask[our_parity_bit];
+
+        DPFnode CW;
+        bool peer_parity_bit;
+        // Exchange the parities and do mpc_reconstruct_choice at the
+        // same time (bundled into the same rounds)
+        run_coroutines(yield,
+            [this, &tio, &our_parity_bit, &peer_parity_bit](yield_t &yield) {
+                tio.queue_peer(&our_parity_bit, 1);
+                yield();
+                uint8_t peer_parity_byte;
+                tio.recv_peer(&peer_parity_byte, 1);
+                peer_parity_bit = peer_parity_byte & 1;
+            },
+            [this, &tio, &CW, &L, &R, &bs_choice, &our_parity](yield_t &yield) {
+                mpc_reconstruct_choice(tio, yield, CW, bs_choice,
+                    (R ^ our_parity), L);
+            });
+        bool parity_bit = our_parity_bit ^ peer_parity_bit;
+        cfbits |= (value_t(parity_bit)<<level);
+        DPFnode CWR = CW ^ lsb128_mask[parity_bit];
+        if (player < 2) {
+            // The timing of each iteration of the inner loop is
+            // comparable to the above, so just use the same
+            // computations.  All of this could be tuned, of course.
+
+            if (level < depth-1) {
+                // Using the timing results gathered above, decide whether
+                // to multithread, and if so, how many threads to use.
+                // tio.cpu_nthreads() is the maximum number we have
+                // available.
+                int max_nthreads = tio.cpu_nthreads();
+                if (max_nthreads == 1 || level < 19) {
+                    // No threading
+                    for(size_t i=0;i<curlevel_size;++i) {
+                        bool flag = get_lsb(curlevel[i]);
+                        nextlevel[2*i] = xor_if(nextlevel[2*i], CW, flag);
+                        nextlevel[2*i+1] = xor_if(nextlevel[2*i+1], CWR, flag);
+                    }
+                } else {
+                    int nthreads =
+                        int(ceil(sqrt(double(curlevel_size/6000))));
+                    if (nthreads > max_nthreads) {
+                        nthreads = max_nthreads;
+                    }
+                    size_t threadstart = 0;
+                    size_t threadchunk = curlevel_size / nthreads;
+                    size_t threadextra = curlevel_size % nthreads;
+                    boost::asio::thread_pool pool(nthreads);
+                    for (int t=0;t<nthreads;++t) {
+                        size_t threadsize = threadchunk + (size_t(t) < threadextra);
+                        size_t threadend = threadstart + threadsize;
+                        boost::asio::post(pool, [CW, CWR, threadstart, threadend,
+                            &curlevel, &nextlevel] {
+                                for(size_t i=threadstart;i<threadend;++i) {
+                                    bool flag = get_lsb(curlevel[i]);
+                                    nextlevel[2*i] = xor_if(nextlevel[2*i], CW, flag);
+                                    nextlevel[2*i+1] = xor_if(nextlevel[2*i+1], CWR, flag);
+                                }
+                        });
+                        threadstart = threadend;
+                    }
+                    pool.join();
+                }
+            } else {
+                // Recall there are four potentially useful vectors that
+                // can come out of a DPF:
+                // - (single-bit) bitwise unit vector
+                // - additive-shared unit vector
+                // - XOR-shared scaled unit vector
+                // - additive-shared scaled unit vector
+                //
+                // (No single DPF should be used for both of the first
+                // two or both of the last two, though, since they're
+                // correlated; you _can_ use one of the first two and
+                // one of the last two.)
+                //
+                // For each 128-bit leaf, the low bit is the flag bit,
+                // and we're guaranteed that the flag bits (and indeed
+                // the whole 128-bit value) for P0 and P1 are the same
+                // for every leaf except the target, and that the flag
+                // bits definitely differ for the target (and the other
+                // 127 bits are independently random on each side).
+                //
+                // We divide the 128-bit leaf into a low 64-bit word and
+                // a high 64-bit word.  We use the low word for the unit
+                // vector and the high word for the scaled vector; this
+                // choice is not arbitrary: the flag bit in the low word
+                // means that the sum of all the low words (with P1's
+                // low words negated) across both P0 and P1 is
+                // definitely odd, so we can compute that sum's inverse
+                // mod 2^64, and store it now during precomputation.  At
+                // evaluation time for the additive-shared unit vector,
+                // we will output this global inverse times the low word
+                // of each leaf, which will make the sum of all of those
+                // values 1.  (This technique replaces the protocol in
+                // Appendix D of the Duoram paper.)
+                //
+                // For the scaled vector, we just have to compute shares
+                // of what the scaled vector is a sharing _of_, but
+                // that's just XORing or adding all of each party's
+                // local high words; no communication needed.
+
+                value_t low_sum = 0;
+                value_t high_sum = 0;
+                value_t high_xor = 0;
+                // Using the timing results gathered above, decide whether
+                // to multithread, and if so, how many threads to use.
+                // tio.cpu_nthreads() is the maximum number we have
+                // available.
+                int max_nthreads = tio.cpu_nthreads();
+                if (max_nthreads == 1 || level < 19) {
+                    // No threading
+                    for(size_t i=0;i<curlevel_size;++i) {
+                        bool flag = get_lsb(curlevel[i]);
+                        DPFnode leftchild = xor_if(nextlevel[2*i], CW, flag);
+                        DPFnode rightchild = xor_if(nextlevel[2*i+1], CWR, flag);
+                        if (save_expansion) {
+                            nextlevel[2*i] = leftchild;
+                            nextlevel[2*i+1] = rightchild;
+                        }
+                        value_t leftlow = value_t(_mm_cvtsi128_si64x(leftchild));
+                        value_t rightlow = value_t(_mm_cvtsi128_si64x(rightchild));
+                        value_t lefthigh =
+                            value_t(_mm_cvtsi128_si64x(_mm_srli_si128(leftchild,8)));
+                        value_t righthigh =
+                            value_t(_mm_cvtsi128_si64x(_mm_srli_si128(rightchild,8)));
+                        low_sum += (leftlow + rightlow);
+                        high_sum += (lefthigh + righthigh);
+                        high_xor ^= (lefthigh ^ righthigh);
+                    }
+                } else {
+                    int nthreads =
+                        int(ceil(sqrt(double(curlevel_size/6000))));
+                    if (nthreads > max_nthreads) {
+                        nthreads = max_nthreads;
+                    }
+                    value_t tlow_sum[nthreads];
+                    value_t thigh_sum[nthreads];
+                    value_t thigh_xor[nthreads];
+                    size_t threadstart = 0;
+                    size_t threadchunk = curlevel_size / nthreads;
+                    size_t threadextra = curlevel_size % nthreads;
+                    boost::asio::thread_pool pool(nthreads);
+                    for (int t=0;t<nthreads;++t) {
+                        size_t threadsize = threadchunk + (size_t(t) < threadextra);
+                        size_t threadend = threadstart + threadsize;
+                        boost::asio::post(pool,
+                            [t, &tlow_sum, &thigh_sum, &thigh_xor, threadstart, threadend,
+                            &curlevel, &nextlevel, CW, CWR, save_expansion] {
+                                value_t low_sum = 0;
+                                value_t high_sum = 0;
+                                value_t high_xor = 0;
+                                for(size_t i=threadstart;i<threadend;++i) {
+                                    bool flag = get_lsb(curlevel[i]);
+                                    DPFnode leftchild = xor_if(nextlevel[2*i], CW, flag);
+                                    DPFnode rightchild = xor_if(nextlevel[2*i+1], CWR, flag);
+                                    if (save_expansion) {
+                                        nextlevel[2*i] = leftchild;
+                                        nextlevel[2*i+1] = rightchild;
+                                    }
+                                    value_t leftlow = value_t(_mm_cvtsi128_si64x(leftchild));
+                                    value_t rightlow = value_t(_mm_cvtsi128_si64x(rightchild));
+                                    value_t lefthigh =
+                                        value_t(_mm_cvtsi128_si64x(_mm_srli_si128(leftchild,8)));
+                                    value_t righthigh =
+                                        value_t(_mm_cvtsi128_si64x(_mm_srli_si128(rightchild,8)));
+                                    low_sum += (leftlow + rightlow);
+                                    high_sum += (lefthigh + righthigh);
+                                    high_xor ^= (lefthigh ^ righthigh);
+                                }
+                                tlow_sum[t] = low_sum;
+                                thigh_sum[t] = high_sum;
+                                thigh_xor[t] = high_xor;
+                            });
+                        threadstart = threadend;
+                    }
+                    pool.join();
+                    for (int t=0;t<nthreads;++t) {
+                        low_sum += tlow_sum[t];
+                        high_sum += thigh_sum[t];
+                        high_xor ^= thigh_xor[t];
+                    }
+                }
+                if (player == 1) {
+                    low_sum = -low_sum;
+                    high_sum = -high_sum;
+                }
+                scaled_sum.ashare = high_sum;
+                scaled_xor.xshare = high_xor;
+                // Exchange low_sum and add them up
+                tio.queue_peer(&low_sum, sizeof(low_sum));
+                yield();
+                value_t peer_low_sum;
+                tio.recv_peer(&peer_low_sum, sizeof(peer_low_sum));
+                low_sum += peer_low_sum;
+                // The low_sum had better be odd
+                assert(low_sum & 1);
+                unit_sum_inverse = inverse_value_t(low_sum);
+            }
+            cw.push_back(CW);
+        } else if (level == depth-1) {
+            yield();
+        }
+
+        ++level;
+    }
+
+    delete[] curlevel;
+    if (!save_expansion || player == 2) {
+        delete[] nextlevel;
+    }
+}
+
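The low-word/inverse technique described in the comment above can be checked in isolation. Below is a minimal standalone sketch (illustrative only, not part of this commit): `modinv64` is a made-up stand-in for the role `inverse_value_t` plays in the real code, computing the inverse of an odd value modulo 2^64, and the final assertion shows why scaling every party's low word by that inverse makes the reconstructed unit value exactly 1.

    #include <cassert>
    #include <cstdint>

    // Inverse of an odd value modulo 2^64 by Newton iteration; each step
    // doubles the number of correct low-order bits (3 -> 6 -> ... -> 96).
    static uint64_t modinv64(uint64_t a) {
        assert(a & 1);          // only odd values are invertible mod 2^64
        uint64_t x = a;         // a*a == 1 (mod 8), so x is correct to 3 bits
        for (int i = 0; i < 5; ++i) {
            x *= 2 - a * x;
        }
        return x;
    }

    int main() {
        // Stand-ins for the two players' low-word sums; P1 negates its sum,
        // and the flag bit of the single differing leaf makes the total odd.
        uint64_t p0_low_sum = 0x123456789abcdef5ULL;
        uint64_t p1_low_sum = -uint64_t(0x123456789abcdef0ULL);
        uint64_t low_sum = p0_low_sum + p1_low_sum;   // odd (here, 5)
        uint64_t unit_sum_inverse = modinv64(low_sum);
        // Scaling every low word by unit_sum_inverse scales the grand total
        // to exactly 1 mod 2^64, giving an additive sharing of a unit vector.
        assert(unit_sum_inverse * low_sum == 1);
        return 0;
    }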
+// Get the leaf node for the given input
+template <nbits_t WIDTH>
+DPFnode RDPF<WIDTH>::leaf(address_t input, size_t &aes_ops) const
+{
+    // If we have a precomputed expansion, just use it
+    if (expansion.size()) {
+        return expansion[input];
+    }
+
+    nbits_t totdepth = depth();
+    DPFnode node = seed;
+    for (nbits_t d=0;d<totdepth;++d) {
+        bit_t dir = !!(input & (address_t(1)<<(totdepth-d-1)));
+        node = descend(node, d, dir, aes_ops);
+    }
+    return node;
+}
+
+// Expand the DPF if it's not already expanded
+//
+// This routine is slightly more efficient than repeatedly calling
+// StreamEval::next(), but it uses a lot more memory.
+template <nbits_t WIDTH>
+void RDPF<WIDTH>::expand(size_t &aes_ops)
+{
+    nbits_t depth = this->depth();
+    size_t num_leaves = size_t(1)<<depth;
+    if (expansion.size() == num_leaves) return;
+    expansion.resize(num_leaves);
+    address_t index = 0;
+    address_t lastindex = 0;
+    DPFnode *path = new DPFnode[depth];
+    path[0] = seed;
+    for (nbits_t i=1;i<depth;++i) {
+        path[i] = descend(path[i-1], i-1, 0, aes_ops);
+    }
+    expansion[index++] = descend(path[depth-1], depth-1, 0, aes_ops);
+    expansion[index++] = descend(path[depth-1], depth-1, 1, aes_ops);
+    while(index < num_leaves) {
+        // Invariant: lastindex and index will both be even, and
+        // index=lastindex+2
+        uint64_t index_xor = index ^ lastindex;
+        nbits_t how_many_1_bits = __builtin_popcountll(index_xor);
+        // If lastindex -> index goes for example from (in binary)
+        // 010010110 -> 010011000, then index_xor will be
+        // 000001110 and how_many_1_bits will be 3.
+        // That indicates that path[depth-3] was a left child, and now
+        // we need to change it to a right child by descending right
+        // from path[depth-4], and then filling the path after that with
+        // left children.
+        path[depth-how_many_1_bits] =
+            descend(path[depth-how_many_1_bits-1],
+                depth-how_many_1_bits-1, 1, aes_ops);
+        for (nbits_t i = depth-how_many_1_bits; i < depth-1; ++i) {
+            path[i+1] = descend(path[i], i, 0, aes_ops);
+        }
+        lastindex = index;
+        expansion[index++] = descend(path[depth-1], depth-1, 0, aes_ops);
+        expansion[index++] = descend(path[depth-1], depth-1, 1, aes_ops);
+    }
+
+    delete[] path;
+}
+
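The path-reuse step above depends on the shape of index ^ lastindex when an even counter is stepped by 2. A small standalone check of that invariant (illustrative only, not part of this commit):

    #include <cassert>
    #include <cstdint>

    int main() {
        const int depth = 9;
        uint64_t lastindex = 0;
        for (uint64_t index = 2; index < (uint64_t(1) << depth); index += 2) {
            uint64_t index_xor = index ^ lastindex;
            int how_many_1_bits = __builtin_popcountll(index_xor);
            // The changed bits form a contiguous run of ones just above the
            // (always zero) low bit, so only the deepest how_many_1_bits
            // entries of path[] need recomputing: one right child, then all
            // left children below it.
            assert(index_xor == (uint64_t(1) << (how_many_1_bits + 1)) - 2);
            assert(how_many_1_bits <= depth - 1);
            lastindex = index;
        }
        return 0;
    }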
+// Construct three RDPFs of the given depth all with the same randomly
+// generated target index.
+template <nbits_t WIDTH>
+RDPFTriple<WIDTH>::RDPFTriple(MPCTIO &tio, yield_t &yield,
+    nbits_t depth, bool save_expansion)
+{
+    // Pick a random XOR share of the target
+    xs_target.randomize(depth);
+
+    // Now create three RDPFs with that target, and also convert the XOR
+    // shares of the target to additive shares
+    std::vector<coro_t> coroutines;
+    for (int i=0;i<3;++i) {
+        coroutines.emplace_back(
+            [this, &tio, depth, i, save_expansion](yield_t &yield) {
+                dpf[i] = RDPF<WIDTH>(tio, yield, xs_target, depth,
+                    save_expansion);
+            });
+    }
+    coroutines.emplace_back(
+        [this, &tio, depth](yield_t &yield) {
+            mpc_xs_to_as(tio, yield, as_target, xs_target, depth, false);
+        });
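+    // Run the three DPF generations and the share conversion
+    // concurrently, so that their communication is bundled into the
+    // same rounds.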
+    run_coroutines(yield, coroutines);
+}
+
+template <nbits_t WIDTH>
+typename RDPFTriple<WIDTH>::node RDPFTriple<WIDTH>::descend(
+    const RDPFTriple<WIDTH>::node &parent,
+    nbits_t parentdepth, bit_t whichchild,
+    size_t &aes_ops) const
+{
+    auto [P0, P1, P2] = parent;
+    DPFnode C0, C1, C2;
+    C0 = dpf[0].descend(P0, parentdepth, whichchild, aes_ops);
+    C1 = dpf[1].descend(P1, parentdepth, whichchild, aes_ops);
+    C2 = dpf[2].descend(P2, parentdepth, whichchild, aes_ops);
+    return std::make_tuple(C0,C1,C2);
+}
+
+template <nbits_t WIDTH>
+typename RDPFPair<WIDTH>::node RDPFPair<WIDTH>::descend(
+    const RDPFPair<WIDTH>::node &parent,
+    nbits_t parentdepth, bit_t whichchild,
+    size_t &aes_ops) const
+{
+    auto [P0, P1] = parent;
+    DPFnode C0, C1;
+    C0 = dpf[0].descend(P0, parentdepth, whichchild, aes_ops);
+    C1 = dpf[1].descend(P1, parentdepth, whichchild, aes_ops);
+    return std::make_tuple(C0,C1);
+}
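For evaluating at a single point, these descend() routines compose exactly like RDPF<WIDTH>::leaf() above. A minimal sketch (illustrative only, not part of this commit), assuming the declarations from rdpf.hpp and a hypothetical pair_seed argument standing in for however the caller obtains the pair's root node, which is not shown in this hunk:

    // Evaluate an RDPFPair at one input by walking the bits of the input
    // from most to least significant, descending both DPFs in lockstep.
    template <nbits_t WIDTH>
    typename RDPFPair<WIDTH>::node pair_eval_point(const RDPFPair<WIDTH> &dp,
        const typename RDPFPair<WIDTH>::node &pair_seed, nbits_t depth,
        address_t input, size_t &aes_ops)
    {
        typename RDPFPair<WIDTH>::node node = pair_seed;
        for (nbits_t d = 0; d < depth; ++d) {
            bit_t dir = !!(input & (address_t(1) << (depth-d-1)));
            node = dp.descend(node, d, dir, aes_ops);
        }
        return node;
    }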

+ 2 - 2
types.hpp

@@ -643,9 +643,9 @@ struct ValSelectTripleName { static constexpr const char *name = "s"; };
 
 // These are defined in rdpf.hpp, but declared here to avoid cyclic
 // header dependencies.
-struct RDPFPair;
+template <nbits_t WIDTH> struct RDPFPair;
 struct RDPFPairName { static constexpr const char *name = "r"; };
-struct RDPFTriple;
+template <nbits_t WIDTH> struct RDPFTriple;
 struct RDPFTripleName { static constexpr const char *name = "r"; };
 struct CDPF;
 struct CDPFName { static constexpr const char *name = "c"; };