
Create wide RDPFs

Ian Goldberg 1 year ago
commit ff2653d6ea
8 changed files with 578 additions and 360 deletions
  1. Makefile (+22 -21)
  2. bitutils.hpp (+19 -0)
  3. mpcops.hpp (+13 -0)
  4. mpcops.tcc (+16 -0)
  5. prg.hpp (+2 -2)
  6. rdpf.hpp (+2 -1)
  7. rdpf.tcc (+487 -336)
  8. types.hpp (+17 -0)

+ 22 - 21
Makefile

@@ -31,26 +31,27 @@ depend:
 
 # DO NOT DELETE THIS LINE -- make depend depends on it.
 
-prac.o: mpcio.hpp types.hpp corotypes.hpp mpcio.tcc preproc.hpp options.hpp
-prac.o: online.hpp
-mpcio.o: mpcio.hpp types.hpp corotypes.hpp mpcio.tcc rdpf.hpp coroutine.hpp
-mpcio.o: bitutils.hpp dpf.hpp prg.hpp aes.hpp rdpf.tcc mpcops.hpp cdpf.hpp
-mpcio.o: cdpf.tcc
-preproc.o: types.hpp coroutine.hpp corotypes.hpp mpcio.hpp mpcio.tcc
-preproc.o: preproc.hpp options.hpp rdpf.hpp bitutils.hpp dpf.hpp prg.hpp
-preproc.o: aes.hpp rdpf.tcc mpcops.hpp cdpf.hpp cdpf.tcc
-online.o: online.hpp mpcio.hpp types.hpp corotypes.hpp mpcio.tcc options.hpp
-online.o: mpcops.hpp coroutine.hpp rdpf.hpp bitutils.hpp dpf.hpp prg.hpp
-online.o: aes.hpp rdpf.tcc duoram.hpp duoram.tcc cdpf.hpp cdpf.tcc cell.hpp
-mpcops.o: mpcops.hpp types.hpp mpcio.hpp corotypes.hpp mpcio.tcc
-mpcops.o: coroutine.hpp bitutils.hpp
-rdpf.o: rdpf.hpp mpcio.hpp types.hpp corotypes.hpp mpcio.tcc coroutine.hpp
-rdpf.o: bitutils.hpp dpf.hpp prg.hpp aes.hpp rdpf.tcc mpcops.hpp
+prac.o: mpcio.hpp types.hpp bitutils.hpp corotypes.hpp mpcio.tcc preproc.hpp
+prac.o: options.hpp online.hpp
+mpcio.o: mpcio.hpp types.hpp bitutils.hpp corotypes.hpp mpcio.tcc rdpf.hpp
+mpcio.o: coroutine.hpp dpf.hpp prg.hpp aes.hpp rdpf.tcc mpcops.hpp mpcops.tcc
+mpcio.o: cdpf.hpp cdpf.tcc
+preproc.o: types.hpp bitutils.hpp coroutine.hpp corotypes.hpp mpcio.hpp
+preproc.o: mpcio.tcc preproc.hpp options.hpp rdpf.hpp dpf.hpp prg.hpp aes.hpp
+preproc.o: rdpf.tcc mpcops.hpp mpcops.tcc cdpf.hpp cdpf.tcc
+online.o: online.hpp mpcio.hpp types.hpp bitutils.hpp corotypes.hpp mpcio.tcc
+online.o: options.hpp mpcops.hpp coroutine.hpp mpcops.tcc rdpf.hpp dpf.hpp
+online.o: prg.hpp aes.hpp rdpf.tcc duoram.hpp duoram.tcc cdpf.hpp cdpf.tcc
+online.o: cell.hpp
+mpcops.o: mpcops.hpp types.hpp bitutils.hpp mpcio.hpp corotypes.hpp mpcio.tcc
+mpcops.o: coroutine.hpp mpcops.tcc
+rdpf.o: rdpf.hpp mpcio.hpp types.hpp bitutils.hpp corotypes.hpp mpcio.tcc
+rdpf.o: coroutine.hpp dpf.hpp prg.hpp aes.hpp rdpf.tcc mpcops.hpp mpcops.tcc
 cdpf.o: bitutils.hpp cdpf.hpp mpcio.hpp types.hpp corotypes.hpp mpcio.tcc
 cdpf.o: coroutine.hpp dpf.hpp prg.hpp aes.hpp cdpf.tcc
-duoram.o: duoram.hpp types.hpp mpcio.hpp corotypes.hpp mpcio.tcc
-duoram.o: coroutine.hpp duoram.tcc mpcops.hpp cdpf.hpp dpf.hpp prg.hpp
-duoram.o: bitutils.hpp aes.hpp cdpf.tcc rdpf.hpp rdpf.tcc
-cell.o: types.hpp duoram.hpp mpcio.hpp corotypes.hpp mpcio.tcc coroutine.hpp
-cell.o: duoram.tcc mpcops.hpp cdpf.hpp dpf.hpp prg.hpp bitutils.hpp aes.hpp
-cell.o: cdpf.tcc rdpf.hpp rdpf.tcc cell.hpp options.hpp
+duoram.o: duoram.hpp types.hpp bitutils.hpp mpcio.hpp corotypes.hpp mpcio.tcc
+duoram.o: coroutine.hpp duoram.tcc mpcops.hpp mpcops.tcc cdpf.hpp dpf.hpp
+duoram.o: prg.hpp aes.hpp cdpf.tcc rdpf.hpp rdpf.tcc
+cell.o: types.hpp bitutils.hpp duoram.hpp mpcio.hpp corotypes.hpp mpcio.tcc
+cell.o: coroutine.hpp duoram.tcc mpcops.hpp mpcops.tcc cdpf.hpp dpf.hpp
+cell.o: prg.hpp aes.hpp cdpf.tcc rdpf.hpp rdpf.tcc cell.hpp options.hpp

+ 19 - 0
bitutils.hpp

@@ -5,6 +5,7 @@
 #ifndef __BITUTILS_HPP__
 #define __BITUTILS_HPP__
 
+#include <array>
 #include <cstdint>
 #include <x86intrin.h>  // SSE and AVX intrinsics
 
@@ -42,12 +43,30 @@ inline __m128i xor_if(const __m128i & block1, const __m128i & block2, bool flag)
     return _mm_xor_si128(block1, _mm_and_si128(block2, if128_mask[flag ? 1 : 0]));
 }
 
+template <size_t LWIDTH>
+inline std::array<__m128i,LWIDTH> xor_if(
+    const std::array<__m128i,LWIDTH> & block1,
+    const std::array<__m128i,LWIDTH> & block2, bool flag)
+{
+    std::array<__m128i,LWIDTH> res;
+    for (size_t j=0;j<LWIDTH;++j) {
+        res[j] = xor_if(block1[j], block2[j], flag);
+    }
+    return res;
+}
+
 inline uint8_t get_lsb(const __m128i & block, uint8_t bits = 0b01)
 {
     __m128i vcmp = _mm_xor_si128(_mm_and_si128(block, lsb128_mask[bits]), lsb128_mask[bits]);
     return static_cast<uint8_t>(_mm_testz_si128(vcmp, vcmp));
 }
 
+template <size_t LWIDTH>
+inline uint8_t get_lsb(const std::array<__m128i,LWIDTH> & block)
+{
+    return get_lsb(block[0]);
+}
+
 inline __m128i clear_lsb(const __m128i & block, uint8_t bits = 0b01)
 {
     return _mm_and_si128(block, lsb128_mask_inv[bits]);

+ 13 - 0
mpcops.hpp

@@ -110,6 +110,17 @@ void mpc_xs_to_as(MPCTIO &tio, yield_t &yield,
 void mpc_reconstruct_choice(MPCTIO &tio, yield_t &yield,
     DPFnode &z, RegBS f, DPFnode x, DPFnode y);
 
+// As above, but for arrays of DPFnode
+//
+// Cost:
+// 6*LWIDTH 64-bit words sent in 2 messages
+// consumes LWIDTH AndTriples
+template <size_t LWIDTH>
+void mpc_reconstruct_choice(MPCTIO &tio, yield_t &yield,
+    std::array<DPFnode,LWIDTH> &z, RegBS f,
+    const std::array<DPFnode,LWIDTH> &x,
+    const std::array<DPFnode,LWIDTH> &y);
+
 // P0 and P1 hold bit shares of x and y.  Set z to bit shares of x & y.
 //
 // Cost:
@@ -126,4 +137,6 @@ void mpc_and(MPCTIO &tio, yield_t &yield,
 void mpc_or(MPCTIO &tio, yield_t &yield,
     RegBS &z, RegBS x, RegBS y);
 
+#include "mpcops.tcc"
+
 #endif

+ 16 - 0
mpcops.tcc

@@ -0,0 +1,16 @@
+template <size_t LWIDTH>
+void mpc_reconstruct_choice(MPCTIO &tio, yield_t &yield,
+    std::array<DPFnode,LWIDTH> &z, RegBS f,
+    const std::array<DPFnode,LWIDTH> &x,
+    const std::array<DPFnode,LWIDTH> &y)
+{
+    std::vector<coro_t> coroutines;
+    for (size_t j=0;j<LWIDTH;++j) {
+        coroutines.emplace_back(
+            [&tio, &z, f, &x, &y, j](yield_t &yield) {
+                mpc_reconstruct_choice(tio, yield, z[j],
+                    f, x[j], y[j]);
+            });
+    }
+    run_coroutines(yield, coroutines);
+}

+ 2 - 2
prg.hpp

@@ -50,7 +50,7 @@ static inline void prgboth(__m128i &left, __m128i &right, __m128i seed,
 // Compute one of the leaf children of node seed; whichchild=0 for
 // the left child, 1 for the right child
 template <size_t LWIDTH>
-static inline void prgleaf(std::array<__m128i,LWIDTH> &out,
+static inline void prg(std::array<__m128i,LWIDTH> &out,
     __m128i seed, bool whichchild, size_t &aes_ops)
 {
     __m128i in = set_lsb(seed, whichchild);
@@ -73,7 +73,7 @@ static inline void prgleaf(std::array<__m128i,LWIDTH> &out,
 
 // Compute both of the leaf children of node seed
 template <size_t LWIDTH>
-static inline void prgleafboth(std::array<__m128i,LWIDTH> &left,
+static inline void prgboth(std::array<__m128i,LWIDTH> &left,
     std::array<__m128i,LWIDTH> &right, __m128i seed, size_t &aes_ops)
 {
     __m128i inl = set_lsb(seed, 0);

+ 2 - 1
rdpf.hpp

@@ -45,6 +45,7 @@ struct RDPF : public DPF {
     // one leaf level (at the bottom), but incremental RDPFs have a leaf
     // level for each level of the DPF.
     struct LeafInfo {
+        static const nbits_t W = WIDTH;
         // The correction word for this leaf level
         LeafNode leaf_cw;
         // The amount we have to scale the low words of the leaf values by
@@ -153,7 +154,7 @@ struct RDPF : public DPF {
         value_t highword =
             value_t(_mm_cvtsi128_si64x(_mm_srli_si128(leaf[0],8)));
         x[j++].xshare = highword;
-        for (nbits_t i=1;i<WIDTH;++i) {
+        for (nbits_t i=1;i<LWIDTH;++i) {
             value_t lowword =
                 value_t(_mm_cvtsi128_si64x(leaf[i]));
             value_t highword =

+ 487 - 336
rdpf.tcc

@@ -191,7 +191,7 @@ inline typename RDPF<WIDTH>::LeafNode RDPF<WIDTH>::descend_to_leaf(
 {
     typename RDPF<WIDTH>::LeafNode prgout;
     bool flag = get_lsb(parent);
-    prgleaf(prgout, parent, whichchild, aes_ops);
+    prg(prgout, parent, whichchild, aes_ops);
     if (flag) {
         LeafNode CW = li[0].leaf_cw;
         LeafNode CWR = CW;
@@ -335,6 +335,463 @@ T& operator>>(T &is, RDPFPair<WIDTH> &rdpfpair)
     return is;
 }
 
+// Set a DPFnode to zero
+static inline void zero(DPFnode &z)
+{
+    z = _mm_setzero_si128();
+}
+
+// Set a LeafNode to zero
+template <size_t LWIDTH>
+static inline void zero(std::array<DPFnode,LWIDTH> &z)
+{
+    for (size_t j=0;j<LWIDTH;++j) {
+        zero(z[j]);
+    }
+}
+
+// Set an array of value_t to zero
+template <size_t WIDTH>
+static inline void zero(std::array<value_t,WIDTH> &z)
+{
+    for (size_t j=0;j<WIDTH;++j) {
+        z[j] = 0;
+    }
+}
+
+
+// Expand a level of the RDPF into the next level without threads. This
+// just computes the PRGs without computing or applying the correction
+// words.  L and R will be set to the XORs of the left children and the
+// XORs of the right children respectively. NT will be LeafNode if we
+// are expanding into a leaf level, DPFnode if not.
+template <typename NT>
+static inline void expand_level_nothreads(size_t start, size_t end,
+    const DPFnode *curlevel, NT *nextlevel, NT &L, NT &R,
+    size_t &aes_ops)
+{
+    // Only touch registers in the inner loop if possible
+    NT lL, lR;
+    zero(lL);
+    zero(lR);
+    size_t laes_ops = 0;
+    for(size_t i=start;i<end;++i) {
+        NT lchild, rchild;
+        prgboth(lchild, rchild, curlevel[i], laes_ops);
+        lL ^= lchild;
+        lR ^= rchild;
+        nextlevel[2*i] = lchild;
+        nextlevel[2*i+1] = rchild;
+    }
+    L = lL;
+    R = lR;
+    aes_ops += laes_ops;
+}
+
+// As above, but possibly use threads, based on the RDPF_MTGEN_TIMING_1
+// timing benchmarks
+template <typename NT>
+static inline void expand_level(int max_nthreads, nbits_t level,
+    const DPFnode *curlevel, NT *nextlevel, NT &L, NT &R,
+    size_t &aes_ops)
+{
+    size_t curlevel_size = (size_t(1)<<level);
+    if (max_nthreads == 1 || level < 19) {
+        // No threading
+        expand_level_nothreads(0, curlevel_size,
+            curlevel, nextlevel, L, R, aes_ops);
+    } else {
+        int nthreads =
+            int(ceil(sqrt(double(curlevel_size/6000))));
+        if (nthreads > max_nthreads) {
+            nthreads = max_nthreads;
+        }
+        NT tL[nthreads];
+        NT tR[nthreads];
+        size_t taes_ops[nthreads];
+        size_t threadstart = 0;
+        size_t threadchunk = curlevel_size / nthreads;
+        size_t threadextra = curlevel_size % nthreads;
+        boost::asio::thread_pool pool(nthreads);
+        for (int t=0;t<nthreads;++t) {
+            size_t threadsize = threadchunk + (size_t(t) < threadextra);
+            size_t threadend = threadstart + threadsize;
+            taes_ops[t] = 0;
+            boost::asio::post(pool,
+                [t, &tL, &tR, &taes_ops, threadstart, threadend,
+                &curlevel, &nextlevel] {
+                    expand_level_nothreads(threadstart, threadend,
+                        curlevel, nextlevel, tL[t], tR[t], taes_ops[t]);
+                });
+            threadstart = threadend;
+        }
+        pool.join();
+        // Again work on registers as much as possible
+        NT lL, lR;
+        zero(lL);
+        zero(lR);
+        size_t laes_ops = 0;
+        for (int t=0;t<nthreads;++t) {
+            lL ^= tL[t];
+            lR ^= tR[t];
+            laes_ops += taes_ops[t];
+        }
+        L = lL;
+        R = lR;
+        aes_ops += laes_ops;
+    }
+}
+
+// Apply the correction words to an expanded non-leaf level (nextlevel),
+// based on the flag bits in curlevel. This version does not use
+// threads.
+static inline void finalize_nonleaf_layer_nothreads(size_t start,
+    size_t end, const DPFnode *curlevel, DPFnode *nextlevel,
+    DPFnode CWL, DPFnode CWR)
+{
+    for(size_t i=start;i<end;++i) {
+        bool flag = get_lsb(curlevel[i]);
+        nextlevel[2*i] = xor_if(nextlevel[2*i], CWL, flag);
+        nextlevel[2*i+1] = xor_if(nextlevel[2*i+1], CWR, flag);
+    }
+}
+
+// As above, but possibly use threads, based on the RDPF_MTGEN_TIMING_1
+// timing benchmarks.  The timing of each iteration of the inner loop is
+// comparable to the above, so just use the same computations.  All of
+// this could be tuned, of course.
+static inline void finalize_nonleaf_layer(int max_nthreads, nbits_t level,
+    const DPFnode *curlevel, DPFnode *nextlevel, DPFnode CWL,
+    DPFnode CWR)
+{
+    size_t curlevel_size = (size_t(1)<<level);
+    if (max_nthreads == 1 || level < 19) {
+        // No threading
+        finalize_nonleaf_layer_nothreads(0, curlevel_size,
+            curlevel, nextlevel, CWL, CWR);
+    } else {
+        int nthreads =
+            int(ceil(sqrt(double(curlevel_size/6000))));
+        if (nthreads > max_nthreads) {
+            nthreads = max_nthreads;
+        }
+        size_t threadstart = 0;
+        size_t threadchunk = curlevel_size / nthreads;
+        size_t threadextra = curlevel_size % nthreads;
+        boost::asio::thread_pool pool(nthreads);
+        for (int t=0;t<nthreads;++t) {
+            size_t threadsize = threadchunk + (size_t(t) < threadextra);
+            size_t threadend = threadstart + threadsize;
+            boost::asio::post(pool,
+                [threadstart, threadend, CWL, CWR,
+                &curlevel, &nextlevel] {
+                    finalize_nonleaf_layer_nothreads(threadstart, threadend,
+                        curlevel, nextlevel, CWL, CWR);
+                });
+            threadstart = threadend;
+        }
+        pool.join();
+    }
+}
+
+// Finalize a leaf layer. This applies the correction words, and
+// computes the low and high sums and XORs.  This version does not use
+// threads.  You can pass save_expansion = false here if you don't need
+// to save the expansion.  LN is a LeafNode.
+template <size_t WIDTH, typename LN>
+static inline void finalize_leaf_layer_nothreads(size_t start,
+    size_t end, const DPFnode *curlevel, LN *nextlevel,
+    bool save_expansion, LN CWL, LN CWR, value_t &low_sum,
+    std::array<value_t,WIDTH> &high_sum,
+    std::array<value_t,WIDTH> &high_xor)
+{
+    value_t llow_sum = 0;
+    std::array<value_t,WIDTH> lhigh_sum;
+    std::array<value_t,WIDTH> lhigh_xor;
+    zero(lhigh_sum);
+    zero(lhigh_xor);
+    for(size_t i=start;i<end;++i) {
+        bool flag = get_lsb(curlevel[i]);
+        LN leftchild = xor_if(nextlevel[2*i], CWL, flag);
+        LN rightchild = xor_if(nextlevel[2*i+1], CWR, flag);
+        if (save_expansion) {
+            nextlevel[2*i] = leftchild;
+            nextlevel[2*i+1] = rightchild;
+        }
+        value_t leftlow = value_t(_mm_cvtsi128_si64x(leftchild[0]));
+        value_t rightlow = value_t(_mm_cvtsi128_si64x(rightchild[0]));
+        value_t lefthigh =
+            value_t(_mm_cvtsi128_si64x(_mm_srli_si128(leftchild[0],8)));
+        value_t righthigh =
+            value_t(_mm_cvtsi128_si64x(_mm_srli_si128(rightchild[0],8)));
+        llow_sum += (leftlow + rightlow);
+        lhigh_sum[0] += (lefthigh + righthigh);
+        lhigh_xor[0] ^= (lefthigh ^ righthigh);
+        size_t w = 0;
+        for (size_t j=1; j<WIDTH; j+=2) {
+            ++w;
+            value_t leftlow = value_t(_mm_cvtsi128_si64x(leftchild[w]));
+            value_t rightlow = value_t(_mm_cvtsi128_si64x(rightchild[w]));
+            value_t lefthigh =
+                value_t(_mm_cvtsi128_si64x(_mm_srli_si128(leftchild[w],8)));
+            value_t righthigh =
+                value_t(_mm_cvtsi128_si64x(_mm_srli_si128(rightchild[w],8)));
+            lhigh_sum[j] += (leftlow + rightlow);
+            lhigh_xor[j] ^= (leftlow ^ rightlow);
+            if (j+1 < WIDTH) {
+                lhigh_sum[j+1] += (lefthigh + righthigh);
+                lhigh_xor[j+1] ^= (lefthigh ^ righthigh);
+            }
+        }
+    }
+    low_sum = llow_sum;
+    high_sum = lhigh_sum;
+    high_xor = lhigh_xor;
+}
+
+// As above, but possibly use threads, based on the RDPF_MTGEN_TIMING_1
+// timing benchmarks.  The timing of each iteration of the inner loop is
+// comparable to the above, so just use the same computations.  All of
+// this could be tuned, of course.
+template <size_t WIDTH, typename LN>
+static inline void finalize_leaf_layer(int max_nthreads, nbits_t level,
+    const DPFnode *curlevel, LN *nextlevel, bool save_expansion,
+    LN CWL, LN CWR, value_t &low_sum,
+    std::array<value_t,WIDTH> &high_sum,
+    std::array<value_t,WIDTH> &high_xor)
+{
+    size_t curlevel_size = (size_t(1)<<level);
+    if (max_nthreads == 1 || level < 19) {
+        // No threading
+        finalize_leaf_layer_nothreads(0, curlevel_size,
+            curlevel, nextlevel, save_expansion, CWL, CWR,
+            low_sum, high_sum, high_xor);
+    } else {
+        int nthreads =
+            int(ceil(sqrt(double(curlevel_size/6000))));
+        if (nthreads > max_nthreads) {
+            nthreads = max_nthreads;
+        }
+        value_t tlow_sum[nthreads];
+        std::array<value_t,WIDTH> thigh_sum[nthreads];
+        std::array<value_t,WIDTH> thigh_xor[nthreads];
+        size_t threadstart = 0;
+        size_t threadchunk = curlevel_size / nthreads;
+        size_t threadextra = curlevel_size % nthreads;
+        boost::asio::thread_pool pool(nthreads);
+        for (int t=0;t<nthreads;++t) {
+            size_t threadsize = threadchunk + (size_t(t) < threadextra);
+            size_t threadend = threadstart + threadsize;
+            boost::asio::post(pool,
+                [t, &tlow_sum, &thigh_sum, &thigh_xor, threadstart, threadend,
+                &curlevel, &nextlevel, CWL, CWR, save_expansion] {
+                    finalize_leaf_layer_nothreads(threadstart, threadend,
+                        curlevel, nextlevel, save_expansion, CWL, CWR,
+                        tlow_sum[t], thigh_sum[t], thigh_xor[t]);
+                });
+            threadstart = threadend;
+        }
+        pool.join();
+        low_sum = 0;
+        zero(high_sum);
+        zero(high_xor);
+        for (int t=0;t<nthreads;++t) {
+            low_sum += tlow_sum[t];
+            high_sum += thigh_sum[t];
+            high_xor ^= thigh_xor[t];
+        }
+    }
+}
+
+
+
+// Create one level of the RDPF.  NT will be as above: LeafNode if we
+// are expanding into a leaf level, DPFnode if not.  LI will be LeafInfo
+// if we are expanding into a leaf level, and it is unused otherwise.
+template<typename NT, typename LI>
+static inline void create_level(MPCTIO &tio, yield_t &yield,
+    const DPFnode *curlevel, NT *nextlevel,
+    int player, nbits_t level, nbits_t depth, RegBS bs_choice, NT &CW,
+    bool &cfbit, bool save_expansion, LI &li, size_t &aes_ops)
+{
+    // tio.cpu_nthreads() is the maximum number of threads we
+    // have available.
+    int max_nthreads = tio.cpu_nthreads();
+
+    NT L, R;
+    zero(L);
+    zero(R);
+    // The server doesn't need to do this computation, but it does
+    // need to execute mpc_reconstruct_choice so that it sends
+    // the AndTriples at the appropriate time.
+    if (player < 2) {
+#ifdef RDPF_MTGEN_TIMING_1
+        if (player == 0) {
+            mtgen_timetest_1(level, 0, (1<<23)>>level, curlevel,
+                nextlevel, aes_ops);
+            size_t niters = 2048;
+            if (level > 8) niters = (1<<20)>>level;
+            for(int t=1;t<=8;++t) {
+                mtgen_timetest_1(level, t, niters, curlevel,
+                    nextlevel, aes_ops);
+            }
+            mtgen_timetest_1(level, 0, (1<<23)>>level, curlevel,
+                nextlevel, aes_ops);
+        }
+#endif
+        // Using the timing results gathered above, decide whether
+        // to multithread, and if so, how many threads to use.
+        expand_level(max_nthreads, level, curlevel, nextlevel,
+            L, R, aes_ops);
+    }
+
+    // If we're going left (bs_choice = 0), we want the correction
+    // word to be the XOR of our right side and our peer's right
+    // side; if bs_choice = 1, it should be the XOR of our left side
+    // and our peer's left side.
+
+    // We also have to ensure that the flag bits (the lsb) of the
+    // side that will end up the same be of course the same, but
+    // also that the flag bits (the lsb) of the side that will end
+    // up different _must_ be different.  That is, it's not enough
+    // for the nodes of the child selected by choice to be different
+    // as 128-bit values; they also have to be different in their
+    // lsb.
+
+    // This is where we make a small optimization over Appendix C of
+    // the Duoram paper: instead of keeping separate correction flag
+    // bits for the left and right children, we observe that the low
+    // bit of the overall correction word effectively serves as one
+    // of those bits, so we just need to store one extra bit per
+    // level, not two.  (We arbitrarily choose the one for the right
+    // child.)
+
+    // Note that the XOR of our left and right child before and
+    // after applying the correction word won't change, since the
+    // correction word is applied to either both children or
+    // neither, depending on the value of the parent's flag. So in
+    // particular, the XOR of the flag bits won't change, and if our
+    // children's flag's XOR equals our peer's children's flag's
+    // XOR, then we won't have different flag bits even for the
+    // children that have different 128-bit values.
+
+    // So we compute our_parity = lsb(L^R)^player, and we XOR that
+    // into the R value in the correction word computation.  At the
+    // same time, we exchange these parity values to compute the
+    // combined parity, which we store in the DPF.  Then when the
+    // DPF is evaluated, if the parent's flag is set, not only apply
+    // the correction word to both children, but also apply the
+    // (combined) parity bit to just the right child.  Then for
+    // unequal nodes (where the flag bit is different), exactly one
+    // of the four children (two for P0 and two for P1) will have
+    // the parity bit applied, which will set the XOR of the lsb of
+    // those four nodes to just L0^R0^L1^R1^our_parity^peer_parity
+    // = 1 because everything cancels out except player (for which
+    // one player is 0 and the other is 1).
+
+    bool our_parity_bit = get_lsb(L) ^ get_lsb(R) ^ !!player;
+    xor_lsb(R, our_parity_bit);
+
+    NT CWL;
+    bool peer_parity_bit;
+    // Exchange the parities and do mpc_reconstruct_choice at the
+    // same time (bundled into the same rounds)
+    run_coroutines(yield,
+        [&tio, &our_parity_bit, &peer_parity_bit](yield_t &yield) {
+            tio.queue_peer(&our_parity_bit, 1);
+            yield();
+            uint8_t peer_parity_byte;
+            tio.recv_peer(&peer_parity_byte, 1);
+            peer_parity_bit = peer_parity_byte & 1;
+        },
+        [&tio, &CWL, &L, &R, bs_choice](yield_t &yield) {
+            mpc_reconstruct_choice(tio, yield, CWL, bs_choice, R, L);
+        });
+    cfbit = our_parity_bit ^ peer_parity_bit;
+    CW = CWL;
+    NT CWR = CWL;
+    xor_lsb(CWR, cfbit);
+    if (player < 2) {
+        // The timing of each iteration of the inner loop is
+        // comparable to the above, so just use the same
+        // computations.  All of this could be tuned, of course.
+
+        if constexpr (std::is_same_v<NT, DPFnode>) {
+            finalize_nonleaf_layer(max_nthreads, level, curlevel,
+                nextlevel, CWL, CWR);
+        } else {
+            // Recall there are four potentially useful vectors that
+            // can come out of a DPF:
+            // - (single-bit) bitwise unit vector
+            // - additive-shared unit vector
+            // - XOR-shared scaled unit vector
+            // - additive-shared scaled unit vector
+            //
+            // (No single DPF should be used for both of the first
+            // two or both of the last two, though, since they're
+            // correlated; you _can_ use one of the first two and
+            // one of the last two.)
+            //
+            // For each 128-bit leaf, the low bit is the flag bit,
+            // and we're guaranteed that the flag bits (and indeed
+            // the whole 128-bit value) for P0 and P1 are the same
+            // for every leaf except the target, and that the flag
+            // bits definitely differ for the target (and the other
+            // 127 bits are independently random on each side).
+            //
+            // We divide the 128-bit leaf into a low 64-bit word and
+            // a high 64-bit word.  We use the low word for the unit
+            // vector and the high word for the scaled vector; this
+            // choice is not arbitrary: the flag bit in the low word
+            // means that the sum of all the low words (with P1's
+            // low words negated) across both P0 and P1 is
+            // definitely odd, so we can compute that sum's inverse
+            // mod 2^64, and store it now during precomputation.  At
+            // evaluation time for the additive-shared unit vector,
+            // we will output this global inverse times the low word
+            // of each leaf, which will make the sum of all of those
+            // values 1.  (This technique replaces the protocol in
+            // Appendix D of the Duoram paper.)
+            //
+            // For the scaled vector, we just have to compute shares
+            // of what the scaled vector is a sharing _of_, but
+            // that's just XORing or adding all of each party's
+            // local high words; no communication needed.
+
+            value_t low_sum;
+            const size_t WIDTH = LI::W;
+            std::array<value_t,WIDTH> high_sum;
+            std::array<value_t,WIDTH> high_xor;
+            finalize_leaf_layer(max_nthreads, level, curlevel,
+                nextlevel, save_expansion, CWL, CWR, low_sum, high_sum,
+                high_xor);
+
+            if (player == 1) {
+                low_sum = -low_sum;
+                for(size_t j=0; j<WIDTH; ++j) {
+                    high_sum[j] = -high_sum[j];
+                }
+            }
+            for(size_t j=0; j<WIDTH; ++j) {
+                li.scaled_sum[j].ashare = high_sum[j];
+                li.scaled_xor[j].xshare = high_xor[j];
+            }
+            // Exchange low_sum and add them up
+            tio.queue_peer(&low_sum, sizeof(low_sum));
+            yield();
+            value_t peer_low_sum;
+            tio.recv_peer(&peer_low_sum, sizeof(peer_low_sum));
+            low_sum += peer_low_sum;
+            // The low_sum had better be odd
+            assert(low_sum & 1);
+            li.unit_sum_inverse = inverse_value_t(low_sum);
+        }
+    } else if (level == depth-1) {
+        yield();
+    }
+}
+
+
 // Construct a DPF with the given (XOR-shared) target location, and
 // of the given depth, to be used for random-access memory reads and
 // writes.  The DPF is constructed collaboratively by P0 and P1,
@@ -369,362 +826,56 @@ RDPF<WIDTH>::RDPF(MPCTIO &tio, yield_t &yield,
 
     // Construct each intermediate level
     while(level < depth) {
+        LeafNode *leaflevel = NULL;
         if (player < 2) {
             delete[] curlevel;
             curlevel = nextlevel;
+            nextlevel = NULL;
             if (save_expansion && level == depth-1) {
                 expansion.resize(1<<depth);
-                nextlevel = (DPFnode *)expansion.data();
+                leaflevel = expansion.data();
+            } else if (level == depth-1) {
+                leaflevel = new LeafNode[1<<depth];
             } else {
                 nextlevel = new DPFnode[1<<(level+1)];
             }
         }
         // Invariant: curlevel has 2^level elements; nextlevel has
-        // 2^{level+1} elements
+        // 2^{level+1} DPFnode elements if we're not at the last level,
+        // and leaflevel has 2^{level+1} LeafNode elements if we are at
+        // a leaf level (the last level always, and all levels if we are
+        // making an incremental RDPF).
 
         // The bit-shared choice bit is bit (depth-level-1) of the
         // XOR-shared target index
         RegBS bs_choice = target.bit(depth-level-1);
-        size_t curlevel_size = (size_t(1)<<level);
-        DPFnode L = _mm_setzero_si128();
-        DPFnode R = _mm_setzero_si128();
-        // The server doesn't need to do this computation, but it does
-        // need to execute mpc_reconstruct_choice so that it sends
-        // the AndTriples at the appropriate time.
-        if (player < 2) {
-#ifdef RDPF_MTGEN_TIMING_1
-            if (player == 0) {
-                mtgen_timetest_1(level, 0, (1<<23)>>level, curlevel,
-                    nextlevel, aes_ops);
-                size_t niters = 2048;
-                if (level > 8) niters = (1<<20)>>level;
-                for(int t=1;t<=8;++t) {
-                    mtgen_timetest_1(level, t, niters, curlevel,
-                        nextlevel, aes_ops);
-                }
-                mtgen_timetest_1(level, 0, (1<<23)>>level, curlevel,
-                    nextlevel, aes_ops);
-            }
-#endif
-            // Using the timing results gathered above, decide whether
-            // to multithread, and if so, how many threads to use.
-            // tio.cpu_nthreads() is the maximum number we have
-            // available.
-            int max_nthreads = tio.cpu_nthreads();
-            if (max_nthreads == 1 || level < 19) {
-                // No threading
-                size_t laes_ops = 0;
-                for(size_t i=0;i<curlevel_size;++i) {
-                    DPFnode lchild, rchild;
-                    prgboth(lchild, rchild, curlevel[i], laes_ops);
-                    L = (L ^ lchild);
-                    R = (R ^ rchild);
-                    nextlevel[2*i] = lchild;
-                    nextlevel[2*i+1] = rchild;
-                }
-                aes_ops += laes_ops;
-            } else {
-                size_t curlevel_size = size_t(1)<<level;
-                int nthreads =
-                    int(ceil(sqrt(double(curlevel_size/6000))));
-                if (nthreads > max_nthreads) {
-                    nthreads = max_nthreads;
-                }
-                DPFnode tL[nthreads];
-                DPFnode tR[nthreads];
-                size_t taes_ops[nthreads];
-                size_t threadstart = 0;
-                size_t threadchunk = curlevel_size / nthreads;
-                size_t threadextra = curlevel_size % nthreads;
-                boost::asio::thread_pool pool(nthreads);
-                for (int t=0;t<nthreads;++t) {
-                    size_t threadsize = threadchunk + (size_t(t) < threadextra);
-                    size_t threadend = threadstart + threadsize;
-                    boost::asio::post(pool,
-                        [t, &tL, &tR, &taes_ops, threadstart, threadend,
-                        &curlevel, &nextlevel] {
-                            DPFnode L = _mm_setzero_si128();
-                            DPFnode R = _mm_setzero_si128();
-                            size_t aes_ops = 0;
-                            for(size_t i=threadstart;i<threadend;++i) {
-                                DPFnode lchild, rchild;
-                                prgboth(lchild, rchild, curlevel[i], aes_ops);
-                                L = (L ^ lchild);
-                                R = (R ^ rchild);
-                                nextlevel[2*i] = lchild;
-                                nextlevel[2*i+1] = rchild;
-                            }
-                            tL[t] = L;
-                            tR[t] = R;
-                            taes_ops[t] = aes_ops;
-                        });
-                    threadstart = threadend;
-                }
-                pool.join();
-                for (int t=0;t<nthreads;++t) {
-                    L ^= tL[t];
-                    R ^= tR[t];
-                    aes_ops += taes_ops[t];
-                }
-            }
-        }
-        // If we're going left (bs_choice = 0), we want the correction
-        // word to be the XOR of our right side and our peer's right
-        // side; if bs_choice = 1, it should be the XOR or our left side
-        // and our peer's left side.
-
-        // We also have to ensure that the flag bits (the lsb) of the
-        // side that will end up the same be of course the same, but
-        // also that the flag bits (the lsb) of the side that will end
-        // up different _must_ be different.  That is, it's not enough
-        // for the nodes of the child selected by choice to be different
-        // as 128-bit values; they also have to be different in their
-        // lsb.
-
-        // This is where we make a small optimization over Appendix C of
-        // the Duoram paper: instead of keeping separate correction flag
-        // bits for the left and right children, we observe that the low
-        // bit of the overall correction word effectively serves as one
-        // of those bits, so we just need to store one extra bit per
-        // level, not two.  (We arbitrarily choose the one for the right
-        // child.)
-
-        // Note that the XOR of our left and right child before and
-        // after applying the correction word won't change, since the
-        // correction word is applied to either both children or
-        // neither, depending on the value of the parent's flag. So in
-        // particular, the XOR of the flag bits won't change, and if our
-        // children's flag's XOR equals our peer's children's flag's
-        // XOR, then we won't have different flag bits even for the
-        // children that have different 128-bit values.
-
-        // So we compute our_parity = lsb(L^R)^player, and we XOR that
-        // into the R value in the correction word computation.  At the
-        // same time, we exchange these parity values to compute the
-        // combined parity, which we store in the DPF.  Then when the
-        // DPF is evaluated, if the parent's flag is set, not only apply
-        // the correction work to both children, but also apply the
-        // (combined) parity bit to just the right child.  Then for
-        // unequal nodes (where the flag bit is different), exactly one
-        // of the four children (two for P0 and two for P1) will have
-        // the parity bit applied, which will set the XOR of the lsb of
-        // those four nodes to just L0^R0^L1^R1^our_parity^peer_parity
-        // = 1 because everything cancels out except player (for which
-        // one player is 0 and the other is 1).
-
-        bool our_parity_bit = get_lsb(L ^ R) ^ !!player;
-        DPFnode our_parity = lsb128_mask[our_parity_bit];
-
-        DPFnode CW;
-        bool peer_parity_bit;
-        // Exchange the parities and do mpc_reconstruct_choice at the
-        // same time (bundled into the same rounds)
-        run_coroutines(yield,
-            [this, &tio, &our_parity_bit, &peer_parity_bit](yield_t &yield) {
-                tio.queue_peer(&our_parity_bit, 1);
-                yield();
-                uint8_t peer_parity_byte;
-                tio.recv_peer(&peer_parity_byte, 1);
-                peer_parity_bit = peer_parity_byte & 1;
-            },
-            [this, &tio, &CW, &L, &R, &bs_choice, &our_parity](yield_t &yield) {
-                mpc_reconstruct_choice(tio, yield, CW, bs_choice,
-                    (R ^ our_parity), L);
-            });
-        bool parity_bit = our_parity_bit ^ peer_parity_bit;
-        cfbits |= (value_t(parity_bit)<<level);
-        DPFnode CWR = CW ^ lsb128_mask[parity_bit];
-        if (player < 2) {
-            // The timing of each iteration of the inner loop is
-            // comparable to the above, so just use the same
-            // computations.  All of this could be tuned, of course.
-
-            if (level < depth-1) {
-                // Using the timing results gathered above, decide whether
-                // to multithread, and if so, how many threads to use.
-                // tio.cpu_nthreads() is the maximum number we have
-                // available.
-                int max_nthreads = tio.cpu_nthreads();
-                if (max_nthreads == 1 || level < 19) {
-                    // No threading
-                    for(size_t i=0;i<curlevel_size;++i) {
-                        bool flag = get_lsb(curlevel[i]);
-                        nextlevel[2*i] = xor_if(nextlevel[2*i], CW, flag);
-                        nextlevel[2*i+1] = xor_if(nextlevel[2*i+1], CWR, flag);
-                    }
-                } else {
-                    int nthreads =
-                        int(ceil(sqrt(double(curlevel_size/6000))));
-                    if (nthreads > max_nthreads) {
-                        nthreads = max_nthreads;
-                    }
-                    size_t threadstart = 0;
-                    size_t threadchunk = curlevel_size / nthreads;
-                    size_t threadextra = curlevel_size % nthreads;
-                    boost::asio::thread_pool pool(nthreads);
-                    for (int t=0;t<nthreads;++t) {
-                        size_t threadsize = threadchunk + (size_t(t) < threadextra);
-                        size_t threadend = threadstart + threadsize;
-                        boost::asio::post(pool, [CW, CWR, threadstart, threadend,
-                            &curlevel, &nextlevel] {
-                                for(size_t i=threadstart;i<threadend;++i) {
-                                    bool flag = get_lsb(curlevel[i]);
-                                    nextlevel[2*i] = xor_if(nextlevel[2*i], CW, flag);
-                                    nextlevel[2*i+1] = xor_if(nextlevel[2*i+1], CWR, flag);
-                                }
-                        });
-                        threadstart = threadend;
-                    }
-                    pool.join();
-                }
-            } else {
-                // Recall there are four potentially useful vectors that
-                // can come out of a DPF:
-                // - (single-bit) bitwise unit vector
-                // - additive-shared unit vector
-                // - XOR-shared scaled unit vector
-                // - additive-shared scaled unit vector
-                //
-                // (No single DPF should be used for both of the first
-                // two or both of the last two, though, since they're
-                // correlated; you _can_ use one of the first two and
-                // one of the last two.)
-                //
-                // For each 128-bit leaf, the low bit is the flag bit,
-                // and we're guaranteed that the flag bits (and indeed
-                // the whole 128-bit value) for P0 and P1 are the same
-                // for every leaf except the target, and that the flag
-                // bits definitely differ for the target (and the other
-                // 127 bits are independently random on each side).
-                //
-                // We divide the 128-bit leaf into a low 64-bit word and
-                // a high 64-bit word.  We use the low word for the unit
-                // vector and the high word for the scaled vector; this
-                // choice is not arbitrary: the flag bit in the low word
-                // means that the sum of all the low words (with P1's
-                // low words negated) across both P0 and P1 is
-                // definitely odd, so we can compute that sum's inverse
-                // mod 2^64, and store it now during precomputation.  At
-                // evaluation time for the additive-shared unit vector,
-                // we will output this global inverse times the low word
-                // of each leaf, which will make the sum of all of those
-                // values 1.  (This technique replaces the protocol in
-                // Appendix D of the Duoram paper.)
-                //
-                // For the scaled vector, we just have to compute shares
-                // of what the scaled vector is a sharing _of_, but
-                // that's just XORing or adding all of each party's
-                // local high words; no communication needed.
-
-                value_t low_sum = 0;
-                value_t high_sum = 0;
-                value_t high_xor = 0;
-                // Using the timing results gathered above, decide whether
-                // to multithread, and if so, how many threads to use.
-                // tio.cpu_nthreads() is the maximum number we have
-                // available.
-                int max_nthreads = tio.cpu_nthreads();
-                if (max_nthreads == 1 || level < 19) {
-                    // No threading
-                    for(size_t i=0;i<curlevel_size;++i) {
-                        bool flag = get_lsb(curlevel[i]);
-                        DPFnode leftchild = xor_if(nextlevel[2*i], CW, flag);
-                        DPFnode rightchild = xor_if(nextlevel[2*i+1], CWR, flag);
-                        if (save_expansion) {
-                            nextlevel[2*i] = leftchild;
-                            nextlevel[2*i+1] = rightchild;
-                        }
-                        value_t leftlow = value_t(_mm_cvtsi128_si64x(leftchild));
-                        value_t rightlow = value_t(_mm_cvtsi128_si64x(rightchild));
-                        value_t lefthigh =
-                            value_t(_mm_cvtsi128_si64x(_mm_srli_si128(leftchild,8)));
-                        value_t righthigh =
-                            value_t(_mm_cvtsi128_si64x(_mm_srli_si128(rightchild,8)));
-                        low_sum += (leftlow + rightlow);
-                        high_sum += (lefthigh + righthigh);
-                        high_xor ^= (lefthigh ^ righthigh);
-                    }
-                } else {
-                    int nthreads =
-                        int(ceil(sqrt(double(curlevel_size/6000))));
-                    if (nthreads > max_nthreads) {
-                        nthreads = max_nthreads;
-                    }
-                    value_t tlow_sum[nthreads];
-                    value_t thigh_sum[nthreads];
-                    value_t thigh_xor[nthreads];
-                    size_t threadstart = 0;
-                    size_t threadchunk = curlevel_size / nthreads;
-                    size_t threadextra = curlevel_size % nthreads;
-                    boost::asio::thread_pool pool(nthreads);
-                    for (int t=0;t<nthreads;++t) {
-                        size_t threadsize = threadchunk + (size_t(t) < threadextra);
-                        size_t threadend = threadstart + threadsize;
-                        boost::asio::post(pool,
-                            [t, &tlow_sum, &thigh_sum, &thigh_xor, threadstart, threadend,
-                            &curlevel, &nextlevel, CW, CWR, save_expansion] {
-                                value_t low_sum = 0;
-                                value_t high_sum = 0;
-                                value_t high_xor = 0;
-                                for(size_t i=threadstart;i<threadend;++i) {
-                                    bool flag = get_lsb(curlevel[i]);
-                                    DPFnode leftchild = xor_if(nextlevel[2*i], CW, flag);
-                                    DPFnode rightchild = xor_if(nextlevel[2*i+1], CWR, flag);
-                                    if (save_expansion) {
-                                        nextlevel[2*i] = leftchild;
-                                        nextlevel[2*i+1] = rightchild;
-                                    }
-                                    value_t leftlow = value_t(_mm_cvtsi128_si64x(leftchild));
-                                    value_t rightlow = value_t(_mm_cvtsi128_si64x(rightchild));
-                                    value_t lefthigh =
-                                        value_t(_mm_cvtsi128_si64x(_mm_srli_si128(leftchild,8)));
-                                    value_t righthigh =
-                                        value_t(_mm_cvtsi128_si64x(_mm_srli_si128(rightchild,8)));
-                                    low_sum += (leftlow + rightlow);
-                                    high_sum += (lefthigh + righthigh);
-                                    high_xor ^= (lefthigh ^ righthigh);
-                                }
-                                tlow_sum[t] = low_sum;
-                                thigh_sum[t] = high_sum;
-                                thigh_xor[t] = high_xor;
-                            });
-                        threadstart = threadend;
-                    }
-                    pool.join();
-                    for (int t=0;t<nthreads;++t) {
-                        low_sum += tlow_sum[t];
-                        high_sum += thigh_sum[t];
-                        high_xor ^= thigh_xor[t];
-                    }
-                }
-                if (player == 1) {
-                    low_sum = -low_sum;
-                    high_sum = -high_sum;
-                }
-                li[0].scaled_sum[0].ashare = high_sum;
-                li[0].scaled_xor[0].xshare = high_xor;
-                // Exchange low_sum and add them up
-                tio.queue_peer(&low_sum, sizeof(low_sum));
-                yield();
-                value_t peer_low_sum;
-                tio.recv_peer(&peer_low_sum, sizeof(peer_low_sum));
-                low_sum += peer_low_sum;
-                // The low_sum had better be odd
-                assert(low_sum & 1);
-                li[0].unit_sum_inverse = inverse_value_t(low_sum);
+        bool cfbit;
+
+        if (level < depth-1) {
+            DPFnode CW;
+            create_level(tio, yield, curlevel, nextlevel, player, level,
+                depth, bs_choice, CW, cfbit, save_expansion, li[0],
+                aes_ops);
+            cfbits |= (value_t(cfbit)<<level);
+            if (player < 2) {
+                cw.push_back(CW);
             }
-            cw.push_back(CW);
-        } else if (level == depth-1) {
-            yield();
+        } else {
+            LeafNode CW;
+            create_level(tio, yield, curlevel, leaflevel, player, level,
+                depth, bs_choice, CW, cfbit, save_expansion, li[0],
+                aes_ops);
+            li[0].leaf_cw = CW;
         }
 
+        if (!save_expansion) {
+            delete[] leaflevel;
+        }
         ++level;
     }
 
     delete[] curlevel;
-    if (!save_expansion || player == 2) {
-        delete[] nextlevel;
-    }
+    delete[] nextlevel;
 }
 
 // Get the leaf node for the given input

+ 17 - 0
types.hpp

@@ -8,6 +8,8 @@
 #include <x86intrin.h>  // SSE and AVX intrinsics
 #include <bsd/stdlib.h> // arc4random_buf
 
+#include "bitutils.hpp"
+
 // The number of bits in an MPC secret-shared memory word
 
 #ifndef VALUE_BITS
@@ -648,6 +650,14 @@ inline std::array<S,N> &operator^=(std::array<S,N> &A, const std::array<S,N> &B)
     return A;
 }
 
+// XOR the bit B into the low bit of A
+template <typename S, size_t N>
+inline std::array<S,N> &xor_lsb(std::array<S,N> &A, bit_t B)
+{
+    A[0] ^= lsb128_mask[B];
+    return A;
+}
+
 template <typename S, size_t N>
 inline std::tuple<std::array<value_t,N>,std::array<value_t,N>,std::array<value_t,N>>
     combine(
@@ -713,6 +723,13 @@ struct AndTripleName { static constexpr const char *name = "a"; };
 
 using DPFnode = __m128i;
 
+// XOR the bit B into the low bit of A
+inline DPFnode &xor_lsb(DPFnode &A, bit_t B)
+{
+    A ^= lsb128_mask[B];
+    return A;
+}
+
 // A Select triple for type V (V is DPFnode, value_t, or bit_t) is a
 // triple of (X0,Y0,Z0) where X0 is a bit and Y0 and Z0 are Vs held by
 // P0 (and correspondingly (X1,Y1,Z1) held by P1), with all values