Procházet zdrojové kódy

Generalize a bunch of types to eventually support wide DPFs

Ian Goldberg před 1 rokem
rodič
revize
953c1fd3a1
7 změněných souborů, 475 přidání a 182 odebrání
  1. 2 1
      cell.cpp
  2. 23 17
      duoram.tcc
  3. 73 75
      online.cpp
  4. 13 13
      preproc.cpp
  5. 142 50
      rdpf.hpp
  6. 81 20
      rdpf.tcc
  7. 141 6
      types.hpp

+ 2 - 1
cell.cpp

@@ -92,7 +92,8 @@ struct Cell {
     // the word with value 1.  This is used for ORAM reads, where the
     // same DPF is used for all the fields.
     template <nbits_t WIDTH>
-    inline void unit(const RDPF<WIDTH> &dpf, DPFnode leaf) {
+    inline void unit(const RDPF<WIDTH> &dpf,
+        typename RDPF<WIDTH>::LeafNode leaf) {
         key = dpf.unit_as(leaf);
         pointers = dpf.unit_bs(leaf);
         value = dpf.unit_bs(leaf);

+ 23 - 17
duoram.tcc

@@ -304,7 +304,7 @@ Duoram<T>::Shape::MemRefS<U,FT,FST,Sh>::operator FT()
             shape.tio.aes_ops());
         FT init;
         res = pe.reduce(init, [this, &dp, &shape] (int thread_num,
-                address_t i, const RDPFPair<1>::node &leaf) {
+                address_t i, const RDPFPair<1>::LeafNode &leaf) {
             // The values from the two DPFs, which will each be of type T
             std::tuple<FT,FT> V;
             dp.unit(V, leaf);
@@ -341,7 +341,7 @@ Duoram<T>::Shape::MemRefS<U,FT,FST,Sh>::operator FT()
             shape.shape_size, shape.tio.cpu_nthreads(),
             shape.tio.aes_ops());
         gamma = pe.reduce(init, [this, &dp, &shape] (int thread_num,
-                address_t i, const RDPFPair<1>::node &leaf) {
+                address_t i, const RDPFPair<1>::LeafNode &leaf) {
             // The values from the two DPFs, each of type FT
             std::tuple<FT,FT> V;
             dp.unit(V, leaf);
@@ -389,8 +389,10 @@ typename Duoram<T>::Shape::template MemRefS<U,FT,FST,Sh>
         U indoffset;
         dt.get_target(indoffset);
         indoffset -= idx;
-        auto Moffset = std::make_tuple(M, M, M);
-        std::tuple<FT,FT,FT> scaled_val;
+        RDPF<1>::W<FT> MW;
+        MW[0] = M;
+        auto Moffset = std::make_tuple(MW, MW, MW);
+        RDPFTriple<1>::WTriple<FT> scaled_val;
         dt.scaled_value(scaled_val);
         Moffset -= scaled_val;
 
@@ -406,7 +408,7 @@ typename Duoram<T>::Shape::template MemRefS<U,FT,FST,Sh>
 
         // Receive the above from the peer
         U peerindoffset;
-        std::tuple<FT,FT,FT> peerMoffset;
+        RDPFTriple<1>::WTriple<FT> peerMoffset;
         shape.tio.recv_peer(&peerindoffset, BITBYTES(shape.addr_size));
         shape.tio.iostream_peer() >> peerMoffset;
 
@@ -420,22 +422,23 @@ typename Duoram<T>::Shape::template MemRefS<U,FT,FST,Sh>
             shape.tio.aes_ops());
         int init = 0;
         pe.reduce(init, [this, &dt, &shape, &Mshift, player] (int thread_num,
-                address_t i, const RDPFTriple<1>::node &leaf) {
+                address_t i, const RDPFTriple<1>::LeafNode &leaf) {
             // The values from the three DPFs
-            std::tuple<FT,FT,FT> scaled, unit;
+            RDPFTriple<1>::WTriple<FT> scaled;
+            std::tuple<FT,FT,FT> unit;
             dt.scaled(scaled, leaf);
             dt.unit(unit, leaf);
             auto [V0, V1, V2] = scaled + unit * Mshift;
             // References to the appropriate cells in our database, our
             // blind, and our copy of the peer's blinded database
             auto [DB, BL, PBD] = shape.get_comp(i,fieldsel);
-            DB += V0;
+            DB += V0[0];
             if (player == 0) {
-                BL -= V1;
-                PBD += V2-V0;
+                BL -= V1[0];
+                PBD += V2[0]-V0[0];
             } else {
-                BL -= V2;
-                PBD += V1-V0;
+                BL -= V2[0];
+                PBD += V1[0]-V0[0];
             }
             return 0;
         });
@@ -444,7 +447,7 @@ typename Duoram<T>::Shape::template MemRefS<U,FT,FST,Sh>
 
         RDPFPair<1> dp = shape.tio.rdpfpair(shape.yield, shape.addr_size);
         U p0indoffset, p1indoffset;
-        std::tuple<FT,FT> p0Moffset, p1Moffset;
+        RDPFPair<1>::WPair<FT> p0Moffset, p1Moffset;
 
         shape.yield();
 
@@ -463,16 +466,19 @@ typename Duoram<T>::Shape::template MemRefS<U,FT,FST,Sh>
             shape.tio.aes_ops());
         int init = 0;
         pe.reduce(init, [this, &dp, &shape, &Mshift] (int thread_num,
-                address_t i, const RDPFPair<1>::node &leaf) {
+                address_t i, const RDPFPair<1>::LeafNode &leaf) {
             // The values from the two DPFs
-            std::tuple<FT,FT> scaled, unit;
+            RDPFPair<1>::WPair<FT> scaled;
+            std::tuple<FT,FT> unit;
             dp.scaled(scaled, leaf);
             dp.unit(unit, leaf);
-            auto V = scaled + unit * Mshift;
+            auto [V0, V1] = scaled + unit * Mshift;
             // shape.get_server(i) returns a pair of references to the
             // appropriate cells in the two blinded databases, so we can
             // subtract the pair directly.
-            shape.get_server(i,fieldsel) -= V;
+            auto [BL0, BL1] = shape.get_server(i,fieldsel);
+            BL0 -= V0[0];
+            BL1 -= V1[0];
             return 0;
         });
     }

+ 73 - 75
online.cpp

@@ -220,13 +220,13 @@ static void rdpf_test(MPCIO &mpcio,
                         for (int i=0;i<2;++i) {
                             const RDPF<1> &dpf = dp.dpf[i];
                             for (address_t x=0;x<(address_t(1)<<depth);++x) {
-                                DPFnode leaf = dpf.leaf(x, aes_ops);
+                                RDPF<1>::LeafNode leaf = dpf.leaf(x, aes_ops);
                                 RegBS ub = dpf.unit_bs(leaf);
                                 RegAS ua = dpf.unit_as(leaf);
-                                RegXS sx = dpf.scaled_xs(leaf);
-                                RegAS sa = dpf.scaled_as(leaf);
+                                RDPF<1>::RegXSW sx = dpf.scaled_xs(leaf);
+                                RDPF<1>::RegASW sa = dpf.scaled_as(leaf);
                                 printf("%04x %x %016lx %016lx %016lx\n", x,
-                                    ub.bshare, ua.ashare, sx.xshare, sa.ashare);
+                                    ub.bshare, ua.ashare, sx[0].xshare, sa[0].ashare);
                             }
                             printf("\n");
                         }
@@ -237,36 +237,36 @@ static void rdpf_test(MPCIO &mpcio,
                             RegXS peer_scaled_xor;
                             RegAS peer_scaled_sum;
                             if (tio.player() == 1) {
-                                tio.iostream_peer() << dpf.scaled_xor << dpf.scaled_sum;
+                                tio.iostream_peer() << dpf.li[0].scaled_xor[0] << dpf.li[0].scaled_sum[0];
                             } else {
                                 tio.iostream_peer() >> peer_scaled_xor >> peer_scaled_sum;
-                                peer_scaled_sum += dpf.scaled_sum;
-                                peer_scaled_xor ^= dpf.scaled_xor;
+                                peer_scaled_sum += dpf.li[0].scaled_sum[0];
+                                peer_scaled_xor ^= dpf.li[0].scaled_xor[0];
                             }
                             for (address_t x=0;x<(address_t(1)<<depth);++x) {
-                                DPFnode leaf = dpf.leaf(x, aes_ops);
+                                RDPF<1>::LeafNode leaf = dpf.leaf(x, aes_ops);
                                 RegBS ub = dpf.unit_bs(leaf);
                                 RegAS ua = dpf.unit_as(leaf);
-                                RegXS sx = dpf.scaled_xs(leaf);
-                                RegAS sa = dpf.scaled_as(leaf);
+                                RDPF<1>::RegXSW sx = dpf.scaled_xs(leaf);
+                                RDPF<1>::RegASW sa = dpf.scaled_as(leaf);
                                 printf("%04x %x %016lx %016lx %016lx\n", x,
-                                    ub.bshare, ua.ashare, sx.xshare, sa.ashare);
+                                    ub.bshare, ua.ashare, sx[0].xshare, sa[0].ashare);
                                 if (tio.player() == 1) {
                                     tio.iostream_peer() << ub << ua << sx << sa;
                                 } else {
                                     RegBS peer_ub;
                                     RegAS peer_ua;
-                                    RegXS peer_sx;
-                                    RegAS peer_sa;
+                                    RDPF<1>::RegXSW peer_sx;
+                                    RDPF<1>::RegASW peer_sa;
                                     tio.iostream_peer() >> peer_ub >> peer_ua >>
                                         peer_sx >> peer_sa;
                                     ub ^= peer_ub;
                                     ua += peer_ua;
                                     sx ^= peer_sx;
                                     sa += peer_sa;
-                                    if (ub.bshare || ua.ashare || sx.xshare || sa.ashare) {
+                                    if (ub.bshare || ua.ashare || sx[0].xshare || sa[0].ashare) {
                                         printf("**** %x %016lx %016lx %016lx\n",
-                                            ub.bshare, ua.ashare, sx.xshare, sa.ashare);
+                                            ub.bshare, ua.ashare, sx[0].xshare, sa[0].ashare);
                                         printf("SCALE                   %016lx %016lx\n",
                                             peer_scaled_xor.xshare, peer_scaled_sum.ashare);
                                     }
@@ -304,14 +304,14 @@ static void rdpf_timing(MPCIO &mpcio,
                     for (int i=0;i<2;++i) {
                         RDPF<1> &dpf = dp.dpf[i];
                         dpf.expand(aes_ops);
-                        RegXS scaled_xor;
+                        RDPF<1>::RegXSW scaled_xor;
                         for (address_t x=0;x<(address_t(1)<<depth);++x) {
-                            DPFnode leaf = dpf.leaf(x, aes_ops);
-                            RegXS sx = dpf.scaled_xs(leaf);
+                            RDPF<1>::LeafNode leaf = dpf.leaf(x, aes_ops);
+                            RDPF<1>::RegXSW sx = dpf.scaled_xs(leaf);
                             scaled_xor ^= sx;
                         }
-                        printf("%016lx\n%016lx\n", scaled_xor.xshare,
-                            dpf.scaled_xor.xshare);
+                        printf("%016lx\n%016lx\n", scaled_xor[0].xshare,
+                            dpf.li[0].scaled_xor[0].xshare);
                         printf("\n");
                     }
                 } else {
@@ -319,14 +319,14 @@ static void rdpf_timing(MPCIO &mpcio,
                     for (int i=0;i<3;++i) {
                         RDPF<1> &dpf = dt.dpf[i];
                         dpf.expand(aes_ops);
-                        RegXS scaled_xor;
+                        RDPF<1>::RegXSW scaled_xor;
                         for (address_t x=0;x<(address_t(1)<<depth);++x) {
-                            DPFnode leaf = dpf.leaf(x, aes_ops);
-                            RegXS sx = dpf.scaled_xs(leaf);
+                            RDPF<1>::LeafNode leaf = dpf.leaf(x, aes_ops);
+                            RDPF<1>::RegXSW sx = dpf.scaled_xs(leaf);
                             scaled_xor ^= sx;
                         }
-                        printf("%016lx\n%016lx\n", scaled_xor.xshare,
-                            dpf.scaled_xor.xshare);
+                        printf("%016lx\n%016lx\n", scaled_xor[0].xshare,
+                            dpf.li[0].scaled_xor[0].xshare);
                         printf("\n");
                     }
                 }
@@ -339,7 +339,7 @@ static void rdpf_timing(MPCIO &mpcio,
 static value_t parallel_streameval_rdpf(MPCIO &mpcio, const RDPF<1> &dpf,
     address_t start, int num_threads)
 {
-    RegXS scaled_xor[num_threads];
+    RDPF<1>::RegXSW scaled_xor[num_threads];
     boost::asio::thread_pool pool(num_threads);
     address_t totsize = (address_t(1)<<dpf.depth());
     address_t threadstart = start;
@@ -351,12 +351,12 @@ static value_t parallel_streameval_rdpf(MPCIO &mpcio, const RDPF<1> &dpf,
             [&mpcio, &dpf, &scaled_xor, thread_num, threadstart, threadsize] {
                 MPCTIO tio(mpcio, thread_num);
 //printf("Thread %d from %X for %X\n", thread_num, threadstart, threadsize);
-                RegXS local_xor;
+                RDPF<1>::RegXSW local_xor;
                 size_t local_aes_ops = 0;
                 auto ev = StreamEval(dpf, threadstart, 0, local_aes_ops);
                 for (address_t x=0;x<threadsize;++x) {
 //if (x%0x10000 == 0) printf("%d", thread_num);
-                    DPFnode leaf = ev.next();
+                    RDPF<1>::LeafNode leaf = ev.next();
                     local_xor ^= dpf.scaled_xs(leaf);
                 }
                 scaled_xor[thread_num] = local_xor;
@@ -366,11 +366,11 @@ static value_t parallel_streameval_rdpf(MPCIO &mpcio, const RDPF<1> &dpf,
         threadstart = (threadstart + threadsize) % totsize;
     }
     pool.join();
-    RegXS res;
+    RDPF<1>::RegXSW res;
     for (int thread_num = 0; thread_num < num_threads; ++thread_num) {
         res ^= scaled_xor[thread_num];
     }
-    return res.xshare;
+    return res[0].xshare;
 }
 
 static void rdpfeval_timing(MPCIO &mpcio,
@@ -398,7 +398,7 @@ static void rdpfeval_timing(MPCIO &mpcio,
                 value_t scaled_xor =
                     parallel_streameval_rdpf(mpcio, dpf, start, num_threads);
                 printf("%016lx\n%016lx\n", scaled_xor,
-                    dpf.scaled_xor.xshare);
+                    dpf.li[0].scaled_xor[0].xshare);
                 printf("\n");
             }
         } else {
@@ -408,7 +408,7 @@ static void rdpfeval_timing(MPCIO &mpcio,
                 value_t scaled_xor =
                     parallel_streameval_rdpf(mpcio, dpf, start, num_threads);
                 printf("%016lx\n%016lx\n", scaled_xor,
-                    dpf.scaled_xor.xshare);
+                    dpf.li[0].scaled_xor[0].xshare);
                 printf("\n");
             }
         }
@@ -440,13 +440,13 @@ static void par_rdpfeval_timing(MPCIO &mpcio,
                 nbits_t depth = dpf.depth();
                 auto pe = ParallelEval(dpf, start, 0,
                     address_t(1)<<depth, num_threads, tio.aes_ops());
-                RegXS result, init;
+                RDPF<1>::RegXSW result, init;
                 result = pe.reduce(init, [&dpf] (int thread_num,
-                        address_t i, const RDPF<1>::node &leaf) {
+                        address_t i, const RDPF<1>::LeafNode &leaf) {
                     return dpf.scaled_xs(leaf);
                 });
-                printf("%016lx\n%016lx\n", result.xshare,
-                    dpf.scaled_xor.xshare);
+                printf("%016lx\n%016lx\n", result[0].xshare,
+                    dpf.li[0].scaled_xor[0].xshare);
                 printf("\n");
             }
         } else {
@@ -456,13 +456,13 @@ static void par_rdpfeval_timing(MPCIO &mpcio,
                 nbits_t depth = dpf.depth();
                 auto pe = ParallelEval(dpf, start, 0,
                     address_t(1)<<depth, num_threads, tio.aes_ops());
-                RegXS result, init;
+                RDPF<1>::RegXSW result, init;
                 result = pe.reduce(init, [&dpf] (int thread_num,
-                        address_t i, const RDPF<1>::node &leaf) {
+                        address_t i, const RDPF<1>::LeafNode &leaf) {
                     return dpf.scaled_xs(leaf);
                 });
-                printf("%016lx\n%016lx\n", result.xshare,
-                    dpf.scaled_xor.xshare);
+                printf("%016lx\n%016lx\n", result[0].xshare,
+                    dpf.li[0].scaled_xor[0].xshare);
                 printf("\n");
             }
         }
@@ -490,42 +490,42 @@ static void tupleeval_timing(MPCIO &mpcio,
         size_t &aes_ops = tio.aes_ops();
         if (tio.player() == 2) {
             RDPFPair<1> dp = tio.rdpfpair(yield, depth);
-            RegXS scaled_xor0, scaled_xor1;
+            RDPF<1>::RegXSW scaled_xor0, scaled_xor1;
             auto ev = StreamEval(dp, start, 0, aes_ops, false);
             for (address_t x=0;x<(address_t(1)<<depth);++x) {
                 auto [L0, L1] = ev.next();
-                RegXS sx0 = dp.dpf[0].scaled_xs(L0);
-                RegXS sx1 = dp.dpf[1].scaled_xs(L1);
+                RDPF<1>::RegXSW sx0 = dp.dpf[0].scaled_xs(L0);
+                RDPF<1>::RegXSW sx1 = dp.dpf[1].scaled_xs(L1);
                 scaled_xor0 ^= sx0;
                 scaled_xor1 ^= sx1;
             }
-            printf("%016lx\n%016lx\n", scaled_xor0.xshare,
-                dp.dpf[0].scaled_xor.xshare);
+            printf("%016lx\n%016lx\n", scaled_xor0[0].xshare,
+                dp.dpf[0].li[0].scaled_xor[0].xshare);
             printf("\n");
-            printf("%016lx\n%016lx\n", scaled_xor1.xshare,
-                dp.dpf[1].scaled_xor.xshare);
+            printf("%016lx\n%016lx\n", scaled_xor1[0].xshare,
+                dp.dpf[1].li[0].scaled_xor[0].xshare);
             printf("\n");
         } else {
             RDPFTriple<1> dt = tio.rdpftriple(yield, depth);
-            RegXS scaled_xor0, scaled_xor1, scaled_xor2;
+            RDPF<1>::RegXSW scaled_xor0, scaled_xor1, scaled_xor2;
             auto ev = StreamEval(dt, start, 0, aes_ops, false);
             for (address_t x=0;x<(address_t(1)<<depth);++x) {
                 auto [L0, L1, L2] = ev.next();
-                RegXS sx0 = dt.dpf[0].scaled_xs(L0);
-                RegXS sx1 = dt.dpf[1].scaled_xs(L1);
-                RegXS sx2 = dt.dpf[2].scaled_xs(L2);
+                RDPF<1>::RegXSW sx0 = dt.dpf[0].scaled_xs(L0);
+                RDPF<1>::RegXSW sx1 = dt.dpf[1].scaled_xs(L1);
+                RDPF<1>::RegXSW sx2 = dt.dpf[2].scaled_xs(L2);
                 scaled_xor0 ^= sx0;
                 scaled_xor1 ^= sx1;
                 scaled_xor2 ^= sx2;
             }
-            printf("%016lx\n%016lx\n", scaled_xor0.xshare,
-                dt.dpf[0].scaled_xor.xshare);
+            printf("%016lx\n%016lx\n", scaled_xor0[0].xshare,
+                dt.dpf[0].li[0].scaled_xor[0].xshare);
             printf("\n");
-            printf("%016lx\n%016lx\n", scaled_xor1.xshare,
-                dt.dpf[1].scaled_xor.xshare);
+            printf("%016lx\n%016lx\n", scaled_xor1[0].xshare,
+                dt.dpf[1].li[0].scaled_xor[0].xshare);
             printf("\n");
-            printf("%016lx\n%016lx\n", scaled_xor2.xshare,
-                dt.dpf[2].scaled_xor.xshare);
+            printf("%016lx\n%016lx\n", scaled_xor2[0].xshare,
+                dt.dpf[2].li[0].scaled_xor[0].xshare);
             printf("\n");
         }
     });
@@ -554,40 +554,38 @@ static void par_tupleeval_timing(MPCIO &mpcio,
             RDPFPair<1> dp = tio.rdpfpair(yield, depth);
             auto pe = ParallelEval(dp, start, 0, address_t(1)<<depth,
                 num_threads, aes_ops);
-            using V = std::tuple<RegXS,RegXS>;
-            V result, init;
+            RDPFPair<1>::RegXSWP result, init;
             result = pe.reduce(init, [&dp] (int thread_num, address_t i,
-                    const RDPFPair<1>::node &leaf) {
-                std::tuple<RegXS,RegXS> scaled;
+                    const RDPFPair<1>::LeafNode &leaf) {
+                RDPFPair<1>::RegXSWP scaled;
                 dp.scaled(scaled, leaf);
                 return scaled;
             });
-            printf("%016lx\n%016lx\n", std::get<0>(result).xshare,
-                dp.dpf[0].scaled_xor.xshare);
+            printf("%016lx\n%016lx\n", std::get<0>(result)[0].xshare,
+                dp.dpf[0].li[0].scaled_xor[0].xshare);
             printf("\n");
-            printf("%016lx\n%016lx\n", std::get<1>(result).xshare,
-                dp.dpf[1].scaled_xor.xshare);
+            printf("%016lx\n%016lx\n", std::get<1>(result)[0].xshare,
+                dp.dpf[1].li[0].scaled_xor[0].xshare);
             printf("\n");
         } else {
             RDPFTriple<1> dt = tio.rdpftriple(yield, depth);
             auto pe = ParallelEval(dt, start, 0, address_t(1)<<depth,
                 num_threads, aes_ops);
-            using V = std::tuple<RegXS,RegXS,RegXS>;
-            V result, init;
+            RDPFTriple<1>::RegXSWT result, init;
             result = pe.reduce(init, [&dt] (int thread_num, address_t i,
-                    const RDPFTriple<1>::node &leaf) {
-                std::tuple<RegXS,RegXS,RegXS> scaled;
+                    const RDPFTriple<1>::LeafNode &leaf) {
+                RDPFTriple<1>::RegXSWT scaled;
                 dt.scaled(scaled, leaf);
                 return scaled;
             });
-            printf("%016lx\n%016lx\n", std::get<0>(result).xshare,
-                dt.dpf[0].scaled_xor.xshare);
+            printf("%016lx\n%016lx\n", std::get<0>(result)[0].xshare,
+                dt.dpf[0].li[0].scaled_xor[0].xshare);
             printf("\n");
-            printf("%016lx\n%016lx\n", std::get<1>(result).xshare,
-                dt.dpf[1].scaled_xor.xshare);
+            printf("%016lx\n%016lx\n", std::get<1>(result)[0].xshare,
+                dt.dpf[1].li[0].scaled_xor[0].xshare);
             printf("\n");
-            printf("%016lx\n%016lx\n", std::get<2>(result).xshare,
-                dt.dpf[2].scaled_xor.xshare);
+            printf("%016lx\n%016lx\n", std::get<2>(result)[0].xshare,
+                dt.dpf[2].li[0].scaled_xor[0].xshare);
             printf("\n");
         }
     });

+ 13 - 13
preproc.cpp

@@ -161,13 +161,13 @@ void preprocessing_comp(MPCIO &mpcio, const PRACOptions &opts, char **args)
                                 yield();
                                 RDPFTriple rdpftrip =
                                     tio.rdpftriple(yield, type, opts.expand_rdpfs);
-                                printf("dep  = %d\n", type);
-                                printf("usi0 = %016lx\n", rdpftrip.dpf[0].unit_sum_inverse);
-                                printf("sxr0 = %016lx\n", rdpftrip.dpf[0].scaled_xor.xshare);
-                                printf("usi1 = %016lx\n", rdpftrip.dpf[1].unit_sum_inverse);
-                                printf("sxr1 = %016lx\n", rdpftrip.dpf[1].scaled_xor.xshare);
-                                printf("usi2 = %016lx\n", rdpftrip.dpf[2].unit_sum_inverse);
-                                printf("sxr2 = %016lx\n", rdpftrip.dpf[2].scaled_xor.xshare);
+printf("dep  = %d\n", type);
+printf("usi0 = %016lx\n", rdpftrip.dpf[0].li[0].unit_sum_inverse);
+printf("sxr0 = %016lx\n", rdpftrip.dpf[0].li[0].scaled_xor[0].xshare);
+printf("usi1 = %016lx\n", rdpftrip.dpf[1].li[0].unit_sum_inverse);
+printf("sxr1 = %016lx\n", rdpftrip.dpf[1].li[0].scaled_xor[0].xshare);
+printf("usi2 = %016lx\n", rdpftrip.dpf[2].li[0].unit_sum_inverse);
+printf("sxr2 = %016lx\n", rdpftrip.dpf[2].li[0].scaled_xor[0].xshare);
                                 tripfile.os() << rdpftrip;
                             });
                     }
@@ -327,12 +327,12 @@ void preprocessing_server(MPCServerIO &mpcsrvio, const PRACOptions &opts, char *
                                 [&stio, &opts, pairfile, depth](yield_t &yield) {
                                     yield();
                                     RDPFPair rdpfpair = stio.rdpfpair(yield, depth);
-                                printf("usi0 = %016lx\n", rdpfpair.dpf[0].unit_sum_inverse);
-                                printf("sxr0 = %016lx\n", rdpfpair.dpf[0].scaled_xor.xshare);
-                                printf("dep0 = %d\n", rdpfpair.dpf[0].depth());
-                                printf("usi1 = %016lx\n", rdpfpair.dpf[1].unit_sum_inverse);
-                                printf("sxr1 = %016lx\n", rdpfpair.dpf[1].scaled_xor.xshare);
-                                printf("dep1 = %d\n", rdpfpair.dpf[1].depth());
+printf("usi0 = %016lx\n", rdpfpair.dpf[0].li[0].unit_sum_inverse);
+printf("sxr0 = %016lx\n", rdpfpair.dpf[0].li[0].scaled_xor[0].xshare);
+printf("dep0 = %d\n", rdpfpair.dpf[0].depth());
+printf("usi1 = %016lx\n", rdpfpair.dpf[1].li[0].unit_sum_inverse);
+printf("sxr1 = %016lx\n", rdpfpair.dpf[1].li[0].scaled_xor[0].xshare);
+printf("dep1 = %d\n", rdpfpair.dpf[1].depth());
                                     if (opts.expand_rdpfs) {
                                         rdpfpair.dpf[0].expand(stio.aes_ops());
                                         rdpfpair.dpf[1].expand(stio.aes_ops());

+ 142 - 50
rdpf.hpp

@@ -1,6 +1,7 @@
 #ifndef __RDPF_HPP__
 #define __RDPF_HPP__
 
+#include <array>
 #include <vector>
 #include <iostream>
 
@@ -15,17 +16,50 @@
 
 template <nbits_t WIDTH>
 struct RDPF : public DPF {
-    // The amount we have to scale the low words of the leaf values by
-    // to get additive shares of a unit vector
-    value_t unit_sum_inverse;
-    // Additive share of the scaling value M_as such that the high words
-    // of the leaf values for P0 and P1 add to M_as * e_{target}
-    RegAS scaled_sum;
-    // XOR share of the scaling value M_xs such that the high words
-    // of the leaf values for P0 and P1 XOR to M_xs * e_{target}
-    RegXS scaled_xor;
+    template <typename T>
+    using W = std::array<T, WIDTH>;
+    using RegASW = W<RegAS>;
+    using RegXSW = W<RegXS>;
+    // The number of 128-bit leaf node entries you need to get 1 unit
+    // value and WIDTH scaled values (each is 64 bits)
+    static const nbits_t LWIDTH = 1 + (WIDTH/2);
+    using LeafNode = std::array<DPFnode,LWIDTH>;
+
+    // Information for leaf levels of the RDPF.  Normal RDPFs only have
+    // one leaf level (at the bottom), but incremental RDPFs have a leaf
+    // level for each level of the DPF.
+    struct LeafInfo {
+        // The WIDTH correction words for this leaf level
+        std::array<DPFnode,WIDTH> leaf_cw;
+        // The amount we have to scale the low words of the leaf values by
+        // to get additive shares of a unit vector
+        value_t unit_sum_inverse;
+        // Additive share of the scaling values M_as such that the high words
+        // of the WIDTH leaf values for P0 and P1 add to M_as * e_{target}
+        std::array<RegAS,WIDTH> scaled_sum;
+        // XOR share of the scaling values M_xs such that the high words
+        // of the WIDTH leaf values for P0 and P1 XOR to M_xs * e_{target}
+        std::array<RegXS,WIDTH> scaled_xor;
+
+        LeafInfo() : unit_sum_inverse(0) {}
+    };
+
+    // The LeafInfo for each leaf level.  Normal RDPFs only have one
+    // leaf level, so this will be a vector of length 1.  Incremental
+    // RDPFs will have one entry for each level in the DPF.  The entry
+    // corresponding to level i of the DPF (of total depth d) is
+    // li[d-i].
+    std::vector<LeafInfo> li;
+
+    // The leaf correction flag bits for the WIDTH leaf words at each
+    // leaf level.  The bit for leaf word j of level i (for an
+    // incremental DPF of total depth d) is leaf_cfbits[j] & (1<<(d-i)).
+    // For a normal (not incremental) RDPF, it's the same, but therefore
+    // only the low bit of each of these WIDTH words gets used.
+    std::array<value_t,WIDTH> leaf_cfbits;
+
     // If we're saving the expansion, put it here
-    std::vector<DPFnode> expansion;
+    std::vector<LeafNode> expansion;
 
     RDPF() {}
 
@@ -48,7 +82,7 @@ struct RDPF : public DPF {
     inline bool has_expansion() const { return expansion.size() > 0; }
 
     // Get an element of the expansion
-    inline node get_expansion(address_t index) const {
+    inline LeafNode get_expansion(address_t index) const {
         return expansion[index];
     }
 
@@ -58,47 +92,81 @@ struct RDPF : public DPF {
     // Get the leaf node for the given input
     //
     // Cost: depth AES operations
-    DPFnode leaf(address_t input, size_t &aes_ops) const;
+    LeafNode leaf(address_t input, size_t &aes_ops) const;
 
     // Expand the DPF if it's not already expanded
     void expand(size_t &aes_ops);
 
+    // Descend from a node at depth parentdepth to one of its leaf children
+    // whichchild = 0: left child
+    // whichchild = 1: right child
+    //
+    // Cost: 1 AES operation
+    inline LeafNode descend_to_leaf(const DPFnode &parent,
+        nbits_t parentdepth, bit_t whichchild, size_t &aes_ops) const;
+
     // Get the bit-shared unit vector entry from the leaf node
-    inline RegBS unit_bs(DPFnode leaf) const {
+    inline RegBS unit_bs(LeafNode leaf) const {
         RegBS b;
-        b.bshare = get_lsb(leaf);
+        b.bshare = get_lsb(leaf[0]);
         return b;
     }
 
     // Get the additive-shared unit vector entry from the leaf node
-    inline RegAS unit_as(DPFnode leaf) const {
+    inline RegAS unit_as(LeafNode leaf) const {
         RegAS a;
-        value_t lowword = value_t(_mm_cvtsi128_si64x(leaf));
+        value_t lowword = value_t(_mm_cvtsi128_si64x(leaf[0]));
         if (whichhalf == 1) {
             lowword = -lowword;
         }
-        a.ashare = lowword * unit_sum_inverse;
+        a.ashare = lowword * li[0].unit_sum_inverse;
         return a;
     }
 
     // Get the XOR-shared scaled vector entry from the leaf node
-    inline RegXS scaled_xs(DPFnode leaf) const {
-        RegXS x;
+    inline RegXSW scaled_xs(LeafNode leaf) const {
+        RegXSW x;
+        nbits_t j = 0; // index of the next entry of x to fill
         value_t highword =
-            value_t(_mm_cvtsi128_si64x(_mm_srli_si128(leaf,8)));
-        x.xshare = highword;
+            value_t(_mm_cvtsi128_si64x(_mm_srli_si128(leaf[0],8)));
+        x[j++].xshare = highword;
+        for (nbits_t i=1;i<LWIDTH;++i) { // leaf has LWIDTH entries, not WIDTH
+            value_t lowword =
+                value_t(_mm_cvtsi128_si64x(leaf[i]));
+            value_t highword =
+                value_t(_mm_cvtsi128_si64x(_mm_srli_si128(leaf[i],8)));
+            x[j++].xshare = lowword;
+            if (j < WIDTH) { // the last leaf entry may be only half used
+                x[j++].xshare = highword;
+            }
+        }
         return x;
     }
 
     // Get the additive-shared scaled vector entry from the leaf node
-    inline RegAS scaled_as(DPFnode leaf) const {
-        RegAS a;
+    inline RegASW scaled_as(LeafNode leaf) const {
+        RegASW a;
+        nbits_t j = 0; // index of the next entry of a to fill
         value_t highword =
-            value_t(_mm_cvtsi128_si64x(_mm_srli_si128(leaf,8)));
+            value_t(_mm_cvtsi128_si64x(_mm_srli_si128(leaf[0],8)));
         if (whichhalf == 1) {
             highword = -highword;
         }
-        a.ashare = highword;
+        a[j++].ashare = highword;
+        for (nbits_t i=1;i<LWIDTH;++i) { // leaf has LWIDTH entries, not WIDTH
+            value_t lowword =
+                value_t(_mm_cvtsi128_si64x(leaf[i]));
+            value_t highword =
+                value_t(_mm_cvtsi128_si64x(_mm_srli_si128(leaf[i],8)));
+            if (whichhalf == 1) {
+                lowword = -lowword;
+                highword = -highword;
+            }
+            a[j++].ashare = lowword;
+            if (j < WIDTH) { // the last leaf entry may be only half used
+                a[j++].ashare = highword;
+            }
+        }
         return a;
     }
 
@@ -115,8 +183,16 @@ struct RDPF : public DPF {
 
 template <nbits_t WIDTH>
 struct RDPFTriple {
+    template <typename T>
+    using Triple = std::tuple<T, T, T>;
+    template <typename T>
+    using WTriple = Triple<typename RDPF<WIDTH>::W<T>>;
+
     // The type of node triples
-    using node = std::tuple<DPFnode, DPFnode, DPFnode>;
+    using node = Triple<DPFnode>;
+    using LeafNode = Triple<typename RDPF<WIDTH>::LeafNode>;
+    using RegASWT = WTriple<RegAS>;
+    using RegXSWT = WTriple<RegXS>;
 
     RegAS as_target;
     RegXS xs_target;
@@ -137,7 +213,7 @@ struct RDPFTriple {
     }
 
     // Get an element of the expansion
-    inline node get_expansion(address_t index) const {
+    inline LeafNode get_expansion(address_t index) const {
         return std::make_tuple(dpf[0].get_expansion(index),
             dpf[1].get_expansion(index), dpf[2].get_expansion(index));
     }
@@ -153,6 +229,10 @@ struct RDPFTriple {
     node descend(const node &parent, nbits_t parentdepth,
         bit_t whichchild, size_t &aes_ops) const;
 
+    // Descend the three RDPFs in lock step to a leaf node
+    LeafNode descend_to_leaf(const node &parent, nbits_t parentdepth,
+        bit_t whichchild, size_t &aes_ops) const;
+
     // Overloaded versions of functions to get DPF components and
     // outputs so that the appropriate one can be selected with a
     // parameter
@@ -162,29 +242,29 @@ struct RDPFTriple {
 
     // Additive share of the scaling value M_as such that the high words
     // of the leaf values for P0 and P1 add to M_as * e_{target}
-    inline void scaled_value(std::tuple<RegAS,RegAS,RegAS> &v) const {
-        std::get<0>(v) = dpf[0].scaled_sum;
-        std::get<1>(v) = dpf[1].scaled_sum;
-        std::get<2>(v) = dpf[2].scaled_sum;
+    inline void scaled_value(RegASWT &v) const {
+        std::get<0>(v) = dpf[0].li[0].scaled_sum;
+        std::get<1>(v) = dpf[1].li[0].scaled_sum;
+        std::get<2>(v) = dpf[2].li[0].scaled_sum;
     }
 
     // XOR share of the scaling value M_xs such that the high words
     // of the leaf values for P0 and P1 XOR to M_xs * e_{target}
-    inline void scaled_value(std::tuple<RegXS,RegXS,RegXS> &v) const {
-        std::get<0>(v) = dpf[0].scaled_xor;
-        std::get<1>(v) = dpf[1].scaled_xor;
-        std::get<2>(v) = dpf[2].scaled_xor;
+    inline void scaled_value(RegXSWT &v) const {
+        std::get<0>(v) = dpf[0].li[0].scaled_xor;
+        std::get<1>(v) = dpf[1].li[0].scaled_xor;
+        std::get<2>(v) = dpf[2].li[0].scaled_xor;
     }
 
     // Get the additive-shared unit vector entry from the leaf node
-    inline void unit(std::tuple<RegAS,RegAS,RegAS> &u, node leaf) const {
+    inline void unit(std::tuple<RegAS,RegAS,RegAS> &u, LeafNode leaf) const {
         std::get<0>(u) = dpf[0].unit_as(std::get<0>(leaf));
         std::get<1>(u) = dpf[1].unit_as(std::get<1>(leaf));
         std::get<2>(u) = dpf[2].unit_as(std::get<2>(leaf));
     }
 
     // Get the bit-shared unit vector entry from the leaf node
-    inline void unit(std::tuple<RegXS,RegXS,RegXS> &u, node leaf) const {
+    inline void unit(std::tuple<RegXS,RegXS,RegXS> &u, LeafNode leaf) const {
         std::get<0>(u) = dpf[0].unit_bs(std::get<0>(leaf));
         std::get<1>(u) = dpf[1].unit_bs(std::get<1>(leaf));
         std::get<2>(u) = dpf[2].unit_bs(std::get<2>(leaf));
@@ -193,21 +273,21 @@ struct RDPFTriple {
     // For any more complex entry type, that type will handle the conversion
     // for each DPF
     template <typename T>
-    inline void unit(std::tuple<T,T,T> &u, node leaf) const {
+    inline void unit(std::tuple<T,T,T> &u, LeafNode leaf) const {
         std::get<0>(u).unit(dpf[0], std::get<0>(leaf));
         std::get<1>(u).unit(dpf[1], std::get<1>(leaf));
         std::get<2>(u).unit(dpf[2], std::get<2>(leaf));
     }
 
     // Get the additive-shared scaled vector entry from the leaf node
-    inline void scaled(std::tuple<RegAS,RegAS,RegAS> &s, node leaf) const {
+    inline void scaled(RegASWT &s, LeafNode leaf) const {
         std::get<0>(s) = dpf[0].scaled_as(std::get<0>(leaf));
         std::get<1>(s) = dpf[1].scaled_as(std::get<1>(leaf));
         std::get<2>(s) = dpf[2].scaled_as(std::get<2>(leaf));
     }
 
     // Get the XOR-shared scaled vector entry from the leaf node
-    inline void scaled(std::tuple<RegXS,RegXS,RegXS> &s, node leaf) const {
+    inline void scaled(RegXSWT &s, LeafNode leaf) const {
         std::get<0>(s) = dpf[0].scaled_xs(std::get<0>(leaf));
         std::get<1>(s) = dpf[1].scaled_xs(std::get<1>(leaf));
         std::get<2>(s) = dpf[2].scaled_xs(std::get<2>(leaf));
@@ -216,8 +296,16 @@ struct RDPFTriple {
 
 template <nbits_t WIDTH>
 struct RDPFPair {
+    template <typename T>
+    using Pair = std::tuple<T, T>;
+    template <typename T>
+    using WPair = Pair<typename RDPF<WIDTH>::W<T>>;
+
     // The type of node pairs
-    using node = std::tuple<DPFnode, DPFnode>;
+    using node = Pair<DPFnode>;
+    using LeafNode = Pair<typename RDPF<WIDTH>::LeafNode>;
+    using RegASWP = WPair<RegAS>;
+    using RegXSWP = WPair<RegXS>;
 
     RDPF<WIDTH> dpf[2];
 
@@ -246,7 +334,7 @@ struct RDPFPair {
     }
 
     // Get an element of the expansion
-    inline node get_expansion(address_t index) const {
+    inline LeafNode get_expansion(address_t index) const {
         return std::make_tuple(dpf[0].get_expansion(index),
             dpf[1].get_expansion(index));
     }
@@ -255,32 +343,36 @@ struct RDPFPair {
     node descend(const node &parent, nbits_t parentdepth,
         bit_t whichchild, size_t &aes_ops) const;
 
+    // Descend the two RDPFs in lock step to a leaf node
+    LeafNode descend_to_leaf(const node &parent, nbits_t parentdepth,
+        bit_t whichchild, size_t &aes_ops) const;
+
     // Overloaded versions of functions to get DPF components and
     // outputs so that the appropriate one can be selected with a
     // parameter
 
     // Additive share of the scaling value M_as such that the high words
     // of the leaf values for P0 and P1 add to M_as * e_{target}
+    // (read from li[0] of each DPF, the same place the RDPF
+    // serialization code reads and writes scaled_sum)
-    inline void scaled_value(std::tuple<RegAS,RegAS> &v) const {
-        std::get<0>(v) = dpf[0].scaled_sum;
-        std::get<1>(v) = dpf[1].scaled_sum;
+    inline void scaled_value(RegASWP &v) const {
+        std::get<0>(v) = dpf[0].li[0].scaled_sum;
+        std::get<1>(v) = dpf[1].li[0].scaled_sum;
     }
 
     // XOR share of the scaling value M_xs such that the high words
     // of the leaf values for P0 and P1 XOR to M_xs * e_{target}
+    // (read from li[0] of each DPF, the same place the RDPF
+    // serialization code reads and writes scaled_xor)
-    inline void scaled_value(std::tuple<RegXS,RegXS> &v) const {
-        std::get<0>(v) = dpf[0].scaled_xor;
-        std::get<1>(v) = dpf[1].scaled_xor;
+    inline void scaled_value(RegXSWP &v) const {
+        std::get<0>(v) = dpf[0].li[0].scaled_xor;
+        std::get<1>(v) = dpf[1].li[0].scaled_xor;
     }
 
     // Get the additive-shared unit vector entry from the leaf node
-    inline void unit(std::tuple<RegAS,RegAS> &u, node leaf) const {
+    inline void unit(std::tuple<RegAS,RegAS> &u, LeafNode leaf) const {
         std::get<0>(u) = dpf[0].unit_as(std::get<0>(leaf));
         std::get<1>(u) = dpf[1].unit_as(std::get<1>(leaf));
     }
 
     // Get the bit-shared unit vector entry from the leaf node
-    inline void unit(std::tuple<RegXS,RegXS> &u, node leaf) const {
+    inline void unit(std::tuple<RegXS,RegXS> &u, LeafNode leaf) const {
         std::get<0>(u) = dpf[0].unit_bs(std::get<0>(leaf));
         std::get<1>(u) = dpf[1].unit_bs(std::get<1>(leaf));
     }
@@ -288,19 +380,19 @@ struct RDPFPair {
     // For any more complex entry type, that type will handle the conversion
     // for each DPF
     template <typename T>
-    inline void unit(std::tuple<T,T> &u, node leaf) const {
+    inline void unit(std::tuple<T,T> &u, LeafNode leaf) const {
         std::get<0>(u).unit(dpf[0], std::get<0>(leaf));
         std::get<1>(u).unit(dpf[1], std::get<1>(leaf));
     }
 
     // Get the additive-shared scaled vector entry from the leaf node
-    inline void scaled(std::tuple<RegAS,RegAS> &s, node leaf) const {
+    inline void scaled(RegASWP &s, LeafNode leaf) const {
         std::get<0>(s) = dpf[0].scaled_as(std::get<0>(leaf));
         std::get<1>(s) = dpf[1].scaled_as(std::get<1>(leaf));
     }
 
     // Get the XOR-shared scaled vector entry from the leaf node
-    inline void scaled(std::tuple<RegXS,RegXS> &s, node leaf) const {
+    inline void scaled(RegXSWP &s, LeafNode leaf) const {
         std::get<0>(s) = dpf[0].scaled_xs(std::get<0>(leaf));
         std::get<1>(s) = dpf[1].scaled_xs(std::get<1>(leaf));
     }
@@ -336,7 +428,7 @@ public:
         bool use_expansion = true);
 
     // Get the next value (or tuple of values) from the evaluator
-    typename T::node next();
+    typename T::LeafNode next();
 };
 
 // Parallel evaluation.  This class launches a number of threads each

+ 81 - 20
rdpf.tcc

@@ -63,11 +63,11 @@ StreamEval<T>::StreamEval(const T &rdpf, address_t start,
 }
 
 template <typename T>
-typename T::node StreamEval<T>::next()
+typename T::LeafNode StreamEval<T>::next()
 {
     if (use_expansion && rdpf.has_expansion()) {
         // Just use the precomputed values
-        typename T::node leaf =
+        typename T::LeafNode leaf =
             rdpf.get_expansion(nextindex ^ counter_xor_offset);
         nextindex = (nextindex + 1) & indexmask;
         return leaf;
@@ -113,7 +113,7 @@ typename T::node StreamEval<T>::next()
         }
     }
     bool xor_offset_bit = counter_xor_offset & 1;
-    typename T::node leaf = rdpf.descend(path[depth-1], depth-1,
+    typename T::LeafNode leaf = rdpf.descend_to_leaf(path[depth-1], depth-1,
         (nextindex & 1) ^ xor_offset_bit, aes_ops);
     pathindex = nextindex;
     nextindex = (nextindex + 1) & indexmask;
@@ -161,7 +161,7 @@ inline V ParallelEval<T>::reduce(V init, W process)
                     xor_offset, local_aes_ops);
                 V accum = init;
                 for (address_t x=0;x<threadsize;++x) {
-                    typename T::node leaf = ev.next();
+                    typename T::LeafNode leaf = ev.next();
                     accum += process(thread_num,
                         (threadstart+x)&indexmask, leaf);
                 }
@@ -179,6 +179,28 @@ inline V ParallelEval<T>::reduce(V init, W process)
     return total;
 }
 
+// Take one step down from a node at depth parentdepth to one of its
+// leaf children, returning the child as a LeafNode
+// whichchild = 0: left child
+// whichchild = 1: right child
+//
+// Only entry 0 of the returned LeafNode is written here.
+//
+// Cost: 1 AES operation
+template <nbits_t WIDTH>
+inline typename RDPF<WIDTH>::LeafNode RDPF<WIDTH>::descend_to_leaf(
+    const DPFnode &parent, nbits_t parentdepth, bit_t whichchild,
+    size_t &aes_ops) const
+{
+    typename RDPF<WIDTH>::LeafNode child;
+    const bool needs_correction = get_lsb(parent);
+    prg(child[0], parent, whichchild, aes_ops);
+    if (!needs_correction) {
+        return child;
+    }
+    // The parent's low bit is set, so XOR in this level's correction
+    // word; for the right child the correction word is first XORed
+    // with lsb128_mask[cfbit] for this level
+    DPFnode correction = cw[parentdepth];
+    if (whichchild) {
+        const bit_t cfbit = !!(cfbits & (value_t(1)<<parentdepth));
+        correction ^= lsb128_mask[cfbit];
+    }
+    child[0] ^= correction;
+    return child;
+}
+
 // I/O for RDPFs
 
 template <typename T, nbits_t WIDTH>
@@ -209,9 +231,13 @@ T& operator>>(T &is, RDPF<WIDTH> &rdpf)
     value_t cfbits = 0;
     is.read((char *)&cfbits, BITBYTES(depth));
     rdpf.cfbits = cfbits;
-    is.read((char *)&rdpf.unit_sum_inverse, sizeof(rdpf.unit_sum_inverse));
-    is.read((char *)&rdpf.scaled_sum, sizeof(rdpf.scaled_sum));
-    is.read((char *)&rdpf.scaled_xor, sizeof(rdpf.scaled_xor));
+    rdpf.li.resize(1);
+    is.read((char *)&rdpf.li[0].unit_sum_inverse,
+        sizeof(rdpf.li[0].unit_sum_inverse));
+    is.read((char *)&rdpf.li[0].scaled_sum,
+        sizeof(rdpf.li[0].scaled_sum));
+    is.read((char *)&rdpf.li[0].scaled_xor,
+        sizeof(rdpf.li[0].scaled_xor));
 
     return is;
 }
@@ -242,9 +268,12 @@ T& write_maybe_expanded(T &os, const RDPF<WIDTH> &rdpf,
             sizeof(rdpf.expansion[0])<<depth);
     }
     os.write((const char *)&rdpf.cfbits, BITBYTES(depth));
-    os.write((const char *)&rdpf.unit_sum_inverse, sizeof(rdpf.unit_sum_inverse));
-    os.write((const char *)&rdpf.scaled_sum, sizeof(rdpf.scaled_sum));
-    os.write((const char *)&rdpf.scaled_xor, sizeof(rdpf.scaled_xor));
+    os.write((const char *)&rdpf.li[0].unit_sum_inverse,
+        sizeof(rdpf.li[0].unit_sum_inverse));
+    os.write((const char *)&rdpf.li[0].scaled_sum,
+        sizeof(rdpf.li[0].scaled_sum));
+    os.write((const char *)&rdpf.li[0].scaled_xor,
+        sizeof(rdpf.li[0].scaled_xor));
 
     return os;
 }
@@ -332,6 +361,8 @@ RDPF<WIDTH>::RDPF(MPCTIO &tio, yield_t &yield,
     DPFnode *nextlevel = new DPFnode[1];
     nextlevel[0] = seed;
 
+    li.resize(1);
+
     // Construct each intermediate level
     while(level < depth) {
         if (player < 2) {
@@ -339,7 +370,7 @@ RDPF<WIDTH>::RDPF(MPCTIO &tio, yield_t &yield,
             curlevel = nextlevel;
             if (save_expansion && level == depth-1) {
                 expansion.resize(1<<depth);
-                nextlevel = expansion.data();
+                nextlevel = (DPFnode *)expansion.data();
             } else {
                 nextlevel = new DPFnode[1<<(level+1)];
             }
@@ -666,8 +697,8 @@ RDPF<WIDTH>::RDPF(MPCTIO &tio, yield_t &yield,
                     low_sum = -low_sum;
                     high_sum = -high_sum;
                 }
-                scaled_sum.ashare = high_sum;
-                scaled_xor.xshare = high_xor;
+                li[0].scaled_sum[0].ashare = high_sum;
+                li[0].scaled_xor[0].xshare = high_xor;
                 // Exchange low_sum and add them up
                 tio.queue_peer(&low_sum, sizeof(low_sum));
                 yield();
@@ -676,7 +707,7 @@ RDPF<WIDTH>::RDPF(MPCTIO &tio, yield_t &yield,
                 low_sum += peer_low_sum;
                 // The low_sum had better be odd
                 assert(low_sum & 1);
-                unit_sum_inverse = inverse_value_t(low_sum);
+                li[0].unit_sum_inverse = inverse_value_t(low_sum);
             }
             cw.push_back(CW);
         } else if (level == depth-1) {
@@ -694,7 +725,8 @@ RDPF<WIDTH>::RDPF(MPCTIO &tio, yield_t &yield,
 
 // Get the leaf node for the given input
 template <nbits_t WIDTH>
-DPFnode RDPF<WIDTH>::leaf(address_t input, size_t &aes_ops) const
+typename RDPF<WIDTH>::LeafNode
+    RDPF<WIDTH>::leaf(address_t input, size_t &aes_ops) const
 {
     // If we have a precomputed expansion, just use it
     if (expansion.size()) {
@@ -707,7 +739,9 @@ DPFnode RDPF<WIDTH>::leaf(address_t input, size_t &aes_ops) const
         bit_t dir = !!(input & (address_t(1)<<(totdepth-d-1)));
         node = descend(node, d, dir, aes_ops);
     }
-    return node;
+    LeafNode ln;
+    ln[0] = node;
+    return ln;
 }
 
 // Expand the DPF if it's not already expanded
@@ -728,8 +762,8 @@ void RDPF<WIDTH>::expand(size_t &aes_ops)
     for (nbits_t i=1;i<depth;++i) {
         path[i] = descend(path[i-1], i-1, 0, aes_ops);
     }
-    expansion[index++] = descend(path[depth-1], depth-1, 0, aes_ops);
-    expansion[index++] = descend(path[depth-1], depth-1, 1, aes_ops);
+    expansion[index++][0] = descend(path[depth-1], depth-1, 0, aes_ops);
+    expansion[index++][0] = descend(path[depth-1], depth-1, 1, aes_ops);
     while(index < num_leaves) {
         // Invariant: lastindex and index will both be even, and
         // index=lastindex+2
@@ -749,8 +783,8 @@ void RDPF<WIDTH>::expand(size_t &aes_ops)
             path[i+1] = descend(path[i], i, 0, aes_ops);
         }
         lastindex = index;
-        expansion[index++] = descend(path[depth-1], depth-1, 0, aes_ops);
-        expansion[index++] = descend(path[depth-1], depth-1, 1, aes_ops);
+        expansion[index++][0] = descend(path[depth-1], depth-1, 0, aes_ops);
+        expansion[index++][0] = descend(path[depth-1], depth-1, 1, aes_ops);
     }
 
     delete[] path;
@@ -796,6 +830,20 @@ typename RDPFTriple<WIDTH>::node RDPFTriple<WIDTH>::descend(
     return std::make_tuple(C0,C1,C2);
 }
 
+// Descend all three RDPFs of the triple by one level, from the given
+// parent nodes to the selected leaf children, in DPF order 0, 1, 2
+template <nbits_t WIDTH>
+typename RDPFTriple<WIDTH>::LeafNode RDPFTriple<WIDTH>::descend_to_leaf(
+    const RDPFTriple<WIDTH>::node &parent,
+    nbits_t parentdepth, bit_t whichchild,
+    size_t &aes_ops) const
+{
+    const auto &[parent0, parent1, parent2] = parent;
+    const typename RDPF<WIDTH>::LeafNode leaf0 =
+        dpf[0].descend_to_leaf(parent0, parentdepth, whichchild, aes_ops);
+    const typename RDPF<WIDTH>::LeafNode leaf1 =
+        dpf[1].descend_to_leaf(parent1, parentdepth, whichchild, aes_ops);
+    const typename RDPF<WIDTH>::LeafNode leaf2 =
+        dpf[2].descend_to_leaf(parent2, parentdepth, whichchild, aes_ops);
+    return std::make_tuple(leaf0, leaf1, leaf2);
+}
+
 template <nbits_t WIDTH>
 typename RDPFPair<WIDTH>::node RDPFPair<WIDTH>::descend(
     const RDPFPair<WIDTH>::node &parent,
@@ -808,3 +856,16 @@ typename RDPFPair<WIDTH>::node RDPFPair<WIDTH>::descend(
     C1 = dpf[1].descend(P1, parentdepth, whichchild, aes_ops);
     return std::make_tuple(C0,C1);
 }
+
+// Descend both RDPFs of the pair by one level, from the given parent
+// nodes to the selected leaf children, in DPF order 0, 1
+template <nbits_t WIDTH>
+typename RDPFPair<WIDTH>::LeafNode RDPFPair<WIDTH>::descend_to_leaf(
+    const RDPFPair<WIDTH>::node &parent,
+    nbits_t parentdepth, bit_t whichchild,
+    size_t &aes_ops) const
+{
+    const auto &[parent0, parent1] = parent;
+    const typename RDPF<WIDTH>::LeafNode leaf0 =
+        dpf[0].descend_to_leaf(parent0, parentdepth, whichchild, aes_ops);
+    const typename RDPF<WIDTH>::LeafNode leaf1 =
+        dpf[1].descend_to_leaf(parent1, parentdepth, whichchild, aes_ops);
+    return std::make_tuple(leaf0, leaf1);
+}

+ 141 - 6
types.hpp

@@ -99,6 +99,17 @@ struct RegAS {
         return res;
     }
 
+    // Multiply this scalar share by each entry of a vector: entry k of
+    // the result is a copy of *this multiplied in place by rhs[k]
+    template <size_t N>
+    inline std::array<RegAS,N> operator*(std::array<value_t,N> rhs) const {
+        std::array<RegAS,N> products;
+        products.fill(*this);
+        size_t k = 0;
+        for (RegAS &entry : products) {
+            entry *= rhs[k++];
+        }
+        return products;
+    }
+    }
+
     inline RegAS &operator&=(value_t mask) {
         this->ashare &= mask;
         return *this;
@@ -235,6 +246,17 @@ struct RegXS {
         return res;
     }
 
+    // Multiply this scalar share by each entry of a vector: entry k of
+    // the result is a copy of *this multiplied in place by rhs[k]
+    template <size_t N>
+    inline std::array<RegXS,N> operator*(std::array<value_t,N> rhs) const {
+        std::array<RegXS,N> products;
+        products.fill(*this);
+        size_t k = 0;
+        for (RegXS &entry : products) {
+            entry *= rhs[k++];
+        }
+        return products;
+    }
+    }
+
     inline RegXS &operator^=(const RegXS &rhs) {
         this->xshare ^= rhs.xshare;
         return *this;
@@ -427,9 +449,32 @@ std::tuple<T,T> operator*(const std::tuple<T,T> &A,
     return res;
 }
 
-template <typename T>
-inline std::tuple<value_t,value_t> combine(
-        const std::tuple<T,T> &A, const std::tuple<T,T> &B,
+// Componentwise product of a pair of scalars with a pair of vectors:
+// component k of the result is (component k of A) * (component k of B)
+template <typename T, size_t N>
+std::tuple<std::array<T,N>,std::array<T,N>> operator*(
+    const std::tuple<T,T> &A,
+    const std::tuple<std::array<value_t,N>,std::array<value_t,N>> &B)
+{
+    const auto &[a0, a1] = A;
+    const auto &[b0, b1] = B;
+    std::array<T,N> first = a0 * b0;
+    std::array<T,N> second = a1 * b1;
+    return std::make_tuple(first, second);
+}
+
+// Combine two arrays entry by entry: entry k of the result is
+// combine(A[k], B[k], nbits)
+template <typename S, size_t N>
+inline std::array<value_t,N> combine(const std::array<S,N> &A,
+        const std::array<S,N> &B,
+        nbits_t nbits = VALUE_BITS) {
+    std::array<value_t,N> merged;
+    auto out = merged.begin();
+    auto rhs = B.begin();
+    for (const S &lhs : A) {
+        *out++ = combine(lhs, *rhs++, nbits);
+    }
+    return merged;
+}
+
+template <typename S, size_t N>
+inline std::tuple<std::array<value_t,N>,std::array<value_t,N>>
+    combine(const std::tuple<std::array<S,N>,std::array<S,N>> &A,
+        const std::tuple<std::array<S,N>,std::array<S,N>> &B,
         nbits_t nbits = VALUE_BITS) {
     return std::make_tuple(
         combine(std::get<0>(A), std::get<0>(B), nbits),
@@ -523,6 +568,18 @@ std::tuple<T,T,T> operator*(const std::tuple<T,T,T> &A,
     return res;
 }
 
+// Componentwise product of a triple of scalars with a triple of
+// vectors: component k of the result is
+// (component k of A) * (component k of B)
+template <typename T, size_t N>
+std::tuple<std::array<T,N>,std::array<T,N>,std::array<T,N>> operator*(
+    const std::tuple<T,T,T> &A,
+    const std::tuple<std::array<value_t,N>,std::array<value_t,N>,std::array<value_t,N>> &B)
+{
+    const auto &[a0, a1, a2] = A;
+    const auto &[b0, b1, b2] = B;
+    std::array<T,N> first = a0 * b0;
+    std::array<T,N> second = a1 * b1;
+    std::array<T,N> third = a2 * b2;
+    return std::make_tuple(first, second, third);
+}
+
 inline std::vector<RegAS> operator-(const std::vector<RegAS> &A)
 {
     std::vector<RegAS> res;
@@ -564,9 +621,38 @@ inline std::array<RegBS,N> operator-(const std::array<RegBS,N> &A)
     return A;
 }
 
-template <typename T>
-inline std::tuple<value_t,value_t,value_t> combine(
-        const std::tuple<T,T,T> &A, const std::tuple<T,T,T> &B,
+// Elementwise in-place addition of two equal-length arrays; returns A
+template <typename S, size_t N>
+inline std::array<S,N> &operator+=(std::array<S,N> &A, const std::array<S,N> &B)
+{
+    auto rhs = B.begin();
+    for (S &lhs : A) {
+        lhs += *rhs++;
+    }
+    return A;
+}
+
+// Elementwise in-place subtraction of two equal-length arrays;
+// returns A
+template <typename S, size_t N>
+inline std::array<S,N> &operator-=(std::array<S,N> &A, const std::array<S,N> &B)
+{
+    auto rhs = B.begin();
+    for (S &lhs : A) {
+        lhs -= *rhs++;
+    }
+    return A;
+}
+
+// Elementwise in-place XOR of two equal-length arrays; returns A
+template <typename S, size_t N>
+inline std::array<S,N> &operator^=(std::array<S,N> &A, const std::array<S,N> &B)
+{
+    auto rhs = B.begin();
+    for (S &lhs : A) {
+        lhs ^= *rhs++;
+    }
+    return A;
+}
+
+template <typename S, size_t N>
+inline std::tuple<std::array<value_t,N>,std::array<value_t,N>,std::array<value_t,N>>
+    combine(
+        const std::tuple<std::array<S,N>,std::array<S,N>,std::array<S,N>> &A,
+        const std::tuple<std::array<S,N>,std::array<S,N>,std::array<S,N>> &B,
         nbits_t nbits = VALUE_BITS) {
     return std::make_tuple(
         combine(std::get<0>(A), std::get<0>(B), nbits),
@@ -680,6 +766,25 @@ DEFAULT_IO(HalfTriple)
 // We don't need one for AndTriple because it's exactly the same type as
 // MultTriple
 
+// I/O for arrays
+// Read the entries of x from the stream one at a time, in index order
+template <typename T, typename S, size_t N>
+T& operator>>(T& is, std::array<S,N> &x)
+{
+    for (S &entry : x) {
+        is >> entry;
+    }
+    return is;
+}
+
+// Write the entries of x to the stream one at a time, in index order,
+// with nothing inserted between them
+template <typename T, typename S, size_t N>
+T& operator<<(T& os, const std::array<S,N> &x)
+{
+    for (const S &entry : x) {
+        os << entry;
+    }
+    return os;
+}
+
 // I/O for SelectTriples
 template <typename T, typename V>
 T& operator>>(T& is, SelectTriple<V> &x)
@@ -733,6 +838,36 @@ T& operator<<(T& os, const SelectTriple<V> &x)
 DEFAULT_TUPLE_IO(RegAS)
 DEFAULT_TUPLE_IO(RegXS)
 
+// And for pairs and triples of arrays
+
+// Read a pair of arrays: first component, then second
+template <typename T, typename S, size_t N>
+T& operator>>(T& is, std::tuple<std::array<S,N>, std::array<S,N>> &x)
+{
+    auto &[first, second] = x;
+    is >> first;
+    is >> second;
+    return is;
+}
+
+// Write a pair of arrays: first component, then second
+template <typename T, typename S, size_t N>
+T& operator<<(T& os, const std::tuple<std::array<S,N>, std::array<S,N>> &x)
+{
+    const auto &[first, second] = x;
+    os << first;
+    os << second;
+    return os;
+}
+
+// Read a triple of arrays: first, second, then third component
+template <typename T, typename S, size_t N>
+T& operator>>(T& is, std::tuple<std::array<S,N>, std::array<S,N>, std::array<S,N>> &x)
+{
+    auto &[first, second, third] = x;
+    is >> first;
+    is >> second;
+    is >> third;
+    return is;
+}
+
+// Write a triple of arrays: first, second, then third component
+template <typename T, typename S, size_t N>
+T& operator<<(T& os, const std::tuple<std::array<S,N>, std::array<S,N>, std::array<S,N>> &x)
+{
+    const auto &[first, second, third] = x;
+    os << first;
+    os << second;
+    os << third;
+    return os;
+}
+
 enum ProcessingMode {
     MODE_ONLINE,        // Online mode, after preprocessing has been done
     MODE_PREPROCESSING, // Preprocessing mode