2 years ago · 953c1fd3a1
--- a/cell.cpp
+++ b/cell.cpp
@@ -92,7 +92,8 @@ struct Cell {
 
				     // the word with value 1.  This is used for ORAM reads, where the
			
 
				     // same DPF is used for all the fields.
			
 
				     template <nbits_t WIDTH>
			
 
				-    inline void unit(const RDPF<WIDTH> &dpf, DPFnode leaf) {
			
 
				+    inline void unit(const RDPF<WIDTH> &dpf,
			
 
				+        typename RDPF<WIDTH>::LeafNode leaf) {
			
 
				         key = dpf.unit_as(leaf);
			
 
				         pointers = dpf.unit_bs(leaf);
			
 
				         value = dpf.unit_bs(leaf);
			
--- a/duoram.tcc
+++ b/duoram.tcc
@@ -304,7 +304,7 @@ Duoram<T>::Shape::MemRefS<U,FT,FST,Sh>::operator FT()
 
				             shape.tio.aes_ops());
			
 
				         FT init;
			
 
				         res = pe.reduce(init, [this, &dp, &shape] (int thread_num,
			
 
				-                address_t i, const RDPFPair<1>::node &leaf) {
			
 
				+                address_t i, const RDPFPair<1>::LeafNode &leaf) {
			
 
				             // The values from the two DPFs, which will each be of type T
			
 
				             std::tuple<FT,FT> V;
			
 
				             dp.unit(V, leaf);
			
@@ -341,7 +341,7 @@ Duoram<T>::Shape::MemRefS<U,FT,FST,Sh>::operator FT()
 
				             shape.shape_size, shape.tio.cpu_nthreads(),
			
 
				             shape.tio.aes_ops());
			
 
				         gamma = pe.reduce(init, [this, &dp, &shape] (int thread_num,
			
 
				-                address_t i, const RDPFPair<1>::node &leaf) {
			
 
				+                address_t i, const RDPFPair<1>::LeafNode &leaf) {
			
 
				             // The values from the two DPFs, each of type FT
			
 
				             std::tuple<FT,FT> V;
			
 
				             dp.unit(V, leaf);
			
@@ -389,8 +389,10 @@ typename Duoram<T>::Shape::template MemRefS<U,FT,FST,Sh>
 
				         U indoffset;
			
 
				         dt.get_target(indoffset);
			
 
				         indoffset -= idx;
			
 
				-        auto Moffset = std::make_tuple(M, M, M);
			
 
				-        std::tuple<FT,FT,FT> scaled_val;
			
 
				+        RDPF<1>::W<FT> MW;
			
 
				+        MW[0] = M;
			
 
				+        auto Moffset = std::make_tuple(MW, MW, MW);
			
 
				+        RDPFTriple<1>::WTriple<FT> scaled_val;
			
 
				         dt.scaled_value(scaled_val);
			
 
				         Moffset -= scaled_val;
			
 
				 
			
@@ -406,7 +408,7 @@ typename Duoram<T>::Shape::template MemRefS<U,FT,FST,Sh>
 
				 
			
 
				         // Receive the above from the peer
			
 
				         U peerindoffset;
			
 
				-        std::tuple<FT,FT,FT> peerMoffset;
			
 
				+        RDPFTriple<1>::WTriple<FT> peerMoffset;
			
 
				         shape.tio.recv_peer(&peerindoffset, BITBYTES(shape.addr_size));
			
 
				         shape.tio.iostream_peer() >> peerMoffset;
			
 
				 
			
@@ -420,22 +422,23 @@ typename Duoram<T>::Shape::template MemRefS<U,FT,FST,Sh>
 
				             shape.tio.aes_ops());
			
 
				         int init = 0;
			
 
				         pe.reduce(init, [this, &dt, &shape, &Mshift, player] (int thread_num,
			
 
				-                address_t i, const RDPFTriple<1>::node &leaf) {
			
 
				+                address_t i, const RDPFTriple<1>::LeafNode &leaf) {
			
 
				             // The values from the three DPFs
			
 
				-            std::tuple<FT,FT,FT> scaled, unit;
			
 
				+            RDPFTriple<1>::WTriple<FT> scaled;
			
 
				+            std::tuple<FT,FT,FT> unit;
			
 
				             dt.scaled(scaled, leaf);
			
 
				             dt.unit(unit, leaf);
			
 
				             auto [V0, V1, V2] = scaled + unit * Mshift;
			
 
				             // References to the appropriate cells in our database, our
			
 
				             // blind, and our copy of the peer's blinded database
			
 
				             auto [DB, BL, PBD] = shape.get_comp(i,fieldsel);
			
 
				-            DB += V0;
			
 
				+            DB += V0[0];
			
 
				             if (player == 0) {
			
 
				-                BL -= V1;
			
 
				-                PBD += V2-V0;
			
 
				+                BL -= V1[0];
			
 
				+                PBD += V2[0]-V0[0];
			
 
				             } else {
			
 
				-                BL -= V2;
			
 
				-                PBD += V1-V0;
			
 
				+                BL -= V2[0];
			
 
				+                PBD += V1[0]-V0[0];
			
 
				             }
			
 
				             return 0;
			
 
				         });
			
@@ -444,7 +447,7 @@ typename Duoram<T>::Shape::template MemRefS<U,FT,FST,Sh>
 
				 
			
 
				         RDPFPair<1> dp = shape.tio.rdpfpair(shape.yield, shape.addr_size);
			
 
				         U p0indoffset, p1indoffset;
			
 
				-        std::tuple<FT,FT> p0Moffset, p1Moffset;
			
 
				+        RDPFPair<1>::WPair<FT> p0Moffset, p1Moffset;
			
 
				 
			
 
				         shape.yield();
			
 
				 
			
@@ -463,16 +466,19 @@ typename Duoram<T>::Shape::template MemRefS<U,FT,FST,Sh>
 
				             shape.tio.aes_ops());
			
 
				         int init = 0;
			
 
				         pe.reduce(init, [this, &dp, &shape, &Mshift] (int thread_num,
			
 
				-                address_t i, const RDPFPair<1>::node &leaf) {
			
 
				+                address_t i, const RDPFPair<1>::LeafNode &leaf) {
			
 
				             // The values from the two DPFs
			
 
				-            std::tuple<FT,FT> scaled, unit;
			
 
				+            RDPFPair<1>::WPair<FT> scaled;
			
 
				+            std::tuple<FT,FT> unit;
			
 
				             dp.scaled(scaled, leaf);
			
 
				             dp.unit(unit, leaf);
			
 
				-            auto V = scaled + unit * Mshift;
			
 
				+            auto [V0, V1] = scaled + unit * Mshift;
			
 
				             // shape.get_server(i,fieldsel) returns a pair of references to
			
 
				             // the appropriate cells in the two blinded databases; we unpack
			
 
				             // the pair and subtract the corresponding value from each cell.
			
 
				-            shape.get_server(i,fieldsel) -= V;
			
 
				+            auto [BL0, BL1] = shape.get_server(i,fieldsel);
			
 
				+            BL0 -= V0[0];
			
 
				+            BL1 -= V1[0];
			
 
				             return 0;
			
 
				         });
			
 
				     }
			
--- a/online.cpp
+++ b/online.cpp
@@ -220,13 +220,13 @@ static void rdpf_test(MPCIO &mpcio,
 
				                         for (int i=0;i<2;++i) {
			
 
				                             const RDPF<1> &dpf = dp.dpf[i];
			
 
				                             for (address_t x=0;x<(address_t(1)<<depth);++x) {
			
 
				-                                DPFnode leaf = dpf.leaf(x, aes_ops);
			
 
				+                                RDPF<1>::LeafNode leaf = dpf.leaf(x, aes_ops);
			
 
				                                 RegBS ub = dpf.unit_bs(leaf);
			
 
				                                 RegAS ua = dpf.unit_as(leaf);
			
 
				-                                RegXS sx = dpf.scaled_xs(leaf);
			
 
				-                                RegAS sa = dpf.scaled_as(leaf);
			
 
				+                                RDPF<1>::RegXSW sx = dpf.scaled_xs(leaf);
			
 
				+                                RDPF<1>::RegASW sa = dpf.scaled_as(leaf);
			
 
				                                 printf("%04x %x %016lx %016lx %016lx\n", x,
			
 
				-                                    ub.bshare, ua.ashare, sx.xshare, sa.ashare);
			
 
				+                                    ub.bshare, ua.ashare, sx[0].xshare, sa[0].ashare);
			
 
				                             }
			
 
				                             printf("\n");
			
 
				                         }
			
@@ -237,36 +237,36 @@ static void rdpf_test(MPCIO &mpcio,
 
				                             RegXS peer_scaled_xor;
			
 
				                             RegAS peer_scaled_sum;
			
 
				                             if (tio.player() == 1) {
			
 
				-                                tio.iostream_peer() << dpf.scaled_xor << dpf.scaled_sum;
			
 
				+                                tio.iostream_peer() << dpf.li[0].scaled_xor[0] << dpf.li[0].scaled_sum[0];
			
 
				                             } else {
			
 
				                                 tio.iostream_peer() >> peer_scaled_xor >> peer_scaled_sum;
			
 
				-                                peer_scaled_sum += dpf.scaled_sum;
			
 
				-                                peer_scaled_xor ^= dpf.scaled_xor;
			
 
				+                                peer_scaled_sum += dpf.li[0].scaled_sum[0];
			
 
				+                                peer_scaled_xor ^= dpf.li[0].scaled_xor[0];
			
 
				                             }
			
 
				                             for (address_t x=0;x<(address_t(1)<<depth);++x) {
			
 
				-                                DPFnode leaf = dpf.leaf(x, aes_ops);
			
 
				+                                RDPF<1>::LeafNode leaf = dpf.leaf(x, aes_ops);
			
 
				                                 RegBS ub = dpf.unit_bs(leaf);
			
 
				                                 RegAS ua = dpf.unit_as(leaf);
			
 
				-                                RegXS sx = dpf.scaled_xs(leaf);
			
 
				-                                RegAS sa = dpf.scaled_as(leaf);
			
 
				+                                RDPF<1>::RegXSW sx = dpf.scaled_xs(leaf);
			
 
				+                                RDPF<1>::RegASW sa = dpf.scaled_as(leaf);
			
 
				                                 printf("%04x %x %016lx %016lx %016lx\n", x,
			
 
				-                                    ub.bshare, ua.ashare, sx.xshare, sa.ashare);
			
 
				+                                    ub.bshare, ua.ashare, sx[0].xshare, sa[0].ashare);
			
 
				                                 if (tio.player() == 1) {
			
 
				                                     tio.iostream_peer() << ub << ua << sx << sa;
			
 
				                                 } else {
			
 
				                                     RegBS peer_ub;
			
 
				                                     RegAS peer_ua;
			
 
				-                                    RegXS peer_sx;
			
 
				-                                    RegAS peer_sa;
			
 
				+                                    RDPF<1>::RegXSW peer_sx;
			
 
				+                                    RDPF<1>::RegASW peer_sa;
			
 
				                                     tio.iostream_peer() >> peer_ub >> peer_ua >>
			
 
				                                         peer_sx >> peer_sa;
			
 
				                                     ub ^= peer_ub;
			
 
				                                     ua += peer_ua;
			
 
				                                     sx ^= peer_sx;
			
 
				                                     sa += peer_sa;
			
 
				-                                    if (ub.bshare || ua.ashare || sx.xshare || sa.ashare) {
			
 
				+                                    if (ub.bshare || ua.ashare || sx[0].xshare || sa[0].ashare) {
			
 
				                                         printf("**** %x %016lx %016lx %016lx\n",
			
 
				-                                            ub.bshare, ua.ashare, sx.xshare, sa.ashare);
			
 
				+                                            ub.bshare, ua.ashare, sx[0].xshare, sa[0].ashare);
			
 
				                                         printf("SCALE                   %016lx %016lx\n",
			
 
				                                             peer_scaled_xor.xshare, peer_scaled_sum.ashare);
			
 
				                                     }
			
@@ -304,14 +304,14 @@ static void rdpf_timing(MPCIO &mpcio,
 
				                     for (int i=0;i<2;++i) {
			
 
				                         RDPF<1> &dpf = dp.dpf[i];
			
 
				                         dpf.expand(aes_ops);
			
 
				-                        RegXS scaled_xor;
			
 
				+                        RDPF<1>::RegXSW scaled_xor;
			
 
				                         for (address_t x=0;x<(address_t(1)<<depth);++x) {
			
 
				-                            DPFnode leaf = dpf.leaf(x, aes_ops);
			
 
				-                            RegXS sx = dpf.scaled_xs(leaf);
			
 
				+                            RDPF<1>::LeafNode leaf = dpf.leaf(x, aes_ops);
			
 
				+                            RDPF<1>::RegXSW sx = dpf.scaled_xs(leaf);
			
 
				                             scaled_xor ^= sx;
			
 
				                         }
			
 
				-                        printf("%016lx\n%016lx\n", scaled_xor.xshare,
			
 
				-                            dpf.scaled_xor.xshare);
			
 
				+                        printf("%016lx\n%016lx\n", scaled_xor[0].xshare,
			
 
				+                            dpf.li[0].scaled_xor[0].xshare);
			
 
				                         printf("\n");
			
 
				                     }
			
 
				                 } else {
			
@@ -319,14 +319,14 @@ static void rdpf_timing(MPCIO &mpcio,
 
				                     for (int i=0;i<3;++i) {
			
 
				                         RDPF<1> &dpf = dt.dpf[i];
			
 
				                         dpf.expand(aes_ops);
			
 
				-                        RegXS scaled_xor;
			
 
				+                        RDPF<1>::RegXSW scaled_xor;
			
 
				                         for (address_t x=0;x<(address_t(1)<<depth);++x) {
			
 
				-                            DPFnode leaf = dpf.leaf(x, aes_ops);
			
 
				-                            RegXS sx = dpf.scaled_xs(leaf);
			
 
				+                            RDPF<1>::LeafNode leaf = dpf.leaf(x, aes_ops);
			
 
				+                            RDPF<1>::RegXSW sx = dpf.scaled_xs(leaf);
			
 
				                             scaled_xor ^= sx;
			
 
				                         }
			
 
				-                        printf("%016lx\n%016lx\n", scaled_xor.xshare,
			
 
				-                            dpf.scaled_xor.xshare);
			
 
				+                        printf("%016lx\n%016lx\n", scaled_xor[0].xshare,
			
 
				+                            dpf.li[0].scaled_xor[0].xshare);
			
 
				                         printf("\n");
			
 
				                     }
			
 
				                 }
			
@@ -339,7 +339,7 @@ static void rdpf_timing(MPCIO &mpcio,
 
				 static value_t parallel_streameval_rdpf(MPCIO &mpcio, const RDPF<1> &dpf,
			
 
				     address_t start, int num_threads)
			
 
				 {
			
 
				-    RegXS scaled_xor[num_threads];
			
 
				+    RDPF<1>::RegXSW scaled_xor[num_threads];
			
 
				     boost::asio::thread_pool pool(num_threads);
			
 
				     address_t totsize = (address_t(1)<<dpf.depth());
			
 
				     address_t threadstart = start;
			
@@ -351,12 +351,12 @@ static value_t parallel_streameval_rdpf(MPCIO &mpcio, const RDPF<1> &dpf,
 
				             [&mpcio, &dpf, &scaled_xor, thread_num, threadstart, threadsize] {
			
 
				                 MPCTIO tio(mpcio, thread_num);
			
 
				 //printf("Thread %d from %X for %X\n", thread_num, threadstart, threadsize);
			
 
				-                RegXS local_xor;
			
 
				+                RDPF<1>::RegXSW local_xor;
			
 
				                 size_t local_aes_ops = 0;
			
 
				                 auto ev = StreamEval(dpf, threadstart, 0, local_aes_ops);
			
 
				                 for (address_t x=0;x<threadsize;++x) {
			
 
				 //if (x%0x10000 == 0) printf("%d", thread_num);
			
 
				-                    DPFnode leaf = ev.next();
			
 
				+                    RDPF<1>::LeafNode leaf = ev.next();
			
 
				                     local_xor ^= dpf.scaled_xs(leaf);
			
 
				                 }
			
 
				                 scaled_xor[thread_num] = local_xor;
			
@@ -366,11 +366,11 @@ static value_t parallel_streameval_rdpf(MPCIO &mpcio, const RDPF<1> &dpf,
 
				         threadstart = (threadstart + threadsize) % totsize;
			
 
				     }
			
 
				     pool.join();
			
 
				-    RegXS res;
			
 
				+    RDPF<1>::RegXSW res;
			
 
				     for (int thread_num = 0; thread_num < num_threads; ++thread_num) {
			
 
				         res ^= scaled_xor[thread_num];
			
 
				     }
			
 
				-    return res.xshare;
			
 
				+    return res[0].xshare;
			
 
				 }
			
 
				 
			
 
				 static void rdpfeval_timing(MPCIO &mpcio,
			
@@ -398,7 +398,7 @@ static void rdpfeval_timing(MPCIO &mpcio,
 
				                 value_t scaled_xor =
			
 
				                     parallel_streameval_rdpf(mpcio, dpf, start, num_threads);
			
 
				                 printf("%016lx\n%016lx\n", scaled_xor,
			
 
				-                    dpf.scaled_xor.xshare);
			
 
				+                    dpf.li[0].scaled_xor[0].xshare);
			
 
				                 printf("\n");
			
 
				             }
			
 
				         } else {
			
@@ -408,7 +408,7 @@ static void rdpfeval_timing(MPCIO &mpcio,
 
				                 value_t scaled_xor =
			
 
				                     parallel_streameval_rdpf(mpcio, dpf, start, num_threads);
			
 
				                 printf("%016lx\n%016lx\n", scaled_xor,
			
 
				-                    dpf.scaled_xor.xshare);
			
 
				+                    dpf.li[0].scaled_xor[0].xshare);
			
 
				                 printf("\n");
			
 
				             }
			
 
				         }
			
@@ -440,13 +440,13 @@ static void par_rdpfeval_timing(MPCIO &mpcio,
 
				                 nbits_t depth = dpf.depth();
			
 
				                 auto pe = ParallelEval(dpf, start, 0,
			
 
				                     address_t(1)<<depth, num_threads, tio.aes_ops());
			
 
				-                RegXS result, init;
			
 
				+                RDPF<1>::RegXSW result, init;
			
 
				                 result = pe.reduce(init, [&dpf] (int thread_num,
			
 
				-                        address_t i, const RDPF<1>::node &leaf) {
			
 
				+                        address_t i, const RDPF<1>::LeafNode &leaf) {
			
 
				                     return dpf.scaled_xs(leaf);
			
 
				                 });
			
 
				-                printf("%016lx\n%016lx\n", result.xshare,
			
 
				-                    dpf.scaled_xor.xshare);
			
 
				+                printf("%016lx\n%016lx\n", result[0].xshare,
			
 
				+                    dpf.li[0].scaled_xor[0].xshare);
			
 
				                 printf("\n");
			
 
				             }
			
 
				         } else {
			
@@ -456,13 +456,13 @@ static void par_rdpfeval_timing(MPCIO &mpcio,
 
				                 nbits_t depth = dpf.depth();
			
 
				                 auto pe = ParallelEval(dpf, start, 0,
			
 
				                     address_t(1)<<depth, num_threads, tio.aes_ops());
			
 
				-                RegXS result, init;
			
 
				+                RDPF<1>::RegXSW result, init;
			
 
				                 result = pe.reduce(init, [&dpf] (int thread_num,
			
 
				-                        address_t i, const RDPF<1>::node &leaf) {
			
 
				+                        address_t i, const RDPF<1>::LeafNode &leaf) {
			
 
				                     return dpf.scaled_xs(leaf);
			
 
				                 });
			
 
				-                printf("%016lx\n%016lx\n", result.xshare,
			
 
				-                    dpf.scaled_xor.xshare);
			
 
				+                printf("%016lx\n%016lx\n", result[0].xshare,
			
 
				+                    dpf.li[0].scaled_xor[0].xshare);
			
 
				                 printf("\n");
			
 
				             }
			
 
				         }
			
@@ -490,42 +490,42 @@ static void tupleeval_timing(MPCIO &mpcio,
 
				         size_t &aes_ops = tio.aes_ops();
			
 
				         if (tio.player() == 2) {
			
 
				             RDPFPair<1> dp = tio.rdpfpair(yield, depth);
			
 
				-            RegXS scaled_xor0, scaled_xor1;
			
 
				+            RDPF<1>::RegXSW scaled_xor0, scaled_xor1;
			
 
				             auto ev = StreamEval(dp, start, 0, aes_ops, false);
			
 
				             for (address_t x=0;x<(address_t(1)<<depth);++x) {
			
 
				                 auto [L0, L1] = ev.next();
			
 
				-                RegXS sx0 = dp.dpf[0].scaled_xs(L0);
			
 
				-                RegXS sx1 = dp.dpf[1].scaled_xs(L1);
			
 
				+                RDPF<1>::RegXSW sx0 = dp.dpf[0].scaled_xs(L0);
			
 
				+                RDPF<1>::RegXSW sx1 = dp.dpf[1].scaled_xs(L1);
			
 
				                 scaled_xor0 ^= sx0;
			
 
				                 scaled_xor1 ^= sx1;
			
 
				             }
			
 
				-            printf("%016lx\n%016lx\n", scaled_xor0.xshare,
			
 
				-                dp.dpf[0].scaled_xor.xshare);
			
 
				+            printf("%016lx\n%016lx\n", scaled_xor0[0].xshare,
			
 
				+                dp.dpf[0].li[0].scaled_xor[0].xshare);
			
 
				             printf("\n");
			
 
				-            printf("%016lx\n%016lx\n", scaled_xor1.xshare,
			
 
				-                dp.dpf[1].scaled_xor.xshare);
			
 
				+            printf("%016lx\n%016lx\n", scaled_xor1[0].xshare,
			
 
				+                dp.dpf[1].li[0].scaled_xor[0].xshare);
			
 
				             printf("\n");
			
 
				         } else {
			
 
				             RDPFTriple<1> dt = tio.rdpftriple(yield, depth);
			
 
				-            RegXS scaled_xor0, scaled_xor1, scaled_xor2;
			
 
				+            RDPF<1>::RegXSW scaled_xor0, scaled_xor1, scaled_xor2;
			
 
				             auto ev = StreamEval(dt, start, 0, aes_ops, false);
			
 
				             for (address_t x=0;x<(address_t(1)<<depth);++x) {
			
 
				                 auto [L0, L1, L2] = ev.next();
			
 
				-                RegXS sx0 = dt.dpf[0].scaled_xs(L0);
			
 
				-                RegXS sx1 = dt.dpf[1].scaled_xs(L1);
			
 
				-                RegXS sx2 = dt.dpf[2].scaled_xs(L2);
			
 
				+                RDPF<1>::RegXSW sx0 = dt.dpf[0].scaled_xs(L0);
			
 
				+                RDPF<1>::RegXSW sx1 = dt.dpf[1].scaled_xs(L1);
			
 
				+                RDPF<1>::RegXSW sx2 = dt.dpf[2].scaled_xs(L2);
			
 
				                 scaled_xor0 ^= sx0;
			
 
				                 scaled_xor1 ^= sx1;
			
 
				                 scaled_xor2 ^= sx2;
			
 
				             }
			
 
				-            printf("%016lx\n%016lx\n", scaled_xor0.xshare,
			
 
				-                dt.dpf[0].scaled_xor.xshare);
			
 
				+            printf("%016lx\n%016lx\n", scaled_xor0[0].xshare,
			
 
				+                dt.dpf[0].li[0].scaled_xor[0].xshare);
			
 
				             printf("\n");
			
 
				-            printf("%016lx\n%016lx\n", scaled_xor1.xshare,
			
 
				-                dt.dpf[1].scaled_xor.xshare);
			
 
				+            printf("%016lx\n%016lx\n", scaled_xor1[0].xshare,
			
 
				+                dt.dpf[1].li[0].scaled_xor[0].xshare);
			
 
				             printf("\n");
			
 
				-            printf("%016lx\n%016lx\n", scaled_xor2.xshare,
			
 
				-                dt.dpf[2].scaled_xor.xshare);
			
 
				+            printf("%016lx\n%016lx\n", scaled_xor2[0].xshare,
			
 
				+                dt.dpf[2].li[0].scaled_xor[0].xshare);
			
 
				             printf("\n");
			
 
				         }
			
 
				     });
			
@@ -554,40 +554,38 @@ static void par_tupleeval_timing(MPCIO &mpcio,
 
				             RDPFPair<1> dp = tio.rdpfpair(yield, depth);
			
 
				             auto pe = ParallelEval(dp, start, 0, address_t(1)<<depth,
			
 
				                 num_threads, aes_ops);
			
 
				-            using V = std::tuple<RegXS,RegXS>;
			
 
				-            V result, init;
			
 
				+            RDPFPair<1>::RegXSWP result, init;
			
 
				             result = pe.reduce(init, [&dp] (int thread_num, address_t i,
			
 
				-                    const RDPFPair<1>::node &leaf) {
			
 
				-                std::tuple<RegXS,RegXS> scaled;
			
 
				+                    const RDPFPair<1>::LeafNode &leaf) {
			
 
				+                RDPFPair<1>::RegXSWP scaled;
			
 
				                 dp.scaled(scaled, leaf);
			
 
				                 return scaled;
			
 
				             });
			
 
				-            printf("%016lx\n%016lx\n", std::get<0>(result).xshare,
			
 
				-                dp.dpf[0].scaled_xor.xshare);
			
 
				+            printf("%016lx\n%016lx\n", std::get<0>(result)[0].xshare,
			
 
				+                dp.dpf[0].li[0].scaled_xor[0].xshare);
			
 
				             printf("\n");
			
 
				-            printf("%016lx\n%016lx\n", std::get<1>(result).xshare,
			
 
				-                dp.dpf[1].scaled_xor.xshare);
			
 
				+            printf("%016lx\n%016lx\n", std::get<1>(result)[0].xshare,
			
 
				+                dp.dpf[1].li[0].scaled_xor[0].xshare);
			
 
				             printf("\n");
			
 
				         } else {
			
 
				             RDPFTriple<1> dt = tio.rdpftriple(yield, depth);
			
 
				             auto pe = ParallelEval(dt, start, 0, address_t(1)<<depth,
			
 
				                 num_threads, aes_ops);
			
 
				-            using V = std::tuple<RegXS,RegXS,RegXS>;
			
 
				-            V result, init;
			
 
				+            RDPFTriple<1>::RegXSWT result, init;
			
 
				             result = pe.reduce(init, [&dt] (int thread_num, address_t i,
			
 
				-                    const RDPFTriple<1>::node &leaf) {
			
 
				-                std::tuple<RegXS,RegXS,RegXS> scaled;
			
 
				+                    const RDPFTriple<1>::LeafNode &leaf) {
			
 
				+                RDPFTriple<1>::RegXSWT scaled;
			
 
				                 dt.scaled(scaled, leaf);
			
 
				                 return scaled;
			
 
				             });
			
 
				-            printf("%016lx\n%016lx\n", std::get<0>(result).xshare,
			
 
				-                dt.dpf[0].scaled_xor.xshare);
			
 
				+            printf("%016lx\n%016lx\n", std::get<0>(result)[0].xshare,
			
 
				+                dt.dpf[0].li[0].scaled_xor[0].xshare);
			
 
				             printf("\n");
			
 
				-            printf("%016lx\n%016lx\n", std::get<1>(result).xshare,
			
 
				-                dt.dpf[1].scaled_xor.xshare);
			
 
				+            printf("%016lx\n%016lx\n", std::get<1>(result)[0].xshare,
			
 
				+                dt.dpf[1].li[0].scaled_xor[0].xshare);
			
 
				             printf("\n");
			
 
				-            printf("%016lx\n%016lx\n", std::get<2>(result).xshare,
			
 
				-                dt.dpf[2].scaled_xor.xshare);
			
 
				+            printf("%016lx\n%016lx\n", std::get<2>(result)[0].xshare,
			
 
				+                dt.dpf[2].li[0].scaled_xor[0].xshare);
			
 
				             printf("\n");
			
 
				         }
			
 
				     });
			
--- a/preproc.cpp
+++ b/preproc.cpp
@@ -161,13 +161,13 @@ void preprocessing_comp(MPCIO &mpcio, const PRACOptions &opts, char **args)
 
				                                 yield();
			
 
				                                 RDPFTriple rdpftrip =
			
 
				                                     tio.rdpftriple(yield, type, opts.expand_rdpfs);
			
 
				-                                printf("dep  = %d\n", type);
			
 
				-                                printf("usi0 = %016lx\n", rdpftrip.dpf[0].unit_sum_inverse);
			
 
				-                                printf("sxr0 = %016lx\n", rdpftrip.dpf[0].scaled_xor.xshare);
			
 
				-                                printf("usi1 = %016lx\n", rdpftrip.dpf[1].unit_sum_inverse);
			
 
				-                                printf("sxr1 = %016lx\n", rdpftrip.dpf[1].scaled_xor.xshare);
			
 
				-                                printf("usi2 = %016lx\n", rdpftrip.dpf[2].unit_sum_inverse);
			
 
				-                                printf("sxr2 = %016lx\n", rdpftrip.dpf[2].scaled_xor.xshare);
			
 
				+printf("dep  = %d\n", type);
			
 
				+printf("usi0 = %016lx\n", rdpftrip.dpf[0].li[0].unit_sum_inverse);
			
 
				+printf("sxr0 = %016lx\n", rdpftrip.dpf[0].li[0].scaled_xor[0].xshare);
			
 
				+printf("usi1 = %016lx\n", rdpftrip.dpf[1].li[0].unit_sum_inverse);
			
 
				+printf("sxr1 = %016lx\n", rdpftrip.dpf[1].li[0].scaled_xor[0].xshare);
			
 
				+printf("usi2 = %016lx\n", rdpftrip.dpf[2].li[0].unit_sum_inverse);
			
 
				+printf("sxr2 = %016lx\n", rdpftrip.dpf[2].li[0].scaled_xor[0].xshare);
			
 
				                                 tripfile.os() << rdpftrip;
			
 
				                             });
			
 
				                     }
			
@@ -327,12 +327,12 @@ void preprocessing_server(MPCServerIO &mpcsrvio, const PRACOptions &opts, char *
 
				                                 [&stio, &opts, pairfile, depth](yield_t &yield) {
			
 
				                                     yield();
			
 
				                                     RDPFPair rdpfpair = stio.rdpfpair(yield, depth);
			
 
				-                                printf("usi0 = %016lx\n", rdpfpair.dpf[0].unit_sum_inverse);
			
 
				-                                printf("sxr0 = %016lx\n", rdpfpair.dpf[0].scaled_xor.xshare);
			
 
				-                                printf("dep0 = %d\n", rdpfpair.dpf[0].depth());
			
 
				-                                printf("usi1 = %016lx\n", rdpfpair.dpf[1].unit_sum_inverse);
			
 
				-                                printf("sxr1 = %016lx\n", rdpfpair.dpf[1].scaled_xor.xshare);
			
 
				-                                printf("dep1 = %d\n", rdpfpair.dpf[1].depth());
			
 
				+printf("usi0 = %016lx\n", rdpfpair.dpf[0].li[0].unit_sum_inverse);
			
 
				+printf("sxr0 = %016lx\n", rdpfpair.dpf[0].li[0].scaled_xor[0].xshare);
			
 
				+printf("dep0 = %d\n", rdpfpair.dpf[0].depth());
			
 
				+printf("usi1 = %016lx\n", rdpfpair.dpf[1].li[0].unit_sum_inverse);
			
 
				+printf("sxr1 = %016lx\n", rdpfpair.dpf[1].li[0].scaled_xor[0].xshare);
			
 
				+printf("dep1 = %d\n", rdpfpair.dpf[1].depth());
			
 
				                                     if (opts.expand_rdpfs) {
			
 
				                                         rdpfpair.dpf[0].expand(stio.aes_ops());
			
 
				                                         rdpfpair.dpf[1].expand(stio.aes_ops());
			
--- a/rdpf.hpp
+++ b/rdpf.hpp
@@ -1,6 +1,7 @@
 
				 #ifndef __RDPF_HPP__
			
 
				 #define __RDPF_HPP__
			
 
				 
			
 
				+#include <array>
			
 
				 #include <vector>
			
 
				 #include <iostream>
			
 
				 
			
@@ -15,17 +16,50 @@
 
				 
			
 
				 template <nbits_t WIDTH>
			
 
				 struct RDPF : public DPF {
			
 
				-    // The amount we have to scale the low words of the leaf values by
			
 
				-    // to get additive shares of a unit vector
			
 
				-    value_t unit_sum_inverse;
			
 
				-    // Additive share of the scaling value M_as such that the high words
			
 
				-    // of the leaf values for P0 and P1 add to M_as * e_{target}
			
 
				-    RegAS scaled_sum;
			
 
				-    // XOR share of the scaling value M_xs such that the high words
			
 
				-    // of the leaf values for P0 and P1 XOR to M_xs * e_{target}
			
 
				-    RegXS scaled_xor;
			
 
				+    template <typename T>
			
 
				+    using W = std::array<T, WIDTH>;
			
 
				+    using RegASW = W<RegAS>;
			
 
				+    using RegXSW = W<RegXS>;
			
 
				+    // The number of 128-bit leaf node entries you need to get 1 unit
			
 
				+    // value and WIDTH scaled values (each is 64 bits)
			
 
				+    static const nbits_t LWIDTH = 1 + (WIDTH/2);
			
 
				+    using LeafNode = std::array<DPFnode,LWIDTH>;
			
 
				+
			
 
				+    // Information for leaf levels of the RDPF.  Normal RDPFs only have
			
 
				+    // one leaf level (at the bottom), but incremental RDPFs have a leaf
			
 
				+    // level for each level of the DPF.
			
 
				+    struct LeafInfo {
			
 
				+        // The WIDTH correction words for this leaf level
			
 
				+        std::array<DPFnode,WIDTH> leaf_cw;
			
 
				+        // The amount we have to scale the low words of the leaf values by
			
 
				+        // to get additive shares of a unit vector
			
 
				+        value_t unit_sum_inverse;
			
 
				+        // Additive share of the scaling values M_as such that the high words
			
 
				+        // of the WIDTH leaf values for P0 and P1 add to M_as * e_{target}
			
 
				+        std::array<RegAS,WIDTH> scaled_sum;
			
 
				+        // XOR share of the scaling values M_xs such that the high words
			
 
				+        // of the WIDTH leaf values for P0 and P1 XOR to M_xs * e_{target}
			
 
				+        std::array<RegXS,WIDTH> scaled_xor;
			
 
				+
			
 
				+        LeafInfo() : unit_sum_inverse(0) {}
			
 
				+    };
			
 
				+
			
 
				+    // The LeafInfo for each leaf level.  Normal RDPFs only have one
			
 
				+    // leaf level, so this will be a vector of length 1.  Incremental
			
 
				+    // RDPFs will have one entry for each level in the DPF.  The entry
			
 
				+    // corresponding to level i of the DPF (of total depth d) is
			
 
				+    // li[d-i].
			
 
				+    std::vector<LeafInfo> li;
			
 
				+
			
 
				+    // The leaf correction flag bits for the WIDTH leaf words at each
			
 
				+    // leaf level.  The bit for leaf word j of level i (for an
			
 
				+    // incremental DPF of total depth d) is leaf_cfbits[j] & (1<<(d-i)).
			
 
				+    // For a normal (not incremental) RDPF, the same formula applies, so
			
 
				+    // only the low bit of each of these WIDTH words gets used.
			
 
				+    std::array<value_t,WIDTH> leaf_cfbits;
			
 
				+
			
 
				     // If we're saving the expansion, put it here
			
 
				-    std::vector<DPFnode> expansion;
			
 
				+    std::vector<LeafNode> expansion;
			
 
				 
			
 
				     RDPF() {}
			
 
				 
			
@@ -48,7 +82,7 @@ struct RDPF : public DPF {
 
				     inline bool has_expansion() const { return expansion.size() > 0; }
			
 
				 
			
 
				     // Get an element of the expansion
			
 
				-    inline node get_expansion(address_t index) const {
			
 
				+    inline LeafNode get_expansion(address_t index) const {
			
 
				         return expansion[index];
			
 
				     }
			
 
				 
			
@@ -58,47 +92,81 @@ struct RDPF : public DPF {
 
				     // Get the leaf node for the given input
			
 
				     //
			
 
				     // Cost: depth AES operations
			
 
				-    DPFnode leaf(address_t input, size_t &aes_ops) const;
			
 
				+    LeafNode leaf(address_t input, size_t &aes_ops) const;
			
 
				 
			
 
				     // Expand the DPF if it's not already expanded
			
 
				     void expand(size_t &aes_ops);
			
 
				 
			
 
				+    // Descend from a node at depth parentdepth to one of its leaf children
			
 
				+    // whichchild = 0: left child
			
 
				+    // whichchild = 1: right child
			
 
				+    //
			
 
				+    // Cost: 1 AES operation
			
 
				+    inline LeafNode descend_to_leaf(const DPFnode &parent,
			
 
				+        nbits_t parentdepth, bit_t whichchild, size_t &aes_ops) const;
			
 
				+
			
 
				     // Get the bit-shared unit vector entry from the leaf node
			
 
				-    inline RegBS unit_bs(DPFnode leaf) const {
			
 
				+    inline RegBS unit_bs(LeafNode leaf) const {
			
 
				         RegBS b;
			
 
				-        b.bshare = get_lsb(leaf);
			
 
				+        b.bshare = get_lsb(leaf[0]);
			
 
				         return b;
			
 
				     }
			
 
				 
			
 
				     // Get the additive-shared unit vector entry from the leaf node
			
 
				-    inline RegAS unit_as(DPFnode leaf) const {
			
 
				+    inline RegAS unit_as(LeafNode leaf) const {
			
 
				         RegAS a;
			
 
				-        value_t lowword = value_t(_mm_cvtsi128_si64x(leaf));
			
 
				+        value_t lowword = value_t(_mm_cvtsi128_si64x(leaf[0]));
			
 
				         if (whichhalf == 1) {
			
 
				             lowword = -lowword;
			
 
				         }
			
 
				-        a.ashare = lowword * unit_sum_inverse;
			
 
				+        a.ashare = lowword * li[0].unit_sum_inverse;
			
 
				         return a;
			
 
				     }
			
 
				 
			
 
				     // Get the XOR-shared scaled vector entry from the leaf node
			
 
				-    inline RegXS scaled_xs(DPFnode leaf) const {
			
 
				-        RegXS x;
			
 
				+    inline RegXSW scaled_xs(LeafNode leaf) const {
			
 
				+        RegXSW x;
			
 
				+        nbits_t j = 0;
			
 
				         value_t highword =
			
 
				-            value_t(_mm_cvtsi128_si64x(_mm_srli_si128(leaf,8)));
			
 
				-        x.xshare = highword;
			
 
				+            value_t(_mm_cvtsi128_si64x(_mm_srli_si128(leaf[0],8)));
			
 
				+        x[j++].xshare = highword;
			
 
				+        for (nbits_t i=1;i<WIDTH;++i) {
			
 
				+            value_t lowword =
			
 
				+                value_t(_mm_cvtsi128_si64x(leaf[i]));
			
 
				+            value_t highword =
			
 
				+                value_t(_mm_cvtsi128_si64x(_mm_srli_si128(leaf[i],8)));
			
 
				+            x[j++].xshare = lowword;
			
 
				+            if (j < WIDTH) {
			
 
				+                x[j++].xshare = highword;
			
 
				+            }
			
 
				+        }
			
 
				         return x;
			
 
				     }
			
 
				 
			
 
				     // Get the additive-shared scaled vector entry from the leaf node
			
 
				-    inline RegAS scaled_as(DPFnode leaf) const {
			
 
				-        RegAS a;
			
 
				+    inline RegASW scaled_as(LeafNode leaf) const {
			
 
				+        RegASW a;
			
 
				+        nbits_t j = 0;
			
 
				         value_t highword =
			
 
				-            value_t(_mm_cvtsi128_si64x(_mm_srli_si128(leaf,8)));
			
 
				+            value_t(_mm_cvtsi128_si64x(_mm_srli_si128(leaf[0],8)));
			
 
				         if (whichhalf == 1) {
			
 
				             highword = -highword;
			
 
				         }
			
 
				-        a.ashare = highword;
			
 
				+        a[j++].ashare = highword;
			
 
				+        for (nbits_t i=1;i<WIDTH;++i) {
			
 
				+            value_t lowword =
			
 
				+                value_t(_mm_cvtsi128_si64x(leaf[i]));
			
 
				+            value_t highword =
			
 
				+                value_t(_mm_cvtsi128_si64x(_mm_srli_si128(leaf[i],8)));
			
 
				+            if (whichhalf == 1) {
			
 
				+                lowword = -lowword;
			
 
				+                highword = -highword;
			
 
				+            }
			
 
				+            a[j++].ashare = lowword;
			
 
				+            if (j < WIDTH) {
			
 
				+                a[j++].ashare = highword;
			
 
				+            }
			
 
				+        }
			
 
				         return a;
			
 
				     }
			
 
				 
			
@@ -115,8 +183,16 @@ struct RDPF : public DPF {
 
				 
			
 
				 template <nbits_t WIDTH>
			
 
				 struct RDPFTriple {
			
 
				+    template <typename T>
			
 
				+    using Triple = std::tuple<T, T, T>;
			
 
				+    template <typename T>
			
 
				+    using WTriple = Triple<typename RDPF<WIDTH>::W<T>>;
			
 
				+
			
 
				     // The type of node triples
			
 
				-    using node = std::tuple<DPFnode, DPFnode, DPFnode>;
			
 
				+    using node = Triple<DPFnode>;
			
 
				+    using LeafNode = Triple<typename RDPF<WIDTH>::LeafNode>;
			
 
				+    using RegASWT = WTriple<RegAS>;
			
 
				+    using RegXSWT = WTriple<RegXS>;
			
 
				 
			
 
				     RegAS as_target;
			
 
				     RegXS xs_target;
			
@@ -137,7 +213,7 @@ struct RDPFTriple {
 
				     }
			
 
				 
			
 
				     // Get an element of the expansion
			
 
				-    inline node get_expansion(address_t index) const {
			
 
				+    inline LeafNode get_expansion(address_t index) const {
			
 
				         return std::make_tuple(dpf[0].get_expansion(index),
			
 
				             dpf[1].get_expansion(index), dpf[2].get_expansion(index));
			
 
				     }
			
@@ -153,6 +229,10 @@ struct RDPFTriple {
 
				     node descend(const node &parent, nbits_t parentdepth,
			
 
				         bit_t whichchild, size_t &aes_ops) const;
			
 
				 
			
 
				+    // Descend the three RDPFs in lock step to a leaf node
			
 
				+    LeafNode descend_to_leaf(const node &parent, nbits_t parentdepth,
			
 
				+        bit_t whichchild, size_t &aes_ops) const;
			
 
				+
			
 
				     // Overloaded versions of functions to get DPF components and
			
 
				     // outputs so that the appropriate one can be selected with a
			
 
				     // parameter
			
@@ -162,29 +242,29 @@ struct RDPFTriple {
 
				 
			
 
				     // Additive share of the scaling value M_as such that the high words
			
 
				     // of the leaf values for P0 and P1 add to M_as * e_{target}
			
 
				-    inline void scaled_value(std::tuple<RegAS,RegAS,RegAS> &v) const {
			
 
				-        std::get<0>(v) = dpf[0].scaled_sum;
			
 
				-        std::get<1>(v) = dpf[1].scaled_sum;
			
 
				-        std::get<2>(v) = dpf[2].scaled_sum;
			
 
				+    inline void scaled_value(RegASWT &v) const {
			
 
				+        std::get<0>(v) = dpf[0].li[0].scaled_sum;
			
 
				+        std::get<1>(v) = dpf[1].li[0].scaled_sum;
			
 
				+        std::get<2>(v) = dpf[2].li[0].scaled_sum;
			
 
				     }
			
 
				 
			
 
				     // XOR share of the scaling value M_xs such that the high words
			
 
				     // of the leaf values for P0 and P1 XOR to M_xs * e_{target}
			
 
				-    inline void scaled_value(std::tuple<RegXS,RegXS,RegXS> &v) const {
			
 
				-        std::get<0>(v) = dpf[0].scaled_xor;
			
 
				-        std::get<1>(v) = dpf[1].scaled_xor;
			
 
				-        std::get<2>(v) = dpf[2].scaled_xor;
			
 
				+    inline void scaled_value(RegXSWT &v) const {
			
 
				+        std::get<0>(v) = dpf[0].li[0].scaled_xor;
			
 
				+        std::get<1>(v) = dpf[1].li[0].scaled_xor;
			
 
				+        std::get<2>(v) = dpf[2].li[0].scaled_xor;
			
 
				     }
			
 
				 
			
 
				     // Get the additive-shared unit vector entry from the leaf node
			
 
				-    inline void unit(std::tuple<RegAS,RegAS,RegAS> &u, node leaf) const {
			
 
				+    inline void unit(std::tuple<RegAS,RegAS,RegAS> &u, LeafNode leaf) const {
			
 
				         std::get<0>(u) = dpf[0].unit_as(std::get<0>(leaf));
			
 
				         std::get<1>(u) = dpf[1].unit_as(std::get<1>(leaf));
			
 
				         std::get<2>(u) = dpf[2].unit_as(std::get<2>(leaf));
			
 
				     }
			
 
				 
			
 
				     // Get the bit-shared unit vector entry from the leaf node
			
 
				-    inline void unit(std::tuple<RegXS,RegXS,RegXS> &u, node leaf) const {
			
 
				+    inline void unit(std::tuple<RegXS,RegXS,RegXS> &u, LeafNode leaf) const {
			
 
				         std::get<0>(u) = dpf[0].unit_bs(std::get<0>(leaf));
			
 
				         std::get<1>(u) = dpf[1].unit_bs(std::get<1>(leaf));
			
 
				         std::get<2>(u) = dpf[2].unit_bs(std::get<2>(leaf));
			
@@ -193,21 +273,21 @@ struct RDPFTriple {
 
				     // For any more complex entry type, that type will handle the conversion
			
 
				     // for each DPF
			
 
				     template <typename T>
			
 
				-    inline void unit(std::tuple<T,T,T> &u, node leaf) const {
			
 
				+    inline void unit(std::tuple<T,T,T> &u, LeafNode leaf) const {
			
 
				         std::get<0>(u).unit(dpf[0], std::get<0>(leaf));
			
 
				         std::get<1>(u).unit(dpf[1], std::get<1>(leaf));
			
 
				         std::get<2>(u).unit(dpf[2], std::get<2>(leaf));
			
 
				     }
			
 
				 
			
 
				     // Get the additive-shared scaled vector entry from the leaf node
			
 
				-    inline void scaled(std::tuple<RegAS,RegAS,RegAS> &s, node leaf) const {
			
 
				+    inline void scaled(RegASWT &s, LeafNode leaf) const {
			
 
				         std::get<0>(s) = dpf[0].scaled_as(std::get<0>(leaf));
			
 
				         std::get<1>(s) = dpf[1].scaled_as(std::get<1>(leaf));
			
 
				         std::get<2>(s) = dpf[2].scaled_as(std::get<2>(leaf));
			
 
				     }
			
 
				 
			
 
				     // Get the XOR-shared scaled vector entry from the leaf node
			
 
				-    inline void scaled(std::tuple<RegXS,RegXS,RegXS> &s, node leaf) const {
			
 
				+    inline void scaled(RegXSWT &s, LeafNode leaf) const {
			
 
				         std::get<0>(s) = dpf[0].scaled_xs(std::get<0>(leaf));
			
 
				         std::get<1>(s) = dpf[1].scaled_xs(std::get<1>(leaf));
			
 
				         std::get<2>(s) = dpf[2].scaled_xs(std::get<2>(leaf));
			
@@ -216,8 +296,16 @@ struct RDPFTriple {
 
				 
			
 
				 template <nbits_t WIDTH>
			
 
				 struct RDPFPair {
			
 
				+    template <typename T>
			
 
				+    using Pair = std::tuple<T, T>;
			
 
				+    template <typename T>
			
 
				+    using WPair = Pair<typename RDPF<WIDTH>::W<T>>;
			
 
				+
			
 
				     // The type of node pairs
			
 
				-    using node = std::tuple<DPFnode, DPFnode>;
			
 
				+    using node = Pair<DPFnode>;
			
 
				+    using LeafNode = Pair<typename RDPF<WIDTH>::LeafNode>;
			
 
				+    using RegASWP = WPair<RegAS>;
			
 
				+    using RegXSWP = WPair<RegXS>;
			
 
				 
			
 
				     RDPF<WIDTH> dpf[2];
			
 
				 
			
@@ -246,7 +334,7 @@ struct RDPFPair {
 
				     }
			
 
				 
			
 
				     // Get an element of the expansion
			
 
				-    inline node get_expansion(address_t index) const {
			
 
				+    inline LeafNode get_expansion(address_t index) const {
			
 
				         return std::make_tuple(dpf[0].get_expansion(index),
			
 
				             dpf[1].get_expansion(index));
			
 
				     }
			
@@ -255,32 +343,36 @@ struct RDPFPair {
 
				     node descend(const node &parent, nbits_t parentdepth,
			
 
				         bit_t whichchild, size_t &aes_ops) const;
			
 
				 
			
 
				+    // Descend the two RDPFs in lock step to a leaf node
			
 
				+    LeafNode descend_to_leaf(const node &parent, nbits_t parentdepth,
			
 
				+        bit_t whichchild, size_t &aes_ops) const;
			
 
				+
			
 
				     // Overloaded versions of functions to get DPF components and
			
 
				     // outputs so that the appropriate one can be selected with a
			
 
				     // parameter
			
 
				 
			
 
				     // Additive share of the scaling value M_as such that the high words
			
 
				     // of the leaf values for P0 and P1 add to M_as * e_{target}
			
 
				-    inline void scaled_value(std::tuple<RegAS,RegAS> &v) const {
			
 
				+    inline void scaled_value(RegASWP &v) const {
			
 
				         std::get<0>(v) = dpf[0].scaled_sum;
			
 
				         std::get<1>(v) = dpf[1].scaled_sum;
			
 
				     }
			
 
				 
			
 
				     // XOR share of the scaling value M_xs such that the high words
			
 
				     // of the leaf values for P0 and P1 XOR to M_xs * e_{target}
			
 
				-    inline void scaled_value(std::tuple<RegXS,RegXS> &v) const {
			
 
				+    inline void scaled_value(RegXSWP &v) const {
			
 
				         std::get<0>(v) = dpf[0].scaled_xor;
			
 
				         std::get<1>(v) = dpf[1].scaled_xor;
			
 
				     }
			
 
				 
			
 
				     // Get the additive-shared unit vector entry from the leaf node
			
 
				-    inline void unit(std::tuple<RegAS,RegAS> &u, node leaf) const {
			
 
				+    inline void unit(std::tuple<RegAS,RegAS> &u, LeafNode leaf) const {
			
 
				         std::get<0>(u) = dpf[0].unit_as(std::get<0>(leaf));
			
 
				         std::get<1>(u) = dpf[1].unit_as(std::get<1>(leaf));
			
 
				     }
			
 
				 
			
 
				     // Get the bit-shared unit vector entry from the leaf node
			
 
				-    inline void unit(std::tuple<RegXS,RegXS> &u, node leaf) const {
			
 
				+    inline void unit(std::tuple<RegXS,RegXS> &u, LeafNode leaf) const {
			
 
				         std::get<0>(u) = dpf[0].unit_bs(std::get<0>(leaf));
			
 
				         std::get<1>(u) = dpf[1].unit_bs(std::get<1>(leaf));
			
 
				     }
			
@@ -288,19 +380,19 @@ struct RDPFPair {
 
				     // For any more complex entry type, that type will handle the conversion
			
 
				     // for each DPF
			
 
				     template <typename T>
			
 
				-    inline void unit(std::tuple<T,T> &u, node leaf) const {
			
 
				+    inline void unit(std::tuple<T,T> &u, LeafNode leaf) const {
			
 
				         std::get<0>(u).unit(dpf[0], std::get<0>(leaf));
			
 
				         std::get<1>(u).unit(dpf[1], std::get<1>(leaf));
			
 
				     }
			
 
				 
			
 
				     // Get the additive-shared scaled vector entry from the leaf node
			
 
				-    inline void scaled(std::tuple<RegAS,RegAS> &s, node leaf) const {
			
 
				+    inline void scaled(RegASWP &s, LeafNode leaf) const {
			
 
				         std::get<0>(s) = dpf[0].scaled_as(std::get<0>(leaf));
			
 
				         std::get<1>(s) = dpf[1].scaled_as(std::get<1>(leaf));
			
 
				     }
			
 
				 
			
 
				     // Get the XOR-shared scaled vector entry from the leaf node
			
 
				-    inline void scaled(std::tuple<RegXS,RegXS> &s, node leaf) const {
			
 
				+    inline void scaled(RegXSWP &s, LeafNode leaf) const {
			
 
				         std::get<0>(s) = dpf[0].scaled_xs(std::get<0>(leaf));
			
 
				         std::get<1>(s) = dpf[1].scaled_xs(std::get<1>(leaf));
			
 
				     }
			
@@ -336,7 +428,7 @@ public:
 
				         bool use_expansion = true);
			
 
				 
			
 
				     // Get the next value (or tuple of values) from the evaluator
			
 
				-    typename T::node next();
			
 
				+    typename T::LeafNode next();
			
 
				 };
			
 
				 
			
 
				 // Parallel evaluation.  This class launches a number of threads each
			
--- a/rdpf.tcc
+++ b/rdpf.tcc
@@ -63,11 +63,11 @@ StreamEval<T>::StreamEval(const T &rdpf, address_t start,
 
				 }
			
 
				 
			
 
				 template <typename T>
			
 
				-typename T::node StreamEval<T>::next()
			
 
				+typename T::LeafNode StreamEval<T>::next()
			
 
				 {
			
 
				     if (use_expansion && rdpf.has_expansion()) {
			
 
				         // Just use the precomputed values
			
 
				-        typename T::node leaf =
			
 
				+        typename T::LeafNode leaf =
			
 
				             rdpf.get_expansion(nextindex ^ counter_xor_offset);
			
 
				         nextindex = (nextindex + 1) & indexmask;
			
 
				         return leaf;
			
@@ -113,7 +113,7 @@ typename T::node StreamEval<T>::next()
 
				         }
			
 
				     }
			
 
				     bool xor_offset_bit = counter_xor_offset & 1;
			
 
				-    typename T::node leaf = rdpf.descend(path[depth-1], depth-1,
			
 
				+    typename T::LeafNode leaf = rdpf.descend_to_leaf(path[depth-1], depth-1,
			
 
				         (nextindex & 1) ^ xor_offset_bit, aes_ops);
			
 
				     pathindex = nextindex;
			
 
				     nextindex = (nextindex + 1) & indexmask;
			
@@ -161,7 +161,7 @@ inline V ParallelEval<T>::reduce(V init, W process)
 
				                     xor_offset, local_aes_ops);
			
 
				                 V accum = init;
			
 
				                 for (address_t x=0;x<threadsize;++x) {
			
 
				-                    typename T::node leaf = ev.next();
			
 
				+                    typename T::LeafNode leaf = ev.next();
			
 
				                     accum += process(thread_num,
			
 
				                         (threadstart+x)&indexmask, leaf);
			
 
				                 }
			
@@ -179,6 +179,28 @@ inline V ParallelEval<T>::reduce(V init, W process)
 
				     return total;
			
 
				 }
			
 
				 
			
 
				+// Descend from a node at depth parentdepth to one of its leaf children
			
 
				+// whichchild = 0: left child
			
 
				+// whichchild = 1: right child
			
 
				+//
			
 
				+// Cost: 1 AES operation
			
 
				+template <nbits_t WIDTH>
			
 
				+inline typename RDPF<WIDTH>::LeafNode RDPF<WIDTH>::descend_to_leaf(
			
 
				+    const DPFnode &parent, nbits_t parentdepth, bit_t whichchild,
			
 
				+    size_t &aes_ops) const
			
 
				+{
			
 
				+    typename RDPF<WIDTH>::LeafNode prgout;
			
 
				+    bool flag = get_lsb(parent);
			
 
				+    prg(prgout[0], parent, whichchild, aes_ops);
			
 
				+    if (flag) {
			
 
				+        DPFnode CW = cw[parentdepth];
			
 
				+        bit_t cfbit = !!(cfbits & (value_t(1)<<parentdepth));
			
 
				+        DPFnode CWR = CW ^ lsb128_mask[cfbit];
			
 
				+        prgout[0] ^= (whichchild ? CWR : CW);
			
 
				+    }
			
 
				+    return prgout;
			
 
				+}
			
 
				+
			
 
				 // I/O for RDPFs
			
 
				 
			
 
				 template <typename T, nbits_t WIDTH>
			
@@ -209,9 +231,13 @@ T& operator>>(T &is, RDPF<WIDTH> &rdpf)
 
				     value_t cfbits = 0;
			
 
				     is.read((char *)&cfbits, BITBYTES(depth));
			
 
				     rdpf.cfbits = cfbits;
			
 
				-    is.read((char *)&rdpf.unit_sum_inverse, sizeof(rdpf.unit_sum_inverse));
			
 
				-    is.read((char *)&rdpf.scaled_sum, sizeof(rdpf.scaled_sum));
			
 
				-    is.read((char *)&rdpf.scaled_xor, sizeof(rdpf.scaled_xor));
			
 
				+    rdpf.li.resize(1);
			
 
				+    is.read((char *)&rdpf.li[0].unit_sum_inverse,
			
 
				+        sizeof(rdpf.li[0].unit_sum_inverse));
			
 
				+    is.read((char *)&rdpf.li[0].scaled_sum,
			
 
				+        sizeof(rdpf.li[0].scaled_sum));
			
 
				+    is.read((char *)&rdpf.li[0].scaled_xor,
			
 
				+        sizeof(rdpf.li[0].scaled_xor));
			
 
				 
			
 
				     return is;
			
 
				 }
			
@@ -242,9 +268,12 @@ T& write_maybe_expanded(T &os, const RDPF<WIDTH> &rdpf,
 
				             sizeof(rdpf.expansion[0])<<depth);
			
 
				     }
			
 
				     os.write((const char *)&rdpf.cfbits, BITBYTES(depth));
			
 
				-    os.write((const char *)&rdpf.unit_sum_inverse, sizeof(rdpf.unit_sum_inverse));
			
 
				-    os.write((const char *)&rdpf.scaled_sum, sizeof(rdpf.scaled_sum));
			
 
				-    os.write((const char *)&rdpf.scaled_xor, sizeof(rdpf.scaled_xor));
			
 
				+    os.write((const char *)&rdpf.li[0].unit_sum_inverse,
			
 
				+        sizeof(rdpf.li[0].unit_sum_inverse));
			
 
				+    os.write((const char *)&rdpf.li[0].scaled_sum,
			
 
				+        sizeof(rdpf.li[0].scaled_sum));
			
 
				+    os.write((const char *)&rdpf.li[0].scaled_xor,
			
 
				+        sizeof(rdpf.li[0].scaled_xor));
			
 
				 
			
 
				     return os;
			
 
				 }
			
@@ -332,6 +361,8 @@ RDPF<WIDTH>::RDPF(MPCTIO &tio, yield_t &yield,
 
				     DPFnode *nextlevel = new DPFnode[1];
			
 
				     nextlevel[0] = seed;
			
 
				 
			
 
				+    li.resize(1);
			
 
				+
			
 
				     // Construct each intermediate level
			
 
				     while(level < depth) {
			
 
				         if (player < 2) {
			
@@ -339,7 +370,7 @@ RDPF<WIDTH>::RDPF(MPCTIO &tio, yield_t &yield,
 
				             curlevel = nextlevel;
			
 
				             if (save_expansion && level == depth-1) {
			
 
				                 expansion.resize(1<<depth);
			
 
				-                nextlevel = expansion.data();
			
 
				+                nextlevel = (DPFnode *)expansion.data();
			
 
				             } else {
			
 
				                 nextlevel = new DPFnode[1<<(level+1)];
			
 
				             }
			
@@ -666,8 +697,8 @@ RDPF<WIDTH>::RDPF(MPCTIO &tio, yield_t &yield,
 
				                     low_sum = -low_sum;
			
 
				                     high_sum = -high_sum;
			
 
				                 }
			
 
				-                scaled_sum.ashare = high_sum;
			
 
				-                scaled_xor.xshare = high_xor;
			
 
				+                li[0].scaled_sum[0].ashare = high_sum;
			
 
				+                li[0].scaled_xor[0].xshare = high_xor;
			
 
				                 // Exchange low_sum and add them up
			
 
				                 tio.queue_peer(&low_sum, sizeof(low_sum));
			
 
				                 yield();
			
@@ -676,7 +707,7 @@ RDPF<WIDTH>::RDPF(MPCTIO &tio, yield_t &yield,
 
				                 low_sum += peer_low_sum;
			
 
				                 // The low_sum had better be odd
			
 
				                 assert(low_sum & 1);
			
 
				-                unit_sum_inverse = inverse_value_t(low_sum);
			
 
				+                li[0].unit_sum_inverse = inverse_value_t(low_sum);
			
 
				             }
			
 
				             cw.push_back(CW);
			
 
				         } else if (level == depth-1) {
			
@@ -694,7 +725,8 @@ RDPF<WIDTH>::RDPF(MPCTIO &tio, yield_t &yield,
 
				 
			
 
				 // Get the leaf node for the given input
			
 
				 template <nbits_t WIDTH>
			
 
				-DPFnode RDPF<WIDTH>::leaf(address_t input, size_t &aes_ops) const
			
 
				+typename RDPF<WIDTH>::LeafNode
			
 
				+    RDPF<WIDTH>::leaf(address_t input, size_t &aes_ops) const
			
 
				 {
			
 
				     // If we have a precomputed expansion, just use it
			
 
				     if (expansion.size()) {
			
@@ -707,7 +739,9 @@ DPFnode RDPF<WIDTH>::leaf(address_t input, size_t &aes_ops) const
 
				         bit_t dir = !!(input & (address_t(1)<<(totdepth-d-1)));
			
 
				         node = descend(node, d, dir, aes_ops);
			
 
				     }
			
 
				-    return node;
			
 
				+    LeafNode ln;
			
 
				+    ln[0] = node;
			
 
				+    return ln;
			
 
				 }
			
 
				 
			
 
				 // Expand the DPF if it's not already expanded
			
@@ -728,8 +762,8 @@ void RDPF<WIDTH>::expand(size_t &aes_ops)
 
				     for (nbits_t i=1;i<depth;++i) {
			
 
				         path[i] = descend(path[i-1], i-1, 0, aes_ops);
			
 
				     }
			
 
				-    expansion[index++] = descend(path[depth-1], depth-1, 0, aes_ops);
			
 
				-    expansion[index++] = descend(path[depth-1], depth-1, 1, aes_ops);
			
 
				+    expansion[index++][0] = descend(path[depth-1], depth-1, 0, aes_ops);
			
 
				+    expansion[index++][0] = descend(path[depth-1], depth-1, 1, aes_ops);
			
 
				     while(index < num_leaves) {
			
 
				         // Invariant: lastindex and index will both be even, and
			
 
				         // index=lastindex+2
			
@@ -749,8 +783,8 @@ void RDPF<WIDTH>::expand(size_t &aes_ops)
 
				             path[i+1] = descend(path[i], i, 0, aes_ops);
			
 
				         }
			
 
				         lastindex = index;
			
 
				-        expansion[index++] = descend(path[depth-1], depth-1, 0, aes_ops);
			
 
				-        expansion[index++] = descend(path[depth-1], depth-1, 1, aes_ops);
			
 
				+        expansion[index++][0] = descend(path[depth-1], depth-1, 0, aes_ops);
			
 
				+        expansion[index++][0] = descend(path[depth-1], depth-1, 1, aes_ops);
			
 
				     }
			
 
				 
			
 
				     delete[] path;
			
@@ -796,6 +830,20 @@ typename RDPFTriple<WIDTH>::node RDPFTriple<WIDTH>::descend(
 
				     return std::make_tuple(C0,C1,C2);
			
 
				 }
			
 
				 
			
 
				+template <nbits_t WIDTH>
			
 
				+typename RDPFTriple<WIDTH>::LeafNode RDPFTriple<WIDTH>::descend_to_leaf(
			
 
				+    const RDPFTriple<WIDTH>::node &parent,
			
 
				+    nbits_t parentdepth, bit_t whichchild,
			
 
				+    size_t &aes_ops) const
			
 
				+{
			
 
				+    auto [P0, P1, P2] = parent;
			
 
				+    typename RDPF<WIDTH>::LeafNode C0, C1, C2;
			
 
				+    C0 = dpf[0].descend_to_leaf(P0, parentdepth, whichchild, aes_ops);
			
 
				+    C1 = dpf[1].descend_to_leaf(P1, parentdepth, whichchild, aes_ops);
			
 
				+    C2 = dpf[2].descend_to_leaf(P2, parentdepth, whichchild, aes_ops);
			
 
				+    return std::make_tuple(C0,C1,C2);
			
 
				+}
			
 
				+
			
 
				 template <nbits_t WIDTH>
			
 
				 typename RDPFPair<WIDTH>::node RDPFPair<WIDTH>::descend(
			
 
				     const RDPFPair<WIDTH>::node &parent,
			
@@ -808,3 +856,16 @@ typename RDPFPair<WIDTH>::node RDPFPair<WIDTH>::descend(
 
				     C1 = dpf[1].descend(P1, parentdepth, whichchild, aes_ops);
			
 
				     return std::make_tuple(C0,C1);
			
 
				 }
			
 
				+
			
 
				+template <nbits_t WIDTH>
			
 
				+typename RDPFPair<WIDTH>::LeafNode RDPFPair<WIDTH>::descend_to_leaf(
			
 
				+    const RDPFPair<WIDTH>::node &parent,
			
 
				+    nbits_t parentdepth, bit_t whichchild,
			
 
				+    size_t &aes_ops) const
			
 
				+{
			
 
				+    auto [P0, P1] = parent;
			
 
				+    typename RDPF<WIDTH>::LeafNode C0, C1;
			
 
				+    C0 = dpf[0].descend_to_leaf(P0, parentdepth, whichchild, aes_ops);
			
 
				+    C1 = dpf[1].descend_to_leaf(P1, parentdepth, whichchild, aes_ops);
			
 
				+    return std::make_tuple(C0,C1);
			
 
				+}
			
--- a/types.hpp
+++ b/types.hpp
@@ -99,6 +99,17 @@ struct RegAS {
 
				         return res;
			
 
				     }
			
 
				 
			
 
				+    // Multiply a scalar by a vector
			
 
				+    template <size_t N>
			
 
				+    inline std::array<RegAS,N> operator*(std::array<value_t,N> rhs) const {
			
 
				+        std::array<RegAS,N> res;
			
 
				+        for (size_t i=0;i<N;++i) {
			
 
				+            res[i] = *this;
			
 
				+            res[i] *= rhs[i];
			
 
				+        }
			
 
				+        return res;
			
 
				+    }
			
 
				+
			
 
				     inline RegAS &operator&=(value_t mask) {
			
 
				         this->ashare &= mask;
			
 
				         return *this;
			
@@ -235,6 +246,17 @@ struct RegXS {
 
				         return res;
			
 
				     }
			
 
				 
			
 
				+    // Multiply a scalar by a vector
			
 
				+    template <size_t N>
			
 
				+    inline std::array<RegXS,N> operator*(std::array<value_t,N> rhs) const {
			
 
				+        std::array<RegXS,N> res;
			
 
				+        for (size_t i=0;i<N;++i) {
			
 
				+            res[i] = *this;
			
 
				+            res[i] *= rhs[i];
			
 
				+        }
			
 
				+        return res;
			
 
				+    }
			
 
				+
			
 
				     inline RegXS &operator^=(const RegXS &rhs) {
			
 
				         this->xshare ^= rhs.xshare;
			
 
				         return *this;
			
@@ -427,9 +449,32 @@ std::tuple<T,T> operator*(const std::tuple<T,T> &A,
 
				     return res;
			
 
				 }
			
 
				 
			
 
				-template <typename T>
			
 
				-inline std::tuple<value_t,value_t> combine(
			
 
				-        const std::tuple<T,T> &A, const std::tuple<T,T> &B,
			
 
				+template <typename T, size_t N>
			
 
				+std::tuple<std::array<T,N>,std::array<T,N>> operator*(
			
 
				+    const std::tuple<T,T> &A,
			
 
				+    const std::tuple<std::array<value_t,N>,std::array<value_t,N>> &B)
			
 
				+{
			
 
				+    std::tuple<std::array<T,N>,std::array<T,N>> res;
			
 
				+    std::get<0>(res) = std::get<0>(A) * std::get<0>(B);
			
 
				+    std::get<1>(res) = std::get<1>(A) * std::get<1>(B);
			
 
				+    return res;
			
 
				+}
			
 
				+
			
 
				+template <typename S, size_t N>
			
 
				+inline std::array<value_t,N> combine(const std::array<S,N> &A,
			
 
				+        const std::array<S,N> &B,
			
 
				+        nbits_t nbits = VALUE_BITS) {
			
 
				+    std::array<value_t,N> res;
			
 
				+    for (size_t i=0;i<N;++i) {
			
 
				+        res[i] = combine(A[i], B[i], nbits);
			
 
				+    }
			
 
				+    return res;
			
 
				+}
			
 
				+
			
 
				+template <typename S, size_t N>
			
 
				+inline std::tuple<std::array<value_t,N>,std::array<value_t,N>>
			
 
				+    combine(const std::tuple<std::array<S,N>,std::array<S,N>> &A,
			
 
				+        const std::tuple<std::array<S,N>,std::array<S,N>> &B,
			
 
				         nbits_t nbits = VALUE_BITS) {
			
 
				     return std::make_tuple(
			
 
				         combine(std::get<0>(A), std::get<0>(B), nbits),
			
@@ -523,6 +568,18 @@ std::tuple<T,T,T> operator*(const std::tuple<T,T,T> &A,
 
				     return res;
			
 
				 }
			
 
				 
			
 
				+template <typename T, size_t N>
			
 
				+std::tuple<std::array<T,N>,std::array<T,N>,std::array<T,N>> operator*(
			
 
				+    const std::tuple<T,T,T> &A,
			
 
				+    const std::tuple<std::array<value_t,N>,std::array<value_t,N>,std::array<value_t,N>> &B)
			
 
				+{
			
 
				+    std::tuple<std::array<T,N>,std::array<T,N>,std::array<T,N>> res;
			
 
				+    std::get<0>(res) = std::get<0>(A) * std::get<0>(B);
			
 
				+    std::get<1>(res) = std::get<1>(A) * std::get<1>(B);
			
 
				+    std::get<2>(res) = std::get<2>(A) * std::get<2>(B);
			
 
				+    return res;
			
 
				+}
			
 
				+
			
 
				 inline std::vector<RegAS> operator-(const std::vector<RegAS> &A)
			
 
				 {
			
 
				     std::vector<RegAS> res;
			
@@ -564,9 +621,38 @@ inline std::array<RegBS,N> operator-(const std::array<RegBS,N> &A)
 
				     return A;
			
 
				 }
			
 
				 
			
 
				-template <typename T>
			
 
				-inline std::tuple<value_t,value_t,value_t> combine(
			
 
				-        const std::tuple<T,T,T> &A, const std::tuple<T,T,T> &B,
			
 
				+template <typename S, size_t N>
			
 
				+inline std::array<S,N> &operator+=(std::array<S,N> &A, const std::array<S,N> &B)
			
 
				+{
			
 
				+    for (size_t i=0;i<N;++i) {
			
 
				+        A[i] += B[i];
			
 
				+    }
			
 
				+    return A;
			
 
				+}
			
 
				+
			
 
				+template <typename S, size_t N>
			
 
				+inline std::array<S,N> &operator-=(std::array<S,N> &A, const std::array<S,N> &B)
			
 
				+{
			
 
				+    for (size_t i=0;i<N;++i) {
			
 
				+        A[i] -= B[i];
			
 
				+    }
			
 
				+    return A;
			
 
				+}
			
 
				+
			
 
				+template <typename S, size_t N>
			
 
				+inline std::array<S,N> &operator^=(std::array<S,N> &A, const std::array<S,N> &B)
			
 
				+{
			
 
				+    for (size_t i=0;i<N;++i) {
			
 
				+        A[i] ^= B[i];
			
 
				+    }
			
 
				+    return A;
			
 
				+}
			
 
				+
			
 
				+template <typename S, size_t N>
			
 
				+inline std::tuple<std::array<value_t,N>,std::array<value_t,N>,std::array<value_t,N>>
			
 
				+    combine(
			
 
				+        const std::tuple<std::array<S,N>,std::array<S,N>,std::array<S,N>> &A,
			
 
				+        const std::tuple<std::array<S,N>,std::array<S,N>,std::array<S,N>> &B,
			
 
				         nbits_t nbits = VALUE_BITS) {
			
 
				     return std::make_tuple(
			
 
				         combine(std::get<0>(A), std::get<0>(B), nbits),
			
@@ -680,6 +766,25 @@ DEFAULT_IO(HalfTriple)
 
				 // We don't need one for AndTriple because it's exactly the same type as
			
 
				 // MultTriple
			
 
				 
			
 
				+// I/O for arrays
			
 
				+template <typename T, typename S, size_t N>
			
 
				+T& operator>>(T& is, std::array<S,N> &x)
			
 
				+{
			
 
				+    for (size_t i=0;i<N;++i) {
			
 
				+        is >> x[i];
			
 
				+    }
			
 
				+    return is;
			
 
				+}
			
 
				+
			
 
				+template <typename T, typename S, size_t N>
			
 
				+T& operator<<(T& os, const std::array<S,N> &x)
			
 
				+{
			
 
				+    for (size_t i=0;i<N;++i) {
			
 
				+        os << x[i];
			
 
				+    }
			
 
				+    return os;
			
 
				+}
			
 
				+
			
 
				 // I/O for SelectTriples
			
 
				 template <typename T, typename V>
			
 
				 T& operator>>(T& is, SelectTriple<V> &x)
			
@@ -733,6 +838,36 @@ T& operator<<(T& os, const SelectTriple<V> &x)
 
				 DEFAULT_TUPLE_IO(RegAS)
			
 
				 DEFAULT_TUPLE_IO(RegXS)
			
 
				 
			
 
				+// And for pairs and triples of arrays
			
 
				+
			
 
				+template <typename T, typename S, size_t N>
			
 
				+T& operator>>(T& is, std::tuple<std::array<S,N>, std::array<S,N>> &x)
			
 
				+{
			
 
				+    is >> std::get<0>(x) >> std::get<1>(x);
			
 
				+    return is;
			
 
				+}
			
 
				+
			
 
				+template <typename T, typename S, size_t N>
			
 
				+T& operator<<(T& os, const std::tuple<std::array<S,N>, std::array<S,N>> &x)
			
 
				+{
			
 
				+    os << std::get<0>(x) << std::get<1>(x);
			
 
				+    return os;
			
 
				+}
			
 
				+
			
 
				+template <typename T, typename S, size_t N>
			
 
				+T& operator>>(T& is, std::tuple<std::array<S,N>, std::array<S,N>, std::array<S,N>> &x)
			
 
				+{
			
 
				+    is >> std::get<0>(x) >> std::get<1>(x) >> std::get<2>(x);
			
 
				+    return is;
			
 
				+}
			
 
				+
			
 
				+template <typename T, typename S, size_t N>
			
 
				+T& operator<<(T& os, const std::tuple<std::array<S,N>, std::array<S,N>, std::array<S,N>> &x)
			
 
				+{
			
 
				+    os << std::get<0>(x) << std::get<1>(x) << std::get<2>(x);
			
 
				+    return os;
			
 
				+}
			
 
				+
			
 
				 enum ProcessingMode {
			
 
				     MODE_ONLINE,        // Online mode, after preprocessing has been done
			
 
				     MODE_PREPROCESSING, // Preprocessing mode