avadapal před 1 rokem
rodič
revize
82ef536bc2
13 změnil soubory, kde provedl 1603 přidání a 338 odebrání
  1. 2 2
      Makefile
  2. 5 0
      cell.cpp
  3. 99 20
      duoram.cpp
  4. 299 71
      duoram.hpp
  5. 118 82
      duoram.tcc
  6. 57 29
      mpcio.cpp
  7. 10 0
      mpcio.hpp
  8. 441 104
      online.cpp
  9. 1 1
      preproc.cpp
  10. 121 10
      rdpf.hpp
  11. 63 19
      rdpf.tcc
  12. 302 0
      shapes.hpp
  13. 85 0
      shapes.tcc

+ 2 - 2
Makefile

@@ -42,7 +42,7 @@ preproc.o: rdpf.tcc mpcops.hpp mpcops.tcc cdpf.hpp cdpf.tcc
 online.o: online.hpp mpcio.hpp types.hpp bitutils.hpp corotypes.hpp mpcio.tcc
 online.o: options.hpp mpcops.hpp coroutine.hpp mpcops.tcc rdpf.hpp dpf.hpp
 online.o: prg.hpp aes.hpp rdpf.tcc duoram.hpp duoram.tcc cdpf.hpp cdpf.tcc
-online.o: cell.hpp
+online.o: cell.hpp shapes.hpp shapes.tcc
 mpcops.o: mpcops.hpp types.hpp bitutils.hpp mpcio.hpp corotypes.hpp mpcio.tcc
 mpcops.o: coroutine.hpp mpcops.tcc
 rdpf.o: rdpf.hpp mpcio.hpp types.hpp bitutils.hpp corotypes.hpp mpcio.tcc
@@ -51,7 +51,7 @@ cdpf.o: bitutils.hpp cdpf.hpp mpcio.hpp types.hpp corotypes.hpp mpcio.tcc
 cdpf.o: coroutine.hpp dpf.hpp prg.hpp aes.hpp cdpf.tcc
 duoram.o: duoram.hpp types.hpp bitutils.hpp mpcio.hpp corotypes.hpp mpcio.tcc
 duoram.o: coroutine.hpp duoram.tcc mpcops.hpp mpcops.tcc cdpf.hpp dpf.hpp
-duoram.o: prg.hpp aes.hpp cdpf.tcc rdpf.hpp rdpf.tcc
+duoram.o: prg.hpp aes.hpp cdpf.tcc rdpf.hpp rdpf.tcc shapes.hpp shapes.tcc
 cell.o: types.hpp bitutils.hpp duoram.hpp mpcio.hpp corotypes.hpp mpcio.tcc
 cell.o: coroutine.hpp duoram.tcc mpcops.hpp mpcops.tcc cdpf.hpp dpf.hpp
 cell.o: prg.hpp aes.hpp cdpf.tcc rdpf.hpp rdpf.tcc cell.hpp options.hpp

+ 5 - 0
cell.cpp

@@ -22,6 +22,11 @@ void cell(MPCIO &mpcio,
         size_t size = size_t(1)<<depth;
         Duoram<Cell> oram(tio.player(), size);
         auto A = oram.flat(tio, yield);
+        Cell init;
+        init.key.set(0xffffffffffffffff);
+        init.pointers.set(0xeeeeeeeeeeeeeeee);
+        init.value.set(0xdddddddddddddddd);
+        A.init(init);
         Cell c;
         c.key.set(0x0102030405060708);
         c.pointers.set(0x1112131415161718);

+ 99 - 20
duoram.cpp

@@ -1,51 +1,70 @@
 #include "duoram.hpp"
+#include "shapes.hpp"
 
 // Assuming the memory is already sorted, do an oblivious binary
-// search for the largest index containing the value at most the
-// given one.  (The answer will be 0 if all of the memory elements
-// are greate than the target.) This Flat must be a power of 2 size.
-// Only available for additive shared databases for now.
+// search for the smallest index containing the value at least the
+// given one.  (The answer will be the length of the Shape if all
+// elements are smaller than the target.) Only available for additive
+// shared databases for now.
+
+// The basic version uses log(N) ORAM reads of size N, where N is the
+// smallest power of 2 strictly larger than the Shape size
 template <>
-RegAS Duoram<RegAS>::Flat::obliv_binary_search(RegAS &target)
+RegAS Duoram<RegAS>::Shape::basic_binary_search(RegAS &target)
 {
-    nbits_t depth = this->addr_size;
+    if (this->shape_size == 0) {
+        RegAS zero;
+        return zero;
+    }
+    // Create a Pad of the smallest power of 2 size strictly greater
+    // than the Shape size
+    address_t padsize = 1;
+    nbits_t depth = 0;
+    while (padsize <= this->shape_size) {
+        padsize *= 2;
+        ++depth;
+    }
+    Duoram<RegAS>::Pad P(*this, tio, yield, padsize);
+
     // Start in the middle
     RegAS index;
-    index.set(this->tio.player() ? 0 : 1<<(depth-1));
-    // Invariant: index points to the first element of the right half of
-    // the remaining possible range
+    index.set(this->tio.player() ? 0 : (1<<(depth-1))-1);
+    // Invariant: index points to the last element of the left half of
+    // the remaining possible range, which is of width (1<<depth).
     while (depth > 0) {
         // Obliviously read the value there
-        RegAS val = operator[](index);
+        RegAS val = P[index];
         // Compare it to the target
         CDPF cdpf = tio.cdpf(this->yield);
         auto [lt, eq, gt] = cdpf.compare(this->tio, this->yield,
             val-target, tio.aes_ops());
         if (depth > 1) {
-            // If val > target, the answer is strictly to the left
+            // If val >= target, the answer is here or to the left
             // and we should subtract 2^{depth-2} from index
-            // If val <= target, the answer is here or to the right
+            // If val < target, the answer is strictly to the right
             // and we should add 2^{depth-2} to index
             // So we unconditionally subtract 2^{depth-2} from index, and
-            // add (lt+eq)*2^{depth-1}.
+            // add (lt)*2^{depth-1}.
             RegAS uncond;
             uncond.set(tio.player() ? 0 : address_t(1)<<(depth-2));
             RegAS cond;
             cond.set(tio.player() ? 0 : address_t(1)<<(depth-1));
             RegAS condprod;
-            RegBS le = lt ^ eq;
-            mpc_flagmult(this->tio, this->yield, condprod, le, cond);
+            mpc_flagmult(this->tio, this->yield, condprod, lt, cond);
             index -= uncond;
             index += condprod;
         } else {
-            // If val > target, the answer is strictly to the left
-            // If val <= target, the answer is here or to the right
-            // so subtract gt from index
+            // The possible range is of width 2, and we're pointing to
+            // the first element of it.
+            // If val >= target, the answer is here or to the left, so
+            // it's here.
+            // If val < target, the answer is strictly to the right
+            // so add lt to index
             RegAS cond;
             cond.set(tio.player() ? 0 : 1);
             RegAS condprod;
-            mpc_flagmult(this->tio, this->yield, condprod, gt, cond);
-            index -= condprod;
+            mpc_flagmult(this->tio, this->yield, condprod, lt, cond);
+            index += condprod;
         }
         --depth;
     }
@@ -53,3 +72,63 @@ RegAS Duoram<RegAS>::Flat::obliv_binary_search(RegAS &target)
     return index;
 }
 
+// This version does 1 ORAM read of size 2, 1 of size 4, 1 of size
+// 8, ..., 1 of size N/2, where N is the smallest power of 2 strictly
+// larger than the Shape size
+template <>
+RegXS Duoram<RegAS>::Shape::binary_search(RegAS &target)
+{
+    if (this->shape_size == 0) {
+        RegXS zero;
+        return zero;
+    }
+    // Create a Pad of the smallest power of 2 size strictly greater
+    // than the Shape size
+    address_t padsize = 1;
+    nbits_t depth = 0;
+    while (padsize <= this->shape_size) {
+        padsize *= 2;
+        ++depth;
+    }
+    Duoram<RegAS>::Pad P(*this, tio, yield, padsize);
+    // Explicitly read the middle item
+    address_t mid = (1<<(depth-1))-1;
+    RegAS val = P[mid];
+    // Compare it to the target
+    CDPF cdpf = tio.cdpf(this->yield);
+    auto [lt, eq, gt] = cdpf.compare(this->tio, this->yield,
+        val-target, tio.aes_ops());
+    if (depth == 1) {
+        // There was only one item in the Shape, and mid will equal 0, so
+        // val is (a share of) that item, P[0].  If val >= target, the
+        // answer is here or to the left, so it must be 0.  If val <
+        // target, the answer is strictly to the right, so it must be 1.
+        // So just return lt.
+        return RegXS(lt);
+    }
+    auto oidx = P.oblivindex(depth-1);
+    oidx.incr(lt);
+    --depth;
+    while(depth > 0) {
+        // Create the Stride shape; the ORAM will operate only over
+        // elements of the Stride, which will consist of exactly those
+        // elements of the Pad we could possibly be accessing at this
+        // depth.  Those will be elements start=(1<<(depth-1))-1,
+        // start+(1<<depth), start+(2<<depth), start+(3<<depth), and so
+        // on.  The invariant is that the range of remaining possible
+        // answers is of width (1<<depth), and we will look at the
+        // rightmost element of the left half.  If that value (val) has
+        // val >= target, then the answer is at that position or to the
+        // left, so we append a 0 to the index.  If val < target, then
+        // the answer is strictly to the right, so we append a 1 to the
+        // index.  That is, always append lt to the index.
+        Duoram<RegAS>::Stride S(P, tio, yield, (1<<(depth-1))-1, 1<<depth);
+        RegAS val = S[oidx];
+        CDPF cdpf = tio.cdpf(this->yield);
+        auto [lt, eq, gt] = cdpf.compare(this->tio, this->yield,
+            val-target, tio.aes_ops());
+        oidx.incr(lt);
+        --depth;
+    }
+    return oidx.index();
+}

+ 299 - 71
duoram.hpp

@@ -1,6 +1,9 @@
 #ifndef __DUORAM_HPP__
 #define __DUORAM_HPP__
 
+#include <optional>
+#include <functional>
+
 #include "types.hpp"
 #include "mpcio.hpp"
 #include "coroutine.hpp"
@@ -22,10 +25,8 @@
 // on a Shape shared with other threads or coroutines.
 
 // This is templated, because you can have a Duoram of additively shared
-// (RegAS) or XOR shared (RegXS) elements, or std::arrays of those to
-// get "wide" memory cells.
-
-// The initial implementation is focused on additive shares.
+// (RegAS) or XOR shared (RegXS) elements, or more complex cell types
+// (see cell.hpp for example).
 
 template <typename T>
 class Duoram {
@@ -58,6 +59,13 @@ public:
     class Shape;
     // These are the different Shapes that exist
     class Flat;
+    class Pad;
+    class Stride;
+    class Path;
+
+    // Oblivious indices for use in related-index ORAM accesses
+    template <typename U, nbits_t WIDTH>
+    class OblivIndex;
 
     // Pass the player number and desired size
     Duoram(int player, size_t size);
@@ -80,8 +88,15 @@ public:
 
 template <typename T>
 class Duoram<T>::Shape {
-    // Subclasses should be able to access _other_ Shapes' indexmap
+    // Subclasses should be able to access _other_ Shapes'
+    // get_{comp,server} functions
     friend class Flat;
+    friend class Pad;
+    friend class Stride;
+    friend class Path;
+
+    template <typename U, nbits_t WIDTH>
+    friend class OblivIndex;
 
     // When you index into a shape (A[x]), you get one of these types,
     // depending on the type of x (the index), _not_ on the type T (the
@@ -97,8 +112,8 @@ class Duoram<T>::Shape {
     // a particular field of T, then FT will be the type of the field
     // (RegAS or RegXS) and FST will be a pointer-to-member T::* type
     // pointing to that field.  Sh is the specific Shape subtype used to
-    // create the MemRefS.
-    template <typename U, typename FT, typename FST, typename Sh>
+    // create the MemRefS.  WIDTH is the RDPF width to use.
+    template <typename U, typename FT, typename FST, typename Sh, nbits_t WIDTH>
     class MemRefS;
     // When x is unshared explicit value.  FT and FST are as above.
     template <typename FT, typename FST>
@@ -161,12 +176,8 @@ protected:
         explicitmode(copy_from.explicitmode) {}
 
     // The index-mapping function. Input the index relative to this
-    // shape, and output the corresponding physical address.  The
-    // strategy is to map the index relative to this shape to the index
-    // relative to the parent shape, call the parent's indexmap function
-    // on that (unless this is the topmost shape), and return what it
-    // returns.  If this is the topmost shape, just return what you
-    // would have passed to the parent's indexmap.
+    // shape, and output the corresponding index relative to the parent
+    // shape.
     //
     // This is a pure virtual function; all subclasses of Shape must
     // implement it, and of course Shape itself therefore cannot be
@@ -174,55 +185,156 @@ protected:
     virtual size_t indexmap(size_t idx) const = 0;
 
     // Get a pair (for the server) of references to the underlying
-    // Duoram entries at share virtual index idx.  (That is, it gets
-    // duoram.p0_blind[indexmap(idx)], etc.)
-    inline std::tuple<T&,T&> get_server(size_t idx,
+    // Duoram entries at share virtual index idx.
+    virtual inline std::tuple<T&,T&> get_server(size_t idx,
         std::nullopt_t null = std::nullopt) const {
-        size_t physaddr = indexmap(idx);
-        return std::tie(
-            duoram.p0_blind[physaddr],
-            duoram.p1_blind[physaddr]);
+        size_t parindex = indexmap(idx);
+        if (&(this->parent) == this) {
+            return std::tie(
+                duoram.p0_blind[parindex],
+                duoram.p1_blind[parindex]);
+        } else {
+            return this->parent.get_server(parindex, null);
+        }
     }
 
     // Get a triple (for the computational players) of references to the
-    // underlying Duoram entries at share virtual index idx.  (That is,
-    // it gets duoram.database[indexmap(idx)], etc.)
-    inline std::tuple<T&,T&,T&> get_comp(size_t idx,
+    // underlying Duoram entries at share virtual index idx.
+    virtual inline std::tuple<T&,T&,T&> get_comp(size_t idx,
         std::nullopt_t null = std::nullopt) const {
-        size_t physaddr = indexmap(idx);
-        return std::tie(
-            duoram.database[physaddr],
-            duoram.blind[physaddr],
-            duoram.peer_blinded_db[physaddr]);
+        size_t parindex = indexmap(idx);
+        if (&(this->parent) == this) {
+            return std::tie(
+                duoram.database[parindex],
+                duoram.blind[parindex],
+                duoram.peer_blinded_db[parindex]);
+        } else {
+            return this->parent.get_comp(parindex, null);
+        }
     }
 
     // Get a pair (for the server) of references to a particular field
     // of the underlying Duoram entries at share virtual index idx.
-    // (That is, it gets duoram.p0_blind[indexmap(idx)].field, etc.)
     template <typename FT>
     inline std::tuple<FT&,FT&> get_server(size_t idx, FT T::*field) const {
-        size_t physaddr = indexmap(idx);
-        return std::tie(
-            duoram.p0_blind[physaddr].*field,
-            duoram.p1_blind[physaddr].*field);
+        size_t parindex = indexmap(idx);
+        if (&(this->parent) == this) {
+            return std::tie(
+                duoram.p0_blind[parindex].*field,
+                duoram.p1_blind[parindex].*field);
+        } else {
+            return this->parent.get_server(parindex, field);
+        }
     }
 
     // Get a triple (for the computational players) of references to a
     // particular field to the underlying Duoram entries at share
-    // virtual index idx.  (That is, it gets
-    // duoram.database[indexmap(idx)].field, etc.)
+    // virtual index idx.
     template <typename FT>
     inline std::tuple<FT&,FT&,FT&> get_comp(size_t idx, FT T::*field) const {
-        size_t physaddr = indexmap(idx);
-        return std::tie(
-            duoram.database[physaddr].*field,
-            duoram.blind[physaddr].*field,
-            duoram.peer_blinded_db[physaddr].*field);
+        size_t parindex = indexmap(idx);
+        if (&(this->parent) == this) {
+            return std::tie(
+                duoram.database[parindex].*field,
+                duoram.blind[parindex].*field,
+                duoram.peer_blinded_db[parindex].*field);
+        } else {
+            return this->parent.get_comp(parindex, field);
+        }
     }
 
 public:
     // Get the size
-    inline size_t size() { return shape_size; }
+    inline size_t size() const { return shape_size; }
+
+    // Initialize the contents of the Shape to a constant.  This method
+    // does no communication; all the operations are local.  This only
+    // works for T=RegXS or RegAS.
+    void init(size_t value) {
+        T v;
+        v.set(value);
+        init([v] (size_t i) { return v; });
+    }
+
+    // As above, but for general T
+    void init(const T &value) {
+        init([value] (size_t i) { return value; });
+    }
+
+    // As above, but use the default initializer for T (probably sets
+    // everything to 0).
+    void init() {
+        T deflt;
+        init(deflt);
+    }
+
+    // Pass a function f: size_t -> size_t, and initialize element i of the
+    // Shape to f(i) for each i.  This method does no communication; all
+    // the operations are local.  This function must be deterministic
+    // and public.  Only works for T=RegAS or RegXS.
+    void init(std::function<size_t(size_t)> f) {
+        int player = tio.player();
+        if (player < 2) {
+            for (size_t i=0; i<shape_size; ++i) {
+                auto [DB, BL, PBD] = get_comp(i);
+                BL.set(0);
+                if (player) {
+                    DB.set(f(i));
+                    PBD.set(0);
+                } else {
+                    DB.set(0);
+                    PBD.set(f(i));
+                }
+            }
+        } else {
+            for (size_t i=0; i<shape_size; ++i) {
+                auto [BL0, BL1] = get_server(i);
+                BL0.set(0);
+                BL1.set(0);
+            }
+        }
+    }
+
+    // Pass a function f: size_t -> T, and initialize element i of the
+    // Shape to f(i) for each i.  This method does no communication; all
+    // the operations are local.  This function must be deterministic
+    // and public.
+    void init(std::function<T(size_t)> f) {
+        int player = tio.player();
+        if (player < 2) {
+            for (size_t i=0; i<shape_size; ++i) {
+                auto [DB, BL, PBD] = get_comp(i);
+                BL = T();
+                if (player) {
+                    DB = f(i);
+                    PBD = T();
+                } else {
+                    DB = T();
+                    PBD = f(i);
+                }
+            }
+        } else {
+            for (size_t i=0; i<shape_size; ++i) {
+                auto [BL0, BL1] = get_server(i);
+                BL0 = T();
+                BL1 = T();
+            }
+        }
+    }
+
+    // Assuming the Shape is already sorted, do an oblivious binary
+    // search for the smallest index containing the value at least the
+    // given one.  (The answer will be the length of the Shape if all
+    // elements are smaller than the target.) Only available for additive
+    // shared databases for now.
+
+    // The basic version uses log(N) ORAM reads of size N, where N is
+    // the smallest power of 2 strictly larger than the Shape size
+    RegAS basic_binary_search(RegAS &target);
+    // This version does 1 ORAM read of size 2, 1 of size 4, 1 of size
+    // 8, ..., 1 of size N/2, where N is the smallest power of 2
+    // strictly larger than the Shape size
+    RegXS binary_search(RegAS &target);
 
     // Enable or disable explicit-only mode.  Only using [] with
     // explicit (address_t) indices are allowed in this mode.  Using []
@@ -236,6 +348,40 @@ public:
     // next oblivious read or write.  Bitonic sort is a prime example.
     void explicitonly(bool enable);
 
+    // Create an OblivIndex, non-incrementally (supply the shares of the
+    // index directly) or incrementally (the bits of the index will be
+    // supplied later, one at a time)
+
+    // Non-incremental, RegXS index
+    OblivIndex<RegXS,1> oblivindex(const RegXS &idx, nbits_t depth=0) {
+        if (depth == 0) {
+            depth = this->addr_size;
+        }
+        typename Duoram<T>::template OblivIndex<RegXS,1>
+            res(this->tio, this->yield, idx, depth);
+        return res;
+    }
+
+    // Non-incremental, RegAS index
+    OblivIndex<RegAS,1> oblivindex(const RegAS &idx, nbits_t depth=0) {
+        if (depth == 0) {
+            depth = this->addr_size;
+        }
+        typename Duoram<T>::template OblivIndex<RegAS,1>
+            res(this->tio, this->yield, idx, depth);
+        return res;
+    }
+
+    // Incremental (requires RegXS index, supplied bit-by-bit later)
+    OblivIndex<RegXS,1> oblivindex(nbits_t depth=0) {
+        if (depth == 0) {
+            depth = this->addr_size;
+        }
+        typename Duoram<T>::template OblivIndex<RegXS,1>
+            res(this->tio, this->yield, depth);
+        return res;
+    }
+
     // For debugging or checking your answers (using this in general is
     // of course insecure)
 
@@ -258,15 +404,11 @@ class Duoram<T>::Flat : public Duoram<T>::Shape {
 
     inline size_t indexmap(size_t idx) const {
         size_t paridx = idx + start;
-        if (&(this->parent) == this) {
-            return paridx;
-        } else {
-            return this->parent.indexmap(paridx);
-        }
+        return paridx;
     }
 
     // Internal function to aid bitonic_sort
-    void butterfly(address_t start, nbits_t depth, bool dir);
+    void butterfly(address_t start, address_t len, bool dir);
 
 public:
     // Constructor.  len=0 means the maximum size (the parent's size
@@ -274,6 +416,11 @@ public:
     Flat(Duoram &duoram, MPCTIO &tio, yield_t &yield, size_t start = 0,
         size_t len = 0);
 
+    // Constructor.  len=0 means the maximum size (the parent's size
+    // minus start).
+    Flat(const Shape &parent, MPCTIO &tio, yield_t &yield, size_t start = 0,
+        size_t len = 0);
+
     // Copy the given Flat except for the tio and yield
     Flat(const Flat &copy_from, MPCTIO &tio, yield_t &yield) :
         Shape(copy_from, tio, yield), start(copy_from.start),
@@ -290,20 +437,28 @@ public:
     }
 
     // Index into this Flat in various ways
-    typename Duoram::Shape::template MemRefS<RegAS,T,std::nullopt_t,Flat>
+    typename Duoram::Shape::template MemRefS<RegAS,T,std::nullopt_t,Flat,1>
             operator[](const RegAS &idx) {
         typename Duoram<T>::Shape::
-            template MemRefS<RegAS,T,std::nullopt_t,Flat>
+            template MemRefS<RegAS,T,std::nullopt_t,Flat,1>
             res(*this, idx, std::nullopt);
         return res;
     }
-    typename Duoram::Shape::template MemRefS<RegXS,T,std::nullopt_t,Flat>
+    typename Duoram::Shape::template MemRefS<RegXS,T,std::nullopt_t,Flat,1>
             operator[](const RegXS &idx) {
         typename Duoram<T>::Shape::
-            template MemRefS<RegXS,T,std::nullopt_t,Flat>
+            template MemRefS<RegXS,T,std::nullopt_t,Flat,1>
             res(*this, idx, std::nullopt);
         return res;
     }
+    template <typename U, nbits_t WIDTH>
+    typename Duoram::Shape::template MemRefS<U,T,std::nullopt_t,Flat,WIDTH>
+            operator[](OblivIndex<U,WIDTH> &obidx) {
+        typename Duoram<T>::Shape::
+            template MemRefS<RegXS,T,std::nullopt_t,Flat,WIDTH>
+            res(*this, obidx, std::nullopt);
+        return res;
+    }
     typename Duoram::Shape::template MemRefExpl<T,std::nullopt_t>
             operator[](address_t idx) {
         typename Duoram<T>::Shape::
@@ -338,18 +493,72 @@ public:
     template<typename U,typename V>
     void osort(const U &idx1, const V &idx2, bool dir=0);
 
-    // Bitonic sort the elements from start to start+(1<<depth)-1, in
+    // Bitonic sort the elements from start to start+len-1, in
     // increasing order if dir=0 or decreasing order if dir=1. Note that
     // the elements must be at most 63 bits long each for the notion of
     // ">" to make consistent sense.
-    void bitonic_sort(address_t start, nbits_t depth, bool dir=0);
-
-    // Assuming the memory is already sorted, do an oblivious binary
-    // search for the largest index containing the value at most the
-    // given one.  (The answer will be 0 if all of the memory elements
-    // are greate than the target.) This Flat must be a power of 2 size.
-    // Only available for additive shared databases for now.
-    RegAS obliv_binary_search(RegAS &target);
+    void bitonic_sort(address_t start, address_t len, bool dir=0);
+};
+
+// Oblivious indices for use in related-index ORAM accesses.
+
+template <typename T>
+template <typename U, nbits_t WIDTH>
+class Duoram<T>::OblivIndex {
+    template <typename Ux,typename FT,typename FST,typename Sh,nbits_t WIDTHx>
+    friend class Shape::MemRefS;
+
+    int player;
+    std::optional<RDPFTriple<WIDTH>> dt;
+    std::optional<RDPFPair<WIDTH>> dp;
+    nbits_t curdepth, maxdepth;
+    nbits_t next_windex;
+    bool incremental;
+    U idx;
+
+public:
+    // Non-incremental constructor
+    OblivIndex(MPCTIO &tio, yield_t &yield, const U &idx, nbits_t depth) :
+        player(tio.player()), curdepth(depth), maxdepth(depth),
+        next_windex(0), incremental(false), idx(idx)
+    {
+        if (player < 2) {
+            dt = tio.rdpftriple<WIDTH>(yield, depth);
+        } else {
+            dp = tio.rdpfpair<WIDTH>(yield, depth);
+        }
+    }
+
+    // Incremental constructor: only for U=RegXS
+    OblivIndex(MPCTIO &tio, yield_t &yield, nbits_t depth) :
+        player(tio.player()), curdepth(0), maxdepth(depth),
+        next_windex(0), incremental(true), idx(RegXS())
+    {
+        if (player < 2) {
+            dt = tio.rdpftriple(yield, depth, true);
+        } else {
+            dp = tio.rdpfpair(yield, depth, true);
+        }
+    }
+
+    // Incrementally append a (shared) bit to the oblivious index
+    void incr(RegBS bit)
+    {
+        assert(incremental);
+        idx.xshare = (idx.xshare << 1) | value_t(bit.bshare);
+        ++curdepth;
+        if (player < 2) {
+            dt->depth(curdepth);
+        } else {
+            dp->depth(curdepth);
+        }
+    }
+
+    // Get a copy of the index
+    U index() { return idx; }
+
+    // Get the next wide-RDPF index
+    nbits_t windex() { assert(next_windex < WIDTH); return next_windex++; }
 };
 
 // An additive or XOR shared memory reference.  You get one of these
@@ -363,31 +572,50 @@ public:
 // particular field of T, then FT will be the type of the field (RegAS
 // or RegXS) and FST will be a pointer-to-member T::* type pointing to
 // that field.  Sh is the specific Shape subtype used to create the
-// MemRefS.
+// MemRefS.  WIDTH is the RDPF width to use.
 
 template <typename T>
-template <typename U, typename FT, typename FST, typename Sh>
+template <typename U, typename FT, typename FST, typename Sh, nbits_t WIDTH>
 class Duoram<T>::Shape::MemRefS {
     Sh &shape;
-    U idx;
+    // oblividx is a reference to the OblivIndex we're using.  In the
+    // common case, we own the actual OblivIndex, and it's stored in
+    // our_oblividx, and oblividx is a pointer to that.  Sometimes
+    // (for example incremental ORAM accesses), the caller will own (and
+    // modify between uses) the OblivIndex.  In that case, oblividx will
+    // be a pointer to the caller's OblivIndex object, and
+    // our_oblividx will be nullopt.
+    std::optional<Duoram<T>::OblivIndex<U,WIDTH>> our_oblividx;
+    Duoram<T>::OblivIndex<U,WIDTH> *oblividx;
+
     FST fieldsel;
 
 private:
     // Oblivious update to a shared index of Duoram memory, only for
     // FT = RegAS or RegXS
-    MemRefS<U,FT,FST,Sh> &oram_update(const FT& M, const prac_template_true&);
+    MemRefS<U,FT,FST,Sh,WIDTH> &oram_update(const FT& M, const prac_template_true&);
     // Oblivious update to a shared index of Duoram memory, for
     // FT not RegAS or RegXS
-    MemRefS<U,FT,FST,Sh> &oram_update(const FT& M, const prac_template_false&);
+    MemRefS<U,FT,FST,Sh,WIDTH> &oram_update(const FT& M, const prac_template_false&);
 
 public:
-    MemRefS<U,FT,FST,Sh>(Sh &shape, const U &idx, FST fieldsel) :
-        shape(shape), idx(idx), fieldsel(fieldsel) {}
+    MemRefS<U,FT,FST,Sh,WIDTH>(Sh &shape, const U &idx, FST fieldsel) :
+        shape(shape), fieldsel(fieldsel) {
+        our_oblividx.emplace(shape.tio, shape.yield, idx,
+            shape.addr_size);
+        oblividx = &(*our_oblividx);
+    }
+
+    MemRefS<U,FT,FST,Sh,WIDTH>(Sh &shape, OblivIndex<U,WIDTH> &obidx, FST fieldsel) :
+        shape(shape), fieldsel(fieldsel) {
+        oblividx = &obidx;
+    }
 
     // Create a MemRefS for accessing a particular field of T
     template <typename SFT>
-    MemRefS<U,SFT,SFT T::*,Sh> field(SFT T::*subfieldsel) {
-        auto res = MemRefS<U,SFT,SFT T::*,Sh>(this->shape, idx, subfieldsel);
+    MemRefS<U,SFT,SFT T::*,Sh,WIDTH> field(SFT T::*subfieldsel) {
+        auto res = MemRefS<U,SFT,SFT T::*,Sh,WIDTH>(this->shape,
+            *oblividx, subfieldsel);
         return res;
     }
 
@@ -395,10 +623,10 @@ public:
     operator FT();
 
     // Oblivious update to a shared index of Duoram memory
-    MemRefS<U,FT,FST,Sh> &operator+=(const FT& M);
+    MemRefS<U,FT,FST,Sh,WIDTH> &operator+=(const FT& M);
 
     // Oblivious write to a shared index of Duoram memory
-    MemRefS<U,FT,FST,Sh> &operator=(const FT& M);
+    MemRefS<U,FT,FST,Sh,WIDTH> &operator=(const FT& M);
 };
 
 // An explicit memory reference.  You get one of these from a Shape A

+ 118 - 82
duoram.tcc

@@ -177,62 +177,94 @@ Duoram<T>::Flat::Flat(Duoram &duoram, MPCTIO &tio, yield_t &yield,
     this->set_shape_size(len);
 }
 
-// Bitonic sort the elements from start to start+(1<<depth)-1, in
+// Constructor for the Flat shape.  len=0 means the maximum size (the
+// parent's size minus start).
+template <typename T>
+Duoram<T>::Flat::Flat(const Shape &parent, MPCTIO &tio, yield_t &yield,
+    size_t start, size_t len) : Shape(parent, parent.duoram, tio, yield)
+{
+    size_t parentsize = parent.size();
+    if (start > parentsize) {
+        start = parentsize;
+    }
+    this->start = start;
+    size_t maxshapesize = parentsize - start;
+    if (len > maxshapesize || len == 0) {
+        len = maxshapesize;
+    }
+    this->len = len;
+    this->set_shape_size(len);
+}
+
+// Bitonic sort the elements from start to start+len-1, in
 // increasing order if dir=0 or decreasing order if dir=1. Note that
 // the elements must be at most 63 bits long each for the notion of
 // ">" to make consistent sense.
 template <typename T>
-void Duoram<T>::Flat::bitonic_sort(address_t start, nbits_t depth, bool dir)
+void Duoram<T>::Flat::bitonic_sort(address_t start, address_t len, bool dir)
 {
-    if (depth == 0) return;
-    if (depth == 1) {
+    if (len < 2) return;
+    if (len == 2) {
         osort(start, start+1, dir);
         return;
     }
-    // Recurse on the first half (increasing order) and the second half
-    // (decreasing order) in parallel
+    address_t leftlen, rightlen;
+    leftlen = (len+1) >> 1;
+    rightlen = len >> 1;
+
+    // Recurse on the first half (opposite to the desired order)
+    // and the second half (desired order) in parallel
     run_coroutines(this->yield,
-        [this, start, depth](yield_t &yield) {
+        [this, start, leftlen, dir](yield_t &yield) {
             Flat Acoro = context(yield);
-            Acoro.bitonic_sort(start, depth-1, 0);
+            Acoro.bitonic_sort(start, leftlen, !dir);
         },
-        [this, start, depth](yield_t &yield) {
+        [this, start, leftlen, rightlen, dir](yield_t &yield) {
             Flat Acoro = context(yield);
-            Acoro.bitonic_sort(start+(1<<(depth-1)), depth-1, 1);
+            Acoro.bitonic_sort(start+leftlen, rightlen, dir);
         });
     // Merge the two into the desired order
-    butterfly(start, depth, dir);
+    butterfly(start, len, dir);
 }
 
 // Internal function to aid bitonic_sort
 template <typename T>
-void Duoram<T>::Flat::butterfly(address_t start, nbits_t depth, bool dir)
+void Duoram<T>::Flat::butterfly(address_t start, address_t len, bool dir)
 {
-    if (depth == 0) return;
-    if (depth == 1) {
+    if (len < 2) return;
+    if (len == 2) {
         osort(start, start+1, dir);
         return;
     }
-    // Sort pairs of elements half the width apart in parallel
-    address_t halfwidth = address_t(1)<<(depth-1);
+    address_t leftlen, rightlen, offset, num_swaps;
+    // leftlen = (len+1) >> 1;
+    leftlen = 1;
+    while(2*leftlen < len) {
+        leftlen *= 2;
+    }
+    rightlen = len - leftlen;
+    offset = leftlen;
+    num_swaps = rightlen;
+
+    // Sort pairs of elements offset apart in parallel
     std::vector<coro_t> coroutines;
-    for (address_t i=0; i<halfwidth;++i) {
+    for (address_t i=0; i<num_swaps;++i) {
         coroutines.emplace_back(
-            [this, start, halfwidth, dir, i](yield_t &yield) {
+            [this, start, offset, dir, i](yield_t &yield) {
                 Flat Acoro = context(yield);
-                Acoro.osort(start+i, start+i+halfwidth, dir);
+                Acoro.osort(start+i, start+i+offset, dir);
             });
     }
     run_coroutines(this->yield, coroutines);
     // Recurse on each half in parallel
     run_coroutines(this->yield,
-        [this, start, depth, dir](yield_t &yield) {
+        [this, start, leftlen, dir](yield_t &yield) {
             Flat Acoro = context(yield);
-            Acoro.butterfly(start, depth-1, dir);
+            Acoro.butterfly(start, leftlen, dir);
         },
-        [this, start, halfwidth, depth, dir](yield_t &yield) {
+        [this, start, leftlen, rightlen, dir](yield_t &yield) {
             Flat Acoro = context(yield);
-            Acoro.butterfly(start+halfwidth, depth-1, dir);
+            Acoro.butterfly(start+leftlen, rightlen, dir);
         });
 }
 
@@ -260,11 +292,11 @@ inline address_t IfRegXS<RegXS>(address_t val) { return val; }
 // a particular field of T, then FT will be the type of the field (RegAS
 // or RegXS) and FST will be a pointer-to-member T::* type pointing to
 // that field.  Sh is the specific Shape subtype used to create the
-// MemRefS.
+// MemRefS.  WIDTH is the RDPF width to use.
 
 template <typename T>
-template <typename U,typename FT,typename FST,typename Sh>
-Duoram<T>::Shape::MemRefS<U,FT,FST,Sh>::operator FT()
+template <typename U,typename FT,typename FST,typename Sh,nbits_t WIDTH>
+Duoram<T>::Shape::MemRefS<U,FT,FST,Sh,WIDTH>::operator FT()
 {
     FT res;
     Sh &shape = this->shape;
@@ -273,30 +305,29 @@ Duoram<T>::Shape::MemRefS<U,FT,FST,Sh>::operator FT()
     if (player < 2) {
         // Computational players do this
 
-        RDPFTriple<1> dt = shape.tio.rdpftriple(shape.yield, shape.addr_size);
+        const RDPFTriple<WIDTH> &dt = *(oblividx->dt);
+        const nbits_t depth = dt.depth();
 
         // Compute the index offset
         U indoffset;
         dt.get_target(indoffset);
-        indoffset -= idx;
+        indoffset -= oblividx->idx;
 
         // We only need two of the DPFs for reading
-        RDPFPair<1> dp(std::move(dt), 0, player == 0 ? 2 : 1);
-        // The RDPFTriple dt is now broken, since we've moved things out
-        // of it.
+        RDPF2of3<WIDTH> dp(dt, 0, player == 0 ? 2 : 1);
 
         // Send it to the peer and the server
-        shape.tio.queue_peer(&indoffset, BITBYTES(shape.addr_size));
-        shape.tio.queue_server(&indoffset, BITBYTES(shape.addr_size));
+        shape.tio.queue_peer(&indoffset, BITBYTES(depth));
+        shape.tio.queue_server(&indoffset, BITBYTES(depth));
 
         shape.yield();
 
         // Receive the above from the peer
         U peerindoffset;
-        shape.tio.recv_peer(&peerindoffset, BITBYTES(shape.addr_size));
+        shape.tio.recv_peer(&peerindoffset, BITBYTES(depth));
 
         // Reconstruct the total offset
-        auto indshift = combine(indoffset, peerindoffset, shape.addr_size);
+        auto indshift = combine(indoffset, peerindoffset, depth);
 
         // Evaluate the DPFs and compute the dotproducts
         ParallelEval pe(dp, IfRegAS<U>(indshift), IfRegXS<U>(indshift),
@@ -304,7 +335,7 @@ Duoram<T>::Shape::MemRefS<U,FT,FST,Sh>::operator FT()
             shape.tio.aes_ops());
         FT init;
         res = pe.reduce(init, [this, &dp, &shape] (int thread_num,
-                address_t i, const RDPFPair<1>::LeafNode &leaf) {
+                address_t i, const typename RDPFPair<WIDTH>::LeafNode &leaf) {
             // The values from the two DPFs, which will each be of type T
             std::tuple<FT,FT> V;
             dp.unit(V, leaf);
@@ -324,16 +355,17 @@ Duoram<T>::Shape::MemRefS<U,FT,FST,Sh>::operator FT()
     } else {
         // The server does this
 
-        RDPFPair<1> dp = shape.tio.rdpfpair(shape.yield, shape.addr_size);
+        const RDPFPair<WIDTH> &dp = *(oblividx->dp);
+        const nbits_t depth = dp.depth();
         U p0indoffset, p1indoffset;
 
         shape.yield();
 
         // Receive the index offset from the computational players and
         // combine them
-        shape.tio.recv_p0(&p0indoffset, BITBYTES(shape.addr_size));
-        shape.tio.recv_p1(&p1indoffset, BITBYTES(shape.addr_size));
-        auto indshift = combine(p0indoffset, p1indoffset, shape.addr_size);
+        shape.tio.recv_p0(&p0indoffset, BITBYTES(depth));
+        shape.tio.recv_p1(&p1indoffset, BITBYTES(depth));
+        auto indshift = combine(p0indoffset, p1indoffset, depth);
 
         // Evaluate the DPFs to compute the cancellation terms
         std::tuple<FT,FT> init, gamma;
@@ -341,7 +373,7 @@ Duoram<T>::Shape::MemRefS<U,FT,FST,Sh>::operator FT()
             shape.shape_size, shape.tio.cpu_nthreads(),
             shape.tio.aes_ops());
         gamma = pe.reduce(init, [this, &dp, &shape] (int thread_num,
-                address_t i, const RDPFPair<1>::LeafNode &leaf) {
+                address_t i, const typename RDPFPair<WIDTH>::LeafNode &leaf) {
             // The values from the two DPFs, each of type FT
             std::tuple<FT,FT> V;
             dp.unit(V, leaf);
@@ -372,9 +404,9 @@ Duoram<T>::Shape::MemRefS<U,FT,FST,Sh>::operator FT()
 // Oblivious update to a shared index of Duoram memory, only for
 // FT = RegAS or RegXS.  The template parameters are as above.
 template <typename T>
-template <typename U, typename FT, typename FST, typename Sh>
-typename Duoram<T>::Shape::template MemRefS<U,FT,FST,Sh>
-    &Duoram<T>::Shape::MemRefS<U,FT,FST,Sh>::oram_update(const FT& M,
+template <typename U, typename FT, typename FST, typename Sh, nbits_t WIDTH>
+typename Duoram<T>::Shape::template MemRefS<U,FT,FST,Sh,WIDTH>
+    &Duoram<T>::Shape::MemRefS<U,FT,FST,Sh,WIDTH>::oram_update(const FT& M,
         const prac_template_true &)
 {
     Sh &shape = this->shape;
@@ -383,24 +415,26 @@ typename Duoram<T>::Shape::template MemRefS<U,FT,FST,Sh>
     if (player < 2) {
         // Computational players do this
 
-        RDPFTriple<1> dt = shape.tio.rdpftriple(shape.yield, shape.addr_size);
+        const RDPFTriple<WIDTH> &dt = *(oblividx->dt);
+        const nbits_t windex = oblividx->windex();
+        const nbits_t depth = dt.depth();
 
         // Compute the index and message offsets
         U indoffset;
         dt.get_target(indoffset);
-        indoffset -= idx;
-        RDPF<1>::W<FT> MW;
-        MW[0] = M;
+        indoffset -= oblividx->idx;
+        typename RDPF<WIDTH>::template W<FT> MW;
+        MW[windex] = M;
         auto Moffset = std::make_tuple(MW, MW, MW);
-        RDPFTriple<1>::WTriple<FT> scaled_val;
+        typename RDPFTriple<WIDTH>::template WTriple<FT> scaled_val;
         dt.scaled_value(scaled_val);
         Moffset -= scaled_val;
 
         // Send them to the peer, and everything except the first offset
         // to the server
-        shape.tio.queue_peer(&indoffset, BITBYTES(shape.addr_size));
+        shape.tio.queue_peer(&indoffset, BITBYTES(depth));
         shape.tio.iostream_peer() << Moffset;
-        shape.tio.queue_server(&indoffset, BITBYTES(shape.addr_size));
+        shape.tio.queue_server(&indoffset, BITBYTES(depth));
         shape.tio.iostream_server() << std::get<1>(Moffset) <<
             std::get<2>(Moffset);
 
@@ -408,12 +442,12 @@ typename Duoram<T>::Shape::template MemRefS<U,FT,FST,Sh>
 
         // Receive the above from the peer
         U peerindoffset;
-        RDPFTriple<1>::WTriple<FT> peerMoffset;
-        shape.tio.recv_peer(&peerindoffset, BITBYTES(shape.addr_size));
+        typename RDPFTriple<WIDTH>::template WTriple<FT> peerMoffset;
+        shape.tio.recv_peer(&peerindoffset, BITBYTES(depth));
         shape.tio.iostream_peer() >> peerMoffset;
 
         // Reconstruct the total offsets
-        auto indshift = combine(indoffset, peerindoffset, shape.addr_size);
+        auto indshift = combine(indoffset, peerindoffset, depth);
         auto Mshift = combine(Moffset, peerMoffset);
 
         // Evaluate the DPFs and add them to the database
@@ -421,10 +455,10 @@ typename Duoram<T>::Shape::template MemRefS<U,FT,FST,Sh>
             shape.shape_size, shape.tio.cpu_nthreads(),
             shape.tio.aes_ops());
         int init = 0;
-        pe.reduce(init, [this, &dt, &shape, &Mshift, player] (int thread_num,
-                address_t i, const RDPFTriple<1>::LeafNode &leaf) {
+        pe.reduce(init, [this, &dt, &shape, &Mshift, player, windex] (int thread_num,
+                address_t i, const typename RDPFTriple<WIDTH>::LeafNode &leaf) {
             // The values from the three DPFs
-            RDPFTriple<1>::WTriple<FT> scaled;
+            typename RDPFTriple<WIDTH>::template WTriple<FT> scaled;
             std::tuple<FT,FT,FT> unit;
             dt.scaled(scaled, leaf);
             dt.unit(unit, leaf);
@@ -432,32 +466,34 @@ typename Duoram<T>::Shape::template MemRefS<U,FT,FST,Sh>
             // References to the appropriate cells in our database, our
             // blind, and our copy of the peer's blinded database
             auto [DB, BL, PBD] = shape.get_comp(i,fieldsel);
-            DB += V0[0];
+            DB += V0[windex];
             if (player == 0) {
-                BL -= V1[0];
-                PBD += V2[0]-V0[0];
+                BL -= V1[windex];
+                PBD += V2[windex]-V0[windex];
             } else {
-                BL -= V2[0];
-                PBD += V1[0]-V0[0];
+                BL -= V2[windex];
+                PBD += V1[windex]-V0[windex];
             }
             return 0;
         });
     } else {
         // The server does this
 
-        RDPFPair<1> dp = shape.tio.rdpfpair(shape.yield, shape.addr_size);
+        const RDPFPair<WIDTH> &dp = *(oblividx->dp);
+        const nbits_t windex = oblividx->windex();
+        const nbits_t depth = dp.depth();
         U p0indoffset, p1indoffset;
-        RDPFPair<1>::WPair<FT> p0Moffset, p1Moffset;
+        typename RDPFPair<WIDTH>::template WPair<FT> p0Moffset, p1Moffset;
 
         shape.yield();
 
         // Receive the index and message offsets from the computational
         // players and combine them
-        shape.tio.recv_p0(&p0indoffset, BITBYTES(shape.addr_size));
+        shape.tio.recv_p0(&p0indoffset, BITBYTES(depth));
         shape.tio.iostream_p0() >> p0Moffset;
-        shape.tio.recv_p1(&p1indoffset, BITBYTES(shape.addr_size));
+        shape.tio.recv_p1(&p1indoffset, BITBYTES(depth));
         shape.tio.iostream_p1() >> p1Moffset;
-        auto indshift = combine(p0indoffset, p1indoffset, shape.addr_size);
+        auto indshift = combine(p0indoffset, p1indoffset, depth);
         auto Mshift = combine(p0Moffset, p1Moffset);
 
         // Evaluate the DPFs and subtract them from the blinds
@@ -465,10 +501,10 @@ typename Duoram<T>::Shape::template MemRefS<U,FT,FST,Sh>
             shape.shape_size, shape.tio.cpu_nthreads(),
             shape.tio.aes_ops());
         int init = 0;
-        pe.reduce(init, [this, &dp, &shape, &Mshift] (int thread_num,
-                address_t i, const RDPFPair<1>::LeafNode &leaf) {
+        pe.reduce(init, [this, &dp, &shape, &Mshift, windex] (int thread_num,
+                address_t i, const typename RDPFPair<WIDTH>::LeafNode &leaf) {
             // The values from the two DPFs
-            RDPFPair<1>::WPair<FT> scaled;
+            typename RDPFPair<WIDTH>::template WPair<FT> scaled;
             std::tuple<FT,FT> unit;
             dp.scaled(scaled, leaf);
             dp.unit(unit, leaf);
@@ -477,8 +513,8 @@ typename Duoram<T>::Shape::template MemRefS<U,FT,FST,Sh>
             // appropriate cells in the two blinded databases, so we can
             // subtract the pair directly.
             auto [BL0, BL1] = shape.get_server(i,fieldsel);
-            BL0 -= V0[0];
-            BL1 -= V1[0];
+            BL0 -= V0[windex];
+            BL1 -= V1[windex];
             return 0;
         });
     }
@@ -488,21 +524,21 @@ typename Duoram<T>::Shape::template MemRefS<U,FT,FST,Sh>
 // Oblivious update to a shared index of Duoram memory, only for
 // FT not RegAS or RegXS.  The template parameters are as above.
 template <typename T>
-template <typename U, typename FT, typename FST, typename Sh>
-typename Duoram<T>::Shape::template MemRefS<U,FT,FST,Sh>
-    &Duoram<T>::Shape::MemRefS<U,FT,FST,Sh>::oram_update(const FT& M,
+template <typename U, typename FT, typename FST, typename Sh, nbits_t WIDTH>
+typename Duoram<T>::Shape::template MemRefS<U,FT,FST,Sh,WIDTH>
+    &Duoram<T>::Shape::MemRefS<U,FT,FST,Sh,WIDTH>::oram_update(const FT& M,
         const prac_template_false &)
 {
-    T::update(shape, shape.yield, idx, M);
+    T::update(shape, shape.yield, oblividx->idx, M);
     return *this;
 }
 
 // Oblivious update to an additively or XOR shared index of Duoram
 // memory. The template parameters are as above.
 template <typename T>
-template <typename U, typename FT, typename FST, typename Sh>
-typename Duoram<T>::Shape::template MemRefS<U,FT,FST,Sh>
-    &Duoram<T>::Shape::MemRefS<U,FT,FST,Sh>::operator+=(const FT& M)
+template <typename U, typename FT, typename FST, typename Sh, nbits_t WIDTH>
+typename Duoram<T>::Shape::template MemRefS<U,FT,FST,Sh,WIDTH>
+    &Duoram<T>::Shape::MemRefS<U,FT,FST,Sh,WIDTH>::operator+=(const FT& M)
 {
     return oram_update(M, prac_basic_Reg_S<FT>());
 }
@@ -510,9 +546,9 @@ typename Duoram<T>::Shape::template MemRefS<U,FT,FST,Sh>
 // Oblivious write to an additively or XOR shared index of Duoram
 // memory. The template parameters are as above.
 template <typename T>
-template <typename U, typename FT, typename FST, typename Sh>
-typename Duoram<T>::Shape::template MemRefS<U,FT,FST,Sh>
-    &Duoram<T>::Shape::MemRefS<U,FT,FST,Sh>::operator=(const FT& M)
+template <typename U, typename FT, typename FST, typename Sh, nbits_t WIDTH>
+typename Duoram<T>::Shape::template MemRefS<U,FT,FST,Sh,WIDTH>
+    &Duoram<T>::Shape::MemRefS<U,FT,FST,Sh,WIDTH>::operator=(const FT& M)
 {
     FT oldval = *this;
     FT update = M - oldval;

+ 57 - 29
mpcio.cpp

@@ -460,7 +460,8 @@ MPCTIO::MPCTIO(MPCIO &mpcio, int thread_num, int num_threads) :
 #ifdef VERBOSE_COMMS
         round_num(0),
 #endif
-        last_andtriple_bits_remaining(0)
+        last_andtriple_bits_remaining(0),
+        remaining_nodesselecttriples(0)
 {
     if (mpcio.player < 2) {
         MPCPeerIO &mpcpio = static_cast<MPCPeerIO&>(mpcio);
@@ -726,44 +727,71 @@ MultTriple MPCTIO::andtriple(yield_t &yield)
     return val;
 }
 
-SelectTriple<DPFnode> MPCTIO::nodeselecttriple(yield_t &yield)
+void MPCTIO::request_nodeselecttriples(yield_t &yield, size_t num)
 {
-    SelectTriple<DPFnode> val;
     if (mpcio.player < 2) {
         MPCPeerIO &mpcpio = static_cast<MPCPeerIO&>(mpcio);
         if (mpcpio.mode != MODE_ONLINE) {
-            uint8_t Xbyte;
             yield();
-            recv_server(&Xbyte, sizeof(Xbyte));
-            val.X = Xbyte & 1;
-            recv_server(&val.Y, sizeof(val.Y));
-            recv_server(&val.Z, sizeof(val.Z));
+            for (size_t i=0; i<num; ++i) {
+                SelectTriple<DPFnode> v;
+                uint8_t Xbyte;
+                recv_server(&Xbyte, sizeof(Xbyte));
+                v.X = Xbyte & 1;
+                recv_server(&v.Y, sizeof(v.Y));
+                recv_server(&v.Z, sizeof(v.Z));
+                queued_nodeselecttriples.push_back(v);
+            }
+            remaining_nodesselecttriples += num;
         } else {
             std::cerr << "Attempted to read SelectTriple<DPFnode> in online phase\n";
         }
     } else if (mpcio.mode != MODE_ONLINE) {
-        // Create triples (X0,Y0,Z0),(X1,Y1,Z1) such that
-        // (X0*Y1 ^ Y0*X1) = (Z0^Z1)
-        bit_t X0, X1;
-        DPFnode Y0, Z0, Y1, Z1;
-        X0 = arc4random() & 1;
-        arc4random_buf(&Y0, sizeof(Y0));
-        arc4random_buf(&Z0, sizeof(Z0));
-        X1 = arc4random() & 1;
-        arc4random_buf(&Y1, sizeof(Y1));
-        DPFnode X0ext, X1ext;
-        // Sign-extend X0 and X1 (so that 0 -> 0000...0 and
-        // 1 -> 1111...1)
-        X0ext = if128_mask[X0];
-        X1ext = if128_mask[X1];
-        Z1 = ((X0ext & Y1) ^ (X1ext & Y0)) ^ Z0;
-        queue_p0(&X0, sizeof(X0));
-        queue_p0(&Y0, sizeof(Y0));
-        queue_p0(&Z0, sizeof(Z0));
-        queue_p1(&X1, sizeof(X1));
-        queue_p1(&Y1, sizeof(Y1));
-        queue_p1(&Z1, sizeof(Z1));
+        for (size_t i=0; i<num; ++i) {
+            // Create triples (X0,Y0,Z0),(X1,Y1,Z1) such that
+            // (X0*Y1 ^ Y0*X1) = (Z0^Z1)
+            bit_t X0, X1;
+            DPFnode Y0, Z0, Y1, Z1;
+            X0 = arc4random() & 1;
+            arc4random_buf(&Y0, sizeof(Y0));
+            arc4random_buf(&Z0, sizeof(Z0));
+            X1 = arc4random() & 1;
+            arc4random_buf(&Y1, sizeof(Y1));
+            DPFnode X0ext, X1ext;
+            // Sign-extend X0 and X1 (so that 0 -> 0000...0 and
+            // 1 -> 1111...1)
+            X0ext = if128_mask[X0];
+            X1ext = if128_mask[X1];
+            Z1 = ((X0ext & Y1) ^ (X1ext & Y0)) ^ Z0;
+            queue_p0(&X0, sizeof(X0));
+            queue_p0(&Y0, sizeof(Y0));
+            queue_p0(&Z0, sizeof(Z0));
+            queue_p1(&X1, sizeof(X1));
+            queue_p1(&Y1, sizeof(Y1));
+            queue_p1(&Z1, sizeof(Z1));
+        }
         yield();
+        remaining_nodesselecttriples += num;
+    }
+}
+
+SelectTriple<DPFnode> MPCTIO::nodeselecttriple(yield_t &yield)
+{
+    SelectTriple<DPFnode> val;
+    if (remaining_nodesselecttriples == 0) {
+        request_nodeselecttriples(yield, 1);
+    }
+    if (mpcio.player < 2) {
+        MPCPeerIO &mpcpio = static_cast<MPCPeerIO&>(mpcio);
+        if (mpcpio.mode != MODE_ONLINE) {
+            val = queued_nodeselecttriples.front();
+            queued_nodeselecttriples.pop_front();
+            --remaining_nodesselecttriples;
+        } else {
+            std::cerr << "Attempted to read SelectTriple<DPFnode> in online phase\n";
+        }
+    } else if (mpcio.mode != MODE_ONLINE) {
+        --remaining_nodesselecttriples;
     }
     return val;
 }

+ 10 - 0
mpcio.hpp

@@ -353,6 +353,15 @@ class MPCTIO {
     AndTriple last_andtriple;
     nbits_t last_andtriple_bits_remaining;
 
+    // We allow for prefetching of SelectTriple<DPFnode>s to save one
+    // network round per level when constructing RDPFs
+    std::deque<SelectTriple<DPFnode>> queued_nodeselecttriples;
+    // For P0 and P1, it should always be the case that
+    // remaining_nodesselecttriples equals
+    // queued_nodeselecttriples.size().  P2 does not store anything in
+    // queued_nodeselecttriples, however.
+    size_t remaining_nodesselecttriples;
+
 public:
     MPCTIO(MPCIO &mpcio, int thread_num, int num_threads = 1);
 
@@ -425,6 +434,7 @@ public:
     MultTriple multtriple(yield_t &yield);
     HalfTriple halftriple(yield_t &yield, bool tally=true);
     AndTriple andtriple(yield_t &yield);
+    void request_nodeselecttriples(yield_t &yield, size_t num);
     SelectTriple<DPFnode> nodeselecttriple(yield_t &yield);
     SelectTriple<value_t> valselecttriple(yield_t &yield);
     SelectTriple<bit_t> bitselecttriple(yield_t &yield);

+ 441 - 104
online.cpp

@@ -7,6 +7,7 @@
 #include "cdpf.hpp"
 #include "cell.hpp"
 #include "heap.hpp"
+#include "shapes.hpp"
 
 
 static void online_test(MPCIO &mpcio,
@@ -260,8 +261,11 @@ static void rdpf_test(MPCIO &mpcio,
                     RDPF<WIDTH> &dpf = dt.dpf[i];
                     for (nbits_t level=min_level; level<=depth; ++level) {
                         if (incremental) {
-                            printf("Level = %u\n\n", level);
-                            dpf.depth(level);
+                            printf("Level = %u\n", level);
+                            dt.depth(level);
+                            RegXS tshare;
+                            dt.get_target(tshare);
+                            printf("Target share = %lx\n\n", tshare.share());
                         }
                         typename RDPF<WIDTH>::RegXSW peer_scaled_xor;
                         typename RDPF<WIDTH>::RegASW peer_scaled_sum;
@@ -651,12 +655,16 @@ static void duoram_test(MPCIO &mpcio,
         ++args;
     }
     share &= ((address_t(1)<<depth)-1);
+    address_t len = (1<<depth);
+    if (*args) {
+        len = atoi(*args);
+        ++args;
+    }
 
     MPCTIO tio(mpcio, 0, opts.num_threads);
-    run_coroutines(tio, [&tio, depth, share] (yield_t &yield) {
-        size_t size = size_t(1)<<depth;
+    run_coroutines(tio, [&tio, depth, share, len] (yield_t &yield) {
         // size_t &aes_ops = tio.aes_ops();
-        Duoram<T> oram(tio.player(), size);
+        Duoram<T> oram(tio.player(), len);
         auto A = oram.flat(tio, yield);
         RegAS aidx, aidx2, aidx3;
         aidx.ashare = share;
@@ -676,6 +684,14 @@ static void duoram_test(MPCIO &mpcio,
         } else {
             N.set(0x0000beef);
         }
+        RegXS oxidx;
+        oxidx.xshare = share+3*tio.player();
+        T O;
+        if (tio.player() == 0) {
+            O.set(0x31410000);
+        } else {
+            O.set(0x00005926);
+        }
         // Writing and reading with additively shared indices
         printf("Additive Updating\n");
         A[aidx] += M;
@@ -686,8 +702,14 @@ static void duoram_test(MPCIO &mpcio,
         A[xidx] += N;
         printf("XOR Reading\n");
         T Ax = A[xidx];
-        T Ae;
+        // Writing and reading with OblivIndex indices
+        auto oidx = A.oblivindex(oxidx);
+        printf("OblivIndex Updating\n");
+        A[oidx] += O;
+        printf("OblivIndex Reading\n");
+        T Ox = A[oidx];
         // Writing and reading with explicit indices
+        T Ae;
         if (depth > 2) {
             printf("Explicit Updating\n");
             A[5] += Aa;
@@ -714,7 +736,7 @@ static void duoram_test(MPCIO &mpcio,
             oram.dump();
             auto check = A.reconstruct();
             if (tio.player() == 0) {
-                for (address_t i=0;i<size;++i) {
+                for (address_t i=0;i<len;++i) {
                     printf("%04x %016lx\n", i, check[i].share());
                 }
             }
@@ -722,10 +744,12 @@ static void duoram_test(MPCIO &mpcio,
         auto checkread = A.reconstruct(Aa);
         auto checkreade = A.reconstruct(Ae);
         auto checkreadx = A.reconstruct(Ax);
+        auto checkreado = A.reconstruct(Ox);
         if (tio.player() == 0) {
             printf("Read AS value = %016lx\n", checkread.share());
             printf("Read AX value = %016lx\n", checkreadx.share());
             printf("Read Ex value = %016lx\n", checkreade.share());
+            printf("Read OI value = %016lx\n", checkreado.share());
         }
         for (auto &v : Av) {
             auto checkv = A.reconstruct(v);
@@ -1067,48 +1091,126 @@ static void sort_test(MPCIO &mpcio,
         depth = atoi(*args);
         ++args;
     }
+    address_t len = (1<<depth);
+    if (*args) {
+        len = atoi(*args);
+        ++args;
+    }
 
-    int num_threads = opts.num_threads;
-    boost::asio::thread_pool pool(num_threads);
-    for (int thread_num = 0; thread_num < num_threads; ++thread_num) {
-        boost::asio::post(pool, [&mpcio, thread_num, depth] {
-            MPCTIO tio(mpcio, thread_num);
-            run_coroutines(tio, [&tio, depth] (yield_t &yield) {
-                address_t size = address_t(1)<<depth;
-                // size_t &aes_ops = tio.aes_ops();
-                Duoram<RegAS> oram(tio.player(), size);
-                auto A = oram.flat(tio, yield);
-                A.explicitonly(true);
-                // Initialize the memory to random values in parallel
-                std::vector<coro_t> coroutines;
-                for (address_t i=0; i<size; ++i) {
-                    coroutines.emplace_back(
-                        [&A, i](yield_t &yield) {
-                            auto Acoro = A.context(yield);
-                            RegAS v;
-                            v.randomize(62);
-                            Acoro[i] += v;
-                        });
-                }
-                run_coroutines(yield, coroutines);
-                A.bitonic_sort(0, depth);
+    MPCTIO tio(mpcio, 0, opts.num_threads);
+    run_coroutines(tio, [&tio, depth, len] (yield_t &yield) {
+        address_t size = address_t(1)<<depth;
+        // size_t &aes_ops = tio.aes_ops();
+        Duoram<RegAS> oram(tio.player(), size);
+        auto A = oram.flat(tio, yield);
+        A.explicitonly(true);
+        // Initialize the memory to random values in parallel
+        std::vector<coro_t> coroutines;
+        for (address_t i=0; i<size; ++i) {
+            coroutines.emplace_back(
+                [&A, i](yield_t &yield) {
+                    auto Acoro = A.context(yield);
+                    RegAS v;
+                    v.randomize(62);
+                    Acoro[i] += v;
+                });
+        }
+        run_coroutines(yield, coroutines);
+        A.bitonic_sort(0, len);
+        if (depth <= 10) {
+            oram.dump();
+        }
+        auto check = A.reconstruct();
+        bool fail = false;
+        if (tio.player() == 0) {
+            for (address_t i=0;i<size;++i) {
                 if (depth <= 10) {
-                    oram.dump();
-                    auto check = A.reconstruct();
-                    if (tio.player() == 0) {
-                        for (address_t i=0;i<size;++i) {
-                            printf("%04x %016lx\n", i, check[i].share());
-                        }
-                    }
+                    printf("%04x %016lx\n", i, check[i].share());
                 }
-            });
-        });
+                if (i>0 && i<len &&
+                    check[i].share() < check[i-1].share()) {
+                    fail = true;
+                }
+            }
+            if (fail) {
+                printf("FAIL\n");
+            } else {
+                printf("PASS\n");
+            }
+        }
+    });
+}
+
+static void pad_test(MPCIO &mpcio,
+    const PRACOptions &opts, char **args)
+{
+    nbits_t depth=6;
+
+    if (*args) {
+        depth = atoi(*args);
+        ++args;
     }
-    pool.join();
+    address_t len = (1<<depth);
+    if (*args) {
+        len = atoi(*args);
+        ++args;
+    }
+
+    MPCTIO tio(mpcio, 0, opts.num_threads);
+    run_coroutines(tio, [&mpcio, &tio, depth, len] (yield_t &yield) {
+        int player = tio.player();
+        Duoram<RegAS> oram(player, len);
+        auto A = oram.flat(tio, yield);
+        // Initialize the ORAM in explicit mode
+        A.explicitonly(true);
+        for (address_t i=0; i<len; ++i) {
+            RegAS v;
+            v.set((player*0xffff+1)*i);
+            A[i] = v;
+        }
+        A.explicitonly(false);
+        // Obliviously add 0 to A[0], which reblinds the whole database
+        RegAS z;
+        A[z] += z;
+        auto check = A.reconstruct();
+        if (player == 0) {
+            for (address_t i=0;i<len;++i) {
+                if (depth <= 10) {
+                    printf("%04x %016lx\n", i, check[i].share());
+                }
+            }
+            printf("\n");
+        }
+        address_t maxsize = address_t(1)<<depth;
+        Duoram<RegAS>::Pad P(A, tio, yield, maxsize);
+        for (address_t i=0; i<maxsize; ++i) {
+            RegAS v = P[i];
+            if (depth <= 10) {
+                value_t vval = mpc_reconstruct(tio, yield, v);
+                printf("%04x %016lx %016lx\n", i, v.share(), vval);
+            }
+        }
+        printf("\n");
+        for (address_t i=0; i<maxsize; ++i) {
+            value_t offset = 0xdeadbeef;
+            if (player) {
+                offset = -offset;
+            }
+            RegAS ind;
+            ind.set(player*i+offset);
+            RegAS v = P[ind];
+            if (depth <= 10) {
+                value_t vval = mpc_reconstruct(tio, yield, v);
+                printf("%04x %016lx %016lx\n", i, v.share(), vval);
+            }
+        }
+        printf("\n");
+    });
 }
 
+
 static void bsearch_test(MPCIO &mpcio,
-    const PRACOptions &opts, char **args)
+    const PRACOptions &opts, char **args, bool basic)
 {
     value_t target;
     arc4random_buf(&target, sizeof(target));
@@ -1119,79 +1221,298 @@ static void bsearch_test(MPCIO &mpcio,
         depth = atoi(*args);
         ++args;
     }
+    address_t len = (1<<depth);
+    if (*args) {
+        len = atoi(*args);
+        ++args;
+    }
     if (*args) {
         target = strtoull(*args, NULL, 16);
         ++args;
     }
 
-    int num_threads = opts.num_threads;
-    boost::asio::thread_pool pool(num_threads);
-    for (int thread_num = 0; thread_num < num_threads; ++thread_num) {
-        boost::asio::post(pool, [&mpcio, thread_num, depth, target] {
-            MPCTIO tio(mpcio, thread_num);
-            run_coroutines(tio, [&tio, depth, target] (yield_t &yield) {
-                address_t size = address_t(1)<<depth;
-                RegAS tshare;
-                if (tio.player() == 2) {
-                    // Send shares of the target to the computational
-                    // players
-                    RegAS tshare0, tshare1;
-                    tshare0.randomize();
-                    tshare1.set(target-tshare0.share());
-                    tio.iostream_p0() << tshare0;
-                    tio.iostream_p1() << tshare1;
-                    printf("Using target = %016lx\n", target);
-                    yield();
-                } else {
-                    // Get the share of the target
-                    tio.iostream_server() >> tshare;
-                }
+    MPCTIO tio(mpcio, 0, opts.num_threads);
+    run_coroutines(tio, [&tio, &mpcio, depth, len, target, basic] (yield_t &yield) {
+        RegAS tshare;
+        std::cout << "\n===== SETUP =====\n";
+
+        if (tio.player() == 2) {
+            // Send shares of the target to the computational
+            // players
+            RegAS tshare0, tshare1;
+            tshare0.randomize();
+            tshare1.set(target-tshare0.share());
+            tio.iostream_p0() << tshare0;
+            tio.iostream_p1() << tshare1;
+            printf("Using target = %016lx\n", target);
+            yield();
+        } else {
+            // Get the share of the target
+            tio.iostream_server() >> tshare;
+        }
+
+        tio.sync_lamport();
+        mpcio.dump_stats(std::cout);
+
+        std::cout << "\n===== SORT RANDOM DATABASE =====\n";
+        mpcio.reset_stats();
+        tio.reset_lamport();
+        // Create a random database and sort it
+        // size_t &aes_ops = tio.aes_ops();
+        Duoram<RegAS> oram(tio.player(), len);
+        auto A = oram.flat(tio, yield);
+        A.explicitonly(true);
+        // Initialize the memory to random values in parallel
+        std::vector<coro_t> coroutines;
+        for (address_t i=0; i<len; ++i) {
+            coroutines.emplace_back(
+                [&A, i](yield_t &yield) {
+                    auto Acoro = A.context(yield);
+                    RegAS v;
+                    v.randomize(62);
+                    Acoro[i] += v;
+                });
+        }
+        run_coroutines(yield, coroutines);
+        A.bitonic_sort(0, len);
+        A.explicitonly(false);
+
+        tio.sync_lamport();
+        mpcio.dump_stats(std::cout);
 
-                // Create a random database and sort it
-                // size_t &aes_ops = tio.aes_ops();
-                Duoram<RegAS> oram(tio.player(), size);
-                auto A = oram.flat(tio, yield);
-                A.explicitonly(true);
-                // Initialize the memory to random values in parallel
-                std::vector<coro_t> coroutines;
-                for (address_t i=0; i<size; ++i) {
-                    coroutines.emplace_back(
-                        [&A, i](yield_t &yield) {
-                            auto Acoro = A.context(yield);
-                            RegAS v;
-                            v.randomize(62);
-                            Acoro[i] += v;
-                        });
+        std::cout << "\n===== BINARY SEARCH =====\n";
+        mpcio.reset_stats();
+        tio.reset_lamport();
+        // Binary search for the target
+        value_t checkindex;
+        if (basic) {
+            RegAS tindex = A.basic_binary_search(tshare);
+            checkindex = mpc_reconstruct(tio, yield, tindex);
+        } else {
+            RegXS tindex = A.binary_search(tshare);
+            checkindex = mpc_reconstruct(tio, yield, tindex);
+        }
+
+        tio.sync_lamport();
+        mpcio.dump_stats(std::cout);
+
+        std::cout << "\n===== CHECK ANSWER =====\n";
+        mpcio.reset_stats();
+        tio.reset_lamport();
+        // Check the answer
+        size_t size = size_t(1) << depth;
+        value_t checktarget = mpc_reconstruct(tio, yield, tshare);
+        auto check = A.reconstruct();
+        bool fail = false;
+        if (tio.player() == 0) {
+            for (address_t i=0;i<len;++i) {
+                if (depth <= 10) {
+                    printf("%c%04x %016lx\n",
+                        (i == checkindex ? '*' : ' '),
+                        i, check[i].share());
                 }
-                run_coroutines(yield, coroutines);
-                A.bitonic_sort(0, depth);
-
-                // Binary search for the target
-                RegAS tindex = A.obliv_binary_search(tshare);
-
-                // Check the answer
-                if (tio.player() == 1) {
-                    tio.iostream_peer() << tindex;
-                } else if (tio.player() == 0) {
-                    RegAS peer_tindex;
-                    tio.iostream_peer() >> peer_tindex;
-                    tindex += peer_tindex;
+                if (i>0 && i<len &&
+                    check[i].share() < check[i-1].share()) {
+                    fail = true;
                 }
-                if (depth <= 10) {
-                    auto check = A.reconstruct();
-                    if (tio.player() == 0) {
-                        for (address_t i=0;i<size;++i) {
-                            printf("%04x %016lx\n", i, check[i].share());
-                        }
+                if (i == checkindex) {
+                    // check[i] should be >= target, and check[i-1]
+                    // should be < target
+                    if ((i < len && check[i].share() < checktarget) ||
+                        (i > 0 && check[i-1].share() >= checktarget)) {
+                        fail = true;
                     }
                 }
-                if (tio.player() == 0) {
-                    printf("Found index = %lx\n", tindex.share());
+            }
+            if (checkindex == len && check[len-1].share() >= checktarget) {
+                fail = true;
+            }
+
+            printf("Target = %016lx\n", checktarget);
+            printf("Found index = %02lx\n", checkindex);
+            if (checkindex > size) {
+                fail = true;
+            }
+            if (fail) {
+                printf("FAIL\n");
+            } else {
+                printf("PASS\n");
+            }
+        }
+    });
+}
+
+template <typename T>
+static void related(MPCIO &mpcio,
+    const PRACOptions &opts, char **args)
+{
+    nbits_t depth = 5;
+
+    // The depth of the (complete) binary tree
+    if (*args) {
+        depth = atoi(*args);
+        ++args;
+    }
+    // The layer at which to choose a random parent node (and its two
+    // children along with it)
+    nbits_t layer = depth-1;
+    if (*args) {
+        layer = atoi(*args);
+        ++args;
+    }
+    assert(layer < depth);
+
+    MPCTIO tio(mpcio, 0, opts.num_threads);
+    run_coroutines(tio, [&mpcio, &tio, depth, layer] (yield_t &yield) {
+        size_t size = size_t(1)<<(depth+1);
+        Duoram<T> oram(tio.player(), size);
+        auto A = oram.flat(tio, yield);
+
+        // Initialize A with words with sequential top and bottom halves
+        // (just so we can more easily eyeball the right answers)
+        A.init([] (size_t i) { return i * 0x100000001; } );
+
+        // We use this layout for the tree:
+        // A[0] is unused
+        // A[1] is the root (layer 0)
+        // A[2..3] is layer 1
+        // A[4..7] is layer 2
+        // ...
+        // A[(1<<j)..((2<<j)-1)] is layer j
+        //
+        // So the parent of x is at location (x/2) and the children of x
+        // are at locations 2*x and 2*x+1
+
+        // Pick a random index _within_ the given layer (i.e., the
+        // offset from the beginning of the layer, not the absolute
+        // location in A)
+        RegXS idx;
+        idx.randomize(layer);
+        // Create the OblivIndex. RegXS is the type of the common index
+        // (idx), 3 is the maximum number of related updates to support
+        // (which equals the width of the underlying RDPF, currently
+        // maximum 5), layer is the depth of the underlying RDPF (the
+        // bit length of idx).
+        typename Duoram<T>::template OblivIndex<RegXS,3> oidx(tio, yield, idx, layer);
+
+        // This is the (known) layer containing the (unknown) parent
+        // node
+        typename Duoram<T>::Flat P(A, tio, yield, 1<<layer, 1<<layer);
+        // This is the layer below that one, containing all possible
+        // children
+        typename Duoram<T>::Flat C(A, tio, yield, 2<<layer, 2<<layer);
+        // These are the subsets of C containing the left children and
+        // the right children respectively
+        typename Duoram<T>::Stride L(C, tio, yield, 0, 2);
+        typename Duoram<T>::Stride R(C, tio, yield, 1, 2);
+
+        T parent, left, right;
+
+        // Do three related reads.  In this version, only one DPF will
+        // be used, but it will still be _evaluated_ three times.
+        parent = P[oidx];
+        left = L[oidx];
+        right = R[oidx];
+
+        // The operation is just a simple rotation: the value in the
+        // parent moves to the left child, the left child moves to the
+        // right child, and the right child becomes the parent
+
+        // Do three related updates.  As above, only one (wide) DPF will
+        // be used (the same one as for the reads in fact), but it will
+        // still be _evaluated_ three more times.
+        P[oidx] += right-parent;
+        L[oidx] += parent-left;
+        R[oidx] += left-right;
+
+        // Check the answer
+        auto check = A.reconstruct();
+        if (depth <= 10) {
+            oram.dump();
+            if (tio.player() == 0) {
+                for (address_t i=0;i<size;++i) {
+                    printf("%04x %016lx\n", i, check[i].share());
                 }
-            });
-        });
+            }
+        }
+        value_t pval = mpc_reconstruct(tio, yield, parent);
+        value_t lval = mpc_reconstruct(tio, yield, left);
+        value_t rval = mpc_reconstruct(tio, yield, right);
+        printf("parent = %016lx\nleft   = %016lx\nright  = %016lx\n",
+            pval, lval, rval);
+    });
+}
+
+template <typename T>
+static void path(MPCIO &mpcio,
+    const PRACOptions &opts, char **args)
+{
+    nbits_t depth = 5;
+
+    // The depth of the (complete) binary tree
+    if (*args) {
+        depth = atoi(*args);
+        ++args;
     }
-    pool.join();
+    // The target node
+    size_t target_node = 3 << (depth-1);
+    if (*args) {
+        target_node = atoi(*args);
+        ++args;
+    }
+
+    MPCTIO tio(mpcio, 0, opts.num_threads);
+    run_coroutines(tio, [&mpcio, &tio, depth, target_node] (yield_t &yield) {
+        size_t size = size_t(1)<<(depth+1);
+        Duoram<T> oram(tio.player(), size);
+        auto A = oram.flat(tio, yield);
+
+        // Initialize A with words with sequential top and bottom halves
+        // (just so we can more easily eyeball the right answers)
+        A.init([] (size_t i) { return i * 0x100000001; } );
+
+        // We use this layout for the tree:
+        // A[0] is unused
+        // A[1] is the root (layer 0)
+        // A[2..3] is layer 1
+        // A[4..7] is layer 2
+        // ...
+        // A[(1<<j)..((2<<j)-1)] is layer j
+        //
+        // So the parent of x is at location (x/2) and the children of x
+        // are at locations 2*x and 2*x+1
+
+        // Create a Path from the root to the target node
+        typename Duoram<T>::Path P(A, tio, yield, target_node);
+
+        // Re-initialize that path to something recognizable
+        P.init([] (size_t i) { return 0xff + i * 0x1000000010000; } );
+
+        // ORAM update along that path
+        RegXS idx;
+        idx.set(tio.player() * arc4random_uniform(P.size()));
+        T val;
+        val.set(tio.player() * 0xaaaa00000000);
+        P[idx] += val;
+
+        // Binary search along that path
+        T lookup;
+        lookup.set(tio.player() * 0x3000000000000);
+        RegXS foundidx = P.binary_search(lookup);
+
+        // Check the answer
+        auto check = A.reconstruct();
+        if (depth <= 10) {
+            oram.dump();
+            if (tio.player() == 0) {
+                for (address_t i=0;i<size;++i) {
+                    printf("%04x %016lx\n", i, check[i].share());
+                }
+            }
+        }
+        value_t found = mpc_reconstruct(tio, yield, foundidx);
+        printf("foundidx = %lu\n", found);
+    });
 }
 
 void online_main(MPCIO &mpcio, const PRACOptions &opts, char **args)
@@ -1267,9 +1588,15 @@ void online_main(MPCIO &mpcio, const PRACOptions &opts, char **args)
     } else if (!strcmp(*args, "sorttest")) {
         ++args;
         sort_test(mpcio, opts, args);
+    } else if (!strcmp(*args, "padtest")) {
+        ++args;
+        pad_test(mpcio, opts, args);
+    } else if (!strcmp(*args, "bbsearch")) {
+        ++args;
+        bsearch_test(mpcio, opts, args, true);
     } else if (!strcmp(*args, "bsearch")) {
         ++args;
-        bsearch_test(mpcio, opts, args);
+        bsearch_test(mpcio, opts, args, false);
     } else if (!strcmp(*args, "duoram")) {
         ++args;
         if (opts.use_xor_db) {
@@ -1277,6 +1604,16 @@ void online_main(MPCIO &mpcio, const PRACOptions &opts, char **args)
         } else {
             duoram<RegAS>(mpcio, opts, args);
         }
+    } else if (!strcmp(*args, "related")) {
+        ++args;
+        if (opts.use_xor_db) {
+            related<RegXS>(mpcio, opts, args);
+        } else {
+            related<RegAS>(mpcio, opts, args);
+        }
+    } else if (!strcmp(*args, "path")) {
+        ++args;
+        path<RegAS>(mpcio, opts, args);
     } else if (!strcmp(*args, "cell")) {
         ++args;
         cell(mpcio, opts, args);

+ 1 - 1
preproc.cpp

@@ -476,7 +476,7 @@ void preprocessing_server(MPCServerIO &mpcsrvio, const PRACOptions &opts, char *
                                 stio.cdpf(yield);
                             });
                     }
-                } else if (!strcmp(type, "i")) {
+                } else if (!strcmp(type, "k")) {
                     unsigned char typetag = 0x8e;
                     unsigned char subtypetag = 0x00;
                     stio.queue_p0(&typetag, 1);

+ 121 - 10
rdpf.hpp

@@ -289,8 +289,11 @@ struct RDPFTriple {
     // outputs so that the appropriate one can be selected with a
     // parameter
 
+    // Only RegXS, not RegAS, indices are used with incremental RDPFs
     inline void get_target(RegAS &target) const { target = as_target; }
-    inline void get_target(RegXS &target) const { target = xs_target; }
+    inline void get_target(RegXS &target) const {
+        target = xs_target >> (dpf[0].maxdepth - dpf[0].curdepth);
+    }
 
     // Additive share of the scaling value M_as such that the high words
     // of the leaf values for P0 and P1 add to M_as * e_{target}
@@ -368,15 +371,6 @@ struct RDPFPair {
 
     RDPFPair() {}
 
-    // Create an RDPFPair from an RDPFTriple, keeping two of the RDPFs
-    // and dropping one.  This _moves_ the dpfs from the triple to the
-    // pair, so the triple will no longer be valid after using this.
-    // which0 and which1 indicate which of the dpfs to keep.
-    RDPFPair(RDPFTriple<WIDTH> &&trip, int which0, int which1) {
-        dpf[0] = std::move(trip.dpf[which0]);
-        dpf[1] = std::move(trip.dpf[which1]);
-    }
-
     // The depth
     inline nbits_t depth() const { return dpf[0].depth(); }
 
@@ -464,6 +458,123 @@ struct RDPFPair {
 
 };
 
+// These are used by computational peers, who hold RDPFTriples, but when
+// reading, only need to use 2 of the 3 RDPFs.  The API follows that of
+// RDPFPair, but internally, it holds two references to external RDPFs,
+// instead of holding the RDPFs themselves.
+
+template <nbits_t WIDTH>
+struct RDPF2of3 {
+    template <typename T>
+    using Pair = std::tuple<T, T>;
+    template <typename T>
+    using WPair = std::tuple<
+        typename std::array<T,WIDTH>,
+        typename std::array<T,WIDTH> >;
+
+    // The type of pairs of nodes, LeafNodes, and the wide shared
+    // register types
+    using node = Pair<DPFnode>;
+    using LeafNode = Pair<typename RDPF<WIDTH>::LeafNode>;
+    using RegASWP = WPair<RegAS>;
+    using RegXSWP = WPair<RegXS>;
+
+    const RDPF<WIDTH> &dpf0, &dpf1;
+
+    // Create an RDPF2of3 from an RDPFTriple, holding references to two
+    // of the three RDPFs.  Nothing is moved or copied, so the triple
+    // remains valid (and must outlive this RDPF2of3).  which0 and
+    // which1 indicate which of the dpfs to reference.
+    RDPF2of3(const RDPFTriple<WIDTH> &trip, int which0, int which1) :
+        dpf0(trip.dpf[which0]), dpf1(trip.dpf[which1]) {}
+
+    // The depth
+    inline nbits_t depth() const { return dpf0.depth(); }
+
+    // Set the current depth for an incremental RDPF2of3; 0 means to use
+    // maxdepth
+    inline void depth(nbits_t newdepth) {
+        dpf0.depth(newdepth);
+        dpf1.depth(newdepth);
+    }
+
+    // The seed
+    inline node get_seed() const {
+        return std::make_tuple(dpf0.get_seed(), dpf1.get_seed());
+    }
+
+    // Do we have a precomputed expansion?
+    inline bool has_expansion() const {
+        int li_index = dpf0.maxdepth - dpf0.curdepth;
+        return dpf0.li[li_index].expansion.size() > 0;
+    }
+
+    // Get an element of the expansion
+    inline LeafNode get_expansion(address_t index) const {
+        return std::make_tuple(dpf0.get_expansion(index),
+            dpf1.get_expansion(index));
+    }
+
+    // Descend the two RDPFs in lock step
+    node descend(const node &parent, nbits_t parentdepth,
+        bit_t whichchild, size_t &aes_ops) const;
+
+    // Descend the two RDPFs in lock step to a leaf node
+    LeafNode descend_to_leaf(const node &parent, nbits_t parentdepth,
+        bit_t whichchild, size_t &aes_ops) const;
+
+    // Overloaded versions of functions to get DPF components and
+    // outputs so that the appropriate one can be selected with a
+    // parameter
+
+    // Additive share of the scaling value M_as such that the high words
+    // of the leaf values for P0 and P1 add to M_as * e_{target}
+    inline void scaled_value(RegASWP &v) const {
+        std::get<0>(v) = dpf0.scaled_sum;
+        std::get<1>(v) = dpf1.scaled_sum;
+    }
+
+    // XOR share of the scaling value M_xs such that the high words
+    // of the leaf values for P0 and P1 XOR to M_xs * e_{target}
+    inline void scaled_value(RegXSWP &v) const {
+        std::get<0>(v) = dpf0.scaled_xor;
+        std::get<1>(v) = dpf1.scaled_xor;
+    }
+
+    // Get the additive-shared unit vector entry from the leaf node
+    inline void unit(std::tuple<RegAS,RegAS> &u, const LeafNode &leaf) const {
+        std::get<0>(u) = dpf0.unit_as(std::get<0>(leaf));
+        std::get<1>(u) = dpf1.unit_as(std::get<1>(leaf));
+    }
+
+    // Get the bit-shared unit vector entry from the leaf node
+    inline void unit(std::tuple<RegXS,RegXS> &u, const LeafNode &leaf) const {
+        std::get<0>(u) = dpf0.unit_bs(std::get<0>(leaf));
+        std::get<1>(u) = dpf1.unit_bs(std::get<1>(leaf));
+    }
+
+    // For any more complex entry type, that type will handle the conversion
+    // for each DPF
+    template <typename T>
+    inline void unit(std::tuple<T,T> &u, const LeafNode &leaf) const {
+        std::get<0>(u).unit(dpf0, std::get<0>(leaf));
+        std::get<1>(u).unit(dpf1, std::get<1>(leaf));
+    }
+
+    // Get the additive-shared scaled vector entry from the leaf node
+    inline void scaled(RegASWP &s, const LeafNode &leaf) const {
+        std::get<0>(s) = dpf0.scaled_as(std::get<0>(leaf));
+        std::get<1>(s) = dpf1.scaled_as(std::get<1>(leaf));
+    }
+
+    // Get the XOR-shared scaled vector entry from the leaf node
+    inline void scaled(RegXSWP &s, const LeafNode &leaf) const {
+        std::get<0>(s) = dpf0.scaled_xs(std::get<0>(leaf));
+        std::get<1>(s) = dpf1.scaled_xs(std::get<1>(leaf));
+    }
+
+};
+
 // Streaming evaluation, to avoid taking up enough memory to store
 // an entire evaluation.  T can be RDPF, RDPFPair, or RDPFTriple.
 template <typename T>

+ 63 - 19
rdpf.tcc

@@ -145,7 +145,7 @@ inline V ParallelEval<T>::reduce(V init, W process)
     size_t thread_aes_ops[num_threads];
     V accums[num_threads];
     boost::asio::thread_pool pool(num_threads);
-    address_t threadstart = start;
+    address_t threadstart = 0;
     address_t threadchunk = num_evals / num_threads;
     address_t threadextra = num_evals % num_threads;
     nbits_t depth = rdpf.depth();
@@ -857,6 +857,9 @@ RDPF<WIDTH>::RDPF(MPCTIO &tio, yield_t &yield,
 
     li.resize(incremental ? depth : 1);
 
+    // Prefetch the right number of nodeselecttriples
+    tio.request_nodeselecttriples(yield, incremental ? 2*depth-1 : depth);
+
     // Construct each intermediate level
     while(level < depth) {
         LeafNode *leaflevel = NULL;
@@ -883,29 +886,44 @@ RDPF<WIDTH>::RDPF(MPCTIO &tio, yield_t &yield,
         // The bit-shared choice bit is bit (depth-level-1) of the
         // XOR-shared target index
         RegBS bs_choice = target.bit(depth-level-1);
-        bool cfbit;
 
+        // At each layer, we can create the next internal layer and the
+        // leaf layer in parallel coroutines if we're making an
+        // incremental RDPF.  If not, exactly one of these coroutines
+        // will be created, and we just run that one.
+        std::vector<coro_t> coroutines;
         if (level < depth-1) {
-            DPFnode CW;
-            // This field is ignored when we're not expanding to a leaf
-            // level, but it needs to be an lvalue reference.
-            int noleafinfo = 0;
-            create_level(tio, yield, curlevel, nextlevel, player, level,
-                depth, bs_choice, CW, cfbit, save_expansion, noleafinfo,
-                aes_ops);
-            cfbits |= (value_t(cfbit)<<level);
-            if (player < 2) {
-                cw.push_back(CW);
-            }
+            coroutines.emplace_back([this, &tio, curlevel, nextlevel,
+                player, level, depth, bs_choice, save_expansion,
+                &aes_ops] (yield_t &yield) {
+                    DPFnode CW;
+                    bool cfbit;
+                    // This field is ignored when we're not expanding to a leaf
+                    // level, but it needs to be an lvalue reference.
+                    int noleafinfo = 0;
+                    create_level(tio, yield, curlevel, nextlevel, player, level,
+                        depth, bs_choice, CW, cfbit, save_expansion, noleafinfo,
+                        aes_ops);
+                    cfbits |= (value_t(cfbit)<<level);
+                    if (player < 2) {
+                        cw.push_back(CW);
+                    }
+                });
         }
         if (incremental || level == depth-1) {
-            LeafNode CW;
-            create_level(tio, yield, curlevel, leaflevel, player, level,
-                depth, bs_choice, CW, cfbit, save_expansion,
-                li[depth-level-1], aes_ops);
-            leaf_cfbits |= (value_t(cfbit)<<(depth-level-1));
-            li[depth-level-1].leaf_cw = CW;
+            coroutines.emplace_back([this, &tio, curlevel, leaflevel,
+                player, level, depth, bs_choice, save_expansion,
+                &aes_ops](yield_t &yield) {
+                    LeafNode CW;
+                    bool cfbit;
+                    create_level(tio, yield, curlevel, leaflevel, player,
+                        level, depth, bs_choice, CW, cfbit, save_expansion,
+                        li[depth-level-1], aes_ops);
+                    leaf_cfbits |= (value_t(cfbit)<<(depth-level-1));
+                    li[depth-level-1].leaf_cw = CW;
+                });
         }
+        run_coroutines(yield, coroutines);
 
         if (!save_expansion) {
             delete[] leaflevel;
@@ -1081,3 +1099,29 @@ typename RDPFPair<WIDTH>::LeafNode RDPFPair<WIDTH>::descend_to_leaf(
     C1 = dpf[1].descend_to_leaf(P1, parentdepth, whichchild, aes_ops);
     return std::make_tuple(C0,C1);
 }
+
+template <nbits_t WIDTH>
+typename RDPF2of3<WIDTH>::node RDPF2of3<WIDTH>::descend(
+    const RDPF2of3<WIDTH>::node &parent,
+    nbits_t parentdepth, bit_t whichchild,
+    size_t &aes_ops) const
+{
+    auto [P0, P1] = parent;
+    DPFnode C0, C1;
+    C0 = dpf0.descend(P0, parentdepth, whichchild, aes_ops);
+    C1 = dpf1.descend(P1, parentdepth, whichchild, aes_ops);
+    return std::make_tuple(C0,C1);
+}
+
+template <nbits_t WIDTH>
+typename RDPF2of3<WIDTH>::LeafNode RDPF2of3<WIDTH>::descend_to_leaf(
+    const RDPF2of3<WIDTH>::node &parent,
+    nbits_t parentdepth, bit_t whichchild,
+    size_t &aes_ops) const
+{
+    auto [P0, P1] = parent;
+    typename RDPF<WIDTH>::LeafNode C0, C1;
+    C0 = dpf0.descend_to_leaf(P0, parentdepth, whichchild, aes_ops);
+    C1 = dpf1.descend_to_leaf(P1, parentdepth, whichchild, aes_ops);
+    return std::make_tuple(C0,C1);
+}

+ 302 - 0
shapes.hpp

@@ -0,0 +1,302 @@
+#ifndef __SHAPES_HPP__
+#define __SHAPES_HPP__
+
+// Various Shapes beyond the standard Flat (in duoram.hpp)
+
+#include "duoram.hpp"
+
+
+// A Pad is a Shape that pads an underlying Shape so that read accesses
+// past the end return a fixed constant value.  Do _not_ write into a
+// Pad!
+
+template <typename T>
+class Duoram<T>::Pad : public Duoram<T>::Shape {
+    // These are pointers because we need to be able to return a
+    // (non-const) T& even from a const Pad.
+    T *padvalp;
+    T *peerpadvalp;
+    T *zerop;
+    address_t padded_size;
+
+    inline size_t indexmap(size_t idx) const override {
+        return idx;
+    }
+
+    Pad &operator=(const Pad &) = delete;
+
+public:
+    // Constructor for the Pad shape. The parent must _not_ be in
+    // explicit-only mode.
+    Pad(Shape &parent, MPCTIO &tio, yield_t &yield,
+        address_t padded_size, value_t padval = 0x7fffffffffffffff);
+
+    // Copy the given Pad except for the tio and yield
+    Pad(const Pad &copy_from, MPCTIO &tio, yield_t &yield);
+
+    // Destructor
+    ~Pad();
+
+    // Update the context (MPCTIO and yield if you've started a new
+    // thread, or just yield if you've started a new coroutine in the
+    // same thread).  Returns a new Shape with an updated context.
+    Pad context(MPCTIO &new_tio, yield_t &new_yield) const {
+        return Pad(*this, new_tio, new_yield);
+    }
+    Pad context(yield_t &new_yield) const {
+        return Pad(*this, this->tio, new_yield);
+    }
+
+    // Get a pair (for the server) of references to the underlying
+    // Duoram entries at share virtual index idx.  (That is, it gets
+    // duoram.p0_blind[indexmap(idx)], etc.)
+    inline std::tuple<T&,T&> get_server(size_t idx,
+        std::nullopt_t null = std::nullopt) const override {
+        size_t parindex = indexmap(idx);
+        if (parindex < this->parent.shape_size) {
+            return this->parent.get_server(parindex, null);
+        } else {
+            return std::tie(*zerop, *zerop);
+        }
+    }
+
+    // Get a triple (for the computational players) of references to the
+    // underlying Duoram entries at share virtual index idx.  (That is,
+    // it gets duoram.database[indexmap(idx)], etc.)
+    inline std::tuple<T&,T&,T&> get_comp(size_t idx,
+        std::nullopt_t null = std::nullopt) const override {
+        size_t parindex = indexmap(idx);
+        if (parindex < this->parent.shape_size) {
+            return this->parent.get_comp(parindex, null);
+        } else {
+            return std::tie(*padvalp, *zerop, *peerpadvalp);
+        }
+    }
+
+    // Index into this Pad in various ways
+    typename Duoram::Shape::template MemRefS<RegAS,T,std::nullopt_t,Pad,1>
+            operator[](const RegAS &idx) {
+        typename Duoram<T>::Shape::
+            template MemRefS<RegAS,T,std::nullopt_t,Pad,1>
+            res(*this, idx, std::nullopt);
+        return res;
+    }
+    typename Duoram::Shape::template MemRefS<RegXS,T,std::nullopt_t,Pad,1>
+            operator[](const RegXS &idx) {
+        typename Duoram<T>::Shape::
+            template MemRefS<RegXS,T,std::nullopt_t,Pad,1>
+            res(*this, idx, std::nullopt);
+        return res;
+    }
+    template <typename U, nbits_t WIDTH>
+    typename Duoram::Shape::template MemRefS<U,T,std::nullopt_t,Pad,WIDTH>
+            operator[](OblivIndex<U,WIDTH> &obidx) {
+        typename Duoram<T>::Shape::
+            template MemRefS<RegXS,T,std::nullopt_t,Pad,WIDTH>
+            res(*this, obidx, std::nullopt);
+        return res;
+    }
+    typename Duoram::Shape::template MemRefExpl<T,std::nullopt_t>
+            operator[](address_t idx) {
+        typename Duoram<T>::Shape::
+            template MemRefExpl<T,std::nullopt_t>
+            res(*this, idx, std::nullopt);
+        return res;
+    }
+    template <typename U>
+    Duoram::Shape::MemRefInd<U, Pad>
+            operator[](const std::vector<U> &indcs) {
+        typename Duoram<T>::Shape::
+            template MemRefInd<U,Pad>
+            res(*this, indcs);
+        return res;
+    }
+    template <typename U, size_t N>
+    Duoram::Shape::MemRefInd<U, Pad>
+            operator[](const std::array<U,N> &indcs) {
+        typename Duoram<T>::Shape::
+            template MemRefInd<U,Pad>
+            res(*this, indcs);
+        return res;
+    }
+};
+
+
+// A Stride is a Shape that represents evenly spaced elements of its
+// parent Shape, starting with some offset, and then every stride
+// elements.
+
+template <typename T>
+class Duoram<T>::Stride : public Duoram<T>::Shape {
+    // Index (in the parent) of the first element of this Stride
+    size_t offset;
+    // Distance (in parent elements) between consecutive elements
+    size_t stride;
+
+    // Map an index in this Stride to the corresponding parent index
+    inline size_t indexmap(size_t idx) const override {
+        size_t paridx = offset + idx*stride;
+        return paridx;
+    }
+
+public:
+    // Constructor
+    Stride(Shape &parent, MPCTIO &tio, yield_t &yield, size_t offset,
+        size_t stride);
+
+    // Copy the given Stride except for the tio and yield
+    Stride(const Stride &copy_from, MPCTIO &tio, yield_t &yield) :
+        Shape(copy_from, tio, yield), offset(copy_from.offset),
+        stride(copy_from.stride) {}
+
+    // Update the context (MPCTIO and yield if you've started a new
+    // thread, or just yield if you've started a new coroutine in the
+    // same thread).  Returns a new Shape with an updated context.
+    Stride context(MPCTIO &new_tio, yield_t &new_yield) const {
+        return Stride(*this, new_tio, new_yield);
+    }
+    Stride context(yield_t &new_yield) const {
+        return Stride(*this, this->tio, new_yield);
+    }
+
+    // Index into this Stride in various ways
+
+    // Oblivious index by an additive share
+    typename Duoram::Shape::template MemRefS<RegAS,T,std::nullopt_t,Stride,1>
+            operator[](const RegAS &idx) {
+        typename Duoram<T>::Shape::
+            template MemRefS<RegAS,T,std::nullopt_t,Stride,1>
+            res(*this, idx, std::nullopt);
+        return res;
+    }
+    // Oblivious index by an XOR share
+    typename Duoram::Shape::template MemRefS<RegXS,T,std::nullopt_t,Stride,1>
+            operator[](const RegXS &idx) {
+        typename Duoram<T>::Shape::
+            template MemRefS<RegXS,T,std::nullopt_t,Stride,1>
+            res(*this, idx, std::nullopt);
+        return res;
+    }
+    // Oblivious index by a reusable OblivIndex; the index share type U
+    // must match between the OblivIndex and the resulting MemRefS
+    template <typename U, nbits_t WIDTH>
+    typename Duoram::Shape::template MemRefS<U,T,std::nullopt_t,Stride,WIDTH>
+            operator[](OblivIndex<U,WIDTH> &obidx) {
+        typename Duoram<T>::Shape::
+            template MemRefS<U,T,std::nullopt_t,Stride,WIDTH>
+            res(*this, obidx, std::nullopt);
+        return res;
+    }
+    // Explicit (public) index
+    typename Duoram::Shape::template MemRefExpl<T,std::nullopt_t>
+            operator[](address_t idx) {
+        typename Duoram<T>::Shape::
+            template MemRefExpl<T,std::nullopt_t>
+            res(*this, idx, std::nullopt);
+        return res;
+    }
+    // Independent batched indexing by a vector of index shares
+    template <typename U>
+    Duoram::Shape::MemRefInd<U, Stride>
+            operator[](const std::vector<U> &indcs) {
+        typename Duoram<T>::Shape::
+            template MemRefInd<U,Stride>
+            res(*this, indcs);
+        return res;
+    }
+    // Independent batched indexing by an array of index shares
+    template <typename U, size_t N>
+    Duoram::Shape::MemRefInd<U, Stride>
+            operator[](const std::array<U,N> &indcs) {
+        typename Duoram<T>::Shape::
+            template MemRefInd<U,Stride>
+            res(*this, indcs);
+        return res;
+    }
+};
+
+
+// A Path is a Shape that represents a path from the root of a complete
+// binary tree down to a given node.
+
+// We assume this layout for the tree (the _parent_ shape of the Path):
+// A[0] is unused
+// A[1] is the root (layer 0)
+// A[2..3] is layer 1
+// A[4..7] is layer 2
+// ...
+// A[(1<<j)..((2<<j)-1)] is layer j
+//
+// So the parent of x is at location (x/2) and the children of x
+// are at locations 2*x and 2*x+1
+
+template <typename T>
+class Duoram<T>::Path : public Duoram<T>::Shape {
+    // Index (in the parent) of the node this Path leads to
+    size_t target_node;
+
+    // Map an index in this Path (0 = root, shape_size-1 = target) to
+    // the corresponding parent index: the ancestor of target_node at
+    // depth idx is obtained by shifting off the low-order bits
+    inline size_t indexmap(size_t idx) const override {
+        size_t paridx = target_node >> (this->shape_size - idx - 1);
+        return paridx;
+    }
+
+public:
+    // Constructor
+    Path(Shape &parent, MPCTIO &tio, yield_t &yield,
+        size_t target_node);
+
+    // Copy the given Path except for the tio and yield
+    Path(const Path &copy_from, MPCTIO &tio, yield_t &yield) :
+        Shape(copy_from, tio, yield),
+        target_node(copy_from.target_node) {}
+
+    // Update the context (MPCTIO and yield if you've started a new
+    // thread, or just yield if you've started a new coroutine in the
+    // same thread).  Returns a new Shape with an updated context.
+    Path context(MPCTIO &new_tio, yield_t &new_yield) const {
+        return Path(*this, new_tio, new_yield);
+    }
+    Path context(yield_t &new_yield) const {
+        return Path(*this, this->tio, new_yield);
+    }
+
+    // Index into this Path in various ways
+
+    // Oblivious index by an additive share
+    typename Duoram::Shape::template MemRefS<RegAS,T,std::nullopt_t,Path,1>
+            operator[](const RegAS &idx) {
+        typename Duoram<T>::Shape::
+            template MemRefS<RegAS,T,std::nullopt_t,Path,1>
+            res(*this, idx, std::nullopt);
+        return res;
+    }
+    // Oblivious index by an XOR share
+    typename Duoram::Shape::template MemRefS<RegXS,T,std::nullopt_t,Path,1>
+            operator[](const RegXS &idx) {
+        typename Duoram<T>::Shape::
+            template MemRefS<RegXS,T,std::nullopt_t,Path,1>
+            res(*this, idx, std::nullopt);
+        return res;
+    }
+    // Oblivious index by a reusable OblivIndex; the index share type U
+    // must match between the OblivIndex and the resulting MemRefS
+    template <typename U, nbits_t WIDTH>
+    typename Duoram::Shape::template MemRefS<U,T,std::nullopt_t,Path,WIDTH>
+            operator[](OblivIndex<U,WIDTH> &obidx) {
+        typename Duoram<T>::Shape::
+            template MemRefS<U,T,std::nullopt_t,Path,WIDTH>
+            res(*this, obidx, std::nullopt);
+        return res;
+    }
+    // Explicit (public) index
+    typename Duoram::Shape::template MemRefExpl<T,std::nullopt_t>
+            operator[](address_t idx) {
+        typename Duoram<T>::Shape::
+            template MemRefExpl<T,std::nullopt_t>
+            res(*this, idx, std::nullopt);
+        return res;
+    }
+    // Independent batched indexing by a vector of index shares
+    template <typename U>
+    Duoram::Shape::MemRefInd<U, Path>
+            operator[](const std::vector<U> &indcs) {
+        typename Duoram<T>::Shape::
+            template MemRefInd<U,Path>
+            res(*this, indcs);
+        return res;
+    }
+    // Independent batched indexing by an array of index shares
+    template <typename U, size_t N>
+    Duoram::Shape::MemRefInd<U, Path>
+            operator[](const std::array<U,N> &indcs) {
+        typename Duoram<T>::Shape::
+            template MemRefInd<U,Path>
+            res(*this, indcs);
+        return res;
+    }
+};
+
+
+#include "shapes.tcc"
+
+#endif

+ 85 - 0
shapes.tcc

@@ -0,0 +1,85 @@
+#ifndef __SHAPES_TCC__
+#define __SHAPES_TCC__
+
+// Constructor for the Pad shape. The parent must _not_ be in
+// explicit-only mode.
+template <typename T>
+Duoram<T>::Pad::Pad(Shape &parent, MPCTIO &tio, yield_t &yield,
+    address_t padded_size, size_t padval) :
+    Shape(parent, parent.duoram, tio, yield)
+{
+    int whoami = tio.player();
+    // Allocate the three shared values this Pad owns
+    padvalp = new T;
+    zerop = new T;
+    peerpadvalp = new T;
+    // This party's additive share of the pad value is whoami*padval,
+    // and (since padval is public) it also knows the peer's share,
+    // (1-whoami)*padval.  zerop stays default-constructed.
+    padvalp->set(whoami * padval);
+    peerpadvalp->set((1 - whoami) * padval);
+    this->set_shape_size(padded_size);
+}
+
+// Copy the given Pad except for the tio and yield
+template <typename T>
+Duoram<T>::Pad::Pad(const Pad &copy_from, MPCTIO &tio, yield_t &yield) :
+    Shape(copy_from, tio, yield)
+{
+    // Deep-copy the pad value shares so this Pad owns its own objects
+    padvalp = new T;
+    zerop = new T;
+    peerpadvalp = new T;
+    padvalp->set(copy_from.padvalp->share());
+    peerpadvalp->set(copy_from.peerpadvalp->share());
+}
+
+// Destructor
+template <typename T>
+Duoram<T>::Pad::~Pad()
+{
+    // Release the heap-allocated pad value shares
+    delete peerpadvalp;
+    delete zerop;
+    delete padvalp;
+}
+
+// Constructor for the Stride shape.
+template <typename T>
+Duoram<T>::Stride::Stride(Shape &parent, MPCTIO &tio, yield_t &yield,
+    size_t offset, size_t stride) :
+    Shape(parent, parent.duoram, tio, yield)
+{
+    // A zero stride would make indexmap degenerate (every index maps
+    // to offset) and would divide by zero in the size computation below
+    assert(stride > 0);
+    size_t parentsize = parent.size();
+    // Clamp an out-of-range offset so the Stride is simply empty
+    // rather than referencing past the end of the parent
+    if (offset > parentsize) {
+        offset = parentsize;
+    }
+    this->offset = offset;
+    this->stride = stride;
+    // How many items are there if you take every stride'th item,
+    // starting at offset?  strideregionsize corrects for the offset, so
+    // we're asking how many multiples of stride are there strictly less
+    // than strideregionsize.  That's just ceil(strideregionsize/stride)
+    // which is the same as (strideregionsize + stride - 1)/stride with
+    // integer truncated division.
+    size_t strideregionsize = parentsize - offset;
+    size_t numelements = (strideregionsize + stride - 1) / stride;
+    this->set_shape_size(numelements);
+}
+
+// Constructor for the Path shape.
+template <typename T>
+Duoram<T>::Path::Path(Shape &parent, MPCTIO &tio, yield_t &yield,
+    size_t target_node) :
+    Shape(parent, parent.duoram, tio, yield)
+{
+    size_t parentsize = parent.size();
+    // Index 0 is unused in the tree layout, and the target must lie
+    // within the parent Shape
+    assert(target_node > 0 && target_node < parentsize);
+    this->target_node = target_node;
+
+    // The number of nodes on the path from the root (index 1) down to
+    // target_node is the bit length of target_node, since the parent
+    // of the node at index x is the node at index (x>>1).
+    size_t path_num_nodes = 0;
+    for (size_t node = target_node; node > 0; node >>= 1) {
+        ++path_num_nodes;
+    }
+    this->set_shape_size(path_num_nodes);
+}
+
+#endif