avadapal před 1 rokem
rodič
revize
82ef536bc2
13 změnil soubory, kde provedl 1603 přidání a 338 odebrání
  1. 2 2
      Makefile
  2. 5 0
      cell.cpp
  3. 99 20
      duoram.cpp
  4. 299 71
      duoram.hpp
  5. 118 82
      duoram.tcc
  6. 57 29
      mpcio.cpp
  7. 10 0
      mpcio.hpp
  8. 441 104
      online.cpp
  9. 1 1
      preproc.cpp
  10. 121 10
      rdpf.hpp
  11. 63 19
      rdpf.tcc
  12. 302 0
      shapes.hpp
  13. 85 0
      shapes.tcc

+ 2 - 2
Makefile

@@ -42,7 +42,7 @@ preproc.o: rdpf.tcc mpcops.hpp mpcops.tcc cdpf.hpp cdpf.tcc
 online.o: online.hpp mpcio.hpp types.hpp bitutils.hpp corotypes.hpp mpcio.tcc
 online.o: options.hpp mpcops.hpp coroutine.hpp mpcops.tcc rdpf.hpp dpf.hpp
 online.o: prg.hpp aes.hpp rdpf.tcc duoram.hpp duoram.tcc cdpf.hpp cdpf.tcc
-online.o: cell.hpp
+online.o: cell.hpp shapes.hpp shapes.tcc
 mpcops.o: mpcops.hpp types.hpp bitutils.hpp mpcio.hpp corotypes.hpp mpcio.tcc
 mpcops.o: coroutine.hpp mpcops.tcc
 rdpf.o: rdpf.hpp mpcio.hpp types.hpp bitutils.hpp corotypes.hpp mpcio.tcc
@@ -51,7 +51,7 @@ cdpf.o: bitutils.hpp cdpf.hpp mpcio.hpp types.hpp corotypes.hpp mpcio.tcc
 cdpf.o: coroutine.hpp dpf.hpp prg.hpp aes.hpp cdpf.tcc
 duoram.o: duoram.hpp types.hpp bitutils.hpp mpcio.hpp corotypes.hpp mpcio.tcc
 duoram.o: coroutine.hpp duoram.tcc mpcops.hpp mpcops.tcc cdpf.hpp dpf.hpp
-duoram.o: prg.hpp aes.hpp cdpf.tcc rdpf.hpp rdpf.tcc
+duoram.o: prg.hpp aes.hpp cdpf.tcc rdpf.hpp rdpf.tcc shapes.hpp shapes.tcc
 cell.o: types.hpp bitutils.hpp duoram.hpp mpcio.hpp corotypes.hpp mpcio.tcc
 cell.o: coroutine.hpp duoram.tcc mpcops.hpp mpcops.tcc cdpf.hpp dpf.hpp
 cell.o: prg.hpp aes.hpp cdpf.tcc rdpf.hpp rdpf.tcc cell.hpp options.hpp

+ 5 - 0
cell.cpp

@@ -22,6 +22,11 @@ void cell(MPCIO &mpcio,
         size_t size = size_t(1)<<depth;
         Duoram<Cell> oram(tio.player(), size);
         auto A = oram.flat(tio, yield);
+        Cell init;
+        init.key.set(0xffffffffffffffff);
+        init.pointers.set(0xeeeeeeeeeeeeeeee);
+        init.value.set(0xdddddddddddddddd);
+        A.init(init);
         Cell c;
         c.key.set(0x0102030405060708);
         c.pointers.set(0x1112131415161718);

+ 99 - 20
duoram.cpp

@@ -1,51 +1,70 @@
 #include "duoram.hpp"
+#include "shapes.hpp"
 
 // Assuming the memory is already sorted, do an oblivious binary
-// search for the largest index containing the value at most the
-// given one.  (The answer will be 0 if all of the memory elements
-// are greate than the target.) This Flat must be a power of 2 size.
-// Only available for additive shared databases for now.
+// search for the smallest index containing the value at least the
+// given one.  (The answer will be the length of the Shape if all
+// elements are smaller than the target.) Only available for additive
+// shared databases for now.
+
+// The basic version uses log(N) ORAM reads of size N, where N is the
+// smallest power of 2 strictly larger than the Shape size
 template <>
-RegAS Duoram<RegAS>::Flat::obliv_binary_search(RegAS &target)
+RegAS Duoram<RegAS>::Shape::basic_binary_search(RegAS &target)
 {
-    nbits_t depth = this->addr_size;
+    if (this->shape_size == 0) {
+        RegAS zero;
+        return zero;
+    }
+    // Create a Pad of the smallest power of 2 size strictly greater
+    // than the Shape size
+    address_t padsize = 1;
+    nbits_t depth = 0;
+    while (padsize <= this->shape_size) {
+        padsize *= 2;
+        ++depth;
+    }
+    Duoram<RegAS>::Pad P(*this, tio, yield, padsize);
+
     // Start in the middle
     RegAS index;
-    index.set(this->tio.player() ? 0 : 1<<(depth-1));
-    // Invariant: index points to the first element of the right half of
-    // the remaining possible range
+    index.set(this->tio.player() ? 0 : (1<<(depth-1))-1);
+    // Invariant: index points to the last element of the left half of
+    // the remaining possible range, which is of width (1<<depth).
     while (depth > 0) {
         // Obliviously read the value there
-        RegAS val = operator[](index);
+        RegAS val = P[index];
         // Compare it to the target
         CDPF cdpf = tio.cdpf(this->yield);
         auto [lt, eq, gt] = cdpf.compare(this->tio, this->yield,
             val-target, tio.aes_ops());
         if (depth > 1) {
-            // If val > target, the answer is strictly to the left
+            // If val >= target, the answer is here or to the left
             // and we should subtract 2^{depth-2} from index
-            // If val <= target, the answer is here or to the right
+            // If val < target, the answer is strictly to the right
             // and we should add 2^{depth-2} to index
             // So we unconditionally subtract 2^{depth-2} from index, and
-            // add (lt+eq)*2^{depth-1}.
+            // add (lt)*2^{depth-1}.
             RegAS uncond;
             uncond.set(tio.player() ? 0 : address_t(1)<<(depth-2));
             RegAS cond;
             cond.set(tio.player() ? 0 : address_t(1)<<(depth-1));
             RegAS condprod;
-            RegBS le = lt ^ eq;
-            mpc_flagmult(this->tio, this->yield, condprod, le, cond);
+            mpc_flagmult(this->tio, this->yield, condprod, lt, cond);
             index -= uncond;
             index += condprod;
         } else {
-            // If val > target, the answer is strictly to the left
-            // If val <= target, the answer is here or to the right
-            // so subtract gt from index
+            // The possible range is of width 2, and we're pointing to
+            // the first element of it.
+            // If val >= target, the answer is here or to the left, so
+            // it's here.
+            // If val < target, the answer is strictly to the right
+            // so add lt to index
             RegAS cond;
             cond.set(tio.player() ? 0 : 1);
             RegAS condprod;
-            mpc_flagmult(this->tio, this->yield, condprod, gt, cond);
-            index -= condprod;
+            mpc_flagmult(this->tio, this->yield, condprod, lt, cond);
+            index += condprod;
         }
         --depth;
     }
@@ -53,3 +72,63 @@ RegAS Duoram<RegAS>::Flat::obliv_binary_search(RegAS &target)
     return index;
 }
 
+// This version does 1 ORAM read of size 2, 1 of size 4, 1 of size
+// 8, ..., 1 of size N/2, where N is the smallest power of 2 strictly
+// larger than the Shape size
+template <>
+RegXS Duoram<RegAS>::Shape::binary_search(RegAS &target)
+{
+    if (this->shape_size == 0) {
+        RegXS zero;
+        return zero;
+    }
+    // Create a Pad of the smallest power of 2 size strictly greater
+    // than the Shape size
+    address_t padsize = 1;
+    nbits_t depth = 0;
+    while (padsize <= this->shape_size) {
+        padsize *= 2;
+        ++depth;
+    }
+    Duoram<RegAS>::Pad P(*this, tio, yield, padsize);
+    // Explicitly read the middle item
+    address_t mid = (1<<(depth-1))-1;
+    RegAS val = P[mid];
+    // Compare it to the target
+    CDPF cdpf = tio.cdpf(this->yield);
+    auto [lt, eq, gt] = cdpf.compare(this->tio, this->yield,
+        val-target, tio.aes_ops());
+    if (depth == 1) {
+        // There was only one item in the Shape, and mid will equal 0, so
+        // val is (a share of) that item, P[0].  If val >= target, the
+        // answer is here or to the left, so it must be 0.  If val <
+        // target, the answer is strictly to the right, so it must be 1.
+        // So just return lt.
+        return RegXS(lt);
+    }
+    auto oidx = P.oblivindex(depth-1);
+    oidx.incr(lt);
+    --depth;
+    while(depth > 0) {
+        // Create the Stride shape; the ORAM will operate only over
+        // elements of the Stride, which will consist of exactly those
+        // elements of the Pad we could possibly be accessing at this
+        // depth.  Those will be elements start=(1<<(depth-1))-1,
+        // start+(1<<depth), start+(2<<depth), start+(3<<depth), and so
+        // on.  The invariant is that the range of remaining possible
+        // answers is of width (1<<depth), and we will look at the
+        // rightmost element of the left half.  If that value (val) has
+        // val >= target, then the answer is at that position or to the
+        // left, so we append a 0 to the index.  If val < target, then
+        // the answer is strictly to the right, so we append a 1 to the
+        // index.  That is, always append lt to the index.
+        Duoram<RegAS>::Stride S(P, tio, yield, (1<<(depth-1))-1, 1<<depth);
+        RegAS val = S[oidx];
+        CDPF cdpf = tio.cdpf(this->yield);
+        auto [lt, eq, gt] = cdpf.compare(this->tio, this->yield,
+            val-target, tio.aes_ops());
+        oidx.incr(lt);
+        --depth;
+    }
+    return oidx.index();
+}

+ 299 - 71
duoram.hpp

@@ -1,6 +1,9 @@
 #ifndef __DUORAM_HPP__
 #define __DUORAM_HPP__
 
+#include <optional>
+#include <functional>
+
 #include "types.hpp"
 #include "mpcio.hpp"
 #include "coroutine.hpp"
@@ -22,10 +25,8 @@
 // on a Shape shared with other threads or coroutines.
 
 // This is templated, because you can have a Duoram of additively shared
-// (RegAS) or XOR shared (RegXS) elements, or std::arrays of those to
-// get "wide" memory cells.
-
-// The initial implementation is focused on additive shares.
+// (RegAS) or XOR shared (RegXS) elements, or more complex cell types
+// (see cell.hpp for example).
 
 template <typename T>
 class Duoram {
@@ -58,6 +59,13 @@ public:
     class Shape;
     // These are the different Shapes that exist
     class Flat;
+    class Pad;
+    class Stride;
+    class Path;
+
+    // Oblivious indices for use in related-index ORAM accesses
+    template <typename U, nbits_t WIDTH>
+    class OblivIndex;
 
     // Pass the player number and desired size
     Duoram(int player, size_t size);
@@ -80,8 +88,15 @@ public:
 
 template <typename T>
 class Duoram<T>::Shape {
-    // Subclasses should be able to access _other_ Shapes' indexmap
+    // Subclasses should be able to access _other_ Shapes'
+    // get_{comp,server} functions
     friend class Flat;
+    friend class Pad;
+    friend class Stride;
+    friend class Path;
+
+    template <typename U, nbits_t WIDTH>
+    friend class OblivIndex;
 
     // When you index into a shape (A[x]), you get one of these types,
     // depending on the type of x (the index), _not_ on the type T (the
@@ -97,8 +112,8 @@ class Duoram<T>::Shape {
     // a particular field of T, then FT will be the type of the field
     // (RegAS or RegXS) and FST will be a pointer-to-member T::* type
     // pointing to that field.  Sh is the specific Shape subtype used to
-    // create the MemRefS.
-    template <typename U, typename FT, typename FST, typename Sh>
+    // create the MemRefS.  WIDTH is the RDPF width to use.
+    template <typename U, typename FT, typename FST, typename Sh, nbits_t WIDTH>
     class MemRefS;
     // When x is unshared explicit value.  FT and FST are as above.
     template <typename FT, typename FST>
@@ -161,12 +176,8 @@ protected:
         explicitmode(copy_from.explicitmode) {}
 
     // The index-mapping function. Input the index relative to this
-    // shape, and output the corresponding physical address.  The
-    // strategy is to map the index relative to this shape to the index
-    // relative to the parent shape, call the parent's indexmap function
-    // on that (unless this is the topmost shape), and return what it
-    // returns.  If this is the topmost shape, just return what you
-    // would have passed to the parent's indexmap.
+    // shape, and output the corresponding index relative to the parent
+    // shape.
     //
     // This is a pure virtual function; all subclasses of Shape must
     // implement it, and of course Shape itself therefore cannot be
@@ -174,55 +185,156 @@ protected:
     virtual size_t indexmap(size_t idx) const = 0;
 
     // Get a pair (for the server) of references to the underlying
-    // Duoram entries at share virtual index idx.  (That is, it gets
-    // duoram.p0_blind[indexmap(idx)], etc.)
-    inline std::tuple<T&,T&> get_server(size_t idx,
+    // Duoram entries at share virtual index idx.
+    virtual inline std::tuple<T&,T&> get_server(size_t idx,
         std::nullopt_t null = std::nullopt) const {
-        size_t physaddr = indexmap(idx);
-        return std::tie(
-            duoram.p0_blind[physaddr],
-            duoram.p1_blind[physaddr]);
+        size_t parindex = indexmap(idx);
+        if (&(this->parent) == this) {
+            return std::tie(
+                duoram.p0_blind[parindex],
+                duoram.p1_blind[parindex]);
+        } else {
+            return this->parent.get_server(parindex, null);
+        }
     }
 
     // Get a triple (for the computational players) of references to the
-    // underlying Duoram entries at share virtual index idx.  (That is,
-    // it gets duoram.database[indexmap(idx)], etc.)
-    inline std::tuple<T&,T&,T&> get_comp(size_t idx,
+    // underlying Duoram entries at share virtual index idx.
+    virtual inline std::tuple<T&,T&,T&> get_comp(size_t idx,
         std::nullopt_t null = std::nullopt) const {
-        size_t physaddr = indexmap(idx);
-        return std::tie(
-            duoram.database[physaddr],
-            duoram.blind[physaddr],
-            duoram.peer_blinded_db[physaddr]);
+        size_t parindex = indexmap(idx);
+        if (&(this->parent) == this) {
+            return std::tie(
+                duoram.database[parindex],
+                duoram.blind[parindex],
+                duoram.peer_blinded_db[parindex]);
+        } else {
+            return this->parent.get_comp(parindex, null);
+        }
     }
 
     // Get a pair (for the server) of references to a particular field
     // of the underlying Duoram entries at share virtual index idx.
-    // (That is, it gets duoram.p0_blind[indexmap(idx)].field, etc.)
     template <typename FT>
     inline std::tuple<FT&,FT&> get_server(size_t idx, FT T::*field) const {
-        size_t physaddr = indexmap(idx);
-        return std::tie(
-            duoram.p0_blind[physaddr].*field,
-            duoram.p1_blind[physaddr].*field);
+        size_t parindex = indexmap(idx);
+        if (&(this->parent) == this) {
+            return std::tie(
+                duoram.p0_blind[parindex].*field,
+                duoram.p1_blind[parindex].*field);
+        } else {
+            return this->parent.get_server(parindex, field);
+        }
     }
 
     // Get a triple (for the computational players) of references to a
     // particular field to the underlying Duoram entries at share
-    // virtual index idx.  (That is, it gets
-    // duoram.database[indexmap(idx)].field, etc.)
+    // virtual index idx.
     template <typename FT>
     inline std::tuple<FT&,FT&,FT&> get_comp(size_t idx, FT T::*field) const {
-        size_t physaddr = indexmap(idx);
-        return std::tie(
-            duoram.database[physaddr].*field,
-            duoram.blind[physaddr].*field,
-            duoram.peer_blinded_db[physaddr].*field);
+        size_t parindex = indexmap(idx);
+        if (&(this->parent) == this) {
+            return std::tie(
+                duoram.database[parindex].*field,
+                duoram.blind[parindex].*field,
+                duoram.peer_blinded_db[parindex].*field);
+        } else {
+            return this->parent.get_comp(parindex, field);
+        }
     }
 
 public:
     // Get the size
-    inline size_t size() { return shape_size; }
+    inline size_t size() const { return shape_size; }
+
+    // Initialize the contents of the Shape to a constant.  This method
+    // does no communication; all the operations are local.  This only
+    // works for T=RegXS or RegAS.
+    void init(size_t value) {
+        T v;
+        v.set(value);
+        init([v] (size_t i) { return v; });
+    }
+
+    // As above, but for general T
+    void init(const T &value) {
+        init([value] (size_t i) { return value; });
+    }
+
+    // As above, but use the default initializer for T (probably sets
+    // everything to 0).
+    void init() {
+        T deflt;
+        init(deflt);
+    }
+
+    // Pass a function f: size_t -> size_t, and initialize element i of the
+    // Shape to f(i) for each i.  This method does no communication; all
+    // the operations are local.  This function must be deterministic
+    // and public.  Only works for T=RegAS or RegXS.
+    void init(std::function<size_t(size_t)> f) {
+        int player = tio.player();
+        if (player < 2) {
+            for (size_t i=0; i<shape_size; ++i) {
+                auto [DB, BL, PBD] = get_comp(i);
+                BL.set(0);
+                if (player) {
+                    DB.set(f(i));
+                    PBD.set(0);
+                } else {
+                    DB.set(0);
+                    PBD.set(f(i));
+                }
+            }
+        } else {
+            for (size_t i=0; i<shape_size; ++i) {
+                auto [BL0, BL1] = get_server(i);
+                BL0.set(0);
+                BL1.set(0);
+            }
+        }
+    }
+
+    // Pass a function f: size_t -> T, and initialize element i of the
+    // Shape to f(i) for each i.  This method does no communication; all
+    // the operations are local.  This function must be deterministic
+    // and public.
+    void init(std::function<T(size_t)> f) {
+        int player = tio.player();
+        if (player < 2) {
+            for (size_t i=0; i<shape_size; ++i) {
+                auto [DB, BL, PBD] = get_comp(i);
+                BL = T();
+                if (player) {
+                    DB = f(i);
+                    PBD = T();
+                } else {
+                    DB = T();
+                    PBD = f(i);
+                }
+            }
+        } else {
+            for (size_t i=0; i<shape_size; ++i) {
+                auto [BL0, BL1] = get_server(i);
+                BL0 = T();
+                BL1 = T();
+            }
+        }
+    }
+
+    // Assuming the Shape is already sorted, do an oblivious binary
+    // search for the smallest index containing the value at least the
+    // given one.  (The answer will be the length of the Shape if all
+    // elements are smaller than the target.) Only available for additive
+    // shared databases for now.
+
+    // The basic version uses log(N) ORAM reads of size N, where N is
+    // the smallest power of 2 strictly larger than the Shape size
+    RegAS basic_binary_search(RegAS &target);
+    // This version does 1 ORAM read of size 2, 1 of size 4, 1 of size
+    // 8, ..., 1 of size N/2, where N is the smallest power of 2
+    // strictly larger than the Shape size
+    RegXS binary_search(RegAS &target);
 
     // Enable or disable explicit-only mode.  Only using [] with
     // explicit (address_t) indices are allowed in this mode.  Using []
@@ -236,6 +348,40 @@ public:
     // next oblivious read or write.  Bitonic sort is a prime example.
     void explicitonly(bool enable);
 
+    // Create an OblivIndex, non-incrementally (supply the shares of the
+    // index directly) or incrementally (the bits of the index will be
+    // supplied later, one at a time)
+
+    // Non-incremental, RegXS index
+    OblivIndex<RegXS,1> oblivindex(const RegXS &idx, nbits_t depth=0) {
+        if (depth == 0) {
+            depth = this->addr_size;
+        }
+        typename Duoram<T>::template OblivIndex<RegXS,1>
+            res(this->tio, this->yield, idx, depth);
+        return res;
+    }
+
+    // Non-incremental, RegAS index
+    OblivIndex<RegAS,1> oblivindex(const RegAS &idx, nbits_t depth=0) {
+        if (depth == 0) {
+            depth = this->addr_size;
+        }
+        typename Duoram<T>::template OblivIndex<RegAS,1>
+            res(this->tio, this->yield, idx, depth);
+        return res;
+    }
+
+    // Incremental (requires RegXS index, supplied bit-by-bit later)
+    OblivIndex<RegXS,1> oblivindex(nbits_t depth=0) {
+        if (depth == 0) {
+            depth = this->addr_size;
+        }
+        typename Duoram<T>::template OblivIndex<RegXS,1>
+            res(this->tio, this->yield, depth);
+        return res;
+    }
+
     // For debugging or checking your answers (using this in general is
     // of course insecure)
 
@@ -258,15 +404,11 @@ class Duoram<T>::Flat : public Duoram<T>::Shape {
 
     inline size_t indexmap(size_t idx) const {
         size_t paridx = idx + start;
-        if (&(this->parent) == this) {
-            return paridx;
-        } else {
-            return this->parent.indexmap(paridx);
-        }
+        return paridx;
     }
 
     // Internal function to aid bitonic_sort
-    void butterfly(address_t start, nbits_t depth, bool dir);
+    void butterfly(address_t start, address_t len, bool dir);
 
 public:
     // Constructor.  len=0 means the maximum size (the parent's size
@@ -274,6 +416,11 @@ public:
     Flat(Duoram &duoram, MPCTIO &tio, yield_t &yield, size_t start = 0,
         size_t len = 0);
 
+    // Constructor.  len=0 means the maximum size (the parent's size
+    // minus start).
+    Flat(const Shape &parent, MPCTIO &tio, yield_t &yield, size_t start = 0,
+        size_t len = 0);
+
     // Copy the given Flat except for the tio and yield
     Flat(const Flat &copy_from, MPCTIO &tio, yield_t &yield) :
         Shape(copy_from, tio, yield), start(copy_from.start),
@@ -290,20 +437,28 @@ public:
     }
 
     // Index into this Flat in various ways
-    typename Duoram::Shape::template MemRefS<RegAS,T,std::nullopt_t,Flat>
+    typename Duoram::Shape::template MemRefS<RegAS,T,std::nullopt_t,Flat,1>
             operator[](const RegAS &idx) {
         typename Duoram<T>::Shape::
-            template MemRefS<RegAS,T,std::nullopt_t,Flat>
+            template MemRefS<RegAS,T,std::nullopt_t,Flat,1>
             res(*this, idx, std::nullopt);
         return res;
     }
-    typename Duoram::Shape::template MemRefS<RegXS,T,std::nullopt_t,Flat>
+    typename Duoram::Shape::template MemRefS<RegXS,T,std::nullopt_t,Flat,1>
             operator[](const RegXS &idx) {
         typename Duoram<T>::Shape::
-            template MemRefS<RegXS,T,std::nullopt_t,Flat>
+            template MemRefS<RegXS,T,std::nullopt_t,Flat,1>
             res(*this, idx, std::nullopt);
         return res;
     }
+    template <typename U, nbits_t WIDTH>
+    typename Duoram::Shape::template MemRefS<U,T,std::nullopt_t,Flat,WIDTH>
+            operator[](OblivIndex<U,WIDTH> &obidx) {
+        typename Duoram<T>::Shape::
+            template MemRefS<RegXS,T,std::nullopt_t,Flat,WIDTH>
+            res(*this, obidx, std::nullopt);
+        return res;
+    }
     typename Duoram::Shape::template MemRefExpl<T,std::nullopt_t>
             operator[](address_t idx) {
         typename Duoram<T>::Shape::
@@ -338,18 +493,72 @@ public:
     template<typename U,typename V>
     void osort(const U &idx1, const V &idx2, bool dir=0);
 
-    // Bitonic sort the elements from start to start+(1<<depth)-1, in
+    // Bitonic sort the elements from start to start+len-1, in
     // increasing order if dir=0 or decreasing order if dir=1. Note that
     // the elements must be at most 63 bits long each for the notion of
     // ">" to make consistent sense.
-    void bitonic_sort(address_t start, nbits_t depth, bool dir=0);
-
-    // Assuming the memory is already sorted, do an oblivious binary
-    // search for the largest index containing the value at most the
-    // given one.  (The answer will be 0 if all of the memory elements
-    // are greate than the target.) This Flat must be a power of 2 size.
-    // Only available for additive shared databases for now.
-    RegAS obliv_binary_search(RegAS &target);
+    void bitonic_sort(address_t start, address_t len, bool dir=0);
+};
+
+// Oblivious indices for use in related-index ORAM accesses.
+
+template <typename T>
+template <typename U, nbits_t WIDTH>
+class Duoram<T>::OblivIndex {
+    template <typename Ux,typename FT,typename FST,typename Sh,nbits_t WIDTHx>
+    friend class Shape::MemRefS;
+
+    int player;
+    std::optional<RDPFTriple<WIDTH>> dt;
+    std::optional<RDPFPair<WIDTH>> dp;
+    nbits_t curdepth, maxdepth;
+    nbits_t next_windex;
+    bool incremental;
+    U idx;
+
+public:
+    // Non-incremental constructor
+    OblivIndex(MPCTIO &tio, yield_t &yield, const U &idx, nbits_t depth) :
+        player(tio.player()), curdepth(depth), maxdepth(depth),
+        next_windex(0), incremental(false), idx(idx)
+    {
+        if (player < 2) {
+            dt = tio.rdpftriple<WIDTH>(yield, depth);
+        } else {
+            dp = tio.rdpfpair<WIDTH>(yield, depth);
+        }
+    }
+
+    // Incremental constructor: only for U=RegXS
+    OblivIndex(MPCTIO &tio, yield_t &yield, nbits_t depth) :
+        player(tio.player()), curdepth(0), maxdepth(depth),
+        next_windex(0), incremental(true), idx(RegXS())
+    {
+        if (player < 2) {
+            dt = tio.rdpftriple(yield, depth, true);
+        } else {
+            dp = tio.rdpfpair(yield, depth, true);
+        }
+    }
+
+    // Incrementally append a (shared) bit to the oblivious index
+    void incr(RegBS bit)
+    {
+        assert(incremental);
+        idx.xshare = (idx.xshare << 1) | value_t(bit.bshare);
+        ++curdepth;
+        if (player < 2) {
+            dt->depth(curdepth);
+        } else {
+            dp->depth(curdepth);
+        }
+    }
+
+    // Get a copy of the index
+    U index() { return idx; }
+
+    // Get the next wide-RDPF index
+    nbits_t windex() { assert(next_windex < WIDTH); return next_windex++; }
 };
 
 // An additive or XOR shared memory reference.  You get one of these
@@ -363,31 +572,50 @@ public:
 // particular field of T, then FT will be the type of the field (RegAS
 // or RegXS) and FST will be a pointer-to-member T::* type pointing to
 // that field.  Sh is the specific Shape subtype used to create the
-// MemRefS.
+// MemRefS.  WIDTH is the RDPF width to use.
 
 template <typename T>
-template <typename U, typename FT, typename FST, typename Sh>
+template <typename U, typename FT, typename FST, typename Sh, nbits_t WIDTH>
 class Duoram<T>::Shape::MemRefS {
     Sh &shape;
-    U idx;
+    // oblividx is a reference to the OblivIndex we're using.  In the
+    // common case, we own the actual OblivIndex, and it's stored in
+    // our_oblividx, and oblividx is a pointer to that.  Sometimes
+    // (for example incremental ORAM accesses), the caller will own (and
+    // modify between uses) the OblivIndex.  In that case, oblividx will
+    // be a pointer to the caller's OblivIndex object, and
+    // our_oblividx will be nullopt.
+    std::optional<Duoram<T>::OblivIndex<U,WIDTH>> our_oblividx;
+    Duoram<T>::OblivIndex<U,WIDTH> *oblividx;
+
     FST fieldsel;
 
 private:
     // Oblivious update to a shared index of Duoram memory, only for
     // FT = RegAS or RegXS
-    MemRefS<U,FT,FST,Sh> &oram_update(const FT& M, const prac_template_true&);
+    MemRefS<U,FT,FST,Sh,WIDTH> &oram_update(const FT& M, const prac_template_true&);
     // Oblivious update to a shared index of Duoram memory, for
     // FT not RegAS or RegXS
-    MemRefS<U,FT,FST,Sh> &oram_update(const FT& M, const prac_template_false&);
+    MemRefS<U,FT,FST,Sh,WIDTH> &oram_update(const FT& M, const prac_template_false&);
 
 public:
-    MemRefS<U,FT,FST,Sh>(Sh &shape, const U &idx, FST fieldsel) :
-        shape(shape), idx(idx), fieldsel(fieldsel) {}
+    MemRefS<U,FT,FST,Sh,WIDTH>(Sh &shape, const U &idx, FST fieldsel) :
+        shape(shape), fieldsel(fieldsel) {
+        our_oblividx.emplace(shape.tio, shape.yield, idx,
+            shape.addr_size);
+        oblividx = &(*our_oblividx);
+    }
+
+    MemRefS<U,FT,FST,Sh,WIDTH>(Sh &shape, OblivIndex<U,WIDTH> &obidx, FST fieldsel) :
+        shape(shape), fieldsel(fieldsel) {
+        oblividx = &obidx;
+    }
 
     // Create a MemRefS for accessing a particular field of T
     template <typename SFT>
-    MemRefS<U,SFT,SFT T::*,Sh> field(SFT T::*subfieldsel) {
-        auto res = MemRefS<U,SFT,SFT T::*,Sh>(this->shape, idx, subfieldsel);
+    MemRefS<U,SFT,SFT T::*,Sh,WIDTH> field(SFT T::*subfieldsel) {
+        auto res = MemRefS<U,SFT,SFT T::*,Sh,WIDTH>(this->shape,
+            *oblividx, subfieldsel);
         return res;
     }
 
@@ -395,10 +623,10 @@ public:
     operator FT();
 
     // Oblivious update to a shared index of Duoram memory
-    MemRefS<U,FT,FST,Sh> &operator+=(const FT& M);
+    MemRefS<U,FT,FST,Sh,WIDTH> &operator+=(const FT& M);
 
     // Oblivious write to a shared index of Duoram memory
-    MemRefS<U,FT,FST,Sh> &operator=(const FT& M);
+    MemRefS<U,FT,FST,Sh,WIDTH> &operator=(const FT& M);
 };
 
 // An explicit memory reference.  You get one of these from a Shape A

+ 118 - 82
duoram.tcc

@@ -177,62 +177,94 @@ Duoram<T>::Flat::Flat(Duoram &duoram, MPCTIO &tio, yield_t &yield,
     this->set_shape_size(len);
 }
 
-// Bitonic sort the elements from start to start+(1<<depth)-1, in
+// Constructor for the Flat shape.  len=0 means the maximum size (the
+// parent's size minus start).
+template <typename T>
+Duoram<T>::Flat::Flat(const Shape &parent, MPCTIO &tio, yield_t &yield,
+    size_t start, size_t len) : Shape(parent, parent.duoram, tio, yield)
+{
+    size_t parentsize = parent.size();
+    if (start > parentsize) {
+        start = parentsize;
+    }
+    this->start = start;
+    size_t maxshapesize = parentsize - start;
+    if (len > maxshapesize || len == 0) {
+        len = maxshapesize;
+    }
+    this->len = len;
+    this->set_shape_size(len);
+}
+
+// Bitonic sort the elements from start to start+len-1, in
 // increasing order if dir=0 or decreasing order if dir=1. Note that
 // the elements must be at most 63 bits long each for the notion of
 // ">" to make consistent sense.
 template <typename T>
-void Duoram<T>::Flat::bitonic_sort(address_t start, nbits_t depth, bool dir)
+void Duoram<T>::Flat::bitonic_sort(address_t start, address_t len, bool dir)
 {
-    if (depth == 0) return;
-    if (depth == 1) {
+    if (len < 2) return;
+    if (len == 2) {
         osort(start, start+1, dir);
         return;
     }
-    // Recurse on the first half (increasing order) and the second half
-    // (decreasing order) in parallel
+    address_t leftlen, rightlen;
+    leftlen = (len+1) >> 1;
+    rightlen = len >> 1;
+
+    // Recurse on the first half (opposite to the desired order)
+    // and the second half (desired order) in parallel
     run_coroutines(this->yield,
-        [this, start, depth](yield_t &yield) {
+        [this, start, leftlen, dir](yield_t &yield) {
             Flat Acoro = context(yield);
-            Acoro.bitonic_sort(start, depth-1, 0);
+            Acoro.bitonic_sort(start, leftlen, !dir);
         },
-        [this, start, depth](yield_t &yield) {
+        [this, start, leftlen, rightlen, dir](yield_t &yield) {
             Flat Acoro = context(yield);
-            Acoro.bitonic_sort(start+(1<<(depth-1)), depth-1, 1);
+            Acoro.bitonic_sort(start+leftlen, rightlen, dir);
         });
     // Merge the two into the desired order
-    butterfly(start, depth, dir);
+    butterfly(start, len, dir);
 }
 
 // Internal function to aid bitonic_sort
 template <typename T>
-void Duoram<T>::Flat::butterfly(address_t start, nbits_t depth, bool dir)
+void Duoram<T>::Flat::butterfly(address_t start, address_t len, bool dir)
 {
-    if (depth == 0) return;
-    if (depth == 1) {
+    if (len < 2) return;
+    if (len == 2) {
         osort(start, start+1, dir);
         return;
     }
-    // Sort pairs of elements half the width apart in parallel
-    address_t halfwidth = address_t(1)<<(depth-1);
+    address_t leftlen, rightlen, offset, num_swaps;
+    // leftlen = (len+1) >> 1;
+    leftlen = 1;
+    while(2*leftlen < len) {
+        leftlen *= 2;
+    }
+    rightlen = len - leftlen;
+    offset = leftlen;
+    num_swaps = rightlen;
+
+    // Sort pairs of elements offset apart in parallel
     std::vector<coro_t> coroutines;
-    for (address_t i=0; i<halfwidth;++i) {
+    for (address_t i=0; i<num_swaps;++i) {
         coroutines.emplace_back(
-            [this, start, halfwidth, dir, i](yield_t &yield) {
+            [this, start, offset, dir, i](yield_t &yield) {
                 Flat Acoro = context(yield);
-                Acoro.osort(start+i, start+i+halfwidth, dir);
+                Acoro.osort(start+i, start+i+offset, dir);
             });
     }
     run_coroutines(this->yield, coroutines);
     // Recurse on each half in parallel
     run_coroutines(this->yield,
-        [this, start, depth, dir](yield_t &yield) {
+        [this, start, leftlen, dir](yield_t &yield) {
             Flat Acoro = context(yield);
-            Acoro.butterfly(start, depth-1, dir);
+            Acoro.butterfly(start, leftlen, dir);
         },
-        [this, start, halfwidth, depth, dir](yield_t &yield) {
+        [this, start, leftlen, rightlen, dir](yield_t &yield) {
             Flat Acoro = context(yield);
-            Acoro.butterfly(start+halfwidth, depth-1, dir);
+            Acoro.butterfly(start+leftlen, rightlen, dir);
         });
 }
 
@@ -260,11 +292,11 @@ inline address_t IfRegXS<RegXS>(address_t val) { return val; }
 // a particular field of T, then FT will be the type of the field (RegAS
 // or RegXS) and FST will be a pointer-to-member T::* type pointing to
 // that field.  Sh is the specific Shape subtype used to create the
-// MemRefS.
+// MemRefS.  WIDTH is the RDPF width to use.
 
 template <typename T>
-template <typename U,typename FT,typename FST,typename Sh>
-Duoram<T>::Shape::MemRefS<U,FT,FST,Sh>::operator FT()
+template <typename U,typename FT,typename FST,typename Sh,nbits_t WIDTH>
+Duoram<T>::Shape::MemRefS<U,FT,FST,Sh,WIDTH>::operator FT()
 {
     FT res;
     Sh &shape = this->shape;
@@ -273,30 +305,29 @@ Duoram<T>::Shape::MemRefS<U,FT,FST,Sh>::operator FT()
     if (player < 2) {
         // Computational players do this
 
-        RDPFTriple<1> dt = shape.tio.rdpftriple(shape.yield, shape.addr_size);
+        const RDPFTriple<WIDTH> &dt = *(oblividx->dt);
+        const nbits_t depth = dt.depth();
 
         // Compute the index offset
         U indoffset;
         dt.get_target(indoffset);
-        indoffset -= idx;
+        indoffset -= oblividx->idx;
 
         // We only need two of the DPFs for reading
-        RDPFPair<1> dp(std::move(dt), 0, player == 0 ? 2 : 1);
-        // The RDPFTriple dt is now broken, since we've moved things out
-        // of it.
+        RDPF2of3<WIDTH> dp(dt, 0, player == 0 ? 2 : 1);
 
         // Send it to the peer and the server
-        shape.tio.queue_peer(&indoffset, BITBYTES(shape.addr_size));
-        shape.tio.queue_server(&indoffset, BITBYTES(shape.addr_size));
+        shape.tio.queue_peer(&indoffset, BITBYTES(depth));
+        shape.tio.queue_server(&indoffset, BITBYTES(depth));
 
         shape.yield();
 
         // Receive the above from the peer
         U peerindoffset;
-        shape.tio.recv_peer(&peerindoffset, BITBYTES(shape.addr_size));
+        shape.tio.recv_peer(&peerindoffset, BITBYTES(depth));
 
         // Reconstruct the total offset
-        auto indshift = combine(indoffset, peerindoffset, shape.addr_size);
+        auto indshift = combine(indoffset, peerindoffset, depth);
 
         // Evaluate the DPFs and compute the dotproducts
         ParallelEval pe(dp, IfRegAS<U>(indshift), IfRegXS<U>(indshift),
@@ -304,7 +335,7 @@ Duoram<T>::Shape::MemRefS<U,FT,FST,Sh>::operator FT()
             shape.tio.aes_ops());
         FT init;
         res = pe.reduce(init, [this, &dp, &shape] (int thread_num,
-                address_t i, const RDPFPair<1>::LeafNode &leaf) {
+                address_t i, const typename RDPFPair<WIDTH>::LeafNode &leaf) {
             // The values from the two DPFs, which will each be of type T
             std::tuple<FT,FT> V;
             dp.unit(V, leaf);
@@ -324,16 +355,17 @@ Duoram<T>::Shape::MemRefS<U,FT,FST,Sh>::operator FT()
     } else {
         // The server does this
 
-        RDPFPair<1> dp = shape.tio.rdpfpair(shape.yield, shape.addr_size);
+        const RDPFPair<WIDTH> &dp = *(oblividx->dp);
+        const nbits_t depth = dp.depth();
         U p0indoffset, p1indoffset;
 
         shape.yield();
 
         // Receive the index offset from the computational players and
         // combine them
-        shape.tio.recv_p0(&p0indoffset, BITBYTES(shape.addr_size));
-        shape.tio.recv_p1(&p1indoffset, BITBYTES(shape.addr_size));
-        auto indshift = combine(p0indoffset, p1indoffset, shape.addr_size);
+        shape.tio.recv_p0(&p0indoffset, BITBYTES(depth));
+        shape.tio.recv_p1(&p1indoffset, BITBYTES(depth));
+        auto indshift = combine(p0indoffset, p1indoffset, depth);
 
         // Evaluate the DPFs to compute the cancellation terms
         std::tuple<FT,FT> init, gamma;
@@ -341,7 +373,7 @@ Duoram<T>::Shape::MemRefS<U,FT,FST,Sh>::operator FT()
             shape.shape_size, shape.tio.cpu_nthreads(),
             shape.tio.aes_ops());
         gamma = pe.reduce(init, [this, &dp, &shape] (int thread_num,
-                address_t i, const RDPFPair<1>::LeafNode &leaf) {
+                address_t i, const typename RDPFPair<WIDTH>::LeafNode &leaf) {
             // The values from the two DPFs, each of type FT
             std::tuple<FT,FT> V;
             dp.unit(V, leaf);
@@ -372,9 +404,9 @@ Duoram<T>::Shape::MemRefS<U,FT,FST,Sh>::operator FT()
 // Oblivious update to a shared index of Duoram memory, only for
 // FT = RegAS or RegXS.  The template parameters are as above.
 template <typename T>
-template <typename U, typename FT, typename FST, typename Sh>
-typename Duoram<T>::Shape::template MemRefS<U,FT,FST,Sh>
-    &Duoram<T>::Shape::MemRefS<U,FT,FST,Sh>::oram_update(const FT& M,
+template <typename U, typename FT, typename FST, typename Sh, nbits_t WIDTH>
+typename Duoram<T>::Shape::template MemRefS<U,FT,FST,Sh,WIDTH>
+    &Duoram<T>::Shape::MemRefS<U,FT,FST,Sh,WIDTH>::oram_update(const FT& M,
         const prac_template_true &)
 {
     Sh &shape = this->shape;
@@ -383,24 +415,26 @@ typename Duoram<T>::Shape::template MemRefS<U,FT,FST,Sh>
     if (player < 2) {
         // Computational players do this
 
-        RDPFTriple<1> dt = shape.tio.rdpftriple(shape.yield, shape.addr_size);
+        const RDPFTriple<WIDTH> &dt = *(oblividx->dt);
+        const nbits_t windex = oblividx->windex();
+        const nbits_t depth = dt.depth();
 
         // Compute the index and message offsets
         U indoffset;
         dt.get_target(indoffset);
-        indoffset -= idx;
-        RDPF<1>::W<FT> MW;
-        MW[0] = M;
+        indoffset -= oblividx->idx;
+        typename RDPF<WIDTH>::template W<FT> MW;
+        MW[windex] = M;
         auto Moffset = std::make_tuple(MW, MW, MW);
-        RDPFTriple<1>::WTriple<FT> scaled_val;
+        typename RDPFTriple<WIDTH>::template WTriple<FT> scaled_val;
         dt.scaled_value(scaled_val);
         Moffset -= scaled_val;
 
         // Send them to the peer, and everything except the first offset
         // to the server
-        shape.tio.queue_peer(&indoffset, BITBYTES(shape.addr_size));
+        shape.tio.queue_peer(&indoffset, BITBYTES(depth));
         shape.tio.iostream_peer() << Moffset;
-        shape.tio.queue_server(&indoffset, BITBYTES(shape.addr_size));
+        shape.tio.queue_server(&indoffset, BITBYTES(depth));
         shape.tio.iostream_server() << std::get<1>(Moffset) <<
             std::get<2>(Moffset);
 
@@ -408,12 +442,12 @@ typename Duoram<T>::Shape::template MemRefS<U,FT,FST,Sh>
 
         // Receive the above from the peer
         U peerindoffset;
-        RDPFTriple<1>::WTriple<FT> peerMoffset;
-        shape.tio.recv_peer(&peerindoffset, BITBYTES(shape.addr_size));
+        typename RDPFTriple<WIDTH>::template WTriple<FT> peerMoffset;
+        shape.tio.recv_peer(&peerindoffset, BITBYTES(depth));
         shape.tio.iostream_peer() >> peerMoffset;
 
         // Reconstruct the total offsets
-        auto indshift = combine(indoffset, peerindoffset, shape.addr_size);
+        auto indshift = combine(indoffset, peerindoffset, depth);
         auto Mshift = combine(Moffset, peerMoffset);
 
         // Evaluate the DPFs and add them to the database
@@ -421,10 +455,10 @@ typename Duoram<T>::Shape::template MemRefS<U,FT,FST,Sh>
             shape.shape_size, shape.tio.cpu_nthreads(),
             shape.tio.aes_ops());
         int init = 0;
-        pe.reduce(init, [this, &dt, &shape, &Mshift, player] (int thread_num,
-                address_t i, const RDPFTriple<1>::LeafNode &leaf) {
+        pe.reduce(init, [this, &dt, &shape, &Mshift, player, windex] (int thread_num,
+                address_t i, const typename RDPFTriple<WIDTH>::LeafNode &leaf) {
             // The values from the three DPFs
-            RDPFTriple<1>::WTriple<FT> scaled;
+            typename RDPFTriple<WIDTH>::template WTriple<FT> scaled;
             std::tuple<FT,FT,FT> unit;
             dt.scaled(scaled, leaf);
             dt.unit(unit, leaf);
@@ -432,32 +466,34 @@ typename Duoram<T>::Shape::template MemRefS<U,FT,FST,Sh>
             // References to the appropriate cells in our database, our
             // blind, and our copy of the peer's blinded database
             auto [DB, BL, PBD] = shape.get_comp(i,fieldsel);
-            DB += V0[0];
+            DB += V0[windex];
             if (player == 0) {
-                BL -= V1[0];
-                PBD += V2[0]-V0[0];
+                BL -= V1[windex];
+                PBD += V2[windex]-V0[windex];
             } else {
-                BL -= V2[0];
-                PBD += V1[0]-V0[0];
+                BL -= V2[windex];
+                PBD += V1[windex]-V0[windex];
             }
             return 0;
         });
     } else {
         // The server does this
 
-        RDPFPair<1> dp = shape.tio.rdpfpair(shape.yield, shape.addr_size);
+        const RDPFPair<WIDTH> &dp = *(oblividx->dp);
+        const nbits_t windex = oblividx->windex();
+        const nbits_t depth = dp.depth();
         U p0indoffset, p1indoffset;
-        RDPFPair<1>::WPair<FT> p0Moffset, p1Moffset;
+        typename RDPFPair<WIDTH>::template WPair<FT> p0Moffset, p1Moffset;
 
         shape.yield();
 
         // Receive the index and message offsets from the computational
         // players and combine them
-        shape.tio.recv_p0(&p0indoffset, BITBYTES(shape.addr_size));
+        shape.tio.recv_p0(&p0indoffset, BITBYTES(depth));
         shape.tio.iostream_p0() >> p0Moffset;
-        shape.tio.recv_p1(&p1indoffset, BITBYTES(shape.addr_size));
+        shape.tio.recv_p1(&p1indoffset, BITBYTES(depth));
         shape.tio.iostream_p1() >> p1Moffset;
-        auto indshift = combine(p0indoffset, p1indoffset, shape.addr_size);
+        auto indshift = combine(p0indoffset, p1indoffset, depth);
         auto Mshift = combine(p0Moffset, p1Moffset);
 
         // Evaluate the DPFs and subtract them from the blinds
@@ -465,10 +501,10 @@ typename Duoram<T>::Shape::template MemRefS<U,FT,FST,Sh>
             shape.shape_size, shape.tio.cpu_nthreads(),
             shape.tio.aes_ops());
         int init = 0;
-        pe.reduce(init, [this, &dp, &shape, &Mshift] (int thread_num,
-                address_t i, const RDPFPair<1>::LeafNode &leaf) {
+        pe.reduce(init, [this, &dp, &shape, &Mshift, windex] (int thread_num,
+                address_t i, const typename RDPFPair<WIDTH>::LeafNode &leaf) {
             // The values from the two DPFs
-            RDPFPair<1>::WPair<FT> scaled;
+            typename RDPFPair<WIDTH>::template WPair<FT> scaled;
             std::tuple<FT,FT> unit;
             dp.scaled(scaled, leaf);
             dp.unit(unit, leaf);
@@ -477,8 +513,8 @@ typename Duoram<T>::Shape::template MemRefS<U,FT,FST,Sh>
             // appropriate cells in the two blinded databases, so we can
             // subtract the pair directly.
             auto [BL0, BL1] = shape.get_server(i,fieldsel);
-            BL0 -= V0[0];
-            BL1 -= V1[0];
+            BL0 -= V0[windex];
+            BL1 -= V1[windex];
             return 0;
         });
     }
@@ -488,21 +524,21 @@ typename Duoram<T>::Shape::template MemRefS<U,FT,FST,Sh>
 // Oblivious update to a shared index of Duoram memory, only for
 // FT not RegAS or RegXS.  The template parameters are as above.
 template <typename T>
-template <typename U, typename FT, typename FST, typename Sh>
-typename Duoram<T>::Shape::template MemRefS<U,FT,FST,Sh>
-    &Duoram<T>::Shape::MemRefS<U,FT,FST,Sh>::oram_update(const FT& M,
+template <typename U, typename FT, typename FST, typename Sh, nbits_t WIDTH>
+typename Duoram<T>::Shape::template MemRefS<U,FT,FST,Sh,WIDTH>
+    &Duoram<T>::Shape::MemRefS<U,FT,FST,Sh,WIDTH>::oram_update(const FT& M,
         const prac_template_false &)
 {
-    T::update(shape, shape.yield, idx, M);
+    T::update(shape, shape.yield, oblividx->idx, M);
     return *this;
 }
 
 // Oblivious update to an additively or XOR shared index of Duoram
 // memory. The template parameters are as above.
 template <typename T>
-template <typename U, typename FT, typename FST, typename Sh>
-typename Duoram<T>::Shape::template MemRefS<U,FT,FST,Sh>
-    &Duoram<T>::Shape::MemRefS<U,FT,FST,Sh>::operator+=(const FT& M)
+template <typename U, typename FT, typename FST, typename Sh, nbits_t WIDTH>
+typename Duoram<T>::Shape::template MemRefS<U,FT,FST,Sh,WIDTH>
+    &Duoram<T>::Shape::MemRefS<U,FT,FST,Sh,WIDTH>::operator+=(const FT& M)
 {
     return oram_update(M, prac_basic_Reg_S<FT>());
 }
@@ -510,9 +546,9 @@ typename Duoram<T>::Shape::template MemRefS<U,FT,FST,Sh>
 // Oblivious write to an additively or XOR shared index of Duoram
 // memory. The template parameters are as above.
 template <typename T>
-template <typename U, typename FT, typename FST, typename Sh>
-typename Duoram<T>::Shape::template MemRefS<U,FT,FST,Sh>
-    &Duoram<T>::Shape::MemRefS<U,FT,FST,Sh>::operator=(const FT& M)
+template <typename U, typename FT, typename FST, typename Sh, nbits_t WIDTH>
+typename Duoram<T>::Shape::template MemRefS<U,FT,FST,Sh,WIDTH>
+    &Duoram<T>::Shape::MemRefS<U,FT,FST,Sh,WIDTH>::operator=(const FT& M)
 {
     FT oldval = *this;
     FT update = M - oldval;

+ 57 - 29
mpcio.cpp

@@ -460,7 +460,8 @@ MPCTIO::MPCTIO(MPCIO &mpcio, int thread_num, int num_threads) :
 #ifdef VERBOSE_COMMS
         round_num(0),
 #endif
-        last_andtriple_bits_remaining(0)
+        last_andtriple_bits_remaining(0),
+        remaining_nodesselecttriples(0)
 {
     if (mpcio.player < 2) {
         MPCPeerIO &mpcpio = static_cast<MPCPeerIO&>(mpcio);
@@ -726,44 +727,71 @@ MultTriple MPCTIO::andtriple(yield_t &yield)
     return val;
 }
 
-SelectTriple<DPFnode> MPCTIO::nodeselecttriple(yield_t &yield)
+void MPCTIO::request_nodeselecttriples(yield_t &yield, size_t num)
 {
-    SelectTriple<DPFnode> val;
     if (mpcio.player < 2) {
         MPCPeerIO &mpcpio = static_cast<MPCPeerIO&>(mpcio);
         if (mpcpio.mode != MODE_ONLINE) {
-            uint8_t Xbyte;
             yield();
-            recv_server(&Xbyte, sizeof(Xbyte));
-            val.X = Xbyte & 1;
-            recv_server(&val.Y, sizeof(val.Y));
-            recv_server(&val.Z, sizeof(val.Z));
+            for (size_t i=0; i<num; ++i) {
+                SelectTriple<DPFnode> v;
+                uint8_t Xbyte;
+                recv_server(&Xbyte, sizeof(Xbyte));
+                v.X = Xbyte & 1;
+                recv_server(&v.Y, sizeof(v.Y));
+                recv_server(&v.Z, sizeof(v.Z));
+                queued_nodeselecttriples.push_back(v);
+            }
+            remaining_nodesselecttriples += num;
         } else {
             std::cerr << "Attempted to read SelectTriple<DPFnode> in online phase\n";
         }
     } else if (mpcio.mode != MODE_ONLINE) {
-        // Create triples (X0,Y0,Z0),(X1,Y1,Z1) such that
-        // (X0*Y1 ^ Y0*X1) = (Z0^Z1)
-        bit_t X0, X1;
-        DPFnode Y0, Z0, Y1, Z1;
-        X0 = arc4random() & 1;
-        arc4random_buf(&Y0, sizeof(Y0));
-        arc4random_buf(&Z0, sizeof(Z0));
-        X1 = arc4random() & 1;
-        arc4random_buf(&Y1, sizeof(Y1));
-        DPFnode X0ext, X1ext;
-        // Sign-extend X0 and X1 (so that 0 -> 0000...0 and
-        // 1 -> 1111...1)
-        X0ext = if128_mask[X0];
-        X1ext = if128_mask[X1];
-        Z1 = ((X0ext & Y1) ^ (X1ext & Y0)) ^ Z0;
-        queue_p0(&X0, sizeof(X0));
-        queue_p0(&Y0, sizeof(Y0));
-        queue_p0(&Z0, sizeof(Z0));
-        queue_p1(&X1, sizeof(X1));
-        queue_p1(&Y1, sizeof(Y1));
-        queue_p1(&Z1, sizeof(Z1));
+        for (size_t i=0; i<num; ++i) {
+            // Create triples (X0,Y0,Z0),(X1,Y1,Z1) such that
+            // (X0*Y1 ^ Y0*X1) = (Z0^Z1)
+            bit_t X0, X1;
+            DPFnode Y0, Z0, Y1, Z1;
+            X0 = arc4random() & 1;
+            arc4random_buf(&Y0, sizeof(Y0));
+            arc4random_buf(&Z0, sizeof(Z0));
+            X1 = arc4random() & 1;
+            arc4random_buf(&Y1, sizeof(Y1));
+            DPFnode X0ext, X1ext;
+            // Sign-extend X0 and X1 (so that 0 -> 0000...0 and
+            // 1 -> 1111...1)
+            X0ext = if128_mask[X0];
+            X1ext = if128_mask[X1];
+            Z1 = ((X0ext & Y1) ^ (X1ext & Y0)) ^ Z0;
+            queue_p0(&X0, sizeof(X0));
+            queue_p0(&Y0, sizeof(Y0));
+            queue_p0(&Z0, sizeof(Z0));
+            queue_p1(&X1, sizeof(X1));
+            queue_p1(&Y1, sizeof(Y1));
+            queue_p1(&Z1, sizeof(Z1));
+        }
         yield();
+        remaining_nodesselecttriples += num;
+    }
+}
+
+SelectTriple<DPFnode> MPCTIO::nodeselecttriple(yield_t &yield)
+{
+    SelectTriple<DPFnode> val;
+    if (remaining_nodesselecttriples == 0) {
+        request_nodeselecttriples(yield, 1);
+    }
+    if (mpcio.player < 2) {
+        MPCPeerIO &mpcpio = static_cast<MPCPeerIO&>(mpcio);
+        if (mpcpio.mode != MODE_ONLINE) {
+            val = queued_nodeselecttriples.front();
+            queued_nodeselecttriples.pop_front();
+            --remaining_nodesselecttriples;
+        } else {
+            std::cerr << "Attempted to read SelectTriple<DPFnode> in online phase\n";
+        }
+    } else if (mpcio.mode != MODE_ONLINE) {
+        --remaining_nodesselecttriples;
     }
     return val;
 }

+ 10 - 0
mpcio.hpp

@@ -353,6 +353,15 @@ class MPCTIO {
     AndTriple last_andtriple;
     nbits_t last_andtriple_bits_remaining;
 
+    // We allow for prefetching of SelectTriple<DPFnode>s to save one
+    // network round per level when constructing RDPFs
+    std::deque<SelectTriple<DPFnode>> queued_nodeselecttriples;
+    // For P0 and P1, it should always be the case that
+    // remaining_nodesselecttriples equals
+    // queued_nodeselecttriples.size().  P2 does not store anything in
+    // queued_nodeselecttriples, however.
+    size_t remaining_nodesselecttriples;
+
 public:
     MPCTIO(MPCIO &mpcio, int thread_num, int num_threads = 1);
 
@@ -425,6 +434,7 @@ public:
     MultTriple multtriple(yield_t &yield);
     HalfTriple halftriple(yield_t &yield, bool tally=true);
     AndTriple andtriple(yield_t &yield);
+    void request_nodeselecttriples(yield_t &yield, size_t num);
     SelectTriple<DPFnode> nodeselecttriple(yield_t &yield);
     SelectTriple<value_t> valselecttriple(yield_t &yield);
     SelectTriple<bit_t> bitselecttriple(yield_t &yield);

+ 441 - 104
online.cpp

@@ -7,6 +7,7 @@
 #include "cdpf.hpp"
 #include "cell.hpp"
 #include "heap.hpp"
+#include "shapes.hpp"
 
 
 static void online_test(MPCIO &mpcio,
@@ -260,8 +261,11 @@ static void rdpf_test(MPCIO &mpcio,
                     RDPF<WIDTH> &dpf = dt.dpf[i];
                     for (nbits_t level=min_level; level<=depth; ++level) {
                         if (incremental) {
-                            printf("Level = %u\n\n", level);
-                            dpf.depth(level);
+                            printf("Level = %u\n", level);
+                            dt.depth(level);
+                            RegXS tshare;
+                            dt.get_target(tshare);
+                            printf("Target share = %lx\n\n", tshare.share());
                         }
                         typename RDPF<WIDTH>::RegXSW peer_scaled_xor;
                         typename RDPF<WIDTH>::RegASW peer_scaled_sum;
@@ -651,12 +655,16 @@ static void duoram_test(MPCIO &mpcio,
         ++args;
     }
     share &= ((address_t(1)<<depth)-1);
+    address_t len = (1<<depth);
+    if (*args) {
+        len = atoi(*args);
+        ++args;
+    }
 
     MPCTIO tio(mpcio, 0, opts.num_threads);
-    run_coroutines(tio, [&tio, depth, share] (yield_t &yield) {
-        size_t size = size_t(1)<<depth;
+    run_coroutines(tio, [&tio, depth, share, len] (yield_t &yield) {
         // size_t &aes_ops = tio.aes_ops();
-        Duoram<T> oram(tio.player(), size);
+        Duoram<T> oram(tio.player(), len);
         auto A = oram.flat(tio, yield);
         RegAS aidx, aidx2, aidx3;
         aidx.ashare = share;
@@ -676,6 +684,14 @@ static void duoram_test(MPCIO &mpcio,
         } else {
             N.set(0x0000beef);
         }
+        RegXS oxidx;
+        oxidx.xshare = share+3*tio.player();
+        T O;
+        if (tio.player() == 0) {
+            O.set(0x31410000);
+        } else {
+            O.set(0x00005926);
+        }
         // Writing and reading with additively shared indices
         printf("Additive Updating\n");
         A[aidx] += M;
@@ -686,8 +702,14 @@ static void duoram_test(MPCIO &mpcio,
         A[xidx] += N;
         printf("XOR Reading\n");
         T Ax = A[xidx];
-        T Ae;
+        // Writing and reading with OblivIndex indices
+        auto oidx = A.oblivindex(oxidx);
+        printf("OblivIndex Updating\n");
+        A[oidx] += O;
+        printf("OblivIndex Reading\n");
+        T Ox = A[oidx];
         // Writing and reading with explicit indices
+        T Ae;
         if (depth > 2) {
             printf("Explicit Updating\n");
             A[5] += Aa;
@@ -714,7 +736,7 @@ static void duoram_test(MPCIO &mpcio,
             oram.dump();
             auto check = A.reconstruct();
             if (tio.player() == 0) {
-                for (address_t i=0;i<size;++i) {
+                for (address_t i=0;i<len;++i) {
                     printf("%04x %016lx\n", i, check[i].share());
                 }
             }
@@ -722,10 +744,12 @@ static void duoram_test(MPCIO &mpcio,
         auto checkread = A.reconstruct(Aa);
         auto checkreade = A.reconstruct(Ae);
         auto checkreadx = A.reconstruct(Ax);
+        auto checkreado = A.reconstruct(Ox);
         if (tio.player() == 0) {
             printf("Read AS value = %016lx\n", checkread.share());
             printf("Read AX value = %016lx\n", checkreadx.share());
             printf("Read Ex value = %016lx\n", checkreade.share());
+            printf("Read OI value = %016lx\n", checkreado.share());
         }
         for (auto &v : Av) {
             auto checkv = A.reconstruct(v);
@@ -1067,48 +1091,126 @@ static void sort_test(MPCIO &mpcio,
         depth = atoi(*args);
         ++args;
     }
+    address_t len = (1<<depth);
+    if (*args) {
+        len = atoi(*args);
+        ++args;
+    }
 
-    int num_threads = opts.num_threads;
-    boost::asio::thread_pool pool(num_threads);
-    for (int thread_num = 0; thread_num < num_threads; ++thread_num) {
-        boost::asio::post(pool, [&mpcio, thread_num, depth] {
-            MPCTIO tio(mpcio, thread_num);
-            run_coroutines(tio, [&tio, depth] (yield_t &yield) {
-                address_t size = address_t(1)<<depth;
-                // size_t &aes_ops = tio.aes_ops();
-                Duoram<RegAS> oram(tio.player(), size);
-                auto A = oram.flat(tio, yield);
-                A.explicitonly(true);
-                // Initialize the memory to random values in parallel
-                std::vector<coro_t> coroutines;
-                for (address_t i=0; i<size; ++i) {
-                    coroutines.emplace_back(
-                        [&A, i](yield_t &yield) {
-                            auto Acoro = A.context(yield);
-                            RegAS v;
-                            v.randomize(62);
-                            Acoro[i] += v;
-                        });
-                }
-                run_coroutines(yield, coroutines);
-                A.bitonic_sort(0, depth);
+    MPCTIO tio(mpcio, 0, opts.num_threads);
+    run_coroutines(tio, [&tio, depth, len] (yield_t &yield) {
+        address_t size = address_t(1)<<depth;
+        // size_t &aes_ops = tio.aes_ops();
+        Duoram<RegAS> oram(tio.player(), size);
+        auto A = oram.flat(tio, yield);
+        A.explicitonly(true);
+        // Initialize the memory to random values in parallel
+        std::vector<coro_t> coroutines;
+        for (address_t i=0; i<size; ++i) {
+            coroutines.emplace_back(
+                [&A, i](yield_t &yield) {
+                    auto Acoro = A.context(yield);
+                    RegAS v;
+                    v.randomize(62);
+                    Acoro[i] += v;
+                });
+        }
+        run_coroutines(yield, coroutines);
+        A.bitonic_sort(0, len);
+        if (depth <= 10) {
+            oram.dump();
+        }
+        auto check = A.reconstruct();
+        bool fail = false;
+        if (tio.player() == 0) {
+            for (address_t i=0;i<size;++i) {
                 if (depth <= 10) {
-                    oram.dump();
-                    auto check = A.reconstruct();
-                    if (tio.player() == 0) {
-                        for (address_t i=0;i<size;++i) {
-                            printf("%04x %016lx\n", i, check[i].share());
-                        }
-                    }
+                    printf("%04x %016lx\n", i, check[i].share());
                 }
-            });
-        });
+                if (i>0 && i<len &&
+                    check[i].share() < check[i-1].share()) {
+                    fail = true;
+                }
+            }
+            if (fail) {
+                printf("FAIL\n");
+            } else {
+                printf("PASS\n");
+            }
+        }
+    });
+}
+
+static void pad_test(MPCIO &mpcio,
+    const PRACOptions &opts, char **args)
+{
+    nbits_t depth=6;
+
+    if (*args) {
+        depth = atoi(*args);
+        ++args;
     }
-    pool.join();
+    address_t len = (1<<depth);
+    if (*args) {
+        len = atoi(*args);
+        ++args;
+    }
+
+    MPCTIO tio(mpcio, 0, opts.num_threads);
+    run_coroutines(tio, [&mpcio, &tio, depth, len] (yield_t &yield) {
+        int player = tio.player();
+        Duoram<RegAS> oram(player, len);
+        auto A = oram.flat(tio, yield);
+        // Initialize the ORAM in explicit mode
+        A.explicitonly(true);
+        for (address_t i=0; i<len; ++i) {
+            RegAS v;
+            v.set((player*0xffff+1)*i);
+            A[i] = v;
+        }
+        A.explicitonly(false);
+        // Obliviously add 0 to A[0], which reblinds the whole database
+        RegAS z;
+        A[z] += z;
+        auto check = A.reconstruct();
+        if (player == 0) {
+            for (address_t i=0;i<len;++i) {
+                if (depth <= 10) {
+                    printf("%04x %016lx\n", i, check[i].share());
+                }
+            }
+            printf("\n");
+        }
+        address_t maxsize = address_t(1)<<depth;
+        Duoram<RegAS>::Pad P(A, tio, yield, maxsize);
+        for (address_t i=0; i<maxsize; ++i) {
+            RegAS v = P[i];
+            if (depth <= 10) {
+                value_t vval = mpc_reconstruct(tio, yield, v);
+                printf("%04x %016lx %016lx\n", i, v.share(), vval);
+            }
+        }
+        printf("\n");
+        for (address_t i=0; i<maxsize; ++i) {
+            value_t offset = 0xdeadbeef;
+            if (player) {
+                offset = -offset;
+            }
+            RegAS ind;
+            ind.set(player*i+offset);
+            RegAS v = P[ind];
+            if (depth <= 10) {
+                value_t vval = mpc_reconstruct(tio, yield, v);
+                printf("%04x %016lx %016lx\n", i, v.share(), vval);
+            }
+        }
+        printf("\n");
+    });
 }
 
+
 static void bsearch_test(MPCIO &mpcio,
-    const PRACOptions &opts, char **args)
+    const PRACOptions &opts, char **args, bool basic)
 {
     value_t target;
     arc4random_buf(&target, sizeof(target));
@@ -1119,79 +1221,298 @@ static void bsearch_test(MPCIO &mpcio,
         depth = atoi(*args);
         ++args;
     }
+    address_t len = (1<<depth);
+    if (*args) {
+        len = atoi(*args);
+        ++args;
+    }
     if (*args) {
         target = strtoull(*args, NULL, 16);
         ++args;
     }
 
-    int num_threads = opts.num_threads;
-    boost::asio::thread_pool pool(num_threads);
-    for (int thread_num = 0; thread_num < num_threads; ++thread_num) {
-        boost::asio::post(pool, [&mpcio, thread_num, depth, target] {
-            MPCTIO tio(mpcio, thread_num);
-            run_coroutines(tio, [&tio, depth, target] (yield_t &yield) {
-                address_t size = address_t(1)<<depth;
-                RegAS tshare;
-                if (tio.player() == 2) {
-                    // Send shares of the target to the computational
-                    // players
-                    RegAS tshare0, tshare1;
-                    tshare0.randomize();
-                    tshare1.set(target-tshare0.share());
-                    tio.iostream_p0() << tshare0;
-                    tio.iostream_p1() << tshare1;
-                    printf("Using target = %016lx\n", target);
-                    yield();
-                } else {
-                    // Get the share of the target
-                    tio.iostream_server() >> tshare;
-                }
+    MPCTIO tio(mpcio, 0, opts.num_threads);
+    run_coroutines(tio, [&tio, &mpcio, depth, len, target, basic] (yield_t &yield) {
+        RegAS tshare;
+        std::cout << "\n===== SETUP =====\n";
+
+        if (tio.player() == 2) {
+            // Send shares of the target to the computational
+            // players
+            RegAS tshare0, tshare1;
+            tshare0.randomize();
+            tshare1.set(target-tshare0.share());
+            tio.iostream_p0() << tshare0;
+            tio.iostream_p1() << tshare1;
+            printf("Using target = %016lx\n", target);
+            yield();
+        } else {
+            // Get the share of the target
+            tio.iostream_server() >> tshare;
+        }
+
+        tio.sync_lamport();
+        mpcio.dump_stats(std::cout);
+
+        std::cout << "\n===== SORT RANDOM DATABASE =====\n";
+        mpcio.reset_stats();
+        tio.reset_lamport();
+        // Create a random database and sort it
+        // size_t &aes_ops = tio.aes_ops();
+        Duoram<RegAS> oram(tio.player(), len);
+        auto A = oram.flat(tio, yield);
+        A.explicitonly(true);
+        // Initialize the memory to random values in parallel
+        std::vector<coro_t> coroutines;
+        for (address_t i=0; i<len; ++i) {
+            coroutines.emplace_back(
+                [&A, i](yield_t &yield) {
+                    auto Acoro = A.context(yield);
+                    RegAS v;
+                    v.randomize(62);
+                    Acoro[i] += v;
+                });
+        }
+        run_coroutines(yield, coroutines);
+        A.bitonic_sort(0, len);
+        A.explicitonly(false);
+
+        tio.sync_lamport();
+        mpcio.dump_stats(std::cout);
 
-                // Create a random database and sort it
-                // size_t &aes_ops = tio.aes_ops();
-                Duoram<RegAS> oram(tio.player(), size);
-                auto A = oram.flat(tio, yield);
-                A.explicitonly(true);
-                // Initialize the memory to random values in parallel
-                std::vector<coro_t> coroutines;
-                for (address_t i=0; i<size; ++i) {
-                    coroutines.emplace_back(
-                        [&A, i](yield_t &yield) {
-                            auto Acoro = A.context(yield);
-                            RegAS v;
-                            v.randomize(62);
-                            Acoro[i] += v;
-                        });
+        std::cout << "\n===== BINARY SEARCH =====\n";
+        mpcio.reset_stats();
+        tio.reset_lamport();
+        // Binary search for the target
+        value_t checkindex;
+        if (basic) {
+            RegAS tindex = A.basic_binary_search(tshare);
+            checkindex = mpc_reconstruct(tio, yield, tindex);
+        } else {
+            RegXS tindex = A.binary_search(tshare);
+            checkindex = mpc_reconstruct(tio, yield, tindex);
+        }
+
+        tio.sync_lamport();
+        mpcio.dump_stats(std::cout);
+
+        std::cout << "\n===== CHECK ANSWER =====\n";
+        mpcio.reset_stats();
+        tio.reset_lamport();
+        // Check the answer
+        size_t size = size_t(1) << depth;
+        value_t checktarget = mpc_reconstruct(tio, yield, tshare);
+        auto check = A.reconstruct();
+        bool fail = false;
+        if (tio.player() == 0) {
+            for (address_t i=0;i<len;++i) {
+                if (depth <= 10) {
+                    printf("%c%04x %016lx\n",
+                        (i == checkindex ? '*' : ' '),
+                        i, check[i].share());
                 }
-                run_coroutines(yield, coroutines);
-                A.bitonic_sort(0, depth);
-
-                // Binary search for the target
-                RegAS tindex = A.obliv_binary_search(tshare);
-
-                // Check the answer
-                if (tio.player() == 1) {
-                    tio.iostream_peer() << tindex;
-                } else if (tio.player() == 0) {
-                    RegAS peer_tindex;
-                    tio.iostream_peer() >> peer_tindex;
-                    tindex += peer_tindex;
+                if (i>0 && i<len &&
+                    check[i].share() < check[i-1].share()) {
+                    fail = true;
                 }
-                if (depth <= 10) {
-                    auto check = A.reconstruct();
-                    if (tio.player() == 0) {
-                        for (address_t i=0;i<size;++i) {
-                            printf("%04x %016lx\n", i, check[i].share());
-                        }
+                if (i == checkindex) {
+                    // check[i] should be >= target, and check[i-1]
+                    // should be < target
+                    if ((i < len && check[i].share() < checktarget) ||
+                        (i > 0 && check[i-1].share() >= checktarget)) {
+                        fail = true;
                     }
                 }
-                if (tio.player() == 0) {
-                    printf("Found index = %lx\n", tindex.share());
+            }
+            if (checkindex == len && check[len-1].share() >= checktarget) {
+                fail = true;
+            }
+
+            printf("Target = %016lx\n", checktarget);
+            printf("Found index = %02lx\n", checkindex);
+            if (checkindex > size) {
+                fail = true;
+            }
+            if (fail) {
+                printf("FAIL\n");
+            } else {
+                printf("PASS\n");
+            }
+        }
+    });
+}
+
+template <typename T>
+static void related(MPCIO &mpcio,
+    const PRACOptions &opts, char **args)
+{
+    nbits_t depth = 5;
+
+    // The depth of the (complete) binary tree
+    if (*args) {
+        depth = atoi(*args);
+        ++args;
+    }
+    // The layer at which to choose a random parent node (and its two
+    // children along with it)
+    nbits_t layer = depth-1;
+    if (*args) {
+        layer = atoi(*args);
+        ++args;
+    }
+    assert(layer < depth);
+
+    MPCTIO tio(mpcio, 0, opts.num_threads);
+    run_coroutines(tio, [&mpcio, &tio, depth, layer] (yield_t &yield) {
+        size_t size = size_t(1)<<(depth+1);
+        Duoram<T> oram(tio.player(), size);
+        auto A = oram.flat(tio, yield);
+
+        // Initialize A with words with sequential top and bottom halves
+        // (just so we can more easily eyeball the right answers)
+        A.init([] (size_t i) { return i * 0x100000001; } );
+
+        // We use this layout for the tree:
+        // A[0] is unused
+        // A[1] is the root (layer 0)
+        // A[2..3] is layer 1
+        // A[4..7] is layer 2
+        // ...
+        // A[(1<<j)..((2<<j)-1)] is layer j
+        //
+        // So the parent of x is at location (x/2) and the children of x
+        // are at locations 2*x and 2*x+1
+
+        // Pick a random index _within_ the given layer (i.e., the
+        // offset from the beginning of the layer, not the absolute
+        // location in A)
+        RegXS idx;
+        idx.randomize(layer);
+        // Create the OblivIndex. RegXS is the type of the common index
+        // (idx), 3 is the maximum number of related updates to support
+        // (which equals the width of the underlying RDPF, currently
+        // maximum 5), layer is the depth of the underlying RDPF (the
+        // bit length of idx).
+        typename Duoram<T>::template OblivIndex<RegXS,3> oidx(tio, yield, idx, layer);
+
+        // This is the (known) layer containing the (unknown) parent
+        // node
+        typename Duoram<T>::Flat P(A, tio, yield, 1<<layer, 1<<layer);
+        // This is the layer below that one, containing all possible
+        // children
+        typename Duoram<T>::Flat C(A, tio, yield, 2<<layer, 2<<layer);
+        // These are the subsets of C containing the left children and
+        // the right children respectively
+        typename Duoram<T>::Stride L(C, tio, yield, 0, 2);
+        typename Duoram<T>::Stride R(C, tio, yield, 1, 2);
+
+        T parent, left, right;
+
+        // Do three related reads.  In this version, only one DPF will
+        // be used, but it will still be _evaluated_ three times.
+        parent = P[oidx];
+        left = L[oidx];
+        right = R[oidx];
+
+        // The operation is just a simple rotation: the value in the
+        // parent moves to the left child, the left child moves to the
+        // right child, and the right child becomes the parent
+
+        // Do three related updates.  As above, only one (wide) DPF will
+        // be used (the same one as for the reads in fact), but it will
+        // still be _evaluated_ three more times.
+        P[oidx] += right-parent;
+        L[oidx] += parent-left;
+        R[oidx] += left-right;
+
+        // Check the answer
+        auto check = A.reconstruct();
+        if (depth <= 10) {
+            oram.dump();
+            if (tio.player() == 0) {
+                for (address_t i=0;i<size;++i) {
+                    printf("%04x %016lx\n", i, check[i].share());
                 }
-            });
-        });
+            }
+        }
+        value_t pval = mpc_reconstruct(tio, yield, parent);
+        value_t lval = mpc_reconstruct(tio, yield, left);
+        value_t rval = mpc_reconstruct(tio, yield, right);
+        printf("parent = %016lx\nleft   = %016lx\nright  = %016lx\n",
+            pval, lval, rval);
+    });
+}
+
+template <typename T>
+static void path(MPCIO &mpcio,
+    const PRACOptions &opts, char **args)
+{
+    nbits_t depth = 5;
+
+    // The depth of the (complete) binary tree
+    if (*args) {
+        depth = atoi(*args);
+        ++args;
     }
-    pool.join();
+    // The target node
+    size_t target_node = 3 << (depth-1);
+    if (*args) {
+        target_node = atoi(*args);
+        ++args;
+    }
+
+    MPCTIO tio(mpcio, 0, opts.num_threads);
+    run_coroutines(tio, [&mpcio, &tio, depth, target_node] (yield_t &yield) {
+        size_t size = size_t(1)<<(depth+1);
+        Duoram<T> oram(tio.player(), size);
+        auto A = oram.flat(tio, yield);
+
+        // Initialize A with words with sequential top and bottom halves
+        // (just so we can more easily eyeball the right answers)
+        A.init([] (size_t i) { return i * 0x100000001; } );
+
+        // We use this layout for the tree:
+        // A[0] is unused
+        // A[1] is the root (layer 0)
+        // A[2..3] is layer 1
+        // A[4..7] is layer 2
+        // ...
+        // A[(1<<j)..((2<<j)-1)] is layer j
+        //
+        // So the parent of x is at location (x/2) and the children of x
+        // are at locations 2*x and 2*x+1
+
+        // Create a Path from the root to the target node
+        typename Duoram<T>::Path P(A, tio, yield, target_node);
+
+        // Re-initialize that path to something recognizable
+        P.init([] (size_t i) { return 0xff + i * 0x1000000010000; } );
+
+        // ORAM update along that path
+        RegXS idx;
+        idx.set(tio.player() * arc4random_uniform(P.size()));
+        T val;
+        val.set(tio.player() * 0xaaaa00000000);
+        P[idx] += val;
+
+        // Binary search along that path
+        T lookup;
+        lookup.set(tio.player() * 0x3000000000000);
+        RegXS foundidx = P.binary_search(lookup);
+
+        // Check the answer
+        auto check = A.reconstruct();
+        if (depth <= 10) {
+            oram.dump();
+            if (tio.player() == 0) {
+                for (address_t i=0;i<size;++i) {
+                    printf("%04x %016lx\n", i, check[i].share());
+                }
+            }
+        }
+        value_t found = mpc_reconstruct(tio, yield, foundidx);
+        printf("foundidx = %lu\n", found);
+    });
 }
 
 void online_main(MPCIO &mpcio, const PRACOptions &opts, char **args)
@@ -1267,9 +1588,15 @@ void online_main(MPCIO &mpcio, const PRACOptions &opts, char **args)
     } else if (!strcmp(*args, "sorttest")) {
         ++args;
         sort_test(mpcio, opts, args);
+    } else if (!strcmp(*args, "padtest")) {
+        ++args;
+        pad_test(mpcio, opts, args);
+    } else if (!strcmp(*args, "bbsearch")) {
+        ++args;
+        bsearch_test(mpcio, opts, args, true);
     } else if (!strcmp(*args, "bsearch")) {
         ++args;
-        bsearch_test(mpcio, opts, args);
+        bsearch_test(mpcio, opts, args, false);
     } else if (!strcmp(*args, "duoram")) {
         ++args;
         if (opts.use_xor_db) {
@@ -1277,6 +1604,16 @@ void online_main(MPCIO &mpcio, const PRACOptions &opts, char **args)
         } else {
             duoram<RegAS>(mpcio, opts, args);
         }
+    } else if (!strcmp(*args, "related")) {
+        ++args;
+        if (opts.use_xor_db) {
+            related<RegXS>(mpcio, opts, args);
+        } else {
+            related<RegAS>(mpcio, opts, args);
+        }
+    } else if (!strcmp(*args, "path")) {
+        ++args;
+        path<RegAS>(mpcio, opts, args);
     } else if (!strcmp(*args, "cell")) {
         ++args;
         cell(mpcio, opts, args);

+ 1 - 1
preproc.cpp

@@ -476,7 +476,7 @@ void preprocessing_server(MPCServerIO &mpcsrvio, const PRACOptions &opts, char *
                                 stio.cdpf(yield);
                             });
                     }
-                } else if (!strcmp(type, "i")) {
+                } else if (!strcmp(type, "k")) {
                     unsigned char typetag = 0x8e;
                     unsigned char subtypetag = 0x00;
                     stio.queue_p0(&typetag, 1);

+ 121 - 10
rdpf.hpp

@@ -289,8 +289,11 @@ struct RDPFTriple {
     // outputs so that the appropriate one can be selected with a
     // parameter
 
+    // Only RegXS, not RegAS, indices are used with incremental RDPFs
     inline void get_target(RegAS &target) const { target = as_target; }
-    inline void get_target(RegXS &target) const { target = xs_target; }
+    inline void get_target(RegXS &target) const {
+        target = xs_target >> (dpf[0].maxdepth - dpf[0].curdepth);
+    }
 
     // Additive share of the scaling value M_as such that the high words
     // of the leaf values for P0 and P1 add to M_as * e_{target}
@@ -368,15 +371,6 @@ struct RDPFPair {
 
     RDPFPair() {}
 
-    // Create an RDPFPair from an RDPFTriple, keeping two of the RDPFs
-    // and dropping one.  This _moves_ the dpfs from the triple to the
-    // pair, so the triple will no longer be valid after using this.
-    // which0 and which1 indicate which of the dpfs to keep.
-    RDPFPair(RDPFTriple<WIDTH> &&trip, int which0, int which1) {
-        dpf[0] = std::move(trip.dpf[which0]);
-        dpf[1] = std::move(trip.dpf[which1]);
-    }
-
     // The depth
     inline nbits_t depth() const { return dpf[0].depth(); }
 
@@ -464,6 +458,123 @@ struct RDPFPair {
 
 };
 
+// These are used by computational peers, who hold RDPFTriples, but when
+// reading, only need to use 2 of the 3 RDPFs.  The API follows that of
+// RDPFPair, but internally, it holds two references to external RDPFs,
+// instead of holding the RDPFs themselves.
+
+template <nbits_t WIDTH>
+struct RDPF2of3 {
+    template <typename T>
+    using Pair = std::tuple<T, T>;
+    template <typename T>
+    using WPair = std::tuple<
+        typename std::array<T,WIDTH>,
+        typename std::array<T,WIDTH> >;
+
+    // The type of pairs of nodes, LeafNodes, and the wide shared
+    // register types
+    using node = Pair<DPFnode>;
+    using LeafNode = Pair<typename RDPF<WIDTH>::LeafNode>;
+    using RegASWP = WPair<RegAS>;
+    using RegXSWP = WPair<RegXS>;
+
+    const RDPF<WIDTH> &dpf0, &dpf1;
+
+    // Create an RDPF2of3 from an RDPFTriple, holding references to two
+    // of the three RDPFs.  Nothing is moved or copied, so the triple
+    // remains valid (and must outlive this RDPF2of3).  which0 and
+    // which1 indicate which of the dpfs to reference.
+    RDPF2of3(const RDPFTriple<WIDTH> &trip, int which0, int which1) :
+        dpf0(trip.dpf[which0]), dpf1(trip.dpf[which1]) {}
+
+    // The depth
+    inline nbits_t depth() const { return dpf0.depth(); }
+
+    // Set the current depth for an incremental RDPF2of3; 0 means to use
+    // maxdepth
+    inline void depth(nbits_t newdepth) {
+        dpf0.depth(newdepth);
+        dpf1.depth(newdepth);
+    }
+
+    // The seed
+    inline node get_seed() const {
+        return std::make_tuple(dpf0.get_seed(), dpf1.get_seed());
+    }
+
+    // Do we have a precomputed expansion?
+    inline bool has_expansion() const {
+        int li_index = dpf0.maxdepth - dpf0.curdepth;
+        return dpf0.li[li_index].expansion.size() > 0;
+    }
+
+    // Get an element of the expansion
+    inline LeafNode get_expansion(address_t index) const {
+        return std::make_tuple(dpf0.get_expansion(index),
+            dpf1.get_expansion(index));
+    }
+
+    // Descend the two RDPFs in lock step
+    node descend(const node &parent, nbits_t parentdepth,
+        bit_t whichchild, size_t &aes_ops) const;
+
+    // Descend the two RDPFs in lock step to a leaf node
+    LeafNode descend_to_leaf(const node &parent, nbits_t parentdepth,
+        bit_t whichchild, size_t &aes_ops) const;
+
+    // Overloaded versions of functions to get DPF components and
+    // outputs so that the appropriate one can be selected with a
+    // parameter
+
+    // Additive share of the scaling value M_as such that the high words
+    // of the leaf values for P0 and P1 add to M_as * e_{target}
+    inline void scaled_value(RegASWP &v) const {
+        std::get<0>(v) = dpf0.scaled_sum;
+        std::get<1>(v) = dpf1.scaled_sum;
+    }
+
+    // XOR share of the scaling value M_xs such that the high words
+    // of the leaf values for P0 and P1 XOR to M_xs * e_{target}
+    inline void scaled_value(RegXSWP &v) const {
+        std::get<0>(v) = dpf0.scaled_xor;
+        std::get<1>(v) = dpf1.scaled_xor;
+    }
+
+    // Get the additive-shared unit vector entry from the leaf node
+    inline void unit(std::tuple<RegAS,RegAS> &u, const LeafNode &leaf) const {
+        std::get<0>(u) = dpf0.unit_as(std::get<0>(leaf));
+        std::get<1>(u) = dpf1.unit_as(std::get<1>(leaf));
+    }
+
+    // Get the bit-shared unit vector entry from the leaf node
+    inline void unit(std::tuple<RegXS,RegXS> &u, const LeafNode &leaf) const {
+        std::get<0>(u) = dpf0.unit_bs(std::get<0>(leaf));
+        std::get<1>(u) = dpf1.unit_bs(std::get<1>(leaf));
+    }
+
+    // For any more complex entry type, that type will handle the conversion
+    // for each DPF
+    template <typename T>
+    inline void unit(std::tuple<T,T> &u, const LeafNode &leaf) const {
+        std::get<0>(u).unit(dpf0, std::get<0>(leaf));
+        std::get<1>(u).unit(dpf1, std::get<1>(leaf));
+    }
+
+    // Get the additive-shared scaled vector entry from the leaf node
+    inline void scaled(RegASWP &s, const LeafNode &leaf) const {
+        std::get<0>(s) = dpf0.scaled_as(std::get<0>(leaf));
+        std::get<1>(s) = dpf1.scaled_as(std::get<1>(leaf));
+    }
+
+    // Get the XOR-shared scaled vector entry from the leaf node
+    inline void scaled(RegXSWP &s, const LeafNode &leaf) const {
+        std::get<0>(s) = dpf0.scaled_xs(std::get<0>(leaf));
+        std::get<1>(s) = dpf1.scaled_xs(std::get<1>(leaf));
+    }
+
+};
+
 // Streaming evaluation, to avoid taking up enough memory to store
 // an entire evaluation.  T can be RDPF, RDPFPair, or RDPFTriple.
 template <typename T>

+ 63 - 19
rdpf.tcc

@@ -145,7 +145,7 @@ inline V ParallelEval<T>::reduce(V init, W process)
     size_t thread_aes_ops[num_threads];
     V accums[num_threads];
     boost::asio::thread_pool pool(num_threads);
-    address_t threadstart = start;
+    address_t threadstart = 0;
     address_t threadchunk = num_evals / num_threads;
     address_t threadextra = num_evals % num_threads;
     nbits_t depth = rdpf.depth();
@@ -857,6 +857,9 @@ RDPF<WIDTH>::RDPF(MPCTIO &tio, yield_t &yield,
 
     li.resize(incremental ? depth : 1);
 
+    // Prefetch the right number of nodeselecttriples
+    tio.request_nodeselecttriples(yield, incremental ? 2*depth-1 : depth);
+
     // Construct each intermediate level
     while(level < depth) {
         LeafNode *leaflevel = NULL;
@@ -883,29 +886,44 @@ RDPF<WIDTH>::RDPF(MPCTIO &tio, yield_t &yield,
         // The bit-shared choice bit is bit (depth-level-1) of the
         // XOR-shared target index
         RegBS bs_choice = target.bit(depth-level-1);
-        bool cfbit;
 
+        // At each layer, we can create the next internal layer and the
+        // leaf layer in parallel coroutines if we're making an
+        // incremental RDPF.  If not, exactly one of these coroutines
+        // will be created, and we just run that one.
+        std::vector<coro_t> coroutines;
         if (level < depth-1) {
-            DPFnode CW;
-            // This field is ignored when we're not expanding to a leaf
-            // level, but it needs to be an lvalue reference.
-            int noleafinfo = 0;
-            create_level(tio, yield, curlevel, nextlevel, player, level,
-                depth, bs_choice, CW, cfbit, save_expansion, noleafinfo,
-                aes_ops);
-            cfbits |= (value_t(cfbit)<<level);
-            if (player < 2) {
-                cw.push_back(CW);
-            }
+            coroutines.emplace_back([this, &tio, curlevel, nextlevel,
+                player, level, depth, bs_choice, save_expansion,
+                &aes_ops] (yield_t &yield) {
+                    DPFnode CW;
+                    bool cfbit;
+                    // This field is ignored when we're not expanding to a leaf
+                    // level, but it needs to be an lvalue reference.
+                    int noleafinfo = 0;
+                    create_level(tio, yield, curlevel, nextlevel, player, level,
+                        depth, bs_choice, CW, cfbit, save_expansion, noleafinfo,
+                        aes_ops);
+                    cfbits |= (value_t(cfbit)<<level);
+                    if (player < 2) {
+                        cw.push_back(CW);
+                    }
+                });
         }
         if (incremental || level == depth-1) {
-            LeafNode CW;
-            create_level(tio, yield, curlevel, leaflevel, player, level,
-                depth, bs_choice, CW, cfbit, save_expansion,
-                li[depth-level-1], aes_ops);
-            leaf_cfbits |= (value_t(cfbit)<<(depth-level-1));
-            li[depth-level-1].leaf_cw = CW;
+            coroutines.emplace_back([this, &tio, curlevel, leaflevel,
+                player, level, depth, bs_choice, save_expansion,
+                &aes_ops](yield_t &yield) {
+                    LeafNode CW;
+                    bool cfbit;
+                    create_level(tio, yield, curlevel, leaflevel, player,
+                        level, depth, bs_choice, CW, cfbit, save_expansion,
+                        li[depth-level-1], aes_ops);
+                    leaf_cfbits |= (value_t(cfbit)<<(depth-level-1));
+                    li[depth-level-1].leaf_cw = CW;
+                });
         }
+        run_coroutines(yield, coroutines);
 
         if (!save_expansion) {
             delete[] leaflevel;
@@ -1081,3 +1099,29 @@ typename RDPFPair<WIDTH>::LeafNode RDPFPair<WIDTH>::descend_to_leaf(
     C1 = dpf[1].descend_to_leaf(P1, parentdepth, whichchild, aes_ops);
     return std::make_tuple(C0,C1);
 }
+
+template <nbits_t WIDTH>
+typename RDPF2of3<WIDTH>::node RDPF2of3<WIDTH>::descend(
+    const RDPF2of3<WIDTH>::node &parent,
+    nbits_t parentdepth, bit_t whichchild,
+    size_t &aes_ops) const
+{
+    auto [P0, P1] = parent;
+    DPFnode C0, C1;
+    C0 = dpf0.descend(P0, parentdepth, whichchild, aes_ops);
+    C1 = dpf1.descend(P1, parentdepth, whichchild, aes_ops);
+    return std::make_tuple(C0,C1);
+}
+
+template <nbits_t WIDTH>
+typename RDPF2of3<WIDTH>::LeafNode RDPF2of3<WIDTH>::descend_to_leaf(
+    const RDPF2of3<WIDTH>::node &parent,
+    nbits_t parentdepth, bit_t whichchild,
+    size_t &aes_ops) const
+{
+    auto [P0, P1] = parent;
+    typename RDPF<WIDTH>::LeafNode C0, C1;
+    C0 = dpf0.descend_to_leaf(P0, parentdepth, whichchild, aes_ops);
+    C1 = dpf1.descend_to_leaf(P1, parentdepth, whichchild, aes_ops);
+    return std::make_tuple(C0,C1);
+}

+ 302 - 0
shapes.hpp

@@ -0,0 +1,302 @@
+#ifndef __SHAPES_HPP__
+#define __SHAPES_HPP__
+
+// Various Shapes beyond the standard Flat (in duoram.hpp)
+
+#include "duoram.hpp"
+
+
+// A Pad is a Shape that pads an underlying Shape so that read accesses
+// past the end return a fixed constant value.  Do _not_ write into a
+// Pad!
+
+template <typename T>
+class Duoram<T>::Pad : public Duoram<T>::Shape {
+    // These are pointers because we need to be able to return a
+    // (non-const) T& even from a const Pad.
+    T *padvalp;
+    T *peerpadvalp;
+    T *zerop;
+    address_t padded_size;
+
+    inline size_t indexmap(size_t idx) const override {
+        return idx;
+    }
+
+    Pad &operator=(const Pad &) = delete;
+
+public:
+    // Constructor for the Pad shape. The parent must _not_ be in
+    // explicit-only mode.
+    Pad(Shape &parent, MPCTIO &tio, yield_t &yield,
+        address_t padded_size, value_t padval = 0x7fffffffffffffff);
+
+    // Copy the given Pad except for the tio and yield
+    Pad(const Pad &copy_from, MPCTIO &tio, yield_t &yield);
+
+    // Destructor
+    ~Pad();
+
+    // Update the context (MPCTIO and yield if you've started a new
+    // thread, or just yield if you've started a new coroutine in the
+    // same thread).  Returns a new Shape with an updated context.
+    Pad context(MPCTIO &new_tio, yield_t &new_yield) const {
+        return Pad(*this, new_tio, new_yield);
+    }
+    Pad context(yield_t &new_yield) const {
+        return Pad(*this, this->tio, new_yield);
+    }
+
+    // Get a pair (for the server) of references to the underlying
+    // Duoram entries at share virtual index idx.  (That is, it gets
+    // duoram.p0_blind[indexmap(idx)], etc.)
+    inline std::tuple<T&,T&> get_server(size_t idx,
+        std::nullopt_t null = std::nullopt) const override {
+        size_t parindex = indexmap(idx);
+        if (parindex < this->parent.shape_size) {
+            return this->parent.get_server(parindex, null);
+        } else {
+            return std::tie(*zerop, *zerop);
+        }
+    }
+
+    // Get a triple (for the computational players) of references to the
+    // underlying Duoram entries at share virtual index idx.  (That is,
+    // it gets duoram.database[indexmap(idx)], etc.)
+    inline std::tuple<T&,T&,T&> get_comp(size_t idx,
+        std::nullopt_t null = std::nullopt) const override {
+        size_t parindex = indexmap(idx);
+        if (parindex < this->parent.shape_size) {
+            return this->parent.get_comp(parindex, null);
+        } else {
+            return std::tie(*padvalp, *zerop, *peerpadvalp);
+        }
+    }
+
+    // Index into this Pad in various ways
+    typename Duoram::Shape::template MemRefS<RegAS,T,std::nullopt_t,Pad,1>
+            operator[](const RegAS &idx) {
+        typename Duoram<T>::Shape::
+            template MemRefS<RegAS,T,std::nullopt_t,Pad,1>
+            res(*this, idx, std::nullopt);
+        return res;
+    }
+    typename Duoram::Shape::template MemRefS<RegXS,T,std::nullopt_t,Pad,1>
+            operator[](const RegXS &idx) {
+        typename Duoram<T>::Shape::
+            template MemRefS<RegXS,T,std::nullopt_t,Pad,1>
+            res(*this, idx, std::nullopt);
+        return res;
+    }
+    template <typename U, nbits_t WIDTH>
+    typename Duoram::Shape::template MemRefS<U,T,std::nullopt_t,Pad,WIDTH>
+            operator[](OblivIndex<U,WIDTH> &obidx) {
+        typename Duoram<T>::Shape::
+            template MemRefS<RegXS,T,std::nullopt_t,Pad,WIDTH>
+            res(*this, obidx, std::nullopt);
+        return res;
+    }
+    typename Duoram::Shape::template MemRefExpl<T,std::nullopt_t>
+            operator[](address_t idx) {
+        typename Duoram<T>::Shape::
+            template MemRefExpl<T,std::nullopt_t>
+            res(*this, idx, std::nullopt);
+        return res;
+    }
+    template <typename U>
+    Duoram::Shape::MemRefInd<U, Pad>
+            operator[](const std::vector<U> &indcs) {
+        typename Duoram<T>::Shape::
+            template MemRefInd<U,Pad>
+            res(*this, indcs);
+        return res;
+    }
+    template <typename U, size_t N>
+    Duoram::Shape::MemRefInd<U, Pad>
+            operator[](const std::array<U,N> &indcs) {
+        typename Duoram<T>::Shape::
+            template MemRefInd<U,Pad>
+            res(*this, indcs);
+        return res;
+    }
+};
+
+
+// A Stride is a Shape that represents evenly spaced elements of its
+// parent Shape, starting with some offset, and then every stride
+// elements.
+
+template <typename T>
+class Duoram<T>::Stride : public Duoram<T>::Shape {
+    // Index (in the parent) of the first element of this Stride
+    size_t offset;
+    // Distance (in parent elements) between consecutive elements
+    size_t stride;
+
+    // Map an index in this Stride to the corresponding parent index
+    inline size_t indexmap(size_t idx) const override {
+        size_t paridx = offset + idx*stride;
+        return paridx;
+    }
+
+public:
+    // Constructor
+    Stride(Shape &parent, MPCTIO &tio, yield_t &yield, size_t offset,
+        size_t stride);
+
+    // Copy the given Stride except for the tio and yield
+    Stride(const Stride &copy_from, MPCTIO &tio, yield_t &yield) :
+        Shape(copy_from, tio, yield), offset(copy_from.offset),
+        stride(copy_from.stride) {}
+
+    // Update the context (MPCTIO and yield if you've started a new
+    // thread, or just yield if you've started a new coroutine in the
+    // same thread).  Returns a new Shape with an updated context.
+    Stride context(MPCTIO &new_tio, yield_t &new_yield) const {
+        return Stride(*this, new_tio, new_yield);
+    }
+    Stride context(yield_t &new_yield) const {
+        return Stride(*this, this->tio, new_yield);
+    }
+
+    // Index into this Stride in various ways
+
+    // Oblivious index by an additive share
+    typename Duoram::Shape::template MemRefS<RegAS,T,std::nullopt_t,Stride,1>
+            operator[](const RegAS &idx) {
+        typename Duoram<T>::Shape::
+            template MemRefS<RegAS,T,std::nullopt_t,Stride,1>
+            res(*this, idx, std::nullopt);
+        return res;
+    }
+    // Oblivious index by an XOR share
+    typename Duoram::Shape::template MemRefS<RegXS,T,std::nullopt_t,Stride,1>
+            operator[](const RegXS &idx) {
+        typename Duoram<T>::Shape::
+            template MemRefS<RegXS,T,std::nullopt_t,Stride,1>
+            res(*this, idx, std::nullopt);
+        return res;
+    }
+    // Oblivious index by a reusable OblivIndex; the index share type U
+    // must match between the OblivIndex and the resulting MemRefS
+    template <typename U, nbits_t WIDTH>
+    typename Duoram::Shape::template MemRefS<U,T,std::nullopt_t,Stride,WIDTH>
+            operator[](OblivIndex<U,WIDTH> &obidx) {
+        typename Duoram<T>::Shape::
+            template MemRefS<U,T,std::nullopt_t,Stride,WIDTH>
+            res(*this, obidx, std::nullopt);
+        return res;
+    }
+    // Explicit (public) index
+    typename Duoram::Shape::template MemRefExpl<T,std::nullopt_t>
+            operator[](address_t idx) {
+        typename Duoram<T>::Shape::
+            template MemRefExpl<T,std::nullopt_t>
+            res(*this, idx, std::nullopt);
+        return res;
+    }
+    // Independent batched indexing by a vector of index shares
+    template <typename U>
+    Duoram::Shape::MemRefInd<U, Stride>
+            operator[](const std::vector<U> &indcs) {
+        typename Duoram<T>::Shape::
+            template MemRefInd<U,Stride>
+            res(*this, indcs);
+        return res;
+    }
+    // Independent batched indexing by an array of index shares
+    template <typename U, size_t N>
+    Duoram::Shape::MemRefInd<U, Stride>
+            operator[](const std::array<U,N> &indcs) {
+        typename Duoram<T>::Shape::
+            template MemRefInd<U,Stride>
+            res(*this, indcs);
+        return res;
+    }
+};
+
+
+// A Path is a Shape that represents a path from the root of a complete
+// binary tree down to a given node.
+
+// We assume this layout for the tree (the _parent_ shape of the Path):
+// A[0] is unused
+// A[1] is the root (layer 0)
+// A[2..3] is layer 1
+// A[4..7] is layer 2
+// ...
+// A[(1<<j)..((2<<j)-1)] is layer j
+//
+// So the parent of x is at location (x/2) and the children of x
+// are at locations 2*x and 2*x+1
+
+template <typename T>
+class Duoram<T>::Path : public Duoram<T>::Shape {
+    // Index (in the parent) of the node this Path leads to
+    size_t target_node;
+
+    // Map an index in this Path (0 = root, shape_size-1 = target) to
+    // the corresponding parent index: the ancestor of target_node at
+    // depth idx is obtained by shifting off the low-order bits
+    inline size_t indexmap(size_t idx) const override {
+        size_t paridx = target_node >> (this->shape_size - idx - 1);
+        return paridx;
+    }
+
+public:
+    // Constructor
+    Path(Shape &parent, MPCTIO &tio, yield_t &yield,
+        size_t target_node);
+
+    // Copy the given Path except for the tio and yield
+    Path(const Path &copy_from, MPCTIO &tio, yield_t &yield) :
+        Shape(copy_from, tio, yield),
+        target_node(copy_from.target_node) {}
+
+    // Update the context (MPCTIO and yield if you've started a new
+    // thread, or just yield if you've started a new coroutine in the
+    // same thread).  Returns a new Shape with an updated context.
+    Path context(MPCTIO &new_tio, yield_t &new_yield) const {
+        return Path(*this, new_tio, new_yield);
+    }
+    Path context(yield_t &new_yield) const {
+        return Path(*this, this->tio, new_yield);
+    }
+
+    // Index into this Path in various ways
+
+    // Oblivious index by an additive share
+    typename Duoram::Shape::template MemRefS<RegAS,T,std::nullopt_t,Path,1>
+            operator[](const RegAS &idx) {
+        typename Duoram<T>::Shape::
+            template MemRefS<RegAS,T,std::nullopt_t,Path,1>
+            res(*this, idx, std::nullopt);
+        return res;
+    }
+    // Oblivious index by an XOR share
+    typename Duoram::Shape::template MemRefS<RegXS,T,std::nullopt_t,Path,1>
+            operator[](const RegXS &idx) {
+        typename Duoram<T>::Shape::
+            template MemRefS<RegXS,T,std::nullopt_t,Path,1>
+            res(*this, idx, std::nullopt);
+        return res;
+    }
+    // Oblivious index by a reusable OblivIndex; the index share type U
+    // must match between the OblivIndex and the resulting MemRefS
+    template <typename U, nbits_t WIDTH>
+    typename Duoram::Shape::template MemRefS<U,T,std::nullopt_t,Path,WIDTH>
+            operator[](OblivIndex<U,WIDTH> &obidx) {
+        typename Duoram<T>::Shape::
+            template MemRefS<U,T,std::nullopt_t,Path,WIDTH>
+            res(*this, obidx, std::nullopt);
+        return res;
+    }
+    // Explicit (public) index
+    typename Duoram::Shape::template MemRefExpl<T,std::nullopt_t>
+            operator[](address_t idx) {
+        typename Duoram<T>::Shape::
+            template MemRefExpl<T,std::nullopt_t>
+            res(*this, idx, std::nullopt);
+        return res;
+    }
+    // Independent batched indexing by a vector of index shares
+    template <typename U>
+    Duoram::Shape::MemRefInd<U, Path>
+            operator[](const std::vector<U> &indcs) {
+        typename Duoram<T>::Shape::
+            template MemRefInd<U,Path>
+            res(*this, indcs);
+        return res;
+    }
+    // Independent batched indexing by an array of index shares
+    template <typename U, size_t N>
+    Duoram::Shape::MemRefInd<U, Path>
+            operator[](const std::array<U,N> &indcs) {
+        typename Duoram<T>::Shape::
+            template MemRefInd<U,Path>
+            res(*this, indcs);
+        return res;
+    }
+};
+
+
+#include "shapes.tcc"
+
+#endif

+ 85 - 0
shapes.tcc

@@ -0,0 +1,85 @@
+#ifndef __SHAPES_TCC__
+#define __SHAPES_TCC__
+
+// Constructor for the Pad shape. The parent must _not_ be in
+// explicit-only mode.
+template <typename T>
+Duoram<T>::Pad::Pad(Shape &parent, MPCTIO &tio, yield_t &yield,
+    address_t padded_size, size_t padval) :
+    Shape(parent, parent.duoram, tio, yield)
+{
+    int whoami = tio.player();
+    // Allocate the three shared values this Pad owns
+    padvalp = new T;
+    zerop = new T;
+    peerpadvalp = new T;
+    // This party's additive share of the pad value is whoami*padval,
+    // and (since padval is public) it also knows the peer's share,
+    // (1-whoami)*padval.  zerop stays default-constructed.
+    padvalp->set(whoami * padval);
+    peerpadvalp->set((1 - whoami) * padval);
+    this->set_shape_size(padded_size);
+}
+
+// Copy the given Pad except for the tio and yield
+template <typename T>
+Duoram<T>::Pad::Pad(const Pad &copy_from, MPCTIO &tio, yield_t &yield) :
+    Shape(copy_from, tio, yield)
+{
+    // Deep-copy the pad value shares so this Pad owns its own objects
+    padvalp = new T;
+    zerop = new T;
+    peerpadvalp = new T;
+    padvalp->set(copy_from.padvalp->share());
+    peerpadvalp->set(copy_from.peerpadvalp->share());
+}
+
+// Destructor
+template <typename T>
+Duoram<T>::Pad::~Pad()
+{
+    // Release the heap-allocated pad value shares
+    delete peerpadvalp;
+    delete zerop;
+    delete padvalp;
+}
+
+// Constructor for the Stride shape.
+template <typename T>
+Duoram<T>::Stride::Stride(Shape &parent, MPCTIO &tio, yield_t &yield,
+    size_t offset, size_t stride) :
+    Shape(parent, parent.duoram, tio, yield)
+{
+    // A zero stride would make indexmap degenerate (every index maps
+    // to offset) and would divide by zero in the size computation below
+    assert(stride > 0);
+    size_t parentsize = parent.size();
+    // Clamp an out-of-range offset so the Stride is simply empty
+    // rather than referencing past the end of the parent
+    if (offset > parentsize) {
+        offset = parentsize;
+    }
+    this->offset = offset;
+    this->stride = stride;
+    // How many items are there if you take every stride'th item,
+    // starting at offset?  strideregionsize corrects for the offset, so
+    // we're asking how many multiples of stride are there strictly less
+    // than strideregionsize.  That's just ceil(strideregionsize/stride)
+    // which is the same as (strideregionsize + stride - 1)/stride with
+    // integer truncated division.
+    size_t strideregionsize = parentsize - offset;
+    size_t numelements = (strideregionsize + stride - 1) / stride;
+    this->set_shape_size(numelements);
+}
+
+// Constructor for the Path shape.
+template <typename T>
+Duoram<T>::Path::Path(Shape &parent, MPCTIO &tio, yield_t &yield,
+    size_t target_node) :
+    Shape(parent, parent.duoram, tio, yield)
+{
+    size_t parentsize = parent.size();
+    // Index 0 is unused in the tree layout, and the target must lie
+    // within the parent Shape
+    assert(target_node > 0 && target_node < parentsize);
+    this->target_node = target_node;
+
+    // The number of nodes on the path from the root (index 1) down to
+    // target_node is the bit length of target_node, since the parent
+    // of the node at index x is the node at index (x>>1).
+    size_t path_num_nodes = 0;
+    for (size_t node = target_node; node > 0; node >>= 1) {
+        ++path_num_nodes;
+    }
+    this->set_shape_size(path_num_nodes);
+}
+
+#endif