// Templated method implementations for rdpf.hpp

#include "mpcops.hpp"

// Compute the multiplicative inverse of x mod 2^{VALUE_BITS}
// This is the same as computing x to the power of
// 2^{VALUE_BITS-1}-1.
static value_t inverse_value_t(value_t x)
{
    int expon = 1;
    value_t xe = x;
    // Invariant: xe = x^(2^expon - 1) mod 2^{VALUE_BITS}
    // Goal: compute x^(2^{VALUE_BITS-1} - 1)
    while (expon < VALUE_BITS-1) {
        xe = xe * xe * x;
        ++expon;
    }
    return xe;
}

// Create a StreamEval object that will start its output at index start.
// It will wrap around to 0 when it hits 2^depth.  If use_expansion
// is true, then if the DPF has been expanded, just output values
// from that.  If use_expansion=false or if the DPF has not been
// expanded, compute the values on the fly.  If xor_offset is non-zero,
// then the outputs are actually
// DPF(start XOR xor_offset)
// DPF((start+1) XOR xor_offset)
// DPF((start+2) XOR xor_offset)
// etc.
template <typename T>
StreamEval<T>::StreamEval(const T &rdpf, address_t start,
    address_t xor_offset, size_t &aes_ops,
    bool use_expansion) : rdpf(rdpf), aes_ops(aes_ops),
    use_expansion(use_expansion), counter_xor_offset(xor_offset)
{
    depth = rdpf.depth();
    // Prevent overflow of 1<<depth
    if (depth < ADDRESS_MAX_BITS) {
        indexmask = (address_t(1)<<depth)-1;
    } else {
        indexmask = ~0;
    }
    start &= indexmask;
    counter_xor_offset &= indexmask;
    // Record that we haven't actually output the leaf for index start
    // itself yet
    nextindex = start;
    if (use_expansion && rdpf.has_expansion()) {
        // We just need to keep the counter; there's nothing to
        // precompute
        return;
    }
    // Compute the path down to the parent of the leaf for index start
    path.resize(depth);
    pathindex = start;
    path[0] = rdpf.get_seed();
    for (nbits_t i=1;i<depth;++i) {
        bool dir = !!(pathindex & (address_t(1)<<(depth-i)));
        bool xor_offset_bit =
            !!(counter_xor_offset & (address_t(1)<<(depth-i)));
        path[i] = rdpf.descend(path[i-1], i-1,
            dir ^ xor_offset_bit, aes_ops);
    }
}
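// A minimal usage sketch (not part of the API): stream every leaf of
// a width-1 RDPF in index order, starting at 0.  This assumes an
// already-constructed RDPF<1> named dpf, as produced by the RDPF
// constructor later in this file; error handling is omitted.
//
//     size_t aes_ops = 0;
//     StreamEval<RDPF<1>> ev(dpf, 0, 0, aes_ops, false);
//     for (address_t i = 0; i < (address_t(1)<<dpf.depth()); ++i) {
//         typename RDPF<1>::LeafNode leaf = ev.next();
//         // ... use leaf ...
//     }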
template <typename T>
typename T::LeafNode StreamEval<T>::next()
{
    if (use_expansion && rdpf.has_expansion()) {
        // Just use the precomputed values
        typename T::LeafNode leaf =
            rdpf.get_expansion(nextindex ^ counter_xor_offset);
        nextindex = (nextindex + 1) & indexmask;
        return leaf;
    }
    // Invariant: in the first call to next(), nextindex = pathindex.
    // Otherwise, nextindex = pathindex+1.
    // Get the XOR of nextindex and pathindex, and strip the low bit.
    // If nextindex and pathindex are equal, or pathindex is even
    // and nextindex is the consecutive odd number, index_xor will be 0,
    // indicating that we don't have to update the path, but just
    // compute the appropriate leaf given by the low bit of nextindex.
    //
    // Otherwise, say for example pathindex is 010010111 and nextindex
    // is 010011000.  Then their XOR is 000001111, and stripping the low
    // bit yields 000001110, so how_many_1_bits will be 3.
    // That indicates (typically) that path[depth-3] was a left child,
    // and now we need to change it to a right child by descending right
    // from path[depth-4], and then filling the path after that with
    // left children.
    //
    // When we wrap around, however, index_xor will be 111111110 (after
    // we strip the low bit), and how_many_1_bits will be depth-1, but
    // the new top child (of the root seed) we have to compute will be a
    // left, not a right, child.
    uint64_t index_xor = (nextindex ^ pathindex) & ~1;
    nbits_t how_many_1_bits = __builtin_popcount(index_xor);
    if (how_many_1_bits > 0) {
        // This will almost always be 1, unless we've just wrapped
        // around from the right subtree back to the left, in which case
        // it will be 0.
        bool top_changed_bit =
            !!(nextindex & (address_t(1) << how_many_1_bits));
        bool xor_offset_bit =
            !!(counter_xor_offset & (address_t(1) << how_many_1_bits));
        path[depth-how_many_1_bits] =
            rdpf.descend(path[depth-how_many_1_bits-1],
                depth-how_many_1_bits-1,
                top_changed_bit ^ xor_offset_bit, aes_ops);
        for (nbits_t i = depth-how_many_1_bits; i < depth-1; ++i) {
            bool xor_offset_bit =
                !!(counter_xor_offset & (address_t(1) << (depth-i-1)));
            path[i+1] = rdpf.descend(path[i], i, xor_offset_bit,
                aes_ops);
        }
    }
    bool xor_offset_bit = counter_xor_offset & 1;
    typename T::LeafNode leaf = rdpf.descend_to_leaf(path[depth-1],
        depth-1, (nextindex & 1) ^ xor_offset_bit, aes_ops);
    pathindex = nextindex;
    nextindex = (nextindex + 1) & indexmask;
    return leaf;
}

// Run the parallel evaluator.  The type V is the type of the
// accumulator; init should be the "zero" value of the accumulator.
// The type W (process) is a lambda type with the signature
// (int, address_t, const T::LeafNode &) -> V
// which will be called like this for each i from 0 to num_evals-1,
// across num_threads threads:
// value_i = process(t, i, DPF((start+i) XOR xor_offset))
// t is the thread number (0 <= t < num_threads).
// The resulting num_evals values will be combined using V's +=
// operator, first accumulating the values within each thread
// (starting with the init value), and then accumulating the totals
// from each thread together (again starting with the init value):
//
// total = init
// for each thread t:
//     accum_t = init
//     for each value_i generated by thread t:
//         accum_t += value_i
//     total += accum_t
template <typename T> template <typename V, typename W>
inline V ParallelEval<T>::reduce(V init, W process)
{
    size_t thread_aes_ops[num_threads];
    V accums[num_threads];
    boost::asio::thread_pool pool(num_threads);
    address_t threadstart = 0;
    address_t threadchunk = num_evals / num_threads;
    address_t threadextra = num_evals % num_threads;
    nbits_t depth = rdpf.depth();
    address_t indexmask = (depth < ADDRESS_MAX_BITS ?
        ((address_t(1)<<depth)-1) : ~0);
    for (int thread_num = 0; thread_num < num_threads; ++thread_num) {
        address_t threadsize =
            threadchunk + (address_t(thread_num) < threadextra);
        boost::asio::post(pool,
            [this, &thread_aes_ops, &accums, &init, &process,
                thread_num, threadstart, threadsize, indexmask] {
                size_t local_aes_ops = 0;
                StreamEval<T> ev(rdpf, (start+threadstart)&indexmask,
                    xor_offset, local_aes_ops, false);
                V accum = init;
                for (address_t x=0;x<threadsize;++x) {
                    typename T::LeafNode leaf = ev.next();
                    accum += process(thread_num,
                        (threadstart+x)&indexmask, leaf);
                }
                accums[thread_num] = accum;
                thread_aes_ops[thread_num] = local_aes_ops;
            });
        threadstart = (threadstart + threadsize) & indexmask;
    }
    pool.join();
    V total = init;
    for (int thread_num = 0; thread_num < num_threads; ++thread_num) {
        total += accums[thread_num];
        aes_ops += thread_aes_ops[thread_num];
    }
    return total;
}
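// A minimal usage sketch (not part of the API), assuming the
// ParallelEval constructor takes (rdpf, start, xor_offset, num_evals,
// num_threads, aes_ops): sum a function of every leaf across four
// threads.  extract_word is a hypothetical helper turning a leaf
// into a value_t.
//
//     size_t aes_ops = 0;
//     ParallelEval<RDPF<1>> pe(dpf, 0, 0,
//         address_t(1)<<dpf.depth(), 4, aes_ops);
//     value_t total = pe.reduce(value_t(0),
//         [] (int t, address_t i, const RDPF<1>::LeafNode &leaf) {
//             return extract_word(leaf);
//         });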
// Descend from an internal node at depth parentdepth to one of its
// leaf children
// whichchild = 0: left child
// whichchild = 1: right child
template <nbits_t WIDTH>
inline typename RDPF<WIDTH>::LeafNode RDPF<WIDTH>::descend_to_leaf(
    const DPFnode &parent, nbits_t parentdepth, bit_t whichchild,
    size_t &aes_ops) const
{
    typename RDPF<WIDTH>::LeafNode prgout;
    bool flag = get_lsb(parent);
    prg(prgout, parent, whichchild, aes_ops);
    if (flag) {
        LeafNode CW = li[maxdepth-parentdepth-1].leaf_cw;
        LeafNode CWR = CW;
        bit_t cfbit = !!(leaf_cfbits &
            (value_t(1)<<(maxdepth-parentdepth-1)));
        CWR[0] ^= lsb128_mask[cfbit];
        prgout ^= (whichchild ? CWR : CW);
    }
    return prgout;
}

// I/O for RDPFs

template <typename T, nbits_t WIDTH>
T& operator>>(T &is, RDPF<WIDTH> &rdpf)
{
    is.read((char *)&rdpf.seed, sizeof(rdpf.seed));
    rdpf.whichhalf = get_lsb(rdpf.seed);
    uint8_t depth;
    // Add 64 to depth to indicate an expanded RDPF, and add 128 to
    // indicate an incremental RDPF
    is.read((char *)&depth, sizeof(depth));
    bool read_expanded = false;
    bool read_incremental = false;
    if (depth > 128) {
        read_incremental = true;
        depth -= 128;
    }
    if (depth > 64) {
        read_expanded = true;
        depth -= 64;
    }
    rdpf.maxdepth = depth;
    rdpf.curdepth = depth;
    assert(depth <= ADDRESS_MAX_BITS);
    rdpf.cw.clear();
    for (uint8_t i=0; i<depth-1; ++i) {
        DPFnode cw;
        is.read((char *)&cw, sizeof(cw));
        rdpf.cw.push_back(cw);
    }
    value_t cfbits = 0;
    is.read((char *)&cfbits, BITBYTES(depth-1));
    rdpf.cfbits = cfbits;
    nbits_t num_leaflevels = read_incremental ? depth : 1;
    value_t leaf_cfbits = 0;
    is.read((char *)&leaf_cfbits, BITBYTES(num_leaflevels));
    rdpf.leaf_cfbits = leaf_cfbits;
    rdpf.li.resize(num_leaflevels);
    for (nbits_t i=0; i<num_leaflevels; ++i) {
        is.read((char *)&rdpf.li[i].leaf_cw,
            sizeof(rdpf.li[i].leaf_cw));
        is.read((char *)&rdpf.li[i].unit_sum_inverse,
            sizeof(rdpf.li[i].unit_sum_inverse));
        is.read((char *)&rdpf.li[i].scaled_sum,
            sizeof(rdpf.li[i].scaled_sum));
        is.read((char *)&rdpf.li[i].scaled_xor,
            sizeof(rdpf.li[i].scaled_xor));
        if (read_expanded) {
            rdpf.li[i].expansion.resize(size_t(1)<<(depth-i));
            is.read((char *)rdpf.li[i].expansion.data(),
                sizeof(rdpf.li[i].expansion[0])<<(depth-i));
        }
    }
    return is;
}

// Write the DPF to the output stream.  If expanded=true, then include
// the expansion _if_ the DPF is itself already expanded.  You can use
// this to write DPFs to files.
template <typename T, nbits_t WIDTH>
T& write_maybe_expanded(T &os, const RDPF<WIDTH> &rdpf,
    bool expanded = true)
{
    os.write((const char *)&rdpf.seed, sizeof(rdpf.seed));
    uint8_t depth = rdpf.maxdepth;
    assert(depth <= ADDRESS_MAX_BITS);
    // If we're writing an expansion, add 64 to depth
    uint8_t expanded_depth = depth;
    bool write_expansion = false;
    if (expanded && rdpf.li[0].expansion.size() ==
            (size_t(1)<<depth)) {
        write_expansion = true;
        expanded_depth += 64;
    }
    // If we're writing an incremental RDPF, add 128 to depth
    if (rdpf.li.size() > 1) {
        expanded_depth += 128;
    }
    os.write((const char *)&expanded_depth, sizeof(expanded_depth));
    for (uint8_t i=0; i<depth-1; ++i) {
        os.write((const char *)&rdpf.cw[i], sizeof(rdpf.cw[i]));
    }
    os.write((const char *)&rdpf.cfbits, BITBYTES(depth-1));
    nbits_t num_leaflevels = rdpf.li.size();
    os.write((const char *)&rdpf.leaf_cfbits, BITBYTES(num_leaflevels));
    for (nbits_t i=0; i<num_leaflevels; ++i) {
        os.write((const char *)&rdpf.li[i].leaf_cw,
            sizeof(rdpf.li[i].leaf_cw));
        os.write((const char *)&rdpf.li[i].unit_sum_inverse,
            sizeof(rdpf.li[i].unit_sum_inverse));
        os.write((const char *)&rdpf.li[i].scaled_sum,
            sizeof(rdpf.li[i].scaled_sum));
        os.write((const char *)&rdpf.li[i].scaled_xor,
            sizeof(rdpf.li[i].scaled_xor));
        if (write_expansion) {
            os.write((const char *)rdpf.li[i].expansion.data(),
                sizeof(rdpf.li[i].expansion[0])<<(depth-i));
        }
    }
    return os;
}

// The ordinary << version never writes the expansion, since this is
// what we use to send DPFs over the network.
template <typename T, nbits_t WIDTH>
T& operator<<(T &os, const RDPF<WIDTH> &rdpf)
{
    return write_maybe_expanded(os, rdpf, false);
}
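// A worked example of the depth-byte encoding above: an incremental,
// expanded RDPF of depth 20 is written with a depth byte of
// 20 + 64 + 128 = 212.  The reader sees 212 > 128, so it sets
// read_incremental and subtracts 128, leaving 84; then 84 > 64, so it
// sets read_expanded and subtracts 64, recovering the true depth 20.
// This is unambiguous because the true depth is at most
// ADDRESS_MAX_BITS, which must be below 64 for the two flag values
// not to collide with real depths.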
// I/O for RDPF Triples

// We never write RDPFTriples over the network, so always write
// the DPF expansions if they're available.
template <typename T, nbits_t WIDTH>
T& operator<<(T &os, const RDPFTriple<WIDTH> &rdpftrip)
{
    write_maybe_expanded(os, rdpftrip.dpf[0], true);
    write_maybe_expanded(os, rdpftrip.dpf[1], true);
    write_maybe_expanded(os, rdpftrip.dpf[2], true);
    nbits_t depth = rdpftrip.dpf[0].depth();
    os.write((const char *)&rdpftrip.as_target.ashare, BITBYTES(depth));
    os.write((const char *)&rdpftrip.xs_target.xshare, BITBYTES(depth));
    return os;
}

template <typename T, nbits_t WIDTH>
T& operator>>(T &is, RDPFTriple<WIDTH> &rdpftrip)
{
    is >> rdpftrip.dpf[0] >> rdpftrip.dpf[1] >> rdpftrip.dpf[2];
    nbits_t depth = rdpftrip.dpf[0].depth();
    rdpftrip.as_target.ashare = 0;
    is.read((char *)&rdpftrip.as_target.ashare, BITBYTES(depth));
    rdpftrip.xs_target.xshare = 0;
    is.read((char *)&rdpftrip.xs_target.xshare, BITBYTES(depth));
    return is;
}

// I/O for RDPF Pairs

// We never write RDPFPairs over the network, so always write
// the DPF expansions if they're available.
template <typename T, nbits_t WIDTH>
T& operator<<(T &os, const RDPFPair<WIDTH> &rdpfpair)
{
    write_maybe_expanded(os, rdpfpair.dpf[0], true);
    write_maybe_expanded(os, rdpfpair.dpf[1], true);
    return os;
}

template <typename T, nbits_t WIDTH>
T& operator>>(T &is, RDPFPair<WIDTH> &rdpfpair)
{
    is >> rdpfpair.dpf[0] >> rdpfpair.dpf[1];
    return is;
}

// Set a DPFnode to zero
static inline void zero(DPFnode &z)
{
    z = _mm_setzero_si128();
}

// Set a LeafNode to zero
template <size_t LWIDTH>
static inline void zero(std::array<DPFnode,LWIDTH> &z)
{
    for (size_t j=0;j<LWIDTH;++j) {
        zero(z[j]);
    }
}

// Set an array of value_t to zero
template <size_t WIDTH>
static inline void zero(std::array<value_t,WIDTH> &z)
{
    for (size_t j=0;j<WIDTH;++j) {
        z[j] = 0;
    }
}

// Expand a level of the binary tree of DPF nodes into the next level
// (nextlevel), without threads.  This just computes the PRGs; the
// correction words are applied by the finalize functions below.  L
// and R are set to the XOR of all the left children and the XOR of
// all the right children, respectively.
template <typename NT>
static inline void expand_level_nothreads(size_t start, size_t end,
    const DPFnode *curlevel, NT *nextlevel, NT &L, NT &R,
    size_t &aes_ops)
{
    // Only touch registers in the inner loop if possible
    NT lL, lR;
    zero(lL);
    zero(lR);
    size_t laes_ops = 0;
    for(size_t i=start;i<end;++i) {
        NT lchild, rchild;
        prgboth(lchild, rchild, curlevel[i], laes_ops);
        lL ^= lchild;
        lR ^= rchild;
        nextlevel[2*i] = lchild;
        nextlevel[2*i+1] = rchild;
    }
    L ^= lL;
    R ^= lR;
    aes_ops += laes_ops;
}

// As above, but possibly use threads, based on the timing tests
// gathered with RDPF_MTGEN_TIMING_1
template <typename NT>
static inline void expand_level(int max_nthreads, nbits_t level,
    const DPFnode *curlevel, NT *nextlevel, NT &L, NT &R,
    size_t &aes_ops)
{
    size_t curlevel_size = (size_t(1)<<level);
    if (max_nthreads == 1 || level < 19) {
        // No threading
        expand_level_nothreads(0, curlevel_size,
            curlevel, nextlevel, L, R, aes_ops);
    } else {
        int nthreads =
            int(ceil(sqrt(double(curlevel_size/6000))));
        if (nthreads > max_nthreads) {
            nthreads = max_nthreads;
        }
        NT tL[nthreads];
        NT tR[nthreads];
        size_t taes_ops[nthreads];
        size_t threadstart = 0;
        size_t threadchunk = curlevel_size / nthreads;
        size_t threadextra = curlevel_size % nthreads;
        boost::asio::thread_pool pool(nthreads);
        for (int t=0;t<nthreads;++t) {
            size_t threadsize = threadchunk + (size_t(t) < threadextra);
            size_t threadend = threadstart + threadsize;
            zero(tL[t]);
            zero(tR[t]);
            taes_ops[t] = 0;
            boost::asio::post(pool,
                [t, &tL, &tR, &taes_ops, threadstart, threadend,
                    curlevel, nextlevel] {
                    expand_level_nothreads(threadstart, threadend,
                        curlevel, nextlevel, tL[t], tR[t], taes_ops[t]);
                });
            threadstart = threadend;
        }
        pool.join();
        for (int t=0;t<nthreads;++t) {
            L ^= tL[t];
            R ^= tR[t];
            aes_ops += taes_ops[t];
        }
    }
}

// Apply the correction words to an expanded non-leaf level of the
// tree, without threads
template <typename NT>
static inline void finalize_nonleaf_layer_nothreads(size_t start,
    size_t end, const DPFnode *curlevel, NT *nextlevel, NT CWL, NT CWR)
{
    for(size_t i=start;i<end;++i) {
        bool flag = get_lsb(curlevel[i]);
        nextlevel[2*i] = xor_if(nextlevel[2*i], CWL, flag);
        nextlevel[2*i+1] = xor_if(nextlevel[2*i+1], CWR, flag);
    }
}

// As above, but possibly use threads
template <typename NT>
static inline void finalize_nonleaf_layer(int max_nthreads,
    nbits_t level, const DPFnode *curlevel, NT *nextlevel, NT CWL,
    NT CWR)
{
    size_t curlevel_size = (size_t(1)<<level);
    if (max_nthreads == 1 || level < 19) {
        // No threading
        finalize_nonleaf_layer_nothreads(0, curlevel_size,
            curlevel, nextlevel, CWL, CWR);
    } else {
        int nthreads =
            int(ceil(sqrt(double(curlevel_size/6000))));
        if (nthreads > max_nthreads) {
            nthreads = max_nthreads;
        }
        size_t threadstart = 0;
        size_t threadchunk = curlevel_size / nthreads;
        size_t threadextra = curlevel_size % nthreads;
        boost::asio::thread_pool pool(nthreads);
        for (int t=0;t<nthreads;++t) {
            size_t threadsize = threadchunk + (size_t(t) < threadextra);
            size_t threadend = threadstart + threadsize;
            boost::asio::post(pool,
                [threadstart, threadend, curlevel, nextlevel,
                    CWL, CWR] {
                    finalize_nonleaf_layer_nothreads(threadstart,
                        threadend, curlevel, nextlevel, CWL, CWR);
                });
            threadstart = threadend;
        }
        pool.join();
    }
}

// Apply the correction words to an expanded leaf level of the tree,
// and compute the sums and XORs of the leaf words needed to create
// the leaf information, without threads
template <size_t WIDTH, typename LN>
static inline void finalize_leaf_layer_nothreads(size_t start,
    size_t end, const DPFnode *curlevel, LN *nextlevel,
    bool save_expansion, LN CWL, LN CWR, value_t &low_sum,
    std::array<value_t,WIDTH> &high_sum,
    std::array<value_t,WIDTH> &high_xor)
{
    value_t llow_sum = 0;
    std::array<value_t,WIDTH> lhigh_sum;
    std::array<value_t,WIDTH> lhigh_xor;
    zero(lhigh_sum);
    zero(lhigh_xor);
    for(size_t i=start;i<end;++i) {
        bool flag = get_lsb(curlevel[i]);
        LN leftchild = xor_if(nextlevel[2*i], CWL, flag);
        LN rightchild = xor_if(nextlevel[2*i+1], CWR, flag);
        if (save_expansion) {
            nextlevel[2*i] = leftchild;
            nextlevel[2*i+1] = rightchild;
        }
        // The low 64 bits of the first node of each leaf hold the
        // unit-vector word; the following WIDTH 64-bit words hold
        // the scaled-vector words
        llow_sum += value_t(_mm_cvtsi128_si64(leftchild[0])) +
            value_t(_mm_cvtsi128_si64(rightchild[0]));
        for (size_t j=0;j<WIDTH;++j) {
            size_t w = j+1;
            value_t lefthigh, righthigh;
            if (w & 1) {
                lefthigh = value_t(_mm_cvtsi128_si64(
                    _mm_srli_si128(leftchild[w/2], 8)));
                righthigh = value_t(_mm_cvtsi128_si64(
                    _mm_srli_si128(rightchild[w/2], 8)));
            } else {
                lefthigh = value_t(_mm_cvtsi128_si64(leftchild[w/2]));
                righthigh = value_t(_mm_cvtsi128_si64(rightchild[w/2]));
            }
            lhigh_sum[j] += (lefthigh + righthigh);
            lhigh_xor[j] ^= (lefthigh ^ righthigh);
        }
    }
    low_sum = llow_sum;
    high_sum = lhigh_sum;
    high_xor = lhigh_xor;
}

// As above, but possibly use threads
template <size_t WIDTH, typename LN>
static inline void finalize_leaf_layer(int max_nthreads, nbits_t level,
    const DPFnode *curlevel, LN *nextlevel, bool save_expansion,
    LN CWL, LN CWR, value_t &low_sum,
    std::array<value_t,WIDTH> &high_sum,
    std::array<value_t,WIDTH> &high_xor)
{
    size_t curlevel_size = (size_t(1)<<level);
    if (max_nthreads == 1 || level < 19) {
        // No threading
        finalize_leaf_layer_nothreads(0, curlevel_size,
            curlevel, nextlevel, save_expansion, CWL, CWR,
            low_sum, high_sum, high_xor);
    } else {
        int nthreads =
            int(ceil(sqrt(double(curlevel_size/6000))));
        if (nthreads > max_nthreads) {
            nthreads = max_nthreads;
        }
        value_t tlow_sum[nthreads];
        std::array<value_t,WIDTH> thigh_sum[nthreads];
        std::array<value_t,WIDTH> thigh_xor[nthreads];
        size_t threadstart = 0;
        size_t threadchunk = curlevel_size / nthreads;
        size_t threadextra = curlevel_size % nthreads;
        boost::asio::thread_pool pool(nthreads);
        for (int t=0;t<nthreads;++t) {
            size_t threadsize = threadchunk + (size_t(t) < threadextra);
            size_t threadend = threadstart + threadsize;
            boost::asio::post(pool,
                [t, &tlow_sum, &thigh_sum, &thigh_xor, threadstart,
                    threadend, curlevel, nextlevel, save_expansion,
                    CWL, CWR] {
                    finalize_leaf_layer_nothreads(threadstart,
                        threadend, curlevel, nextlevel, save_expansion,
                        CWL, CWR, tlow_sum[t], thigh_sum[t],
                        thigh_xor[t]);
                });
            threadstart = threadend;
        }
        pool.join();
        low_sum = 0;
        zero(high_sum);
        zero(high_xor);
        for (int t=0;t<nthreads;++t) {
            low_sum += tlow_sum[t];
            for (size_t j=0;j<WIDTH;++j) {
                high_sum[j] += thigh_sum[t][j];
                high_xor[j] ^= thigh_xor[t][j];
            }
        }
    }
}
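// A small worked example of the work-splitting idiom used in the
// threaded functions above (and in ParallelEval::reduce): with
// curlevel_size = 10 and nthreads = 3, threadchunk = 3 and
// threadextra = 1, so thread 0 gets 3+1 = 4 nodes and threads 1 and
// 2 get 3 nodes each; threadstart advances by each threadsize in
// turn, so the threads cover all 10 nodes exactly once.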
// Create a level of the DPF.  NT will be DPFnode for the levels
// above the leaf level, and LeafNode for a leaf level.  LI is the
// LeafInfo type for leaf levels, and is unused (just an int) for
// non-leaf levels.
template <typename NT, typename LI>
static inline void create_level(MPCTIO &tio, yield_t &yield,
    const DPFnode *curlevel, NT *nextlevel, int player, nbits_t level,
    nbits_t depth, RegBS bs_choice, NT &CW, bool &cfbit,
    bool save_expansion, LI &li, size_t &aes_ops)
{
    // tio.cpu_nthreads() is the maximum number of threads we
    // have available.
    int max_nthreads = tio.cpu_nthreads();

    NT L, R;
    zero(L);
    zero(R);
    // The server doesn't need to do this computation, but it does
    // need to execute mpc_reconstruct_choice so that it sends
    // the AndTriples at the appropriate time.
    if (player < 2) {
#ifdef RDPF_MTGEN_TIMING_1
        if (player == 0) {
            mtgen_timetest_1(level, 0, (1<<23)>>level, curlevel,
                nextlevel, aes_ops);
            size_t niters = 2048;
            if (level > 8) niters = (1<<20)>>level;
            for(int t=1;t<=8;++t) {
                mtgen_timetest_1(level, t, niters, curlevel,
                    nextlevel, aes_ops);
            }
            mtgen_timetest_1(level, 0, (1<<23)>>level, curlevel,
                nextlevel, aes_ops);
        }
#endif
        // Using the timing results gathered above, decide whether
        // to multithread, and if so, how many threads to use.
        expand_level(max_nthreads, level, curlevel, nextlevel, L, R,
            aes_ops);
    }

    // If we're going left (bs_choice = 0), we want the correction
    // word to be the XOR of our right side and our peer's right
    // side; if bs_choice = 1, it should be the XOR of our left side
    // and our peer's left side.

    // We also have to ensure that the flag bits (the lsb) of the
    // side that will end up the same be of course the same, but
    // also that the flag bits (the lsb) of the side that will end
    // up different _must_ be different.  That is, it's not enough
    // for the nodes of the child selected by choice to be different
    // as 128-bit values; they also have to be different in their
    // lsb.

    // This is where we make a small optimization over Appendix C of
    // the Duoram paper: instead of keeping separate correction flag
    // bits for the left and right children, we observe that the low
    // bit of the overall correction word effectively serves as one
    // of those bits, so we just need to store one extra bit per
    // level, not two.  (We arbitrarily choose the one for the right
    // child.)

    // Note that the XOR of our left and right child before and
    // after applying the correction word won't change, since the
    // correction word is applied to either both children or
    // neither, depending on the value of the parent's flag.  So in
    // particular, the XOR of the flag bits won't change, and if our
    // children's flag's XOR equals our peer's children's flag's
    // XOR, then we won't have different flag bits even for the
    // children that have different 128-bit values.

    // So we compute our_parity = lsb(L^R)^player, and we XOR that
    // into the R value in the correction word computation.  At the
    // same time, we exchange these parity values to compute the
    // combined parity, which we store in the DPF.  Then when the
    // DPF is evaluated, if the parent's flag is set, not only apply
    // the correction word to both children, but also apply the
    // (combined) parity bit to just the right child.  Then for
    // unequal nodes (where the flag bit is different), exactly one
    // of the four children (two for P0 and two for P1) will have
    // the parity bit applied, which will set the XOR of the lsb of
    // those four nodes to just L0^R0^L1^R1^our_parity^peer_parity
    // = 1 because everything cancels out except player (for which
    // one player is 0 and the other is 1).
    bool our_parity_bit = get_lsb(L) ^ get_lsb(R) ^ !!player;
    xor_lsb(R, our_parity_bit);

    NT CWL;
    bool peer_parity_bit;
    // Exchange the parities and do mpc_reconstruct_choice at the
    // same time (bundled into the same rounds)
    run_coroutines(yield,
        [&tio, &our_parity_bit, &peer_parity_bit](yield_t &yield) {
            tio.queue_peer(&our_parity_bit, 1);
            yield();
            uint8_t peer_parity_byte;
            tio.recv_peer(&peer_parity_byte, 1);
            peer_parity_bit = peer_parity_byte & 1;
        },
        [&tio, &CWL, &L, &R, bs_choice](yield_t &yield) {
            mpc_reconstruct_choice(tio, yield, CWL, bs_choice, R, L);
        });
    cfbit = our_parity_bit ^ peer_parity_bit;
    CW = CWL;
    NT CWR = CWL;
    xor_lsb(CWR, cfbit);
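    // To spell out the cancellation claimed above: writing
    // our_parity = lsb(L0)^lsb(R0)^0 for P0 and
    // peer_parity = lsb(L1)^lsb(R1)^1 for P1, the XOR of the lsbs of
    // the four corrected children is
    //   lsb(L0)^lsb(R0)^lsb(L1)^lsb(R1)^our_parity^peer_parity
    //   = (lsb(L0)^lsb(R0)^our_parity) ^ (lsb(L1)^lsb(R1)^peer_parity)
    //   = 0 ^ 1 = 1,
    // so the two players' flag bits necessarily differ on the
    // selected child.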
    if (player < 2) {
        // The timing of each iteration of the inner loop is
        // comparable to the above, so just use the same
        // computations.  All of this could be tuned, of course.

        if constexpr (std::is_same_v<NT, DPFnode>) {
            finalize_nonleaf_layer(max_nthreads, level, curlevel,
                nextlevel, CWL, CWR);
        } else {
            // Recall there are four potentially useful vectors that
            // can come out of a DPF:
            // - (single-bit) bitwise unit vector
            // - additive-shared unit vector
            // - XOR-shared scaled unit vector
            // - additive-shared scaled unit vector
            //
            // (No single DPF should be used for both of the first
            // two or both of the last two, though, since they're
            // correlated; you _can_ use one of the first two and
            // one of the last two.)
            //
            // For each 128-bit leaf, the low bit is the flag bit,
            // and we're guaranteed that the flag bits (and indeed
            // the whole 128-bit value) for P0 and P1 are the same
            // for every leaf except the target, and that the flag
            // bits definitely differ for the target (and the other
            // 127 bits are independently random on each side).
            //
            // We divide the 128-bit leaf into a low 64-bit word and
            // a high 64-bit word.  We use the low word for the unit
            // vector and the high word for the scaled vector; this
            // choice is not arbitrary: the flag bit in the low word
            // means that the sum of all the low words (with P1's
            // low words negated) across both P0 and P1 is
            // definitely odd, so we can compute that sum's inverse
            // mod 2^64, and store it now during precomputation.  At
            // evaluation time for the additive-shared unit vector,
            // we will output this global inverse times the low word
            // of each leaf, which will make the sum of all of those
            // values 1.  (This technique replaces the protocol in
            // Appendix D of the Duoram paper.)
            //
            // For the scaled vector, we just have to compute shares
            // of what the scaled vector is a sharing _of_, but
            // that's just XORing or adding all of each party's
            // local high words; no communication needed.

            value_t low_sum;
            const size_t WIDTH = LI::W;
            std::array<value_t,WIDTH> high_sum;
            std::array<value_t,WIDTH> high_xor;
            finalize_leaf_layer(max_nthreads, level, curlevel,
                nextlevel, save_expansion, CWL, CWR, low_sum,
                high_sum, high_xor);

            if (player == 1) {
                low_sum = -low_sum;
                for(size_t j=0; j<WIDTH; ++j) {
                    high_sum[j] = -high_sum[j];
                }
            }
            for(size_t j=0; j<WIDTH; ++j) {
                li.scaled_sum[j].ashare = high_sum[j];
                li.scaled_xor[j].xshare = high_xor[j];
            }
            // Exchange the low_sum values and add them up
            tio.queue_peer(&low_sum, sizeof(low_sum));
            yield();
            value_t peer_low_sum;
            tio.recv_peer(&peer_low_sum, sizeof(peer_low_sum));
            low_sum += peer_low_sum;
            // The combined low_sum had better be odd
            assert(low_sum & 1);
            li.unit_sum_inverse = inverse_value_t(low_sum);
        }
    } else if constexpr (!std::is_same_v<NT, DPFnode>) {
        // The server plays along with the leaf levels' low_sum
        // exchange round
        yield();
    }
}
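// A sketch of why the unit_sum_inverse trick above works: let S be
// the sum over both players of all the low words (P1's negated).
// The non-target leaves are identical on both sides, so they cancel,
// and the target leaf's two low words have differing lsbs, so its
// contribution is odd.  Hence S is odd and invertible mod 2^64.
// Both players store u = S^{-1}; at evaluation time each low word w
// is output as u*w, and those outputs sum to u*S = 1, i.e. additive
// shares of a unit vector at the target.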
// Construct a DPF with the given (XOR-shared) target location, and
// of the given depth, to be used for random-access memory reads and
// writes.  The DPF is constructed collaboratively by P0 and P1,
// with the server P2 helping by providing various kinds of
// correlated randomness, such as MultTriples and AndTriples.
//
// This algorithm is based on Appendix C from the Duoram paper, with a
// small optimization noted below.
template <nbits_t WIDTH>
RDPF<WIDTH>::RDPF(MPCTIO &tio, yield_t &yield, RegXS target,
    nbits_t depth, bool incremental, bool save_expansion)
{
    int player = tio.player();
    size_t &aes_ops = tio.aes_ops();

    // Choose a random seed
    arc4random_buf(&seed, sizeof(seed));
    // Ensure the flag bits (the lsb of each node) are different
    seed = set_lsb(seed, !!player);
    cfbits = 0;
    leaf_cfbits = 0;
    whichhalf = (player == 1);
    maxdepth = depth;
    curdepth = depth;

    // The root level is just the seed
    nbits_t level = 0;
    DPFnode *curlevel = NULL;
    DPFnode *nextlevel = new DPFnode[1];
    nextlevel[0] = seed;

    li.resize(incremental ? depth : 1);

    // Prefetch the right number of nodeselecttriples
    tio.request_nodeselecttriples(yield, incremental ?
        2*depth-1 : depth);

    // Construct each intermediate level
    while(level < depth) {
        LeafNode *leaflevel = NULL;
        if (player < 2) {
            delete[] curlevel;
            curlevel = nextlevel;
            nextlevel = NULL;
            if (save_expansion && (incremental || level == depth-1)) {
                li[depth-1-level].expansion.resize(1<<(level+1));
                leaflevel = li[depth-1-level].expansion.data();
            } else if (incremental || level == depth-1) {
                leaflevel = new LeafNode[1<<(level+1)];
            }
            if (level < depth-1) {
                nextlevel = new DPFnode[1<<(level+1)];
            }
        }
        // Invariant: curlevel has 2^level DPFnode elements; nextlevel
        // has 2^{level+1} DPFnode elements if we're not at the last
        // level, and leaflevel has 2^{level+1} LeafNode elements if we
        // are at a leaf level (the last level always, and all levels if
        // we are making an incremental RDPF).

        // The bit-shared choice bit is bit (depth-level-1) of the
        // XOR-shared target index
        RegBS bs_choice = target.bit(depth-level-1);

        // At each layer, we can create the next internal layer and the
        // leaf layer in parallel coroutines if we're making an
        // incremental RDPF.  If not, exactly one of these coroutines
        // will be created, and we just run that one.
        std::vector<coro_t> coroutines;
        if (level < depth-1) {
            coroutines.emplace_back([this, &tio, curlevel, nextlevel,
                player, level, depth, bs_choice, save_expansion,
                &aes_ops] (yield_t &yield) {
                    DPFnode CW;
                    bool cfbit;
                    // This field is ignored when we're not expanding
                    // to a leaf level, but it needs to be an lvalue
                    // reference.
                    int noleafinfo = 0;
                    create_level(tio, yield, curlevel, nextlevel,
                        player, level, depth, bs_choice, CW, cfbit,
                        save_expansion, noleafinfo, aes_ops);
                    cfbits |= (value_t(cfbit)<<level);
                    if (player < 2) {
                        cw.push_back(CW);
                    }
                });
        }
        if (incremental || level == depth-1) {
            coroutines.emplace_back([this, &tio, curlevel, leaflevel,
                player, level, depth, bs_choice, save_expansion,
                &aes_ops](yield_t &yield) {
                    LeafNode CW;
                    bool cfbit;
                    create_level(tio, yield, curlevel, leaflevel,
                        player, level, depth, bs_choice, CW, cfbit,
                        save_expansion, li[depth-level-1], aes_ops);
                    leaf_cfbits |= (value_t(cfbit)<<(depth-level-1));
                    li[depth-level-1].leaf_cw = CW;
                });
        }
        run_coroutines(yield, coroutines);

        if (!save_expansion) {
            delete[] leaflevel;
        }
        ++level;
    }
    delete[] curlevel;
    delete[] nextlevel;
}

// Get the leaf node for the given input
template <nbits_t WIDTH>
typename RDPF<WIDTH>::LeafNode RDPF<WIDTH>::leaf(address_t input,
    size_t &aes_ops) const
{
    // If we have a precomputed expansion, just use it
    if (li[maxdepth-curdepth].expansion.size()) {
        return li[maxdepth-curdepth].expansion[input];
    }

    DPFnode node = seed;
    for (nbits_t d=0;d<curdepth-1;++d) {
        bit_t dir = !!(input & (address_t(1)<<(curdepth-d-1)));
        node = descend(node, d, dir, aes_ops);
    }
    return descend_to_leaf(node, curdepth-1, input&1, aes_ops);
}

// Expand one leaf layer of the DPF if it's not already expanded
template <nbits_t WIDTH>
void RDPF<WIDTH>::expand_leaf_layer(nbits_t li_index, size_t &aes_ops)
{
    nbits_t depth = maxdepth - li_index;
    size_t num_leaves = size_t(1)<<depth;
    if (li[li_index].expansion.size() == num_leaves) return;
    li[li_index].expansion.resize(num_leaves);
    address_t index = 0;
    address_t lastindex = 0;
    DPFnode *path = new DPFnode[depth];
    path[0] = seed;
    for (nbits_t i=1;i<depth;++i) {
        path[i] = descend(path[i-1], i-1, 0, aes_ops);
    }
    li[li_index].expansion[index++] =
        descend_to_leaf(path[depth-1], depth-1, 0, aes_ops);
    li[li_index].expansion[index++] =
        descend_to_leaf(path[depth-1], depth-1, 1, aes_ops);
    while(index < num_leaves) {
        // Invariant: lastindex and index are both even, and
        // index=lastindex+2
        uint64_t index_xor = index ^ lastindex;
        nbits_t how_many_1_bits = __builtin_popcount(index_xor);
        // If lastindex -> index goes for example from (in binary)
        // 010010110 -> 010011000, then index_xor will be
        // 000001110 and how_many_1_bits will be 3.
        // That indicates that path[depth-3] was a left child, and now
        // we need to change it to a right child by descending right
        // from path[depth-4], and then filling the path after that with
        // left children.
        path[depth-how_many_1_bits] =
            descend(path[depth-how_many_1_bits-1],
                depth-how_many_1_bits-1, 1, aes_ops);
        for (nbits_t i = depth-how_many_1_bits; i < depth-1; ++i) {
            path[i+1] = descend(path[i], i, 0, aes_ops);
        }
        lastindex = index;
        li[li_index].expansion[index++] =
            descend_to_leaf(path[depth-1], depth-1, 0, aes_ops);
        li[li_index].expansion[index++] =
            descend_to_leaf(path[depth-1], depth-1, 1, aes_ops);
    }

    delete[] path;
}

// Expand the DPF if it's not already expanded
//
// This routine is slightly more efficient (except for incremental
// RDPFs) than repeatedly calling StreamEval::next(), but it uses a lot
// more memory.
template <nbits_t WIDTH>
void RDPF<WIDTH>::expand(size_t &aes_ops)
{
    nbits_t num_leaf_layers = li.size();
    for (nbits_t li_index=0; li_index < num_leaf_layers; ++li_index) {
        expand_leaf_layer(li_index, aes_ops);
    }
}
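// A minimal usage sketch (not part of the API): inside a lambda run
// by the MPCTIO event loop, P0 and P1 construct a depth-16,
// non-incremental RDPF, expand it, and read one leaf.  The target
// variable is assumed to already hold a RegXS sharing of the desired
// index.
//
//     RDPF<1> dpf(tio, yield, target, 16, false, false);
//     size_t &aes_ops = tio.aes_ops();
//     dpf.expand(aes_ops);
//     typename RDPF<1>::LeafNode leaf = dpf.leaf(0x1234, aes_ops);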
// Construct three RDPFs of the given depth all with the same randomly
// generated target index.
template <nbits_t WIDTH>
RDPFTriple<WIDTH>::RDPFTriple(MPCTIO &tio, yield_t &yield,
    nbits_t depth, bool incremental, bool save_expansion)
{
    // Pick a random XOR share of the target
    xs_target.randomize(depth);

    // Now create three RDPFs with that target, and also convert the XOR
    // shares of the target to additive shares
    std::vector<coro_t> coroutines;
    for (int i=0;i<3;++i) {
        coroutines.emplace_back([this, &tio, depth, i, incremental,
            save_expansion](yield_t &yield) {
                dpf[i] = RDPF<WIDTH>(tio, yield, xs_target, depth,
                    incremental, save_expansion);
            });
    }
    coroutines.emplace_back([this, &tio, depth](yield_t &yield) {
        mpc_xs_to_as(tio, yield, as_target, xs_target, depth, false);
    });
    run_coroutines(yield, coroutines);
}

template <nbits_t WIDTH>
typename RDPFTriple<WIDTH>::node RDPFTriple<WIDTH>::descend(
    const RDPFTriple<WIDTH>::node &parent, nbits_t parentdepth,
    bit_t whichchild, size_t &aes_ops) const
{
    auto [P0, P1, P2] = parent;
    DPFnode C0, C1, C2;
    C0 = dpf[0].descend(P0, parentdepth, whichchild, aes_ops);
    C1 = dpf[1].descend(P1, parentdepth, whichchild, aes_ops);
    C2 = dpf[2].descend(P2, parentdepth, whichchild, aes_ops);
    return std::make_tuple(C0,C1,C2);
}

template <nbits_t WIDTH>
typename RDPFTriple<WIDTH>::LeafNode RDPFTriple<WIDTH>::descend_to_leaf(
    const RDPFTriple<WIDTH>::node &parent, nbits_t parentdepth,
    bit_t whichchild, size_t &aes_ops) const
{
    auto [P0, P1, P2] = parent;
    typename RDPF<WIDTH>::LeafNode C0, C1, C2;
    C0 = dpf[0].descend_to_leaf(P0, parentdepth, whichchild, aes_ops);
    C1 = dpf[1].descend_to_leaf(P1, parentdepth, whichchild, aes_ops);
    C2 = dpf[2].descend_to_leaf(P2, parentdepth, whichchild, aes_ops);
    return std::make_tuple(C0,C1,C2);
}

template <nbits_t WIDTH>
typename RDPFPair<WIDTH>::node RDPFPair<WIDTH>::descend(
    const RDPFPair<WIDTH>::node &parent, nbits_t parentdepth,
    bit_t whichchild, size_t &aes_ops) const
{
    auto [P0, P1] = parent;
    DPFnode C0, C1;
    C0 = dpf[0].descend(P0, parentdepth, whichchild, aes_ops);
    C1 = dpf[1].descend(P1, parentdepth, whichchild, aes_ops);
    return std::make_tuple(C0,C1);
}

template <nbits_t WIDTH>
typename RDPFPair<WIDTH>::LeafNode RDPFPair<WIDTH>::descend_to_leaf(
    const RDPFPair<WIDTH>::node &parent, nbits_t parentdepth,
    bit_t whichchild, size_t &aes_ops) const
{
    auto [P0, P1] = parent;
    typename RDPF<WIDTH>::LeafNode C0, C1;
    C0 = dpf[0].descend_to_leaf(P0, parentdepth, whichchild, aes_ops);
    C1 = dpf[1].descend_to_leaf(P1, parentdepth, whichchild, aes_ops);
    return std::make_tuple(C0,C1);
}

template <nbits_t WIDTH>
typename RDPF2of3<WIDTH>::node RDPF2of3<WIDTH>::descend(
    const RDPF2of3<WIDTH>::node &parent, nbits_t parentdepth,
    bit_t whichchild, size_t &aes_ops) const
{
    auto [P0, P1] = parent;
    DPFnode C0, C1;
    C0 = dpf0.descend(P0, parentdepth, whichchild, aes_ops);
    C1 = dpf1.descend(P1, parentdepth, whichchild, aes_ops);
    return std::make_tuple(C0,C1);
}

template <nbits_t WIDTH>
typename RDPF2of3<WIDTH>::LeafNode RDPF2of3<WIDTH>::descend_to_leaf(
    const RDPF2of3<WIDTH>::node &parent, nbits_t parentdepth,
    bit_t whichchild, size_t &aes_ops) const
{
    auto [P0, P1] = parent;
    typename RDPF<WIDTH>::LeafNode C0, C1;
    C0 = dpf0.descend_to_leaf(P0, parentdepth, whichchild, aes_ops);
    C1 = dpf1.descend_to_leaf(P1, parentdepth, whichchild, aes_ops);
    return std::make_tuple(C0,C1);
}
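// A minimal usage sketch (not part of the API): P0 and P1 jointly
// create a triple of depth-16 RDPFs sharing one random target, as
// used for Duoram reads and writes.  The additive and XOR sharings
// of the target are then available as members of the triple.
//
//     RDPFTriple<1> trip(tio, yield, 16, false, false);
//     auto &as_target = trip.as_target;   // RegAS share of target
//     auto &xs_target = trip.xs_target;   // RegXS share of target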