// Templated method implementations for rdpf.hpp #include "mpcops.hpp" // Compute the multiplicative inverse of x mod 2^{VALUE_BITS} // This is the same as computing x to the power of // 2^{VALUE_BITS-1}-1. static value_t inverse_value_t(value_t x) { int expon = 1; value_t xe = x; // Invariant: xe = x^(2^expon - 1) mod 2^{VALUE_BITS} // Goal: compute x^(2^{VALUE_BITS-1} - 1) while (expon < VALUE_BITS-1) { xe = xe * xe * x; ++expon; } return xe; } // Create a StreamEval object that will start its output at index start. // It will wrap around to 0 when it hits 2^depth. If use_expansion // is true, then if the DPF has been expanded, just output values // from that. If use_expansion=false or if the DPF has not been // expanded, compute the values on the fly. If xor_offset is non-zero, // then the outputs are actually // DPF(start XOR xor_offset) // DPF((start+1) XOR xor_offset) // DPF((start+2) XOR xor_offset) // etc. template StreamEval::StreamEval(const T &rdpf, address_t start, address_t xor_offset, size_t &aes_ops, bool use_expansion) : rdpf(rdpf), aes_ops(aes_ops), use_expansion(use_expansion), counter_xor_offset(xor_offset) { depth = rdpf.depth(); // Prevent overflow of 1< typename T::node StreamEval::next() { if (use_expansion && rdpf.has_expansion()) { // Just use the precomputed values typename T::node leaf = rdpf.get_expansion(nextindex ^ counter_xor_offset); nextindex = (nextindex + 1) & indexmask; return leaf; } // Invariant: in the first call to next(), nextindex = pathindex. // Otherwise, nextindex = pathindex+1. // Get the XOR of nextindex and pathindex, and strip the low bit. // If nextindex and pathindex are equal, or pathindex is even // and nextindex is the consecutive odd number, index_xor will be 0, // indicating that we don't have to update the path, but just // compute the appropriate leaf given by the low bit of nextindex. // // Otherwise, say for example pathindex is 010010111 and nextindex // is 010011000. Then their XOR is 000001111, and stripping the low // bit yields 000001110, so how_many_1_bits will be 3. // That indicates (typically) that path[depth-3] was a left child, // and now we need to change it to a right child by descending right // from path[depth-4], and then filling the path after that with // left children. // // When we wrap around, however, index_xor will be 111111110 (after // we strip the low bit), and how_many_1_bits will be depth-1, but // the new top child (of the root seed) we have to compute will be a // left, not a right, child. uint64_t index_xor = (nextindex ^ pathindex) & ~1; nbits_t how_many_1_bits = __builtin_popcount(index_xor); if (how_many_1_bits > 0) { // This will almost always be 1, unless we've just wrapped // around from the right subtree back to the left, in which case // it will be 0. bool top_changed_bit = !!(nextindex & (address_t(1) << how_many_1_bits)); bool xor_offset_bit = !!(counter_xor_offset & (address_t(1) << how_many_1_bits)); path[depth-how_many_1_bits] = rdpf.descend(path[depth-how_many_1_bits-1], depth-how_many_1_bits-1, top_changed_bit ^ xor_offset_bit, aes_ops); for (nbits_t i = depth-how_many_1_bits; i < depth-1; ++i) { bool xor_offset_bit = !!(counter_xor_offset & (address_t(1) << (depth-i-1))); path[i+1] = rdpf.descend(path[i], i, xor_offset_bit, aes_ops); } } bool xor_offset_bit = counter_xor_offset & 1; typename T::node leaf = rdpf.descend(path[depth-1], depth-1, (nextindex & 1) ^ xor_offset_bit, aes_ops); pathindex = nextindex; nextindex = (nextindex + 1) & indexmask; return leaf; } // Run the parallel evaluator. The type V is the type of the // accumulator; init should be the "zero" value of the accumulator. // The type W (process) is a lambda type with the signature // (int, address_t, const T::node &) -> V // which will be called like this for each i from 0 to num_evals-1, // across num_thread threads: // value_i = process(t, i, DPF((start+i) XOR xor_offset)) // t is the thread number (0 <= t < num_threads). // The resulting num_evals values will be combined using V's += // operator, first accumulating the values within each thread // (starting with the init value), and then accumulating the totals // from each thread together (again starting with the init value): // // total = init // for each thread t: // accum_t = init // for each accum_i generated by thread t: // accum_t += value_i // total += accum_t template template inline V ParallelEval::reduce(V init, W process) { size_t thread_aes_ops[num_threads]; V accums[num_threads]; boost::asio::thread_pool pool(num_threads); address_t threadstart = start; address_t threadchunk = num_evals / num_threads; address_t threadextra = num_evals % num_threads; nbits_t depth = rdpf.depth(); address_t indexmask = (depth < ADDRESS_MAX_BITS ? ((address_t(1)< T& operator>>(T &is, RDPF &rdpf) { is.read((char *)&rdpf.seed, sizeof(rdpf.seed)); rdpf.whichhalf = get_lsb(rdpf.seed); uint8_t depth; // Add 64 to depth to indicate an expanded RDPF is.read((char *)&depth, sizeof(depth)); bool read_expanded = false; if (depth > 64) { read_expanded = true; depth -= 64; } assert(depth <= ADDRESS_MAX_BITS); rdpf.cw.clear(); for (uint8_t i=0; i T& write_maybe_expanded(T &os, const RDPF &rdpf, bool expanded = true) { os.write((const char *)&rdpf.seed, sizeof(rdpf.seed)); uint8_t depth = rdpf.cw.size(); assert(depth <= ADDRESS_MAX_BITS); // If we're writing an expansion, add 64 to depth uint8_t expanded_depth = depth; bool write_expansion = false; if (expanded && rdpf.expansion.size() == (size_t(1)< T& operator<<(T &os, const RDPF &rdpf) { return write_maybe_expanded(os, rdpf, false); } // I/O for RDPF Triples // We never write RDPFTriples over the network, so always write // the DPF expansions if they're available. template T& operator<<(T &os, const RDPFTriple &rdpftrip) { write_maybe_expanded(os, rdpftrip.dpf[0], true); write_maybe_expanded(os, rdpftrip.dpf[1], true); write_maybe_expanded(os, rdpftrip.dpf[2], true); nbits_t depth = rdpftrip.dpf[0].depth(); os.write((const char *)&rdpftrip.as_target.ashare, BITBYTES(depth)); os.write((const char *)&rdpftrip.xs_target.xshare, BITBYTES(depth)); return os; } template T& operator>>(T &is, RDPFTriple &rdpftrip) { is >> rdpftrip.dpf[0] >> rdpftrip.dpf[1] >> rdpftrip.dpf[2]; nbits_t depth = rdpftrip.dpf[0].depth(); rdpftrip.as_target.ashare = 0; is.read((char *)&rdpftrip.as_target.ashare, BITBYTES(depth)); rdpftrip.xs_target.xshare = 0; is.read((char *)&rdpftrip.xs_target.xshare, BITBYTES(depth)); return is; } // I/O for RDPF Pairs // We never write RDPFPairs over the network, so always write // the DPF expansions if they're available. template T& operator<<(T &os, const RDPFPair &rdpfpair) { write_maybe_expanded(os, rdpfpair.dpf[0], true); write_maybe_expanded(os, rdpfpair.dpf[1], true); return os; } template T& operator>>(T &is, RDPFPair &rdpfpair) { is >> rdpfpair.dpf[0] >> rdpfpair.dpf[1]; return is; } // Construct a DPF with the given (XOR-shared) target location, and // of the given depth, to be used for random-access memory reads and // writes. The DPF is construction collaboratively by P0 and P1, // with the server P2 helping by providing various kinds of // correlated randomness, such as MultTriples and AndTriples. // // This algorithm is based on Appendix C from the Duoram paper, with a // small optimization noted below. template RDPF::RDPF(MPCTIO &tio, yield_t &yield, RegXS target, nbits_t depth, bool save_expansion) { int player = tio.player(); size_t &aes_ops = tio.aes_ops(); // Choose a random seed arc4random_buf(&seed, sizeof(seed)); // Ensure the flag bits (the lsb of each node) are different seed = set_lsb(seed, !!player); cfbits = 0; whichhalf = (player == 1); // The root level is just the seed nbits_t level = 0; DPFnode *curlevel = NULL; DPFnode *nextlevel = new DPFnode[1]; nextlevel[0] = seed; // Construct each intermediate level while(level < depth) { if (player < 2) { delete[] curlevel; curlevel = nextlevel; if (save_expansion && level == depth-1) { expansion.resize(1<>level, curlevel, nextlevel, aes_ops); size_t niters = 2048; if (level > 8) niters = (1<<20)>>level; for(int t=1;t<=8;++t) { mtgen_timetest_1(level, t, niters, curlevel, nextlevel, aes_ops); } mtgen_timetest_1(level, 0, (1<<23)>>level, curlevel, nextlevel, aes_ops); } #endif // Using the timing results gathered above, decide whether // to multithread, and if so, how many threads to use. // tio.cpu_nthreads() is the maximum number we have // available. int max_nthreads = tio.cpu_nthreads(); if (max_nthreads == 1 || level < 19) { // No threading size_t laes_ops = 0; for(size_t i=0;i max_nthreads) { nthreads = max_nthreads; } DPFnode tL[nthreads]; DPFnode tR[nthreads]; size_t taes_ops[nthreads]; size_t threadstart = 0; size_t threadchunk = curlevel_size / nthreads; size_t threadextra = curlevel_size % nthreads; boost::asio::thread_pool pool(nthreads); for (int t=0;t max_nthreads) { nthreads = max_nthreads; } size_t threadstart = 0; size_t threadchunk = curlevel_size / nthreads; size_t threadextra = curlevel_size % nthreads; boost::asio::thread_pool pool(nthreads); for (int t=0;t max_nthreads) { nthreads = max_nthreads; } value_t tlow_sum[nthreads]; value_t thigh_sum[nthreads]; value_t thigh_xor[nthreads]; size_t threadstart = 0; size_t threadchunk = curlevel_size / nthreads; size_t threadextra = curlevel_size % nthreads; boost::asio::thread_pool pool(nthreads); for (int t=0;t DPFnode RDPF::leaf(address_t input, size_t &aes_ops) const { // If we have a precomputed expansion, just use it if (expansion.size()) { return expansion[input]; } nbits_t totdepth = depth(); DPFnode node = seed; for (nbits_t d=0;d void RDPF::expand(size_t &aes_ops) { nbits_t depth = this->depth(); size_t num_leaves = size_t(1)< index goes for example from (in binary) // 010010110 -> 010011000, then index_xor will be // 000001110 and how_many_1_bits will be 3. // That indicates that path[depth-3] was a left child, and now // we need to change it to a right child by descending right // from path[depth-4], and then filling the path after that with // left children. path[depth-how_many_1_bits] = descend(path[depth-how_many_1_bits-1], depth-how_many_1_bits-1, 1, aes_ops); for (nbits_t i = depth-how_many_1_bits; i < depth-1; ++i) { path[i+1] = descend(path[i], i, 0, aes_ops); } lastindex = index; expansion[index++] = descend(path[depth-1], depth-1, 0, aes_ops); expansion[index++] = descend(path[depth-1], depth-1, 1, aes_ops); } delete[] path; } // Construct three RDPFs of the given depth all with the same randomly // generated target index. template RDPFTriple::RDPFTriple(MPCTIO &tio, yield_t &yield, nbits_t depth, bool save_expansion) { // Pick a random XOR share of the target xs_target.randomize(depth); // Now create three RDPFs with that target, and also convert the XOR // shares of the target to additive shares std::vector coroutines; for (int i=0;i<3;++i) { coroutines.emplace_back( [this, &tio, depth, i, save_expansion](yield_t &yield) { dpf[i] = RDPF(tio, yield, xs_target, depth, save_expansion); }); } coroutines.emplace_back( [this, &tio, depth](yield_t &yield) { mpc_xs_to_as(tio, yield, as_target, xs_target, depth, false); }); run_coroutines(yield, coroutines); } template typename RDPFTriple::node RDPFTriple::descend( const RDPFTriple::node &parent, nbits_t parentdepth, bit_t whichchild, size_t &aes_ops) const { auto [P0, P1, P2] = parent; DPFnode C0, C1, C2; C0 = dpf[0].descend(P0, parentdepth, whichchild, aes_ops); C1 = dpf[1].descend(P1, parentdepth, whichchild, aes_ops); C2 = dpf[2].descend(P2, parentdepth, whichchild, aes_ops); return std::make_tuple(C0,C1,C2); } template typename RDPFPair::node RDPFPair::descend( const RDPFPair::node &parent, nbits_t parentdepth, bit_t whichchild, size_t &aes_ops) const { auto [P0, P1] = parent; DPFnode C0, C1; C0 = dpf[0].descend(P0, parentdepth, whichchild, aes_ops); C1 = dpf[1].descend(P1, parentdepth, whichchild, aes_ops); return std::make_tuple(C0,C1); }