|
@@ -191,7 +191,7 @@ inline typename RDPF<WIDTH>::LeafNode RDPF<WIDTH>::descend_to_leaf(
|
|
{
|
|
{
|
|
typename RDPF<WIDTH>::LeafNode prgout;
|
|
typename RDPF<WIDTH>::LeafNode prgout;
|
|
bool flag = get_lsb(parent);
|
|
bool flag = get_lsb(parent);
|
|
- prgleaf(prgout, parent, whichchild, aes_ops);
|
|
+ prg(prgout, parent, whichchild, aes_ops);
|
|
if (flag) {
|
|
if (flag) {
|
|
LeafNode CW = li[0].leaf_cw;
|
|
LeafNode CW = li[0].leaf_cw;
|
|
LeafNode CWR = CW;
|
|
LeafNode CWR = CW;
|
|
@@ -335,6 +335,463 @@ T& operator>>(T &is, RDPFPair<WIDTH> &rdpfpair)
|
|
return is;
|
|
return is;
|
|
}
|
|
}
|
|
|
|
|
|
|
|
+// Set a DPFnode to zero
|
|
|
|
+static inline void zero(DPFnode &z)
|
|
|
|
+{
|
|
|
|
+ z = _mm_setzero_si128();
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+// Set a LeafNode to zero
|
|
|
|
+template <size_t LWIDTH>
|
|
|
|
+static inline void zero(std::array<DPFnode,LWIDTH> &z)
|
|
|
|
+{
|
|
|
|
+ for (size_t j=0;j<LWIDTH;++j) {
|
|
|
|
+ zero(z[j]);
|
|
|
|
+ }
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+// Set an array of value_r to zero
|
|
|
|
+template <size_t WIDTH>
|
|
|
|
+static inline void zero(std::array<value_t,WIDTH> &z)
|
|
|
|
+{
|
|
|
|
+ for (size_t j=0;j<WIDTH;++j) {
|
|
|
|
+ z[j] = 0;
|
|
|
|
+ }
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+// Expand a level of the RDPF into the next level without threads. This
|
|
|
|
+// just computes the PRGs without computing or applying the correction
|
|
|
|
+// words. L and R will be set to the XORs of the left children and the
|
|
|
|
+// XORs of the right children respectively. NT will be LeafNode if we
|
|
|
|
+// are expanding into a leaf level, DPFnode if not.
|
|
|
|
+template <typename NT>
|
|
|
|
+static inline void expand_level_nothreads(size_t start, size_t end,
|
|
|
|
+ const DPFnode *curlevel, NT *nextlevel, NT &L, NT &R,
|
|
|
|
+ size_t &aes_ops)
|
|
|
|
+{
|
|
|
|
+ // Only touch registers in the inner loop if possible
|
|
|
|
+ NT lL, lR;
|
|
|
|
+ zero(lL);
|
|
|
|
+ zero(lR);
|
|
|
|
+ size_t laes_ops = 0;
|
|
|
|
+ for(size_t i=start;i<end;++i) {
|
|
|
|
+ NT lchild, rchild;
|
|
|
|
+ prgboth(lchild, rchild, curlevel[i], laes_ops);
|
|
|
|
+ lL ^= lchild;
|
|
|
|
+ lR ^= rchild;
|
|
|
|
+ nextlevel[2*i] = lchild;
|
|
|
|
+ nextlevel[2*i+1] = rchild;
|
|
|
|
+ }
|
|
|
|
+ L = lL;
|
|
|
|
+ R = lR;
|
|
|
|
+ aes_ops += laes_ops;
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+// As above, but possibly use threads, based on the RDPF_MTGEN_TIMING_1
|
|
|
|
+// timing benchmarks
|
|
|
|
+template <typename NT>
|
|
|
|
+static inline void expand_level(int max_nthreads, nbits_t level,
|
|
|
|
+ const DPFnode *curlevel, NT *nextlevel, NT &L, NT &R,
|
|
|
|
+ size_t &aes_ops)
|
|
|
|
+{
|
|
|
|
+ size_t curlevel_size = (size_t(1)<<level);
|
|
|
|
+ if (max_nthreads == 1 || level < 19) {
|
|
|
|
+ // No threading
|
|
|
|
+ expand_level_nothreads(0, curlevel_size,
|
|
|
|
+ curlevel, nextlevel, L, R, aes_ops);
|
|
|
|
+ } else {
|
|
|
|
+ int nthreads =
|
|
|
|
+ int(ceil(sqrt(double(curlevel_size/6000))));
|
|
|
|
+ if (nthreads > max_nthreads) {
|
|
|
|
+ nthreads = max_nthreads;
|
|
|
|
+ }
|
|
|
|
+ NT tL[nthreads];
|
|
|
|
+ NT tR[nthreads];
|
|
|
|
+ size_t taes_ops[nthreads];
|
|
|
|
+ size_t threadstart = 0;
|
|
|
|
+ size_t threadchunk = curlevel_size / nthreads;
|
|
|
|
+ size_t threadextra = curlevel_size % nthreads;
|
|
|
|
+ boost::asio::thread_pool pool(nthreads);
|
|
|
|
+ for (int t=0;t<nthreads;++t) {
|
|
|
|
+ size_t threadsize = threadchunk + (size_t(t) < threadextra);
|
|
|
|
+ size_t threadend = threadstart + threadsize;
|
|
|
|
+ taes_ops[t] = 0;
|
|
|
|
+ boost::asio::post(pool,
|
|
|
|
+ [t, &tL, &tR, &taes_ops, threadstart, threadend,
|
|
|
|
+ &curlevel, &nextlevel] {
|
|
|
|
+ expand_level_nothreads(threadstart, threadend,
|
|
|
|
+ curlevel, nextlevel, tL[t], tR[t], taes_ops[t]);
|
|
|
|
+ });
|
|
|
|
+ threadstart = threadend;
|
|
|
|
+ }
|
|
|
|
+ pool.join();
|
|
|
|
+ // Again work on registers as much as possible
|
|
|
|
+ NT lL, lR;
|
|
|
|
+ zero(lL);
|
|
|
|
+ zero(lR);
|
|
|
|
+ size_t laes_ops = 0;
|
|
|
|
+ for (int t=0;t<nthreads;++t) {
|
|
|
|
+ lL ^= tL[t];
|
|
|
|
+ lR ^= tR[t];
|
|
|
|
+ laes_ops += taes_ops[t];
|
|
|
|
+ }
|
|
|
|
+ L = lL;
|
|
|
|
+ R = lR;
|
|
|
|
+ aes_ops += laes_ops;
|
|
|
|
+ }
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+// Apply the correction words to an expanded non-leaf level (nextlevel),
|
|
|
|
+// based on the flag bits in curlevel. This version does not use
|
|
|
|
+// threads.
|
|
|
|
+static inline void finalize_nonleaf_layer_nothreads(size_t start,
|
|
|
|
+ size_t end, const DPFnode *curlevel, DPFnode *nextlevel,
|
|
|
|
+ DPFnode CWL, DPFnode CWR)
|
|
|
|
+{
|
|
|
|
+ for(size_t i=start;i<end;++i) {
|
|
|
|
+ bool flag = get_lsb(curlevel[i]);
|
|
|
|
+ nextlevel[2*i] = xor_if(nextlevel[2*i], CWL, flag);
|
|
|
|
+ nextlevel[2*i+1] = xor_if(nextlevel[2*i+1], CWR, flag);
|
|
|
|
+ }
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+// As above, but possibly use threads, based on the RDPF_MTGEN_TIMING_1
|
|
|
|
+// timing benchmarks. The timing of each iteration of the inner loop is
|
|
|
|
+// comparable to the above, so just use the same computations. All of
|
|
|
|
+// this could be tuned, of course.
|
|
|
|
+static inline void finalize_nonleaf_layer(int max_nthreads, nbits_t level,
|
|
|
|
+ const DPFnode *curlevel, DPFnode *nextlevel, DPFnode CWL,
|
|
|
|
+ DPFnode CWR)
|
|
|
|
+{
|
|
|
|
+ size_t curlevel_size = (size_t(1)<<level);
|
|
|
|
+ if (max_nthreads == 1 || level < 19) {
|
|
|
|
+ // No threading
|
|
|
|
+ finalize_nonleaf_layer_nothreads(0, curlevel_size,
|
|
|
|
+ curlevel, nextlevel, CWL, CWR);
|
|
|
|
+ } else {
|
|
|
|
+ int nthreads =
|
|
|
|
+ int(ceil(sqrt(double(curlevel_size/6000))));
|
|
|
|
+ if (nthreads > max_nthreads) {
|
|
|
|
+ nthreads = max_nthreads;
|
|
|
|
+ }
|
|
|
|
+ size_t threadstart = 0;
|
|
|
|
+ size_t threadchunk = curlevel_size / nthreads;
|
|
|
|
+ size_t threadextra = curlevel_size % nthreads;
|
|
|
|
+ boost::asio::thread_pool pool(nthreads);
|
|
|
|
+ for (int t=0;t<nthreads;++t) {
|
|
|
|
+ size_t threadsize = threadchunk + (size_t(t) < threadextra);
|
|
|
|
+ size_t threadend = threadstart + threadsize;
|
|
|
|
+ boost::asio::post(pool,
|
|
|
|
+ [threadstart, threadend, CWL, CWR,
|
|
|
|
+ &curlevel, &nextlevel] {
|
|
|
|
+ finalize_nonleaf_layer_nothreads(threadstart, threadend,
|
|
|
|
+ curlevel, nextlevel, CWL, CWR);
|
|
|
|
+ });
|
|
|
|
+ threadstart = threadend;
|
|
|
|
+ }
|
|
|
|
+ pool.join();
|
|
|
|
+ }
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+// Finalize a leaf layer. This applies the correction words, and
|
|
|
|
+// computes the low and high sums and XORs. This version does not use
|
|
|
|
+// threads. You can pass save_expansion = false here if you don't need
|
|
|
|
+// to save the expansion. LN is a LeafNode.
|
|
|
|
+template <size_t WIDTH, typename LN>
|
|
|
|
+static inline void finalize_leaf_layer_nothreads(size_t start,
|
|
|
|
+ size_t end, const DPFnode *curlevel, LN *nextlevel,
|
|
|
|
+ bool save_expansion, LN CWL, LN CWR, value_t &low_sum,
|
|
|
|
+ std::array<value_t,WIDTH> &high_sum,
|
|
|
|
+ std::array<value_t,WIDTH> &high_xor)
|
|
|
|
+{
|
|
|
|
+ value_t llow_sum = 0;
|
|
|
|
+ std::array<value_t,WIDTH> lhigh_sum;
|
|
|
|
+ std::array<value_t,WIDTH> lhigh_xor;
|
|
|
|
+ zero(lhigh_sum);
|
|
|
|
+ zero(lhigh_xor);
|
|
|
|
+ for(size_t i=start;i<end;++i) {
|
|
|
|
+ bool flag = get_lsb(curlevel[i]);
|
|
|
|
+ LN leftchild = xor_if(nextlevel[2*i], CWL, flag);
|
|
|
|
+ LN rightchild = xor_if(nextlevel[2*i+1], CWR, flag);
|
|
|
|
+ if (save_expansion) {
|
|
|
|
+ nextlevel[2*i] = leftchild;
|
|
|
|
+ nextlevel[2*i+1] = rightchild;
|
|
|
|
+ }
|
|
|
|
+ value_t leftlow = value_t(_mm_cvtsi128_si64x(leftchild[0]));
|
|
|
|
+ value_t rightlow = value_t(_mm_cvtsi128_si64x(rightchild[0]));
|
|
|
|
+ value_t lefthigh =
|
|
|
|
+ value_t(_mm_cvtsi128_si64x(_mm_srli_si128(leftchild[0],8)));
|
|
|
|
+ value_t righthigh =
|
|
|
|
+ value_t(_mm_cvtsi128_si64x(_mm_srli_si128(rightchild[0],8)));
|
|
|
|
+ llow_sum += (leftlow + rightlow);
|
|
|
|
+ lhigh_sum[0] += (lefthigh + righthigh);
|
|
|
|
+ lhigh_xor[0] ^= (lefthigh ^ righthigh);
|
|
|
|
+ size_t w = 0;
|
|
|
|
+ for (size_t j=1; j<WIDTH; j+=2) {
|
|
|
|
+ ++w;
|
|
|
|
+ value_t leftlow = value_t(_mm_cvtsi128_si64x(leftchild[w]));
|
|
|
|
+ value_t rightlow = value_t(_mm_cvtsi128_si64x(rightchild[w]));
|
|
|
|
+ value_t lefthigh =
|
|
|
|
+ value_t(_mm_cvtsi128_si64x(_mm_srli_si128(leftchild[w],8)));
|
|
|
|
+ value_t righthigh =
|
|
|
|
+ value_t(_mm_cvtsi128_si64x(_mm_srli_si128(rightchild[w],8)));
|
|
|
|
+ lhigh_sum[j] += (leftlow + rightlow);
|
|
|
|
+ lhigh_xor[j] ^= (leftlow ^ rightlow);
|
|
|
|
+ if (j+1 < WIDTH) {
|
|
|
|
+ lhigh_sum[j+1] += (lefthigh + righthigh);
|
|
|
|
+ lhigh_xor[j+1] ^= (lefthigh ^ righthigh);
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ low_sum = llow_sum;
|
|
|
|
+ high_sum = lhigh_sum;
|
|
|
|
+ high_xor = lhigh_xor;
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+// As above, but possibly use threads, based on the RDPF_MTGEN_TIMING_1
|
|
|
|
+// timing benchmarks. The timing of each iteration of the inner loop is
|
|
|
|
+// comparable to the above, so just use the same computations. All of
|
|
|
|
+// this could be tuned, of course.
|
|
|
|
+template <size_t WIDTH, typename LN>
|
|
|
|
+static inline void finalize_leaf_layer(int max_nthreads, nbits_t level,
|
|
|
|
+ const DPFnode *curlevel, LN *nextlevel, bool save_expansion,
|
|
|
|
+ LN CWL, LN CWR, value_t &low_sum,
|
|
|
|
+ std::array<value_t,WIDTH> &high_sum,
|
|
|
|
+ std::array<value_t,WIDTH> &high_xor)
|
|
|
|
+{
|
|
|
|
+ size_t curlevel_size = (size_t(1)<<level);
|
|
|
|
+ if (max_nthreads == 1 || level < 19) {
|
|
|
|
+ // No threading
|
|
|
|
+ finalize_leaf_layer_nothreads(0, curlevel_size,
|
|
|
|
+ curlevel, nextlevel, save_expansion, CWL, CWR,
|
|
|
|
+ low_sum, high_sum, high_xor);
|
|
|
|
+ } else {
|
|
|
|
+ int nthreads =
|
|
|
|
+ int(ceil(sqrt(double(curlevel_size/6000))));
|
|
|
|
+ if (nthreads > max_nthreads) {
|
|
|
|
+ nthreads = max_nthreads;
|
|
|
|
+ }
|
|
|
|
+ value_t tlow_sum[nthreads];
|
|
|
|
+ std::array<value_t,WIDTH> thigh_sum[nthreads];
|
|
|
|
+ std::array<value_t,WIDTH> thigh_xor[nthreads];
|
|
|
|
+ size_t threadstart = 0;
|
|
|
|
+ size_t threadchunk = curlevel_size / nthreads;
|
|
|
|
+ size_t threadextra = curlevel_size % nthreads;
|
|
|
|
+ boost::asio::thread_pool pool(nthreads);
|
|
|
|
+ for (int t=0;t<nthreads;++t) {
|
|
|
|
+ size_t threadsize = threadchunk + (size_t(t) < threadextra);
|
|
|
|
+ size_t threadend = threadstart + threadsize;
|
|
|
|
+ boost::asio::post(pool,
|
|
|
|
+ [t, &tlow_sum, &thigh_sum, &thigh_xor, threadstart, threadend,
|
|
|
|
+ &curlevel, &nextlevel, CWL, CWR, save_expansion] {
|
|
|
|
+ finalize_leaf_layer_nothreads(threadstart, threadend,
|
|
|
|
+ curlevel, nextlevel, save_expansion, CWL, CWR,
|
|
|
|
+ tlow_sum[t], thigh_sum[t], thigh_xor[t]);
|
|
|
|
+ });
|
|
|
|
+ threadstart = threadend;
|
|
|
|
+ }
|
|
|
|
+ pool.join();
|
|
|
|
+ low_sum = 0;
|
|
|
|
+ zero(high_sum);
|
|
|
|
+ zero(high_xor);
|
|
|
|
+ for (int t=0;t<nthreads;++t) {
|
|
|
|
+ low_sum += tlow_sum[t];
|
|
|
|
+ high_sum += thigh_sum[t];
|
|
|
|
+ high_xor ^= thigh_xor[t];
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+// Create one level of the RDPF. NT will be as above: LeafNode if we
|
|
|
|
+// are expanding into a leaf level, DPFnode if not. LI will be LeafInfo
|
|
|
|
+// if we are expanding into a leaf level, and it is unused otherwise.
|
|
|
|
+template<typename NT, typename LI>
|
|
|
|
+static inline void create_level(MPCTIO &tio, yield_t &yield,
|
|
|
|
+ const DPFnode *curlevel, NT *nextlevel,
|
|
|
|
+ int player, nbits_t level, nbits_t depth, RegBS bs_choice, NT &CW,
|
|
|
|
+ bool &cfbit, bool save_expansion, LI &li, size_t &aes_ops)
|
|
|
|
+{
|
|
|
|
+ // tio.cpu_nthreads() is the maximum number of threads we
|
|
|
|
+ // have available.
|
|
|
|
+ int max_nthreads = tio.cpu_nthreads();
|
|
|
|
+
|
|
|
|
+ NT L, R;
|
|
|
|
+ zero(L);
|
|
|
|
+ zero(R);
|
|
|
|
+ // The server doesn't need to do this computation, but it does
|
|
|
|
+ // need to execute mpc_reconstruct_choice so that it sends
|
|
|
|
+ // the AndTriples at the appropriate time.
|
|
|
|
+ if (player < 2) {
|
|
|
|
+#ifdef RDPF_MTGEN_TIMING_1
|
|
|
|
+ if (player == 0) {
|
|
|
|
+ mtgen_timetest_1(level, 0, (1<<23)>>level, curlevel,
|
|
|
|
+ nextlevel, aes_ops);
|
|
|
|
+ size_t niters = 2048;
|
|
|
|
+ if (level > 8) niters = (1<<20)>>level;
|
|
|
|
+ for(int t=1;t<=8;++t) {
|
|
|
|
+ mtgen_timetest_1(level, t, niters, curlevel,
|
|
|
|
+ nextlevel, aes_ops);
|
|
|
|
+ }
|
|
|
|
+ mtgen_timetest_1(level, 0, (1<<23)>>level, curlevel,
|
|
|
|
+ nextlevel, aes_ops);
|
|
|
|
+ }
|
|
|
|
+#endif
|
|
|
|
+ // Using the timing results gathered above, decide whether
|
|
|
|
+ // to multithread, and if so, how many threads to use.
|
|
|
|
+ expand_level(max_nthreads, level, curlevel, nextlevel,
|
|
|
|
+ L, R, aes_ops);
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ // If we're going left (bs_choice = 0), we want the correction
|
|
|
|
+ // word to be the XOR of our right side and our peer's right
|
|
|
|
+ // side; if bs_choice = 1, it should be the XOR or our left side
|
|
|
|
+ // and our peer's left side.
|
|
|
|
+
|
|
|
|
+ // We also have to ensure that the flag bits (the lsb) of the
|
|
|
|
+ // side that will end up the same be of course the same, but
|
|
|
|
+ // also that the flag bits (the lsb) of the side that will end
|
|
|
|
+ // up different _must_ be different. That is, it's not enough
|
|
|
|
+ // for the nodes of the child selected by choice to be different
|
|
|
|
+ // as 128-bit values; they also have to be different in their
|
|
|
|
+ // lsb.
|
|
|
|
+
|
|
|
|
+ // This is where we make a small optimization over Appendix C of
|
|
|
|
+ // the Duoram paper: instead of keeping separate correction flag
|
|
|
|
+ // bits for the left and right children, we observe that the low
|
|
|
|
+ // bit of the overall correction word effectively serves as one
|
|
|
|
+ // of those bits, so we just need to store one extra bit per
|
|
|
|
+ // level, not two. (We arbitrarily choose the one for the right
|
|
|
|
+ // child.)
|
|
|
|
+
|
|
|
|
+ // Note that the XOR of our left and right child before and
|
|
|
|
+ // after applying the correction word won't change, since the
|
|
|
|
+ // correction word is applied to either both children or
|
|
|
|
+ // neither, depending on the value of the parent's flag. So in
|
|
|
|
+ // particular, the XOR of the flag bits won't change, and if our
|
|
|
|
+ // children's flag's XOR equals our peer's children's flag's
|
|
|
|
+ // XOR, then we won't have different flag bits even for the
|
|
|
|
+ // children that have different 128-bit values.
|
|
|
|
+
|
|
|
|
+ // So we compute our_parity = lsb(L^R)^player, and we XOR that
|
|
|
|
+ // into the R value in the correction word computation. At the
|
|
|
|
+ // same time, we exchange these parity values to compute the
|
|
|
|
+ // combined parity, which we store in the DPF. Then when the
|
|
|
|
+ // DPF is evaluated, if the parent's flag is set, not only apply
|
|
|
|
+ // the correction work to both children, but also apply the
|
|
|
|
+ // (combined) parity bit to just the right child. Then for
|
|
|
|
+ // unequal nodes (where the flag bit is different), exactly one
|
|
|
|
+ // of the four children (two for P0 and two for P1) will have
|
|
|
|
+ // the parity bit applied, which will set the XOR of the lsb of
|
|
|
|
+ // those four nodes to just L0^R0^L1^R1^our_parity^peer_parity
|
|
|
|
+ // = 1 because everything cancels out except player (for which
|
|
|
|
+ // one player is 0 and the other is 1).
|
|
|
|
+
|
|
|
|
+ bool our_parity_bit = get_lsb(L) ^ get_lsb(R) ^ !!player;
|
|
|
|
+ xor_lsb(R, our_parity_bit);
|
|
|
|
+
|
|
|
|
+ NT CWL;
|
|
|
|
+ bool peer_parity_bit;
|
|
|
|
+ // Exchange the parities and do mpc_reconstruct_choice at the
|
|
|
|
+ // same time (bundled into the same rounds)
|
|
|
|
+ run_coroutines(yield,
|
|
|
|
+ [&tio, &our_parity_bit, &peer_parity_bit](yield_t &yield) {
|
|
|
|
+ tio.queue_peer(&our_parity_bit, 1);
|
|
|
|
+ yield();
|
|
|
|
+ uint8_t peer_parity_byte;
|
|
|
|
+ tio.recv_peer(&peer_parity_byte, 1);
|
|
|
|
+ peer_parity_bit = peer_parity_byte & 1;
|
|
|
|
+ },
|
|
|
|
+ [&tio, &CWL, &L, &R, bs_choice](yield_t &yield) {
|
|
|
|
+ mpc_reconstruct_choice(tio, yield, CWL, bs_choice, R, L);
|
|
|
|
+ });
|
|
|
|
+ cfbit = our_parity_bit ^ peer_parity_bit;
|
|
|
|
+ CW = CWL;
|
|
|
|
+ NT CWR = CWL;
|
|
|
|
+ xor_lsb(CWR, cfbit);
|
|
|
|
+ if (player < 2) {
|
|
|
|
+ // The timing of each iteration of the inner loop is
|
|
|
|
+ // comparable to the above, so just use the same
|
|
|
|
+ // computations. All of this could be tuned, of course.
|
|
|
|
+
|
|
|
|
+ if constexpr (std::is_same_v<NT, DPFnode>) {
|
|
|
|
+ finalize_nonleaf_layer(max_nthreads, level, curlevel,
|
|
|
|
+ nextlevel, CWL, CWR);
|
|
|
|
+ } else {
|
|
|
|
+ // Recall there are four potentially useful vectors that
|
|
|
|
+ // can come out of a DPF:
|
|
|
|
+ // - (single-bit) bitwise unit vector
|
|
|
|
+ // - additive-shared unit vector
|
|
|
|
+ // - XOR-shared scaled unit vector
|
|
|
|
+ // - additive-shared scaled unit vector
|
|
|
|
+ //
|
|
|
|
+ // (No single DPF should be used for both of the first
|
|
|
|
+ // two or both of the last two, though, since they're
|
|
|
|
+ // correlated; you _can_ use one of the first two and
|
|
|
|
+ // one of the last two.)
|
|
|
|
+ //
|
|
|
|
+ // For each 128-bit leaf, the low bit is the flag bit,
|
|
|
|
+ // and we're guaranteed that the flag bits (and indeed
|
|
|
|
+ // the whole 128-bit value) for P0 and P1 are the same
|
|
|
|
+ // for every leaf except the target, and that the flag
|
|
|
|
+ // bits definitely differ for the target (and the other
|
|
|
|
+ // 127 bits are independently random on each side).
|
|
|
|
+ //
|
|
|
|
+ // We divide the 128-bit leaf into a low 64-bit word and
|
|
|
|
+ // a high 64-bit word. We use the low word for the unit
|
|
|
|
+ // vector and the high word for the scaled vector; this
|
|
|
|
+ // choice is not arbitrary: the flag bit in the low word
|
|
|
|
+ // means that the sum of all the low words (with P1's
|
|
|
|
+ // low words negated) across both P0 and P1 is
|
|
|
|
+ // definitely odd, so we can compute that sum's inverse
|
|
|
|
+ // mod 2^64, and store it now during precomputation. At
|
|
|
|
+ // evaluation time for the additive-shared unit vector,
|
|
|
|
+ // we will output this global inverse times the low word
|
|
|
|
+ // of each leaf, which will make the sum of all of those
|
|
|
|
+ // values 1. (This technique replaces the protocol in
|
|
|
|
+ // Appendix D of the Duoram paper.)
|
|
|
|
+ //
|
|
|
|
+ // For the scaled vector, we just have to compute shares
|
|
|
|
+ // of what the scaled vector is a sharing _of_, but
|
|
|
|
+ // that's just XORing or adding all of each party's
|
|
|
|
+ // local high words; no communication needed.
|
|
|
|
+
|
|
|
|
+ value_t low_sum;
|
|
|
|
+ const size_t WIDTH = LI::W;
|
|
|
|
+ std::array<value_t,WIDTH> high_sum;
|
|
|
|
+ std::array<value_t,WIDTH> high_xor;
|
|
|
|
+ finalize_leaf_layer(max_nthreads, level, curlevel,
|
|
|
|
+ nextlevel, save_expansion, CWL, CWR, low_sum, high_sum,
|
|
|
|
+ high_xor);
|
|
|
|
+
|
|
|
|
+ if (player == 1) {
|
|
|
|
+ low_sum = -low_sum;
|
|
|
|
+ for(size_t j=0; j<WIDTH; ++j) {
|
|
|
|
+ high_sum[j] = -high_sum[j];
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ for(size_t j=0; j<WIDTH; ++j) {
|
|
|
|
+ li.scaled_sum[j].ashare = high_sum[j];
|
|
|
|
+ li.scaled_xor[j].xshare = high_xor[j];
|
|
|
|
+ }
|
|
|
|
+ // Exchange low_sum and add them up
|
|
|
|
+ tio.queue_peer(&low_sum, sizeof(low_sum));
|
|
|
|
+ yield();
|
|
|
|
+ value_t peer_low_sum;
|
|
|
|
+ tio.recv_peer(&peer_low_sum, sizeof(peer_low_sum));
|
|
|
|
+ low_sum += peer_low_sum;
|
|
|
|
+ // The low_sum had better be odd
|
|
|
|
+ assert(low_sum & 1);
|
|
|
|
+ li.unit_sum_inverse = inverse_value_t(low_sum);
|
|
|
|
+ }
|
|
|
|
+ } else if (level == depth-1) {
|
|
|
|
+ yield();
|
|
|
|
+ }
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+
|
|
// Construct a DPF with the given (XOR-shared) target location, and
|
|
// Construct a DPF with the given (XOR-shared) target location, and
|
|
// of the given depth, to be used for random-access memory reads and
|
|
// of the given depth, to be used for random-access memory reads and
|
|
// writes. The DPF is construction collaboratively by P0 and P1,
|
|
// writes. The DPF is construction collaboratively by P0 and P1,
|
|
@@ -369,362 +826,56 @@ RDPF<WIDTH>::RDPF(MPCTIO &tio, yield_t &yield,
|
|
|
|
|
|
// Construct each intermediate level
|
|
// Construct each intermediate level
|
|
while(level < depth) {
|
|
while(level < depth) {
|
|
|
|
+ LeafNode *leaflevel = NULL;
|
|
if (player < 2) {
|
|
if (player < 2) {
|
|
delete[] curlevel;
|
|
delete[] curlevel;
|
|
curlevel = nextlevel;
|
|
curlevel = nextlevel;
|
|
|
|
+ nextlevel = NULL;
|
|
if (save_expansion && level == depth-1) {
|
|
if (save_expansion && level == depth-1) {
|
|
expansion.resize(1<<depth);
|
|
expansion.resize(1<<depth);
|
|
- nextlevel = (DPFnode *)expansion.data();
|
|
+ leaflevel = expansion.data();
|
|
|
|
+ } else if (level == depth-1) {
|
|
|
|
+ leaflevel = new LeafNode[1<<depth];
|
|
} else {
|
|
} else {
|
|
nextlevel = new DPFnode[1<<(level+1)];
|
|
nextlevel = new DPFnode[1<<(level+1)];
|
|
}
|
|
}
|
|
}
|
|
}
|
|
// Invariant: curlevel has 2^level elements; nextlevel has
|
|
// Invariant: curlevel has 2^level elements; nextlevel has
|
|
- // 2^{level+1} elements
|
|
+ // 2^{level+1} DPFnode elements if we're not at the last level,
|
|
|
|
+ // and leaflevel has 2^{level+1} LeafNode elements if we are at
|
|
|
|
+ // a leaf level (the last level always, and all levels if we are
|
|
|
|
+ // making an incremental RDPF).
|
|
|
|
|
|
// The bit-shared choice bit is bit (depth-level-1) of the
|
|
// The bit-shared choice bit is bit (depth-level-1) of the
|
|
// XOR-shared target index
|
|
// XOR-shared target index
|
|
RegBS bs_choice = target.bit(depth-level-1);
|
|
RegBS bs_choice = target.bit(depth-level-1);
|
|
- size_t curlevel_size = (size_t(1)<<level);
|
|
+ bool cfbit;
|
|
- DPFnode L = _mm_setzero_si128();
|
|
+
|
|
- DPFnode R = _mm_setzero_si128();
|
|
+ if (level < depth-1) {
|
|
- // The server doesn't need to do this computation, but it does
|
|
+ DPFnode CW;
|
|
- // need to execute mpc_reconstruct_choice so that it sends
|
|
+ create_level(tio, yield, curlevel, nextlevel, player, level,
|
|
- // the AndTriples at the appropriate time.
|
|
+ depth, bs_choice, CW, cfbit, save_expansion, li[0],
|
|
- if (player < 2) {
|
|
+ aes_ops);
|
|
-#ifdef RDPF_MTGEN_TIMING_1
|
|
+ cfbits |= (value_t(cfbit)<<level);
|
|
- if (player == 0) {
|
|
+ if (player < 2) {
|
|
- mtgen_timetest_1(level, 0, (1<<23)>>level, curlevel,
|
|
+ cw.push_back(CW);
|
|
- nextlevel, aes_ops);
|
|
|
|
- size_t niters = 2048;
|
|
|
|
- if (level > 8) niters = (1<<20)>>level;
|
|
|
|
- for(int t=1;t<=8;++t) {
|
|
|
|
- mtgen_timetest_1(level, t, niters, curlevel,
|
|
|
|
- nextlevel, aes_ops);
|
|
|
|
- }
|
|
|
|
- mtgen_timetest_1(level, 0, (1<<23)>>level, curlevel,
|
|
|
|
- nextlevel, aes_ops);
|
|
|
|
- }
|
|
|
|
-#endif
|
|
|
|
- // Using the timing results gathered above, decide whether
|
|
|
|
- // to multithread, and if so, how many threads to use.
|
|
|
|
- // tio.cpu_nthreads() is the maximum number we have
|
|
|
|
- // available.
|
|
|
|
- int max_nthreads = tio.cpu_nthreads();
|
|
|
|
- if (max_nthreads == 1 || level < 19) {
|
|
|
|
- // No threading
|
|
|
|
- size_t laes_ops = 0;
|
|
|
|
- for(size_t i=0;i<curlevel_size;++i) {
|
|
|
|
- DPFnode lchild, rchild;
|
|
|
|
- prgboth(lchild, rchild, curlevel[i], laes_ops);
|
|
|
|
- L = (L ^ lchild);
|
|
|
|
- R = (R ^ rchild);
|
|
|
|
- nextlevel[2*i] = lchild;
|
|
|
|
- nextlevel[2*i+1] = rchild;
|
|
|
|
- }
|
|
|
|
- aes_ops += laes_ops;
|
|
|
|
- } else {
|
|
|
|
- size_t curlevel_size = size_t(1)<<level;
|
|
|
|
- int nthreads =
|
|
|
|
- int(ceil(sqrt(double(curlevel_size/6000))));
|
|
|
|
- if (nthreads > max_nthreads) {
|
|
|
|
- nthreads = max_nthreads;
|
|
|
|
- }
|
|
|
|
- DPFnode tL[nthreads];
|
|
|
|
- DPFnode tR[nthreads];
|
|
|
|
- size_t taes_ops[nthreads];
|
|
|
|
- size_t threadstart = 0;
|
|
|
|
- size_t threadchunk = curlevel_size / nthreads;
|
|
|
|
- size_t threadextra = curlevel_size % nthreads;
|
|
|
|
- boost::asio::thread_pool pool(nthreads);
|
|
|
|
- for (int t=0;t<nthreads;++t) {
|
|
|
|
- size_t threadsize = threadchunk + (size_t(t) < threadextra);
|
|
|
|
- size_t threadend = threadstart + threadsize;
|
|
|
|
- boost::asio::post(pool,
|
|
|
|
- [t, &tL, &tR, &taes_ops, threadstart, threadend,
|
|
|
|
- &curlevel, &nextlevel] {
|
|
|
|
- DPFnode L = _mm_setzero_si128();
|
|
|
|
- DPFnode R = _mm_setzero_si128();
|
|
|
|
- size_t aes_ops = 0;
|
|
|
|
- for(size_t i=threadstart;i<threadend;++i) {
|
|
|
|
- DPFnode lchild, rchild;
|
|
|
|
- prgboth(lchild, rchild, curlevel[i], aes_ops);
|
|
|
|
- L = (L ^ lchild);
|
|
|
|
- R = (R ^ rchild);
|
|
|
|
- nextlevel[2*i] = lchild;
|
|
|
|
- nextlevel[2*i+1] = rchild;
|
|
|
|
- }
|
|
|
|
- tL[t] = L;
|
|
|
|
- tR[t] = R;
|
|
|
|
- taes_ops[t] = aes_ops;
|
|
|
|
- });
|
|
|
|
- threadstart = threadend;
|
|
|
|
- }
|
|
|
|
- pool.join();
|
|
|
|
- for (int t=0;t<nthreads;++t) {
|
|
|
|
- L ^= tL[t];
|
|
|
|
- R ^= tR[t];
|
|
|
|
- aes_ops += taes_ops[t];
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
- // If we're going left (bs_choice = 0), we want the correction
|
|
|
|
- // word to be the XOR of our right side and our peer's right
|
|
|
|
- // side; if bs_choice = 1, it should be the XOR or our left side
|
|
|
|
- // and our peer's left side.
|
|
|
|
-
|
|
|
|
- // We also have to ensure that the flag bits (the lsb) of the
|
|
|
|
- // side that will end up the same be of course the same, but
|
|
|
|
- // also that the flag bits (the lsb) of the side that will end
|
|
|
|
- // up different _must_ be different. That is, it's not enough
|
|
|
|
- // for the nodes of the child selected by choice to be different
|
|
|
|
- // as 128-bit values; they also have to be different in their
|
|
|
|
- // lsb.
|
|
|
|
-
|
|
|
|
- // This is where we make a small optimization over Appendix C of
|
|
|
|
- // the Duoram paper: instead of keeping separate correction flag
|
|
|
|
- // bits for the left and right children, we observe that the low
|
|
|
|
- // bit of the overall correction word effectively serves as one
|
|
|
|
- // of those bits, so we just need to store one extra bit per
|
|
|
|
- // level, not two. (We arbitrarily choose the one for the right
|
|
|
|
- // child.)
|
|
|
|
-
|
|
|
|
- // Note that the XOR of our left and right child before and
|
|
|
|
- // after applying the correction word won't change, since the
|
|
|
|
- // correction word is applied to either both children or
|
|
|
|
- // neither, depending on the value of the parent's flag. So in
|
|
|
|
- // particular, the XOR of the flag bits won't change, and if our
|
|
|
|
- // children's flag's XOR equals our peer's children's flag's
|
|
|
|
- // XOR, then we won't have different flag bits even for the
|
|
|
|
- // children that have different 128-bit values.
|
|
|
|
-
|
|
|
|
- // So we compute our_parity = lsb(L^R)^player, and we XOR that
|
|
|
|
- // into the R value in the correction word computation. At the
|
|
|
|
- // same time, we exchange these parity values to compute the
|
|
|
|
- // combined parity, which we store in the DPF. Then when the
|
|
|
|
- // DPF is evaluated, if the parent's flag is set, not only apply
|
|
|
|
- // the correction work to both children, but also apply the
|
|
|
|
- // (combined) parity bit to just the right child. Then for
|
|
|
|
- // unequal nodes (where the flag bit is different), exactly one
|
|
|
|
- // of the four children (two for P0 and two for P1) will have
|
|
|
|
- // the parity bit applied, which will set the XOR of the lsb of
|
|
|
|
- // those four nodes to just L0^R0^L1^R1^our_parity^peer_parity
|
|
|
|
- // = 1 because everything cancels out except player (for which
|
|
|
|
- // one player is 0 and the other is 1).
|
|
|
|
-
|
|
|
|
- bool our_parity_bit = get_lsb(L ^ R) ^ !!player;
|
|
|
|
- DPFnode our_parity = lsb128_mask[our_parity_bit];
|
|
|
|
-
|
|
|
|
- DPFnode CW;
|
|
|
|
- bool peer_parity_bit;
|
|
|
|
- // Exchange the parities and do mpc_reconstruct_choice at the
|
|
|
|
- // same time (bundled into the same rounds)
|
|
|
|
- run_coroutines(yield,
|
|
|
|
- [this, &tio, &our_parity_bit, &peer_parity_bit](yield_t &yield) {
|
|
|
|
- tio.queue_peer(&our_parity_bit, 1);
|
|
|
|
- yield();
|
|
|
|
- uint8_t peer_parity_byte;
|
|
|
|
- tio.recv_peer(&peer_parity_byte, 1);
|
|
|
|
- peer_parity_bit = peer_parity_byte & 1;
|
|
|
|
- },
|
|
|
|
- [this, &tio, &CW, &L, &R, &bs_choice, &our_parity](yield_t &yield) {
|
|
|
|
- mpc_reconstruct_choice(tio, yield, CW, bs_choice,
|
|
|
|
- (R ^ our_parity), L);
|
|
|
|
- });
|
|
|
|
- bool parity_bit = our_parity_bit ^ peer_parity_bit;
|
|
|
|
- cfbits |= (value_t(parity_bit)<<level);
|
|
|
|
- DPFnode CWR = CW ^ lsb128_mask[parity_bit];
|
|
|
|
- if (player < 2) {
|
|
|
|
- // The timing of each iteration of the inner loop is
|
|
|
|
- // comparable to the above, so just use the same
|
|
|
|
- // computations. All of this could be tuned, of course.
|
|
|
|
-
|
|
|
|
- if (level < depth-1) {
|
|
|
|
- // Using the timing results gathered above, decide whether
|
|
|
|
- // to multithread, and if so, how many threads to use.
|
|
|
|
- // tio.cpu_nthreads() is the maximum number we have
|
|
|
|
- // available.
|
|
|
|
- int max_nthreads = tio.cpu_nthreads();
|
|
|
|
- if (max_nthreads == 1 || level < 19) {
|
|
|
|
- // No threading
|
|
|
|
- for(size_t i=0;i<curlevel_size;++i) {
|
|
|
|
- bool flag = get_lsb(curlevel[i]);
|
|
|
|
- nextlevel[2*i] = xor_if(nextlevel[2*i], CW, flag);
|
|
|
|
- nextlevel[2*i+1] = xor_if(nextlevel[2*i+1], CWR, flag);
|
|
|
|
- }
|
|
|
|
- } else {
|
|
|
|
- int nthreads =
|
|
|
|
- int(ceil(sqrt(double(curlevel_size/6000))));
|
|
|
|
- if (nthreads > max_nthreads) {
|
|
|
|
- nthreads = max_nthreads;
|
|
|
|
- }
|
|
|
|
- size_t threadstart = 0;
|
|
|
|
- size_t threadchunk = curlevel_size / nthreads;
|
|
|
|
- size_t threadextra = curlevel_size % nthreads;
|
|
|
|
- boost::asio::thread_pool pool(nthreads);
|
|
|
|
- for (int t=0;t<nthreads;++t) {
|
|
|
|
- size_t threadsize = threadchunk + (size_t(t) < threadextra);
|
|
|
|
- size_t threadend = threadstart + threadsize;
|
|
|
|
- boost::asio::post(pool, [CW, CWR, threadstart, threadend,
|
|
|
|
- &curlevel, &nextlevel] {
|
|
|
|
- for(size_t i=threadstart;i<threadend;++i) {
|
|
|
|
- bool flag = get_lsb(curlevel[i]);
|
|
|
|
- nextlevel[2*i] = xor_if(nextlevel[2*i], CW, flag);
|
|
|
|
- nextlevel[2*i+1] = xor_if(nextlevel[2*i+1], CWR, flag);
|
|
|
|
- }
|
|
|
|
- });
|
|
|
|
- threadstart = threadend;
|
|
|
|
- }
|
|
|
|
- pool.join();
|
|
|
|
- }
|
|
|
|
- } else {
|
|
|
|
- // Recall there are four potentially useful vectors that
|
|
|
|
- // can come out of a DPF:
|
|
|
|
- // - (single-bit) bitwise unit vector
|
|
|
|
- // - additive-shared unit vector
|
|
|
|
- // - XOR-shared scaled unit vector
|
|
|
|
- // - additive-shared scaled unit vector
|
|
|
|
- //
|
|
|
|
- // (No single DPF should be used for both of the first
|
|
|
|
- // two or both of the last two, though, since they're
|
|
|
|
- // correlated; you _can_ use one of the first two and
|
|
|
|
- // one of the last two.)
|
|
|
|
- //
|
|
|
|
- // For each 128-bit leaf, the low bit is the flag bit,
|
|
|
|
- // and we're guaranteed that the flag bits (and indeed
|
|
|
|
- // the whole 128-bit value) for P0 and P1 are the same
|
|
|
|
- // for every leaf except the target, and that the flag
|
|
|
|
- // bits definitely differ for the target (and the other
|
|
|
|
- // 127 bits are independently random on each side).
|
|
|
|
- //
|
|
|
|
- // We divide the 128-bit leaf into a low 64-bit word and
|
|
|
|
- // a high 64-bit word. We use the low word for the unit
|
|
|
|
- // vector and the high word for the scaled vector; this
|
|
|
|
- // choice is not arbitrary: the flag bit in the low word
|
|
|
|
- // means that the sum of all the low words (with P1's
|
|
|
|
- // low words negated) across both P0 and P1 is
|
|
|
|
- // definitely odd, so we can compute that sum's inverse
|
|
|
|
- // mod 2^64, and store it now during precomputation. At
|
|
|
|
- // evaluation time for the additive-shared unit vector,
|
|
|
|
- // we will output this global inverse times the low word
|
|
|
|
- // of each leaf, which will make the sum of all of those
|
|
|
|
- // values 1. (This technique replaces the protocol in
|
|
|
|
- // Appendix D of the Duoram paper.)
|
|
|
|
- //
|
|
|
|
- // For the scaled vector, we just have to compute shares
|
|
|
|
- // of what the scaled vector is a sharing _of_, but
|
|
|
|
- // that's just XORing or adding all of each party's
|
|
|
|
- // local high words; no communication needed.
|
|
|
|
-
|
|
|
|
- value_t low_sum = 0;
|
|
|
|
- value_t high_sum = 0;
|
|
|
|
- value_t high_xor = 0;
|
|
|
|
- // Using the timing results gathered above, decide whether
|
|
|
|
- // to multithread, and if so, how many threads to use.
|
|
|
|
- // tio.cpu_nthreads() is the maximum number we have
|
|
|
|
- // available.
|
|
|
|
- int max_nthreads = tio.cpu_nthreads();
|
|
|
|
- if (max_nthreads == 1 || level < 19) {
|
|
|
|
- // No threading
|
|
|
|
- for(size_t i=0;i<curlevel_size;++i) {
|
|
|
|
- bool flag = get_lsb(curlevel[i]);
|
|
|
|
- DPFnode leftchild = xor_if(nextlevel[2*i], CW, flag);
|
|
|
|
- DPFnode rightchild = xor_if(nextlevel[2*i+1], CWR, flag);
|
|
|
|
- if (save_expansion) {
|
|
|
|
- nextlevel[2*i] = leftchild;
|
|
|
|
- nextlevel[2*i+1] = rightchild;
|
|
|
|
- }
|
|
|
|
- value_t leftlow = value_t(_mm_cvtsi128_si64x(leftchild));
|
|
|
|
- value_t rightlow = value_t(_mm_cvtsi128_si64x(rightchild));
|
|
|
|
- value_t lefthigh =
|
|
|
|
- value_t(_mm_cvtsi128_si64x(_mm_srli_si128(leftchild,8)));
|
|
|
|
- value_t righthigh =
|
|
|
|
- value_t(_mm_cvtsi128_si64x(_mm_srli_si128(rightchild,8)));
|
|
|
|
- low_sum += (leftlow + rightlow);
|
|
|
|
- high_sum += (lefthigh + righthigh);
|
|
|
|
- high_xor ^= (lefthigh ^ righthigh);
|
|
|
|
- }
|
|
|
|
- } else {
|
|
|
|
- int nthreads =
|
|
|
|
- int(ceil(sqrt(double(curlevel_size/6000))));
|
|
|
|
- if (nthreads > max_nthreads) {
|
|
|
|
- nthreads = max_nthreads;
|
|
|
|
- }
|
|
|
|
- value_t tlow_sum[nthreads];
|
|
|
|
- value_t thigh_sum[nthreads];
|
|
|
|
- value_t thigh_xor[nthreads];
|
|
|
|
- size_t threadstart = 0;
|
|
|
|
- size_t threadchunk = curlevel_size / nthreads;
|
|
|
|
- size_t threadextra = curlevel_size % nthreads;
|
|
|
|
- boost::asio::thread_pool pool(nthreads);
|
|
|
|
- for (int t=0;t<nthreads;++t) {
|
|
|
|
- size_t threadsize = threadchunk + (size_t(t) < threadextra);
|
|
|
|
- size_t threadend = threadstart + threadsize;
|
|
|
|
- boost::asio::post(pool,
|
|
|
|
- [t, &tlow_sum, &thigh_sum, &thigh_xor, threadstart, threadend,
|
|
|
|
- &curlevel, &nextlevel, CW, CWR, save_expansion] {
|
|
|
|
- value_t low_sum = 0;
|
|
|
|
- value_t high_sum = 0;
|
|
|
|
- value_t high_xor = 0;
|
|
|
|
- for(size_t i=threadstart;i<threadend;++i) {
|
|
|
|
- bool flag = get_lsb(curlevel[i]);
|
|
|
|
- DPFnode leftchild = xor_if(nextlevel[2*i], CW, flag);
|
|
|
|
- DPFnode rightchild = xor_if(nextlevel[2*i+1], CWR, flag);
|
|
|
|
- if (save_expansion) {
|
|
|
|
- nextlevel[2*i] = leftchild;
|
|
|
|
- nextlevel[2*i+1] = rightchild;
|
|
|
|
- }
|
|
|
|
- value_t leftlow = value_t(_mm_cvtsi128_si64x(leftchild));
|
|
|
|
- value_t rightlow = value_t(_mm_cvtsi128_si64x(rightchild));
|
|
|
|
- value_t lefthigh =
|
|
|
|
- value_t(_mm_cvtsi128_si64x(_mm_srli_si128(leftchild,8)));
|
|
|
|
- value_t righthigh =
|
|
|
|
- value_t(_mm_cvtsi128_si64x(_mm_srli_si128(rightchild,8)));
|
|
|
|
- low_sum += (leftlow + rightlow);
|
|
|
|
- high_sum += (lefthigh + righthigh);
|
|
|
|
- high_xor ^= (lefthigh ^ righthigh);
|
|
|
|
- }
|
|
|
|
- tlow_sum[t] = low_sum;
|
|
|
|
- thigh_sum[t] = high_sum;
|
|
|
|
- thigh_xor[t] = high_xor;
|
|
|
|
- });
|
|
|
|
- threadstart = threadend;
|
|
|
|
- }
|
|
|
|
- pool.join();
|
|
|
|
- for (int t=0;t<nthreads;++t) {
|
|
|
|
- low_sum += tlow_sum[t];
|
|
|
|
- high_sum += thigh_sum[t];
|
|
|
|
- high_xor ^= thigh_xor[t];
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
- if (player == 1) {
|
|
|
|
- low_sum = -low_sum;
|
|
|
|
- high_sum = -high_sum;
|
|
|
|
- }
|
|
|
|
- li[0].scaled_sum[0].ashare = high_sum;
|
|
|
|
- li[0].scaled_xor[0].xshare = high_xor;
|
|
|
|
- // Exchange low_sum and add them up
|
|
|
|
- tio.queue_peer(&low_sum, sizeof(low_sum));
|
|
|
|
- yield();
|
|
|
|
- value_t peer_low_sum;
|
|
|
|
- tio.recv_peer(&peer_low_sum, sizeof(peer_low_sum));
|
|
|
|
- low_sum += peer_low_sum;
|
|
|
|
- // The low_sum had better be odd
|
|
|
|
- assert(low_sum & 1);
|
|
|
|
- li[0].unit_sum_inverse = inverse_value_t(low_sum);
|
|
|
|
}
|
|
}
|
|
- cw.push_back(CW);
|
|
+ } else {
|
|
- } else if (level == depth-1) {
|
|
+ LeafNode CW;
|
|
- yield();
|
|
+ create_level(tio, yield, curlevel, leaflevel, player, level,
|
|
|
|
+ depth, bs_choice, CW, cfbit, save_expansion, li[0],
|
|
|
|
+ aes_ops);
|
|
|
|
+ li[0].leaf_cw = CW;
|
|
}
|
|
}
|
|
|
|
|
|
|
|
+ if (!save_expansion) {
|
|
|
|
+ delete[] leaflevel;
|
|
|
|
+ }
|
|
++level;
|
|
++level;
|
|
}
|
|
}
|
|
|
|
|
|
delete[] curlevel;
|
|
delete[] curlevel;
|
|
- if (!save_expansion || player == 2) {
|
|
+ delete[] nextlevel;
|
|
- delete[] nextlevel;
|
|
|
|
- }
|
|
|
|
}
|
|
}
|
|
|
|
|
|
// Get the leaf node for the given input
|
|
// Get the leaf node for the given input
|