// Templated method implementations for rdpf.hpp

#include "mpcops.hpp"

// Compute the multiplicative inverse of x mod 2^{VALUE_BITS}
// This is the same as computing x to the power of
// 2^{VALUE_BITS-1}-1.
static value_t inverse_value_t(value_t x)
{
    int expon = 1;
    value_t xe = x;
    // Invariant: xe = x^(2^expon - 1) mod 2^{VALUE_BITS}
    // Goal: compute x^(2^{VALUE_BITS-1} - 1)
    while (expon < VALUE_BITS-1) {
        xe = xe * xe * x;
        ++expon;
    }
    return xe;
}
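
// A worked example (with a hypothetical VALUE_BITS of 8): the inverse
// of 3 mod 2^8 is 171, since 3*171 = 513 = 2*256 + 1, and indeed
// 3^(2^7 - 1) = 3^127 = 171 mod 256. Only odd values are invertible
// mod a power of 2, which is why the sums this is applied to below are
// guaranteed to be odd.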

// Create a StreamEval object that will start its output at index start.
// It will wrap around to 0 when it hits 2^depth. If use_expansion
// is true, then if the DPF has been expanded, just output values
// from that. If use_expansion=false or if the DPF has not been
// expanded, compute the values on the fly. If xor_offset is non-zero,
// then the outputs are actually
// DPF(start XOR xor_offset)
// DPF((start+1) XOR xor_offset)
// DPF((start+2) XOR xor_offset)
// etc.
template <typename T>
StreamEval<T>::StreamEval(const T &rdpf, address_t start,
    address_t xor_offset, size_t &aes_ops,
    bool use_expansion) : rdpf(rdpf), aes_ops(aes_ops),
    use_expansion(use_expansion), counter_xor_offset(xor_offset)
{
    depth = rdpf.depth();
    // Prevent overflow of 1<<depth
    if (depth < ADDRESS_MAX_BITS) {
        indexmask = (address_t(1)<<depth)-1;
    } else {
        indexmask = ~0;
    }
    start &= indexmask;
    counter_xor_offset &= indexmask;
    // Record that we haven't actually output the leaf for index start
    // itself yet
    nextindex = start;
    if (use_expansion && rdpf.has_expansion()) {
        // We just need to keep the counter, not compute anything
        return;
    }
    path.resize(depth);
    pathindex = start;
    path[0] = rdpf.get_seed();
    for (nbits_t i=1;i<depth;++i) {
        bool dir = !!(pathindex & (address_t(1)<<(depth-i)));
        bool xor_offset_bit =
            !!(counter_xor_offset & (address_t(1)<<(depth-i)));
        path[i] = rdpf.descend(path[i-1], i-1,
            dir ^ xor_offset_bit, aes_ops);
    }
}

template <typename T>
typename T::LeafNode StreamEval<T>::next()
{
    if (use_expansion && rdpf.has_expansion()) {
        // Just use the precomputed values
        typename T::LeafNode leaf =
            rdpf.get_expansion(nextindex ^ counter_xor_offset);
        nextindex = (nextindex + 1) & indexmask;
        return leaf;
    }
    // Invariant: in the first call to next(), nextindex = pathindex.
    // Otherwise, nextindex = pathindex+1.
    // Get the XOR of nextindex and pathindex, and strip the low bit.
    // If nextindex and pathindex are equal, or pathindex is even
    // and nextindex is the consecutive odd number, index_xor will be 0,
    // indicating that we don't have to update the path, but just
    // compute the appropriate leaf given by the low bit of nextindex.
    //
    // Otherwise, say for example pathindex is 010010111 and nextindex
    // is 010011000. Then their XOR is 000001111, and stripping the low
    // bit yields 000001110, so how_many_1_bits will be 3.
    // That indicates (typically) that path[depth-3] was a left child,
    // and now we need to change it to a right child by descending right
    // from path[depth-4], and then filling the path after that with
    // left children.
    //
    // When we wrap around, however, index_xor will be 111111110 (after
    // we strip the low bit), and how_many_1_bits will be depth-1, but
    // the new top child (of the root seed) we have to compute will be a
    // left, not a right, child.
    uint64_t index_xor = (nextindex ^ pathindex) & ~1;
    nbits_t how_many_1_bits = __builtin_popcount(index_xor);
    if (how_many_1_bits > 0) {
        // This will almost always be 1, unless we've just wrapped
        // around from the right subtree back to the left, in which case
        // it will be 0.
        bool top_changed_bit =
            !!(nextindex & (address_t(1) << how_many_1_bits));
        bool xor_offset_bit =
            !!(counter_xor_offset & (address_t(1) << how_many_1_bits));
        path[depth-how_many_1_bits] =
            rdpf.descend(path[depth-how_many_1_bits-1],
                depth-how_many_1_bits-1,
                top_changed_bit ^ xor_offset_bit, aes_ops);
        for (nbits_t i = depth-how_many_1_bits; i < depth-1; ++i) {
            bool xor_offset_bit =
                !!(counter_xor_offset & (address_t(1) << (depth-i-1)));
            path[i+1] = rdpf.descend(path[i], i, xor_offset_bit, aes_ops);
        }
    }
    bool xor_offset_bit = counter_xor_offset & 1;
    typename T::LeafNode leaf = rdpf.descend_to_leaf(path[depth-1], depth-1,
        (nextindex & 1) ^ xor_offset_bit, aes_ops);
    pathindex = nextindex;
    nextindex = (nextindex + 1) & indexmask;
    return leaf;
}
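
// A minimal usage sketch of StreamEval (illustrative only; it assumes
// an already constructed RDPF<1> named rdpf, and relies on the default
// use_expansion argument declared in rdpf.hpp):
//
//   size_t aes_ops = 0;
//   auto ev = StreamEval(rdpf, 0, 0, aes_ops);
//   for (address_t i = 0; i < (address_t(1)<<rdpf.depth()); ++i) {
//       auto leaf = ev.next();  // the leaf for index i
//       // ... use leaf ...
//   }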

// Run the parallel evaluator. The type V is the type of the
// accumulator; init should be the "zero" value of the accumulator.
// The type W (process) is a lambda type with the signature
// (int, address_t, const T::LeafNode &) -> V
// which will be called like this for each i from 0 to num_evals-1,
// across num_threads threads:
// value_i = process(t, i, DPF((start+i) XOR xor_offset))
// t is the thread number (0 <= t < num_threads).
// The resulting num_evals values will be combined using V's +=
// operator, first accumulating the values within each thread
// (starting with the init value), and then accumulating the totals
// from each thread together (again starting with the init value):
//
// total = init
// for each thread t:
//     accum_t = init
//     for each value_i generated by thread t:
//         accum_t += value_i
//     total += accum_t
template <typename T> template <typename V, typename W>
inline V ParallelEval<T>::reduce(V init, W process)
{
    size_t thread_aes_ops[num_threads];
    V accums[num_threads];
    boost::asio::thread_pool pool(num_threads);
    address_t threadstart = 0;
    address_t threadchunk = num_evals / num_threads;
    address_t threadextra = num_evals % num_threads;
    nbits_t depth = rdpf.depth();
    address_t indexmask = (depth < ADDRESS_MAX_BITS ?
        ((address_t(1)<<depth)-1) : ~0);
    for (int thread_num = 0; thread_num < num_threads; ++thread_num) {
        address_t threadsize = threadchunk + (address_t(thread_num) < threadextra);
        boost::asio::post(pool,
            [this, &init, &thread_aes_ops, &accums, &process,
                thread_num, threadstart, threadsize, indexmask] {
                size_t local_aes_ops = 0;
                auto ev = StreamEval(rdpf, (start+threadstart)&indexmask,
                    xor_offset, local_aes_ops);
                V accum = init;
                for (address_t x=0;x<threadsize;++x) {
                    typename T::LeafNode leaf = ev.next();
                    accum += process(thread_num,
                        (threadstart+x)&indexmask, leaf);
                }
                accums[thread_num] = accum;
                thread_aes_ops[thread_num] = local_aes_ops;
            });
        threadstart = (threadstart + threadsize) & indexmask;
    }
    pool.join();
    V total = init;
    for (int thread_num = 0; thread_num < num_threads; ++thread_num) {
        total += accums[thread_num];
        aes_ops += thread_aes_ops[thread_num];
    }
    return total;
}
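
// A reduce() usage sketch (illustrative only; the ParallelEval
// constructor arguments shown are assumed to match its declaration in
// rdpf.hpp, and the lambda simply totals the low word of leaf word 0):
//
//   ParallelEval pe(rdpf, 0, 0, address_t(1)<<rdpf.depth(),
//       num_threads, aes_ops);
//   value_t total = pe.reduce(value_t(0),
//       [] (int t, address_t i, const RDPF<1>::LeafNode &leaf) {
//           return value_t(_mm_cvtsi128_si64x(leaf[0]));
//       });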

// Descend from a node at depth parentdepth to one of its leaf children
// whichchild = 0: left child
// whichchild = 1: right child
//
// Cost: 1 AES operation
template <nbits_t WIDTH>
inline typename RDPF<WIDTH>::LeafNode RDPF<WIDTH>::descend_to_leaf(
    const DPFnode &parent, nbits_t parentdepth, bit_t whichchild,
    size_t &aes_ops) const
{
    typename RDPF<WIDTH>::LeafNode prgout;
    bool flag = get_lsb(parent);
    prg(prgout, parent, whichchild, aes_ops);
    if (flag) {
        LeafNode CW = li[maxdepth-parentdepth-1].leaf_cw;
        LeafNode CWR = CW;
        bit_t cfbit = !!(leaf_cfbits &
            (value_t(1)<<(maxdepth-parentdepth-1)));
        CWR[0] ^= lsb128_mask[cfbit];
        prgout ^= (whichchild ? CWR : CW);
    }
    return prgout;
}

// I/O for RDPFs

template <typename T, nbits_t WIDTH>
T& operator>>(T &is, RDPF<WIDTH> &rdpf)
{
    is.read((char *)&rdpf.seed, sizeof(rdpf.seed));
    rdpf.whichhalf = get_lsb(rdpf.seed);
    uint8_t depth;
    // Add 64 to depth to indicate an expanded RDPF, and add 128 to
    // indicate an incremental RDPF
    is.read((char *)&depth, sizeof(depth));
    bool read_expanded = false;
    bool read_incremental = false;
    if (depth > 128) {
        read_incremental = true;
        depth -= 128;
    }
    if (depth > 64) {
        read_expanded = true;
        depth -= 64;
    }
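    // For example (illustrative): a stored depth byte of 212 decodes
    // as 212 = 128 + 64 + 20, i.e., an expanded, incremental RDPF of
    // depth 20.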
    rdpf.maxdepth = depth;
    rdpf.curdepth = depth;
    assert(depth <= ADDRESS_MAX_BITS);
    rdpf.cw.clear();
    for (uint8_t i=0; i<depth-1; ++i) {
        DPFnode cw;
        is.read((char *)&cw, sizeof(cw));
        rdpf.cw.push_back(cw);
    }
    nbits_t num_leaflevels = read_incremental ? depth : 1;
    rdpf.li.resize(num_leaflevels);
    if (read_expanded) {
        for(nbits_t i=0; i<num_leaflevels; ++i) {
            nbits_t level = depth-i;
            rdpf.li[i].expansion.resize(1<<level);
            is.read((char *)rdpf.li[i].expansion.data(),
                sizeof(rdpf.li[i].expansion[0])<<level);
        }
    }
    value_t cfbits = 0;
    is.read((char *)&cfbits, BITBYTES(depth-1));
    rdpf.cfbits = cfbits;
    value_t leaf_cfbits = 0;
    is.read((char *)&leaf_cfbits, BITBYTES(num_leaflevels));
    rdpf.leaf_cfbits = leaf_cfbits;
    for (nbits_t i=0; i<num_leaflevels; ++i) {
        is.read((char *)&rdpf.li[i].leaf_cw,
            sizeof(rdpf.li[i].leaf_cw));
        is.read((char *)&rdpf.li[i].unit_sum_inverse,
            sizeof(rdpf.li[i].unit_sum_inverse));
        is.read((char *)&rdpf.li[i].scaled_sum,
            sizeof(rdpf.li[i].scaled_sum));
        is.read((char *)&rdpf.li[i].scaled_xor,
            sizeof(rdpf.li[i].scaled_xor));
    }

    return is;
}

// Write the DPF to the output stream. If expanded=true, then include
// the expansion _if_ the DPF is itself already expanded. You can use
// this to write DPFs to files.
template <typename T, nbits_t WIDTH>
T& write_maybe_expanded(T &os, const RDPF<WIDTH> &rdpf,
    bool expanded = true)
{
    os.write((const char *)&rdpf.seed, sizeof(rdpf.seed));
    uint8_t depth = rdpf.maxdepth;
    assert(depth <= ADDRESS_MAX_BITS);
    // If we're writing an expansion, add 64 to depth
    uint8_t expanded_depth = depth;
    bool write_expansion = false;
    if (expanded && rdpf.li[0].expansion.size() == (size_t(1)<<depth)) {
        write_expansion = true;
        expanded_depth += 64;
    }
    // If we're writing an incremental RDPF, add 128 to depth
    if (rdpf.li.size() > 1) {
        expanded_depth += 128;
    }
    os.write((const char *)&expanded_depth, sizeof(expanded_depth));
    for (uint8_t i=0; i<depth-1; ++i) {
        os.write((const char *)&rdpf.cw[i], sizeof(rdpf.cw[i]));
    }
    nbits_t num_leaflevels = rdpf.li.size();
    if (write_expansion) {
        for(nbits_t i=0; i<num_leaflevels; ++i) {
            nbits_t level = depth-i;
            os.write((const char *)rdpf.li[i].expansion.data(),
                sizeof(rdpf.li[i].expansion[0])<<level);
        }
    }
    os.write((const char *)&rdpf.cfbits, BITBYTES(depth-1));
    os.write((const char *)&rdpf.leaf_cfbits, BITBYTES(num_leaflevels));
    for (nbits_t i=0; i<num_leaflevels; ++i) {
        os.write((const char *)&rdpf.li[i].leaf_cw,
            sizeof(rdpf.li[i].leaf_cw));
        os.write((const char *)&rdpf.li[i].unit_sum_inverse,
            sizeof(rdpf.li[i].unit_sum_inverse));
        os.write((const char *)&rdpf.li[i].scaled_sum,
            sizeof(rdpf.li[i].scaled_sum));
        os.write((const char *)&rdpf.li[i].scaled_xor,
            sizeof(rdpf.li[i].scaled_xor));
    }

    return os;
}
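
// A file round-trip sketch (illustrative only; any stream types with
// read()/write() will do, and RDPF's default constructor is assumed):
//
//   std::ofstream out("rdpf.dat");
//   write_maybe_expanded(out, rdpf, true);  // keep any expansion
//   out.close();
//   std::ifstream in("rdpf.dat");
//   RDPF<1> rdpf2;
//   in >> rdpf2;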

// The ordinary << version never writes the expansion, since this is
// what we use to send DPFs over the network.
template <typename T, nbits_t WIDTH>
T& operator<<(T &os, const RDPF<WIDTH> &rdpf)
{
    return write_maybe_expanded(os, rdpf, false);
}

// I/O for RDPF Triples

// We never write RDPFTriples over the network, so always write
// the DPF expansions if they're available.
template <typename T, nbits_t WIDTH>
T& operator<<(T &os, const RDPFTriple<WIDTH> &rdpftrip)
{
    write_maybe_expanded(os, rdpftrip.dpf[0], true);
    write_maybe_expanded(os, rdpftrip.dpf[1], true);
    write_maybe_expanded(os, rdpftrip.dpf[2], true);
    nbits_t depth = rdpftrip.dpf[0].depth();
    os.write((const char *)&rdpftrip.as_target.ashare, BITBYTES(depth));
    os.write((const char *)&rdpftrip.xs_target.xshare, BITBYTES(depth));
    return os;
}

template <typename T, nbits_t WIDTH>
T& operator>>(T &is, RDPFTriple<WIDTH> &rdpftrip)
{
    is >> rdpftrip.dpf[0] >> rdpftrip.dpf[1] >> rdpftrip.dpf[2];
    nbits_t depth = rdpftrip.dpf[0].depth();
    rdpftrip.as_target.ashare = 0;
    is.read((char *)&rdpftrip.as_target.ashare, BITBYTES(depth));
    rdpftrip.xs_target.xshare = 0;
    is.read((char *)&rdpftrip.xs_target.xshare, BITBYTES(depth));
    return is;
}

// I/O for RDPF Pairs

// We never write RDPFPairs over the network, so always write
// the DPF expansions if they're available.
template <typename T, nbits_t WIDTH>
T& operator<<(T &os, const RDPFPair<WIDTH> &rdpfpair)
{
    write_maybe_expanded(os, rdpfpair.dpf[0], true);
    write_maybe_expanded(os, rdpfpair.dpf[1], true);
    return os;
}

template <typename T, nbits_t WIDTH>
T& operator>>(T &is, RDPFPair<WIDTH> &rdpfpair)
{
    is >> rdpfpair.dpf[0] >> rdpfpair.dpf[1];
    return is;
}

// Set a DPFnode to zero
static inline void zero(DPFnode &z)
{
    z = _mm_setzero_si128();
}

// Set a LeafNode to zero
template <size_t LWIDTH>
static inline void zero(std::array<DPFnode,LWIDTH> &z)
{
    for (size_t j=0;j<LWIDTH;++j) {
        zero(z[j]);
    }
}

// Set an array of value_t to zero
template <size_t WIDTH>
static inline void zero(std::array<value_t,WIDTH> &z)
{
    for (size_t j=0;j<WIDTH;++j) {
        z[j] = 0;
    }
}

// Expand a level of the RDPF into the next level without threads. This
// just computes the PRGs without computing or applying the correction
// words. L and R will be set to the XORs of the left children and the
// XORs of the right children respectively. NT will be LeafNode if we
// are expanding into a leaf level, DPFnode if not.
template <typename NT>
static inline void expand_level_nothreads(size_t start, size_t end,
    const DPFnode *curlevel, NT *nextlevel, NT &L, NT &R,
    size_t &aes_ops)
{
    // Only touch registers in the inner loop if possible
    NT lL, lR;
    zero(lL);
    zero(lR);
    size_t laes_ops = 0;
    for(size_t i=start;i<end;++i) {
        NT lchild, rchild;
        prgboth(lchild, rchild, curlevel[i], laes_ops);
        lL ^= lchild;
        lR ^= rchild;
        nextlevel[2*i] = lchild;
        nextlevel[2*i+1] = rchild;
    }
    L = lL;
    R = lR;
    aes_ops += laes_ops;
}

// As above, but possibly use threads, based on the RDPF_MTGEN_TIMING_1
// timing benchmarks
template <typename NT>
static inline void expand_level(int max_nthreads, nbits_t level,
    const DPFnode *curlevel, NT *nextlevel, NT &L, NT &R,
    size_t &aes_ops)
{
    size_t curlevel_size = (size_t(1)<<level);
    if (max_nthreads == 1 || level < 19) {
        // No threading
        expand_level_nothreads(0, curlevel_size,
            curlevel, nextlevel, L, R, aes_ops);
    } else {
        int nthreads =
            int(ceil(sqrt(double(curlevel_size/6000))));
        if (nthreads > max_nthreads) {
            nthreads = max_nthreads;
        }
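        // For instance (illustrative arithmetic only): at level 23,
        // curlevel_size = 2^23 = 8388608, so this heuristic suggests
        // ceil(sqrt(8388608/6000)) = ceil(sqrt(1398)) = 38 threads,
        // capped at max_nthreads.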
        NT tL[nthreads];
        NT tR[nthreads];
        size_t taes_ops[nthreads];
        size_t threadstart = 0;
        size_t threadchunk = curlevel_size / nthreads;
        size_t threadextra = curlevel_size % nthreads;
        boost::asio::thread_pool pool(nthreads);
        for (int t=0;t<nthreads;++t) {
            size_t threadsize = threadchunk + (size_t(t) < threadextra);
            size_t threadend = threadstart + threadsize;
            taes_ops[t] = 0;
            boost::asio::post(pool,
                [t, &tL, &tR, &taes_ops, threadstart, threadend,
                    &curlevel, &nextlevel] {
                    expand_level_nothreads(threadstart, threadend,
                        curlevel, nextlevel, tL[t], tR[t], taes_ops[t]);
                });
            threadstart = threadend;
        }
        pool.join();
        // Again work on registers as much as possible
        NT lL, lR;
        zero(lL);
        zero(lR);
        size_t laes_ops = 0;
        for (int t=0;t<nthreads;++t) {
            lL ^= tL[t];
            lR ^= tR[t];
            laes_ops += taes_ops[t];
        }
        L = lL;
        R = lR;
        aes_ops += laes_ops;
    }
}

// Apply the correction words to an expanded non-leaf level (nextlevel),
// based on the flag bits in curlevel. This version does not use
// threads.
static inline void finalize_nonleaf_layer_nothreads(size_t start,
    size_t end, const DPFnode *curlevel, DPFnode *nextlevel,
    DPFnode CWL, DPFnode CWR)
{
    for(size_t i=start;i<end;++i) {
        bool flag = get_lsb(curlevel[i]);
        nextlevel[2*i] = xor_if(nextlevel[2*i], CWL, flag);
        nextlevel[2*i+1] = xor_if(nextlevel[2*i+1], CWR, flag);
    }
}

// As above, but possibly use threads, based on the RDPF_MTGEN_TIMING_1
// timing benchmarks. The timing of each iteration of the inner loop is
// comparable to the above, so just use the same computations. All of
// this could be tuned, of course.
static inline void finalize_nonleaf_layer(int max_nthreads, nbits_t level,
    const DPFnode *curlevel, DPFnode *nextlevel, DPFnode CWL,
    DPFnode CWR)
{
    size_t curlevel_size = (size_t(1)<<level);
    if (max_nthreads == 1 || level < 19) {
        // No threading
        finalize_nonleaf_layer_nothreads(0, curlevel_size,
            curlevel, nextlevel, CWL, CWR);
    } else {
        int nthreads =
            int(ceil(sqrt(double(curlevel_size/6000))));
        if (nthreads > max_nthreads) {
            nthreads = max_nthreads;
        }
        size_t threadstart = 0;
        size_t threadchunk = curlevel_size / nthreads;
        size_t threadextra = curlevel_size % nthreads;
        boost::asio::thread_pool pool(nthreads);
        for (int t=0;t<nthreads;++t) {
            size_t threadsize = threadchunk + (size_t(t) < threadextra);
            size_t threadend = threadstart + threadsize;
            boost::asio::post(pool,
                [threadstart, threadend, CWL, CWR,
                    &curlevel, &nextlevel] {
                    finalize_nonleaf_layer_nothreads(threadstart, threadend,
                        curlevel, nextlevel, CWL, CWR);
                });
            threadstart = threadend;
        }
        pool.join();
    }
}

// Finalize a leaf layer. This applies the correction words, and
// computes the low and high sums and XORs. This version does not use
// threads. You can pass save_expansion = false here if you don't need
// to save the expansion. LN is a LeafNode.
template <size_t WIDTH, typename LN>
static inline void finalize_leaf_layer_nothreads(size_t start,
    size_t end, const DPFnode *curlevel, LN *nextlevel,
    bool save_expansion, LN CWL, LN CWR, value_t &low_sum,
    std::array<value_t,WIDTH> &high_sum,
    std::array<value_t,WIDTH> &high_xor)
{
    value_t llow_sum = 0;
    std::array<value_t,WIDTH> lhigh_sum;
    std::array<value_t,WIDTH> lhigh_xor;
    zero(lhigh_sum);
    zero(lhigh_xor);
    for(size_t i=start;i<end;++i) {
        bool flag = get_lsb(curlevel[i]);
        LN leftchild = xor_if(nextlevel[2*i], CWL, flag);
        LN rightchild = xor_if(nextlevel[2*i+1], CWR, flag);
        if (save_expansion) {
            nextlevel[2*i] = leftchild;
            nextlevel[2*i+1] = rightchild;
        }
        value_t leftlow = value_t(_mm_cvtsi128_si64x(leftchild[0]));
        value_t rightlow = value_t(_mm_cvtsi128_si64x(rightchild[0]));
        value_t lefthigh =
            value_t(_mm_cvtsi128_si64x(_mm_srli_si128(leftchild[0],8)));
        value_t righthigh =
            value_t(_mm_cvtsi128_si64x(_mm_srli_si128(rightchild[0],8)));
        llow_sum += (leftlow + rightlow);
        lhigh_sum[0] += (lefthigh + righthigh);
        lhigh_xor[0] ^= (lefthigh ^ righthigh);
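        // The remaining scaled words are packed two per 128-bit leaf
        // word: for odd j, scaled word j is the low 64 bits of leaf
        // word w = (j+1)/2, and scaled word j+1 (if present) is the
        // high 64 bits of that same leaf word.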
        size_t w = 0;
        for (size_t j=1; j<WIDTH; j+=2) {
            ++w;
            value_t leftlow = value_t(_mm_cvtsi128_si64x(leftchild[w]));
            value_t rightlow = value_t(_mm_cvtsi128_si64x(rightchild[w]));
            value_t lefthigh =
                value_t(_mm_cvtsi128_si64x(_mm_srli_si128(leftchild[w],8)));
            value_t righthigh =
                value_t(_mm_cvtsi128_si64x(_mm_srli_si128(rightchild[w],8)));
            lhigh_sum[j] += (leftlow + rightlow);
            lhigh_xor[j] ^= (leftlow ^ rightlow);
            if (j+1 < WIDTH) {
                lhigh_sum[j+1] += (lefthigh + righthigh);
                lhigh_xor[j+1] ^= (lefthigh ^ righthigh);
            }
        }
    }
    low_sum = llow_sum;
    high_sum = lhigh_sum;
    high_xor = lhigh_xor;
}

// As above, but possibly use threads, based on the RDPF_MTGEN_TIMING_1
// timing benchmarks. The timing of each iteration of the inner loop is
// comparable to the above, so just use the same computations. All of
// this could be tuned, of course.
template <size_t WIDTH, typename LN>
static inline void finalize_leaf_layer(int max_nthreads, nbits_t level,
    const DPFnode *curlevel, LN *nextlevel, bool save_expansion,
    LN CWL, LN CWR, value_t &low_sum,
    std::array<value_t,WIDTH> &high_sum,
    std::array<value_t,WIDTH> &high_xor)
{
    size_t curlevel_size = (size_t(1)<<level);
    if (max_nthreads == 1 || level < 19) {
        // No threading
        finalize_leaf_layer_nothreads(0, curlevel_size,
            curlevel, nextlevel, save_expansion, CWL, CWR,
            low_sum, high_sum, high_xor);
    } else {
        int nthreads =
            int(ceil(sqrt(double(curlevel_size/6000))));
        if (nthreads > max_nthreads) {
            nthreads = max_nthreads;
        }
        value_t tlow_sum[nthreads];
        std::array<value_t,WIDTH> thigh_sum[nthreads];
        std::array<value_t,WIDTH> thigh_xor[nthreads];
        size_t threadstart = 0;
        size_t threadchunk = curlevel_size / nthreads;
        size_t threadextra = curlevel_size % nthreads;
        boost::asio::thread_pool pool(nthreads);
        for (int t=0;t<nthreads;++t) {
            size_t threadsize = threadchunk + (size_t(t) < threadextra);
            size_t threadend = threadstart + threadsize;
            boost::asio::post(pool,
                [t, &tlow_sum, &thigh_sum, &thigh_xor, threadstart,
                    threadend, &curlevel, &nextlevel, CWL, CWR,
                    save_expansion] {
                    finalize_leaf_layer_nothreads(threadstart, threadend,
                        curlevel, nextlevel, save_expansion, CWL, CWR,
                        tlow_sum[t], thigh_sum[t], thigh_xor[t]);
                });
            threadstart = threadend;
        }
        pool.join();
        low_sum = 0;
        zero(high_sum);
        zero(high_xor);
        for (int t=0;t<nthreads;++t) {
            low_sum += tlow_sum[t];
            high_sum += thigh_sum[t];
            high_xor ^= thigh_xor[t];
        }
    }
}

// Create one level of the RDPF. NT will be as above: LeafNode if we
// are expanding into a leaf level, DPFnode if not. LI will be LeafInfo
// if we are expanding into a leaf level, and it is unused otherwise.
template<typename NT, typename LI>
static inline void create_level(MPCTIO &tio, yield_t &yield,
    const DPFnode *curlevel, NT *nextlevel,
    int player, nbits_t level, nbits_t depth, RegBS bs_choice, NT &CW,
    bool &cfbit, bool save_expansion, LI &li, size_t &aes_ops)
{
    // tio.cpu_nthreads() is the maximum number of threads we
    // have available.
    int max_nthreads = tio.cpu_nthreads();

    NT L, R;
    zero(L);
    zero(R);
    // The server doesn't need to do this computation, but it does
    // need to execute mpc_reconstruct_choice so that it sends
    // the AndTriples at the appropriate time.
    if (player < 2) {
#ifdef RDPF_MTGEN_TIMING_1
        if (player == 0) {
            mtgen_timetest_1(level, 0, (1<<23)>>level, curlevel,
                nextlevel, aes_ops);
            size_t niters = 2048;
            if (level > 8) niters = (1<<20)>>level;
            for(int t=1;t<=8;++t) {
                mtgen_timetest_1(level, t, niters, curlevel,
                    nextlevel, aes_ops);
            }
            mtgen_timetest_1(level, 0, (1<<23)>>level, curlevel,
                nextlevel, aes_ops);
        }
#endif
        // Using the timing results gathered above, decide whether
        // to multithread, and if so, how many threads to use.
        expand_level(max_nthreads, level, curlevel, nextlevel,
            L, R, aes_ops);
    }

    // If we're going left (bs_choice = 0), we want the correction
    // word to be the XOR of our right side and our peer's right
    // side; if bs_choice = 1, it should be the XOR of our left side
    // and our peer's left side.
    //
    // We also have to ensure that the flag bits (the lsb) of the
    // side that will end up the same are of course the same, but
    // also that the flag bits (the lsb) of the side that will end
    // up different _must_ be different. That is, it's not enough
    // for the nodes of the child selected by choice to be different
    // as 128-bit values; they also have to be different in their
    // lsb.
    //
    // This is where we make a small optimization over Appendix C of
    // the Duoram paper: instead of keeping separate correction flag
    // bits for the left and right children, we observe that the low
    // bit of the overall correction word effectively serves as one
    // of those bits, so we just need to store one extra bit per
    // level, not two. (We arbitrarily choose the one for the right
    // child.)
    //
    // Note that the XOR of our left and right child before and
    // after applying the correction word won't change, since the
    // correction word is applied to either both children or
    // neither, depending on the value of the parent's flag. So in
    // particular, the XOR of the flag bits won't change, and if our
    // children's flag's XOR equals our peer's children's flag's
    // XOR, then we won't have different flag bits even for the
    // children that have different 128-bit values.
    //
    // So we compute our_parity = lsb(L^R)^player, and we XOR that
    // into the R value in the correction word computation. At the
    // same time, we exchange these parity values to compute the
    // combined parity, which we store in the DPF. Then when the
    // DPF is evaluated, if the parent's flag is set, not only apply
    // the correction word to both children, but also apply the
    // (combined) parity bit to just the right child. Then for
    // unequal nodes (where the flag bit is different), exactly one
    // of the four children (two for P0 and two for P1) will have
    // the parity bit applied, which will set the XOR of the lsb of
    // those four nodes to just L0^R0^L1^R1^our_parity^peer_parity
    // = 1 because everything cancels out except player (for which
    // one player is 0 and the other is 1).
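    //
    // Spelled out (a sanity check on that cancellation): writing
    // parity_b = lsb(L_b)^lsb(R_b)^b for player b, the XOR above is
    // lsb(L0)^lsb(R0) ^ lsb(L1)^lsb(R1)
    //     ^ (lsb(L0)^lsb(R0)^0) ^ (lsb(L1)^lsb(R1)^1) = 1.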
    bool our_parity_bit = get_lsb(L) ^ get_lsb(R) ^ !!player;
    xor_lsb(R, our_parity_bit);

    NT CWL;
    bool peer_parity_bit;
    // Exchange the parities and do mpc_reconstruct_choice at the
    // same time (bundled into the same rounds)
    run_coroutines(yield,
        [&tio, &our_parity_bit, &peer_parity_bit](yield_t &yield) {
            tio.queue_peer(&our_parity_bit, 1);
            yield();
            uint8_t peer_parity_byte;
            tio.recv_peer(&peer_parity_byte, 1);
            peer_parity_bit = peer_parity_byte & 1;
        },
        [&tio, &CWL, &L, &R, bs_choice](yield_t &yield) {
            mpc_reconstruct_choice(tio, yield, CWL, bs_choice, R, L);
        });
    cfbit = our_parity_bit ^ peer_parity_bit;
    CW = CWL;
    NT CWR = CWL;
    xor_lsb(CWR, cfbit);
    if (player < 2) {
        // The timing of each iteration of the inner loop is
        // comparable to the above, so just use the same
        // computations. All of this could be tuned, of course.
        if constexpr (std::is_same_v<NT, DPFnode>) {
            finalize_nonleaf_layer(max_nthreads, level, curlevel,
                nextlevel, CWL, CWR);
        } else {
            // Recall there are four potentially useful vectors that
            // can come out of a DPF:
            // - (single-bit) bitwise unit vector
            // - additive-shared unit vector
            // - XOR-shared scaled unit vector
            // - additive-shared scaled unit vector
            //
            // (No single DPF should be used for both of the first
            // two or both of the last two, though, since they're
            // correlated; you _can_ use one of the first two and
            // one of the last two.)
            //
            // For each 128-bit leaf, the low bit is the flag bit,
            // and we're guaranteed that the flag bits (and indeed
            // the whole 128-bit value) for P0 and P1 are the same
            // for every leaf except the target, and that the flag
            // bits definitely differ for the target (and the other
            // 127 bits are independently random on each side).
            //
            // We divide the 128-bit leaf into a low 64-bit word and
            // a high 64-bit word. We use the low word for the unit
            // vector and the high word for the scaled vector; this
            // choice is not arbitrary: the flag bit in the low word
            // means that the sum of all the low words (with P1's
            // low words negated) across both P0 and P1 is
            // definitely odd, so we can compute that sum's inverse
            // mod 2^64, and store it now during precomputation. At
            // evaluation time for the additive-shared unit vector,
            // we will output this global inverse times the low word
            // of each leaf, which will make the sum of all of those
            // values 1. (This technique replaces the protocol in
            // Appendix D of the Duoram paper.)
            //
            // For the scaled vector, we just have to compute shares
            // of what the scaled vector is a sharing _of_, but
            // that's just XORing or adding all of each party's
            // local high words; no communication needed.

            value_t low_sum;
            const size_t WIDTH = LI::W;
            std::array<value_t,WIDTH> high_sum;
            std::array<value_t,WIDTH> high_xor;
            finalize_leaf_layer(max_nthreads, level, curlevel,
                nextlevel, save_expansion, CWL, CWR, low_sum, high_sum,
                high_xor);

            if (player == 1) {
                low_sum = -low_sum;
                for(size_t j=0; j<WIDTH; ++j) {
                    high_sum[j] = -high_sum[j];
                }
            }
            for(size_t j=0; j<WIDTH; ++j) {
                li.scaled_sum[j].ashare = high_sum[j];
                li.scaled_xor[j].xshare = high_xor[j];
            }
            // Exchange low_sum and add them up
            tio.queue_peer(&low_sum, sizeof(low_sum));
            yield();
            value_t peer_low_sum;
            tio.recv_peer(&peer_low_sum, sizeof(peer_low_sum));
            low_sum += peer_low_sum;
            // The low_sum had better be odd
            assert(low_sum & 1);
            li.unit_sum_inverse = inverse_value_t(low_sum);
        }
    } else if constexpr (!std::is_same_v<NT, DPFnode>) {
        yield();
    }
}

// Construct a DPF with the given (XOR-shared) target location, and
// of the given depth, to be used for random-access memory reads and
// writes. The DPF is constructed collaboratively by P0 and P1,
// with the server P2 helping by providing various kinds of
// correlated randomness, such as MultTriples and AndTriples.
//
// This algorithm is based on Appendix C from the Duoram paper, with a
// small optimization noted above in create_level().
template <nbits_t WIDTH>
RDPF<WIDTH>::RDPF(MPCTIO &tio, yield_t &yield,
    RegXS target, nbits_t depth, bool incremental, bool save_expansion)
{
    int player = tio.player();
    size_t &aes_ops = tio.aes_ops();

    // Choose a random seed
    arc4random_buf(&seed, sizeof(seed));
    // Ensure the flag bits (the lsb of each node) are different
    seed = set_lsb(seed, !!player);
    cfbits = 0;
    leaf_cfbits = 0;
    whichhalf = (player == 1);
    maxdepth = depth;
    curdepth = depth;

    // The root level is just the seed
    nbits_t level = 0;
    DPFnode *curlevel = NULL;
    DPFnode *nextlevel = new DPFnode[1];
    nextlevel[0] = seed;

    li.resize(incremental ? depth : 1);

    // Prefetch the right number of nodeselecttriples
    tio.request_nodeselecttriples(yield, incremental ? 2*depth-1 : depth);

    // Construct each intermediate level
    while(level < depth) {
        LeafNode *leaflevel = NULL;
        if (player < 2) {
            delete[] curlevel;
            curlevel = nextlevel;
            nextlevel = NULL;
            if (save_expansion && (incremental || level == depth-1)) {
                li[depth-1-level].expansion.resize(1<<(level+1));
                leaflevel = li[depth-1-level].expansion.data();
            } else if (incremental || level == depth-1) {
                leaflevel = new LeafNode[1<<(level+1)];
            }
            if (level < depth-1) {
                nextlevel = new DPFnode[1<<(level+1)];
            }
        }
        // Invariant: curlevel has 2^level DPFnode elements; nextlevel
        // has 2^{level+1} DPFnode elements if we're not at the last
        // level, and leaflevel has 2^{level+1} LeafNode elements if we
        // are at a leaf level (the last level always, and all levels if
        // we are making an incremental RDPF).

        // The bit-shared choice bit is bit (depth-level-1) of the
        // XOR-shared target index
        RegBS bs_choice = target.bit(depth-level-1);

        // At each layer, we can create the next internal layer and the
        // leaf layer in parallel coroutines if we're making an
        // incremental RDPF. If not, exactly one of these coroutines
        // will be created, and we just run that one.
        std::vector<coro_t> coroutines;
        if (level < depth-1) {
            coroutines.emplace_back([this, &tio, curlevel, nextlevel,
                player, level, depth, bs_choice, save_expansion,
                &aes_ops] (yield_t &yield) {
                DPFnode CW;
                bool cfbit;
                // This field is ignored when we're not expanding to a
                // leaf level, but it needs to be an lvalue reference.
                int noleafinfo = 0;
                create_level(tio, yield, curlevel, nextlevel, player, level,
                    depth, bs_choice, CW, cfbit, save_expansion, noleafinfo,
                    aes_ops);
                cfbits |= (value_t(cfbit)<<level);
                if (player < 2) {
                    cw.push_back(CW);
                }
            });
        }
        if (incremental || level == depth-1) {
            coroutines.emplace_back([this, &tio, curlevel, leaflevel,
                player, level, depth, bs_choice, save_expansion,
                &aes_ops](yield_t &yield) {
                LeafNode CW;
                bool cfbit;
                create_level(tio, yield, curlevel, leaflevel, player,
                    level, depth, bs_choice, CW, cfbit, save_expansion,
                    li[depth-level-1], aes_ops);
                leaf_cfbits |= (value_t(cfbit)<<(depth-level-1));
                li[depth-level-1].leaf_cw = CW;
            });
        }
        run_coroutines(yield, coroutines);
        if (!save_expansion) {
            delete[] leaflevel;
        }
        ++level;
    }
    delete[] curlevel;
    delete[] nextlevel;
}
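
// A construction sketch (illustrative only; tio and yield come from the
// surrounding MPC machinery, and the trailing bool parameters are
// assumed to default to false in rdpf.hpp):
//
//   RegXS target;
//   target.randomize(depth);  // random XOR-shared target index
//   RDPF<1> rdpf(tio, yield, target, depth);
//   auto leaf = rdpf.leaf(27, tio.aes_ops());  // evaluate at index 27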

// Get the leaf node for the given input
template <nbits_t WIDTH>
typename RDPF<WIDTH>::LeafNode
RDPF<WIDTH>::leaf(address_t input, size_t &aes_ops) const
{
    // If we have a precomputed expansion, just use it
    if (li[maxdepth-curdepth].expansion.size()) {
        return li[maxdepth-curdepth].expansion[input];
    }

    DPFnode node = seed;
    for (nbits_t d=0;d<curdepth-1;++d) {
        bit_t dir = !!(input & (address_t(1)<<(curdepth-d-1)));
        node = descend(node, d, dir, aes_ops);
    }
    bit_t dir = (input & 1);
    return descend_to_leaf(node, curdepth-1, dir, aes_ops);
}

// Expand one leaf layer of the DPF if it's not already expanded
//
// This routine is slightly more efficient (except for incremental
// RDPFs) than repeatedly calling StreamEval::next(), but it uses a lot
// more memory.
template <nbits_t WIDTH>
void RDPF<WIDTH>::expand_leaf_layer(nbits_t li_index, size_t &aes_ops)
{
    nbits_t depth = maxdepth - li_index;
    size_t num_leaves = size_t(1)<<depth;
    if (li[li_index].expansion.size() == num_leaves) return;
    li[li_index].expansion.resize(num_leaves);
    address_t index = 0;
    address_t lastindex = 0;
    DPFnode *path = new DPFnode[depth];
    path[0] = seed;
    for (nbits_t i=1;i<depth;++i) {
        path[i] = descend(path[i-1], i-1, 0, aes_ops);
    }
    li[maxdepth-depth].expansion[index++] =
        descend_to_leaf(path[depth-1], depth-1, 0, aes_ops);
    li[maxdepth-depth].expansion[index++] =
        descend_to_leaf(path[depth-1], depth-1, 1, aes_ops);
    while(index < num_leaves) {
        // Invariant: lastindex and index will both be even, and
        // index=lastindex+2
        uint64_t index_xor = index ^ lastindex;
        nbits_t how_many_1_bits = __builtin_popcount(index_xor);
        // If lastindex -> index goes for example from (in binary)
        // 010010110 -> 010011000, then index_xor will be
        // 000001110 and how_many_1_bits will be 3.
        // That indicates that path[depth-3] was a left child, and now
        // we need to change it to a right child by descending right
        // from path[depth-4], and then filling the path after that with
        // left children.
        path[depth-how_many_1_bits] =
            descend(path[depth-how_many_1_bits-1],
                depth-how_many_1_bits-1, 1, aes_ops);
        for (nbits_t i = depth-how_many_1_bits; i < depth-1; ++i) {
            path[i+1] = descend(path[i], i, 0, aes_ops);
        }
        lastindex = index;
        li[li_index].expansion[index++] =
            descend_to_leaf(path[depth-1], depth-1, 0, aes_ops);
        li[li_index].expansion[index++] =
            descend_to_leaf(path[depth-1], depth-1, 1, aes_ops);
    }

    delete[] path;
}

// Expand the DPF if it's not already expanded
//
// This routine is slightly more efficient (except for incremental
// RDPFs) than repeatedly calling StreamEval::next(), but it uses a lot
// more memory.
template <nbits_t WIDTH>
void RDPF<WIDTH>::expand(size_t &aes_ops)
{
    nbits_t num_leaf_layers = li.size();
    for (nbits_t li_index=0; li_index < num_leaf_layers; ++li_index) {
        expand_leaf_layer(li_index, aes_ops);
    }
}

// Construct three RDPFs of the given depth all with the same randomly
// generated target index.
template <nbits_t WIDTH>
RDPFTriple<WIDTH>::RDPFTriple(MPCTIO &tio, yield_t &yield,
    nbits_t depth, bool incremental, bool save_expansion)
{
    // Pick a random XOR share of the target
    xs_target.randomize(depth);

    // Now create three RDPFs with that target, and also convert the XOR
    // shares of the target to additive shares
    std::vector<coro_t> coroutines;
    for (int i=0;i<3;++i) {
        coroutines.emplace_back(
            [this, &tio, depth, i, incremental,
                save_expansion](yield_t &yield) {
                dpf[i] = RDPF<WIDTH>(tio, yield, xs_target, depth,
                    incremental, save_expansion);
            });
    }
    coroutines.emplace_back(
        [this, &tio, depth](yield_t &yield) {
            mpc_xs_to_as(tio, yield, as_target, xs_target, depth, false);
        });
    run_coroutines(yield, coroutines);
}

template <nbits_t WIDTH>
typename RDPFTriple<WIDTH>::node RDPFTriple<WIDTH>::descend(
    const RDPFTriple<WIDTH>::node &parent,
    nbits_t parentdepth, bit_t whichchild,
    size_t &aes_ops) const
{
    auto [P0, P1, P2] = parent;
    DPFnode C0, C1, C2;
    C0 = dpf[0].descend(P0, parentdepth, whichchild, aes_ops);
    C1 = dpf[1].descend(P1, parentdepth, whichchild, aes_ops);
    C2 = dpf[2].descend(P2, parentdepth, whichchild, aes_ops);
    return std::make_tuple(C0,C1,C2);
}

template <nbits_t WIDTH>
typename RDPFTriple<WIDTH>::LeafNode RDPFTriple<WIDTH>::descend_to_leaf(
    const RDPFTriple<WIDTH>::node &parent,
    nbits_t parentdepth, bit_t whichchild,
    size_t &aes_ops) const
{
    auto [P0, P1, P2] = parent;
    typename RDPF<WIDTH>::LeafNode C0, C1, C2;
    C0 = dpf[0].descend_to_leaf(P0, parentdepth, whichchild, aes_ops);
    C1 = dpf[1].descend_to_leaf(P1, parentdepth, whichchild, aes_ops);
    C2 = dpf[2].descend_to_leaf(P2, parentdepth, whichchild, aes_ops);
    return std::make_tuple(C0,C1,C2);
}

template <nbits_t WIDTH>
typename RDPFPair<WIDTH>::node RDPFPair<WIDTH>::descend(
    const RDPFPair<WIDTH>::node &parent,
    nbits_t parentdepth, bit_t whichchild,
    size_t &aes_ops) const
{
    auto [P0, P1] = parent;
    DPFnode C0, C1;
    C0 = dpf[0].descend(P0, parentdepth, whichchild, aes_ops);
    C1 = dpf[1].descend(P1, parentdepth, whichchild, aes_ops);
    return std::make_tuple(C0,C1);
}

template <nbits_t WIDTH>
typename RDPFPair<WIDTH>::LeafNode RDPFPair<WIDTH>::descend_to_leaf(
    const RDPFPair<WIDTH>::node &parent,
    nbits_t parentdepth, bit_t whichchild,
    size_t &aes_ops) const
{
    auto [P0, P1] = parent;
    typename RDPF<WIDTH>::LeafNode C0, C1;
    C0 = dpf[0].descend_to_leaf(P0, parentdepth, whichchild, aes_ops);
    C1 = dpf[1].descend_to_leaf(P1, parentdepth, whichchild, aes_ops);
    return std::make_tuple(C0,C1);
}

template <nbits_t WIDTH>
typename RDPF2of3<WIDTH>::node RDPF2of3<WIDTH>::descend(
    const RDPF2of3<WIDTH>::node &parent,
    nbits_t parentdepth, bit_t whichchild,
    size_t &aes_ops) const
{
    auto [P0, P1] = parent;
    DPFnode C0, C1;
    C0 = dpf0.descend(P0, parentdepth, whichchild, aes_ops);
    C1 = dpf1.descend(P1, parentdepth, whichchild, aes_ops);
    return std::make_tuple(C0,C1);
}

template <nbits_t WIDTH>
typename RDPF2of3<WIDTH>::LeafNode RDPF2of3<WIDTH>::descend_to_leaf(
    const RDPF2of3<WIDTH>::node &parent,
    nbits_t parentdepth, bit_t whichchild,
    size_t &aes_ops) const
{
    auto [P0, P1] = parent;
    typename RDPF<WIDTH>::LeafNode C0, C1;
    C0 = dpf0.descend_to_leaf(P0, parentdepth, whichchild, aes_ops);
    C1 = dpf1.descend_to_leaf(P1, parentdepth, whichchild, aes_ops);
    return std::make_tuple(C0,C1);
}