rdpf.cpp 3.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899
  1. #include <bsd/stdlib.h> // arc4random_buf
  2. #include "rdpf.hpp"
  3. #include "bitutils.hpp"
  4. #undef RDPF_MTGEN_TIMING_1
  5. #ifdef RDPF_MTGEN_TIMING_1
  6. // Timing tests for multithreaded generation of RDPFs
  7. // nthreads = 0 to not launch threads at all
  8. // run for num_iters iterations, output the number of millisections
  9. // total for all of the iterations
  10. //
  11. // Results: roughly 50 µs to launch the thread pool with 1 thread, and
  12. // roughly 30 additional µs for each additional thread. Each iteration
  13. // of the inner loop takes about 4 to 5 ns. This works out to around
  14. // level 19 where it starts being worth it to multithread, and you
  15. // should use at most sqrt(2^{level}/6000) threads.
  16. static void mtgen_timetest_1(nbits_t level, int nthreads,
  17. size_t num_iters, const DPFnode *curlevel,
  18. DPFnode *nextlevel, size_t &aes_ops)
  19. {
  20. if (num_iters == 0) {
  21. num_iters = 1;
  22. }
  23. size_t prev_aes_ops = aes_ops;
  24. DPFnode L = _mm_setzero_si128();
  25. DPFnode R = _mm_setzero_si128();
  26. // The tweak causes us to compute something slightly different every
  27. // iteration of the loop, so that the compiler doesn't notice we're
  28. // doing the same thing num_iters times and optimize it away
  29. DPFnode tweak = _mm_setzero_si128();
  30. auto start = boost::chrono::steady_clock::now();
  31. for(size_t iter=0;iter<num_iters;++iter) {
  32. tweak += 1; // This actually adds the 128-bit value whose high
  33. // and low 64-bits words are both 1, but that's
  34. // fine.
  35. size_t curlevel_size = size_t(1)<<level;
  36. if (nthreads == 0) {
  37. size_t laes_ops = 0;
  38. for(size_t i=0;i<curlevel_size;++i) {
  39. DPFnode lchild, rchild;
  40. prgboth(lchild, rchild, curlevel[i]^tweak, laes_ops);
  41. L = (L ^ lchild);
  42. R = (R ^ rchild);
  43. nextlevel[2*i] = lchild;
  44. nextlevel[2*i+1] = rchild;
  45. }
  46. aes_ops += laes_ops;
  47. } else {
  48. DPFnode tL[nthreads];
  49. DPFnode tR[nthreads];
  50. size_t taes_ops[nthreads];
  51. size_t threadstart = 0;
  52. size_t threadchunk = curlevel_size / nthreads;
  53. size_t threadextra = curlevel_size % nthreads;
  54. boost::asio::thread_pool pool(nthreads);
  55. for (int t=0;t<nthreads;++t) {
  56. size_t threadsize = threadchunk + (size_t(t) < threadextra);
  57. size_t threadend = threadstart + threadsize;
  58. boost::asio::post(pool,
  59. [t, &tL, &tR, &taes_ops, threadstart, threadend,
  60. &curlevel, &nextlevel, tweak] {
  61. DPFnode L = _mm_setzero_si128();
  62. DPFnode R = _mm_setzero_si128();
  63. size_t aes_ops = 0;
  64. for(size_t i=threadstart;i<threadend;++i) {
  65. DPFnode lchild, rchild;
  66. prgboth(lchild, rchild, curlevel[i]^tweak, aes_ops);
  67. L = (L ^ lchild);
  68. R = (R ^ rchild);
  69. nextlevel[2*i] = lchild;
  70. nextlevel[2*i+1] = rchild;
  71. }
  72. tL[t] = L;
  73. tR[t] = R;
  74. taes_ops[t] = aes_ops;
  75. });
  76. threadstart = threadend;
  77. }
  78. pool.join();
  79. for (int t=0;t<nthreads;++t) {
  80. L ^= tL[t];
  81. R ^= tR[t];
  82. aes_ops += taes_ops[t];
  83. }
  84. }
  85. }
  86. auto elapsed =
  87. boost::chrono::steady_clock::now() - start;
  88. std::cout << "timetest_1 " << int(level) << " " << nthreads << " "
  89. << num_iters << " " << boost::chrono::duration_cast
  90. <boost::chrono::milliseconds>(elapsed) << " " <<
  91. (aes_ops-prev_aes_ops) << " AES\n";
  92. dump_node(L);
  93. dump_node(R);
  94. }
  95. #endif