| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899 | #include <bsd/stdlib.h> // arc4random_buf#include "rdpf.hpp"#include "bitutils.hpp"#undef RDPF_MTGEN_TIMING_1#ifdef RDPF_MTGEN_TIMING_1// Timing tests for multithreaded generation of RDPFs// nthreads = 0 to not launch threads at all// run for num_iters iterations, output the number of millisections// total for all of the iterations//// Results: roughly 50 µs to launch the thread pool with 1 thread, and// roughly 30 additional µs for each additional thread.  Each iteration// of the inner loop takes about 4 to 5 ns.  This works out to around// level 19 where it starts being worth it to multithread, and you// should use at most sqrt(2^{level}/6000) threads.static void mtgen_timetest_1(nbits_t level, int nthreads,    size_t num_iters, const DPFnode *curlevel,    DPFnode *nextlevel, size_t &aes_ops){    if (num_iters == 0) {        num_iters = 1;    }    size_t prev_aes_ops = aes_ops;    DPFnode L = _mm_setzero_si128();    DPFnode R = _mm_setzero_si128();    // The tweak causes us to compute something slightly different every    // iteration of the loop, so that the compiler doesn't notice we're    // doing the same thing num_iters times and optimize it away    DPFnode tweak = _mm_setzero_si128();    auto start = boost::chrono::steady_clock::now();    for(size_t iter=0;iter<num_iters;++iter) {        tweak += 1;  // This actually adds the 128-bit value whose high                     // and low 64-bits words are both 1, but that's                     // fine.        size_t curlevel_size = size_t(1)<<level;        if (nthreads == 0) {            size_t laes_ops = 0;            for(size_t i=0;i<curlevel_size;++i) {                DPFnode lchild, rchild;                prgboth(lchild, rchild, curlevel[i]^tweak, laes_ops);                L = (L ^ lchild);                R = (R ^ rchild);                nextlevel[2*i] = lchild;                nextlevel[2*i+1] = rchild;            }            aes_ops += laes_ops;        } else {            DPFnode tL[nthreads];            DPFnode tR[nthreads];            size_t taes_ops[nthreads];            size_t threadstart = 0;            size_t threadchunk = curlevel_size / nthreads;            size_t threadextra = curlevel_size % nthreads;            boost::asio::thread_pool pool(nthreads);            for (int t=0;t<nthreads;++t) {                size_t threadsize = threadchunk + (size_t(t) < threadextra);                size_t threadend = threadstart + threadsize;                boost::asio::post(pool,                    [t, &tL, &tR, &taes_ops, threadstart, threadend,                    &curlevel, &nextlevel, tweak] {                        DPFnode L = _mm_setzero_si128();                        DPFnode R = _mm_setzero_si128();                        size_t aes_ops = 0;                        for(size_t i=threadstart;i<threadend;++i) {                            DPFnode lchild, rchild;                            prgboth(lchild, rchild, curlevel[i]^tweak, aes_ops);                            L = (L ^ lchild);                            R = (R ^ rchild);                            nextlevel[2*i] = lchild;                            nextlevel[2*i+1] = rchild;                        }                        tL[t] = L;                        tR[t] = R;                        taes_ops[t] = aes_ops;                    });                threadstart = threadend;            }            pool.join();            for (int t=0;t<nthreads;++t) {                L ^= tL[t];                R ^= tR[t];                aes_ops += taes_ops[t];            }        }    }    auto elapsed =        boost::chrono::steady_clock::now() - start;    std::cout << "timetest_1 " << int(level) << " " << nthreads << " "        << num_iters << " " << boost::chrono::duration_cast        <boost::chrono::milliseconds>(elapsed) << " " <<        (aes_ops-prev_aes_ops) << " AES\n";    dump_node(L);    dump_node(R);}#endif
 |