iang
/
prac


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899
							#include <bsd/stdlib.h> // arc4random_buf

#include "rdpf.hpp"
#include "bitutils.hpp"

#undef RDPF_MTGEN_TIMING_1

#ifdef RDPF_MTGEN_TIMING_1
// Timing tests for multithreaded generation of RDPFs
// nthreads = 0 to not launch threads at all
// run for num_iters iterations, output the number of milliseconds
// total for all of the iterations
//
// Results: roughly 50 µs to launch the thread pool with 1 thread, and
// roughly 30 additional µs for each additional thread.  Each iteration
// of the inner loop takes about 4 to 5 ns.  This works out to around
// level 19 where it starts being worth it to multithread, and you
// should use at most sqrt(2^{level}/6000) threads.
//
// Parameters:
//   level      curlevel[0 .. 2^level - 1] is read on every iteration
//   nthreads   0 runs the expansion inline on the calling thread;
//              otherwise a fresh pool of nthreads threads is created
//              (and joined) on every iteration
//   num_iters  number of timed iterations; 0 is treated as 1
//   curlevel   input nodes for the level being expanded
//   nextlevel  output nodes; entries 2*i and 2*i+1 receive the two
//              children produced from curlevel[i]
//   aes_ops    incremented by the totals of the counters handed to
//              prgboth (presumably a count of AES operations, going by
//              the " AES" label on the output line)
//
// Prints one "timetest_1 ..." line to std::cout, then dumps the XOR of
// all left children and the XOR of all right children via dump_node.
static void mtgen_timetest_1(nbits_t level, int nthreads,
    size_t num_iters, const DPFnode *curlevel,
    DPFnode *nextlevel, size_t &aes_ops)
{
    // Always time at least one iteration
    num_iters = num_iters ? num_iters : 1;

    const size_t aes_ops_before = aes_ops;
    DPFnode xor_left = _mm_setzero_si128();
    DPFnode xor_right = _mm_setzero_si128();
    // Changing the tweak on every pass makes each iteration compute
    // something slightly different, so the compiler cannot collapse
    // the num_iters identical passes into one
    DPFnode tweak = _mm_setzero_si128();
    const auto start = boost::chrono::steady_clock::now();
    for (size_t remaining = num_iters; remaining > 0; --remaining) {
        // This really adds the 128-bit value whose high and low 64-bit
        // words are both 1, which is fine for our purposes
        tweak += 1;
        const size_t num_nodes = size_t(1) << level;
        if (nthreads != 0) {
            // Per-thread results; each worker writes only its own slot
            DPFnode part_left[nthreads];
            DPFnode part_right[nthreads];
            size_t part_ops[nthreads];
            // Split the level as evenly as possible: the first
            // "leftover" workers each take one extra node
            const size_t base_share = num_nodes / nthreads;
            const size_t leftover = num_nodes % nthreads;
            size_t chunk_begin = 0;
            boost::asio::thread_pool pool(nthreads);
            for (int t = 0; t < nthreads; ++t) {
                const size_t chunk_len = (size_t(t) < leftover) ?
                    base_share + 1 : base_share;
                const size_t chunk_end = chunk_begin + chunk_len;
                boost::asio::post(pool,
                    [t, &part_left, &part_right, &part_ops,
                    chunk_begin, chunk_end, &curlevel, &nextlevel,
                    tweak] {
                        DPFnode acc_left = _mm_setzero_si128();
                        DPFnode acc_right = _mm_setzero_si128();
                        size_t ops = 0;
                        for (size_t idx = chunk_begin; idx < chunk_end;
                                ++idx) {
                            DPFnode left, right;
                            prgboth(left, right, curlevel[idx]^tweak,
                                ops);
                            acc_left ^= left;
                            acc_right ^= right;
                            nextlevel[2*idx] = left;
                            nextlevel[2*idx+1] = right;
                        }
                        part_left[t] = acc_left;
                        part_right[t] = acc_right;
                        part_ops[t] = ops;
                    });
                chunk_begin = chunk_end;
            }
            // Wait for every worker before reading the per-thread slots
            pool.join();
            for (int t = 0; t < nthreads; ++t) {
                xor_left ^= part_left[t];
                xor_right ^= part_right[t];
                aes_ops += part_ops[t];
            }
        } else {
            // No threads requested: expand the whole level right here
            size_t ops_here = 0;
            for (size_t idx = 0; idx < num_nodes; ++idx) {
                DPFnode left, right;
                prgboth(left, right, curlevel[idx]^tweak, ops_here);
                xor_left ^= left;
                xor_right ^= right;
                nextlevel[2*idx] = left;
                nextlevel[2*idx+1] = right;
            }
            aes_ops += ops_here;
        }
    }
    const auto elapsed_ms =
        boost::chrono::duration_cast<boost::chrono::milliseconds>(
            boost::chrono::steady_clock::now() - start);
    std::cout << "timetest_1 " << int(level) << " " << nthreads << " "
        << num_iters << " " << elapsed_ms << " "
        << (aes_ops - aes_ops_before) << " AES\n";
    // Emit the accumulated XORs so the work above has an observable
    // result
    dump_node(xor_left);
    dump_node(xor_right);
}

#endif