#include // std::is_same<> #include // std::numeric_limits<> #include // CHAR_BIT #include // std::log2, std::ceil, std::floor #include // std::runtime_error #include // std::array<> #include // std::istream and std::ostream #include // std::vector<> #include // std::shared_ptr<> #include // std::move #include // std::copy #include // std::memcpy #include // arc4random_buf #include // SSE and AVX intrinsics #include size_t communication_cost = 0; #include "bitutils.h" #include "block.h" #include "prg.h" #include "prg_aes_impl.h" #include #include #include #include "block.h" #include #include #include #include #include #include #include using boost::asio::ip::tcp; #include #include using socket_t = boost::asio::ip::tcp::socket; typedef unsigned char byte_t; typedef __m128i node_t; block<__m128i> seed_for_blinds; constexpr size_t leaf_size = 1; typedef __m128i leaf_type; typedef std::array leaf_t; size_t bits_per_leaf = std::is_same::value ? 1 : sizeof(leaf_t) * CHAR_BIT; bool is_packed = (sizeof(leaf_t) < sizeof(node_t)); size_t leaves_per_node = is_packed ? sizeof(node_t) * CHAR_BIT / bits_per_leaf : 1; size_t input_bits(const size_t nitems) { return std::ceil(std::log2(nitems)); } leaf_t val; using namespace dpf; #include "mpc.h" void generate_random_targets(uint8_t **target_share_read, size_t n_threads, bool party, size_t expo) { for (size_t j = 0; j < 64; ++j) { for (size_t i = 0; i < n_threads; ++i) { uint8_t random_value; arc4random_buf(&random_value, sizeof(uint8_t)); target_share_read[i][j] = random_value; // rand(); } } } void compute_CW(bool party, tcp::socket &sout, __m128i L, __m128i R, uint8_t bit, __m128i &CW) { // struct cw_construction //{ __m128i rand_b, gamma_b; uint8_t bit_b; //}; __m128i *X, *Y; if (party) { std::string qfile = std::string("./gamma1"); int qfd = open(qfile.c_str(), O_RDWR); X = (__m128i *)mmap(NULL, 8 * sizeof(__m128i), PROT_READ, MAP_PRIVATE, qfd, 0); close(qfd); qfile = std::string("./x1"); qfd = open(qfile.c_str(), O_RDWR); Y = (__m128i *)mmap(NULL, 8 * sizeof(__m128i), PROT_READ, MAP_PRIVATE, qfd, 0); close(qfd); } if (!party) { std::string qfile = std::string("./gamma0"); int qfd = open(qfile.c_str(), O_RDWR); X = (__m128i *)mmap(NULL, 8 * sizeof(__m128i), PROT_READ, MAP_PRIVATE, qfd, 0); close(qfd); qfile = std::string("./x0"); qfd = open(qfile.c_str(), O_RDWR); Y = (__m128i *)mmap(NULL, 8 * sizeof(__m128i), PROT_READ, MAP_PRIVATE, qfd, 0); close(qfd); } // cw_construction computecw; // read(sin, boost::asio::buffer(&computecw, sizeof(computecw))); // computecw.rand_b; //__m128i gamma_b = computecw.gamma_b; if (party) { rand_b = Y[0]; //_mm_set_epi32(0x6fef9434, 0x6768121e, 0x20942286, 0x1b59f7a7); gamma_b = X[0]; // _mm_set_epi32(0x6a499109 , 0x803067dd , 0xd1e2281b , 0xe71b6262); bit_b = 1; // computecw.bit_b; } else { rand_b = Y[0]; // _mm_set_epi32(0xb29747df, 0xf7300f6d, 0x9476d971, 0xd5f75d98); gamma_b = X[0]; // _mm_set_epi32(0xb73142e2 , 0x10687aae , 0x06500d3ec , 0x29b5c85d); bit_b = 1; // computecw.bit_b; } uint8_t blinded_bit, blinded_bit_read; blinded_bit = bit ^ bit_b; __m128i blinded_L = L ^ R ^ rand_b; __m128i blinded_L_read; struct BlindsCW { __m128i blinded_message; uint8_t blinded_bit; }; BlindsCW blinds_sent, blinds_recv; blinds_sent.blinded_bit = blinded_bit; blinds_sent.blinded_message = blinded_L; boost::asio::write(sout, boost::asio::buffer(&blinds_sent, sizeof(blinds_sent))); boost::asio::read(sout, boost::asio::buffer(&blinds_recv, sizeof(blinds_recv))); communication_cost += sizeof(blinds_recv); blinded_bit_read = blinds_recv.blinded_bit; blinded_L_read = blinds_recv.blinded_message; __m128i out_ = R ^ gamma_b; //_mm_setzero_si128; if (bit) { out_ ^= (L ^ R ^ blinded_L_read); } if (blinded_bit_read) { out_ ^= rand_b; } __m128i out_reconstruction; boost::asio::write(sout, boost::asio::buffer(&out_, sizeof(out_))); boost::asio::read(sout, boost::asio::buffer(&out_reconstruction, sizeof(out_reconstruction))); communication_cost += sizeof(out_reconstruction); out_reconstruction = out_ ^ out_reconstruction; CW = out_reconstruction; #ifdef DEBUG uint8_t bit_reconstruction; boost::asio::write(sout, boost::asio::buffer(&bit, sizeof(bit))); boost::asio::read(sout, boost::asio::buffer(&bit_reconstruction, sizeof(bit_reconstruction))); bit_reconstruction = bit ^ bit_reconstruction; __m128i L_reconstruction; boost::asio::write(sout, boost::asio::buffer(&L, sizeof(L))); boost::asio::read(sout, boost::asio::buffer(&L_reconstruction, sizeof(L_reconstruction))); L_reconstruction = L ^ L_reconstruction; __m128i R_reconstruction; boost::asio::write(sout, boost::asio::buffer(&R, sizeof(R))); boost::asio::read(sout, boost::asio::buffer(&R_reconstruction, sizeof(R_reconstruction))); R_reconstruction = R ^ R_reconstruction; __m128i CW_debug; if (bit_reconstruction != 0) { CW_debug = L_reconstruction; } else { CW_debug = R_reconstruction; } assert(CW_debug[0] == CW[0]); assert(CW_debug[1] == CW[1]); #endif munmap(X, 8 * sizeof(__m128i)); munmap(Y, 8 * sizeof(__m128i)); } __m128i bit_mask_avx2_msb(unsigned int n) { __m128i ones = _mm_set1_epi32(-1); __m128i cnst32_128 = _mm_set_epi32(32, 64, 96, 128); __m128i shift = _mm_set1_epi32(n); shift = _mm_subs_epu16(cnst32_128, shift); return _mm_sllv_epi32(ones, shift); } __m128i bit_mask_avx2_lsb(unsigned int n) { __m128i ones = _mm_set1_epi32(-1); __m128i cnst32_128 = _mm_set_epi32(128, 96, 64, 32); __m128i shift = _mm_set1_epi32(n); shift = _mm_subs_epu16(cnst32_128, shift); return _mm_srlv_epi32(ones, shift); } template static inline void traverse(const prgkey_t &prgkey, const node_t &seed, node_t s[2]) { dpf::PRG(prgkey, clear_lsb(seed, 0b11), s, 2); } // dpf::expand inline void evalfull_mpc(const size_t &nodes_per_leaf, const size_t &depth, const size_t &nbits, const size_t &nodes_in_interval, const AES_KEY &prgkey, uint8_t target_share[64], std::vector &socketsPb, const size_t from, const size_t to, __m128i *output, int8_t *_t, __m128i &final_correction_word, bool party, size_t socket_no = 0) { __m128i root; arc4random_buf(&root, sizeof(root)); root = set_lsb(root, party); const size_t from_node = std::floor(static_cast(from) / nodes_per_leaf); __m128i *s[2] = { reinterpret_cast<__m128i *>(output) + nodes_in_interval * (nodes_per_leaf - 1), s[0] + nodes_in_interval / 2 }; int8_t *t[2] = {_t, _t + nodes_in_interval / 2}; int curlayer = depth % 2; s[curlayer][0] = root; t[curlayer][0] = get_lsb(root, 0b01); __m128i *CW = (__m128i *)std::aligned_alloc(sizeof(__m256i), depth * sizeof(__m128i)); for (size_t layer = 0; layer < depth; ++layer) { #ifdef VERBOSE printf("layer = %zu\n", layer); #endif curlayer = 1 - curlayer; size_t i = 0, j = 0; auto nextbit = (from_node >> (nbits - layer - 1)) & 1; size_t nodes_in_prev_layer = std::ceil(static_cast(nodes_in_interval) / (1ULL << (depth - layer))); size_t nodes_in_cur_layer = std::ceil(static_cast(nodes_in_interval) / (1ULL << (depth - layer - 1))); __m128i L = _mm_setzero_si128(); __m128i R = _mm_setzero_si128(); for (i = nextbit, j = nextbit; j < nodes_in_prev_layer - 1; ++j, i += 2) { traverse(prgkey, s[1 - curlayer][j], &s[curlayer][i]); L ^= s[curlayer][i]; R ^= s[curlayer][i + 1]; } if (nodes_in_prev_layer > j) { if (i < nodes_in_cur_layer - 1) { traverse(prgkey, s[1 - curlayer][j], &s[curlayer][i]); L ^= s[curlayer][i]; R ^= s[curlayer][i + 1]; } } compute_CW(party, socketsPb[socket_no], L, R, target_share[layer], CW[layer]); uint8_t advice_L = get_lsb(L) ^ target_share[layer]; uint8_t advice_R = get_lsb(R) ^ target_share[layer]; uint8_t cwt_L, cwt_R; uint8_t advice[2]; uint8_t cwts[2]; advice[0] = advice_L; advice[1] = advice_R; boost::asio::write(socketsPb[socket_no + 1], boost::asio::buffer(&advice, sizeof(advice))); boost::asio::read(socketsPb[socket_no + 1], boost::asio::buffer(&cwts, sizeof(cwts))); cwt_L = cwts[0]; cwt_R = cwts[1]; cwt_L = cwt_L ^ advice_L ^ 1; cwt_R = cwt_R ^ advice_R; for (size_t j = 0; j < nodes_in_prev_layer; ++j) { t[curlayer][2 * j] = get_lsb(s[curlayer][2 * j]) ^ (cwt_L & t[1 - curlayer][j]); s[curlayer][2 * j] = clear_lsb(xor_if(s[curlayer][2 * j], CW[layer], !t[1 - curlayer][j]), 0b11); t[curlayer][(2 * j) + 1] = get_lsb(s[curlayer][(2 * j) + 1]) ^ (cwt_R & t[1 - curlayer][j]); s[curlayer][(2 * j) + 1] = clear_lsb(xor_if(s[curlayer][(2 * j) + 1], CW[layer], !t[1 - curlayer][j]), 0b11); } } free(CW); __m128i Gamma = _mm_setzero_si128(); for (size_t i = 0; i < to + 1; ++i) { Gamma[0] += output[i][0]; Gamma[1] += output[i][1]; } if (party) { Gamma[0] = -Gamma[0]; Gamma[1] = -Gamma[1]; } boost::asio::write(socketsPb[socket_no + 3], boost::asio::buffer(&Gamma, sizeof(Gamma))); boost::asio::read(socketsPb[socket_no + 3], boost::asio::buffer(&final_correction_word, sizeof(final_correction_word))); communication_cost += sizeof(Gamma); final_correction_word = Gamma; // final_correction_word + Gamma; } // dpf::__evalinterval void convert_shares(__m128i **output, int8_t **flags, size_t n_threads, size_t db_nitems, __m128i *final_correction_word, tcp::socket &sb, bool party) { for (size_t j = 0; j < db_nitems; ++j) { for (size_t k = 0; k < n_threads; ++k) { if (party) { output[k][j] = -output[k][j]; flags[k][j] = -flags[k][j]; } } #ifdef DEBUG int8_t out = flags[0][j]; int8_t out_rec; boost::asio::write(sb, boost::asio::buffer(&out, sizeof(out))); boost::asio::read(sb, boost::asio::buffer(&out_rec, sizeof(out_rec))); out_rec = out_rec + out; if (out_rec != 0) std::cout << j << "(flags) --> " << (int)out_rec << std::endl << std::endl; __m128i out2 = output[0][j]; __m128i out_rec2; boost::asio::write(sb, boost::asio::buffer(&out2, sizeof(out2))); boost::asio::read(sb, boost::asio::buffer(&out_rec2, sizeof(out_rec2))); out_rec2 = out_rec2 + out2; if (out_rec2[0] != 0) std::cout << j << "--> " << out_rec2[0] << std::endl; #endif } for (size_t i = 0; i < n_threads; ++i) { int64_t pm = 0; int64_t rb; arc4random_buf(&rb, sizeof(rb)); for (size_t j = 0; j < db_nitems; ++j) { if (party) { if (flags[i][j] != 0) pm -= 1; } if (!party) { if (flags[i][j] != 0) pm += 1; // flags[0][j]; } } } } void accept_conncections_from_Pb(boost::asio::io_context &io_context, std::vector &socketsPb, int port, size_t j) { tcp::acceptor acceptor_a(io_context, tcp::endpoint(tcp::v4(), port)); tcp::socket sb_a(acceptor_a.accept()); socketsPb[j] = std::move(sb_a); } int main(int argc, char *argv[]) { boost::asio::io_context io_context; tcp::resolver resolver(io_context); const std::string host1 = argv[1]; const size_t n_threads = atoi(argv[2]); const size_t number_of_sockets = 5 * n_threads; const size_t expo = atoi(argv[3]); const size_t maxRAM = atoi(argv[4]); const size_t db_nitems = 1ULL << expo; size_t RAM_needed_per_thread = 164 * db_nitems; std::cout << "RAM needed = " << n_threads*RAM_needed_per_thread << " bytes = " << n_threads*RAM_needed_per_thread/1073741824 << " GiB" << std::endl; std::cout << "RAM needed per thread = " << RAM_needed_per_thread << " bytes = " << (RAM_needed_per_thread>>30) << " GiB" << std::endl; size_t thread_per_batch = std::floor(double(maxRAM<<30)/RAM_needed_per_thread); if (thread_per_batch > n_threads) { thread_per_batch = n_threads; } std::cout << "thread_per_batch = " << thread_per_batch << std::endl; if (thread_per_batch < 1) { std::cout << "You need more RAM" << std::endl; exit(0); } size_t n_batches = std::ceil(double(n_threads)/thread_per_batch); std::cout << "n_batches = " << n_batches << std::endl; std::vector socketsPb; for (size_t j = 0; j < number_of_sockets + 1; ++j) { tcp::socket emptysocket(io_context); socketsPb.emplace_back(std::move(emptysocket)); } socketsPb.reserve(number_of_sockets + 1); std::vector ports; for (size_t j = 0; j < number_of_sockets; ++j) { int port = 6000; ports.push_back(port + j); } std::vector ports2_0; for (size_t j = 0; j < number_of_sockets; ++j) { int port = 20000; ports2_0.push_back(port + j); } std::vector ports2_1; for (size_t j = 0; j < number_of_sockets; ++j) { int port = 40000; ports2_1.push_back(port + j); } bool party; #if (PARTY == 0) party = false; for (size_t j = 0; j < number_of_sockets; ++j) { tcp::socket sb_a(io_context); boost::asio::connect(sb_a, resolver.resolve({host1, std::to_string(ports[j])})); socketsPb[j] = std::move(sb_a); } #else party = true; boost::asio::thread_pool pool2(number_of_sockets); for (size_t j = 0; j < number_of_sockets; ++j) { boost::asio::post(pool2, std::bind(accept_conncections_from_Pb, std::ref(io_context), std::ref(socketsPb), ports[j], j)); } pool2.join(); #endif __m128i *final_correction_word = (__m128i *)std::aligned_alloc(sizeof(__m256i), thread_per_batch * sizeof(__m128i)); AES_KEY aeskey; __m128i **output = (__m128i **)malloc(sizeof(__m128i *) * thread_per_batch); int8_t **flags = (int8_t **)malloc(sizeof(uint8_t *) * thread_per_batch); for (size_t j = 0; j < thread_per_batch; ++j) { output[j] = (__m128i *)std::aligned_alloc(sizeof(node_t), db_nitems * sizeof(__m128i)); flags[j] = (int8_t *)std::aligned_alloc(sizeof(node_t), db_nitems * sizeof(uint8_t)); } const size_t bits_per_leaf = std::is_same::value ? 1 : sizeof(leaf_t) * CHAR_BIT; const bool is_packed = (sizeof(leaf_t) < sizeof(node_t)); const size_t nodes_per_leaf = is_packed ? 1 : std::ceil(static_cast(bits_per_leaf) / (sizeof(node_t) * CHAR_BIT)); const size_t depth = std::ceil(std::log2(db_nitems)); const size_t nbits = std::ceil(std::log2(db_nitems)); const size_t nodes_in_interval = db_nitems - 1; auto start = std::chrono::steady_clock::now(); #ifdef VERBOSE printf("n_threads = %zu\n\n", n_threads); #endif uint8_t **target_share_read = new uint8_t *[thread_per_batch]; for (size_t i = 0; i < n_threads; i++) target_share_read[i] = new uint8_t[64]; for(size_t iters = 0; iters < n_batches; ++iters) { if (n_batches > 1) { printf("Starting evalfull_mpc batch %lu / %lu\n", iters+1, n_batches); } generate_random_targets(target_share_read, thread_per_batch, party, expo); boost::asio::thread_pool pool(thread_per_batch); for (size_t j = 0; j < thread_per_batch; ++j) { boost::asio::post(pool, std::bind(evalfull_mpc, std::ref(nodes_per_leaf), std::ref(depth), std::ref(nbits), std::ref(nodes_in_interval), std::ref(aeskey), target_share_read[j], std::ref(socketsPb), 0, db_nitems - 1, output[j], flags[j], std::ref(final_correction_word[j]), party, 5 * j)); } pool.join(); convert_shares(output, flags, thread_per_batch, db_nitems, final_correction_word, socketsPb[0], party); } for(size_t j = 0; j < thread_per_batch; ++j) { free(output[j]); free(flags[j]); delete[] target_share_read[j]; } free(output); free(flags); free(final_correction_word); delete[] target_share_read; auto end = std::chrono::steady_clock::now(); std::chrono::duration elapsed_seconds = end - start; std::cout << "WallClockTime: " << elapsed_seconds.count() << " s" << std::endl; std::cout << "CommunicationCost: " << communication_cost << " bytes" << std::endl; return 0; }