// iang / duoram-prac-repro
							#include <type_traits> // std::is_same<>
#include <limits>	   // std::numeric_limits<>
#include <climits>	   // CHAR_BIT
#include <cmath>	   // std::log2, std::ceil, std::floor
#include <stdexcept>   // std::runtime_error
#include <array>	   // std::array<>
#include <iostream>	   // std::istream and std::ostream
#include <vector>	   // std::vector<>
#include <memory>	   // std::shared_ptr<>
#include <utility>	   // std::move
#include <algorithm>   // std::copy
#include <cstring>	   // std::memcpy

#include <bsd/stdlib.h> // arc4random_buf
#include <x86intrin.h>	// SSE and AVX intrinsics
#include <boost/asio/thread_pool.hpp>

// Running byte counter reported by main() as "CommunicationCost".
// compute_CW() and evalfull_mpc() add the size of selected exchanged buffers
// to it after the corresponding socket read.
// NOTE(review): evalfull_mpc() is posted to a boost::asio::thread_pool, so
// several worker threads may update this plain size_t without any
// synchronisation — confirm whether an approximate total is acceptable.
size_t communication_cost = 0;

#include "bitutils.h"
#include "block.h"
#include "prg.h"

#include "prg_aes_impl.h"

#include <iostream>

#include <fcntl.h>
#include <cstdlib>
#include "block.h"
#include <chrono>
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <fstream>
#include <future>
#include <boost/asio.hpp>

using boost::asio::ip::tcp;

#include <mutex>
#include <boost/lexical_cast.hpp>

// Shorthand for the TCP socket type used for all party-to-party links.
using socket_t = boost::asio::ip::tcp::socket;

// Raw byte alias.
typedef unsigned char byte_t;

// A tree node is a single 128-bit SSE register value.
typedef __m128i node_t;
// Not referenced by the code visible in this file — presumably a seed for
// generating blinding values; TODO confirm against other translation units.
block<__m128i> seed_for_blinds;
// Number of leaf_type elements stored in one leaf.
constexpr size_t leaf_size = 1;
// Element type of a leaf: also a 128-bit value.
typedef __m128i leaf_type;

// A leaf is a fixed-size array of leaf_size elements of leaf_type.
typedef std::array<leaf_type, leaf_size> leaf_t;

// Size of a leaf in bits. leaf_t is never bool here, so the second arm of the
// conditional is the one that applies.
size_t bits_per_leaf = std::is_same<leaf_t, bool>::value ? 1 : sizeof(leaf_t) * CHAR_BIT;
// True only when a leaf is strictly smaller than a node, i.e. when several
// leaves would fit into one node.
bool is_packed = (sizeof(leaf_t) < sizeof(node_t));
// How many leaves one node holds: more than one only in the packed case.
// Note: main() declares its own local bits_per_leaf / is_packed that shadow
// these globals.
size_t leaves_per_node = is_packed ? sizeof(node_t) * CHAR_BIT / bits_per_leaf : 1;

/**
 * Number of bits needed to address `nitems` distinct items, i.e.
 * ceil(log2(nitems)).
 *
 * Computed with integer arithmetic only, so the result is exact for every
 * size_t value (a floating-point log2 loses precision above 2^53) and is
 * well-defined for the degenerate inputs 0 and 1, which both yield 0.
 *
 * @param nitems number of items to index
 * @return smallest `bits` such that 2^bits >= nitems
 */
size_t input_bits(const size_t nitems)
{
	const size_t max_bits = sizeof(size_t) * CHAR_BIT;
	size_t bits = 0;
	// The width check comes first so the shift count never reaches the full
	// width of size_t (which would be undefined behaviour).
	while (bits < max_bits && (static_cast<size_t>(1) << bits) < nitems)
	{
		++bits;
	}
	return bits;
}

// Global leaf value; not referenced by the code visible in this file.
// NOTE(review): possibly used by one of the included project headers — confirm.
leaf_t val;

// Makes the dpf namespace's names usable unqualified in the rest of this file
// and in every header included after this point.
using namespace dpf;

#include "mpc.h"
/**
 * Allocate one target-share buffer per thread and fill it with random bytes.
 *
 * Each `target_share_read[i]` is set to a freshly `new[]`-allocated array of
 * 64 bytes filled by arc4random_buf(). Ownership of every array passes to the
 * caller, who must release it with `delete[]`.
 *
 * @param target_share_read array of at least `n_threads` pointers to fill in
 * @param n_threads         number of buffers to allocate
 * @param party             unused; kept for interface compatibility
 * @param expo              unused; kept for interface compatibility
 */
void generate_random_targets(uint8_t **target_share_read, size_t n_threads, bool party, size_t expo)
{
	(void)party;
	(void)expo;

	// Matches the `uint8_t target_share[64]` parameter of evalfull_mpc().
	constexpr size_t target_share_len = 64;

	for (size_t i = 0; i < n_threads; ++i)
	{
		target_share_read[i] = new uint8_t[target_share_len];
		// One call fills the whole buffer instead of one call per byte.
		arc4random_buf(target_share_read[i], target_share_len);
	}
}

void compute_CW(bool party, tcp::socket &sout, __m128i L, __m128i R, uint8_t bit, __m128i &CW)
{

	// struct cw_construction
	//{
	__m128i rand_b, gamma_b;
	uint8_t bit_b;
	//};

	__m128i *X, *Y;

	if (party)
	{
		std::string qfile = std::string("./gamma1");
		int qfd = open(qfile.c_str(), O_RDWR);
		X = (__m128i *)mmap(NULL, 8 * sizeof(__m128i),
							PROT_READ, MAP_PRIVATE, qfd, 0);

		qfile = std::string("./x1");
		qfd = open(qfile.c_str(), O_RDWR);
		Y = (__m128i *)mmap(NULL, 8 * sizeof(__m128i),
							PROT_READ, MAP_PRIVATE, qfd, 0);
	}

	if (!party)
	{
		std::string qfile = std::string("./gamma0");
		int qfd = open(qfile.c_str(), O_RDWR);
		X = (__m128i *)mmap(NULL, 8 * sizeof(__m128i),
							PROT_READ, MAP_PRIVATE, qfd, 0);

		qfile = std::string("./x0");
		qfd = open(qfile.c_str(), O_RDWR);
		Y = (__m128i *)mmap(NULL, 8 * sizeof(__m128i),
							PROT_READ, MAP_PRIVATE, qfd, 0);
	}

	// cw_construction computecw;
	//	read(sin, boost::asio::buffer(&computecw, sizeof(computecw)));

	// computecw.rand_b;
	//__m128i gamma_b = computecw.gamma_b;

	if (party)
	{
		rand_b = Y[0];	//_mm_set_epi32(0x6fef9434, 0x6768121e, 0x20942286, 0x1b59f7a7);
		gamma_b = X[0]; // _mm_set_epi32(0x6a499109 , 0x803067dd , 0xd1e2281b , 0xe71b6262);
		bit_b = 1;		// computecw.bit_b;
	}
	else
	{
		rand_b = Y[0];	// _mm_set_epi32(0xb29747df, 0xf7300f6d, 0x9476d971, 0xd5f75d98);
		gamma_b = X[0]; // _mm_set_epi32(0xb73142e2 , 0x10687aae , 0x06500d3ec , 0x29b5c85d);
		bit_b = 1;		// computecw.bit_b;
	}

	uint8_t blinded_bit, blinded_bit_read;
	blinded_bit = bit ^ bit_b;

	__m128i blinded_L = L ^ R ^ rand_b;
	__m128i blinded_L_read;

	struct BlindsCW
	{
		__m128i blinded_message;
		uint8_t blinded_bit;
	};

	BlindsCW blinds_sent, blinds_recv;

	blinds_sent.blinded_bit = blinded_bit;
	blinds_sent.blinded_message = blinded_L;

	boost::asio::write(sout, boost::asio::buffer(&blinds_sent, sizeof(blinds_sent)));
	boost::asio::read(sout, boost::asio::buffer(&blinds_recv, sizeof(blinds_recv)));
 communication_cost += sizeof(blinds_recv);
	
 blinded_bit_read = blinds_recv.blinded_bit;
	blinded_L_read = blinds_recv.blinded_message;

	__m128i out_ = R ^ gamma_b; //_mm_setzero_si128;

	if (bit)
	{
		out_ ^= (L ^ R ^ blinded_L_read);
	}
	if (blinded_bit_read)
	{
		out_ ^= rand_b;
	}

	__m128i out_reconstruction;
	boost::asio::write(sout, boost::asio::buffer(&out_, sizeof(out_)));
	boost::asio::read(sout, boost::asio::buffer(&out_reconstruction, sizeof(out_reconstruction)));
 communication_cost += sizeof(out_reconstruction);
	
 out_reconstruction = out_ ^ out_reconstruction;

	CW = out_reconstruction;

 #ifdef DEBUG
  uint8_t bit_reconstruction;
  boost::asio::write(sout, boost::asio::buffer(&bit, sizeof(bit)));
  boost::asio::read(sout, boost::asio::buffer(&bit_reconstruction, sizeof(bit_reconstruction)));
  bit_reconstruction = bit ^ bit_reconstruction;

  __m128i L_reconstruction;
  boost::asio::write(sout, boost::asio::buffer(&L, sizeof(L)));
  boost::asio::read(sout, boost::asio::buffer(&L_reconstruction, sizeof(L_reconstruction)));
  L_reconstruction = L ^ L_reconstruction;

  __m128i R_reconstruction;
  boost::asio::write(sout, boost::asio::buffer(&R, sizeof(R)));
  boost::asio::read(sout, boost::asio::buffer(&R_reconstruction, sizeof(R_reconstruction)));
  R_reconstruction = R ^ R_reconstruction;

  __m128i CW_debug;

  if (bit_reconstruction != 0)
  {
   CW_debug = L_reconstruction;
  }
  else
  {
   CW_debug = R_reconstruction;
  }

  assert(CW_debug[0] == CW[0]);
  assert(CW_debug[1] == CW[1]);
 #endif
}

/**
 * Build a 128-bit mask by left-shifting each all-ones 32-bit lane by a
 * per-lane count derived from `n`.
 *
 * Each lane's count is a 16-bit saturating subtraction of `n` from that lane's
 * limit (128, 96, 64, 32 from the lowest lane to the highest). For
 * 0 <= n <= 128 this leaves exactly the `n` most-significant bits of the
 * register set.
 *
 * @param n number of high bits to set
 * @return the resulting mask
 */
__m128i bit_mask_avx2_msb(unsigned int n)
{
	// Lane limits, listed lowest lane first.
	const __m128i lane_limits = _mm_setr_epi32(128, 96, 64, 32);
	const __m128i requested = _mm_set1_epi32(n);

	// Saturating at zero means a fully covered lane is not shifted at all.
	const __m128i shift_counts = _mm_subs_epu16(lane_limits, requested);

	// Counts of 32 or more clear the lane entirely.
	return _mm_sllv_epi32(_mm_set1_epi32(-1), shift_counts);
}

/**
 * Build a 128-bit mask by right-shifting each all-ones 32-bit lane by a
 * per-lane count derived from `n`.
 *
 * Each lane's count is a 16-bit saturating subtraction of `n` from that lane's
 * limit (32, 64, 96, 128 from the lowest lane to the highest). For
 * 0 <= n <= 128 this leaves exactly the `n` least-significant bits of the
 * register set.
 *
 * @param n number of low bits to set
 * @return the resulting mask
 */
__m128i bit_mask_avx2_lsb(unsigned int n)
{
	// Lane limits, listed lowest lane first.
	const __m128i lane_limits = _mm_setr_epi32(32, 64, 96, 128);
	const __m128i requested = _mm_set1_epi32(n);

	// Saturating at zero means a fully covered lane is not shifted at all.
	const __m128i shift_counts = _mm_subs_epu16(lane_limits, requested);

	// Counts of 32 or more clear the lane entirely.
	return _mm_srlv_epi32(_mm_set1_epi32(-1), shift_counts);
}

// Expand one parent seed into its two child seeds.
//
// Calls dpf::PRG with the given key on `seed` after clear_lsb(seed, 0b11) has
// been applied to it, asking for 2 output blocks written to s[0] and s[1].
// NOTE(review): presumably clear_lsb(.., 0b11) zeroes the two low bits that
// carry flag information so they do not influence the PRG output — confirm
// against bitutils.h.
//
// The template parameter `node_t` shadows the file-level node_t typedef inside
// this function.
template <typename node_t, typename prgkey_t>
static inline void traverse(const prgkey_t &prgkey, const node_t &seed, node_t s[2])
{
	dpf::PRG(prgkey, clear_lsb(seed, 0b11), s, 2);
} // dpf::expand

inline void evalfull_mpc(const size_t &nodes_per_leaf, const size_t &depth, const size_t &nbits, const size_t &nodes_in_interval,
						 const AES_KEY &prgkey, uint8_t target_share[64], std::vector<socket_t> &socketsPb,
						 const size_t from, const size_t to, __m128i *output, int8_t *_t, __m128i &final_correction_word, bool party, size_t socket_no = 0)
{

	__m128i root;

	arc4random_buf(&root, sizeof(root));

	root = set_lsb(root, party);

	const size_t from_node = std::floor(static_cast<double>(from) / nodes_per_leaf);

	__m128i *s[2] = {
		reinterpret_cast<__m128i *>(output) + nodes_in_interval * (nodes_per_leaf - 1),
		s[0] + nodes_in_interval / 2};

	int8_t *t[2] = {_t, _t + nodes_in_interval / 2};

	int curlayer = depth % 2;

	s[curlayer][0] = root;
	t[curlayer][0] = get_lsb(root, 0b01);

	__m128i *CW = (__m128i *)std::aligned_alloc(sizeof(__m256i), depth * sizeof(__m128i));

	for (size_t layer = 0; layer < depth; ++layer)
	{
   #ifdef VERBOSE
		  printf("layer = %zu\n", layer);
   #endif
		 curlayer = 1 - curlayer;

		 size_t i = 0, j = 0;
		 auto nextbit = (from_node >> (nbits - layer - 1)) & 1;
		 size_t nodes_in_prev_layer = std::ceil(static_cast<double>(nodes_in_interval) / (1ULL << (depth - layer)));
		 size_t nodes_in_cur_layer = std::ceil(static_cast<double>(nodes_in_interval) / (1ULL << (depth - layer - 1)));

		 __m128i L = _mm_setzero_si128();
		 __m128i R = _mm_setzero_si128();

   for (i = nextbit, j = nextbit; j < nodes_in_prev_layer - 1; ++j, i += 2)
   {
    traverse(prgkey, s[1 - curlayer][j], &s[curlayer][i]);
    L ^= s[curlayer][i];
    R ^= s[curlayer][i + 1];
   }

   if (nodes_in_prev_layer > j)
   {
    if (i < nodes_in_cur_layer - 1)
    {
     traverse(prgkey, s[1 - curlayer][j], &s[curlayer][i]);
     L ^= s[curlayer][i];
     R ^= s[curlayer][i + 1];
    }
   }

		 compute_CW(party, socketsPb[socket_no], L, R, target_share[layer], CW[layer]);

		 uint8_t advice_L = get_lsb(L) ^ target_share[layer];
		 uint8_t advice_R = get_lsb(R) ^ target_share[layer];

		 uint8_t cwt_L, cwt_R;
		 uint8_t advice[2];
		 uint8_t cwts[2];
		 advice[0] = advice_L;
		 advice[1] = advice_R;

		 boost::asio::write(socketsPb[socket_no + 1], boost::asio::buffer(&advice, sizeof(advice)));
		 boost::asio::read(socketsPb[socket_no + 1], boost::asio::buffer(&cwts, sizeof(cwts)));

		 cwt_L = cwts[0];
		 cwt_R = cwts[1];

		 cwt_L = cwt_L ^ advice_L ^ 1;
		 cwt_R = cwt_R ^ advice_R;

		 for (size_t j = 0; j < nodes_in_prev_layer; ++j)
		 {
			 t[curlayer][2 * j] = get_lsb(s[curlayer][2 * j]) ^ (cwt_L & t[1 - curlayer][j]);
			 s[curlayer][2 * j] = clear_lsb(xor_if(s[curlayer][2 * j], CW[layer], !t[1 - curlayer][j]), 0b11);
			 t[curlayer][(2 * j) + 1] = get_lsb(s[curlayer][(2 * j) + 1]) ^ (cwt_R & t[1 - curlayer][j]);
			 s[curlayer][(2 * j) + 1] = clear_lsb(xor_if(s[curlayer][(2 * j) + 1], CW[layer], !t[1 - curlayer][j]), 0b11);
		 }
	 }

	__m128i Gamma = _mm_setzero_si128();

	for (size_t i = 0; i < to + 1; ++i)
	{
		Gamma[0] += output[i][0];
		Gamma[1] += output[i][1];
	}

	if (party)
	{
		Gamma[0] = -Gamma[0];
		Gamma[1] = -Gamma[1];
	}

	boost::asio::write(socketsPb[socket_no + 3], boost::asio::buffer(&Gamma, sizeof(Gamma)));
	boost::asio::read(socketsPb[socket_no + 3], boost::asio::buffer(&final_correction_word, sizeof(final_correction_word)));
 communication_cost += sizeof(Gamma);
	final_correction_word = Gamma; // final_correction_word + Gamma;

} // dpf::__evalinterval

/**
 * Negate party 1's seed and flag values in place.
 *
 * When `party` is true, every output[k][j] and flags[k][j] for k < n_threads
 * and j < db_nitems is replaced by its negation; party 0 leaves its values
 * untouched. With DEBUG defined, thread 0's values are additionally exchanged
 * with the peer over `sb` and any entry whose two values do not sum to zero is
 * printed.
 *
 * A trailing pass draws a random 64-bit value per thread and tallies the
 * non-zero flags; the results of that pass are not used.
 *
 * @param output                per-thread seed arrays, modified in place
 * @param flags                 per-thread flag arrays, modified in place
 * @param n_threads             number of per-thread arrays
 * @param db_nitems             number of entries in each array
 * @param final_correction_word unused
 * @param sb                    socket to the peer (used only with DEBUG)
 * @param party                 which of the two parties this process is
 */
void convert_shares(__m128i **output, int8_t **flags, size_t n_threads, size_t db_nitems, __m128i *final_correction_word, tcp::socket &sb, bool party)
{
	for (size_t item = 0; item < db_nitems; ++item)
	{
		if (party)
		{
			for (size_t thread = 0; thread < n_threads; ++thread)
			{
				output[thread][item] = -output[thread][item];
				flags[thread][item] = -flags[thread][item];
			}
		}

#ifdef DEBUG
		// Reconstruct thread 0's flag for this item and report non-zero sums.
		int8_t flag_share = flags[0][item];
		int8_t flag_sum;

		boost::asio::write(sb, boost::asio::buffer(&flag_share, sizeof(flag_share)));
		boost::asio::read(sb, boost::asio::buffer(&flag_sum, sizeof(flag_sum)));
		flag_sum = flag_sum + flag_share;

		if (flag_sum != 0)
		{
			std::cout << item << "(flags) --> " << (int)flag_sum << std::endl
					  << std::endl;
		}

		// Same check for thread 0's seed value (low 64-bit half only).
		__m128i seed_share = output[0][item];
		__m128i seed_sum;

		boost::asio::write(sb, boost::asio::buffer(&seed_share, sizeof(seed_share)));
		boost::asio::read(sb, boost::asio::buffer(&seed_sum, sizeof(seed_sum)));
		seed_sum = seed_sum + seed_share;
		if (seed_sum[0] != 0)
		{
			std::cout << item << "--> " << seed_sum[0] << std::endl;
		}
#endif
	}

	// Party 1 counts downwards, party 0 upwards.
	const int64_t step = party ? -1 : 1;

	for (size_t thread = 0; thread < n_threads; ++thread)
	{
		int64_t pm = 0;
		int64_t rb;

		arc4random_buf(&rb, sizeof(rb));
		for (size_t item = 0; item < db_nitems; ++item)
		{
			if (flags[thread][item] != 0)
			{
				pm += step; // flags[0][j];
			}
		}
	}
}

/**
 * Listen on `port` (IPv4), accept exactly one incoming connection and store
 * the connected socket in socketsPb[j].
 *
 * Blocks until a peer connects. The acceptor is closed when the function
 * returns.
 *
 * @param io_context I/O context the acceptor and socket are bound to
 * @param socketsPb  socket table; slot `j` is overwritten
 * @param port       TCP port to listen on
 * @param j          index of the slot to fill
 */
void accept_conncections_from_Pb(boost::asio::io_context &io_context, std::vector<socket_t> &socketsPb, int port, size_t j)
{
	const tcp::endpoint listen_endpoint(tcp::v4(), port);
	tcp::acceptor listener(io_context, listen_endpoint);
	socketsPb[j] = listener.accept();
}

int main(int argc, char *argv[])
{

	boost::asio::io_context io_context;
	tcp::resolver resolver(io_context);
	const std::string host1 = argv[1];
 

	const size_t n_threads = atoi(argv[2]);
	const size_t number_of_sockets = 5 * n_threads;
	const size_t expo = atoi(argv[3]);

	const size_t maxRAM = atoi(argv[4]);

	const size_t db_nitems = 1ULL << expo;

      size_t RAM_needed_per_thread = 164 * db_nitems;
      std::cout << "RAM needed = " << n_threads*RAM_needed_per_thread << " bytes = " << n_threads*RAM_needed_per_thread/1073741824 << " GiB" << std::endl;
      std::cout << "RAM needed per thread = " << RAM_needed_per_thread << " bytes = " << (RAM_needed_per_thread>>30) << " GiB" << std::endl;
      size_t thread_per_batch = std::floor(double(maxRAM<<30)/RAM_needed_per_thread);
      if (thread_per_batch > n_threads) {
	thread_per_batch = n_threads;
      }
      std::cout << "thread_per_batch = " << thread_per_batch << std::endl;
      if (thread_per_batch < 1) {
       std::cout << "You need more RAM" << std::endl;
       exit(0);
      }
      size_t n_batches = std::ceil(double(n_threads)/thread_per_batch);
      std::cout << "n_batches = " << n_batches << std::endl;

	std::vector<socket_t> socketsPb;
	for (size_t j = 0; j < number_of_sockets + 1; ++j)
	{
		tcp::socket emptysocket(io_context);
		socketsPb.emplace_back(std::move(emptysocket));
	}
	socketsPb.reserve(number_of_sockets + 1);


	std::vector<int> ports;
	for (size_t j = 0; j < number_of_sockets; ++j)
	{
		int port = 6000;
		ports.push_back(port + j);
	}

	std::vector<int> ports2_0;
	for (size_t j = 0; j < number_of_sockets; ++j)
	{
		int port = 20000;
		ports2_0.push_back(port + j);
	}

	std::vector<int> ports2_1;
	for (size_t j = 0; j < number_of_sockets; ++j)
	{
		int port = 40000;
		ports2_1.push_back(port + j);
	}

bool party;

#if (PARTY == 0)
	party = false; 
	for (size_t j = 0; j < number_of_sockets; ++j)
	{
		tcp::socket sb_a(io_context);
		boost::asio::connect(sb_a, resolver.resolve({host1, std::to_string(ports[j])}));
		socketsPb[j] = std::move(sb_a);
	}
#else
	party = true;	
	boost::asio::thread_pool pool2(number_of_sockets);
	for (size_t j = 0; j < number_of_sockets; ++j)
	{
		boost::asio::post(pool2, std::bind(accept_conncections_from_Pb, std::ref(io_context), std::ref(socketsPb), ports[j], j));
	}

	pool2.join();
#endif

 
	__m128i *final_correction_word = (__m128i *)std::aligned_alloc(sizeof(__m256i), thread_per_batch * sizeof(__m128i));

	AES_KEY aeskey;

	__m128i **output = (__m128i **)malloc(sizeof(__m128i *) * thread_per_batch);
	int8_t **flags = (int8_t **)malloc(sizeof(uint8_t *) * thread_per_batch);

	for (size_t j = 0; j < thread_per_batch; ++j)
	{
		output[j] = (__m128i *)std::aligned_alloc(sizeof(node_t), db_nitems * sizeof(__m128i));
		flags[j] = (int8_t *)std::aligned_alloc(sizeof(node_t), db_nitems * sizeof(uint8_t));
	}

	const size_t bits_per_leaf = std::is_same<leaf_t, bool>::value ? 1 : sizeof(leaf_t) * CHAR_BIT;
	const bool is_packed = (sizeof(leaf_t) < sizeof(node_t));
	const size_t nodes_per_leaf = is_packed ? 1 : std::ceil(static_cast<double>(bits_per_leaf) / (sizeof(node_t) * CHAR_BIT));
	const size_t depth = std::ceil(std::log2(db_nitems));
	const size_t nbits = std::ceil(std::log2(db_nitems));
	const size_t nodes_in_interval = db_nitems - 1;
	auto start = std::chrono::steady_clock::now();


#ifdef VERBOSE
		printf("n_threads = %zu\n\n", n_threads);
#endif
 

 for(size_t iters = 0; iters < n_batches; ++iters)
{
   if (n_batches > 1) {
    printf("Starting evalfull_mpc batch %lu / %lu\n", iters+1, n_batches);
   }
   uint8_t **target_share_read = new uint8_t *[thread_per_batch];
   generate_random_targets(target_share_read, thread_per_batch, party, expo);
   boost::asio::thread_pool pool(thread_per_batch);
   for (size_t j = 0; j < thread_per_batch; ++j)
   {
    boost::asio::post(pool, std::bind(evalfull_mpc, std::ref(nodes_per_leaf), std::ref(depth), std::ref(nbits), std::ref(nodes_in_interval),
              std::ref(aeskey), target_share_read[j], std::ref(socketsPb), 0, db_nitems - 1, output[j],
              flags[j], std::ref(final_correction_word[j]), party, 5 * j));
   }

   pool.join();


   convert_shares(output, flags, thread_per_batch, db_nitems, final_correction_word, socketsPb[0], party);
}
 auto end = std::chrono::steady_clock::now();
	std::chrono::duration<double> elapsed_seconds = end - start;
	std::cout << "WallClockTime: " << elapsed_seconds.count() << " s" << std::endl;
 std::cout << "CommunicationCost: " << communication_cost << " bytes" << std::endl;
 

	if(!party)
	{
		char const *p0_filename0;
		p0_filename0 = "../duoram-online/preprocflags/party0_read_flags_b";
		int w0 = open(p0_filename0, O_WRONLY | O_CREAT, S_IWRITE | S_IREAD);
		int written = write(w0, flags[0], db_nitems * sizeof(flags[0][0]));
  if (written < 0) perror("Write error");
		close(w0);
	}
	else
	{
		char const *p0_filename0;
		p0_filename0 = "../duoram-online/preprocflags/party1_read_flags_b";
		int w0 = open(p0_filename0, O_WRONLY | O_CREAT, S_IWRITE | S_IREAD);
		int written = write(w0, flags[0], db_nitems * sizeof(flags[0][0]));
		if (written < 0) perror("Write error"); 
		close(w0);
	}

	return 0;
}