// Repository: avadapal / duoram
							#include <type_traits>  // std::is_same<>
#include <limits>       // std::numeric_limits<>
#include <climits>      // CHAR_BIT
#include <cmath>        // std::log2, std::ceil, std::floor
#include <stdexcept>    // std::runtime_error
#include <array>        // std::array<>
#include <iostream>     // std::istream and std::ostream
#include <vector>       // std::vector<>
#include <memory>       // std::shared_ptr<>
#include <utility>      // std::move
#include <algorithm>    // std::copy
#include <cstring>      // std::memcpy
#include <bsd/stdlib.h> // arc4random_buf
#include <x86intrin.h>  // SSE and AVX intrinsics
#include <../boost/asio/thread_pool.hpp>
#include <../boost/asio.hpp>
#include <../boost/lexical_cast.hpp>
#include <iostream> 
#include <chrono>
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <fstream>
#include <mutex>
#include "bitutils.h"
#include "block.h"
#include "prg_aes_impl.h" 
#include "filesio.h"

using boost::asio::ip::tcp;

using namespace dpf;
 

// Leaf and node aliases used by the sizing globals below and by the
// headers included later in this file. All three currently name the same
// 128-bit SSE integer vector type.
using leaf_type = __m128i;
using leaf_t    = __m128i;
using node_t    = __m128i;
 
// Shorthand for a Boost.Asio TCP socket; every connection in this file is
// stored and passed around as a socket_t.
using socket_t = boost::asio::ip::tcp::socket;

// Number of bits occupied by one leaf: 1 for bool leaves, otherwise the full
// width of leaf_t.
size_t bits_per_leaf = std::is_same<leaf_t, bool>::value ? 1 : sizeof(leaf_t) * CHAR_BIT;
// True when a leaf is strictly smaller than a node, i.e. several leaves share
// one node.
bool is_packed = (sizeof(leaf_t) < sizeof(node_t));
// How many leaves are stored in a single node (1 when leaves are not packed).
size_t leaves_per_node = is_packed ? sizeof(node_t) * CHAR_BIT / bits_per_leaf : 1;

/**
 * Returns ceil(log2(ceil(nitems / leaves_per_node))): the number of levels a
 * binary tree needs so that its leaf nodes can hold `nitems` leaves.
 *
 * The computation is done in integer arithmetic so that it is exact for every
 * size_t value; going through double can round log2 down for very large
 * non-power-of-two counts, and converting ceil(log2(0)) = -inf to size_t is
 * undefined behaviour.
 *
 * @param nitems number of leaves to be stored.
 * @return the tree depth; 0 when nitems is 0 or fits in a single node.
 */
size_t __depth(const size_t nitems)
{
  if (nitems == 0)
  {
    return 0;
  }

  // ceil(nitems / leaves_per_node), written so that it cannot overflow.
  const size_t nodes = (nitems - 1) / leaves_per_node + 1;

  // ceil(log2(nodes)) equals the bit width of (nodes - 1).
  size_t depth = 0;
  for (size_t span = nodes - 1; span != 0; span >>= 1)
  {
    ++depth;
  }
  return depth;
}

// Running byte counter; main() adds to it and prints it as "CommunicationCost".
size_t communication_cost = 0;
#include "mpc.h"
#include "dpfgen.h"
#include "share-conversion.h"

/**
 * Evaluates the pair of DPF instances stored at index `ind`.
 *
 * evaluate_dpfs is called twice over the index range [0, db_nitems - 1]:
 * first on dpf_instance0[ind], writing to output0[ind] / flags0[ind] with the
 * boolean argument false, then on dpf_instance1[ind], writing to
 * output1[ind] / flags1[ind] with the boolean argument true.
 *
 * @param depth          not used by this function.
 * @param prgkey         AES key forwarded to evaluate_dpfs.
 * @param db_nitems      number of items each evaluation covers.
 * @param n_threads      not used by this function.
 * @param sockets0       not used by this function.
 * @param sockets1       not used by this function.
 * @param output0,flags0 per-index destination buffers for the first instance.
 * @param output1,flags1 per-index destination buffers for the second instance.
 * @param dpf_instance0  array holding the first instance at position `ind`.
 * @param dpf_instance1  array holding the second instance at position `ind`.
 * @param ind            index selecting both the instances and the buffers.
 * @param socket_no      not used by this function.
 */
void mpc_gen (const size_t depth, AES_KEY& prgkey, const size_t db_nitems, 
              const size_t n_threads, std::vector<socket_t>& sockets0, std::vector<socket_t>& sockets1, __m128i** output0, int8_t ** flags0, __m128i** output1, int8_t ** flags1, dpfP2 * dpf_instance0 , 
              dpfP2 * dpf_instance1, size_t ind,size_t socket_no = 0)
{	
  evaluate_dpfs(db_nitems,  dpf_instance0[ind],   prgkey,  0, db_nitems-1,	output0[ind],  flags0[ind],  false,  ind);
  evaluate_dpfs(db_nitems,  dpf_instance1[ind],   prgkey,  0, db_nitems-1, output1[ind],  flags1[ind],  true ,  ind);

  #ifdef DEBUG
   // Dump the buffers this call just filled. Indexing with `ind` (rather
   // than a fixed 0) shows the right instance and avoids reading a buffer
   // that another pool thread may still be writing.
   for(size_t j = 0; j < db_nitems; ++j)
   {
     std::cout << j << "-> "  << (int) flags0[ind][j] << " <-> " << (int) flags1[ind][j] << std::endl;
     std::cout << j << "-> " << output0[ind][j][0] << " <-> " << output1[ind][j][0] << std::endl << std::endl;
   }
  #endif
}
 
/**
 * Listens on the given IPv4 TCP port, blocks until one peer connects, and
 * move-assigns the connected socket into sockets0[j].
 *
 * The slot is written with operator[], so the caller must make sure that
 * element j of the vector exists before this function runs.
 *
 * @param io_context execution context the acceptor is created on.
 * @param sockets0   table of sockets that receives the new connection.
 * @param port       TCP port to listen on.
 * @param j          index of the slot to fill.
 */
void accept_conncections_from_Pb(boost::asio::io_context&io_context, std::vector<socket_t>& sockets0, int port, size_t j)
{
  const tcp::endpoint listen_on(tcp::v4(), port);
  tcp::acceptor listener(io_context, listen_on);

  // accept() blocks until a peer connects and returns the connected socket.
  sockets0[j] = listener.accept();
}

int main(int argc, char* argv[])
{ 
  
 AES_KEY aeskey;
 boost::asio::io_context io_context;
 tcp::resolver resolver(io_context);
   
 const std::string host0 = (argc < 2) ? "127.0.0.1" : argv[1];
 const std::string host1 = (argc < 3) ? "127.0.0.1" : argv[2];  
 const size_t n_threads = atoi(argv[3]);
 const size_t number_of_sockets = 5 * n_threads;
 const size_t depth = atoi(argv[4]);
 const size_t db_nitems = 1ULL << depth;
 const size_t maxRAM = atoi(argv[5]);

      size_t RAM_needed_per_thread = 164 * db_nitems;
      std::cout << "RAM needed = " << n_threads*RAM_needed_per_thread << " bytes = " << n_threads*RAM_needed_per_thread/1073741824 << " GiB" << std::endl;
      std::cout << "RAM needed per thread = " << RAM_needed_per_thread << " bytes = " << (RAM_needed_per_thread>>30) << " GiB" << std::endl;
      size_t thread_per_batch = std::floor(double(maxRAM<<30)/RAM_needed_per_thread);
      if (thread_per_batch > n_threads) {
	thread_per_batch = n_threads;
      }
      std::cout << "thread_per_batch = " << thread_per_batch << std::endl;
      if (thread_per_batch < 1) {
       std::cout << "You need more RAM" << std::endl;
       exit(0);
      }
      size_t n_batches = std::ceil(double(n_threads)/thread_per_batch);
      std::cout << "n_batches = " << n_batches << std::endl;

 std::vector<int> ports2_0;
 for(size_t j = 0; j < number_of_sockets; ++j) 
 {
   int port = 22000;
   ports2_0.push_back(port + j);
 }

 std::vector<int> ports2_1;
 for(size_t j = 0; j < number_of_sockets; ++j) 
 {
   int port = 42000;
   ports2_1.push_back(port + j);
 }


 std::vector<socket_t> sockets0;
 std::vector<socket_t> sockets1;
 sockets0.reserve(number_of_sockets + 1);
 sockets1.reserve(number_of_sockets + 1);

 boost::asio::thread_pool pool2(number_of_sockets * 2); 
 
 for(size_t j = 0; j < number_of_sockets; ++j)
 {
   boost::asio::post(pool2, std::bind(accept_conncections_from_Pb,  std::ref(io_context), std::ref(sockets1), ports2_1[j],  j));
 }

 for(size_t j = 0; j < number_of_sockets; ++j)
 {
  boost::asio::post(pool2, std::bind(accept_conncections_from_Pb,  std::ref(io_context), std::ref(sockets0), ports2_0[j],  j));
 }

 pool2.join();


   auto start = std::chrono::steady_clock::now(); 


    cw_construction computecw0_array, computecw1_array;
  
  // for(size_t i = 0; i < 128; ++i)
  // {
  //   for(size_t j = 0; j < depth; ++j)
  //   {
     
        __m128i rand0, rand1, gamma0, gamma1;

        arc4random_buf(&rand0, sizeof(__m128i));
        arc4random_buf(&rand1, sizeof(__m128i));
        uint8_t bit0, bit1; 
        bit0 = rand();
        bit0 = bit0 % 2;
        bit1 = rand();
        bit1 = bit1 %2;

        gamma0 = (bit1 == 1) ? rand0 : _mm_setzero_si128();
        gamma1 = (bit0 == 1) ? rand1 : _mm_setzero_si128();


        computecw0_array.rand_b   = rand0;
        computecw0_array.gamma_b  = gamma0;
        computecw0_array.bit_b    = bit0;

        computecw1_array.rand_b   = rand1;
        computecw1_array.gamma_b  = gamma1;
        computecw1_array.bit_b    = bit1;

    //   }
    // }


  boost::asio::write(sockets0[0], boost::asio::buffer(&computecw0_array,  sizeof(computecw0_array)));
  boost::asio::write(sockets1[0], boost::asio::buffer(&computecw1_array,  sizeof(computecw1_array)));
  communication_cost += sizeof(computecw0_array);
  communication_cost += sizeof(computecw1_array);
 
  dpfP2 * dpf_instance0 = (dpfP2 * ) malloc (sizeof(dpfP2) * n_threads);
  dpfP2 * dpf_instance1 = (dpfP2 * ) malloc (sizeof(dpfP2) * n_threads);

  boost::asio::read(sockets0[0], boost::asio::buffer(dpf_instance0, n_threads * sizeof(dpfP2)));
  boost::asio::read(sockets1[0], boost::asio::buffer(dpf_instance1, n_threads * sizeof(dpfP2))); 


  __m128i ** output0 = (__m128i ** ) malloc(sizeof(__m128i *) * thread_per_batch);
  int8_t  ** flags0  = (int8_t ** ) malloc(sizeof(uint8_t *) * thread_per_batch);
   
  for(size_t j = 0; j < thread_per_batch; ++j)
  {
    output0[j] = (__m128i *)std::aligned_alloc(sizeof(node_t), db_nitems * sizeof(__m128i));
    flags0[j]  = (int8_t *)std::aligned_alloc(sizeof(node_t), db_nitems * sizeof(uint8_t));
  }
  
  __m128i ** output1 = (__m128i ** ) malloc(sizeof(__m128i *) * thread_per_batch);
  int8_t  ** flags1  = (int8_t ** ) malloc(sizeof(uint8_t *) * thread_per_batch);
   
  for(size_t j = 0; j < thread_per_batch; ++j)
  {
    output1[j] = (__m128i *)std::aligned_alloc(sizeof(node_t), db_nitems * sizeof(__m128i));
    flags1[j]  = (int8_t *)std::aligned_alloc(sizeof(node_t), db_nitems * sizeof(uint8_t));
  }

  for(size_t iter = 0; iter < n_batches; ++iter)
  { 
    if (n_batches > 1) {
      printf("Starting mpc_gen batch %lu / %lu\n", iter+1, n_batches);
    }
    boost::asio::thread_pool pool(thread_per_batch); 
    for(size_t j = 0; j < thread_per_batch; ++j)
    {
     boost::asio::post(pool, std::bind(mpc_gen,  std::ref(depth), std::ref(aeskey), db_nitems, n_threads,  std::ref(sockets0), std::ref(sockets1), 
                                        output0, flags0,  output1, flags1,  std::ref(dpf_instance0), std::ref(dpf_instance1), j, 5 * j));    
    }  

    pool.join();
  }
 

  free(dpf_instance0);
  free(dpf_instance1);

  boost::asio::thread_pool pool3(thread_per_batch); 
  
 int64_t ** leaves0    = (int64_t ** ) malloc(sizeof(int64_t *) * thread_per_batch);
 int64_t ** leafbits0  = (int64_t ** ) malloc(sizeof(int64_t *) * thread_per_batch); 
 int64_t ** leaves1    = (int64_t ** ) malloc(sizeof(int64_t *) * thread_per_batch);
 int64_t ** leafbits1  = (int64_t ** ) malloc(sizeof(int64_t *) * thread_per_batch); 


 for(size_t j = 0; j < thread_per_batch; ++j)
 {
  leaves0[j]    = (int64_t *)std::aligned_alloc(sizeof(node_t), db_nitems * sizeof(int64_t));
  leafbits0[j]  = (int64_t *)std::aligned_alloc(sizeof(node_t), db_nitems * sizeof(int64_t));
  leaves1[j]    = (int64_t *)std::aligned_alloc(sizeof(node_t), db_nitems * sizeof(int64_t));
  leafbits1[j]  = (int64_t *)std::aligned_alloc(sizeof(node_t), db_nitems * sizeof(int64_t));
 }

 /* The function convert_sharesP2 appears in share-conversion.h */
  for(size_t j = 0; j < thread_per_batch; ++j)
  {
   boost::asio::post(pool3, std::bind(convert_sharesP2, db_nitems,  output0, flags0,  output1, flags1, leaves0, leafbits0, leaves1, leafbits1,  std::ref(sockets0), std::ref(sockets1), j, j));    
  }

  pool3.join(); 
  
  /* The function P2_xor_to_additive appears in share-conversion.h */
  boost::asio::thread_pool pool4(thread_per_batch); 
  for(size_t j = 0; j < thread_per_batch; ++j)
  {
   boost::asio::post(pool4,  std::bind(P2_xor_to_additive, std::ref(sockets0[j]), std::ref(sockets1[j]), j));
  }
  pool4.join();
 
  for(size_t j = 0; j < thread_per_batch; ++j)
  {
    free(leafbits0[j]); 
    free(leafbits1[j]);
    free(output0[j]);
    free(output1[j]);
  }  

  free(leafbits0);
  free(leafbits1);
  free(output1);
  free(output0);

    /* For the artifact, don't actually write these in order to not use very
     * large amounts of storage

  for(size_t i = 0; i < thread_per_batch; ++i)
  {
   P2_write_evalfull_outs_into_a_file(false, i, db_nitems,  flags0[i], 	leaves0[i]);
   P2_write_evalfull_outs_into_a_file(true,  i, db_nitems,  flags1[i], 	leaves1[i]);
  }
  */

  for(size_t j = 0; j < thread_per_batch; ++j)
  {
    free(leaves0[j]); 
    free(leaves1[j]);
    free(flags0[j]);
    free(flags1[j]);
  } 

  free(leaves0);
  free(leaves1);
  free(flags0);
  free(flags1);

  #ifdef DEBUG
   for(size_t ind = 0; ind < n_threads; ++ind)
   {
    for(size_t j = 0; j < db_nitems; ++j)
    {
     if(flags0[ind][j] + flags1[ind][j] != 0)
     {
      std::cout << j << "-> "  << (int) (flags0[ind][j] + flags1[ind][j]) << " = " << (int) (flags0[ind][j])  << " + " << (int) (flags1[ind][j]) << std::endl;
      std::cout << j << "-> " << output0[ind][j][0] << " <-> " << output1[ind][j][0] << std::endl << std::endl;
     }
    }
   }
  #endif


    auto end = std::chrono::steady_clock::now();
    std::chrono::duration<double> elapsed_seconds = end-start;
    //std::cout << "time to generate and evaluate " << n_threads << " dpfs of size 2^" << atoi(argv[4]) << " is: " << elapsed_seconds.count() << "s\n";
    std::cout << "WallClockTime: "  << elapsed_seconds.count() << std::endl;
    std::cout << "CommunicationCost: " << communication_cost << " bytes" << std::endl;
  return 0;
}