// DPF key material handed to P2. create_dpfs fills one instance and writes
// it to a P2 socket as sizeof(dpfP2) raw bytes; evaluate_dpfs consumes one.
// Member order and sizes therefore define the wire layout.
struct dpfP2
{
	// Number of per-layer slots in each fixed-size array below.
	enum { max_layers = 26 };

	// Root seed of the tree.
	__m128i root;

	// CW[layer]: correction word applied to the nodes of that layer.
	__m128i CW[max_layers];

	// cwt_L[layer] / cwt_R[layer]: correction bits for the left / right
	// child control bits of that layer.
	uint8_t cwt_L[max_layers];
	uint8_t cwt_R[max_layers];
};

/**
 * @brief Allocates one 64-byte target-share buffer per thread and fills it.
 *
 * Despite the name, the shares are not random: every byte is 0, except that
 * when @p party is true byte `expo - 2` of every buffer is set to 1.
 *
 * @param target_share_read Array of at least @p n_threads pointers; entry i is
 *                          overwritten with a buffer from `new uint8_t[64]`
 *                          that the caller must release with `delete[]`.
 * @param n_threads         Number of buffers to allocate.
 * @param party             When true, byte `expo - 2` of each buffer is set to 1.
 * @param expo              Selects the marked byte (`expo - 2`). If that index
 *                          falls outside [0, 64) no byte is marked, rather than
 *                          writing outside the buffer.
 */
void generate_random_targets(uint8_t ** target_share_read, size_t n_threads, bool party, size_t expo)
{
	const size_t share_len = 64;

	// The previous implementation called srand(3) for every element it filled.
	// rand() is not used here, but the seed is process-wide state, so it is
	// still set once in case other code depends on it.
	if (n_threads > 0)
	{
		srand(3);
	}

	// `expo - 2` wraps around for expo < 2 and exceeds the buffer for
	// expo > 65; only mark the byte when the index is valid.
	const bool mark_target = party && expo >= 2 && (expo - 2) < share_len;

	for (size_t i = 0; i < n_threads; ++i)
	{
		// Trailing () value-initialises the array, i.e. all bytes are 0.
		target_share_read[i] = new uint8_t[share_len]();

		if (mark_target)
		{
			target_share_read[i][expo - 2] = 1;
		}
	}
}

// Values compute_CW receives from P2 in a single raw read of
// sizeof(cw_construction) bytes; member order defines the wire layout.
struct cw_construction
{
	// Mask XORed into L ^ R before it is sent to the peer.
	__m128i rand_b;

	// Value XORed into this party's output share.
	__m128i gamma_b;

	// Mask XORed into this party's selection bit before it is sent.
	uint8_t bit_b;
};

// Blinded values the two parties swap in compute_CW. Each side writes one
// instance to the peer socket and reads one back as sizeof(BlindsCW) raw
// bytes, so member order defines the wire layout.
struct BlindsCW
{
	// L ^ R ^ rand_b of the sending party.
	__m128i blinded_message;
	// bit ^ bit_b of the sending party.
	uint8_t blinded_bit;
};


/**
 * @brief Derives the two correction bits of a layer together with the peer.
 *
 * Each party forms two "advice" bits, lsb(L) ^ bit and lsb(R) ^ bit, sends
 * both to the peer over @p sout and reads the peer's two bytes back.
 *
 * @param sout  Socket to the peer; written first, then read.
 * @param L     XOR of this party's left children on the layer.
 * @param R     XOR of this party's right children on the layer.
 * @param bit   This party's share of the target bit for the layer.
 * @param cwt_L Out: peer byte 0 ^ own left advice ^ 1.
 * @param cwt_R Out: peer byte 1 ^ own right advice.
 */
void compute_CW_bits(tcp::socket& sout,   __m128i L, __m128i R, uint8_t bit, uint8_t &cwt_L, uint8_t &cwt_R)
{
	// Index 0 is the left advice bit, index 1 the right one.
	uint8_t mine[2];
	mine[0] = get_lsb(L) ^ bit;
	mine[1] = get_lsb(R) ^ bit;

	uint8_t theirs[2];

	boost::asio::write(sout, boost::asio::buffer(&mine, sizeof(mine)));
	boost::asio::read(sout, boost::asio::buffer(&theirs, sizeof(theirs)));

	// Only the left bit carries the extra flip.
	cwt_L = theirs[0] ^ mine[0] ^ 1;
	cwt_R = theirs[1] ^ mine[1];
}

/**
 * @brief Jointly computes a layer's correction word with the peer (OSWAP).
 *
 * Message order: read a cw_construction from @p sin; swap BlindsCW with the
 * peer over @p sout; swap output shares with the peer over @p sout. Under
 * DEBUG, extra reads from @p sin and extra swaps over @p sout verify the
 * result.
 *
 * @param sout Socket to the peer; every exchange writes first, then reads.
 * @param sin  Socket from P2.
 * @param L    XOR of this party's left children on the layer.
 * @param R    XOR of this party's right children on the layer.
 * @param bit  This party's share of the selection bit.
 * @param CW   Out: XOR of both parties' output shares.
 */
void compute_CW(tcp::socket& sout, tcp::socket& sin, __m128i L, __m128i R, uint8_t bit, __m128i & CW)
{
	// Communication from P2
	cw_construction dealt;
	read(sin, boost::asio::buffer(&dealt, sizeof(dealt)));

	#ifdef DEBUG
		// Second, field-by-field read of the same three values from P2.
		__m128i rand_check, gamma_check;
		uint8_t bit_check;
		read(sin, boost::asio::buffer(&rand_check, sizeof(dealt.rand_b)));
		read(sin, boost::asio::buffer(&gamma_check, sizeof(dealt.gamma_b)));
		read(sin, boost::asio::buffer(&bit_check, sizeof(dealt.bit_b)));

		assert(rand_check[0] == dealt.rand_b[0]);
		assert(rand_check[1] == dealt.rand_b[1]);
		assert(gamma_check[0] == dealt.gamma_b[0]);
		assert(gamma_check[1] == dealt.gamma_b[1]);
		assert(bit_check == dealt.bit_b);
	#endif

	const __m128i L_xor_R = L ^ R;

	// Exchange blinded shares for OSWAP.
	// NOTE(review): `mine` is not zeroed, so its padding bytes go on the
	// wire uninitialised - confirm whether that matters.
	BlindsCW mine, theirs;
	mine.blinded_message = L_xor_R ^ dealt.rand_b;
	mine.blinded_bit     = bit ^ dealt.bit_b;

	boost::asio::write(sout, boost::asio::buffer(&mine, sizeof(BlindsCW)));
	boost::asio::read(sout, boost::asio::buffer(&theirs, sizeof(BlindsCW)));

	// This party's share of the output.
	__m128i share = R ^ dealt.gamma_b;
	if (bit)
	{
		share ^= (L_xor_R ^ theirs.blinded_message);
	}
	if (theirs.blinded_bit)
	{
		share ^= dealt.rand_b;
	}

	// Swap shares; CW first holds the peer's share, then the XOR of both.
	boost::asio::write(sout, boost::asio::buffer(&share, sizeof(share)));
	boost::asio::read(sout, boost::asio::buffer(&CW, sizeof(CW)));
	CW ^= share;

	// The following asserts the correctness of ComputeCW
	#ifdef DEBUG
		uint8_t bit_full;
		boost::asio::write(sout, boost::asio::buffer(&bit, sizeof(bit)));
		boost::asio::read(sout, boost::asio::buffer(&bit_full, sizeof(bit_full)));
		bit_full ^= bit;

		__m128i L_full;
		boost::asio::write(sout, boost::asio::buffer(&L, sizeof(L)));
		boost::asio::read(sout, boost::asio::buffer(&L_full, sizeof(L_full)));
		L_full ^= L;

		__m128i R_full;
		boost::asio::write(sout, boost::asio::buffer(&R, sizeof(R)));
		boost::asio::read(sout, boost::asio::buffer(&R_full, sizeof(R_full)));
		R_full ^= R;

		// Reconstructed bit 1 selects L, 0 selects R.
		__m128i expected;
		if (bit_full == 0)
		{
			expected = R_full;
		}
		else
		{
			expected = L_full;
		}

		assert(expected[0] == CW[0]);
		assert(expected[1] == CW[1]);
	#endif
}


/**
 * @brief Expands one tree node into its two children.
 *
 * @param prgkey Key handed to dpf::PRG.
 * @param seed   Parent node; its two low bits are cleared before expansion.
 * @param s      Out: receives the two child nodes.
 */
template<typename node_t, typename prgkey_t>
static inline void traverse(const prgkey_t & prgkey, const node_t & seed,	node_t s[2])
{
	const auto masked_seed = clear_lsb(seed, 0b11);
	dpf::PRG(prgkey, masked_seed, s, 2);
} // traverse


/**
 * @brief Generates this party's share of a DPF, layer by layer, interacting
 *        with the peer party and P2.
 *
 * A random root is drawn and its lsb is set to @p party. For each of the
 * ceil(log2(db_nitems)) layers the current nodes are expanded with
 * traverse(), the left and right children are XOR-accumulated, compute_CW()
 * and compute_CW_bits() are run with the peer, and the corrections are
 * applied. The resulting dpfP2 is then written to P2.
 *
 * Precondition: ceil(log2(db_nitems)) must not exceed the 26 per-layer slots
 * of dpfP2 (nor the 64 entries of @p target_share); this is not checked.
 *
 * @param db_nitems Number of database items; determines the tree depth.
 * @param prgkey The PRG Key
 * @param target_share Per-layer share of the target bits; entry `layer` is
 *        used at tree layer `layer`.
 * @param socketsPb Array of sockets to write to Pb (uses socket_no,
 *        socket_no + 1 and, under DEBUG, socket_no + 3).
 * @param socketsP2 Array of sockets to write to P2 (uses socket_no and
 *        socket_no + 1).
 * @param from First index of the interval; its bits choose the starting
 *        child offset on every layer.
 * @param to Last index (inclusive) of @p output summed into the final
 *        correction word.
 * @param output Node buffer; the two layer buffers used during expansion
 *        live inside it and the final layer is summed from it.
 * @param _t Buffer for the per-node control bits, split in two halves the
 *        same way as the node buffer.
 * @param final_correction_word the final correction word is written in to this
 * @param party Party; also the lsb of the root seed.
 * @param socket_no Base index into both socket arrays.
 * @param ind Instance index; only instance 0 prints under VERBOSE / DEBUG.
 */
inline void create_dpfs (size_t db_nitems,
						 const AES_KEY& prgkey,  uint8_t target_share[64], std::vector<socket_t>& socketsPb, std::vector<socket_t>& socketsP2,
						 const size_t from, const size_t to, __m128i * output, int8_t * _t, __m128i& final_correction_word, bool party, size_t socket_no, size_t ind = 0)
{
	const size_t bits_per_leaf = std::is_same<leaf_t, bool>::value ? 1 : sizeof(leaf_t) * CHAR_BIT;
	const bool  is_packed = (sizeof(leaf_t) < sizeof(node_t));
	const size_t nodes_per_leaf = is_packed ? 1 : std::ceil(static_cast<double>(bits_per_leaf) / (sizeof(node_t) * CHAR_BIT));

	const size_t depth = std::ceil(std::log2(db_nitems));
	const size_t nbits = std::ceil(std::log2(db_nitems));
	const size_t nodes_in_interval = db_nitems-1;

	__m128i root;

	arc4random_buf(&root, sizeof(root));

	root =	set_lsb(root, party);

	const size_t from_node = std::floor(static_cast<double>(from) / nodes_per_leaf);

	// Two node buffers inside `output`; layers alternate between them.
	__m128i * s[2];
	s[0] = reinterpret_cast<__m128i *>(output) + nodes_in_interval * (nodes_per_leaf - 1);
	s[1] = s[0] + nodes_in_interval / 2;

	int8_t * t[2] = { _t, _t + nodes_in_interval / 2};

	// Chosen so that after `depth` flips the last layer lands in s[0] / t[0].
	int curlayer = depth % 2;

	s[curlayer][0] = root;
	t[curlayer][0] = get_lsb(root, 0b01);

	#ifdef VERBOSE
		if(ind == 0)
		{
			std::cout << "root = "         << root[0] << " " << root[1] << std::endl;
			std::cout << "t[curlayer][0] " << (int) t[curlayer][0] << std::endl;
		}
	#endif

	// Zero-initialised: the whole struct is written to P2 below, so unused
	// layer slots must not carry uninitialised stack bytes.
	dpfP2 dpf_instance = dpfP2();
	dpf_instance.root = root;

	// Correction words are produced directly in the key that is sent to P2.
	// This replaces a std::aligned_alloc buffer that was never freed and
	// whose size was not a multiple of its alignment for odd depths.
	__m128i * CW = dpf_instance.CW;

	for (size_t layer = 0; layer < depth; ++layer)
	{
			#ifdef VERBOSE
				printf("layer = %zu\n", layer);
			#endif

			curlayer = 1-curlayer;

			size_t i=0, j=0;
			auto nextbit = (from_node >> (nbits-layer-1)) & 1;
			size_t nodes_in_prev_layer = std::ceil(static_cast<double>(nodes_in_interval) / (1ULL << (depth-layer)));
			size_t nodes_in_cur_layer = std::ceil(static_cast<double>(nodes_in_interval) / (1ULL << (depth-layer-1)));

			// XOR of all left / right children produced on this layer.
			__m128i L =  _mm_setzero_si128();
			__m128i R =  _mm_setzero_si128();

			for (i = nextbit, j = nextbit; j < nodes_in_prev_layer-1; ++j, i+=2)
			{
				traverse(prgkey, s[1-curlayer][j], &s[curlayer][i]);
				L ^= s[curlayer][i];
				R ^= s[curlayer][i+1];
			}

			// Last parent of the layer, expanded only if both children fit.
			if (nodes_in_prev_layer > j)
			{
				if (i < nodes_in_cur_layer - 1)
				{
					traverse(prgkey, s[1-curlayer][j], &s[curlayer][i]);
					L ^= s[curlayer][i];
					R ^= s[curlayer][i+1];
				}
			}

			// Computes the correction word using OSWAP
			compute_CW(socketsPb[socket_no], socketsP2[socket_no], L,  R, target_share[layer], CW[layer]);

			uint8_t cwt_L, cwt_R;

			// Computes the correction word bits
			compute_CW_bits(socketsPb[socket_no+1], L,  R, target_share[layer], cwt_L,  cwt_R);

			#ifdef DEBUG
				if(ind == 0)
				{
					std::cout << "CW reconstruction  = " << CW[layer][0] << " " << CW[layer][1] << std::endl;
					std::cout << "			   cwt_L = " << (int) cwt_L << std::endl;
					std::cout << "			   cwt_R = " << (int) cwt_R << std::endl;
				}
			#endif

			// CW[layer] already lives in dpf_instance; only the bits remain.
			dpf_instance.cwt_L[layer]   =  cwt_L;
			dpf_instance.cwt_R[layer]   =  cwt_R;

			// Apply the corrections to both children of every parent.
			for(size_t k = 0; k < nodes_in_prev_layer; ++k)
			{
				t[curlayer][2*k] 	 = get_lsb(s[curlayer][2*k]) ^ (cwt_L & t[1-curlayer][k]);
				s[curlayer][2*k] 	 = clear_lsb(xor_if(s[curlayer][2*k], CW[layer], !t[1-curlayer][k]), 0b11);
				t[curlayer][(2*k)+1] = get_lsb(s[curlayer][(2*k)+1]) ^ (cwt_R & t[1-curlayer][k]);
				s[curlayer][(2*k)+1] = clear_lsb(xor_if(s[curlayer][(2*k)+1], CW[layer], !t[1-curlayer][k]), 0b11);
			}
	}

	boost::asio::write(socketsP2[socket_no+1], boost::asio::buffer(&dpf_instance, sizeof(dpfP2)));

	__m128i Gamma  =  _mm_setzero_si128();

	for (size_t i = 0; i < to + 1; ++i)
	{
	  Gamma[0] += output[i][0]; // the correction word for duoram update
	  Gamma[1] += output[i][1]; // the correction word for share conversion
	}

	if(party)
	{
	  Gamma[0] = -Gamma[0];  // the correction word for duoram update
	  Gamma[1] = -Gamma[1];  // the correction word for share conversion
	}

	#ifdef DEBUG
		// The value read here is overwritten by the assignment below; only
		// the exchange itself has an effect.
		boost::asio::write(socketsPb[socket_no + 3], boost::asio::buffer(&Gamma, sizeof(Gamma)));
 		boost::asio::read(socketsPb[socket_no + 3], boost::asio::buffer(&final_correction_word, sizeof(final_correction_word)));
	#endif

 	final_correction_word = Gamma;

} // create_dpfs



/**
 * @brief Expands a received DPF key into its nodes and control bits.
 *
 * Starting from the key's root (with its lsb set to @p party), every layer
 * is expanded with traverse() and corrected with the key's per-layer
 * correction word and correction bits. No network I/O is performed.
 *
 * Precondition: ceil(log2(db_nitems)) must not exceed the 26 per-layer slots
 * of dpfP2; this is not checked.
 *
 * @param db_nitems   Number of database items; determines the tree depth.
 * @param dpfinstance The key to evaluate (root, correction words and bits).
 * @param prgkey      The PRG key used to expand nodes.
 * @param from        First index of the interval; its bits choose the
 *                    starting child offset on every layer.
 * @param to          Not used by this function.
 * @param output      Node buffer; the two layer buffers live inside it.
 * @param _t          Buffer for the per-node control bits, split in two
 *                    halves the same way as the node buffer.
 * @param party       Written into the lsb of the root seed.
 * @param ind         Instance index; only instance 0 prints under VERBOSE.
 */
inline void evaluate_dpfs( size_t db_nitems,  dpfP2 dpfinstance,  const AES_KEY& prgkey,   const size_t from, const size_t to, 
							__m128i * output, int8_t * _t,  bool party,  size_t ind)
{
	const size_t bits_per_leaf = std::is_same<leaf_t, bool>::value ? 1 : sizeof(leaf_t) * CHAR_BIT;
	const bool  is_packed = (sizeof(leaf_t) < sizeof(node_t));
	const size_t nodes_per_leaf = is_packed ? 1 : std::ceil(static_cast<double>(bits_per_leaf) / (sizeof(node_t) * CHAR_BIT));

	const size_t depth = std::ceil(std::log2(db_nitems));
	const size_t nbits = std::ceil(std::log2(db_nitems));
	const size_t nodes_in_interval = db_nitems-1;

	__m128i root = dpfinstance.root;

	// `dpfinstance` is this function's own copy, so its arrays are used in
	// place. This replaces three std::aligned_alloc buffers that were never
	// freed and whose sizes were not multiples of the requested alignment.
	__m128i * CW    = dpfinstance.CW;
	uint8_t * cwt_L = dpfinstance.cwt_L;
	uint8_t * cwt_R = dpfinstance.cwt_R;

	root =	set_lsb(root, party);

	const size_t from_node = std::floor(static_cast<double>(from) / nodes_per_leaf);

	// Two node buffers inside `output`; layers alternate between them.
	__m128i * s[2];
	s[0] = reinterpret_cast<__m128i *>(output) + nodes_in_interval * (nodes_per_leaf - 1);
	s[1] = s[0] + nodes_in_interval / 2;

	int8_t * t[2] = { _t, _t + nodes_in_interval / 2};

	// Chosen so that after `depth` flips the last layer lands in s[0] / t[0].
	int curlayer = depth % 2;

	s[curlayer][0] = root;
	t[curlayer][0] = get_lsb(root, 0b01);

	#ifdef VERBOSE
		if(ind == 0)
		{
			std::cout << "root = "         << root[0] << " " << root[1] << std::endl;
			std::cout << "t[curlayer][0] " << (int) t[curlayer][0] << std::endl;
		}
	#endif

	for (size_t layer = 0; layer < depth; ++layer)
	{
			#ifdef VERBOSE
				printf("layer = %zu\n", layer);
			#endif

			curlayer = 1-curlayer;

			size_t i=0, j=0;
			auto nextbit = (from_node >> (nbits-layer-1)) & 1;
			size_t nodes_in_prev_layer = std::ceil(static_cast<double>(nodes_in_interval) / (1ULL << (depth-layer)));
			size_t nodes_in_cur_layer = std::ceil(static_cast<double>(nodes_in_interval) / (1ULL << (depth-layer-1)));

			for (i = nextbit, j = nextbit; j < nodes_in_prev_layer-1; ++j, i+=2)
			{
				traverse(prgkey, s[1-curlayer][j], &s[curlayer][i]);
			}

			// Last parent of the layer, expanded only if both children fit.
			if (nodes_in_prev_layer > j)
			{
				if (i < nodes_in_cur_layer - 1)
				{
					traverse(prgkey, s[1-curlayer][j], &s[curlayer][i]);
				}
			}

			#ifdef VERBOSE
				if(ind == 0)
				{
					std::cout << "CW reconstruction  = " << CW[layer][0] << " " << CW[layer][1] << std::endl;
					std::cout << "			   cwt_L = " << (int) cwt_L[layer] << std::endl;
					std::cout << "			   cwt_R = " << (int) cwt_R[layer] << std::endl;
				}
			#endif

			// Apply the corrections to both children of every parent.
			for(size_t k = 0; k < nodes_in_prev_layer; ++k)
			{
				t[curlayer][2*k] 	 = get_lsb(s[curlayer][2*k]) ^ (cwt_L[layer] & t[1-curlayer][k]);
				s[curlayer][2*k] 	 = clear_lsb(xor_if(s[curlayer][2*k], CW[layer], !t[1-curlayer][k]), 0b11);
				t[curlayer][(2*k)+1] = get_lsb(s[curlayer][(2*k)+1]) ^ (cwt_R[layer] & t[1-curlayer][k]);
				s[curlayer][(2*k)+1] = clear_lsb(xor_if(s[curlayer][(2*k)+1], CW[layer], !t[1-curlayer][k]), 0b11);
			}
	}
} // evaluate_dpfs