/* Copyright (C) 2014 Carlos Aguilar Melchor, Joris Barrier, Marc-Olivier Killijian * This file is part of XPIR. * * XPIR is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * XPIR is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with XPIR. If not, see . */ #include "NFLlib.hpp" #include void DEBUG_MESSAGE(const char *s, poly64 p, unsigned int n){ #ifdef DEBUG std::cout< temp*phi**(polyDegree-1) = phi**(-1) invphi = mulmod(temp, phis[currentModulus][polyDegree-1], moduli[currentModulus]); // Computation of the inverse of polyDegree using the inverse of kMaxPolyDegree invpolyDegree[currentModulus] = mulmod(invkMaxPolyDegree[currentModulus], kMaxPolyDegree/polyDegree, moduli[currentModulus]); // Now we can compute the table invpoly_times_invphis temp = invpolyDegree[currentModulus]; for (unsigned int i = 0 ; i < polyDegree ; i++) { invpoly_times_invphis[currentModulus][i] = temp; shoupinvpoly_times_invphis[currentModulus][i] = ((uint128_t) temp << 64) / moduli[currentModulus]; // This is invpolyDegree*invphi**(i+1) temp = mulmod(temp, invphi, moduli[currentModulus]); } // For the omegas it is easy, we just use the function of David Harvey modified for our needs omega = mulmod(phi, phi, moduli[currentModulus]); prep_wtab(omegas[currentModulus], shoupomegas[currentModulus], omega, polyDegree, moduli[currentModulus]); // And again for the invomegas invomega = mulmod(invphi, invphi, moduli[currentModulus]); prep_wtab(invomegas[currentModulus], shoupinvomegas[currentModulus], invomega, polyDegree, moduli[currentModulus]); // Inverse-CRT constants mpz_t tmpz, mpz_inverse; mpz_init(tmpz); mpz_init(mpz_inverse); // Compute first the product of all moduli but the current mpz_init_set_ui(liftingIntegers[currentModulus],1UL); for (unsigned int i = 0 ; i < nbModuli ; i++) { if (i == currentModulus) continue; mpz_import(tmpz, 1, 1, sizeof(uint64_t), 0, 0, &P64[i]); mpz_mul(liftingIntegers[currentModulus], liftingIntegers[currentModulus], tmpz); } // Compute the inverse of the product modulo the current modulus and multiply it with the product mpz_import(tmpz, 1, 1, sizeof(uint64_t), 0, 0, &moduli[currentModulus]); mpz_invert(mpz_inverse, liftingIntegers[currentModulus], tmpz); mpz_mul(liftingIntegers[currentModulus], liftingIntegers[currentModulus], mpz_inverse); mpz_clear(tmpz); mpz_clear(mpz_inverse); } // Compute the product of all moduli mpz_t tmpz; mpz_init(tmpz); mpz_init_set_ui(moduliProduct,1UL); for (unsigned int i = 0 ; i < nbModuli ; i++) { mpz_import(tmpz, 1, 1, sizeof(uint64_t), 0, 0, &moduli[i]); mpz_mul(moduliProduct, moduliProduct, tmpz); } mpz_clear(tmpz); uint64_t* inv_indexes_tmp = malloc_align<32, uint64_t>(polyDegree); inv_indexes = malloc_align<16>(polyDegree, inv_indexes); // Compute permutation indexes for inv_ntt for (size_t i = 0; i < polyDegree; i++) { size_t ii = i, r = 0; for (unsigned h = 1; h < polyDegree; h=h<<1) { r = (r << 1) | (ii & 1); ii >>= 1; } inv_indexes_tmp[i] = r; } // Invert the previous permutation. I think we can do better than that :) for (size_t i = 0; i < polyDegree; i++) { for (size_t j = 0; j < polyDegree; j++) { if (inv_indexes_tmp[j] == i) { inv_indexes[i] = j; break; } } } free(inv_indexes_tmp); alreadyInit = nbModuli; } // ********************************************************************* // Fundamental setters (resulting on a call to the init function above) // ********************************************************************* // Set or reset the degrees of the polynomials used and the modulus size (which fixes the modulus) void NFLlib::setNewParameters(unsigned int polyDegree_, unsigned int aggregatedModulusBitsize_) { // We don't use setpolyDegree to avoid configureNTT being called twice polyDegree = polyDegree_; // We use a special function for setting the modulus as it can be called independently // and requires some processing. This function also ensures that the NTT params are configured. setmodulus(aggregatedModulusBitsize_); } // Sets the modulus size AND configures NTT parmeters (which fixes a given modulus) void NFLlib::setmodulus(uint64_t aggregatedModulusBitsize_) { // For the CRT, from the aggregated modulus bitsize, we compute the number of necessary moduli if (aggregatedModulusBitsize_ % kModulusBitsize != 0) { std::cout << "NFLlib: CRITICAL. Modulus of " << aggregatedModulusBitsize_ << " requested but only integer multiples of " << kModulusBitsize << " bits implemented. Exiting ..." << std::endl; exit(-1); } nbModuli=aggregatedModulusBitsize_/kModulusBitsize; configureNTT(); } // Sets polyDegree AND configures NTT void NFLlib::setpolyDegree(unsigned int polyDegree_) { polyDegree = polyDegree_; configureNTT(); } // ********************************************************* // Getters // ********************************************************* uint64_t* NFLlib::getmoduli() { return moduli; } unsigned short NFLlib::getnbModuli() { return nbModuli; } unsigned int NFLlib::getpolyDegree() { return polyDegree; } void NFLlib::copymoduliProduct(mpz_t dest) { mpz_init_set(dest, moduliProduct); } // ************************************** // Random polynomial generation functions // ************************************** // Two modes uniform or bounded (if uniform is false) // WARNING : The bounded mode only works for a bound // below the smaller of the moduli -> we use or for bounded noise // Allocates and sets a bounded random polynomial in FFT form calling setBoundedRandomPoly poly64 NFLlib::allocBoundedRandomPoly(uint64_t upperBound_, bool uniform_) { poly64 res = (poly64)calloc(polyDegree * nbModuli, sizeof(uint64_t)); setBoundedRandomPoly(res, upperBound_, uniform_); return res; } // Sets a pre-allocated random polynomial in FFT form // If uniform = true upperBound is ignored and the coefficients are uniformly random // ASSUMPTION: if uniform = false upperBound is below all of the moduli used void NFLlib::setBoundedRandomPoly(poly64 res, uint64_t upperBound_, bool uniform_) { poly64 rnd, rnd_orig; uint64_t mask; if (uniform_ == false){ // In bounded mode upperBound must be below the smaller of the moduli for (unsigned int cm = 0 ; cm < nbModuli ; cm++) { if (upperBound_ >= moduli[cm]) { std::cout << "NFLlib: upperBound is larger than the moduli in setBoundedRandomPoly."; std::cout << " Unpredictable results ..." << std::endl; break; } } // We play with the rnd pointer (in the uniform case), and thus // we need to remember the allocated pointer to free it at the end rnd_orig = (poly64) malloc(polyDegree * sizeof(uint64_t)); rnd = rnd_orig; // Get some randomness from the PRNG fastrandombytes((unsigned char *)rnd, polyDegree * sizeof(uint64_t)); // upperBound is below the moduli so we create the same mask for all the moduli mask=(1ULL << (unsigned int)ceil(log2(upperBound_))) -1; for(unsigned int i=0;i=upperBound_) { rnd[i]-=upperBound_; } for (unsigned int cm = 0 ; cm < nbModuli ; cm++) { res[polyDegree*cm+i] = rnd[i]; } } } else // uniform == true { // In uniform mode we need randomness for all the polynomials in the CRT rnd_orig = (poly64) malloc(polyDegree * nbModuli * sizeof(uint64_t)); // We play with the rnd pointer (in the uniform case), and thus // we need to remember the allocated pointer to free it at the end rnd = rnd_orig; fastrandombytes((unsigned char *)rnd, polyDegree * nbModuli * sizeof(uint64_t)); for (unsigned int cm = 0 ; cm < nbModuli ; cm++) { // In the uniform case, instead of getting a big random (within the general moduli), // We rather prefer, for performance issues, to get smaller randoms for each module // The mask should be the same for all moduli (because they are the same size) // But for generality we prefer to compute it for each moduli so that we could have // moduli of different bitsize mask=(1ULL << (int)ceil(log2(moduli[cm]))) -1; for(unsigned int i=0;i=moduli[cm]) { rnd[i]-=moduli[cm]; } res[i] = rnd[i]; } rnd+=polyDegree; res+=polyDegree; } } free(rnd_orig); } // ********************************************************* // Data import and export main functions // ********************************************************* // Takes an array of buffers and: // 1) Converts them into a set of polynomials with arbitrary large coefficients // 2) Reduces the polys through CRT to have nbModuli contiguous polys with uint64_t coefficients // 3) Does the NTT transform // - inArrayOfBuffers array of buffers to take the bits from // - nbrOfBuffers nbr of buffers in the array // - dataBitsizePerBuffer bits that can be taken from each buffer // - bitsPerCoordinate bits used to create each coefficient (can be > 64 !) // - polyNumber set by the function to say how many polynomials are in the returned pointer poly64 *NFLlib::deserializeDataNFL(unsigned char **inArrayOfBuffers, uint64_t nbrOfBuffers, uint64_t dataBitsizePerBuffer, unsigned bitsPerCoordinate, uint64_t &polyNumber) { // We need to handle dataBitsize bits of data per buffer // each poly can take publicParams.getAbsorptionBitsize() bits so polyNumber = ceil((double)dataBitsizePerBuffer*(double)nbrOfBuffers/(double)(bitsPerCoordinate*polyDegree)); // The uint64_t arrays are allocated and filled with zeros // So that we do not have to pad with zeros beyond the limit poly64* deserData = (poly64 *) calloc(polyNumber, sizeof(poly64)); // bitsplitter does all the hard work WITHOUT using large numbers ! deserData[0] = bitsplitter(inArrayOfBuffers, nbrOfBuffers, dataBitsizePerBuffer, bitsPerCoordinate); // We finish the work by applying the NTT transform #ifdef MULTI_THREAD #pragma omp parallel for #endif for (unsigned int i = 0 ; i < polyNumber ; i++) { deserData[i] = deserData[0]+i*nbModuli*polyDegree; #ifndef SIMULATE_PRE_NTT_DATA nttAndPowPhi(deserData[i]); #endif } return deserData; } // Serialize an array of poly64 elements into a compact byte buffer // Takes a set of polynomial coefficients and outputs their concatenation // - indata points to the polynomial coefficients // - outdata points to the concatenation obtained // - bitsPerChunk defines how many bits has each coefficient // - nb_of_uint64 defines how many coefficients must be concatenated // ASSUMPTION: all the polynomials are contiguously allocated // ASSUMPTION: outdata has allocated one more uint64_t than needed // ASSUMPTION: all the coefficients have the same size which is below 56 bits void NFLlib::serializeData64 (uint64_t* indata, unsigned char* outdata, unsigned int bitsPerChunk, uint64_t nb_of_uint64) { unsigned char *tmppointer; uint64_t *pointer64; pointer64 = (uint64_t *) outdata; uint32_t bitswritten=0; // Tricky approach playing with pointers to be able to infinitely add // up to 56 bits to any bit string present for (uint64_t i = 0 ; i < nb_of_uint64 ;) { while(bitswritten + bitsPerChunk <= 64) { *pointer64 |= (*indata++)<>3; pointer64 = (uint64_t *) (tmppointer); bitswritten -=8*(bitswritten>>3); } } // Serialize an array of poly64 elements into a compact byte buffer // Takes a set of polynomial coefficients and outputs their concatenation // - indata points to the polynomial coefficients // - outdata points to the concatenation obtained // - bitsPerChunk defines how many bits has each coefficient // - nb_of_uint64 defines how many coefficients must be concatenated // ASSUMPTION: all the polynomials are contiguously allocated // ASSUMPTION: outdata has allocated one more uint64_t than needed // IMPORTANT NOTE: Unlike in serializeData64 bitsPerChunk can be arbitrarily large. This function considers that large coefficients are retrieved by blocks of varying size up to 32 bit. For example 100-bit coefficients will be retrieved by three blocks of 32 bits and a block of 4 looping that way for each 100-bit coefficient. void NFLlib::serializeData32 (uint32_t* indata, unsigned char* outdata, unsigned int bitsPerChunk, uint64_t nb_of_uint32){ unsigned char *tmppointer; uint64_t *pointer64; pointer64 = (uint64_t *) outdata; uint32_t bitswritten=0; // See through how many block (=sub-chunk) we will have to loop to get each coefficient (=chunk) const double uint32PerChunk = (double)bitsPerChunk/32; const uint64_t int_uint32PerChunk = ceil(uint32PerChunk); const bool isint_uint32PerChunk = (uint32PerChunk==(double)int_uint32PerChunk); // Build masks for each sub-chunk uint64_t subchunkMasks[int_uint32PerChunk]; unsigned int subchunkSizes[int_uint32PerChunk]; // Increment with subchunkIndex=((subchunkIndex+1)%int_uint64PerChunk) // and use with subchunkSizes[subchunkIndex]; unsigned int subchunkIndex = 0; for (int i = 0 ; i < int_uint32PerChunk - 1 ; i++) { subchunkSizes[i]=32; subchunkMasks[i] = (1ULL<<32)-1; // const mask for extracting 32 bits } subchunkSizes[int_uint32PerChunk-1] = bitsPerChunk - 32 * (int_uint32PerChunk - 1); subchunkMasks[int_uint32PerChunk-1] = (1ULL<<(subchunkSizes[int_uint32PerChunk-1]))-1; // Apply the same approach than in serializeData64 but with varying sizes for (uint64_t i = 0 ; i < nb_of_uint32 ;) { while(bitswritten + subchunkSizes[subchunkIndex] <= 64) { *pointer64 |= ((uint64_t)(*indata++))<>3; pointer64 = (uint64_t *) (tmppointer); bitswritten -=8*(bitswritten>>3); } } // ********************************************************* // Helper functions // ********************************************************* // Allocate a polynomial potentially with all coefficients set to zero if nullpoly = true poly64 NFLlib::allocpoly(bool nullpoly) { if (nullpoly == true) return (poly64) calloc(polyDegree*nbModuli,sizeof(uint64_t)); else return (poly64) malloc(polyDegree*nbModuli*sizeof(uint64_t)); } // Lift a polynomial in CRT representation, into a polynomial with large integer coefficients mpz_t* NFLlib::poly2mpz(poly64 p) { mpz_t* resultmpz; mpz_t* tmpzbuffer; resultmpz=new mpz_t[polyDegree]; tmpzbuffer=new mpz_t[nbModuli]; for(unsigned i=0;i>32)<<(unsigned int) p[i] << " "; } std::cout << "]" << std::dec << std::endl; } // Debug printing function void NFLlib::print_poly64(poly64 p, unsigned int coeff_nbr) { std::cout << "["; for (unsigned int i = 0 ; i < coeff_nbr ; i++) { std::cout << p[i] << " "; } std::cout << "]" << std::endl; } // ********************************************************* // Destructors and closing or freeing functions // ********************************************************* // Destructor NFLlib::~NFLlib() { freeNTTMemory(); } // Everything is commented out because of hard to predict issues with MacOsX // Uncomment on a computer with that OS before releasing void NFLlib::freeNTTMemory(){ // alreadyInit says how many arrays we have allocated for the NTT for (unsigned i = 0 ; i < alreadyInit; i++) { free(phis[i]); free(shoupphis[i]); free(invpoly_times_invphis[i]); free(shoupinvpoly_times_invphis[i]); free(omegas[i]); free(invomegas[i]); mpz_clear(liftingIntegers[i]); if (i == alreadyInit - 1) { free(phis); free(shoupphis); free(invpoly_times_invphis); free(shoupinvpoly_times_invphis); free(omegas); free(shoupomegas); free(invomegas); free(shoupinvomegas); free(invpolyDegree); delete[] liftingIntegers; free(inv_indexes); mpz_clear(moduliProduct); delete[] moduli; } } alreadyInit = 0; } // **************************************************************************************** // THE DEN: Uncommented howling functions and pointer blood magic. Enter at your own risk. // **************************************************************************************** // We define first a back to back funtion to test our bitsplitter function // If DEBUG_BITSPLIT_B2B the function is used, else it is ignored //#define DEBUG_BITSPLIT_B2B #ifdef DEBUG_BITSPLIT_B2B #define DEBUG_BITSPLIT #define B2BTEST(inDataBuffers, nbrOfBuffers, bitsPerBuffer, bitsPerChunk, totalbitsread, bitsread, pointer64, bitsToRead) bitsplitter_backtoback_internal_test (inDataBuffers, nbrOfBuffers, bitsPerBuffer, bitsPerChunk, totalbitsread, bitsread, pointer64, bitsToRead) #else #define B2BTEST(inDataBuffers, nbrOfBuffers, bitsPerBuffer, bitsPerChunk, totalbitsread, bitsread, pointer64, bitsToRead) if(0); #endif uint64_t* NFLlib::bitsplitter_backtoback_internal_test (unsigned char** inDataBuffers, uint64_t nbrOfBuffers, uint64_t bitsPerBuffer, unsigned int bitsPerChunk, uint64_t totalbitsread,uint64_t bitsread, uint64_t* pointer64, unsigned int bitsToRead) { unsigned int bufferIndex = totalbitsread/bitsPerBuffer; uint64_t bitPositionInBuffer = totalbitsread - bufferIndex*bitsPerBuffer; uint64_t bytePositionInBuffer = bitPositionInBuffer/8; unsigned int bitPositionInByte = bitPositionInBuffer%8; if (bitPositionInBuffer+sizeof(uint64_t)*8 > bitsPerBuffer) { std::cerr << "WARNING: Bitsplit goes beyond buffer size (let's hope it is allocated)" << std::endl; } if (bitPositionInBuffer + bitsToRead > bitsPerBuffer) { std::cerr << "CRITICAL: Bitsplit reads AND USES data beyond buffer space" << std::endl; std::cerr << "totalbitsread " << totalbitsread << std::endl; std::cerr << "bufferIndex " << bufferIndex << std::endl; std::cerr << "bitPositionInBuffer " << bitPositionInBuffer << std::endl; std::cerr << "bytePositionInBuffer " << bytePositionInBuffer << std::endl; std::cerr << "CRITICAL: On B2B test called for " << bitsToRead << " bits" << std::endl; exit(-1); } uint64_t b2bresult = ((*((uint64_t *)(inDataBuffers[bufferIndex]+bytePositionInBuffer)))>>bitPositionInByte) & ((1ULL<>bitsread) & ((1ULL<>bitsread) & subchunkMasks[subchunkIndex]; tmpdata++;i++; bitsread += subchunkSizes[subchunkIndex]; subchunkIndex= (subchunkIndex+1 == int_uint64PerChunk ? 0 : subchunkIndex + 1); if(i==totalSubChunks) break; } if (bitstoread > 128) { unsigned shift = bitsread >>3; tmppointer = (unsigned char*) pointer64; tmppointer += shift; pointer64 = (uint64_t *) (tmppointer); bitstoread-= shift<<3; bitsread -= shift<<3; } else { tmppointer = (unsigned char*) pointer64; while ((64 - bitsread < subchunkSizes[subchunkIndex]) && bitstoread > 0) { tmppointer++; bitsread -= 8; bitstoread -= 8; } pointer64 = (uint64_t *) (tmppointer); } } // If there is a last partial subchunk in this buffer, read it part from this buffer and part // from next buffer if available bitsremaining = (uint64_t) round((nbChunks-int_nbChunks)*bitsPerChunk - cumulatedsize ); #ifdef DEBUG_BITSPLIT std::cout<<"bitsplit2 bitsremaining (should be <56)="<>bitsread) & ((1ULL<1) { outdata=(poly64) calloc(polyNumber*nbModuli*polyDegree + 1,sizeof(uint64_t)); internalLongIntegersToCRT( splitData, outdata, int_uint64PerChunk, ceil(((double)bitsPerBuffer*nbrOfBuffers)/bitsPerChunk)); free(splitData); } else { if (nbModuli > 1) { outdata=(poly64) calloc(polyNumber*nbModuli*polyDegree + 1,sizeof(uint64_t)); for (unsigned i = 0 ; i < polyNumber ; i++) { for (int cm = 0 ; cm < nbModuli ; cm++) { memcpy(outdata + i*polyDegree*nbModuli + cm*polyDegree, splitData + i*polyDegree, polyDegree*sizeof(uint64_t)); } } free(splitData); } else { outdata=splitData; } } } // This function does all the hard work of deserializeDataNFL // 1) Converts input into a set of polynomials with arbitrary large coefficients // 2) Reduces the polys through CRT to have nbModuli contiguous polys with uint64_t coefficients // Do not try to understand it, it is a nightmare, we won't try to explain it :) uint64_t* NFLlib::bitsplitter (unsigned char** inDataBuffers, uint64_t nbrOfBuffers, uint64_t bitsPerBuffer, unsigned int bitsPerChunk) { // If you don't need to change me don't try to understand me // If you need to change me, build me again from scratch :) // How many uint64_t are needed to encode a chunk const double uint64PerChunk = (double)bitsPerChunk/56; const uint64_t int_uint64PerChunk = ceil(uint64PerChunk); // How many polynomials are needed to encode the data uint64_t polyNumber = ceil((double)bitsPerBuffer*(double)nbrOfBuffers/(double)(bitsPerChunk*polyDegree)); uint64_t* splitData = (uint64_t*)(calloc(polyNumber*polyDegree*int_uint64PerChunk+1,sizeof(uint64_t))); uint64_t* tmpdata=splitData; uint64_t bitsread=0; size_t subchunkIndex=0; // Loop over the buffers for (uint64_t h = 0 ; h < nbrOfBuffers ; h++) { bs_loop (inDataBuffers, nbrOfBuffers, bitsPerBuffer, bitsPerChunk, tmpdata, h, bitsread, subchunkIndex); } poly64 outdata; bs_finish(outdata, int_uint64PerChunk, polyNumber, splitData, nbrOfBuffers, bitsPerBuffer, bitsPerChunk); return outdata; } // Subroutine for bitsplitter, the daemonic function. This is the function that allows // us to circumvect GMP void NFLlib::internalLongIntegersToCRT(uint64_t* tmpdata, poly64 outdata, uint64_t int_uint64PerChunk, uint64_t totalNbChunks) { uint64_t* outdataPtr=outdata; uint64_t* indataPtr=tmpdata; uint64_t multiplier[nbModuli][int_uint64PerChunk]; uint64_t* chunkParts[int_uint64PerChunk]; for(int cm=0;cm