/* Copyright (C) 2014 Carlos Aguilar Melchor, Joris Barrier, Marc-Olivier Killijian
* This file is part of XPIR.
*
* XPIR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* XPIR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with XPIR. If not, see .
*
* This file incorporates work covered by the following copyright and
* permission notice:
*
* Demonstration code for the paper "Faster arithmetic for number-theoretic
* transforms", May 2012
*
* Copyright (C) 2014, David Harvey
*
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef DEF_NFLlib
#define DEF_NFLlib
#define SHOUP
#include
#include
#include
#include
#include
#include
#include "NFLParams.hpp"
#include "prng/fastrandombytes.h"
#include
#include
#include
// SSE/AVX(2) instructions
#include
#include
using namespace std;
// Polynomials are pointers to arrays of uint64_t coefficients
typedef uint64_t* poly64;
template
static inline T* malloc_align(const size_t n)
{
T* ret;
if(posix_memalign((void**) &ret, Align, n*sizeof(T)))
std::cout << "NFLlib: WARNING posix_memalign failed" << std::endl;
return ret;
}
template
static inline T* malloc_align(const size_t n, T const* const)
{
return malloc_align(n);
}
template
static inline T* calloc_align(const size_t n)
{
T* ret = malloc_align(n);
memset(ret, 0, n*sizeof(T));
return ret;
}
template
static inline T* calloc_align(const size_t n, T const* const)
{
T* ret = malloc_align(n);
memset(ret, 0, n*sizeof(T));
return ret;
}
template
static inline T* calloc_align_other(const size_t n)
{
T* ret = malloc_align(n);
memset(ret, 0, n*sizeof(V));
return ret;
}
// Number-Theoretic Transform Tools class
class NFLlib
{
public:
// Constructors and initializing functions
NFLlib();
void configureNTT(); // Only called from setmodulus and setpolyDegree
// Fundamental setters (result in a call to initializing functions above)
void setmodulus(uint64_t modulus);
void setpolyDegree(unsigned int polyDegree);
void setNewParameters(unsigned int polyDegree, unsigned int modulusBitsize);
// Getters
uint64_t* getmoduli();
unsigned int getpolyDegree();
unsigned short getnbModuli();
void copymoduliProduct(mpz_t dest);
// Random polynomial generation functions
void setBoundedRandomPoly(poly64 res, uint64_t upperBound_, bool uniform);
poly64 allocBoundedRandomPoly(uint64_t upperBound_, bool uniform);
// Data import and export main functions
poly64* deserializeDataNFL(unsigned char **inArrayOfBuffers, uint64_t nbrOfBuffers,
uint64_t dataBitsizePerBuffer, unsigned bitsPerCoordinate, uint64_t &polyNumber);
static void serializeData64 (uint64_t* indata, unsigned char* outdata,
unsigned int bitsPerChunk, uint64_t nb_of_uint64);
static void serializeData32 (uint32_t* indata, unsigned char* outdata, unsigned int bitsPerChunk,
uint64_t nb_of_uint32);
// Helper functions
poly64 allocpoly(bool nullpoly);
mpz_t* poly2mpz (poly64 p);
static void print_poly64(poly64 p, unsigned int coeff_nbr);
static void print_poly64hex(poly64 p, unsigned int coeff_nbr);
// Destructors memory freeing routines
~NFLlib();
void freeNTTMemory(); // Only called from configureNTT and ~NFLlib
// ****************************************************
// Modular and Polynomial computation inlined functions
// ****************************************************
///////////////////// Operations over uint64_t (polynomial coefficients)
// Additions
static uint64_t addmod(uint64_t x, uint64_t y, uint64_t p);
static uint64_t submod(uint64_t x, uint64_t y, uint64_t p);
// Multiplications
static uint64_t mulmod(uint64_t x, uint64_t y, uint64_t p);
static uint64_t mulmodShoup(uint64_t x, uint64_t y, uint64_t yprime, uint64_t p);
// Fused Multiplications-Additions
static void mulandadd(uint64_t &rop, uint64_t x, uint64_t y, uint64_t p);
static void mulandaddShoup(uint64_t &rop, uint64_t x, uint64_t y, uint64_t yprime, uint64_t p);
///////////////////// Operations over polynomials
// Additions
void addmodPoly(poly64 rop, poly64 op1, poly64 op2);
void submodPoly(poly64 rop, poly64 op1, poly64 op2);
// Multiplications
void mulmodPolyNTT(poly64 rop, poly64 op1, poly64 op2);
void mulmodPolyNTTShoup(poly64 rop, poly64 op1, poly64 op2, poly64 op2prime);
// Fused Multiplications-Additions
void mulandaddPolyNTT(poly64 rop, poly64 op1, poly64 op2);
void mulandaddPolyNTTShoup(poly64 rop, poly64 op1, poly64 op2, poly64 op2prime);
// Number-Theoretic Transorm functions
static void ntt(uint64_t* x, const uint64_t* wtab, const uint64_t* winvtab,
const unsigned K, const uint64_t p);
static void inv_ntt(uint64_t* const x, const uint64_t* const inv_wtab, const uint64_t* const inv_winvtab,
const unsigned K, const uint64_t invK, const uint64_t p, const uint64_t* const inv_indexes);
static void prep_wtab(uint64_t* wtab, uint64_t* winvtab, uint64_t w, unsigned K, uint64_t p);
void nttAndPowPhi(poly64 op);
void invnttAndPowInvPhi(poly64);
// Pre-computation functions
poly64 allocandcomputeShouppoly(poly64 x);
private:
// Says whether the object has been initialized for the int amount of moduli it is set to
unsigned int alreadyInit;
// Polynomial attributes
uint64_t *moduli;
unsigned short nbModuli;
unsigned int polyDegree;
// Chinese Remainder Theorem (CRT) and Inverse CRT related attributes
mpz_t *liftingIntegers;
mpz_t moduliProduct;
// NTT and inversse NTT related attributes
// omega**i values (omega = polydegree-th primitive root of unity), and their inverses
// phi**i values (phi = square root of omega), and their inverses multiplied by a constant
// Shoup variables contain redundant data to speed up the process
// NOTE : omega and derived values are ordered following David Harvey's algorithm
// w**0 w**1 ... w**(polyDegree/2) w**0 w**2 ... w**(polyDegree/2) w**0 w**4 ...
// w**(polyDegree/2) etc. (polyDegree values)
uint64_t **phis, **shoupphis, **invpoly_times_invphis,
**shoupinvpoly_times_invphis, **omegas,
**shoupomegas, **invomegas, **shoupinvomegas, *invpolyDegree;
uint64_t* inv_indexes;
// WARNING !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
// ENTERING THE DEN... Internal functions you should never read and we will never comment
uint64_t* bitsplitter (unsigned char** inDataBuffers, uint64_t nbrOfBuffers,
uint64_t bitsPerBuffer, unsigned int bitsPerChunk);
void internalLongIntegersToCRT(uint64_t* tmpdata, poly64 outdata, uint64_t int_uint64PerChunk,
uint64_t totalNbChunks) ;
void bs_finish(poly64 &outdata, uint64_t int_uint64PerChunk, uint64_t polyNumber,
uint64_t *splitData, uint64_t nbrOfBuffers, uint64_t bitsPerBuffer, unsigned int bitsPerChunk);
void bs_loop (unsigned char** inDataBuffers, uint64_t nbrOfBuffers,
uint64_t bitsPerBuffer, unsigned int bitsPerChunk, uint64_t *&tmpdata,
uint64_t bufferIndex, uint64_t &bitsread, size_t &subchunkIndex);
uint64_t* bitsplitter_backtoback_internal_test (unsigned char** inDataBuffers,
uint64_t nbrOfBuffers, uint64_t bitsPerBuffer, unsigned int bitsPerChunk,
uint64_t totalbitsread,uint64_t bitsread, uint64_t* pointer64, unsigned int bitsToRead);
// END OF THE DEN
};
// We hate C++ inlining
// These function sources were included here to force inlining
//
/* WARNING: For performance, our modular functions often do not do a complete modular reduction.
* For example when adding two integers x, y modulo an integer p we do not check if they are
* larger than p or not and only subtract to the result at most p. This is noted x + y lazymod p
* in our comments. Applications using our functions should ensure that no overflow is possible given
* our implementation. This is dangerous and a bad programming habit but mandatory for high
* performance.
* */
// *********************************************************
// Operations over uint64_t (polynomial coefficients)
// *********************************************************
// Modular addition plain and simple : add then test if sum is superior
// to the modulus in which case substract the modulus to the sum
// ASSUMPTION: x + y < 2**64
// OUTPUT: x + y lazymod p
inline uint64_t NFLlib::addmod(uint64_t x, uint64_t y, uint64_t p)
{
uint64_t z = x + y;
return z -= ((z >= p) ? p : 0);
}
// Modular subtract trick: if y is smaller than p return x+p-y else return x+p-(y%p)
// OUTPUT: x - y lazymod p
inline uint64_t NFLlib::submod(uint64_t x, uint64_t y, uint64_t p)
{
return (y < p) ? addmod(x, p-y, p) : addmod(x, p - (y % p), p);
}
// Modular multiplication: trivial (and very costly) approach with complete reduction
// OUTPUT: x * y mod p
inline uint64_t NFLlib::mulmod(uint64_t x, uint64_t y, uint64_t p)
{
return (uint128_t) x * y % p;
}
// Modular multiplication: much faster alternative
// Works only if yprime = ((uint128_t) y << 64) / p has been pre-computed
// ASSUMPTION: p is at most 62 bits
// OUTPUT: x * y lazymod p (if x and y are below p the result is x * y mod p)
inline uint64_t NFLlib::mulmodShoup(uint64_t x, uint64_t y, uint64_t yprime, uint64_t p)
{
uint128_t res;
uint64_t q = ((uint128_t) x * yprime) >> 64;
res = x * y - q * p;
res -= ((res>=p) ? p : 0);
return res;
}
// Fused Multiply and Addition (FMA) : trivial approach
inline void NFLlib::mulandadd(uint64_t& rop, uint64_t x, uint64_t y, uint64_t p)
{
rop = (((uint128_t) (x) * y) + rop) % p;
}
// Fused Multiply and Addition (FMA) : Shoup's precomputed approach again
// Works only if yprime = ((uint128_t) y << 64) / p has been pre-computed
// ASSUMPTION: p is at most 62 bits
// OUTPUT: x * y lazymod p (if x and y are below p the result is below 2p)
inline void NFLlib::mulandaddShoup(uint64_t& rop, const uint64_t x, const uint64_t y, const uint64_t yprime, const uint64_t p)
{
//mulandadd(rop, x, y, p);
#ifdef DEBUG
// This must be before the real computation modifies rop
uint128_t res = ((uint128_t) x * y + rop) % p;
#endif
uint64_t q = ((uint128_t) x * yprime) >> 64;
rop += x * y - q * p;
rop -= ((rop>=p) ? p : 0);
#ifdef DEBUG
// And this must be after the modification of rop
if((res%p)!=(rop%p))
{
std::cout<<"NFLlib: CRITICAL Shoup multiplication failed (prob. orig. precomputation or bounds)"<= 2*p) ? (2*p) : 0;
t1 += ((int64_t) t1 < 0) ? (2*p) : 0;
x[0] = t0;
x[1] = t1;
return;
}
size_t N0 = K; // size of block
//size_t M0 = 1; // number of blocks
// log2(N)-2
const int J = _bit_scan_reverse(K)-2;
for (int w = 0; w < J; w++) {
const size_t M = 1 << w;
const size_t N = N0 >> w;
//#pragma omp parallel for schedule(static) num_threads(4)
for (size_t r = 0; r < M; r++) {
for (size_t i = 0; i < N/2; i++) {
uint64_t u0 = x[N * r + i];
uint64_t u1 = x[N * r + i + N/2];
uint64_t t0 = u0 + u1;
t0 -= ((t0 >= 2*p) ? (2*p) : 0);
uint64_t t1 = u0 - u1 + 2*p;
uint64_t q = ((uint128_t) t1 * winvtab[i]) >> 64;
uint64_t t2 = t1 * wtab[i] - q * p;
x[N * r + i] = t0;
x[N * r + i + N/2] = t2;
}
}
wtab += N/2;
winvtab += N/2;
}
const size_t M = 1 << J;
// last two layers
for (size_t r = 0; r < M; r++, x += 4)
{
uint64_t u0 = x[0];
uint64_t u1 = x[1];
uint64_t u2 = x[2];
uint64_t u3 = x[3];
uint64_t v0 = u0 + u2;
v0 -= (v0 >= 2*p) ? (2*p) : 0;
uint64_t v2 = u0 - u2;
v2 += ((int64_t) v2 < 0) ? (2*p) : 0;
uint64_t v1 = u1 + u3;
v1 -= (v1 >= 2*p) ? (2*p) : 0;
uint64_t t = u1 - u3 + 2*p;
uint64_t q = ((uint128_t) t * winvtab[1]) >> 64;
uint64_t v3 = t * wtab[1] - q * p;
uint64_t z0 = v0 + v1;
z0 -= (z0 >= 2*p) ? (2*p) : 0;
uint64_t z1 = v0 - v1;
z1 += ((int64_t) z1 < 0) ? (2*p) : 0;
uint64_t z2 = v2 + v3;
z2 -= (z2 >= 2*p) ? (2*p) : 0;
uint64_t z3 = v2 - v3;
z3 += ((int64_t) z3 < 0) ? (2*p) : 0;
x[0] = z0 ;
x[1] = z1 ;
x[2] = z2 ;
x[3] = z3 ;
}
}
inline static void permut(uint64_t* const y, uint64_t const* const x, uint64_t const* const inv_indexes, size_t K)
{
for (size_t i = 0; i < K; i += 1)
{
y[inv_indexes[i]] = x[i];
}
}
// Inverse NTT: replaces values representation by the classic coefficient representation
inline void NFLlib::inv_ntt(uint64_t* const x, const uint64_t* const inv_wtab, const uint64_t* const inv_winvtab,
const unsigned K, const uint64_t invK, const uint64_t p, const uint64_t* const inv_indexes)
{
// Do we need to align on 32 ?
uint64_t* y = calloc_align<16, uint64_t>(K+1);
if (K == 1)
return;
// bit-reverse
permut(y, x, inv_indexes, K);
ntt(y, inv_wtab, inv_winvtab, K, p);
// bit-reverse again
permut(x, y, inv_indexes, K);
free(y);
}
// This function just computes the powers of the root of unity and its inverse
// It is included here just to have all the NTT functions together
inline void NFLlib::prep_wtab(uint64_t* wtab, uint64_t* wtabshoup, uint64_t w, unsigned K, uint64_t p)
{
while (K >= 2)
{
uint64_t wi = 1; // w^i
for (size_t i = 0; i < K/2; i++)
{
*wtab++ = wi;
*wtabshoup++ = ((uint128_t) wi << 64) / p;
wi = mulmod(wi, w, p);
}
w = mulmod(w, w, p);
K /= 2;
}
}
// *****************************************************************
// END OF NTT functions from David Harvey
// From : http://web.maths.unsw.edu.au/~davidharvey/papers/fastntt/
// Most of the functions have been modified for our needs
// The associated copyright notice is at the beginning of this file
// *****************************************************************
// In order to have polynomial operations mod X**n + 1 as we want
// we must multiply the polynomial coefficients by phi powers before doing the NTT
inline void NFLlib::nttAndPowPhi(poly64 op)
{
for (unsigned int cm = 0 ; cm < nbModuli ; cm++)
{
for (unsigned int i = 0; i < polyDegree; i++)
{
op[i] = mulmodShoup(op[i], phis[cm][i], shoupphis[cm][i], moduli[cm]);
}
ntt(op, omegas[cm], shoupomegas[cm], polyDegree, moduli[cm]);
op+=polyDegree;
}
}
// In order to have polynomial operations mod X**n + 1 as we want
// we must multiply the polynomial coefficients by invphi powers before doing the inverse-NTT
inline void NFLlib::invnttAndPowInvPhi(poly64 op)
{
for(unsigned short currentModulus=0;currentModulus