NFLlib.hpp 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655
  1. /* Copyright (C) 2014 Carlos Aguilar Melchor, Joris Barrier, Marc-Olivier Killijian
  2. * This file is part of XPIR.
  3. *
  4. * XPIR is free software: you can redistribute it and/or modify
  5. * it under the terms of the GNU General Public License as published by
  6. * the Free Software Foundation, either version 3 of the License, or
  7. * (at your option) any later version.
  8. *
  9. * XPIR is distributed in the hope that it will be useful,
  10. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. * GNU General Public License for more details.
  13. *
  14. * You should have received a copy of the GNU General Public License
  15. * along with XPIR. If not, see <http://www.gnu.org/licenses/>.
  16. *
  17. * This file incorporates work covered by the following copyright and
  18. * permission notice:
  19. *
  20. * Demonstration code for the paper "Faster arithmetic for number-theoretic
  21. * transforms", May 2012
  22. *
  23. * Copyright (C) 2014, David Harvey
  24. *
  25. * All rights reserved.
  26. *
  27. * Redistribution and use in source and binary forms, with or without
  28. * modification, are permitted provided that the following conditions are met:
  29. *
  30. * * Redistributions of source code must retain the above copyright notice, this
  31. * list of conditions and the following disclaimer.
  32. * * Redistributions in binary form must reproduce the above copyright notice,
  33. * this list of conditions and the following disclaimer in the documentation
  34. * and/or other materials provided with the distribution.
  35. *
  36. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  37. * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  38. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  39. * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
  40. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  41. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  42. * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  43. * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  44. * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  45. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  46. */
  47. #ifndef DEF_NFLlib
  48. #define DEF_NFLlib
  49. #define SHOUP
  50. #include <stdlib.h>
  51. #include <inttypes.h>
  52. #include <math.h>
  53. #include <cstddef>
  54. #include <gmp.h>
  55. #include <iostream>
  56. #include "NFLParams.hpp"
  57. #include "prng/fastrandombytes.h"
  58. #include <cstring>
  59. #include <fstream>
  60. #include <cassert>
  61. // SSE/AVX(2) instructions
  62. #include <immintrin.h>
  63. #include <x86intrin.h>
  64. using namespace std;
  65. // Polynomials are pointers to arrays of uint64_t coefficients
  66. typedef uint64_t* poly64;
  67. template <size_t Align, class T>
  68. static inline T* malloc_align(const size_t n)
  69. {
  70. T* ret;
  71. if(posix_memalign((void**) &ret, Align, n*sizeof(T)))
  72. std::cout << "NFLlib: WARNING posix_memalign failed" << std::endl;
  73. return ret;
  74. }
  75. template <size_t Align, class T>
  76. static inline T* malloc_align(const size_t n, T const* const)
  77. {
  78. return malloc_align<Align, T>(n);
  79. }
  80. template <size_t Align, class T>
  81. static inline T* calloc_align(const size_t n)
  82. {
  83. T* ret = malloc_align<Align, T>(n);
  84. memset(ret, 0, n*sizeof(T));
  85. return ret;
  86. }
  87. template <size_t Align, class T>
  88. static inline T* calloc_align(const size_t n, T const* const)
  89. {
  90. T* ret = malloc_align<Align, T>(n);
  91. memset(ret, 0, n*sizeof(T));
  92. return ret;
  93. }
  94. template <size_t Align, class T, class V>
  95. static inline T* calloc_align_other(const size_t n)
  96. {
  97. T* ret = malloc_align<Align, T>(n);
  98. memset(ret, 0, n*sizeof(V));
  99. return ret;
  100. }
  101. // Number-Theoretic Transform Tools class
  102. class NFLlib
  103. {
  104. public:
  105. // Constructors and initializing functions
  106. NFLlib();
  107. void configureNTT(); // Only called from setmodulus and setpolyDegree
  108. // Fundamental setters (result in a call to initializing functions above)
  109. void setmodulus(uint64_t modulus);
  110. void setpolyDegree(unsigned int polyDegree);
  111. void setNewParameters(unsigned int polyDegree, unsigned int modulusBitsize);
  112. // Getters
  113. uint64_t* getmoduli();
  114. unsigned int getpolyDegree();
  115. unsigned short getnbModuli();
  116. void copymoduliProduct(mpz_t dest);
  117. // Random polynomial generation functions
  118. void setBoundedRandomPoly(poly64 res, uint64_t upperBound_, bool uniform);
  119. poly64 allocBoundedRandomPoly(uint64_t upperBound_, bool uniform);
  120. // Data import and export main functions
  121. poly64* deserializeDataNFL(unsigned char **inArrayOfBuffers, uint64_t nbrOfBuffers,
  122. uint64_t dataBitsizePerBuffer, unsigned bitsPerCoordinate, uint64_t &polyNumber);
  123. static void serializeData64 (uint64_t* indata, unsigned char* outdata,
  124. unsigned int bitsPerChunk, uint64_t nb_of_uint64);
  125. static void serializeData32 (uint32_t* indata, unsigned char* outdata, unsigned int bitsPerChunk,
  126. uint64_t nb_of_uint32);
  127. // Helper functions
  128. poly64 allocpoly(bool nullpoly);
  129. mpz_t* poly2mpz (poly64 p);
  130. static void print_poly64(poly64 p, unsigned int coeff_nbr);
  131. static void print_poly64hex(poly64 p, unsigned int coeff_nbr);
  132. // Destructors memory freeing routines
  133. ~NFLlib();
  134. void freeNTTMemory(); // Only called from configureNTT and ~NFLlib
  135. // ****************************************************
  136. // Modular and Polynomial computation inlined functions
  137. // ****************************************************
  138. ///////////////////// Operations over uint64_t (polynomial coefficients)
  139. // Additions
  140. static uint64_t addmod(uint64_t x, uint64_t y, uint64_t p);
  141. static uint64_t submod(uint64_t x, uint64_t y, uint64_t p);
  142. // Multiplications
  143. static uint64_t mulmod(uint64_t x, uint64_t y, uint64_t p);
  144. static uint64_t mulmodShoup(uint64_t x, uint64_t y, uint64_t yprime, uint64_t p);
  145. // Fused Multiplications-Additions
  146. static void mulandadd(uint64_t &rop, uint64_t x, uint64_t y, uint64_t p);
  147. static void mulandaddShoup(uint64_t &rop, uint64_t x, uint64_t y, uint64_t yprime, uint64_t p);
  148. ///////////////////// Operations over polynomials
  149. // Additions
  150. void addmodPoly(poly64 rop, poly64 op1, poly64 op2);
  151. void submodPoly(poly64 rop, poly64 op1, poly64 op2);
  152. // Multiplications
  153. void mulmodPolyNTT(poly64 rop, poly64 op1, poly64 op2);
  154. void mulmodPolyNTTShoup(poly64 rop, poly64 op1, poly64 op2, poly64 op2prime);
  155. // Fused Multiplications-Additions
  156. void mulandaddPolyNTT(poly64 rop, poly64 op1, poly64 op2);
  157. void mulandaddPolyNTTShoup(poly64 rop, poly64 op1, poly64 op2, poly64 op2prime);
  158. // Number-Theoretic Transorm functions
  159. static void ntt(uint64_t* x, const uint64_t* wtab, const uint64_t* winvtab,
  160. const unsigned K, const uint64_t p);
  161. static void inv_ntt(uint64_t* const x, const uint64_t* const inv_wtab, const uint64_t* const inv_winvtab,
  162. const unsigned K, const uint64_t invK, const uint64_t p, const uint64_t* const inv_indexes);
  163. static void prep_wtab(uint64_t* wtab, uint64_t* winvtab, uint64_t w, unsigned K, uint64_t p);
  164. void nttAndPowPhi(poly64 op);
  165. void invnttAndPowInvPhi(poly64);
  166. // Pre-computation functions
  167. poly64 allocandcomputeShouppoly(poly64 x);
  168. private:
  169. // Says whether the object has been initialized for the int amount of moduli it is set to
  170. unsigned int alreadyInit;
  171. // Polynomial attributes
  172. uint64_t *moduli;
  173. unsigned short nbModuli;
  174. unsigned int polyDegree;
  175. // Chinese Remainder Theorem (CRT) and Inverse CRT related attributes
  176. mpz_t *liftingIntegers;
  177. mpz_t moduliProduct;
  178. // NTT and inversse NTT related attributes
  179. // omega**i values (omega = polydegree-th primitive root of unity), and their inverses
  180. // phi**i values (phi = square root of omega), and their inverses multiplied by a constant
  181. // Shoup variables contain redundant data to speed up the process
  182. // NOTE : omega and derived values are ordered following David Harvey's algorithm
  183. // w**0 w**1 ... w**(polyDegree/2) w**0 w**2 ... w**(polyDegree/2) w**0 w**4 ...
  184. // w**(polyDegree/2) etc. (polyDegree values)
  185. uint64_t **phis, **shoupphis, **invpoly_times_invphis,
  186. **shoupinvpoly_times_invphis, **omegas,
  187. **shoupomegas, **invomegas, **shoupinvomegas, *invpolyDegree;
  188. uint64_t* inv_indexes;
  189. // WARNING !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
  190. // ENTERING THE DEN... Internal functions you should never read and we will never comment
  191. uint64_t* bitsplitter (unsigned char** inDataBuffers, uint64_t nbrOfBuffers,
  192. uint64_t bitsPerBuffer, unsigned int bitsPerChunk);
  193. void internalLongIntegersToCRT(uint64_t* tmpdata, poly64 outdata, uint64_t int_uint64PerChunk,
  194. uint64_t totalNbChunks) ;
  195. void bs_finish(poly64 &outdata, uint64_t int_uint64PerChunk, uint64_t polyNumber,
  196. uint64_t *splitData, uint64_t nbrOfBuffers, uint64_t bitsPerBuffer, unsigned int bitsPerChunk);
  197. void bs_loop (unsigned char** inDataBuffers, uint64_t nbrOfBuffers,
  198. uint64_t bitsPerBuffer, unsigned int bitsPerChunk, uint64_t *&tmpdata,
  199. uint64_t bufferIndex, uint64_t &bitsread, size_t &subchunkIndex);
  200. uint64_t* bitsplitter_backtoback_internal_test (unsigned char** inDataBuffers,
  201. uint64_t nbrOfBuffers, uint64_t bitsPerBuffer, unsigned int bitsPerChunk,
  202. uint64_t totalbitsread,uint64_t bitsread, uint64_t* pointer64, unsigned int bitsToRead);
  203. // END OF THE DEN
  204. };
  205. // We hate C++ inlining
  206. // These function sources were included here to force inlining
  207. //
  208. /* WARNING: For performance, our modular functions often do not do a complete modular reduction.
  209. * For example when adding two integers x, y modulo an integer p we do not check if they are
  210. * larger than p or not and only subtract to the result at most p. This is noted x + y lazymod p
  211. * in our comments. Applications using our functions should ensure that no overflow is possible given
  212. * our implementation. This is dangerous and a bad programming habit but mandatory for high
  213. * performance.
  214. * */
  215. // *********************************************************
  216. // Operations over uint64_t (polynomial coefficients)
  217. // *********************************************************
  218. // Modular addition plain and simple : add then test if sum is superior
  219. // to the modulus in which case substract the modulus to the sum
  220. // ASSUMPTION: x + y < 2**64
  221. // OUTPUT: x + y lazymod p
  222. inline uint64_t NFLlib::addmod(uint64_t x, uint64_t y, uint64_t p)
  223. {
  224. uint64_t z = x + y;
  225. return z -= ((z >= p) ? p : 0);
  226. }
  227. // Modular subtract trick: if y is smaller than p return x+p-y else return x+p-(y%p)
  228. // OUTPUT: x - y lazymod p
  229. inline uint64_t NFLlib::submod(uint64_t x, uint64_t y, uint64_t p)
  230. {
  231. return (y < p) ? addmod(x, p-y, p) : addmod(x, p - (y % p), p);
  232. }
  233. // Modular multiplication: trivial (and very costly) approach with complete reduction
  234. // OUTPUT: x * y mod p
  235. inline uint64_t NFLlib::mulmod(uint64_t x, uint64_t y, uint64_t p)
  236. {
  237. return (uint128_t) x * y % p;
  238. }
  239. // Modular multiplication: much faster alternative
  240. // Works only if yprime = ((uint128_t) y << 64) / p has been pre-computed
  241. // ASSUMPTION: p is at most 62 bits
  242. // OUTPUT: x * y lazymod p (if x and y are below p the result is x * y mod p)
  243. inline uint64_t NFLlib::mulmodShoup(uint64_t x, uint64_t y, uint64_t yprime, uint64_t p)
  244. {
  245. uint128_t res;
  246. uint64_t q = ((uint128_t) x * yprime) >> 64;
  247. res = x * y - q * p;
  248. res -= ((res>=p) ? p : 0);
  249. return res;
  250. }
  251. // Fused Multiply and Addition (FMA) : trivial approach
  252. inline void NFLlib::mulandadd(uint64_t& rop, uint64_t x, uint64_t y, uint64_t p)
  253. {
  254. rop = (((uint128_t) (x) * y) + rop) % p;
  255. }
  256. // Fused Multiply and Addition (FMA) : Shoup's precomputed approach again
  257. // Works only if yprime = ((uint128_t) y << 64) / p has been pre-computed
  258. // ASSUMPTION: p is at most 62 bits
  259. // OUTPUT: x * y lazymod p (if x and y are below p the result is below 2p)
  260. inline void NFLlib::mulandaddShoup(uint64_t& rop, const uint64_t x, const uint64_t y, const uint64_t yprime, const uint64_t p)
  261. {
  262. //mulandadd(rop, x, y, p);
  263. #ifdef DEBUG
  264. // This must be before the real computation modifies rop
  265. uint128_t res = ((uint128_t) x * y + rop) % p;
  266. #endif
  267. uint64_t q = ((uint128_t) x * yprime) >> 64;
  268. rop += x * y - q * p;
  269. rop -= ((rop>=p) ? p : 0);
  270. #ifdef DEBUG
  271. // And this must be after the modification of rop
  272. if((res%p)!=(rop%p))
  273. {
  274. std::cout<<"NFLlib: CRITICAL Shoup multiplication failed (prob. orig. precomputation or bounds)"<<std::endl;
  275. exit(1);
  276. }
  277. #endif
  278. }
  279. // *********************************************************
  280. // Operations over polynomials
  281. // *********************************************************
  282. // Apply addmod to all the coefficients of a polynomial
  283. inline void NFLlib::addmodPoly(poly64 rop, poly64 op1,poly64 op2) {
  284. for(unsigned short currentModulus=0;currentModulus<nbModuli;currentModulus++)
  285. {
  286. for(unsigned i=0;i<polyDegree;i++)
  287. {
  288. rop[i]=addmod(op1[i],op2[i],moduli[currentModulus]);
  289. }
  290. rop+=polyDegree;
  291. op1+=polyDegree;
  292. op2+=polyDegree;
  293. }
  294. }
  295. // Apply submod to all the coefficients of a polynomial
  296. inline void NFLlib::submodPoly(poly64 rop, poly64 op1,poly64 op2) {
  297. for(unsigned short currentModulus=0;currentModulus<nbModuli;currentModulus++)
  298. {
  299. for(unsigned i=0;i<polyDegree;i++)
  300. {
  301. rop[i]=submod(op1[i],op2[i],moduli[currentModulus]);
  302. }
  303. rop+=polyDegree;
  304. op1+=polyDegree;
  305. op2+=polyDegree;
  306. }
  307. }
  308. // Apply mulmod to all the coefficients of a polynomial
  309. // This is a polynomial multiplication mod X**n+1 iff the operands
  310. // have been processed through nttAndPowPhi
  311. inline void NFLlib::mulmodPolyNTT(poly64 rop, poly64 op1, poly64 op2)
  312. {
  313. for(unsigned short currentModulus=0;currentModulus<nbModuli;currentModulus++) {
  314. for (unsigned i = 0 ; i < polyDegree ;i++)
  315. {
  316. rop[i] = mulmod(op1[i],op2[i],moduli[currentModulus]);
  317. }
  318. rop+=polyDegree;
  319. op1+=polyDegree;
  320. op2+=polyDegree;
  321. }
  322. }
  323. // Apply mulmodShoup to all the coefficients of a polynomial (much faster)
  324. // This is a polynomial multiplication mod X**n+1 iff the operands
  325. // have been processed through nttAndPowPhi
  326. // op2prime must be a polynomial with op2 coefficients converted with Shoup's precomputation
  327. inline void NFLlib::mulmodPolyNTTShoup(poly64 rop, poly64 op1,poly64 op2,poly64 op2prime)
  328. {
  329. for(unsigned short currentModulus=0;currentModulus<nbModuli;currentModulus++) {
  330. //#pragma omp parallel for
  331. for (unsigned i = 0 ; i < polyDegree ;i++) {
  332. rop[i] = mulmodShoup(op1[i],op2[i],op2prime[i],moduli[currentModulus]);
  333. }
  334. rop+=polyDegree;
  335. op1+=polyDegree;
  336. op2+=polyDegree;
  337. op2prime+=polyDegree;
  338. }
  339. }
  340. // Same as mulmodPolyNTT but with fused multiplication and addition
  341. inline void NFLlib::mulandaddPolyNTT(poly64 rop, poly64 op1,poly64 op2)
  342. {
  343. for(unsigned short currentModulus=0;currentModulus<nbModuli;currentModulus++) {
  344. for (unsigned i = 0 ; i < polyDegree ;i++)
  345. {
  346. mulandadd(rop[i],op1[i],op2[i],moduli[currentModulus]);
  347. }
  348. rop+=polyDegree;
  349. op1+=polyDegree;
  350. op2+=polyDegree;
  351. }
  352. }
  353. // Same as mulmodPolyNTTShoup but with fused multiplication and addition
  354. inline void NFLlib::mulandaddPolyNTTShoup(poly64 rop, poly64 op1,poly64 op2,poly64 op2prime)
  355. {
  356. for(unsigned short currentModulus=0;currentModulus<nbModuli;currentModulus++) {
  357. //#pragma omp parallel for
  358. for (unsigned i = 0 ; i < polyDegree ;i++) {
  359. mulandaddShoup(rop[i],op1[i],op2[i],op2prime[i],moduli[currentModulus]);
  360. }
  361. rop+=polyDegree;
  362. op1+=polyDegree;
  363. op2+=polyDegree;
  364. op2prime+=polyDegree;
  365. }
  366. }
  367. // *********************************************************
  368. // Number-Theoretic Functions
  369. // *********************************************************
  370. // *****************************************************************
  371. // NTT functions from David Harvey
  372. // From : http://web.maths.unsw.edu.au/~davidharvey/papers/fastntt/
  373. // Most of the functions have been modified for our needs
  374. // The associated copyright notice is at the beginning of this file
  375. // *****************************************************************
  376. // Number-Theoretic Transform: replaces a coefficient representation of a polynomial
  377. // by the values of the polynomial on a set of points. Allows to do coefficient wise
  378. // multiplication of polynomials instead of the usual convolution product
  379. // - x points to the polynomial to which we will apply the NTT
  380. // - wtab is a pre-computed table with the powers of a root of unity
  381. // - winvtab is another pre-computed table with the powers of the inverse of a root of unity
  382. // - K is the log_2 of the degree of the polynomial
  383. // - p is the modulus coefficient operations are done with
  384. // The NTT is computed in-place on x, so x is the output
  385. inline void NFLlib::ntt(uint64_t* x, const uint64_t* wtab, const uint64_t* winvtab,
  386. const unsigned K, const uint64_t p)
  387. {
  388. if (K == 1)
  389. return;
  390. // special case
  391. if (K == 2)
  392. {
  393. uint64_t u0 = x[0];
  394. uint64_t u1 = x[1];
  395. uint64_t t0 = u0 + u1;
  396. uint64_t t1 = u0 - u1;
  397. t0 -= (t0 >= 2*p) ? (2*p) : 0;
  398. t1 += ((int64_t) t1 < 0) ? (2*p) : 0;
  399. x[0] = t0;
  400. x[1] = t1;
  401. return;
  402. }
  403. size_t N0 = K; // size of block
  404. //size_t M0 = 1; // number of blocks
  405. // log2(N)-2
  406. const int J = _bit_scan_reverse(K)-2;
  407. for (int w = 0; w < J; w++) {
  408. const size_t M = 1 << w;
  409. const size_t N = N0 >> w;
  410. //#pragma omp parallel for schedule(static) num_threads(4)
  411. for (size_t r = 0; r < M; r++) {
  412. for (size_t i = 0; i < N/2; i++) {
  413. uint64_t u0 = x[N * r + i];
  414. uint64_t u1 = x[N * r + i + N/2];
  415. uint64_t t0 = u0 + u1;
  416. t0 -= ((t0 >= 2*p) ? (2*p) : 0);
  417. uint64_t t1 = u0 - u1 + 2*p;
  418. uint64_t q = ((uint128_t) t1 * winvtab[i]) >> 64;
  419. uint64_t t2 = t1 * wtab[i] - q * p;
  420. x[N * r + i] = t0;
  421. x[N * r + i + N/2] = t2;
  422. }
  423. }
  424. wtab += N/2;
  425. winvtab += N/2;
  426. }
  427. const size_t M = 1 << J;
  428. // last two layers
  429. for (size_t r = 0; r < M; r++, x += 4)
  430. {
  431. uint64_t u0 = x[0];
  432. uint64_t u1 = x[1];
  433. uint64_t u2 = x[2];
  434. uint64_t u3 = x[3];
  435. uint64_t v0 = u0 + u2;
  436. v0 -= (v0 >= 2*p) ? (2*p) : 0;
  437. uint64_t v2 = u0 - u2;
  438. v2 += ((int64_t) v2 < 0) ? (2*p) : 0;
  439. uint64_t v1 = u1 + u3;
  440. v1 -= (v1 >= 2*p) ? (2*p) : 0;
  441. uint64_t t = u1 - u3 + 2*p;
  442. uint64_t q = ((uint128_t) t * winvtab[1]) >> 64;
  443. uint64_t v3 = t * wtab[1] - q * p;
  444. uint64_t z0 = v0 + v1;
  445. z0 -= (z0 >= 2*p) ? (2*p) : 0;
  446. uint64_t z1 = v0 - v1;
  447. z1 += ((int64_t) z1 < 0) ? (2*p) : 0;
  448. uint64_t z2 = v2 + v3;
  449. z2 -= (z2 >= 2*p) ? (2*p) : 0;
  450. uint64_t z3 = v2 - v3;
  451. z3 += ((int64_t) z3 < 0) ? (2*p) : 0;
  452. x[0] = z0 ;
  453. x[1] = z1 ;
  454. x[2] = z2 ;
  455. x[3] = z3 ;
  456. }
  457. }
  458. inline static void permut(uint64_t* const y, uint64_t const* const x, uint64_t const* const inv_indexes, size_t K)
  459. {
  460. for (size_t i = 0; i < K; i += 1)
  461. {
  462. y[inv_indexes[i]] = x[i];
  463. }
  464. }
  465. // Inverse NTT: replaces values representation by the classic coefficient representation
  466. inline void NFLlib::inv_ntt(uint64_t* const x, const uint64_t* const inv_wtab, const uint64_t* const inv_winvtab,
  467. const unsigned K, const uint64_t invK, const uint64_t p, const uint64_t* const inv_indexes)
  468. {
  469. // Do we need to align on 32 ?
  470. uint64_t* y = calloc_align<16, uint64_t>(K+1);
  471. if (K == 1)
  472. return;
  473. // bit-reverse
  474. permut(y, x, inv_indexes, K);
  475. ntt(y, inv_wtab, inv_winvtab, K, p);
  476. // bit-reverse again
  477. permut(x, y, inv_indexes, K);
  478. free(y);
  479. }
  480. // This function just computes the powers of the root of unity and its inverse
  481. // It is included here just to have all the NTT functions together
  482. inline void NFLlib::prep_wtab(uint64_t* wtab, uint64_t* wtabshoup, uint64_t w, unsigned K, uint64_t p)
  483. {
  484. while (K >= 2)
  485. {
  486. uint64_t wi = 1; // w^i
  487. for (size_t i = 0; i < K/2; i++)
  488. {
  489. *wtab++ = wi;
  490. *wtabshoup++ = ((uint128_t) wi << 64) / p;
  491. wi = mulmod(wi, w, p);
  492. }
  493. w = mulmod(w, w, p);
  494. K /= 2;
  495. }
  496. }
  497. // *****************************************************************
  498. // END OF NTT functions from David Harvey
  499. // From : http://web.maths.unsw.edu.au/~davidharvey/papers/fastntt/
  500. // Most of the functions have been modified for our needs
  501. // The associated copyright notice is at the beginning of this file
  502. // *****************************************************************
  503. // In order to have polynomial operations mod X**n + 1 as we want
  504. // we must multiply the polynomial coefficients by phi powers before doing the NTT
  505. inline void NFLlib::nttAndPowPhi(poly64 op)
  506. {
  507. for (unsigned int cm = 0 ; cm < nbModuli ; cm++)
  508. {
  509. for (unsigned int i = 0; i < polyDegree; i++)
  510. {
  511. op[i] = mulmodShoup(op[i], phis[cm][i], shoupphis[cm][i], moduli[cm]);
  512. }
  513. ntt(op, omegas[cm], shoupomegas[cm], polyDegree, moduli[cm]);
  514. op+=polyDegree;
  515. }
  516. }
  517. // In order to have polynomial operations mod X**n + 1 as we want
  518. // we must multiply the polynomial coefficients by invphi powers before doing the inverse-NTT
  519. inline void NFLlib::invnttAndPowInvPhi(poly64 op)
  520. {
  521. for(unsigned short currentModulus=0;currentModulus<nbModuli;currentModulus++)
  522. {
  523. inv_ntt(op, invomegas[currentModulus], shoupinvomegas[currentModulus],
  524. polyDegree, invpolyDegree[currentModulus] , moduli[currentModulus], inv_indexes);
  525. for (unsigned int i = 0; i < polyDegree; i++)
  526. {
  527. op[i] = mulmodShoup(op[i], invpoly_times_invphis[currentModulus][i],
  528. shoupinvpoly_times_invphis[currentModulus][i], moduli[currentModulus]);
  529. }
  530. op+=polyDegree;
  531. }
  532. }
  533. // *********************************************************
  534. // Pre-computation functions
  535. // *********************************************************
  536. // Pre-compute quotients for Shoup's multiplication for all the coefficients of a polynomial
  537. inline poly64 NFLlib::allocandcomputeShouppoly(poly64 x)
  538. {
  539. poly64 res = (poly64) malloc(polyDegree*nbModuli*sizeof(uint64_t));
  540. poly64 res_orig = res;
  541. for (unsigned short currentModulus = 0; currentModulus < nbModuli ; currentModulus++)
  542. {
  543. for (unsigned int i = 0; i < polyDegree; i++)
  544. {
  545. res[i] = ((uint128_t) x[i] << 64) / moduli[currentModulus];
  546. }
  547. res+=polyDegree;
  548. x+=polyDegree;
  549. }
  550. return res_orig;
  551. }
  552. // All the other functions are in the cpp file. A more carefull study should be done to see what needs inlining and what not.
  553. #endif