/* modm-donna-64bit.h */

/*
	Public domain by Andrew M. <liquidsun@gmail.com>
*/

/*
	Arithmetic modulo the group order n = 2^252 + 27742317777372353535851937790883648493 = 7237005577332262213973186563042994240857116359379907606001950938285454250989

	k = 32
	b = 1 << 8 = 256
	m = 2^252 + 27742317777372353535851937790883648493 = 0x1000000000000000000000000000000014def9dea2f79cd65812631a5cf5d3ed
	mu = floor( b^(k*2) / m ) = 0xfffffffffffffffffffffffffffffffeb2106215d086329a7ed9ce5a30a2c131b
*/
#define bignum256modm_bits_per_limb 56
#define bignum256modm_limb_size 5

typedef uint64_t bignum256modm_element_t;
typedef bignum256modm_element_t bignum256modm[5];

static const bignum256modm modm_m = {
	0x12631a5cf5d3ed,
	0xf9dea2f79cd658,
	0x000000000014de,
	0x00000000000000,
	0x00000010000000
};

static const bignum256modm modm_mu = {
	0x9ce5a30a2c131b,
	0x215d086329a7ed,
	0xffffffffeb2106,
	0xffffffffffffff,
	0x00000fffffffff
};
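/*
	Layout: a bignum256modm stores a value in five little-endian limbs of 56 bits,
	value = limb[0] + limb[1]*2^56 + limb[2]*2^112 + limb[3]*2^168 + limb[4]*2^224.
	modm_m encodes m above (its top limb is 2^28, since m is just over 2^252) and
	modm_mu encodes the 260-bit Barrett constant mu = floor(2^512 / m).
*/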
static bignum256modm_element_t
lt_modm(bignum256modm_element_t a, bignum256modm_element_t b) {
	return (a - b) >> 63;
}
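/*
	The subtraction above wraps modulo 2^64, so its top bit acts as a borrow flag:
	the limbs (plus pending borrows) handled in this file stay well below 2^63,
	hence (a - b) >> 63 is 1 exactly when a < b and 0 otherwise.
*/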
static void
reduce256_modm(bignum256modm r) {
	bignum256modm t;
	bignum256modm_element_t b = 0, pb, mask;

	/* t = r - m */
	pb = 0;
	pb += modm_m[0]; b = lt_modm(r[0], pb); t[0] = (r[0] - pb + (b << 56)); pb = b;
	pb += modm_m[1]; b = lt_modm(r[1], pb); t[1] = (r[1] - pb + (b << 56)); pb = b;
	pb += modm_m[2]; b = lt_modm(r[2], pb); t[2] = (r[2] - pb + (b << 56)); pb = b;
	pb += modm_m[3]; b = lt_modm(r[3], pb); t[3] = (r[3] - pb + (b << 56)); pb = b;
	pb += modm_m[4]; b = lt_modm(r[4], pb); t[4] = (r[4] - pb + (b << 32));

	/* keep r if r was smaller than m */
	mask = b - 1;

	r[0] ^= mask & (r[0] ^ t[0]);
	r[1] ^= mask & (r[1] ^ t[1]);
	r[2] ^= mask & (r[2] ^ t[2]);
	r[3] ^= mask & (r[3] ^ t[3]);
	r[4] ^= mask & (r[4] ^ t[4]);
}
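/*
	Illustrative sketch (not part of the original file): a value equal to m reduces
	to zero, since the subtraction above produces no final borrow and the mask then
	selects t = r - m. Assumes memcpy is available, as elsewhere in this file;
	reduce256_modm_sketch is a hypothetical helper name.
*/
static int
reduce256_modm_sketch(void) {
	bignum256modm t;
	int i, ok = 1;
	memcpy(t, modm_m, sizeof(bignum256modm));
	reduce256_modm(t);              /* t was exactly m, so t mod m == 0 */
	for (i = 0; i < 5; i++)
		ok &= (t[i] == 0);
	return ok;
}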
static void
barrett_reduce256_modm(bignum256modm r, const bignum256modm q1, const bignum256modm r1) {
	bignum256modm q3, r2;
	uint128_t c, mul;
	bignum256modm_element_t f, b, pb;

	/* q1 = x >> 248 = 264 bits = 5 56-bit elements
	   q2 = mu * q1
	   q3 = (q2 / 256^(32+1)) = q2 / (2^8)^(32+1) = q2 >> 264 */
	mul64x64_128(c, modm_mu[0], q1[3]) mul64x64_128(mul, modm_mu[3], q1[0]) add128(c, mul) mul64x64_128(mul, modm_mu[1], q1[2]) add128(c, mul) mul64x64_128(mul, modm_mu[2], q1[1]) add128(c, mul) shr128(f, c, 56);
	mul64x64_128(c, modm_mu[0], q1[4]) add128_64(c, f) mul64x64_128(mul, modm_mu[4], q1[0]) add128(c, mul) mul64x64_128(mul, modm_mu[3], q1[1]) add128(c, mul) mul64x64_128(mul, modm_mu[1], q1[3]) add128(c, mul) mul64x64_128(mul, modm_mu[2], q1[2]) add128(c, mul)
	f = lo128(c); q3[0] = (f >> 40) & 0xffff; shr128(f, c, 56);
	mul64x64_128(c, modm_mu[4], q1[1]) add128_64(c, f) mul64x64_128(mul, modm_mu[1], q1[4]) add128(c, mul) mul64x64_128(mul, modm_mu[2], q1[3]) add128(c, mul) mul64x64_128(mul, modm_mu[3], q1[2]) add128(c, mul)
	f = lo128(c); q3[0] |= (f << 16) & 0xffffffffffffff; q3[1] = (f >> 40) & 0xffff; shr128(f, c, 56);
	mul64x64_128(c, modm_mu[4], q1[2]) add128_64(c, f) mul64x64_128(mul, modm_mu[2], q1[4]) add128(c, mul) mul64x64_128(mul, modm_mu[3], q1[3]) add128(c, mul)
	f = lo128(c); q3[1] |= (f << 16) & 0xffffffffffffff; q3[2] = (f >> 40) & 0xffff; shr128(f, c, 56);
	mul64x64_128(c, modm_mu[4], q1[3]) add128_64(c, f) mul64x64_128(mul, modm_mu[3], q1[4]) add128(c, mul)
	f = lo128(c); q3[2] |= (f << 16) & 0xffffffffffffff; q3[3] = (f >> 40) & 0xffff; shr128(f, c, 56);
	mul64x64_128(c, modm_mu[4], q1[4]) add128_64(c, f)
	f = lo128(c); q3[3] |= (f << 16) & 0xffffffffffffff; q3[4] = (f >> 40) & 0xffff; shr128(f, c, 56);
	q3[4] |= (f << 16);

	mul64x64_128(c, modm_m[0], q3[0])
	r2[0] = lo128(c) & 0xffffffffffffff; shr128(f, c, 56);
	mul64x64_128(c, modm_m[0], q3[1]) add128_64(c, f) mul64x64_128(mul, modm_m[1], q3[0]) add128(c, mul)
	r2[1] = lo128(c) & 0xffffffffffffff; shr128(f, c, 56);
	mul64x64_128(c, modm_m[0], q3[2]) add128_64(c, f) mul64x64_128(mul, modm_m[2], q3[0]) add128(c, mul) mul64x64_128(mul, modm_m[1], q3[1]) add128(c, mul)
	r2[2] = lo128(c) & 0xffffffffffffff; shr128(f, c, 56);
	mul64x64_128(c, modm_m[0], q3[3]) add128_64(c, f) mul64x64_128(mul, modm_m[3], q3[0]) add128(c, mul) mul64x64_128(mul, modm_m[1], q3[2]) add128(c, mul) mul64x64_128(mul, modm_m[2], q3[1]) add128(c, mul)
	r2[3] = lo128(c) & 0xffffffffffffff; shr128(f, c, 56);
	mul64x64_128(c, modm_m[0], q3[4]) add128_64(c, f) mul64x64_128(mul, modm_m[4], q3[0]) add128(c, mul) mul64x64_128(mul, modm_m[3], q3[1]) add128(c, mul) mul64x64_128(mul, modm_m[1], q3[3]) add128(c, mul) mul64x64_128(mul, modm_m[2], q3[2]) add128(c, mul)
	r2[4] = lo128(c) & 0x0000ffffffffff;

	pb = 0;
	pb += r2[0]; b = lt_modm(r1[0], pb); r[0] = (r1[0] - pb + (b << 56)); pb = b;
	pb += r2[1]; b = lt_modm(r1[1], pb); r[1] = (r1[1] - pb + (b << 56)); pb = b;
	pb += r2[2]; b = lt_modm(r1[2], pb); r[2] = (r1[2] - pb + (b << 56)); pb = b;
	pb += r2[3]; b = lt_modm(r1[3], pb); r[3] = (r1[3] - pb + (b << 56)); pb = b;
	pb += r2[4]; b = lt_modm(r1[4], pb); r[4] = (r1[4] - pb + (b << 40));

	reduce256_modm(r);
	reduce256_modm(r);
}
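/*
	This follows standard Barrett reduction with b = 2^8, k = 32:
	q3 = (mu * q1) >> 264 (only the high product columns are needed),
	r2 = (q3 * m) mod 2^264, and r = (r1 - r2) mod 2^264. The result can exceed
	x mod m by at most a couple of multiples of m, which the two trailing
	reduce256_modm calls remove.
*/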
static void
add256_modm(bignum256modm r, const bignum256modm x, const bignum256modm y) {
	bignum256modm_element_t c;

	c  = x[0] + y[0]; r[0] = c & 0xffffffffffffff; c >>= 56;
	c += x[1] + y[1]; r[1] = c & 0xffffffffffffff; c >>= 56;
	c += x[2] + y[2]; r[2] = c & 0xffffffffffffff; c >>= 56;
	c += x[3] + y[3]; r[3] = c & 0xffffffffffffff; c >>= 56;
	c += x[4] + y[4]; r[4] = c;

	reduce256_modm(r);
}
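/*
	The carry chain leaves r = x + y over the 56-bit limbs; the single
	reduce256_modm call then subtracts m at most once, so a fully reduced result
	assumes x + y < 2*m, i.e. both inputs are already reduced below m.
*/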
static void
mul256_modm(bignum256modm r, const bignum256modm x, const bignum256modm y) {
	bignum256modm q1, r1;
	uint128_t c, mul;
	bignum256modm_element_t f;

	mul64x64_128(c, x[0], y[0])
	f = lo128(c); r1[0] = f & 0xffffffffffffff; shr128(f, c, 56);
	mul64x64_128(c, x[0], y[1]) add128_64(c, f) mul64x64_128(mul, x[1], y[0]) add128(c, mul)
	f = lo128(c); r1[1] = f & 0xffffffffffffff; shr128(f, c, 56);
	mul64x64_128(c, x[0], y[2]) add128_64(c, f) mul64x64_128(mul, x[2], y[0]) add128(c, mul) mul64x64_128(mul, x[1], y[1]) add128(c, mul)
	f = lo128(c); r1[2] = f & 0xffffffffffffff; shr128(f, c, 56);
	mul64x64_128(c, x[0], y[3]) add128_64(c, f) mul64x64_128(mul, x[3], y[0]) add128(c, mul) mul64x64_128(mul, x[1], y[2]) add128(c, mul) mul64x64_128(mul, x[2], y[1]) add128(c, mul)
	f = lo128(c); r1[3] = f & 0xffffffffffffff; shr128(f, c, 56);
	mul64x64_128(c, x[0], y[4]) add128_64(c, f) mul64x64_128(mul, x[4], y[0]) add128(c, mul) mul64x64_128(mul, x[3], y[1]) add128(c, mul) mul64x64_128(mul, x[1], y[3]) add128(c, mul) mul64x64_128(mul, x[2], y[2]) add128(c, mul)
	f = lo128(c); r1[4] = f & 0x0000ffffffffff; q1[0] = (f >> 24) & 0xffffffff; shr128(f, c, 56);
	mul64x64_128(c, x[4], y[1]) add128_64(c, f) mul64x64_128(mul, x[1], y[4]) add128(c, mul) mul64x64_128(mul, x[2], y[3]) add128(c, mul) mul64x64_128(mul, x[3], y[2]) add128(c, mul)
	f = lo128(c); q1[0] |= (f << 32) & 0xffffffffffffff; q1[1] = (f >> 24) & 0xffffffff; shr128(f, c, 56);
	mul64x64_128(c, x[4], y[2]) add128_64(c, f) mul64x64_128(mul, x[2], y[4]) add128(c, mul) mul64x64_128(mul, x[3], y[3]) add128(c, mul)
	f = lo128(c); q1[1] |= (f << 32) & 0xffffffffffffff; q1[2] = (f >> 24) & 0xffffffff; shr128(f, c, 56);
	mul64x64_128(c, x[4], y[3]) add128_64(c, f) mul64x64_128(mul, x[3], y[4]) add128(c, mul)
	f = lo128(c); q1[2] |= (f << 32) & 0xffffffffffffff; q1[3] = (f >> 24) & 0xffffffff; shr128(f, c, 56);
	mul64x64_128(c, x[4], y[4]) add128_64(c, f)
	f = lo128(c); q1[3] |= (f << 32) & 0xffffffffffffff; q1[4] = (f >> 24) & 0xffffffff; shr128(f, c, 56);
	q1[4] |= (f << 32);

	barrett_reduce256_modm(r, q1, r1);
}
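/*
	The schoolbook product above is split on the fly into the two operands that
	barrett_reduce256_modm expects: r1 holds the low 264 bits of x*y and q1 holds
	x*y >> 248 (the 16-bit overlap between them is intentional).
*/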
static void
expand256_modm(bignum256modm out, const unsigned char *in, size_t len) {
	unsigned char work[64] = {0};
	bignum256modm_element_t x[16];
	bignum256modm q1;

	memcpy(work, in, len);
	x[0] = U8TO64_LE(work +  0);
	x[1] = U8TO64_LE(work +  8);
	x[2] = U8TO64_LE(work + 16);
	x[3] = U8TO64_LE(work + 24);
	x[4] = U8TO64_LE(work + 32);
	x[5] = U8TO64_LE(work + 40);
	x[6] = U8TO64_LE(work + 48);
	x[7] = U8TO64_LE(work + 56);

	/* r1 = (x mod 256^(32+1)) = x mod (2^8)^(32+1) = x & ((1 << 264) - 1) */
	out[0] = (                         x[0]) & 0xffffffffffffff;
	out[1] = ((x[ 0] >> 56) | (x[ 1] <<  8)) & 0xffffffffffffff;
	out[2] = ((x[ 1] >> 48) | (x[ 2] << 16)) & 0xffffffffffffff;
	out[3] = ((x[ 2] >> 40) | (x[ 3] << 24)) & 0xffffffffffffff;
	out[4] = ((x[ 3] >> 32) | (x[ 4] << 32)) & 0x0000ffffffffff;

	/* under 252 bits, no need to reduce */
	if (len < 32)
		return;

	/* q1 = x >> 248 = 264 bits */
	q1[0] = ((x[ 3] >> 56) | (x[ 4] <<  8)) & 0xffffffffffffff;
	q1[1] = ((x[ 4] >> 48) | (x[ 5] << 16)) & 0xffffffffffffff;
	q1[2] = ((x[ 5] >> 40) | (x[ 6] << 24)) & 0xffffffffffffff;
	q1[3] = ((x[ 6] >> 32) | (x[ 7] << 32)) & 0xffffffffffffff;
	q1[4] = ((x[ 7] >> 24));

	barrett_reduce256_modm(out, q1, out);
}
static void
expand_raw256_modm(bignum256modm out, const unsigned char in[32]) {
	bignum256modm_element_t x[4];

	x[0] = U8TO64_LE(in +  0);
	x[1] = U8TO64_LE(in +  8);
	x[2] = U8TO64_LE(in + 16);
	x[3] = U8TO64_LE(in + 24);

	out[0] = (                         x[0]) & 0xffffffffffffff;
	out[1] = ((x[ 0] >> 56) | (x[ 1] <<  8)) & 0xffffffffffffff;
	out[2] = ((x[ 1] >> 48) | (x[ 2] << 16)) & 0xffffffffffffff;
	out[3] = ((x[ 2] >> 40) | (x[ 3] << 24)) & 0xffffffffffffff;
	out[4] = ((x[ 3] >> 32)                ) & 0x000000ffffffff;
}
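/*
	Unlike expand256_modm, this variant only unpacks the 32 input bytes into
	56-bit limbs and masks to 256 bits; it performs no reduction mod m.
	contract256_modm below is its exact inverse.
*/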
static void
contract256_modm(unsigned char out[32], const bignum256modm in) {
	U64TO8_LE(out +  0, (in[0]      ) | (in[1] << 56));
	U64TO8_LE(out +  8, (in[1] >>  8) | (in[2] << 48));
	U64TO8_LE(out + 16, (in[2] >> 16) | (in[3] << 40));
	U64TO8_LE(out + 24, (in[3] >> 24) | (in[4] << 32));
}
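/*
	Usage sketch (illustrative; scalar_mulmod_sketch is a hypothetical helper, not
	part of this file): multiply two 32-byte little-endian scalars mod m by
	expanding, multiplying, and packing the result back into bytes. The inputs are
	first reduced by expand256_modm so the Barrett step inside mul256_modm sees
	operands below m.
*/
static void
scalar_mulmod_sketch(unsigned char out[32], const unsigned char a[32], const unsigned char b[32]) {
	bignum256modm A, B, R;
	expand256_modm(A, a, 32);   /* A = a mod m */
	expand256_modm(B, b, 32);   /* B = b mod m */
	mul256_modm(R, A, B);       /* R = (A * B) mod m */
	contract256_modm(out, R);   /* back to 32 little-endian bytes */
}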
static void
contract256_window4_modm(signed char r[64], const bignum256modm in) {
	char carry;
	signed char *quads = r;
	bignum256modm_element_t i, j, v, m;

	for (i = 0; i < 5; i++) {
		v = in[i];
		m = (i == 4) ? 8 : 14;
		for (j = 0; j < m; j++) {
			*quads++ = (v & 15);
			v >>= 4;
		}
	}

	/* making it signed */
	carry = 0;
	for (i = 0; i < 63; i++) {
		r[i] += carry;
		r[i+1] += (r[i] >> 4);
		r[i] &= 15;
		carry = (r[i] >> 3);
		r[i] -= (carry << 4);
	}
	r[63] += carry;
}
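/*
	The result is a signed radix-16 recoding: 64 digits r[0..63] with r[i] in
	[-8, 7] (any final carry lands in r[63]) such that in = sum(r[i] * 16^i).
	This form suits fixed-window scalar multiplication with a small precomputed
	table.
*/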
static void
contract256_slidingwindow_modm(signed char r[256], const bignum256modm s, int windowsize) {
	int i, j, k, b;
	int m = (1 << (windowsize - 1)) - 1;
	const int soplen = 256;
	signed char *bits = r;
	bignum256modm_element_t v;

	/* first put the binary expansion into r */
	for (i = 0; i < 4; i++) {
		v = s[i];
		for (j = 0; j < 56; j++, v >>= 1)
			*bits++ = (v & 1);
	}
	v = s[4];
	for (j = 0; j < 32; j++, v >>= 1)
		*bits++ = (v & 1);

	/* Making it sliding window */
	for (j = 0; j < soplen; j++) {
		if (!r[j])
			continue;

		for (b = 1; (b < (soplen - j)) && (b <= 6); b++) {
			/* XXX Tor: coverity scan says that r[j+b] can
			 * overflow, but that's not possible: b < (soplen-j)
			 * guarantees that b + j < soplen, so b+j < 256,
			 * so the index doesn't overflow. */
			if ((r[j] + (r[j + b] << b)) <= m) {
				r[j] += r[j + b] << b;
				r[j + b] = 0;
			} else if ((r[j] - (r[j + b] << b)) >= -m) {
				r[j] -= r[j + b] << b;
				for (k = j + b; k < soplen; k++) {
					if (!r[k]) {
						r[k] = 1;
						break;
					}
					r[k] = 0;
				}
			} else if (r[j + b]) {
				break;
			}
		}
	}
}
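/*
	Output: a signed binary recoding with s = sum(r[i] * 2^i), where every nonzero
	digit is odd and bounded by m = 2^(windowsize - 1) - 1. This is the form
	consumed by sliding-window scalar multiplication, where only odd multiples of
	the base point need to be precomputed.
*/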
/*
	helpers for batch verification; these are allowed to be variable-time
*/
/* out = a - b, a must be larger than b */
static void
sub256_modm_batch(bignum256modm out, const bignum256modm a, const bignum256modm b, size_t limbsize) {
	size_t i = 0;
	bignum256modm_element_t carry = 0;
	switch (limbsize) {
		case 4: out[i] = (a[i] - b[i])        ; carry = (out[i] >> 63); out[i] &= 0xffffffffffffff; i++; /* Falls through. */
		case 3: out[i] = (a[i] - b[i]) - carry; carry = (out[i] >> 63); out[i] &= 0xffffffffffffff; i++; /* Falls through. */
		case 2: out[i] = (a[i] - b[i]) - carry; carry = (out[i] >> 63); out[i] &= 0xffffffffffffff; i++; /* Falls through. */
		case 1: out[i] = (a[i] - b[i]) - carry; carry = (out[i] >> 63); out[i] &= 0xffffffffffffff; i++; /* Falls through. */
		case 0:
		default: out[i] = (a[i] - b[i]) - carry;
	}
}
/* is a < b */
static int
lt256_modm_batch(const bignum256modm a, const bignum256modm b, size_t limbsize) {
	size_t i = 0;
	bignum256modm_element_t t, carry = 0;
	switch (limbsize) {
		case 4: t = (a[i] - b[i])        ; carry = (t >> 63); i++; /* Falls through. */
		case 3: t = (a[i] - b[i]) - carry; carry = (t >> 63); i++; /* Falls through. */
		case 2: t = (a[i] - b[i]) - carry; carry = (t >> 63); i++; /* Falls through. */
		case 1: t = (a[i] - b[i]) - carry; carry = (t >> 63); i++; /* Falls through. */
		case 0: t = (a[i] - b[i]) - carry; carry = (t >> 63);
	}
	return (int)carry;
}
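/*
	Both comparison helpers work on the low limbsize + 1 limbs only: they run the
	limb-wise subtraction and return the final borrow, so lt256_modm_batch is 1
	exactly when a < b, and lte256_modm_batch below inverts the borrow of b - a to
	get a <= b.
*/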
/* is a <= b */
static int
lte256_modm_batch(const bignum256modm a, const bignum256modm b, size_t limbsize) {
	size_t i = 0;
	bignum256modm_element_t t, carry = 0;
	switch (limbsize) {
		case 4: t = (b[i] - a[i])        ; carry = (t >> 63); i++; /* Falls through. */
		case 3: t = (b[i] - a[i]) - carry; carry = (t >> 63); i++; /* Falls through. */
		case 2: t = (b[i] - a[i]) - carry; carry = (t >> 63); i++; /* Falls through. */
		case 1: t = (b[i] - a[i]) - carry; carry = (t >> 63); i++; /* Falls through. */
		case 0: t = (b[i] - a[i]) - carry; carry = (t >> 63);
	}
	return (int)!carry;
}
/* is a == 0 */
static int
iszero256_modm_batch(const bignum256modm a) {
	size_t i;
	for (i = 0; i < 5; i++)
		if (a[i])
			return 0;
	return 1;
}

/* is a == 1 */
static int
isone256_modm_batch(const bignum256modm a) {
	size_t i;
	for (i = 0; i < 5; i++)
		if (a[i] != ((i) ? 0 : 1))
			return 0;
	return 1;
}
/* can a fit into (at most) 128 bits */
static int
isatmost128bits256_modm_batch(const bignum256modm a) {
	uint64_t mask =
		((a[4]                   ) | /* 32 */
		 (a[3]                   ) | /* 88 */
		 (a[2] & 0xffffffffff0000));

	return (mask == 0);
}
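/*
	The mask ors together every bit at position 128 or above: all of limb 4
	(bits 224+), all of limb 3 (bits 168+), and bits 16..55 of limb 2
	(bits 128..167). a fits in 128 bits exactly when that mask is zero.
*/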