/* modm-donna-32bit.h */
/*
	Public domain by Andrew M. <liquidsun@gmail.com>
*/

/*
	Arithmetic modulo the group order n = 2^252 + 27742317777372353535851937790883648493 = 7237005577332262213973186563042994240857116359379907606001950938285454250989

	k = 32
	b = 1 << 8 = 256
	m = 2^252 + 27742317777372353535851937790883648493 = 0x1000000000000000000000000000000014def9dea2f79cd65812631a5cf5d3ed
	mu = floor( b^(k*2) / m ) = 0xfffffffffffffffffffffffffffffffeb2106215d086329a7ed9ce5a30a2c131b
*/
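/*
	Representation note: a bignum256modm is 9 little-endian limbs of 30 bits each,
	so limb i holds bits [30*i, 30*i + 30) of the value.  A fully reduced scalar
	(< m ~ 2^252) uses only the low 13 bits of the top limb, while intermediate
	values (and mu, which is about 260 bits) may use more of it.  modm_m and
	modm_mu below are simply m and mu from the comment above split into this
	radix-2^30 form.
*/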
#define bignum256modm_bits_per_limb 30
#define bignum256modm_limb_size 9

typedef uint32_t bignum256modm_element_t;
typedef bignum256modm_element_t bignum256modm[9];
static const bignum256modm modm_m = {
	0x1cf5d3ed, 0x20498c69, 0x2f79cd65, 0x37be77a8,
	0x00000014, 0x00000000, 0x00000000, 0x00000000,
	0x00001000
};

static const bignum256modm modm_mu = {
	0x0a2c131b, 0x3673968c, 0x06329a7e, 0x01885742,
	0x3fffeb21, 0x3fffffff, 0x3fffffff, 0x3fffffff,
	0x000fffff
};
static bignum256modm_element_t
lt_modm(bignum256modm_element_t a, bignum256modm_element_t b) {
	return (a - b) >> 31;
}
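/*
	lt_modm returns 1 when a < b and 0 otherwise: the operands passed to it are a
	30-bit limb plus at most a small borrow, so they stay below 2^31, and the
	unsigned subtraction wraps with bit 31 set exactly when a < b.  This keeps the
	borrow propagation in the subtractions below branch-free.
*/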
/* see HAC, Alg. 14.42 Step 4 */
static void
reduce256_modm(bignum256modm r) {
	bignum256modm t;
	bignum256modm_element_t b = 0, pb, mask;

	/* t = r - m */
	pb = 0;
	pb += modm_m[0]; b = lt_modm(r[0], pb); t[0] = (r[0] - pb + (b << 30)); pb = b;
	pb += modm_m[1]; b = lt_modm(r[1], pb); t[1] = (r[1] - pb + (b << 30)); pb = b;
	pb += modm_m[2]; b = lt_modm(r[2], pb); t[2] = (r[2] - pb + (b << 30)); pb = b;
	pb += modm_m[3]; b = lt_modm(r[3], pb); t[3] = (r[3] - pb + (b << 30)); pb = b;
	pb += modm_m[4]; b = lt_modm(r[4], pb); t[4] = (r[4] - pb + (b << 30)); pb = b;
	pb += modm_m[5]; b = lt_modm(r[5], pb); t[5] = (r[5] - pb + (b << 30)); pb = b;
	pb += modm_m[6]; b = lt_modm(r[6], pb); t[6] = (r[6] - pb + (b << 30)); pb = b;
	pb += modm_m[7]; b = lt_modm(r[7], pb); t[7] = (r[7] - pb + (b << 30)); pb = b;
	pb += modm_m[8]; b = lt_modm(r[8], pb); t[8] = (r[8] - pb + (b << 16));

	/* keep r if r was smaller than m */
	mask = b - 1;

	r[0] ^= mask & (r[0] ^ t[0]);
	r[1] ^= mask & (r[1] ^ t[1]);
	r[2] ^= mask & (r[2] ^ t[2]);
	r[3] ^= mask & (r[3] ^ t[3]);
	r[4] ^= mask & (r[4] ^ t[4]);
	r[5] ^= mask & (r[5] ^ t[5]);
	r[6] ^= mask & (r[6] ^ t[6]);
	r[7] ^= mask & (r[7] ^ t[7]);
	r[8] ^= mask & (r[8] ^ t[8]);
}
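/*
	Note: reduce256_modm performs a single conditional subtraction of m, so one
	call fully reduces r only when r < 2*m (e.g. the sum of two reduced scalars).
	barrett_reduce256_modm below calls it twice because its remainder can be as
	large as roughly 3*m.
*/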
/*
	Barrett reduction, see HAC, Alg. 14.42
	Instead of passing in x, pre-process it into q1 and r1 for efficiency
*/
static void
barrett_reduce256_modm(bignum256modm r, const bignum256modm q1, const bignum256modm r1) {
	bignum256modm q3, r2;
	uint64_t c;
	bignum256modm_element_t f, b, pb;

	/* q1 = x >> 248 = 264 bits = 9 30 bit elements
	   q2 = mu * q1
	   q3 = (q2 / 256^(32+1)) = q2 / (2^8)^(32+1) = q2 >> 264 */
	c = mul32x32_64(modm_mu[0], q1[7]) + mul32x32_64(modm_mu[1], q1[6]) + mul32x32_64(modm_mu[2], q1[5]) + mul32x32_64(modm_mu[3], q1[4]) + mul32x32_64(modm_mu[4], q1[3]) + mul32x32_64(modm_mu[5], q1[2]) + mul32x32_64(modm_mu[6], q1[1]) + mul32x32_64(modm_mu[7], q1[0]);
	c >>= 30;
	c += mul32x32_64(modm_mu[0], q1[8]) + mul32x32_64(modm_mu[1], q1[7]) + mul32x32_64(modm_mu[2], q1[6]) + mul32x32_64(modm_mu[3], q1[5]) + mul32x32_64(modm_mu[4], q1[4]) + mul32x32_64(modm_mu[5], q1[3]) + mul32x32_64(modm_mu[6], q1[2]) + mul32x32_64(modm_mu[7], q1[1]) + mul32x32_64(modm_mu[8], q1[0]);
	f = (bignum256modm_element_t)c; q3[0] = (f >> 24) & 0x3f; c >>= 30;
	c += mul32x32_64(modm_mu[1], q1[8]) + mul32x32_64(modm_mu[2], q1[7]) + mul32x32_64(modm_mu[3], q1[6]) + mul32x32_64(modm_mu[4], q1[5]) + mul32x32_64(modm_mu[5], q1[4]) + mul32x32_64(modm_mu[6], q1[3]) + mul32x32_64(modm_mu[7], q1[2]) + mul32x32_64(modm_mu[8], q1[1]);
	f = (bignum256modm_element_t)c; q3[0] |= (f << 6) & 0x3fffffff; q3[1] = (f >> 24) & 0x3f; c >>= 30;
	c += mul32x32_64(modm_mu[2], q1[8]) + mul32x32_64(modm_mu[3], q1[7]) + mul32x32_64(modm_mu[4], q1[6]) + mul32x32_64(modm_mu[5], q1[5]) + mul32x32_64(modm_mu[6], q1[4]) + mul32x32_64(modm_mu[7], q1[3]) + mul32x32_64(modm_mu[8], q1[2]);
	f = (bignum256modm_element_t)c; q3[1] |= (f << 6) & 0x3fffffff; q3[2] = (f >> 24) & 0x3f; c >>= 30;
	c += mul32x32_64(modm_mu[3], q1[8]) + mul32x32_64(modm_mu[4], q1[7]) + mul32x32_64(modm_mu[5], q1[6]) + mul32x32_64(modm_mu[6], q1[5]) + mul32x32_64(modm_mu[7], q1[4]) + mul32x32_64(modm_mu[8], q1[3]);
	f = (bignum256modm_element_t)c; q3[2] |= (f << 6) & 0x3fffffff; q3[3] = (f >> 24) & 0x3f; c >>= 30;
	c += mul32x32_64(modm_mu[4], q1[8]) + mul32x32_64(modm_mu[5], q1[7]) + mul32x32_64(modm_mu[6], q1[6]) + mul32x32_64(modm_mu[7], q1[5]) + mul32x32_64(modm_mu[8], q1[4]);
	f = (bignum256modm_element_t)c; q3[3] |= (f << 6) & 0x3fffffff; q3[4] = (f >> 24) & 0x3f; c >>= 30;
	c += mul32x32_64(modm_mu[5], q1[8]) + mul32x32_64(modm_mu[6], q1[7]) + mul32x32_64(modm_mu[7], q1[6]) + mul32x32_64(modm_mu[8], q1[5]);
	f = (bignum256modm_element_t)c; q3[4] |= (f << 6) & 0x3fffffff; q3[5] = (f >> 24) & 0x3f; c >>= 30;
	c += mul32x32_64(modm_mu[6], q1[8]) + mul32x32_64(modm_mu[7], q1[7]) + mul32x32_64(modm_mu[8], q1[6]);
	f = (bignum256modm_element_t)c; q3[5] |= (f << 6) & 0x3fffffff; q3[6] = (f >> 24) & 0x3f; c >>= 30;
	c += mul32x32_64(modm_mu[7], q1[8]) + mul32x32_64(modm_mu[8], q1[7]);
	f = (bignum256modm_element_t)c; q3[6] |= (f << 6) & 0x3fffffff; q3[7] = (f >> 24) & 0x3f; c >>= 30;
	c += mul32x32_64(modm_mu[8], q1[8]);
	f = (bignum256modm_element_t)c; q3[7] |= (f << 6) & 0x3fffffff; q3[8] = (bignum256modm_element_t)(c >> 24);

	/* r1 = (x mod 256^(32+1)) = x mod (2^8)^(32+1) = x & ((1 << 264) - 1)
	   r2 = (q3 * m) mod (256^(32+1)) = (q3 * m) & ((1 << 264) - 1) */
	c = mul32x32_64(modm_m[0], q3[0]);
	r2[0] = (bignum256modm_element_t)(c & 0x3fffffff); c >>= 30;
	c += mul32x32_64(modm_m[0], q3[1]) + mul32x32_64(modm_m[1], q3[0]);
	r2[1] = (bignum256modm_element_t)(c & 0x3fffffff); c >>= 30;
	c += mul32x32_64(modm_m[0], q3[2]) + mul32x32_64(modm_m[1], q3[1]) + mul32x32_64(modm_m[2], q3[0]);
	r2[2] = (bignum256modm_element_t)(c & 0x3fffffff); c >>= 30;
	c += mul32x32_64(modm_m[0], q3[3]) + mul32x32_64(modm_m[1], q3[2]) + mul32x32_64(modm_m[2], q3[1]) + mul32x32_64(modm_m[3], q3[0]);
	r2[3] = (bignum256modm_element_t)(c & 0x3fffffff); c >>= 30;
	c += mul32x32_64(modm_m[0], q3[4]) + mul32x32_64(modm_m[1], q3[3]) + mul32x32_64(modm_m[2], q3[2]) + mul32x32_64(modm_m[3], q3[1]) + mul32x32_64(modm_m[4], q3[0]);
	r2[4] = (bignum256modm_element_t)(c & 0x3fffffff); c >>= 30;
	c += mul32x32_64(modm_m[0], q3[5]) + mul32x32_64(modm_m[1], q3[4]) + mul32x32_64(modm_m[2], q3[3]) + mul32x32_64(modm_m[3], q3[2]) + mul32x32_64(modm_m[4], q3[1]) + mul32x32_64(modm_m[5], q3[0]);
	r2[5] = (bignum256modm_element_t)(c & 0x3fffffff); c >>= 30;
	c += mul32x32_64(modm_m[0], q3[6]) + mul32x32_64(modm_m[1], q3[5]) + mul32x32_64(modm_m[2], q3[4]) + mul32x32_64(modm_m[3], q3[3]) + mul32x32_64(modm_m[4], q3[2]) + mul32x32_64(modm_m[5], q3[1]) + mul32x32_64(modm_m[6], q3[0]);
	r2[6] = (bignum256modm_element_t)(c & 0x3fffffff); c >>= 30;
	c += mul32x32_64(modm_m[0], q3[7]) + mul32x32_64(modm_m[1], q3[6]) + mul32x32_64(modm_m[2], q3[5]) + mul32x32_64(modm_m[3], q3[4]) + mul32x32_64(modm_m[4], q3[3]) + mul32x32_64(modm_m[5], q3[2]) + mul32x32_64(modm_m[6], q3[1]) + mul32x32_64(modm_m[7], q3[0]);
	r2[7] = (bignum256modm_element_t)(c & 0x3fffffff); c >>= 30;
	c += mul32x32_64(modm_m[0], q3[8]) + mul32x32_64(modm_m[1], q3[7]) + mul32x32_64(modm_m[2], q3[6]) + mul32x32_64(modm_m[3], q3[5]) + mul32x32_64(modm_m[4], q3[4]) + mul32x32_64(modm_m[5], q3[3]) + mul32x32_64(modm_m[6], q3[2]) + mul32x32_64(modm_m[7], q3[1]) + mul32x32_64(modm_m[8], q3[0]);
	r2[8] = (bignum256modm_element_t)(c & 0xffffff);

	/* r = r1 - r2
	   if (r < 0) r += (1 << 264) */
	pb = 0;
	pb += r2[0]; b = lt_modm(r1[0], pb); r[0] = (r1[0] - pb + (b << 30)); pb = b;
	pb += r2[1]; b = lt_modm(r1[1], pb); r[1] = (r1[1] - pb + (b << 30)); pb = b;
	pb += r2[2]; b = lt_modm(r1[2], pb); r[2] = (r1[2] - pb + (b << 30)); pb = b;
	pb += r2[3]; b = lt_modm(r1[3], pb); r[3] = (r1[3] - pb + (b << 30)); pb = b;
	pb += r2[4]; b = lt_modm(r1[4], pb); r[4] = (r1[4] - pb + (b << 30)); pb = b;
	pb += r2[5]; b = lt_modm(r1[5], pb); r[5] = (r1[5] - pb + (b << 30)); pb = b;
	pb += r2[6]; b = lt_modm(r1[6], pb); r[6] = (r1[6] - pb + (b << 30)); pb = b;
	pb += r2[7]; b = lt_modm(r1[7], pb); r[7] = (r1[7] - pb + (b << 30)); pb = b;
	pb += r2[8]; b = lt_modm(r1[8], pb); r[8] = (r1[8] - pb + (b << 24));

	reduce256_modm(r);
	reduce256_modm(r);
}
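/*
	With b = 256 and k = 32 this is HAC 14.42 with q3 = floor((x >> 248) * mu / 2^264);
	that estimate undershoots floor(x / m) by at most 2, so after the mod-2^264
	subtraction above r lies in [0, 3*m), and the two reduce256_modm calls finish
	the reduction.
*/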
/* addition modulo m */
static void
add256_modm(bignum256modm r, const bignum256modm x, const bignum256modm y) {
	bignum256modm_element_t c;

	c = x[0] + y[0]; r[0] = c & 0x3fffffff; c >>= 30;
	c += x[1] + y[1]; r[1] = c & 0x3fffffff; c >>= 30;
	c += x[2] + y[2]; r[2] = c & 0x3fffffff; c >>= 30;
	c += x[3] + y[3]; r[3] = c & 0x3fffffff; c >>= 30;
	c += x[4] + y[4]; r[4] = c & 0x3fffffff; c >>= 30;
	c += x[5] + y[5]; r[5] = c & 0x3fffffff; c >>= 30;
	c += x[6] + y[6]; r[6] = c & 0x3fffffff; c >>= 30;
	c += x[7] + y[7]; r[7] = c & 0x3fffffff; c >>= 30;
	c += x[8] + y[8]; r[8] = c;

	reduce256_modm(r);
}
/* multiplication modulo m */
static void
mul256_modm(bignum256modm r, const bignum256modm x, const bignum256modm y) {
	bignum256modm r1, q1;
	uint64_t c;
	bignum256modm_element_t f;

	/* r1 = (x mod 256^(32+1)) = x mod (2^8)^(32+1) = x & ((1 << 264) - 1)
	   q1 = x >> 248 = 264 bits = 9 30 bit elements */
	c = mul32x32_64(x[0], y[0]);
	f = (bignum256modm_element_t)c; r1[0] = (f & 0x3fffffff); c >>= 30;
	c += mul32x32_64(x[0], y[1]) + mul32x32_64(x[1], y[0]);
	f = (bignum256modm_element_t)c; r1[1] = (f & 0x3fffffff); c >>= 30;
	c += mul32x32_64(x[0], y[2]) + mul32x32_64(x[1], y[1]) + mul32x32_64(x[2], y[0]);
	f = (bignum256modm_element_t)c; r1[2] = (f & 0x3fffffff); c >>= 30;
	c += mul32x32_64(x[0], y[3]) + mul32x32_64(x[1], y[2]) + mul32x32_64(x[2], y[1]) + mul32x32_64(x[3], y[0]);
	f = (bignum256modm_element_t)c; r1[3] = (f & 0x3fffffff); c >>= 30;
	c += mul32x32_64(x[0], y[4]) + mul32x32_64(x[1], y[3]) + mul32x32_64(x[2], y[2]) + mul32x32_64(x[3], y[1]) + mul32x32_64(x[4], y[0]);
	f = (bignum256modm_element_t)c; r1[4] = (f & 0x3fffffff); c >>= 30;
	c += mul32x32_64(x[0], y[5]) + mul32x32_64(x[1], y[4]) + mul32x32_64(x[2], y[3]) + mul32x32_64(x[3], y[2]) + mul32x32_64(x[4], y[1]) + mul32x32_64(x[5], y[0]);
	f = (bignum256modm_element_t)c; r1[5] = (f & 0x3fffffff); c >>= 30;
	c += mul32x32_64(x[0], y[6]) + mul32x32_64(x[1], y[5]) + mul32x32_64(x[2], y[4]) + mul32x32_64(x[3], y[3]) + mul32x32_64(x[4], y[2]) + mul32x32_64(x[5], y[1]) + mul32x32_64(x[6], y[0]);
	f = (bignum256modm_element_t)c; r1[6] = (f & 0x3fffffff); c >>= 30;
	c += mul32x32_64(x[0], y[7]) + mul32x32_64(x[1], y[6]) + mul32x32_64(x[2], y[5]) + mul32x32_64(x[3], y[4]) + mul32x32_64(x[4], y[3]) + mul32x32_64(x[5], y[2]) + mul32x32_64(x[6], y[1]) + mul32x32_64(x[7], y[0]);
	f = (bignum256modm_element_t)c; r1[7] = (f & 0x3fffffff); c >>= 30;
	c += mul32x32_64(x[0], y[8]) + mul32x32_64(x[1], y[7]) + mul32x32_64(x[2], y[6]) + mul32x32_64(x[3], y[5]) + mul32x32_64(x[4], y[4]) + mul32x32_64(x[5], y[3]) + mul32x32_64(x[6], y[2]) + mul32x32_64(x[7], y[1]) + mul32x32_64(x[8], y[0]);
	f = (bignum256modm_element_t)c; r1[8] = (f & 0x00ffffff); q1[0] = (f >> 8) & 0x3fffff; c >>= 30;
	c += mul32x32_64(x[1], y[8]) + mul32x32_64(x[2], y[7]) + mul32x32_64(x[3], y[6]) + mul32x32_64(x[4], y[5]) + mul32x32_64(x[5], y[4]) + mul32x32_64(x[6], y[3]) + mul32x32_64(x[7], y[2]) + mul32x32_64(x[8], y[1]);
	f = (bignum256modm_element_t)c; q1[0] = (q1[0] | (f << 22)) & 0x3fffffff; q1[1] = (f >> 8) & 0x3fffff; c >>= 30;
	c += mul32x32_64(x[2], y[8]) + mul32x32_64(x[3], y[7]) + mul32x32_64(x[4], y[6]) + mul32x32_64(x[5], y[5]) + mul32x32_64(x[6], y[4]) + mul32x32_64(x[7], y[3]) + mul32x32_64(x[8], y[2]);
	f = (bignum256modm_element_t)c; q1[1] = (q1[1] | (f << 22)) & 0x3fffffff; q1[2] = (f >> 8) & 0x3fffff; c >>= 30;
	c += mul32x32_64(x[3], y[8]) + mul32x32_64(x[4], y[7]) + mul32x32_64(x[5], y[6]) + mul32x32_64(x[6], y[5]) + mul32x32_64(x[7], y[4]) + mul32x32_64(x[8], y[3]);
	f = (bignum256modm_element_t)c; q1[2] = (q1[2] | (f << 22)) & 0x3fffffff; q1[3] = (f >> 8) & 0x3fffff; c >>= 30;
	c += mul32x32_64(x[4], y[8]) + mul32x32_64(x[5], y[7]) + mul32x32_64(x[6], y[6]) + mul32x32_64(x[7], y[5]) + mul32x32_64(x[8], y[4]);
	f = (bignum256modm_element_t)c; q1[3] = (q1[3] | (f << 22)) & 0x3fffffff; q1[4] = (f >> 8) & 0x3fffff; c >>= 30;
	c += mul32x32_64(x[5], y[8]) + mul32x32_64(x[6], y[7]) + mul32x32_64(x[7], y[6]) + mul32x32_64(x[8], y[5]);
	f = (bignum256modm_element_t)c; q1[4] = (q1[4] | (f << 22)) & 0x3fffffff; q1[5] = (f >> 8) & 0x3fffff; c >>= 30;
	c += mul32x32_64(x[6], y[8]) + mul32x32_64(x[7], y[7]) + mul32x32_64(x[8], y[6]);
	f = (bignum256modm_element_t)c; q1[5] = (q1[5] | (f << 22)) & 0x3fffffff; q1[6] = (f >> 8) & 0x3fffff; c >>= 30;
	c += mul32x32_64(x[7], y[8]) + mul32x32_64(x[8], y[7]);
	f = (bignum256modm_element_t)c; q1[6] = (q1[6] | (f << 22)) & 0x3fffffff; q1[7] = (f >> 8) & 0x3fffff; c >>= 30;
	c += mul32x32_64(x[8], y[8]);
	f = (bignum256modm_element_t)c; q1[7] = (q1[7] | (f << 22)) & 0x3fffffff; q1[8] = (f >> 8) & 0x3fffff;

	barrett_reduce256_modm(r, q1, r1);
}
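/*
	Usage sketch (illustrative only; bytes_a, bytes_b and out are placeholder names
	for 32-byte little-endian buffers, and the two inputs are assumed to already be
	reduced below m).  It relies on expand_raw256_modm and contract256_modm defined
	below, plus the library's mul32x32_64 / U8TO32_LE / U32TO8_LE helpers:

		bignum256modm a, b, ab;
		unsigned char out[32];
		expand_raw256_modm(a, bytes_a);
		expand_raw256_modm(b, bytes_b);
		mul256_modm(ab, a, b);
		contract256_modm(out, ab);

	after which out holds (a * b) mod m, little-endian.
*/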
static void
expand256_modm(bignum256modm out, const unsigned char *in, size_t len) {
	unsigned char work[64] = {0};
	bignum256modm_element_t x[16];
	bignum256modm q1;

	memcpy(work, in, len);
	x[0] = U8TO32_LE(work + 0);
	x[1] = U8TO32_LE(work + 4);
	x[2] = U8TO32_LE(work + 8);
	x[3] = U8TO32_LE(work + 12);
	x[4] = U8TO32_LE(work + 16);
	x[5] = U8TO32_LE(work + 20);
	x[6] = U8TO32_LE(work + 24);
	x[7] = U8TO32_LE(work + 28);
	x[8] = U8TO32_LE(work + 32);
	x[9] = U8TO32_LE(work + 36);
	x[10] = U8TO32_LE(work + 40);
	x[11] = U8TO32_LE(work + 44);
	x[12] = U8TO32_LE(work + 48);
	x[13] = U8TO32_LE(work + 52);
	x[14] = U8TO32_LE(work + 56);
	x[15] = U8TO32_LE(work + 60);

	/* r1 = (x mod 256^(32+1)) = x mod (2^8)^(32+1) = x & ((1 << 264) - 1) */
	out[0] = ( x[0]) & 0x3fffffff;
	out[1] = ((x[ 0] >> 30) | (x[ 1] << 2)) & 0x3fffffff;
	out[2] = ((x[ 1] >> 28) | (x[ 2] << 4)) & 0x3fffffff;
	out[3] = ((x[ 2] >> 26) | (x[ 3] << 6)) & 0x3fffffff;
	out[4] = ((x[ 3] >> 24) | (x[ 4] << 8)) & 0x3fffffff;
	out[5] = ((x[ 4] >> 22) | (x[ 5] << 10)) & 0x3fffffff;
	out[6] = ((x[ 5] >> 20) | (x[ 6] << 12)) & 0x3fffffff;
	out[7] = ((x[ 6] >> 18) | (x[ 7] << 14)) & 0x3fffffff;
	out[8] = ((x[ 7] >> 16) | (x[ 8] << 16)) & 0x00ffffff;

	/* 8*31 = 248 bits, no need to reduce */
	if (len < 32)
		return;

	/* q1 = x >> 248 = 264 bits = 9 30 bit elements */
	q1[0] = ((x[ 7] >> 24) | (x[ 8] << 8)) & 0x3fffffff;
	q1[1] = ((x[ 8] >> 22) | (x[ 9] << 10)) & 0x3fffffff;
	q1[2] = ((x[ 9] >> 20) | (x[10] << 12)) & 0x3fffffff;
	q1[3] = ((x[10] >> 18) | (x[11] << 14)) & 0x3fffffff;
	q1[4] = ((x[11] >> 16) | (x[12] << 16)) & 0x3fffffff;
	q1[5] = ((x[12] >> 14) | (x[13] << 18)) & 0x3fffffff;
	q1[6] = ((x[13] >> 12) | (x[14] << 20)) & 0x3fffffff;
	q1[7] = ((x[14] >> 10) | (x[15] << 22)) & 0x3fffffff;
	q1[8] = ((x[15] >> 8) );

	barrett_reduce256_modm(out, q1, out);
}
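/*
	expand256_modm zero-pads its input to 64 bytes and Barrett-reduces it, so any
	len <= 64 is accepted; this is what lets the surrounding Ed25519 code reduce
	64-byte hash outputs (SHA-512 in standard Ed25519) as well as 32-byte scalars
	mod m.  When len < 32 the value is below 2^248 < m, hence the early return
	without reduction.
*/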
static void
expand_raw256_modm(bignum256modm out, const unsigned char in[32]) {
	bignum256modm_element_t x[8];

	x[0] = U8TO32_LE(in + 0);
	x[1] = U8TO32_LE(in + 4);
	x[2] = U8TO32_LE(in + 8);
	x[3] = U8TO32_LE(in + 12);
	x[4] = U8TO32_LE(in + 16);
	x[5] = U8TO32_LE(in + 20);
	x[6] = U8TO32_LE(in + 24);
	x[7] = U8TO32_LE(in + 28);

	out[0] = ( x[0]) & 0x3fffffff;
	out[1] = ((x[ 0] >> 30) | (x[ 1] << 2)) & 0x3fffffff;
	out[2] = ((x[ 1] >> 28) | (x[ 2] << 4)) & 0x3fffffff;
	out[3] = ((x[ 2] >> 26) | (x[ 3] << 6)) & 0x3fffffff;
	out[4] = ((x[ 3] >> 24) | (x[ 4] << 8)) & 0x3fffffff;
	out[5] = ((x[ 4] >> 22) | (x[ 5] << 10)) & 0x3fffffff;
	out[6] = ((x[ 5] >> 20) | (x[ 6] << 12)) & 0x3fffffff;
	out[7] = ((x[ 6] >> 18) | (x[ 7] << 14)) & 0x3fffffff;
	out[8] = ((x[ 7] >> 16) ) & 0x0000ffff;
}
static void
contract256_modm(unsigned char out[32], const bignum256modm in) {
	U32TO8_LE(out + 0, (in[0] ) | (in[1] << 30));
	U32TO8_LE(out + 4, (in[1] >> 2) | (in[2] << 28));
	U32TO8_LE(out + 8, (in[2] >> 4) | (in[3] << 26));
	U32TO8_LE(out + 12, (in[3] >> 6) | (in[4] << 24));
	U32TO8_LE(out + 16, (in[4] >> 8) | (in[5] << 22));
	U32TO8_LE(out + 20, (in[5] >> 10) | (in[6] << 20));
	U32TO8_LE(out + 24, (in[6] >> 12) | (in[7] << 18));
	U32TO8_LE(out + 28, (in[7] >> 14) | (in[8] << 16));
}
static void
contract256_window4_modm(signed char r[64], const bignum256modm in) {
	char carry;
	signed char *quads = r;
	bignum256modm_element_t i, j, v;

	for (i = 0; i < 8; i += 2) {
		v = in[i];
		for (j = 0; j < 7; j++) {
			*quads++ = (v & 15);
			v >>= 4;
		}
		v |= (in[i+1] << 2);
		for (j = 0; j < 8; j++) {
			*quads++ = (v & 15);
			v >>= 4;
		}
	}
	v = in[8];
	*quads++ = (v & 15); v >>= 4;
	*quads++ = (v & 15); v >>= 4;
	*quads++ = (v & 15); v >>= 4;
	*quads++ = (v & 15); v >>= 4;

	/* making it signed */
	carry = 0;
	for(i = 0; i < 63; i++) {
		r[i] += carry;
		r[i+1] += (r[i] >> 4);
		r[i] &= 15;
		carry = (r[i] >> 3);
		r[i] -= (carry << 4);
	}
	r[63] += carry;
}
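/*
	contract256_window4_modm emits the scalar as 64 radix-16 digits (four bits per
	digit, least significant digit first), then recodes them so r[0..62] lie in
	[-8, 7] with the final carry absorbed into r[63].  This signed fixed-window
	form is what the windowed basepoint scalar multiplication elsewhere in the
	library consumes.
*/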
static void
contract256_slidingwindow_modm(signed char r[256], const bignum256modm s, int windowsize) {
	int i,j,k,b;
	int m = (1 << (windowsize - 1)) - 1;
	const int soplen = 256;
	signed char *bits = r;
	bignum256modm_element_t v;

	/* first put the binary expansion into r */
	for (i = 0; i < 8; i++) {
		v = s[i];
		for (j = 0; j < 30; j++, v >>= 1)
			*bits++ = (v & 1);
	}
	v = s[8];
	for (j = 0; j < 16; j++, v >>= 1)
		*bits++ = (v & 1);

	/* Making it sliding window */
	for (j = 0; j < soplen; j++) {
		if (!r[j])
			continue;

		for (b = 1; (b < (soplen - j)) && (b <= 6); b++) {
			if ((r[j] + (r[j + b] << b)) <= m) {
				r[j] += r[j + b] << b;
				r[j + b] = 0;
			} else if ((r[j] - (r[j + b] << b)) >= -m) {
				r[j] -= r[j + b] << b;
				for (k = j + b; k < soplen; k++) {
					if (!r[k]) {
						r[k] = 1;
						break;
					}
					r[k] = 0;
				}
			} else if (r[j + b]) {
				break;
			}
		}
	}
}
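/*
	The result is a signed sliding-window expansion of s: 256 entries, one per bit,
	where every nonzero entry is odd and bounded in magnitude by
	m = 2^(windowsize-1) - 1.  The variable-time double scalar multiplication in
	this library typically calls it with windowsize 5, giving digits in
	{+-1, +-3, ..., +-15}.
*/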
/*
	helpers for batch verification; these are allowed to be vartime
*/

/* out = a - b, a must be larger than b */
static void
sub256_modm_batch(bignum256modm out, const bignum256modm a, const bignum256modm b, size_t limbsize) {
	size_t i = 0;
	bignum256modm_element_t carry = 0;
	switch (limbsize) {
		case 8: out[i] = (a[i] - b[i]) - carry; carry = (out[i] >> 31); out[i] &= 0x3fffffff; i++; /* Falls through. */
		case 7: out[i] = (a[i] - b[i]) - carry; carry = (out[i] >> 31); out[i] &= 0x3fffffff; i++; /* Falls through. */
		case 6: out[i] = (a[i] - b[i]) - carry; carry = (out[i] >> 31); out[i] &= 0x3fffffff; i++; /* Falls through. */
		case 5: out[i] = (a[i] - b[i]) - carry; carry = (out[i] >> 31); out[i] &= 0x3fffffff; i++; /* Falls through. */
		case 4: out[i] = (a[i] - b[i]) - carry; carry = (out[i] >> 31); out[i] &= 0x3fffffff; i++; /* Falls through. */
		case 3: out[i] = (a[i] - b[i]) - carry; carry = (out[i] >> 31); out[i] &= 0x3fffffff; i++; /* Falls through. */
		case 2: out[i] = (a[i] - b[i]) - carry; carry = (out[i] >> 31); out[i] &= 0x3fffffff; i++; /* Falls through. */
		case 1: out[i] = (a[i] - b[i]) - carry; carry = (out[i] >> 31); out[i] &= 0x3fffffff; i++; /* Falls through. */
		case 0:
		default: out[i] = (a[i] - b[i]) - carry;
	}
}
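/*
	In the subtraction above, "carry" is really the borrow: limbs are at most 30
	bits, so when a[i] - b[i] - carry wraps below zero, bit 31 of the unsigned
	result is set, and the 0x3fffffff mask folds the wrap back into the limb.  The
	caller guarantees a >= b, so any borrow out of the top limb is discarded.
*/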
/* is a < b */
static int
lt256_modm_batch(const bignum256modm a, const bignum256modm b, size_t limbsize) {
	switch (limbsize) {
		case 8: if (a[8] > b[8]) return 0; if (a[8] < b[8]) return 1; /* Falls through. */
		case 7: if (a[7] > b[7]) return 0; if (a[7] < b[7]) return 1; /* Falls through. */
		case 6: if (a[6] > b[6]) return 0; if (a[6] < b[6]) return 1; /* Falls through. */
		case 5: if (a[5] > b[5]) return 0; if (a[5] < b[5]) return 1; /* Falls through. */
		case 4: if (a[4] > b[4]) return 0; if (a[4] < b[4]) return 1; /* Falls through. */
		case 3: if (a[3] > b[3]) return 0; if (a[3] < b[3]) return 1; /* Falls through. */
		case 2: if (a[2] > b[2]) return 0; if (a[2] < b[2]) return 1; /* Falls through. */
		case 1: if (a[1] > b[1]) return 0; if (a[1] < b[1]) return 1; /* Falls through. */
		case 0: if (a[0] > b[0]) return 0; if (a[0] < b[0]) return 1;
	}
	return 0;
}
/* is a <= b */
static int
lte256_modm_batch(const bignum256modm a, const bignum256modm b, size_t limbsize) {
	switch (limbsize) {
		case 8: if (a[8] > b[8]) return 0; if (a[8] < b[8]) return 1; /* Falls through. */
		case 7: if (a[7] > b[7]) return 0; if (a[7] < b[7]) return 1; /* Falls through. */
		case 6: if (a[6] > b[6]) return 0; if (a[6] < b[6]) return 1; /* Falls through. */
		case 5: if (a[5] > b[5]) return 0; if (a[5] < b[5]) return 1; /* Falls through. */
		case 4: if (a[4] > b[4]) return 0; if (a[4] < b[4]) return 1; /* Falls through. */
		case 3: if (a[3] > b[3]) return 0; if (a[3] < b[3]) return 1; /* Falls through. */
		case 2: if (a[2] > b[2]) return 0; if (a[2] < b[2]) return 1; /* Falls through. */
		case 1: if (a[1] > b[1]) return 0; if (a[1] < b[1]) return 1; /* Falls through. */
		case 0: if (a[0] > b[0]) return 0; if (a[0] < b[0]) return 1;
	}
	return 1;
}
/* is a == 0 */
static int
iszero256_modm_batch(const bignum256modm a) {
	size_t i;
	for (i = 0; i < 9; i++)
		if (a[i])
			return 0;
	return 1;
}
/* is a == 1 */
static int
isone256_modm_batch(const bignum256modm a) {
	size_t i;
	if (a[0] != 1)
		return 0;
	for (i = 1; i < 9; i++)
		if (a[i])
			return 0;
	return 1;
}
/* can a fit into (at most) 128 bits */
static int
isatmost128bits256_modm_batch(const bignum256modm a) {
	uint32_t mask =
		((a[8] ) | /* 16 */
		 (a[7] ) | /* 46 */
		 (a[6] ) | /* 76 */
		 (a[5] ) | /* 106 */
		 (a[4] & 0x3fffff00)); /* 128 */

	return (mask == 0);
}
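/*
	The inline counts above are cumulative bits at or above bit 128: limb 8 holds
	bits 240..255 (16), limbs 7..5 add 30 bits each (46, 76, 106), and the
	0x3fffff00 mask keeps bits 8..29 of limb 4, i.e. absolute bits 128..149 (128
	total).  The OR of all of these is zero exactly when a < 2^128.
*/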