curve25519-donna.c 27 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730
  1. /* Copyright 2008, Google Inc.
  2. * All rights reserved.
  3. *
  4. * Redistribution and use in source and binary forms, with or without
  5. * modification, are permitted provided that the following conditions are
  6. * met:
  7. *
  8. * * Redistributions of source code must retain the above copyright
  9. * notice, this list of conditions and the following disclaimer.
  10. * * Redistributions in binary form must reproduce the above
  11. * copyright notice, this list of conditions and the following disclaimer
  12. * in the documentation and/or other materials provided with the
  13. * distribution.
  14. * * Neither the name of Google Inc. nor the names of its
  15. * contributors may be used to endorse or promote products derived from
  16. * this software without specific prior written permission.
  17. *
  18. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  19. * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  20. * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  21. * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  22. * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  23. * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  24. * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  25. * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  26. * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  27. * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  28. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  29. *
  30. * curve25519-donna: Curve25519 elliptic curve, public key function
  31. *
  32. * http://code.google.com/p/curve25519-donna/
  33. *
  34. * Adam Langley <agl@imperialviolet.org>
  35. *
  36. * Derived from public domain C code by Daniel J. Bernstein <djb@cr.yp.to>
  37. *
  38. * More information about curve25519 can be found here
  39. * http://cr.yp.to/ecdh.html
  40. *
  41. * djb's sample implementation of curve25519 is written in a special assembly
  42. * language called qhasm and uses the floating point registers.
  43. *
  44. * This is, almost, a clean room reimplementation from the curve25519 paper. It
  45. * uses many of the tricks described therein. Only the crecip function is taken
  46. * from the sample implementation.
  47. */
  48. #include <string.h>
  49. #include <stdint.h>
  50. typedef uint8_t u8;
  51. typedef int32_t s32;
  52. typedef int64_t limb;
  53. /* Field element representation:
  54. *
  55. * Field elements are written as an array of signed, 64-bit limbs, least
  56. * significant first. The value of the field element is:
  57. * x[0] + 2^26·x[1] + x^51·x[2] + 2^102·x[3] + ...
  58. *
  59. * i.e. the limbs are 26, 25, 26, 25, ... bits wide.
  60. */
  61. /* Sum two numbers: output += in */
  62. static void fsum(limb *output, const limb *in) {
  63. unsigned i;
  64. for (i = 0; i < 10; i += 2) {
  65. output[0+i] = (output[0+i] + in[0+i]);
  66. output[1+i] = (output[1+i] + in[1+i]);
  67. }
  68. }
  69. /* Find the difference of two numbers: output = in - output
  70. * (note the order of the arguments!)
  71. */
  72. static void fdifference(limb *output, const limb *in) {
  73. unsigned i;
  74. for (i = 0; i < 10; ++i) {
  75. output[i] = (in[i] - output[i]);
  76. }
  77. }
  78. /* Multiply a number by a scalar: output = in * scalar */
  79. static void fscalar_product(limb *output, const limb *in, const limb scalar) {
  80. unsigned i;
  81. for (i = 0; i < 10; ++i) {
  82. output[i] = in[i] * scalar;
  83. }
  84. }
  85. /* Multiply two numbers: output = in2 * in
  86. *
  87. * output must be distinct to both inputs. The inputs are reduced coefficient
  88. * form, the output is not.
  89. */
  90. static void fproduct(limb *output, const limb *in2, const limb *in) {
  91. output[0] = ((limb) ((s32) in2[0])) * ((s32) in[0]);
  92. output[1] = ((limb) ((s32) in2[0])) * ((s32) in[1]) +
  93. ((limb) ((s32) in2[1])) * ((s32) in[0]);
  94. output[2] = 2 * ((limb) ((s32) in2[1])) * ((s32) in[1]) +
  95. ((limb) ((s32) in2[0])) * ((s32) in[2]) +
  96. ((limb) ((s32) in2[2])) * ((s32) in[0]);
  97. output[3] = ((limb) ((s32) in2[1])) * ((s32) in[2]) +
  98. ((limb) ((s32) in2[2])) * ((s32) in[1]) +
  99. ((limb) ((s32) in2[0])) * ((s32) in[3]) +
  100. ((limb) ((s32) in2[3])) * ((s32) in[0]);
  101. output[4] = ((limb) ((s32) in2[2])) * ((s32) in[2]) +
  102. 2 * (((limb) ((s32) in2[1])) * ((s32) in[3]) +
  103. ((limb) ((s32) in2[3])) * ((s32) in[1])) +
  104. ((limb) ((s32) in2[0])) * ((s32) in[4]) +
  105. ((limb) ((s32) in2[4])) * ((s32) in[0]);
  106. output[5] = ((limb) ((s32) in2[2])) * ((s32) in[3]) +
  107. ((limb) ((s32) in2[3])) * ((s32) in[2]) +
  108. ((limb) ((s32) in2[1])) * ((s32) in[4]) +
  109. ((limb) ((s32) in2[4])) * ((s32) in[1]) +
  110. ((limb) ((s32) in2[0])) * ((s32) in[5]) +
  111. ((limb) ((s32) in2[5])) * ((s32) in[0]);
  112. output[6] = 2 * (((limb) ((s32) in2[3])) * ((s32) in[3]) +
  113. ((limb) ((s32) in2[1])) * ((s32) in[5]) +
  114. ((limb) ((s32) in2[5])) * ((s32) in[1])) +
  115. ((limb) ((s32) in2[2])) * ((s32) in[4]) +
  116. ((limb) ((s32) in2[4])) * ((s32) in[2]) +
  117. ((limb) ((s32) in2[0])) * ((s32) in[6]) +
  118. ((limb) ((s32) in2[6])) * ((s32) in[0]);
  119. output[7] = ((limb) ((s32) in2[3])) * ((s32) in[4]) +
  120. ((limb) ((s32) in2[4])) * ((s32) in[3]) +
  121. ((limb) ((s32) in2[2])) * ((s32) in[5]) +
  122. ((limb) ((s32) in2[5])) * ((s32) in[2]) +
  123. ((limb) ((s32) in2[1])) * ((s32) in[6]) +
  124. ((limb) ((s32) in2[6])) * ((s32) in[1]) +
  125. ((limb) ((s32) in2[0])) * ((s32) in[7]) +
  126. ((limb) ((s32) in2[7])) * ((s32) in[0]);
  127. output[8] = ((limb) ((s32) in2[4])) * ((s32) in[4]) +
  128. 2 * (((limb) ((s32) in2[3])) * ((s32) in[5]) +
  129. ((limb) ((s32) in2[5])) * ((s32) in[3]) +
  130. ((limb) ((s32) in2[1])) * ((s32) in[7]) +
  131. ((limb) ((s32) in2[7])) * ((s32) in[1])) +
  132. ((limb) ((s32) in2[2])) * ((s32) in[6]) +
  133. ((limb) ((s32) in2[6])) * ((s32) in[2]) +
  134. ((limb) ((s32) in2[0])) * ((s32) in[8]) +
  135. ((limb) ((s32) in2[8])) * ((s32) in[0]);
  136. output[9] = ((limb) ((s32) in2[4])) * ((s32) in[5]) +
  137. ((limb) ((s32) in2[5])) * ((s32) in[4]) +
  138. ((limb) ((s32) in2[3])) * ((s32) in[6]) +
  139. ((limb) ((s32) in2[6])) * ((s32) in[3]) +
  140. ((limb) ((s32) in2[2])) * ((s32) in[7]) +
  141. ((limb) ((s32) in2[7])) * ((s32) in[2]) +
  142. ((limb) ((s32) in2[1])) * ((s32) in[8]) +
  143. ((limb) ((s32) in2[8])) * ((s32) in[1]) +
  144. ((limb) ((s32) in2[0])) * ((s32) in[9]) +
  145. ((limb) ((s32) in2[9])) * ((s32) in[0]);
  146. output[10] = 2 * (((limb) ((s32) in2[5])) * ((s32) in[5]) +
  147. ((limb) ((s32) in2[3])) * ((s32) in[7]) +
  148. ((limb) ((s32) in2[7])) * ((s32) in[3]) +
  149. ((limb) ((s32) in2[1])) * ((s32) in[9]) +
  150. ((limb) ((s32) in2[9])) * ((s32) in[1])) +
  151. ((limb) ((s32) in2[4])) * ((s32) in[6]) +
  152. ((limb) ((s32) in2[6])) * ((s32) in[4]) +
  153. ((limb) ((s32) in2[2])) * ((s32) in[8]) +
  154. ((limb) ((s32) in2[8])) * ((s32) in[2]);
  155. output[11] = ((limb) ((s32) in2[5])) * ((s32) in[6]) +
  156. ((limb) ((s32) in2[6])) * ((s32) in[5]) +
  157. ((limb) ((s32) in2[4])) * ((s32) in[7]) +
  158. ((limb) ((s32) in2[7])) * ((s32) in[4]) +
  159. ((limb) ((s32) in2[3])) * ((s32) in[8]) +
  160. ((limb) ((s32) in2[8])) * ((s32) in[3]) +
  161. ((limb) ((s32) in2[2])) * ((s32) in[9]) +
  162. ((limb) ((s32) in2[9])) * ((s32) in[2]);
  163. output[12] = ((limb) ((s32) in2[6])) * ((s32) in[6]) +
  164. 2 * (((limb) ((s32) in2[5])) * ((s32) in[7]) +
  165. ((limb) ((s32) in2[7])) * ((s32) in[5]) +
  166. ((limb) ((s32) in2[3])) * ((s32) in[9]) +
  167. ((limb) ((s32) in2[9])) * ((s32) in[3])) +
  168. ((limb) ((s32) in2[4])) * ((s32) in[8]) +
  169. ((limb) ((s32) in2[8])) * ((s32) in[4]);
  170. output[13] = ((limb) ((s32) in2[6])) * ((s32) in[7]) +
  171. ((limb) ((s32) in2[7])) * ((s32) in[6]) +
  172. ((limb) ((s32) in2[5])) * ((s32) in[8]) +
  173. ((limb) ((s32) in2[8])) * ((s32) in[5]) +
  174. ((limb) ((s32) in2[4])) * ((s32) in[9]) +
  175. ((limb) ((s32) in2[9])) * ((s32) in[4]);
  176. output[14] = 2 * (((limb) ((s32) in2[7])) * ((s32) in[7]) +
  177. ((limb) ((s32) in2[5])) * ((s32) in[9]) +
  178. ((limb) ((s32) in2[9])) * ((s32) in[5])) +
  179. ((limb) ((s32) in2[6])) * ((s32) in[8]) +
  180. ((limb) ((s32) in2[8])) * ((s32) in[6]);
  181. output[15] = ((limb) ((s32) in2[7])) * ((s32) in[8]) +
  182. ((limb) ((s32) in2[8])) * ((s32) in[7]) +
  183. ((limb) ((s32) in2[6])) * ((s32) in[9]) +
  184. ((limb) ((s32) in2[9])) * ((s32) in[6]);
  185. output[16] = ((limb) ((s32) in2[8])) * ((s32) in[8]) +
  186. 2 * (((limb) ((s32) in2[7])) * ((s32) in[9]) +
  187. ((limb) ((s32) in2[9])) * ((s32) in[7]));
  188. output[17] = ((limb) ((s32) in2[8])) * ((s32) in[9]) +
  189. ((limb) ((s32) in2[9])) * ((s32) in[8]);
  190. output[18] = 2 * ((limb) ((s32) in2[9])) * ((s32) in[9]);
  191. }
  192. /* Reduce a long form to a short form by taking the input mod 2^255 - 19. */
  193. static void freduce_degree(limb *output) {
  194. /* Each of these shifts and adds ends up multiplying the value by 19. */
  195. output[8] += output[18] << 4;
  196. output[8] += output[18] << 1;
  197. output[8] += output[18];
  198. output[7] += output[17] << 4;
  199. output[7] += output[17] << 1;
  200. output[7] += output[17];
  201. output[6] += output[16] << 4;
  202. output[6] += output[16] << 1;
  203. output[6] += output[16];
  204. output[5] += output[15] << 4;
  205. output[5] += output[15] << 1;
  206. output[5] += output[15];
  207. output[4] += output[14] << 4;
  208. output[4] += output[14] << 1;
  209. output[4] += output[14];
  210. output[3] += output[13] << 4;
  211. output[3] += output[13] << 1;
  212. output[3] += output[13];
  213. output[2] += output[12] << 4;
  214. output[2] += output[12] << 1;
  215. output[2] += output[12];
  216. output[1] += output[11] << 4;
  217. output[1] += output[11] << 1;
  218. output[1] += output[11];
  219. output[0] += output[10] << 4;
  220. output[0] += output[10] << 1;
  221. output[0] += output[10];
  222. }
  223. #if (-1 & 3) != 3
  224. #error "This code only works on a two's complement system"
  225. #endif
  226. /* return v / 2^26, using only shifts and adds. */
  227. static inline limb
  228. div_by_2_26(const limb v)
  229. {
  230. /* High word of v; no shift needed*/
  231. const uint32_t highword = (uint32_t) (((uint64_t) v) >> 32);
  232. /* Set to all 1s if v was negative; else set to 0s. */
  233. const int32_t sign = ((int32_t) highword) >> 31;
  234. /* Set to 0x3ffffff if v was negative; else set to 0. */
  235. const int32_t roundoff = ((uint32_t) sign) >> 6;
  236. /* Should return v / (1<<26) */
  237. return (v + roundoff) >> 26;
  238. }
  239. /* return v / (2^25), using only shifts and adds. */
  240. static inline limb
  241. div_by_2_25(const limb v)
  242. {
  243. /* High word of v; no shift needed*/
  244. const uint32_t highword = (uint32_t) (((uint64_t) v) >> 32);
  245. /* Set to all 1s if v was negative; else set to 0s. */
  246. const int32_t sign = ((int32_t) highword) >> 31;
  247. /* Set to 0x1ffffff if v was negative; else set to 0. */
  248. const int32_t roundoff = ((uint32_t) sign) >> 7;
  249. /* Should return v / (1<<25) */
  250. return (v + roundoff) >> 25;
  251. }
  252. static inline s32
  253. div_s32_by_2_25(const s32 v)
  254. {
  255. const s32 roundoff = ((uint32_t)(v >> 31)) >> 7;
  256. return (v + roundoff) >> 25;
  257. }
  258. /* Reduce all coefficients of the short form input so that |x| < 2^26.
  259. *
  260. * On entry: |output[i]| < 2^62
  261. */
  262. static void freduce_coefficients(limb *output) {
  263. unsigned i;
  264. output[10] = 0;
  265. for (i = 0; i < 10; i += 2) {
  266. limb over = div_by_2_26(output[i]);
  267. output[i] -= over << 26;
  268. output[i+1] += over;
  269. over = div_by_2_25(output[i+1]);
  270. output[i+1] -= over << 25;
  271. output[i+2] += over;
  272. }
  273. /* Now |output[10]| < 2 ^ 38 and all other coefficients are reduced. */
  274. output[0] += output[10] << 4;
  275. output[0] += output[10] << 1;
  276. output[0] += output[10];
  277. output[10] = 0;
  278. /* Now output[1..9] are reduced, and |output[0]| < 2^26 + 19 * 2^38
  279. * So |over| will be no more than 77825 */
  280. {
  281. limb over = div_by_2_26(output[0]);
  282. output[0] -= over << 26;
  283. output[1] += over;
  284. }
  285. /* Now output[0,2..9] are reduced, and |output[1]| < 2^25 + 77825
  286. * So |over| will be no more than 1. */
  287. {
  288. /* output[1] fits in 32 bits, so we can use div_s32_by_2_25 here. */
  289. s32 over32 = div_s32_by_2_25((s32) output[1]);
  290. output[1] -= over32 << 25;
  291. output[2] += over32;
  292. }
  293. /* Finally, output[0,1,3..9] are reduced, and output[2] is "nearly reduced":
  294. * we have |output[2]| <= 2^26. This is good enough for all of our math,
  295. * but it will require an extra freduce_coefficients before fcontract. */
  296. }
  297. /* A helpful wrapper around fproduct: output = in * in2.
  298. *
  299. * output must be distinct to both inputs. The output is reduced degree and
  300. * reduced coefficient.
  301. */
  302. static void
  303. fmul(limb *output, const limb *in, const limb *in2) {
  304. limb t[19];
  305. fproduct(t, in, in2);
  306. freduce_degree(t);
  307. freduce_coefficients(t);
  308. memcpy(output, t, sizeof(limb) * 10);
  309. }
  310. static void fsquare_inner(limb *output, const limb *in) {
  311. output[0] = ((limb) ((s32) in[0])) * ((s32) in[0]);
  312. output[1] = 2 * ((limb) ((s32) in[0])) * ((s32) in[1]);
  313. output[2] = 2 * (((limb) ((s32) in[1])) * ((s32) in[1]) +
  314. ((limb) ((s32) in[0])) * ((s32) in[2]));
  315. output[3] = 2 * (((limb) ((s32) in[1])) * ((s32) in[2]) +
  316. ((limb) ((s32) in[0])) * ((s32) in[3]));
  317. output[4] = ((limb) ((s32) in[2])) * ((s32) in[2]) +
  318. 4 * ((limb) ((s32) in[1])) * ((s32) in[3]) +
  319. 2 * ((limb) ((s32) in[0])) * ((s32) in[4]);
  320. output[5] = 2 * (((limb) ((s32) in[2])) * ((s32) in[3]) +
  321. ((limb) ((s32) in[1])) * ((s32) in[4]) +
  322. ((limb) ((s32) in[0])) * ((s32) in[5]));
  323. output[6] = 2 * (((limb) ((s32) in[3])) * ((s32) in[3]) +
  324. ((limb) ((s32) in[2])) * ((s32) in[4]) +
  325. ((limb) ((s32) in[0])) * ((s32) in[6]) +
  326. 2 * ((limb) ((s32) in[1])) * ((s32) in[5]));
  327. output[7] = 2 * (((limb) ((s32) in[3])) * ((s32) in[4]) +
  328. ((limb) ((s32) in[2])) * ((s32) in[5]) +
  329. ((limb) ((s32) in[1])) * ((s32) in[6]) +
  330. ((limb) ((s32) in[0])) * ((s32) in[7]));
  331. output[8] = ((limb) ((s32) in[4])) * ((s32) in[4]) +
  332. 2 * (((limb) ((s32) in[2])) * ((s32) in[6]) +
  333. ((limb) ((s32) in[0])) * ((s32) in[8]) +
  334. 2 * (((limb) ((s32) in[1])) * ((s32) in[7]) +
  335. ((limb) ((s32) in[3])) * ((s32) in[5])));
  336. output[9] = 2 * (((limb) ((s32) in[4])) * ((s32) in[5]) +
  337. ((limb) ((s32) in[3])) * ((s32) in[6]) +
  338. ((limb) ((s32) in[2])) * ((s32) in[7]) +
  339. ((limb) ((s32) in[1])) * ((s32) in[8]) +
  340. ((limb) ((s32) in[0])) * ((s32) in[9]));
  341. output[10] = 2 * (((limb) ((s32) in[5])) * ((s32) in[5]) +
  342. ((limb) ((s32) in[4])) * ((s32) in[6]) +
  343. ((limb) ((s32) in[2])) * ((s32) in[8]) +
  344. 2 * (((limb) ((s32) in[3])) * ((s32) in[7]) +
  345. ((limb) ((s32) in[1])) * ((s32) in[9])));
  346. output[11] = 2 * (((limb) ((s32) in[5])) * ((s32) in[6]) +
  347. ((limb) ((s32) in[4])) * ((s32) in[7]) +
  348. ((limb) ((s32) in[3])) * ((s32) in[8]) +
  349. ((limb) ((s32) in[2])) * ((s32) in[9]));
  350. output[12] = ((limb) ((s32) in[6])) * ((s32) in[6]) +
  351. 2 * (((limb) ((s32) in[4])) * ((s32) in[8]) +
  352. 2 * (((limb) ((s32) in[5])) * ((s32) in[7]) +
  353. ((limb) ((s32) in[3])) * ((s32) in[9])));
  354. output[13] = 2 * (((limb) ((s32) in[6])) * ((s32) in[7]) +
  355. ((limb) ((s32) in[5])) * ((s32) in[8]) +
  356. ((limb) ((s32) in[4])) * ((s32) in[9]));
  357. output[14] = 2 * (((limb) ((s32) in[7])) * ((s32) in[7]) +
  358. ((limb) ((s32) in[6])) * ((s32) in[8]) +
  359. 2 * ((limb) ((s32) in[5])) * ((s32) in[9]));
  360. output[15] = 2 * (((limb) ((s32) in[7])) * ((s32) in[8]) +
  361. ((limb) ((s32) in[6])) * ((s32) in[9]));
  362. output[16] = ((limb) ((s32) in[8])) * ((s32) in[8]) +
  363. 4 * ((limb) ((s32) in[7])) * ((s32) in[9]);
  364. output[17] = 2 * ((limb) ((s32) in[8])) * ((s32) in[9]);
  365. output[18] = 2 * ((limb) ((s32) in[9])) * ((s32) in[9]);
  366. }
  367. static void
  368. fsquare(limb *output, const limb *in) {
  369. limb t[19];
  370. fsquare_inner(t, in);
  371. freduce_degree(t);
  372. freduce_coefficients(t);
  373. memcpy(output, t, sizeof(limb) * 10);
  374. }
  375. /* Take a little-endian, 32-byte number and expand it into polynomial form */
  376. static void
  377. fexpand(limb *output, const u8 *input) {
  378. #define F(n,start,shift,mask) \
  379. output[n] = ((((limb) input[start + 0]) | \
  380. ((limb) input[start + 1]) << 8 | \
  381. ((limb) input[start + 2]) << 16 | \
  382. ((limb) input[start + 3]) << 24) >> shift) & mask;
  383. F(0, 0, 0, 0x3ffffff);
  384. F(1, 3, 2, 0x1ffffff);
  385. F(2, 6, 3, 0x3ffffff);
  386. F(3, 9, 5, 0x1ffffff);
  387. F(4, 12, 6, 0x3ffffff);
  388. F(5, 16, 0, 0x1ffffff);
  389. F(6, 19, 1, 0x3ffffff);
  390. F(7, 22, 3, 0x1ffffff);
  391. F(8, 25, 4, 0x3ffffff);
  392. F(9, 28, 6, 0x1ffffff);
  393. #undef F
  394. }
  395. #if (-32 >> 1) != -16
  396. #error "This code only works when >> does sign-extension on negative numbers"
  397. #endif
  398. /* Take a fully reduced polynomial form number and contract it into a
  399. * little-endian, 32-byte array
  400. */
  401. static void
  402. fcontract(u8 *output, limb *input) {
  403. int i;
  404. int j;
  405. for (j = 0; j < 2; ++j) {
  406. for (i = 0; i < 9; ++i) {
  407. if ((i & 1) == 1) {
  408. /* This calculation is a time-invariant way to make input[i] positive
  409. by borrowing from the next-larger limb.
  410. */
  411. const s32 mask = (s32)(input[i]) >> 31;
  412. const s32 carry = -(((s32)(input[i]) & mask) >> 25);
  413. input[i] = (s32)(input[i]) + (carry << 25);
  414. input[i+1] = (s32)(input[i+1]) - carry;
  415. } else {
  416. const s32 mask = (s32)(input[i]) >> 31;
  417. const s32 carry = -(((s32)(input[i]) & mask) >> 26);
  418. input[i] = (s32)(input[i]) + (carry << 26);
  419. input[i+1] = (s32)(input[i+1]) - carry;
  420. }
  421. }
  422. {
  423. const s32 mask = (s32)(input[9]) >> 31;
  424. const s32 carry = -(((s32)(input[9]) & mask) >> 25);
  425. input[9] = (s32)(input[9]) + (carry << 25);
  426. input[0] = (s32)(input[0]) - (carry * 19);
  427. }
  428. }
  429. /* The first borrow-propagation pass above ended with every limb
  430. except (possibly) input[0] non-negative.
  431. Since each input limb except input[0] is decreased by at most 1
  432. by a borrow-propagation pass, the second borrow-propagation pass
  433. could only have wrapped around to decrease input[0] again if the
  434. first pass left input[0] negative *and* input[1] through input[9]
  435. were all zero. In that case, input[1] is now 2^25 - 1, and this
  436. last borrow-propagation step will leave input[1] non-negative.
  437. */
  438. {
  439. const s32 mask = (s32)(input[0]) >> 31;
  440. const s32 carry = -(((s32)(input[0]) & mask) >> 26);
  441. input[0] = (s32)(input[0]) + (carry << 26);
  442. input[1] = (s32)(input[1]) - carry;
  443. }
  444. /* Both passes through the above loop, plus the last 0-to-1 step, are
  445. necessary: if input[9] is -1 and input[0] through input[8] are 0,
  446. negative values will remain in the array until the end.
  447. */
  448. input[1] <<= 2;
  449. input[2] <<= 3;
  450. input[3] <<= 5;
  451. input[4] <<= 6;
  452. input[6] <<= 1;
  453. input[7] <<= 3;
  454. input[8] <<= 4;
  455. input[9] <<= 6;
  456. #define F(i, s) \
  457. output[s+0] |= input[i] & 0xff; \
  458. output[s+1] = (input[i] >> 8) & 0xff; \
  459. output[s+2] = (input[i] >> 16) & 0xff; \
  460. output[s+3] = (input[i] >> 24) & 0xff;
  461. output[0] = 0;
  462. output[16] = 0;
  463. F(0,0);
  464. F(1,3);
  465. F(2,6);
  466. F(3,9);
  467. F(4,12);
  468. F(5,16);
  469. F(6,19);
  470. F(7,22);
  471. F(8,25);
  472. F(9,28);
  473. #undef F
  474. }
  475. /* Input: Q, Q', Q-Q'
  476. * Output: 2Q, Q+Q'
  477. *
  478. * x2 z3: long form
  479. * x3 z3: long form
  480. * x z: short form, destroyed
  481. * xprime zprime: short form, destroyed
  482. * qmqp: short form, preserved
  483. */
  484. static void fmonty(limb *x2, limb *z2, /* output 2Q */
  485. limb *x3, limb *z3, /* output Q + Q' */
  486. limb *x, limb *z, /* input Q */
  487. limb *xprime, limb *zprime, /* input Q' */
  488. const limb *qmqp /* input Q - Q' */) {
  489. limb origx[10], origxprime[10], zzz[19], xx[19], zz[19], xxprime[19],
  490. zzprime[19], zzzprime[19], xxxprime[19];
  491. memcpy(origx, x, 10 * sizeof(limb));
  492. fsum(x, z);
  493. fdifference(z, origx); // does x - z
  494. memcpy(origxprime, xprime, sizeof(limb) * 10);
  495. fsum(xprime, zprime);
  496. fdifference(zprime, origxprime);
  497. fproduct(xxprime, xprime, z);
  498. fproduct(zzprime, x, zprime);
  499. freduce_degree(xxprime);
  500. freduce_coefficients(xxprime);
  501. freduce_degree(zzprime);
  502. freduce_coefficients(zzprime);
  503. memcpy(origxprime, xxprime, sizeof(limb) * 10);
  504. fsum(xxprime, zzprime);
  505. fdifference(zzprime, origxprime);
  506. fsquare(xxxprime, xxprime);
  507. fsquare(zzzprime, zzprime);
  508. fproduct(zzprime, zzzprime, qmqp);
  509. freduce_degree(zzprime);
  510. freduce_coefficients(zzprime);
  511. memcpy(x3, xxxprime, sizeof(limb) * 10);
  512. memcpy(z3, zzprime, sizeof(limb) * 10);
  513. fsquare(xx, x);
  514. fsquare(zz, z);
  515. fproduct(x2, xx, zz);
  516. freduce_degree(x2);
  517. freduce_coefficients(x2);
  518. fdifference(zz, xx); // does zz = xx - zz
  519. memset(zzz + 10, 0, sizeof(limb) * 9);
  520. fscalar_product(zzz, zz, 121665);
  521. /* No need to call freduce_degree here:
  522. fscalar_product doesn't increase the degree of its input. */
  523. freduce_coefficients(zzz);
  524. fsum(zzz, xx);
  525. fproduct(z2, zz, zzz);
  526. freduce_degree(z2);
  527. freduce_coefficients(z2);
  528. }
  529. /* Conditionally swap two reduced-form limb arrays if 'iswap' is 1, but leave
  530. * them unchanged if 'iswap' is 0. Runs in data-invariant time to avoid
  531. * side-channel attacks.
  532. *
  533. * NOTE that this function requires that 'iswap' be 1 or 0; other values give
  534. * wrong results. Also, the two limb arrays must be in reduced-coefficient,
  535. * reduced-degree form: the values in a[10..19] or b[10..19] aren't swapped,
  536. * and all all values in a[0..9],b[0..9] must have magnitude less than
  537. * INT32_MAX.
  538. */
  539. static void
  540. swap_conditional(limb a[19], limb b[19], limb iswap) {
  541. unsigned i;
  542. const s32 swap = (s32) -iswap;
  543. for (i = 0; i < 10; ++i) {
  544. const s32 x = swap & ( ((s32)a[i]) ^ ((s32)b[i]) );
  545. a[i] = ((s32)a[i]) ^ x;
  546. b[i] = ((s32)b[i]) ^ x;
  547. }
  548. }
  549. /* Calculates nQ where Q is the x-coordinate of a point on the curve
  550. *
  551. * resultx/resultz: the x coordinate of the resulting curve point (short form)
  552. * n: a little endian, 32-byte number
  553. * q: a point of the curve (short form)
  554. */
  555. static void
  556. cmult(limb *resultx, limb *resultz, const u8 *n, const limb *q) {
  557. limb a[19] = {0}, b[19] = {1}, c[19] = {1}, d[19] = {0};
  558. limb *nqpqx = a, *nqpqz = b, *nqx = c, *nqz = d, *t;
  559. limb e[19] = {0}, f[19] = {1}, g[19] = {0}, h[19] = {1};
  560. limb *nqpqx2 = e, *nqpqz2 = f, *nqx2 = g, *nqz2 = h;
  561. unsigned i, j;
  562. memcpy(nqpqx, q, sizeof(limb) * 10);
  563. for (i = 0; i < 32; ++i) {
  564. u8 byte = n[31 - i];
  565. for (j = 0; j < 8; ++j) {
  566. const limb bit = byte >> 7;
  567. swap_conditional(nqx, nqpqx, bit);
  568. swap_conditional(nqz, nqpqz, bit);
  569. fmonty(nqx2, nqz2,
  570. nqpqx2, nqpqz2,
  571. nqx, nqz,
  572. nqpqx, nqpqz,
  573. q);
  574. swap_conditional(nqx2, nqpqx2, bit);
  575. swap_conditional(nqz2, nqpqz2, bit);
  576. t = nqx;
  577. nqx = nqx2;
  578. nqx2 = t;
  579. t = nqz;
  580. nqz = nqz2;
  581. nqz2 = t;
  582. t = nqpqx;
  583. nqpqx = nqpqx2;
  584. nqpqx2 = t;
  585. t = nqpqz;
  586. nqpqz = nqpqz2;
  587. nqpqz2 = t;
  588. byte <<= 1;
  589. }
  590. }
  591. memcpy(resultx, nqx, sizeof(limb) * 10);
  592. memcpy(resultz, nqz, sizeof(limb) * 10);
  593. }
  594. // -----------------------------------------------------------------------------
  595. // Shamelessly copied from djb's code
  596. // -----------------------------------------------------------------------------
  597. static void
  598. crecip(limb *out, const limb *z) {
  599. limb z2[10];
  600. limb z9[10];
  601. limb z11[10];
  602. limb z2_5_0[10];
  603. limb z2_10_0[10];
  604. limb z2_20_0[10];
  605. limb z2_50_0[10];
  606. limb z2_100_0[10];
  607. limb t0[10];
  608. limb t1[10];
  609. int i;
  610. /* 2 */ fsquare(z2,z);
  611. /* 4 */ fsquare(t1,z2);
  612. /* 8 */ fsquare(t0,t1);
  613. /* 9 */ fmul(z9,t0,z);
  614. /* 11 */ fmul(z11,z9,z2);
  615. /* 22 */ fsquare(t0,z11);
  616. /* 2^5 - 2^0 = 31 */ fmul(z2_5_0,t0,z9);
  617. /* 2^6 - 2^1 */ fsquare(t0,z2_5_0);
  618. /* 2^7 - 2^2 */ fsquare(t1,t0);
  619. /* 2^8 - 2^3 */ fsquare(t0,t1);
  620. /* 2^9 - 2^4 */ fsquare(t1,t0);
  621. /* 2^10 - 2^5 */ fsquare(t0,t1);
  622. /* 2^10 - 2^0 */ fmul(z2_10_0,t0,z2_5_0);
  623. /* 2^11 - 2^1 */ fsquare(t0,z2_10_0);
  624. /* 2^12 - 2^2 */ fsquare(t1,t0);
  625. /* 2^20 - 2^10 */ for (i = 2;i < 10;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
  626. /* 2^20 - 2^0 */ fmul(z2_20_0,t1,z2_10_0);
  627. /* 2^21 - 2^1 */ fsquare(t0,z2_20_0);
  628. /* 2^22 - 2^2 */ fsquare(t1,t0);
  629. /* 2^40 - 2^20 */ for (i = 2;i < 20;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
  630. /* 2^40 - 2^0 */ fmul(t0,t1,z2_20_0);
  631. /* 2^41 - 2^1 */ fsquare(t1,t0);
  632. /* 2^42 - 2^2 */ fsquare(t0,t1);
  633. /* 2^50 - 2^10 */ for (i = 2;i < 10;i += 2) { fsquare(t1,t0); fsquare(t0,t1); }
  634. /* 2^50 - 2^0 */ fmul(z2_50_0,t0,z2_10_0);
  635. /* 2^51 - 2^1 */ fsquare(t0,z2_50_0);
  636. /* 2^52 - 2^2 */ fsquare(t1,t0);
  637. /* 2^100 - 2^50 */ for (i = 2;i < 50;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
  638. /* 2^100 - 2^0 */ fmul(z2_100_0,t1,z2_50_0);
  639. /* 2^101 - 2^1 */ fsquare(t1,z2_100_0);
  640. /* 2^102 - 2^2 */ fsquare(t0,t1);
  641. /* 2^200 - 2^100 */ for (i = 2;i < 100;i += 2) { fsquare(t1,t0); fsquare(t0,t1); }
  642. /* 2^200 - 2^0 */ fmul(t1,t0,z2_100_0);
  643. /* 2^201 - 2^1 */ fsquare(t0,t1);
  644. /* 2^202 - 2^2 */ fsquare(t1,t0);
  645. /* 2^250 - 2^50 */ for (i = 2;i < 50;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
  646. /* 2^250 - 2^0 */ fmul(t0,t1,z2_50_0);
  647. /* 2^251 - 2^1 */ fsquare(t1,t0);
  648. /* 2^252 - 2^2 */ fsquare(t0,t1);
  649. /* 2^253 - 2^3 */ fsquare(t1,t0);
  650. /* 2^254 - 2^4 */ fsquare(t0,t1);
  651. /* 2^255 - 2^5 */ fsquare(t1,t0);
  652. /* 2^255 - 21 */ fmul(out,t1,z11);
  653. }
  654. int curve25519_donna(u8 *, const u8 *, const u8 *);
  655. int
  656. curve25519_donna(u8 *mypublic, const u8 *secret, const u8 *basepoint) {
  657. limb bp[10], x[10], z[11], zmone[10];
  658. uint8_t e[32];
  659. int i;
  660. for (i = 0; i < 32; ++i) e[i] = secret[i];
  661. e[0] &= 248;
  662. e[31] &= 127;
  663. e[31] |= 64;
  664. fexpand(bp, basepoint);
  665. cmult(x, z, e, bp);
  666. crecip(zmone, z);
  667. fmul(z, x, zmone);
  668. freduce_coefficients(z);
  669. fcontract(mypublic, z);
  670. return 0;
  671. }