curve25519-donna.c 27 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732
  1. /* Copyright 2008, Google Inc.
  2. * All rights reserved.
  3. *
  4. * Redistribution and use in source and binary forms, with or without
  5. * modification, are permitted provided that the following conditions are
  6. * met:
  7. *
  8. * * Redistributions of source code must retain the above copyright
  9. * notice, this list of conditions and the following disclaimer.
  10. * * Redistributions in binary form must reproduce the above
  11. * copyright notice, this list of conditions and the following disclaimer
  12. * in the documentation and/or other materials provided with the
  13. * distribution.
  14. * * Neither the name of Google Inc. nor the names of its
  15. * contributors may be used to endorse or promote products derived from
  16. * this software without specific prior written permission.
  17. *
  18. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  19. * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  20. * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  21. * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  22. * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  23. * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  24. * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  25. * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  26. * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  27. * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  28. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  29. *
  30. * curve25519-donna: Curve25519 elliptic curve, public key function
  31. *
  32. * http://code.google.com/p/curve25519-donna/
  33. *
  34. * Adam Langley <agl@imperialviolet.org>
  35. *
  36. * Derived from public domain C code by Daniel J. Bernstein <djb@cr.yp.to>
  37. *
  38. * More information about curve25519 can be found here
  39. * http://cr.yp.to/ecdh.html
  40. *
  41. * djb's sample implementation of curve25519 is written in a special assembly
  42. * language called qhasm and uses the floating point registers.
  43. *
  44. * This is, almost, a clean room reimplementation from the curve25519 paper. It
  45. * uses many of the tricks described therein. Only the crecip function is taken
  46. * from the sample implementation.
  47. */
  48. #include "orconfig.h"
  49. #include <string.h>
  50. #include "torint.h"
  51. typedef uint8_t u8;
  52. typedef int32_t s32;
  53. typedef int64_t limb;
  54. /* Field element representation:
  55. *
  56. * Field elements are written as an array of signed, 64-bit limbs, least
  57. * significant first. The value of the field element is:
  58. * x[0] + 2^26·x[1] + x^51·x[2] + 2^102·x[3] + ...
  59. *
  60. * i.e. the limbs are 26, 25, 26, 25, ... bits wide.
  61. */
  62. /* Sum two numbers: output += in */
  63. static void fsum(limb *output, const limb *in) {
  64. unsigned i;
  65. for (i = 0; i < 10; i += 2) {
  66. output[0+i] = (output[0+i] + in[0+i]);
  67. output[1+i] = (output[1+i] + in[1+i]);
  68. }
  69. }
  70. /* Find the difference of two numbers: output = in - output
  71. * (note the order of the arguments!)
  72. */
  73. static void fdifference(limb *output, const limb *in) {
  74. unsigned i;
  75. for (i = 0; i < 10; ++i) {
  76. output[i] = (in[i] - output[i]);
  77. }
  78. }
  79. /* Multiply a number by a scalar: output = in * scalar */
  80. static void fscalar_product(limb *output, const limb *in, const limb scalar) {
  81. unsigned i;
  82. for (i = 0; i < 10; ++i) {
  83. output[i] = in[i] * scalar;
  84. }
  85. }
  86. /* Multiply two numbers: output = in2 * in
  87. *
  88. * output must be distinct to both inputs. The inputs are reduced coefficient
  89. * form, the output is not.
  90. */
  91. static void fproduct(limb *output, const limb *in2, const limb *in) {
  92. output[0] = ((limb) ((s32) in2[0])) * ((s32) in[0]);
  93. output[1] = ((limb) ((s32) in2[0])) * ((s32) in[1]) +
  94. ((limb) ((s32) in2[1])) * ((s32) in[0]);
  95. output[2] = 2 * ((limb) ((s32) in2[1])) * ((s32) in[1]) +
  96. ((limb) ((s32) in2[0])) * ((s32) in[2]) +
  97. ((limb) ((s32) in2[2])) * ((s32) in[0]);
  98. output[3] = ((limb) ((s32) in2[1])) * ((s32) in[2]) +
  99. ((limb) ((s32) in2[2])) * ((s32) in[1]) +
  100. ((limb) ((s32) in2[0])) * ((s32) in[3]) +
  101. ((limb) ((s32) in2[3])) * ((s32) in[0]);
  102. output[4] = ((limb) ((s32) in2[2])) * ((s32) in[2]) +
  103. 2 * (((limb) ((s32) in2[1])) * ((s32) in[3]) +
  104. ((limb) ((s32) in2[3])) * ((s32) in[1])) +
  105. ((limb) ((s32) in2[0])) * ((s32) in[4]) +
  106. ((limb) ((s32) in2[4])) * ((s32) in[0]);
  107. output[5] = ((limb) ((s32) in2[2])) * ((s32) in[3]) +
  108. ((limb) ((s32) in2[3])) * ((s32) in[2]) +
  109. ((limb) ((s32) in2[1])) * ((s32) in[4]) +
  110. ((limb) ((s32) in2[4])) * ((s32) in[1]) +
  111. ((limb) ((s32) in2[0])) * ((s32) in[5]) +
  112. ((limb) ((s32) in2[5])) * ((s32) in[0]);
  113. output[6] = 2 * (((limb) ((s32) in2[3])) * ((s32) in[3]) +
  114. ((limb) ((s32) in2[1])) * ((s32) in[5]) +
  115. ((limb) ((s32) in2[5])) * ((s32) in[1])) +
  116. ((limb) ((s32) in2[2])) * ((s32) in[4]) +
  117. ((limb) ((s32) in2[4])) * ((s32) in[2]) +
  118. ((limb) ((s32) in2[0])) * ((s32) in[6]) +
  119. ((limb) ((s32) in2[6])) * ((s32) in[0]);
  120. output[7] = ((limb) ((s32) in2[3])) * ((s32) in[4]) +
  121. ((limb) ((s32) in2[4])) * ((s32) in[3]) +
  122. ((limb) ((s32) in2[2])) * ((s32) in[5]) +
  123. ((limb) ((s32) in2[5])) * ((s32) in[2]) +
  124. ((limb) ((s32) in2[1])) * ((s32) in[6]) +
  125. ((limb) ((s32) in2[6])) * ((s32) in[1]) +
  126. ((limb) ((s32) in2[0])) * ((s32) in[7]) +
  127. ((limb) ((s32) in2[7])) * ((s32) in[0]);
  128. output[8] = ((limb) ((s32) in2[4])) * ((s32) in[4]) +
  129. 2 * (((limb) ((s32) in2[3])) * ((s32) in[5]) +
  130. ((limb) ((s32) in2[5])) * ((s32) in[3]) +
  131. ((limb) ((s32) in2[1])) * ((s32) in[7]) +
  132. ((limb) ((s32) in2[7])) * ((s32) in[1])) +
  133. ((limb) ((s32) in2[2])) * ((s32) in[6]) +
  134. ((limb) ((s32) in2[6])) * ((s32) in[2]) +
  135. ((limb) ((s32) in2[0])) * ((s32) in[8]) +
  136. ((limb) ((s32) in2[8])) * ((s32) in[0]);
  137. output[9] = ((limb) ((s32) in2[4])) * ((s32) in[5]) +
  138. ((limb) ((s32) in2[5])) * ((s32) in[4]) +
  139. ((limb) ((s32) in2[3])) * ((s32) in[6]) +
  140. ((limb) ((s32) in2[6])) * ((s32) in[3]) +
  141. ((limb) ((s32) in2[2])) * ((s32) in[7]) +
  142. ((limb) ((s32) in2[7])) * ((s32) in[2]) +
  143. ((limb) ((s32) in2[1])) * ((s32) in[8]) +
  144. ((limb) ((s32) in2[8])) * ((s32) in[1]) +
  145. ((limb) ((s32) in2[0])) * ((s32) in[9]) +
  146. ((limb) ((s32) in2[9])) * ((s32) in[0]);
  147. output[10] = 2 * (((limb) ((s32) in2[5])) * ((s32) in[5]) +
  148. ((limb) ((s32) in2[3])) * ((s32) in[7]) +
  149. ((limb) ((s32) in2[7])) * ((s32) in[3]) +
  150. ((limb) ((s32) in2[1])) * ((s32) in[9]) +
  151. ((limb) ((s32) in2[9])) * ((s32) in[1])) +
  152. ((limb) ((s32) in2[4])) * ((s32) in[6]) +
  153. ((limb) ((s32) in2[6])) * ((s32) in[4]) +
  154. ((limb) ((s32) in2[2])) * ((s32) in[8]) +
  155. ((limb) ((s32) in2[8])) * ((s32) in[2]);
  156. output[11] = ((limb) ((s32) in2[5])) * ((s32) in[6]) +
  157. ((limb) ((s32) in2[6])) * ((s32) in[5]) +
  158. ((limb) ((s32) in2[4])) * ((s32) in[7]) +
  159. ((limb) ((s32) in2[7])) * ((s32) in[4]) +
  160. ((limb) ((s32) in2[3])) * ((s32) in[8]) +
  161. ((limb) ((s32) in2[8])) * ((s32) in[3]) +
  162. ((limb) ((s32) in2[2])) * ((s32) in[9]) +
  163. ((limb) ((s32) in2[9])) * ((s32) in[2]);
  164. output[12] = ((limb) ((s32) in2[6])) * ((s32) in[6]) +
  165. 2 * (((limb) ((s32) in2[5])) * ((s32) in[7]) +
  166. ((limb) ((s32) in2[7])) * ((s32) in[5]) +
  167. ((limb) ((s32) in2[3])) * ((s32) in[9]) +
  168. ((limb) ((s32) in2[9])) * ((s32) in[3])) +
  169. ((limb) ((s32) in2[4])) * ((s32) in[8]) +
  170. ((limb) ((s32) in2[8])) * ((s32) in[4]);
  171. output[13] = ((limb) ((s32) in2[6])) * ((s32) in[7]) +
  172. ((limb) ((s32) in2[7])) * ((s32) in[6]) +
  173. ((limb) ((s32) in2[5])) * ((s32) in[8]) +
  174. ((limb) ((s32) in2[8])) * ((s32) in[5]) +
  175. ((limb) ((s32) in2[4])) * ((s32) in[9]) +
  176. ((limb) ((s32) in2[9])) * ((s32) in[4]);
  177. output[14] = 2 * (((limb) ((s32) in2[7])) * ((s32) in[7]) +
  178. ((limb) ((s32) in2[5])) * ((s32) in[9]) +
  179. ((limb) ((s32) in2[9])) * ((s32) in[5])) +
  180. ((limb) ((s32) in2[6])) * ((s32) in[8]) +
  181. ((limb) ((s32) in2[8])) * ((s32) in[6]);
  182. output[15] = ((limb) ((s32) in2[7])) * ((s32) in[8]) +
  183. ((limb) ((s32) in2[8])) * ((s32) in[7]) +
  184. ((limb) ((s32) in2[6])) * ((s32) in[9]) +
  185. ((limb) ((s32) in2[9])) * ((s32) in[6]);
  186. output[16] = ((limb) ((s32) in2[8])) * ((s32) in[8]) +
  187. 2 * (((limb) ((s32) in2[7])) * ((s32) in[9]) +
  188. ((limb) ((s32) in2[9])) * ((s32) in[7]));
  189. output[17] = ((limb) ((s32) in2[8])) * ((s32) in[9]) +
  190. ((limb) ((s32) in2[9])) * ((s32) in[8]);
  191. output[18] = 2 * ((limb) ((s32) in2[9])) * ((s32) in[9]);
  192. }
  193. /* Reduce a long form to a short form by taking the input mod 2^255 - 19. */
  194. static void freduce_degree(limb *output) {
  195. /* Each of these shifts and adds ends up multiplying the value by 19. */
  196. output[8] += output[18] << 4;
  197. output[8] += output[18] << 1;
  198. output[8] += output[18];
  199. output[7] += output[17] << 4;
  200. output[7] += output[17] << 1;
  201. output[7] += output[17];
  202. output[6] += output[16] << 4;
  203. output[6] += output[16] << 1;
  204. output[6] += output[16];
  205. output[5] += output[15] << 4;
  206. output[5] += output[15] << 1;
  207. output[5] += output[15];
  208. output[4] += output[14] << 4;
  209. output[4] += output[14] << 1;
  210. output[4] += output[14];
  211. output[3] += output[13] << 4;
  212. output[3] += output[13] << 1;
  213. output[3] += output[13];
  214. output[2] += output[12] << 4;
  215. output[2] += output[12] << 1;
  216. output[2] += output[12];
  217. output[1] += output[11] << 4;
  218. output[1] += output[11] << 1;
  219. output[1] += output[11];
  220. output[0] += output[10] << 4;
  221. output[0] += output[10] << 1;
  222. output[0] += output[10];
  223. }
  224. #if (-1 & 3) != 3
  225. #error "This code only works on a two's complement system"
  226. #endif
  227. /* return v / 2^26, using only shifts and adds. */
  228. static inline limb
  229. div_by_2_26(const limb v)
  230. {
  231. /* High word of v; no shift needed*/
  232. const uint32_t highword = (uint32_t) (((uint64_t) v) >> 32);
  233. /* Set to all 1s if v was negative; else set to 0s. */
  234. const int32_t sign = ((int32_t) highword) >> 31;
  235. /* Set to 0x3ffffff if v was negative; else set to 0. */
  236. const int32_t roundoff = ((uint32_t) sign) >> 6;
  237. /* Should return v / (1<<26) */
  238. return (v + roundoff) >> 26;
  239. }
  240. /* return v / (2^25), using only shifts and adds. */
  241. static inline limb
  242. div_by_2_25(const limb v)
  243. {
  244. /* High word of v; no shift needed*/
  245. const uint32_t highword = (uint32_t) (((uint64_t) v) >> 32);
  246. /* Set to all 1s if v was negative; else set to 0s. */
  247. const int32_t sign = ((int32_t) highword) >> 31;
  248. /* Set to 0x1ffffff if v was negative; else set to 0. */
  249. const int32_t roundoff = ((uint32_t) sign) >> 7;
  250. /* Should return v / (1<<25) */
  251. return (v + roundoff) >> 25;
  252. }
  253. static inline s32
  254. div_s32_by_2_25(const s32 v)
  255. {
  256. const s32 roundoff = ((uint32_t)(v >> 31)) >> 7;
  257. return (v + roundoff) >> 25;
  258. }
  259. /* Reduce all coefficients of the short form input so that |x| < 2^26.
  260. *
  261. * On entry: |output[i]| < 2^62
  262. */
  263. static void freduce_coefficients(limb *output) {
  264. unsigned i;
  265. output[10] = 0;
  266. for (i = 0; i < 10; i += 2) {
  267. limb over = div_by_2_26(output[i]);
  268. output[i] -= over << 26;
  269. output[i+1] += over;
  270. over = div_by_2_25(output[i+1]);
  271. output[i+1] -= over << 25;
  272. output[i+2] += over;
  273. }
  274. /* Now |output[10]| < 2 ^ 38 and all other coefficients are reduced. */
  275. output[0] += output[10] << 4;
  276. output[0] += output[10] << 1;
  277. output[0] += output[10];
  278. output[10] = 0;
  279. /* Now output[1..9] are reduced, and |output[0]| < 2^26 + 19 * 2^38
  280. * So |over| will be no more than 77825 */
  281. {
  282. limb over = div_by_2_26(output[0]);
  283. output[0] -= over << 26;
  284. output[1] += over;
  285. }
  286. /* Now output[0,2..9] are reduced, and |output[1]| < 2^25 + 77825
  287. * So |over| will be no more than 1. */
  288. {
  289. /* output[1] fits in 32 bits, so we can use div_s32_by_2_25 here. */
  290. s32 over32 = div_s32_by_2_25((s32) output[1]);
  291. output[1] -= over32 << 25;
  292. output[2] += over32;
  293. }
  294. /* Finally, output[0,1,3..9] are reduced, and output[2] is "nearly reduced":
  295. * we have |output[2]| <= 2^26. This is good enough for all of our math,
  296. * but it will require an extra freduce_coefficients before fcontract. */
  297. }
  298. /* A helpful wrapper around fproduct: output = in * in2.
  299. *
  300. * output must be distinct to both inputs. The output is reduced degree and
  301. * reduced coefficient.
  302. */
  303. static void
  304. fmul(limb *output, const limb *in, const limb *in2) {
  305. limb t[19];
  306. fproduct(t, in, in2);
  307. freduce_degree(t);
  308. freduce_coefficients(t);
  309. memcpy(output, t, sizeof(limb) * 10);
  310. }
  311. static void fsquare_inner(limb *output, const limb *in) {
  312. output[0] = ((limb) ((s32) in[0])) * ((s32) in[0]);
  313. output[1] = 2 * ((limb) ((s32) in[0])) * ((s32) in[1]);
  314. output[2] = 2 * (((limb) ((s32) in[1])) * ((s32) in[1]) +
  315. ((limb) ((s32) in[0])) * ((s32) in[2]));
  316. output[3] = 2 * (((limb) ((s32) in[1])) * ((s32) in[2]) +
  317. ((limb) ((s32) in[0])) * ((s32) in[3]));
  318. output[4] = ((limb) ((s32) in[2])) * ((s32) in[2]) +
  319. 4 * ((limb) ((s32) in[1])) * ((s32) in[3]) +
  320. 2 * ((limb) ((s32) in[0])) * ((s32) in[4]);
  321. output[5] = 2 * (((limb) ((s32) in[2])) * ((s32) in[3]) +
  322. ((limb) ((s32) in[1])) * ((s32) in[4]) +
  323. ((limb) ((s32) in[0])) * ((s32) in[5]));
  324. output[6] = 2 * (((limb) ((s32) in[3])) * ((s32) in[3]) +
  325. ((limb) ((s32) in[2])) * ((s32) in[4]) +
  326. ((limb) ((s32) in[0])) * ((s32) in[6]) +
  327. 2 * ((limb) ((s32) in[1])) * ((s32) in[5]));
  328. output[7] = 2 * (((limb) ((s32) in[3])) * ((s32) in[4]) +
  329. ((limb) ((s32) in[2])) * ((s32) in[5]) +
  330. ((limb) ((s32) in[1])) * ((s32) in[6]) +
  331. ((limb) ((s32) in[0])) * ((s32) in[7]));
  332. output[8] = ((limb) ((s32) in[4])) * ((s32) in[4]) +
  333. 2 * (((limb) ((s32) in[2])) * ((s32) in[6]) +
  334. ((limb) ((s32) in[0])) * ((s32) in[8]) +
  335. 2 * (((limb) ((s32) in[1])) * ((s32) in[7]) +
  336. ((limb) ((s32) in[3])) * ((s32) in[5])));
  337. output[9] = 2 * (((limb) ((s32) in[4])) * ((s32) in[5]) +
  338. ((limb) ((s32) in[3])) * ((s32) in[6]) +
  339. ((limb) ((s32) in[2])) * ((s32) in[7]) +
  340. ((limb) ((s32) in[1])) * ((s32) in[8]) +
  341. ((limb) ((s32) in[0])) * ((s32) in[9]));
  342. output[10] = 2 * (((limb) ((s32) in[5])) * ((s32) in[5]) +
  343. ((limb) ((s32) in[4])) * ((s32) in[6]) +
  344. ((limb) ((s32) in[2])) * ((s32) in[8]) +
  345. 2 * (((limb) ((s32) in[3])) * ((s32) in[7]) +
  346. ((limb) ((s32) in[1])) * ((s32) in[9])));
  347. output[11] = 2 * (((limb) ((s32) in[5])) * ((s32) in[6]) +
  348. ((limb) ((s32) in[4])) * ((s32) in[7]) +
  349. ((limb) ((s32) in[3])) * ((s32) in[8]) +
  350. ((limb) ((s32) in[2])) * ((s32) in[9]));
  351. output[12] = ((limb) ((s32) in[6])) * ((s32) in[6]) +
  352. 2 * (((limb) ((s32) in[4])) * ((s32) in[8]) +
  353. 2 * (((limb) ((s32) in[5])) * ((s32) in[7]) +
  354. ((limb) ((s32) in[3])) * ((s32) in[9])));
  355. output[13] = 2 * (((limb) ((s32) in[6])) * ((s32) in[7]) +
  356. ((limb) ((s32) in[5])) * ((s32) in[8]) +
  357. ((limb) ((s32) in[4])) * ((s32) in[9]));
  358. output[14] = 2 * (((limb) ((s32) in[7])) * ((s32) in[7]) +
  359. ((limb) ((s32) in[6])) * ((s32) in[8]) +
  360. 2 * ((limb) ((s32) in[5])) * ((s32) in[9]));
  361. output[15] = 2 * (((limb) ((s32) in[7])) * ((s32) in[8]) +
  362. ((limb) ((s32) in[6])) * ((s32) in[9]));
  363. output[16] = ((limb) ((s32) in[8])) * ((s32) in[8]) +
  364. 4 * ((limb) ((s32) in[7])) * ((s32) in[9]);
  365. output[17] = 2 * ((limb) ((s32) in[8])) * ((s32) in[9]);
  366. output[18] = 2 * ((limb) ((s32) in[9])) * ((s32) in[9]);
  367. }
  368. static void
  369. fsquare(limb *output, const limb *in) {
  370. limb t[19];
  371. fsquare_inner(t, in);
  372. freduce_degree(t);
  373. freduce_coefficients(t);
  374. memcpy(output, t, sizeof(limb) * 10);
  375. }
  376. /* Take a little-endian, 32-byte number and expand it into polynomial form */
  377. static void
  378. fexpand(limb *output, const u8 *input) {
  379. #define F(n,start,shift,mask) \
  380. output[n] = ((((limb) input[start + 0]) | \
  381. ((limb) input[start + 1]) << 8 | \
  382. ((limb) input[start + 2]) << 16 | \
  383. ((limb) input[start + 3]) << 24) >> shift) & mask;
  384. F(0, 0, 0, 0x3ffffff);
  385. F(1, 3, 2, 0x1ffffff);
  386. F(2, 6, 3, 0x3ffffff);
  387. F(3, 9, 5, 0x1ffffff);
  388. F(4, 12, 6, 0x3ffffff);
  389. F(5, 16, 0, 0x1ffffff);
  390. F(6, 19, 1, 0x3ffffff);
  391. F(7, 22, 3, 0x1ffffff);
  392. F(8, 25, 4, 0x3ffffff);
  393. F(9, 28, 6, 0x1ffffff);
  394. #undef F
  395. }
  396. #if (-32 >> 1) != -16
  397. #error "This code only works when >> does sign-extension on negative numbers"
  398. #endif
  399. /* Take a fully reduced polynomial form number and contract it into a
  400. * little-endian, 32-byte array
  401. */
  402. static void
  403. fcontract(u8 *output, limb *input) {
  404. int i;
  405. int j;
  406. for (j = 0; j < 2; ++j) {
  407. for (i = 0; i < 9; ++i) {
  408. if ((i & 1) == 1) {
  409. /* This calculation is a time-invariant way to make input[i] positive
  410. by borrowing from the next-larger limb.
  411. */
  412. const s32 mask = (s32)(input[i]) >> 31;
  413. const s32 carry = -(((s32)(input[i]) & mask) >> 25);
  414. input[i] = (s32)(input[i]) + (carry << 25);
  415. input[i+1] = (s32)(input[i+1]) - carry;
  416. } else {
  417. const s32 mask = (s32)(input[i]) >> 31;
  418. const s32 carry = -(((s32)(input[i]) & mask) >> 26);
  419. input[i] = (s32)(input[i]) + (carry << 26);
  420. input[i+1] = (s32)(input[i+1]) - carry;
  421. }
  422. }
  423. {
  424. const s32 mask = (s32)(input[9]) >> 31;
  425. const s32 carry = -(((s32)(input[9]) & mask) >> 25);
  426. input[9] = (s32)(input[9]) + (carry << 25);
  427. input[0] = (s32)(input[0]) - (carry * 19);
  428. }
  429. }
  430. /* The first borrow-propagation pass above ended with every limb
  431. except (possibly) input[0] non-negative.
  432. Since each input limb except input[0] is decreased by at most 1
  433. by a borrow-propagation pass, the second borrow-propagation pass
  434. could only have wrapped around to decrease input[0] again if the
  435. first pass left input[0] negative *and* input[1] through input[9]
  436. were all zero. In that case, input[1] is now 2^25 - 1, and this
  437. last borrow-propagation step will leave input[1] non-negative.
  438. */
  439. {
  440. const s32 mask = (s32)(input[0]) >> 31;
  441. const s32 carry = -(((s32)(input[0]) & mask) >> 26);
  442. input[0] = (s32)(input[0]) + (carry << 26);
  443. input[1] = (s32)(input[1]) - carry;
  444. }
  445. /* Both passes through the above loop, plus the last 0-to-1 step, are
  446. necessary: if input[9] is -1 and input[0] through input[8] are 0,
  447. negative values will remain in the array until the end.
  448. */
  449. input[1] <<= 2;
  450. input[2] <<= 3;
  451. input[3] <<= 5;
  452. input[4] <<= 6;
  453. input[6] <<= 1;
  454. input[7] <<= 3;
  455. input[8] <<= 4;
  456. input[9] <<= 6;
  457. #define F(i, s) \
  458. output[s+0] |= input[i] & 0xff; \
  459. output[s+1] = (input[i] >> 8) & 0xff; \
  460. output[s+2] = (input[i] >> 16) & 0xff; \
  461. output[s+3] = (input[i] >> 24) & 0xff;
  462. output[0] = 0;
  463. output[16] = 0;
  464. F(0,0);
  465. F(1,3);
  466. F(2,6);
  467. F(3,9);
  468. F(4,12);
  469. F(5,16);
  470. F(6,19);
  471. F(7,22);
  472. F(8,25);
  473. F(9,28);
  474. #undef F
  475. }
  476. /* Input: Q, Q', Q-Q'
  477. * Output: 2Q, Q+Q'
  478. *
  479. * x2 z3: long form
  480. * x3 z3: long form
  481. * x z: short form, destroyed
  482. * xprime zprime: short form, destroyed
  483. * qmqp: short form, preserved
  484. */
  485. static void fmonty(limb *x2, limb *z2, /* output 2Q */
  486. limb *x3, limb *z3, /* output Q + Q' */
  487. limb *x, limb *z, /* input Q */
  488. limb *xprime, limb *zprime, /* input Q' */
  489. const limb *qmqp /* input Q - Q' */) {
  490. limb origx[10], origxprime[10], zzz[19], xx[19], zz[19], xxprime[19],
  491. zzprime[19], zzzprime[19], xxxprime[19];
  492. memcpy(origx, x, 10 * sizeof(limb));
  493. fsum(x, z);
  494. fdifference(z, origx); // does x - z
  495. memcpy(origxprime, xprime, sizeof(limb) * 10);
  496. fsum(xprime, zprime);
  497. fdifference(zprime, origxprime);
  498. fproduct(xxprime, xprime, z);
  499. fproduct(zzprime, x, zprime);
  500. freduce_degree(xxprime);
  501. freduce_coefficients(xxprime);
  502. freduce_degree(zzprime);
  503. freduce_coefficients(zzprime);
  504. memcpy(origxprime, xxprime, sizeof(limb) * 10);
  505. fsum(xxprime, zzprime);
  506. fdifference(zzprime, origxprime);
  507. fsquare(xxxprime, xxprime);
  508. fsquare(zzzprime, zzprime);
  509. fproduct(zzprime, zzzprime, qmqp);
  510. freduce_degree(zzprime);
  511. freduce_coefficients(zzprime);
  512. memcpy(x3, xxxprime, sizeof(limb) * 10);
  513. memcpy(z3, zzprime, sizeof(limb) * 10);
  514. fsquare(xx, x);
  515. fsquare(zz, z);
  516. fproduct(x2, xx, zz);
  517. freduce_degree(x2);
  518. freduce_coefficients(x2);
  519. fdifference(zz, xx); // does zz = xx - zz
  520. memset(zzz + 10, 0, sizeof(limb) * 9);
  521. fscalar_product(zzz, zz, 121665);
  522. /* No need to call freduce_degree here:
  523. fscalar_product doesn't increase the degree of its input. */
  524. freduce_coefficients(zzz);
  525. fsum(zzz, xx);
  526. fproduct(z2, zz, zzz);
  527. freduce_degree(z2);
  528. freduce_coefficients(z2);
  529. }
  530. /* Conditionally swap two reduced-form limb arrays if 'iswap' is 1, but leave
  531. * them unchanged if 'iswap' is 0. Runs in data-invariant time to avoid
  532. * side-channel attacks.
  533. *
  534. * NOTE that this function requires that 'iswap' be 1 or 0; other values give
  535. * wrong results. Also, the two limb arrays must be in reduced-coefficient,
  536. * reduced-degree form: the values in a[10..19] or b[10..19] aren't swapped,
  537. * and all all values in a[0..9],b[0..9] must have magnitude less than
  538. * INT32_MAX.
  539. */
  540. static void
  541. swap_conditional(limb a[19], limb b[19], limb iswap) {
  542. unsigned i;
  543. const s32 swap = (s32) -iswap;
  544. for (i = 0; i < 10; ++i) {
  545. const s32 x = swap & ( ((s32)a[i]) ^ ((s32)b[i]) );
  546. a[i] = ((s32)a[i]) ^ x;
  547. b[i] = ((s32)b[i]) ^ x;
  548. }
  549. }
  550. /* Calculates nQ where Q is the x-coordinate of a point on the curve
  551. *
  552. * resultx/resultz: the x coordinate of the resulting curve point (short form)
  553. * n: a little endian, 32-byte number
  554. * q: a point of the curve (short form)
  555. */
  556. static void
  557. cmult(limb *resultx, limb *resultz, const u8 *n, const limb *q) {
  558. limb a[19] = {0}, b[19] = {1}, c[19] = {1}, d[19] = {0};
  559. limb *nqpqx = a, *nqpqz = b, *nqx = c, *nqz = d, *t;
  560. limb e[19] = {0}, f[19] = {1}, g[19] = {0}, h[19] = {1};
  561. limb *nqpqx2 = e, *nqpqz2 = f, *nqx2 = g, *nqz2 = h;
  562. unsigned i, j;
  563. memcpy(nqpqx, q, sizeof(limb) * 10);
  564. for (i = 0; i < 32; ++i) {
  565. u8 byte = n[31 - i];
  566. for (j = 0; j < 8; ++j) {
  567. const limb bit = byte >> 7;
  568. swap_conditional(nqx, nqpqx, bit);
  569. swap_conditional(nqz, nqpqz, bit);
  570. fmonty(nqx2, nqz2,
  571. nqpqx2, nqpqz2,
  572. nqx, nqz,
  573. nqpqx, nqpqz,
  574. q);
  575. swap_conditional(nqx2, nqpqx2, bit);
  576. swap_conditional(nqz2, nqpqz2, bit);
  577. t = nqx;
  578. nqx = nqx2;
  579. nqx2 = t;
  580. t = nqz;
  581. nqz = nqz2;
  582. nqz2 = t;
  583. t = nqpqx;
  584. nqpqx = nqpqx2;
  585. nqpqx2 = t;
  586. t = nqpqz;
  587. nqpqz = nqpqz2;
  588. nqpqz2 = t;
  589. byte <<= 1;
  590. }
  591. }
  592. memcpy(resultx, nqx, sizeof(limb) * 10);
  593. memcpy(resultz, nqz, sizeof(limb) * 10);
  594. }
  595. // -----------------------------------------------------------------------------
  596. // Shamelessly copied from djb's code
  597. // -----------------------------------------------------------------------------
  598. static void
  599. crecip(limb *out, const limb *z) {
  600. limb z2[10];
  601. limb z9[10];
  602. limb z11[10];
  603. limb z2_5_0[10];
  604. limb z2_10_0[10];
  605. limb z2_20_0[10];
  606. limb z2_50_0[10];
  607. limb z2_100_0[10];
  608. limb t0[10];
  609. limb t1[10];
  610. int i;
  611. /* 2 */ fsquare(z2,z);
  612. /* 4 */ fsquare(t1,z2);
  613. /* 8 */ fsquare(t0,t1);
  614. /* 9 */ fmul(z9,t0,z);
  615. /* 11 */ fmul(z11,z9,z2);
  616. /* 22 */ fsquare(t0,z11);
  617. /* 2^5 - 2^0 = 31 */ fmul(z2_5_0,t0,z9);
  618. /* 2^6 - 2^1 */ fsquare(t0,z2_5_0);
  619. /* 2^7 - 2^2 */ fsquare(t1,t0);
  620. /* 2^8 - 2^3 */ fsquare(t0,t1);
  621. /* 2^9 - 2^4 */ fsquare(t1,t0);
  622. /* 2^10 - 2^5 */ fsquare(t0,t1);
  623. /* 2^10 - 2^0 */ fmul(z2_10_0,t0,z2_5_0);
  624. /* 2^11 - 2^1 */ fsquare(t0,z2_10_0);
  625. /* 2^12 - 2^2 */ fsquare(t1,t0);
  626. /* 2^20 - 2^10 */ for (i = 2;i < 10;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
  627. /* 2^20 - 2^0 */ fmul(z2_20_0,t1,z2_10_0);
  628. /* 2^21 - 2^1 */ fsquare(t0,z2_20_0);
  629. /* 2^22 - 2^2 */ fsquare(t1,t0);
  630. /* 2^40 - 2^20 */ for (i = 2;i < 20;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
  631. /* 2^40 - 2^0 */ fmul(t0,t1,z2_20_0);
  632. /* 2^41 - 2^1 */ fsquare(t1,t0);
  633. /* 2^42 - 2^2 */ fsquare(t0,t1);
  634. /* 2^50 - 2^10 */ for (i = 2;i < 10;i += 2) { fsquare(t1,t0); fsquare(t0,t1); }
  635. /* 2^50 - 2^0 */ fmul(z2_50_0,t0,z2_10_0);
  636. /* 2^51 - 2^1 */ fsquare(t0,z2_50_0);
  637. /* 2^52 - 2^2 */ fsquare(t1,t0);
  638. /* 2^100 - 2^50 */ for (i = 2;i < 50;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
  639. /* 2^100 - 2^0 */ fmul(z2_100_0,t1,z2_50_0);
  640. /* 2^101 - 2^1 */ fsquare(t1,z2_100_0);
  641. /* 2^102 - 2^2 */ fsquare(t0,t1);
  642. /* 2^200 - 2^100 */ for (i = 2;i < 100;i += 2) { fsquare(t1,t0); fsquare(t0,t1); }
  643. /* 2^200 - 2^0 */ fmul(t1,t0,z2_100_0);
  644. /* 2^201 - 2^1 */ fsquare(t0,t1);
  645. /* 2^202 - 2^2 */ fsquare(t1,t0);
  646. /* 2^250 - 2^50 */ for (i = 2;i < 50;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
  647. /* 2^250 - 2^0 */ fmul(t0,t1,z2_50_0);
  648. /* 2^251 - 2^1 */ fsquare(t1,t0);
  649. /* 2^252 - 2^2 */ fsquare(t0,t1);
  650. /* 2^253 - 2^3 */ fsquare(t1,t0);
  651. /* 2^254 - 2^4 */ fsquare(t0,t1);
  652. /* 2^255 - 2^5 */ fsquare(t1,t0);
  653. /* 2^255 - 21 */ fmul(out,t1,z11);
  654. }
  655. int curve25519_donna(u8 *, const u8 *, const u8 *);
  656. int
  657. curve25519_donna(u8 *mypublic, const u8 *secret, const u8 *basepoint) {
  658. limb bp[10], x[10], z[11], zmone[10];
  659. uint8_t e[32];
  660. int i;
  661. for (i = 0; i < 32; ++i) e[i] = secret[i];
  662. e[0] &= 248;
  663. e[31] &= 127;
  664. e[31] |= 64;
  665. fexpand(bp, basepoint);
  666. cmult(x, z, e, bp);
  667. crecip(zmone, z);
  668. fmul(z, x, zmone);
  669. freduce_coefficients(z);
  670. fcontract(mypublic, z);
  671. return 0;
  672. }