curve25519-donna.c 31 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871
  1. /* Copyright 2008, Google Inc.
  2. * All rights reserved.
  3. *
  4. * Redistribution and use in source and binary forms, with or without
  5. * modification, are permitted provided that the following conditions are
  6. * met:
  7. *
  8. * * Redistributions of source code must retain the above copyright
  9. * notice, this list of conditions and the following disclaimer.
  10. * * Redistributions in binary form must reproduce the above
  11. * copyright notice, this list of conditions and the following disclaimer
  12. * in the documentation and/or other materials provided with the
  13. * distribution.
  14. * * Neither the name of Google Inc. nor the names of its
  15. * contributors may be used to endorse or promote products derived from
  16. * this software without specific prior written permission.
  17. *
  18. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  19. * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  20. * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  21. * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  22. * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  23. * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  24. * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  25. * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  26. * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  27. * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  28. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  29. *
  30. * curve25519-donna: Curve25519 elliptic curve, public key function
  31. *
  32. * http://code.google.com/p/curve25519-donna/
  33. *
  34. * Adam Langley <agl@imperialviolet.org>
  35. *
  36. * Derived from public domain C code by Daniel J. Bernstein <djb@cr.yp.to>
  37. *
  38. * More information about curve25519 can be found here
  39. * http://cr.yp.to/ecdh.html
  40. *
  41. * djb's sample implementation of curve25519 is written in a special assembly
  42. * language called qhasm and uses the floating point registers.
  43. *
  44. * This is, almost, a clean room reimplementation from the curve25519 paper. It
  45. * uses many of the tricks described therein. Only the crecip function is taken
  46. * from the sample implementation. */
  47. #include "orconfig.h"
  48. #include <string.h>
  49. #include "torint.h"
  50. typedef uint8_t u8;
  51. typedef int32_t s32;
  52. typedef int64_t limb;
  53. /* Field element representation:
  54. *
  55. * Field elements are written as an array of signed, 64-bit limbs, least
  56. * significant first. The value of the field element is:
  57. * x[0] + 2^26·x[1] + x^51·x[2] + 2^102·x[3] + ...
  58. *
  59. * i.e. the limbs are 26, 25, 26, 25, ... bits wide. */
  60. /* Sum two numbers: output += in */
  61. static void fsum(limb *output, const limb *in) {
  62. unsigned i;
  63. for (i = 0; i < 10; i += 2) {
  64. output[0+i] = output[0+i] + in[0+i];
  65. output[1+i] = output[1+i] + in[1+i];
  66. }
  67. }
  68. /* Find the difference of two numbers: output = in - output
  69. * (note the order of the arguments!). */
  70. static void fdifference(limb *output, const limb *in) {
  71. unsigned i;
  72. for (i = 0; i < 10; ++i) {
  73. output[i] = in[i] - output[i];
  74. }
  75. }
  76. /* Multiply a number by a scalar: output = in * scalar */
  77. static void fscalar_product(limb *output, const limb *in, const limb scalar) {
  78. unsigned i;
  79. for (i = 0; i < 10; ++i) {
  80. output[i] = in[i] * scalar;
  81. }
  82. }
  83. /* Multiply two numbers: output = in2 * in
  84. *
  85. * output must be distinct to both inputs. The inputs are reduced coefficient
  86. * form, the output is not.
  87. *
  88. * output[x] <= 14 * the largest product of the input limbs. */
  89. static void fproduct(limb *output, const limb *in2, const limb *in) {
  90. output[0] = ((limb) ((s32) in2[0])) * ((s32) in[0]);
  91. output[1] = ((limb) ((s32) in2[0])) * ((s32) in[1]) +
  92. ((limb) ((s32) in2[1])) * ((s32) in[0]);
  93. output[2] = 2 * ((limb) ((s32) in2[1])) * ((s32) in[1]) +
  94. ((limb) ((s32) in2[0])) * ((s32) in[2]) +
  95. ((limb) ((s32) in2[2])) * ((s32) in[0]);
  96. output[3] = ((limb) ((s32) in2[1])) * ((s32) in[2]) +
  97. ((limb) ((s32) in2[2])) * ((s32) in[1]) +
  98. ((limb) ((s32) in2[0])) * ((s32) in[3]) +
  99. ((limb) ((s32) in2[3])) * ((s32) in[0]);
  100. output[4] = ((limb) ((s32) in2[2])) * ((s32) in[2]) +
  101. 2 * (((limb) ((s32) in2[1])) * ((s32) in[3]) +
  102. ((limb) ((s32) in2[3])) * ((s32) in[1])) +
  103. ((limb) ((s32) in2[0])) * ((s32) in[4]) +
  104. ((limb) ((s32) in2[4])) * ((s32) in[0]);
  105. output[5] = ((limb) ((s32) in2[2])) * ((s32) in[3]) +
  106. ((limb) ((s32) in2[3])) * ((s32) in[2]) +
  107. ((limb) ((s32) in2[1])) * ((s32) in[4]) +
  108. ((limb) ((s32) in2[4])) * ((s32) in[1]) +
  109. ((limb) ((s32) in2[0])) * ((s32) in[5]) +
  110. ((limb) ((s32) in2[5])) * ((s32) in[0]);
  111. output[6] = 2 * (((limb) ((s32) in2[3])) * ((s32) in[3]) +
  112. ((limb) ((s32) in2[1])) * ((s32) in[5]) +
  113. ((limb) ((s32) in2[5])) * ((s32) in[1])) +
  114. ((limb) ((s32) in2[2])) * ((s32) in[4]) +
  115. ((limb) ((s32) in2[4])) * ((s32) in[2]) +
  116. ((limb) ((s32) in2[0])) * ((s32) in[6]) +
  117. ((limb) ((s32) in2[6])) * ((s32) in[0]);
  118. output[7] = ((limb) ((s32) in2[3])) * ((s32) in[4]) +
  119. ((limb) ((s32) in2[4])) * ((s32) in[3]) +
  120. ((limb) ((s32) in2[2])) * ((s32) in[5]) +
  121. ((limb) ((s32) in2[5])) * ((s32) in[2]) +
  122. ((limb) ((s32) in2[1])) * ((s32) in[6]) +
  123. ((limb) ((s32) in2[6])) * ((s32) in[1]) +
  124. ((limb) ((s32) in2[0])) * ((s32) in[7]) +
  125. ((limb) ((s32) in2[7])) * ((s32) in[0]);
  126. output[8] = ((limb) ((s32) in2[4])) * ((s32) in[4]) +
  127. 2 * (((limb) ((s32) in2[3])) * ((s32) in[5]) +
  128. ((limb) ((s32) in2[5])) * ((s32) in[3]) +
  129. ((limb) ((s32) in2[1])) * ((s32) in[7]) +
  130. ((limb) ((s32) in2[7])) * ((s32) in[1])) +
  131. ((limb) ((s32) in2[2])) * ((s32) in[6]) +
  132. ((limb) ((s32) in2[6])) * ((s32) in[2]) +
  133. ((limb) ((s32) in2[0])) * ((s32) in[8]) +
  134. ((limb) ((s32) in2[8])) * ((s32) in[0]);
  135. output[9] = ((limb) ((s32) in2[4])) * ((s32) in[5]) +
  136. ((limb) ((s32) in2[5])) * ((s32) in[4]) +
  137. ((limb) ((s32) in2[3])) * ((s32) in[6]) +
  138. ((limb) ((s32) in2[6])) * ((s32) in[3]) +
  139. ((limb) ((s32) in2[2])) * ((s32) in[7]) +
  140. ((limb) ((s32) in2[7])) * ((s32) in[2]) +
  141. ((limb) ((s32) in2[1])) * ((s32) in[8]) +
  142. ((limb) ((s32) in2[8])) * ((s32) in[1]) +
  143. ((limb) ((s32) in2[0])) * ((s32) in[9]) +
  144. ((limb) ((s32) in2[9])) * ((s32) in[0]);
  145. output[10] = 2 * (((limb) ((s32) in2[5])) * ((s32) in[5]) +
  146. ((limb) ((s32) in2[3])) * ((s32) in[7]) +
  147. ((limb) ((s32) in2[7])) * ((s32) in[3]) +
  148. ((limb) ((s32) in2[1])) * ((s32) in[9]) +
  149. ((limb) ((s32) in2[9])) * ((s32) in[1])) +
  150. ((limb) ((s32) in2[4])) * ((s32) in[6]) +
  151. ((limb) ((s32) in2[6])) * ((s32) in[4]) +
  152. ((limb) ((s32) in2[2])) * ((s32) in[8]) +
  153. ((limb) ((s32) in2[8])) * ((s32) in[2]);
  154. output[11] = ((limb) ((s32) in2[5])) * ((s32) in[6]) +
  155. ((limb) ((s32) in2[6])) * ((s32) in[5]) +
  156. ((limb) ((s32) in2[4])) * ((s32) in[7]) +
  157. ((limb) ((s32) in2[7])) * ((s32) in[4]) +
  158. ((limb) ((s32) in2[3])) * ((s32) in[8]) +
  159. ((limb) ((s32) in2[8])) * ((s32) in[3]) +
  160. ((limb) ((s32) in2[2])) * ((s32) in[9]) +
  161. ((limb) ((s32) in2[9])) * ((s32) in[2]);
  162. output[12] = ((limb) ((s32) in2[6])) * ((s32) in[6]) +
  163. 2 * (((limb) ((s32) in2[5])) * ((s32) in[7]) +
  164. ((limb) ((s32) in2[7])) * ((s32) in[5]) +
  165. ((limb) ((s32) in2[3])) * ((s32) in[9]) +
  166. ((limb) ((s32) in2[9])) * ((s32) in[3])) +
  167. ((limb) ((s32) in2[4])) * ((s32) in[8]) +
  168. ((limb) ((s32) in2[8])) * ((s32) in[4]);
  169. output[13] = ((limb) ((s32) in2[6])) * ((s32) in[7]) +
  170. ((limb) ((s32) in2[7])) * ((s32) in[6]) +
  171. ((limb) ((s32) in2[5])) * ((s32) in[8]) +
  172. ((limb) ((s32) in2[8])) * ((s32) in[5]) +
  173. ((limb) ((s32) in2[4])) * ((s32) in[9]) +
  174. ((limb) ((s32) in2[9])) * ((s32) in[4]);
  175. output[14] = 2 * (((limb) ((s32) in2[7])) * ((s32) in[7]) +
  176. ((limb) ((s32) in2[5])) * ((s32) in[9]) +
  177. ((limb) ((s32) in2[9])) * ((s32) in[5])) +
  178. ((limb) ((s32) in2[6])) * ((s32) in[8]) +
  179. ((limb) ((s32) in2[8])) * ((s32) in[6]);
  180. output[15] = ((limb) ((s32) in2[7])) * ((s32) in[8]) +
  181. ((limb) ((s32) in2[8])) * ((s32) in[7]) +
  182. ((limb) ((s32) in2[6])) * ((s32) in[9]) +
  183. ((limb) ((s32) in2[9])) * ((s32) in[6]);
  184. output[16] = ((limb) ((s32) in2[8])) * ((s32) in[8]) +
  185. 2 * (((limb) ((s32) in2[7])) * ((s32) in[9]) +
  186. ((limb) ((s32) in2[9])) * ((s32) in[7]));
  187. output[17] = ((limb) ((s32) in2[8])) * ((s32) in[9]) +
  188. ((limb) ((s32) in2[9])) * ((s32) in[8]);
  189. output[18] = 2 * ((limb) ((s32) in2[9])) * ((s32) in[9]);
  190. }
  191. /* Reduce a long form to a short form by taking the input mod 2^255 - 19.
  192. *
  193. * On entry: |output[i]| < 14*2^54
  194. * On exit: |output[0..8]| < 280*2^54 */
  195. static void freduce_degree(limb *output) {
  196. /* Each of these shifts and adds ends up multiplying the value by 19.
  197. *
  198. * For output[0..8], the absolute entry value is < 14*2^54 and we add, at
  199. * most, 19*14*2^54 thus, on exit, |output[0..8]| < 280*2^54. */
  200. output[8] += output[18] << 4;
  201. output[8] += output[18] << 1;
  202. output[8] += output[18];
  203. output[7] += output[17] << 4;
  204. output[7] += output[17] << 1;
  205. output[7] += output[17];
  206. output[6] += output[16] << 4;
  207. output[6] += output[16] << 1;
  208. output[6] += output[16];
  209. output[5] += output[15] << 4;
  210. output[5] += output[15] << 1;
  211. output[5] += output[15];
  212. output[4] += output[14] << 4;
  213. output[4] += output[14] << 1;
  214. output[4] += output[14];
  215. output[3] += output[13] << 4;
  216. output[3] += output[13] << 1;
  217. output[3] += output[13];
  218. output[2] += output[12] << 4;
  219. output[2] += output[12] << 1;
  220. output[2] += output[12];
  221. output[1] += output[11] << 4;
  222. output[1] += output[11] << 1;
  223. output[1] += output[11];
  224. output[0] += output[10] << 4;
  225. output[0] += output[10] << 1;
  226. output[0] += output[10];
  227. }
  228. #if (-1 & 3) != 3
  229. #error "This code only works on a two's complement system"
  230. #endif
  231. /* return v / 2^26, using only shifts and adds.
  232. *
  233. * On entry: v can take any value. */
  234. static inline limb
  235. div_by_2_26(const limb v)
  236. {
  237. /* High word of v; no shift needed. */
  238. const uint32_t highword = (uint32_t) (((uint64_t) v) >> 32);
  239. /* Set to all 1s if v was negative; else set to 0s. */
  240. const int32_t sign = ((int32_t) highword) >> 31;
  241. /* Set to 0x3ffffff if v was negative; else set to 0. */
  242. const int32_t roundoff = ((uint32_t) sign) >> 6;
  243. /* Should return v / (1<<26) */
  244. return (v + roundoff) >> 26;
  245. }
  246. /* return v / (2^25), using only shifts and adds.
  247. *
  248. * On entry: v can take any value. */
  249. static inline limb
  250. div_by_2_25(const limb v)
  251. {
  252. /* High word of v; no shift needed*/
  253. const uint32_t highword = (uint32_t) (((uint64_t) v) >> 32);
  254. /* Set to all 1s if v was negative; else set to 0s. */
  255. const int32_t sign = ((int32_t) highword) >> 31;
  256. /* Set to 0x1ffffff if v was negative; else set to 0. */
  257. const int32_t roundoff = ((uint32_t) sign) >> 7;
  258. /* Should return v / (1<<25) */
  259. return (v + roundoff) >> 25;
  260. }
  261. #if 0
  262. /* return v / (2^25), using only shifts and adds.
  263. *
  264. * On entry: v can take any value. */
  265. static inline s32
  266. div_s32_by_2_25(const s32 v)
  267. {
  268. const s32 roundoff = ((uint32_t)(v >> 31)) >> 7;
  269. return (v + roundoff) >> 25;
  270. }
  271. #endif
  272. /* Reduce all coefficients of the short form input so that |x| < 2^26.
  273. *
  274. * On entry: |output[i]| < 280*2^54 */
  275. static void freduce_coefficients(limb *output) {
  276. unsigned i;
  277. output[10] = 0;
  278. for (i = 0; i < 10; i += 2) {
  279. limb over = div_by_2_26(output[i]);
  280. /* The entry condition (that |output[i]| < 280*2^54) means that over is, at
  281. * most, 280*2^28 in the first iteration of this loop. This is added to the
  282. * next limb and we can approximate the resulting bound of that limb by
  283. * 281*2^54. */
  284. output[i] -= over << 26;
  285. output[i+1] += over;
  286. /* For the first iteration, |output[i+1]| < 281*2^54, thus |over| <
  287. * 281*2^29. When this is added to the next limb, the resulting bound can
  288. * be approximated as 281*2^54.
  289. *
  290. * For subsequent iterations of the loop, 281*2^54 remains a conservative
  291. * bound and no overflow occurs. */
  292. over = div_by_2_25(output[i+1]);
  293. output[i+1] -= over << 25;
  294. output[i+2] += over;
  295. }
  296. /* Now |output[10]| < 281*2^29 and all other coefficients are reduced. */
  297. output[0] += output[10] << 4;
  298. output[0] += output[10] << 1;
  299. output[0] += output[10];
  300. output[10] = 0;
  301. /* Now output[1..9] are reduced, and |output[0]| < 2^26 + 19*281*2^29
  302. * So |over| will be no more than 2^16. */
  303. {
  304. limb over = div_by_2_26(output[0]);
  305. output[0] -= over << 26;
  306. output[1] += over;
  307. }
  308. /* Now output[0,2..9] are reduced, and |output[1]| < 2^25 + 2^16 < 2^26. The
  309. * bound on |output[1]| is sufficient to meet our needs. */
  310. }
  311. /* A helpful wrapper around fproduct: output = in * in2.
  312. *
  313. * On entry: |in[i]| < 2^27 and |in2[i]| < 2^27.
  314. *
  315. * output must be distinct to both inputs. The output is reduced degree
  316. * (indeed, one need only provide storage for 10 limbs) and |output[i]| < 2^26. */
  317. static void
  318. fmul(limb *output, const limb *in, const limb *in2) {
  319. limb t[19];
  320. fproduct(t, in, in2);
  321. /* |t[i]| < 14*2^54 */
  322. freduce_degree(t);
  323. freduce_coefficients(t);
  324. /* |t[i]| < 2^26 */
  325. memcpy(output, t, sizeof(limb) * 10);
  326. }
  327. /* Square a number: output = in**2
  328. *
  329. * output must be distinct from the input. The inputs are reduced coefficient
  330. * form, the output is not.
  331. *
  332. * output[x] <= 14 * the largest product of the input limbs. */
  333. static void fsquare_inner(limb *output, const limb *in) {
  334. output[0] = ((limb) ((s32) in[0])) * ((s32) in[0]);
  335. output[1] = 2 * ((limb) ((s32) in[0])) * ((s32) in[1]);
  336. output[2] = 2 * (((limb) ((s32) in[1])) * ((s32) in[1]) +
  337. ((limb) ((s32) in[0])) * ((s32) in[2]));
  338. output[3] = 2 * (((limb) ((s32) in[1])) * ((s32) in[2]) +
  339. ((limb) ((s32) in[0])) * ((s32) in[3]));
  340. output[4] = ((limb) ((s32) in[2])) * ((s32) in[2]) +
  341. 4 * ((limb) ((s32) in[1])) * ((s32) in[3]) +
  342. 2 * ((limb) ((s32) in[0])) * ((s32) in[4]);
  343. output[5] = 2 * (((limb) ((s32) in[2])) * ((s32) in[3]) +
  344. ((limb) ((s32) in[1])) * ((s32) in[4]) +
  345. ((limb) ((s32) in[0])) * ((s32) in[5]));
  346. output[6] = 2 * (((limb) ((s32) in[3])) * ((s32) in[3]) +
  347. ((limb) ((s32) in[2])) * ((s32) in[4]) +
  348. ((limb) ((s32) in[0])) * ((s32) in[6]) +
  349. 2 * ((limb) ((s32) in[1])) * ((s32) in[5]));
  350. output[7] = 2 * (((limb) ((s32) in[3])) * ((s32) in[4]) +
  351. ((limb) ((s32) in[2])) * ((s32) in[5]) +
  352. ((limb) ((s32) in[1])) * ((s32) in[6]) +
  353. ((limb) ((s32) in[0])) * ((s32) in[7]));
  354. output[8] = ((limb) ((s32) in[4])) * ((s32) in[4]) +
  355. 2 * (((limb) ((s32) in[2])) * ((s32) in[6]) +
  356. ((limb) ((s32) in[0])) * ((s32) in[8]) +
  357. 2 * (((limb) ((s32) in[1])) * ((s32) in[7]) +
  358. ((limb) ((s32) in[3])) * ((s32) in[5])));
  359. output[9] = 2 * (((limb) ((s32) in[4])) * ((s32) in[5]) +
  360. ((limb) ((s32) in[3])) * ((s32) in[6]) +
  361. ((limb) ((s32) in[2])) * ((s32) in[7]) +
  362. ((limb) ((s32) in[1])) * ((s32) in[8]) +
  363. ((limb) ((s32) in[0])) * ((s32) in[9]));
  364. output[10] = 2 * (((limb) ((s32) in[5])) * ((s32) in[5]) +
  365. ((limb) ((s32) in[4])) * ((s32) in[6]) +
  366. ((limb) ((s32) in[2])) * ((s32) in[8]) +
  367. 2 * (((limb) ((s32) in[3])) * ((s32) in[7]) +
  368. ((limb) ((s32) in[1])) * ((s32) in[9])));
  369. output[11] = 2 * (((limb) ((s32) in[5])) * ((s32) in[6]) +
  370. ((limb) ((s32) in[4])) * ((s32) in[7]) +
  371. ((limb) ((s32) in[3])) * ((s32) in[8]) +
  372. ((limb) ((s32) in[2])) * ((s32) in[9]));
  373. output[12] = ((limb) ((s32) in[6])) * ((s32) in[6]) +
  374. 2 * (((limb) ((s32) in[4])) * ((s32) in[8]) +
  375. 2 * (((limb) ((s32) in[5])) * ((s32) in[7]) +
  376. ((limb) ((s32) in[3])) * ((s32) in[9])));
  377. output[13] = 2 * (((limb) ((s32) in[6])) * ((s32) in[7]) +
  378. ((limb) ((s32) in[5])) * ((s32) in[8]) +
  379. ((limb) ((s32) in[4])) * ((s32) in[9]));
  380. output[14] = 2 * (((limb) ((s32) in[7])) * ((s32) in[7]) +
  381. ((limb) ((s32) in[6])) * ((s32) in[8]) +
  382. 2 * ((limb) ((s32) in[5])) * ((s32) in[9]));
  383. output[15] = 2 * (((limb) ((s32) in[7])) * ((s32) in[8]) +
  384. ((limb) ((s32) in[6])) * ((s32) in[9]));
  385. output[16] = ((limb) ((s32) in[8])) * ((s32) in[8]) +
  386. 4 * ((limb) ((s32) in[7])) * ((s32) in[9]);
  387. output[17] = 2 * ((limb) ((s32) in[8])) * ((s32) in[9]);
  388. output[18] = 2 * ((limb) ((s32) in[9])) * ((s32) in[9]);
  389. }
  390. /* fsquare sets output = in^2.
  391. *
  392. * On entry: The |in| argument is in reduced coefficients form and |in[i]| <
  393. * 2^27.
  394. *
  395. * On exit: The |output| argument is in reduced coefficients form (indeed, one
  396. * need only provide storage for 10 limbs) and |out[i]| < 2^26. */
  397. static void
  398. fsquare(limb *output, const limb *in) {
  399. limb t[19];
  400. fsquare_inner(t, in);
  401. /* |t[i]| < 14*2^54 because the largest product of two limbs will be <
  402. * 2^(27+27) and fsquare_inner adds together, at most, 14 of those
  403. * products. */
  404. freduce_degree(t);
  405. freduce_coefficients(t);
  406. /* |t[i]| < 2^26 */
  407. memcpy(output, t, sizeof(limb) * 10);
  408. }
  409. /* Take a little-endian, 32-byte number and expand it into polynomial form */
  410. static void
  411. fexpand(limb *output, const u8 *input) {
  412. #define F(n,start,shift,mask) \
  413. output[n] = ((((limb) input[start + 0]) | \
  414. ((limb) input[start + 1]) << 8 | \
  415. ((limb) input[start + 2]) << 16 | \
  416. ((limb) input[start + 3]) << 24) >> shift) & mask;
  417. F(0, 0, 0, 0x3ffffff);
  418. F(1, 3, 2, 0x1ffffff);
  419. F(2, 6, 3, 0x3ffffff);
  420. F(3, 9, 5, 0x1ffffff);
  421. F(4, 12, 6, 0x3ffffff);
  422. F(5, 16, 0, 0x1ffffff);
  423. F(6, 19, 1, 0x3ffffff);
  424. F(7, 22, 3, 0x1ffffff);
  425. F(8, 25, 4, 0x3ffffff);
  426. F(9, 28, 6, 0x1ffffff);
  427. #undef F
  428. }
  429. #if (-32 >> 1) != -16
  430. #error "This code only works when >> does sign-extension on negative numbers"
  431. #endif
  432. /* s32_eq returns 0xffffffff iff a == b and zero otherwise. */
  433. static s32 s32_eq(s32 a, s32 b) {
  434. a = ~(a ^ b);
  435. a &= a << 16;
  436. a &= a << 8;
  437. a &= a << 4;
  438. a &= a << 2;
  439. a &= a << 1;
  440. return a >> 31;
  441. }
  442. /* s32_gte returns 0xffffffff if a >= b and zero otherwise, where a and b are
  443. * both non-negative. */
  444. static s32 s32_gte(s32 a, s32 b) {
  445. a -= b;
  446. /* a >= 0 iff a >= b. */
  447. return ~(a >> 31);
  448. }
  449. /* Take a fully reduced polynomial form number and contract it into a
  450. * little-endian, 32-byte array.
  451. *
  452. * On entry: |input_limbs[i]| < 2^26 */
  453. static void
  454. fcontract(u8 *output, limb *input_limbs) {
  455. int i;
  456. int j;
  457. s32 input[10];
  458. /* |input_limbs[i]| < 2^26, so it's valid to convert to an s32. */
  459. for (i = 0; i < 10; i++) {
  460. input[i] = (s32) input_limbs[i];
  461. }
  462. for (j = 0; j < 2; ++j) {
  463. for (i = 0; i < 9; ++i) {
  464. if ((i & 1) == 1) {
  465. /* This calculation is a time-invariant way to make input[i]
  466. * non-negative by borrowing from the next-larger limb. */
  467. const s32 mask = input[i] >> 31;
  468. const s32 carry = -((input[i] & mask) >> 25);
  469. input[i] = input[i] + (carry << 25);
  470. input[i+1] = input[i+1] - carry;
  471. } else {
  472. const s32 mask = input[i] >> 31;
  473. const s32 carry = -((input[i] & mask) >> 26);
  474. input[i] = input[i] + (carry << 26);
  475. input[i+1] = input[i+1] - carry;
  476. }
  477. }
  478. /* There's no greater limb for input[9] to borrow from, but we can multiply
  479. * by 19 and borrow from input[0], which is valid mod 2^255-19. */
  480. {
  481. const s32 mask = input[9] >> 31;
  482. const s32 carry = -((input[9] & mask) >> 25);
  483. input[9] = input[9] + (carry << 25);
  484. input[0] = input[0] - (carry * 19);
  485. }
  486. /* After the first iteration, input[1..9] are non-negative and fit within
  487. * 25 or 26 bits, depending on position. However, input[0] may be
  488. * negative. */
  489. }
  490. /* The first borrow-propagation pass above ended with every limb
  491. except (possibly) input[0] non-negative.
  492. If input[0] was negative after the first pass, then it was because of a
  493. carry from input[9]. On entry, input[9] < 2^26 so the carry was, at most,
  494. one, since (2**26-1) >> 25 = 1. Thus input[0] >= -19.
  495. In the second pass, each limb is decreased by at most one. Thus the second
  496. borrow-propagation pass could only have wrapped around to decrease
  497. input[0] again if the first pass left input[0] negative *and* input[1]
  498. through input[9] were all zero. In that case, input[1] is now 2^25 - 1,
  499. and this last borrow-propagation step will leave input[1] non-negative. */
  500. {
  501. const s32 mask = input[0] >> 31;
  502. const s32 carry = -((input[0] & mask) >> 26);
  503. input[0] = input[0] + (carry << 26);
  504. input[1] = input[1] - carry;
  505. }
  506. /* All input[i] are now non-negative. However, there might be values between
  507. * 2^25 and 2^26 in a limb which is, nominally, 25 bits wide. */
  508. for (j = 0; j < 2; j++) {
  509. for (i = 0; i < 9; i++) {
  510. if ((i & 1) == 1) {
  511. const s32 carry = input[i] >> 25;
  512. input[i] &= 0x1ffffff;
  513. input[i+1] += carry;
  514. } else {
  515. const s32 carry = input[i] >> 26;
  516. input[i] &= 0x3ffffff;
  517. input[i+1] += carry;
  518. }
  519. }
  520. {
  521. const s32 carry = input[9] >> 25;
  522. input[9] &= 0x1ffffff;
  523. input[0] += 19*carry;
  524. }
  525. }
  526. /* If the first carry-chain pass, just above, ended up with a carry from
  527. * input[9], and that caused input[0] to be out-of-bounds, then input[0] was
  528. * < 2^26 + 2*19, because the carry was, at most, two.
  529. *
  530. * If the second pass carried from input[9] again then input[0] is < 2*19 and
  531. * the input[9] -> input[0] carry didn't push input[0] out of bounds. */
  532. /* It still remains the case that input might be between 2^255-19 and 2^255.
  533. * In this case, input[1..9] must take their maximum value and input[0] must
  534. * be >= (2^255-19) & 0x3ffffff, which is 0x3ffffed. */
  535. s32 mask = s32_gte(input[0], 0x3ffffed);
  536. for (i = 1; i < 10; i++) {
  537. if ((i & 1) == 1) {
  538. mask &= s32_eq(input[i], 0x1ffffff);
  539. } else {
  540. mask &= s32_eq(input[i], 0x3ffffff);
  541. }
  542. }
  543. /* mask is either 0xffffffff (if input >= 2^255-19) and zero otherwise. Thus
  544. * this conditionally subtracts 2^255-19. */
  545. input[0] -= mask & 0x3ffffed;
  546. for (i = 1; i < 10; i++) {
  547. if ((i & 1) == 1) {
  548. input[i] -= mask & 0x1ffffff;
  549. } else {
  550. input[i] -= mask & 0x3ffffff;
  551. }
  552. }
  553. input[1] <<= 2;
  554. input[2] <<= 3;
  555. input[3] <<= 5;
  556. input[4] <<= 6;
  557. input[6] <<= 1;
  558. input[7] <<= 3;
  559. input[8] <<= 4;
  560. input[9] <<= 6;
  561. #define F(i, s) \
  562. output[s+0] |= input[i] & 0xff; \
  563. output[s+1] = (input[i] >> 8) & 0xff; \
  564. output[s+2] = (input[i] >> 16) & 0xff; \
  565. output[s+3] = (input[i] >> 24) & 0xff;
  566. output[0] = 0;
  567. output[16] = 0;
  568. F(0,0);
  569. F(1,3);
  570. F(2,6);
  571. F(3,9);
  572. F(4,12);
  573. F(5,16);
  574. F(6,19);
  575. F(7,22);
  576. F(8,25);
  577. F(9,28);
  578. #undef F
  579. }
  580. /* Input: Q, Q', Q-Q'
  581. * Output: 2Q, Q+Q'
  582. *
  583. * x2 z3: long form
  584. * x3 z3: long form
  585. * x z: short form, destroyed
  586. * xprime zprime: short form, destroyed
  587. * qmqp: short form, preserved
  588. *
  589. * On entry and exit, the absolute value of the limbs of all inputs and outputs
  590. * are < 2^26. */
  591. static void fmonty(limb *x2, limb *z2, /* output 2Q */
  592. limb *x3, limb *z3, /* output Q + Q' */
  593. limb *x, limb *z, /* input Q */
  594. limb *xprime, limb *zprime, /* input Q' */
  595. const limb *qmqp /* input Q - Q' */) {
  596. limb origx[10], origxprime[10], zzz[19], xx[19], zz[19], xxprime[19],
  597. zzprime[19], zzzprime[19], xxxprime[19];
  598. memcpy(origx, x, 10 * sizeof(limb));
  599. fsum(x, z);
  600. /* |x[i]| < 2^27 */
  601. fdifference(z, origx); /* does x - z */
  602. /* |z[i]| < 2^27 */
  603. memcpy(origxprime, xprime, sizeof(limb) * 10);
  604. fsum(xprime, zprime);
  605. /* |xprime[i]| < 2^27 */
  606. fdifference(zprime, origxprime);
  607. /* |zprime[i]| < 2^27 */
  608. fproduct(xxprime, xprime, z);
  609. /* |xxprime[i]| < 14*2^54: the largest product of two limbs will be <
  610. * 2^(27+27) and fproduct adds together, at most, 14 of those products.
  611. * (Approximating that to 2^58 doesn't work out.) */
  612. fproduct(zzprime, x, zprime);
  613. /* |zzprime[i]| < 14*2^54 */
  614. freduce_degree(xxprime);
  615. freduce_coefficients(xxprime);
  616. /* |xxprime[i]| < 2^26 */
  617. freduce_degree(zzprime);
  618. freduce_coefficients(zzprime);
  619. /* |zzprime[i]| < 2^26 */
  620. memcpy(origxprime, xxprime, sizeof(limb) * 10);
  621. fsum(xxprime, zzprime);
  622. /* |xxprime[i]| < 2^27 */
  623. fdifference(zzprime, origxprime);
  624. /* |zzprime[i]| < 2^27 */
  625. fsquare(xxxprime, xxprime);
  626. /* |xxxprime[i]| < 2^26 */
  627. fsquare(zzzprime, zzprime);
  628. /* |zzzprime[i]| < 2^26 */
  629. fproduct(zzprime, zzzprime, qmqp);
  630. /* |zzprime[i]| < 14*2^52 */
  631. freduce_degree(zzprime);
  632. freduce_coefficients(zzprime);
  633. /* |zzprime[i]| < 2^26 */
  634. memcpy(x3, xxxprime, sizeof(limb) * 10);
  635. memcpy(z3, zzprime, sizeof(limb) * 10);
  636. fsquare(xx, x);
  637. /* |xx[i]| < 2^26 */
  638. fsquare(zz, z);
  639. /* |zz[i]| < 2^26 */
  640. fproduct(x2, xx, zz);
  641. /* |x2[i]| < 14*2^52 */
  642. freduce_degree(x2);
  643. freduce_coefficients(x2);
  644. /* |x2[i]| < 2^26 */
  645. fdifference(zz, xx); // does zz = xx - zz
  646. /* |zz[i]| < 2^27 */
  647. memset(zzz + 10, 0, sizeof(limb) * 9);
  648. fscalar_product(zzz, zz, 121665);
  649. /* |zzz[i]| < 2^(27+17) */
  650. /* No need to call freduce_degree here:
  651. fscalar_product doesn't increase the degree of its input. */
  652. freduce_coefficients(zzz);
  653. /* |zzz[i]| < 2^26 */
  654. fsum(zzz, xx);
  655. /* |zzz[i]| < 2^27 */
  656. fproduct(z2, zz, zzz);
  657. /* |z2[i]| < 14*2^(26+27) */
  658. freduce_degree(z2);
  659. freduce_coefficients(z2);
  660. /* |z2|i| < 2^26 */
  661. }
  662. /* Conditionally swap two reduced-form limb arrays if 'iswap' is 1, but leave
  663. * them unchanged if 'iswap' is 0. Runs in data-invariant time to avoid
  664. * side-channel attacks.
  665. *
  666. * NOTE that this function requires that 'iswap' be 1 or 0; other values give
  667. * wrong results. Also, the two limb arrays must be in reduced-coefficient,
  668. * reduced-degree form: the values in a[10..19] or b[10..19] aren't swapped,
  669. * and all all values in a[0..9],b[0..9] must have magnitude less than
  670. * INT32_MAX. */
  671. static void
  672. swap_conditional(limb a[19], limb b[19], limb iswap) {
  673. unsigned i;
  674. const s32 swap = (s32) -iswap;
  675. for (i = 0; i < 10; ++i) {
  676. const s32 x = swap & ( ((s32)a[i]) ^ ((s32)b[i]) );
  677. a[i] = ((s32)a[i]) ^ x;
  678. b[i] = ((s32)b[i]) ^ x;
  679. }
  680. }
  681. /* Calculates nQ where Q is the x-coordinate of a point on the curve
  682. *
  683. * resultx/resultz: the x coordinate of the resulting curve point (short form)
  684. * n: a little endian, 32-byte number
  685. * q: a point of the curve (short form) */
  686. static void
  687. cmult(limb *resultx, limb *resultz, const u8 *n, const limb *q) {
  688. limb a[19] = {0}, b[19] = {1}, c[19] = {1}, d[19] = {0};
  689. limb *nqpqx = a, *nqpqz = b, *nqx = c, *nqz = d, *t;
  690. limb e[19] = {0}, f[19] = {1}, g[19] = {0}, h[19] = {1};
  691. limb *nqpqx2 = e, *nqpqz2 = f, *nqx2 = g, *nqz2 = h;
  692. unsigned i, j;
  693. memcpy(nqpqx, q, sizeof(limb) * 10);
  694. for (i = 0; i < 32; ++i) {
  695. u8 byte = n[31 - i];
  696. for (j = 0; j < 8; ++j) {
  697. const limb bit = byte >> 7;
  698. swap_conditional(nqx, nqpqx, bit);
  699. swap_conditional(nqz, nqpqz, bit);
  700. fmonty(nqx2, nqz2,
  701. nqpqx2, nqpqz2,
  702. nqx, nqz,
  703. nqpqx, nqpqz,
  704. q);
  705. swap_conditional(nqx2, nqpqx2, bit);
  706. swap_conditional(nqz2, nqpqz2, bit);
  707. t = nqx;
  708. nqx = nqx2;
  709. nqx2 = t;
  710. t = nqz;
  711. nqz = nqz2;
  712. nqz2 = t;
  713. t = nqpqx;
  714. nqpqx = nqpqx2;
  715. nqpqx2 = t;
  716. t = nqpqz;
  717. nqpqz = nqpqz2;
  718. nqpqz2 = t;
  719. byte <<= 1;
  720. }
  721. }
  722. memcpy(resultx, nqx, sizeof(limb) * 10);
  723. memcpy(resultz, nqz, sizeof(limb) * 10);
  724. }
  725. // -----------------------------------------------------------------------------
  726. // Shamelessly copied from djb's code
  727. // -----------------------------------------------------------------------------
  728. static void
  729. crecip(limb *out, const limb *z) {
  730. limb z2[10];
  731. limb z9[10];
  732. limb z11[10];
  733. limb z2_5_0[10];
  734. limb z2_10_0[10];
  735. limb z2_20_0[10];
  736. limb z2_50_0[10];
  737. limb z2_100_0[10];
  738. limb t0[10];
  739. limb t1[10];
  740. int i;
  741. /* 2 */ fsquare(z2,z);
  742. /* 4 */ fsquare(t1,z2);
  743. /* 8 */ fsquare(t0,t1);
  744. /* 9 */ fmul(z9,t0,z);
  745. /* 11 */ fmul(z11,z9,z2);
  746. /* 22 */ fsquare(t0,z11);
  747. /* 2^5 - 2^0 = 31 */ fmul(z2_5_0,t0,z9);
  748. /* 2^6 - 2^1 */ fsquare(t0,z2_5_0);
  749. /* 2^7 - 2^2 */ fsquare(t1,t0);
  750. /* 2^8 - 2^3 */ fsquare(t0,t1);
  751. /* 2^9 - 2^4 */ fsquare(t1,t0);
  752. /* 2^10 - 2^5 */ fsquare(t0,t1);
  753. /* 2^10 - 2^0 */ fmul(z2_10_0,t0,z2_5_0);
  754. /* 2^11 - 2^1 */ fsquare(t0,z2_10_0);
  755. /* 2^12 - 2^2 */ fsquare(t1,t0);
  756. /* 2^20 - 2^10 */ for (i = 2;i < 10;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
  757. /* 2^20 - 2^0 */ fmul(z2_20_0,t1,z2_10_0);
  758. /* 2^21 - 2^1 */ fsquare(t0,z2_20_0);
  759. /* 2^22 - 2^2 */ fsquare(t1,t0);
  760. /* 2^40 - 2^20 */ for (i = 2;i < 20;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
  761. /* 2^40 - 2^0 */ fmul(t0,t1,z2_20_0);
  762. /* 2^41 - 2^1 */ fsquare(t1,t0);
  763. /* 2^42 - 2^2 */ fsquare(t0,t1);
  764. /* 2^50 - 2^10 */ for (i = 2;i < 10;i += 2) { fsquare(t1,t0); fsquare(t0,t1); }
  765. /* 2^50 - 2^0 */ fmul(z2_50_0,t0,z2_10_0);
  766. /* 2^51 - 2^1 */ fsquare(t0,z2_50_0);
  767. /* 2^52 - 2^2 */ fsquare(t1,t0);
  768. /* 2^100 - 2^50 */ for (i = 2;i < 50;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
  769. /* 2^100 - 2^0 */ fmul(z2_100_0,t1,z2_50_0);
  770. /* 2^101 - 2^1 */ fsquare(t1,z2_100_0);
  771. /* 2^102 - 2^2 */ fsquare(t0,t1);
  772. /* 2^200 - 2^100 */ for (i = 2;i < 100;i += 2) { fsquare(t1,t0); fsquare(t0,t1); }
  773. /* 2^200 - 2^0 */ fmul(t1,t0,z2_100_0);
  774. /* 2^201 - 2^1 */ fsquare(t0,t1);
  775. /* 2^202 - 2^2 */ fsquare(t1,t0);
  776. /* 2^250 - 2^50 */ for (i = 2;i < 50;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
  777. /* 2^250 - 2^0 */ fmul(t0,t1,z2_50_0);
  778. /* 2^251 - 2^1 */ fsquare(t1,t0);
  779. /* 2^252 - 2^2 */ fsquare(t0,t1);
  780. /* 2^253 - 2^3 */ fsquare(t1,t0);
  781. /* 2^254 - 2^4 */ fsquare(t0,t1);
  782. /* 2^255 - 2^5 */ fsquare(t1,t0);
  783. /* 2^255 - 21 */ fmul(out,t1,z11);
  784. }
  785. int curve25519_donna(u8 *mypublic, const u8 *secret, const u8 *basepoint);
  786. int
  787. curve25519_donna(u8 *mypublic, const u8 *secret, const u8 *basepoint) {
  788. limb bp[10], x[10], z[11], zmone[10];
  789. uint8_t e[32];
  790. int i;
  791. for (i = 0; i < 32; ++i) e[i] = secret[i];
  792. e[0] &= 248;
  793. e[31] &= 127;
  794. e[31] |= 64;
  795. fexpand(bp, basepoint);
  796. cmult(x, z, e, bp);
  797. crecip(zmone, z);
  798. fmul(z, x, zmone);
  799. fcontract(mypublic, z);
  800. return 0;
  801. }