/*
   Public domain by Andrew M. <liquidsun@gmail.com>
   See: https://github.com/floodyberry/curve25519-donna

   SSE2 curve25519 implementation
*/

#include <emmintrin.h>

typedef __m128i xmmi;

typedef union packedelem8_t {
    unsigned char u[16];
    xmmi v;
} packedelem8;

typedef union packedelem32_t {
    uint32_t u[4];
    xmmi v;
} packedelem32;

typedef union packedelem64_t {
    uint64_t u[2];
    xmmi v;
} packedelem64;

/* 10 elements + an extra 2 to fit in 3 xmm registers */
typedef uint32_t bignum25519[12];
typedef packedelem32 packed32bignum25519[5];
typedef packedelem64 packed64bignum25519[10];
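/*
 * Added commentary: a bignum25519 stores a field element mod 2^255 - 19 in
 * radix 2^25.5, ten limbs alternating 26 and 25 bits, limb i weighted by
 * 2^ceil(25.5*i):
 *
 *     value = f[0]       + f[1]*2^26  + f[2]*2^51  + f[3]*2^77  + f[4]*2^102
 *           + f[5]*2^128 + f[6]*2^153 + f[7]*2^179 + f[8]*2^204 + f[9]*2^230
 *
 * f[10] and f[11] stay zero; they only pad the array to 48 bytes so it moves
 * as exactly three 128-bit xmm loads/stores.
 */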
static const packedelem32 bot32bitmask = {{0xffffffff, 0x00000000, 0xffffffff, 0x00000000}};
static const packedelem32 top64bitmask = {{0x00000000, 0x00000000, 0xffffffff, 0xffffffff}};
static const packedelem32 bot64bitmask = {{0xffffffff, 0xffffffff, 0x00000000, 0x00000000}};

/* reduction masks */
static const packedelem64 packedmask26 = {{0x03ffffff, 0x03ffffff}};
static const packedelem64 packedmask25 = {{0x01ffffff, 0x01ffffff}};
static const packedelem32 packedmask26262626 = {{0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff}};
static const packedelem32 packedmask25252525 = {{0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff}};

/* multipliers */
static const packedelem64 packednineteen = {{19, 19}};
static const packedelem64 packedthirtyeight = {{38, 38}};
static const packedelem64 packed3819 = {{19*2, 19}};
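/*
 * Added commentary: the multipliers come from 2^255 = 19 mod p. A carry out
 * of the top limb re-enters the bottom limb times 19, and product terms that
 * wrap past limb 9 pick up the same factor; 38 = 2*19 serves the doubled
 * cross terms in squaring, and packed3819 = {2*19, 19} pairs both factors in
 * one vector.
 */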
/* 2*(2^255 - 19) = 0 mod p */
static const packedelem32 packed2p0 = {{0x7ffffda,0x3fffffe,0x7fffffe,0x3fffffe}};
static const packedelem32 packed2p1 = {{0x7fffffe,0x3fffffe,0x7fffffe,0x3fffffe}};
static const packedelem32 packed2p2 = {{0x7fffffe,0x3fffffe,0x0000000,0x0000000}};

static const packedelem32 packed32packed2p0 = {{0x7ffffda,0x7ffffda,0x3fffffe,0x3fffffe}};
static const packedelem32 packed32packed2p1 = {{0x7fffffe,0x7fffffe,0x3fffffe,0x3fffffe}};

/* 4*(2^255 - 19) = 0 mod p */
static const packedelem32 packed4p0 = {{0xfffffb4,0x7fffffc,0xffffffc,0x7fffffc}};
static const packedelem32 packed4p1 = {{0xffffffc,0x7fffffc,0xffffffc,0x7fffffc}};
static const packedelem32 packed4p2 = {{0xffffffc,0x7fffffc,0x0000000,0x0000000}};

static const packedelem32 packed32packed4p0 = {{0xfffffb4,0xfffffb4,0x7fffffc,0x7fffffc}};
static const packedelem32 packed32packed4p1 = {{0xffffffc,0xffffffc,0x7fffffc,0x7fffffc}};
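/*
 * Added commentary: limbs are unsigned, so a - b is computed as
 * (a + k*p) - b with k*p = 0 mod p. packed2p* spread 2*p across the limbs
 * (0x7ffffda = 2*(2^26 - 19), 0x3fffffe = 2*(2^25 - 1), 0x7fffffe =
 * 2*(2^26 - 1)); packed4p* hold 4*p for minuends that may be one bit over
 * canonical size. The packed32packed* variants are the same constants
 * rearranged for the interleaved packedelem32 layout.
 */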
/* out = in */
DONNA_INLINE static void
curve25519_copy(bignum25519 out, const bignum25519 in) {
    xmmi x0,x1,x2;
    x0 = _mm_load_si128((xmmi*)in + 0);
    x1 = _mm_load_si128((xmmi*)in + 1);
    x2 = _mm_load_si128((xmmi*)in + 2);
    _mm_store_si128((xmmi*)out + 0, x0);
    _mm_store_si128((xmmi*)out + 1, x1);
    _mm_store_si128((xmmi*)out + 2, x2);
}

/* out = a + b */
DONNA_INLINE static void
curve25519_add(bignum25519 out, const bignum25519 a, const bignum25519 b) {
    xmmi a0,a1,a2,b0,b1,b2;
    a0 = _mm_load_si128((xmmi*)a + 0);
    a1 = _mm_load_si128((xmmi*)a + 1);
    a2 = _mm_load_si128((xmmi*)a + 2);
    b0 = _mm_load_si128((xmmi*)b + 0);
    b1 = _mm_load_si128((xmmi*)b + 1);
    b2 = _mm_load_si128((xmmi*)b + 2);
    a0 = _mm_add_epi32(a0, b0);
    a1 = _mm_add_epi32(a1, b1);
    a2 = _mm_add_epi32(a2, b2);
    _mm_store_si128((xmmi*)out + 0, a0);
    _mm_store_si128((xmmi*)out + 1, a1);
    _mm_store_si128((xmmi*)out + 2, a2);
}
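/*
 * Added commentary: curve25519_add does no carry propagation, so each limb
 * may grow by a bit. That is acceptable when the sum feeds straight into a
 * multiply, which tolerates slightly oversized limbs; curve25519_add_reduce
 * below is the variant to use when the result must return to canonical
 * 26/25-bit shape.
 */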
#define curve25519_add_after_basic curve25519_add_reduce
DONNA_INLINE static void
curve25519_add_reduce(bignum25519 out, const bignum25519 a, const bignum25519 b) {
    xmmi a0,a1,a2,b0,b1,b2;
    xmmi c1,c2,c3;
    xmmi r0,r1,r2,r3,r4,r5;
    a0 = _mm_load_si128((xmmi*)a + 0);
    a1 = _mm_load_si128((xmmi*)a + 1);
    a2 = _mm_load_si128((xmmi*)a + 2);
    b0 = _mm_load_si128((xmmi*)b + 0);
    b1 = _mm_load_si128((xmmi*)b + 1);
    b2 = _mm_load_si128((xmmi*)b + 2);
    a0 = _mm_add_epi32(a0, b0);
    a1 = _mm_add_epi32(a1, b1);
    a2 = _mm_add_epi32(a2, b2);
    r0 = _mm_and_si128(_mm_unpacklo_epi64(a0, a1), bot32bitmask.v);
    r1 = _mm_srli_epi64(_mm_unpacklo_epi64(a0, a1), 32);
    r2 = _mm_and_si128(_mm_unpackhi_epi64(a0, a1), bot32bitmask.v);
    r3 = _mm_srli_epi64(_mm_unpackhi_epi64(a0, a1), 32);
    r4 = _mm_and_si128(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), bot32bitmask.v);
    r5 = _mm_srli_epi64(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), 32);
    c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
    c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
    c1 = _mm_srli_epi64(r4, 26); r4 = _mm_and_si128(r4, packedmask26.v); r5 = _mm_add_epi64(r5, c1);
    c1 = _mm_srli_epi64(r5, 25); r5 = _mm_and_si128(r5, packedmask25.v); r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
    c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
    _mm_store_si128((xmmi*)out + 0, _mm_unpacklo_epi64(_mm_unpacklo_epi32(r0, r1), _mm_unpacklo_epi32(r2, r3)));
    _mm_store_si128((xmmi*)out + 1, _mm_unpacklo_epi64(_mm_unpackhi_epi32(r0, r1), _mm_unpackhi_epi32(r2, r3)));
    _mm_store_si128((xmmi*)out + 2, _mm_unpackhi_epi32(r4, r5));
}
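/*
 * Added commentary: the vectorized carry chain above is easier to check
 * against a scalar sketch of the same reduction (illustrative only, assuming
 * the alternating 26/25-bit limb layout; this helper is not part of the
 * original file):
 *
 *     static void carry_sketch(uint32_t f[10]) {
 *         uint32_t c;
 *         int i;
 *         for (i = 0; i < 9; i++) {
 *             int bits = (i & 1) ? 25 : 26;      // even limbs hold 26 bits
 *             c = f[i] >> bits;
 *             f[i] &= (1u << bits) - 1;
 *             f[i + 1] += c;                     // push the carry upward
 *         }
 *         c = f[9] >> 25;
 *         f[9] &= 0x1ffffff;
 *         f[0] += 19 * c;                        // fold: 2^255 = 19 mod p
 *     }
 *
 * The SSE2 code packs two limbs per xmm register pair, so each step above
 * advances two carries at once, and it appends one extra 26-bit step so the
 * even limbs are tight again after the 19-fold.
 */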
DONNA_INLINE static void
curve25519_sub(bignum25519 out, const bignum25519 a, const bignum25519 b) {
    xmmi a0,a1,a2,b0,b1,b2;
    xmmi c1,c2;
    xmmi r0,r1;
    a0 = _mm_load_si128((xmmi*)a + 0);
    a1 = _mm_load_si128((xmmi*)a + 1);
    a2 = _mm_load_si128((xmmi*)a + 2);
    a0 = _mm_add_epi32(a0, packed2p0.v);
    a1 = _mm_add_epi32(a1, packed2p1.v);
    a2 = _mm_add_epi32(a2, packed2p2.v);
    b0 = _mm_load_si128((xmmi*)b + 0);
    b1 = _mm_load_si128((xmmi*)b + 1);
    b2 = _mm_load_si128((xmmi*)b + 2);
    a0 = _mm_sub_epi32(a0, b0);
    a1 = _mm_sub_epi32(a1, b1);
    a2 = _mm_sub_epi32(a2, b2);
    r0 = _mm_and_si128(_mm_shuffle_epi32(a0, _MM_SHUFFLE(2,2,0,0)), bot32bitmask.v);
    r1 = _mm_and_si128(_mm_shuffle_epi32(a0, _MM_SHUFFLE(3,3,1,1)), bot32bitmask.v);
    c1 = _mm_srli_epi32(r0, 26);
    c2 = _mm_srli_epi32(r1, 25);
    r0 = _mm_and_si128(r0, packedmask26.v);
    r1 = _mm_and_si128(r1, packedmask25.v);
    r0 = _mm_add_epi32(r0, _mm_slli_si128(c2, 8));
    r1 = _mm_add_epi32(r1, c1);
    a0 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(r0, r1), _mm_unpackhi_epi32(r0, r1));
    a1 = _mm_add_epi32(a1, _mm_srli_si128(c2, 8));
    _mm_store_si128((xmmi*)out + 0, a0);
    _mm_store_si128((xmmi*)out + 1, a1);
    _mm_store_si128((xmmi*)out + 2, a2);
}

DONNA_INLINE static void
curve25519_sub_after_basic(bignum25519 out, const bignum25519 a, const bignum25519 b) {
    xmmi a0,a1,a2,b0,b1,b2;
    xmmi c1,c2,c3;
    xmmi r0,r1,r2,r3,r4,r5;
    a0 = _mm_load_si128((xmmi*)a + 0);
    a1 = _mm_load_si128((xmmi*)a + 1);
    a2 = _mm_load_si128((xmmi*)a + 2);
    a0 = _mm_add_epi32(a0, packed4p0.v);
    a1 = _mm_add_epi32(a1, packed4p1.v);
    a2 = _mm_add_epi32(a2, packed4p2.v);
    b0 = _mm_load_si128((xmmi*)b + 0);
    b1 = _mm_load_si128((xmmi*)b + 1);
    b2 = _mm_load_si128((xmmi*)b + 2);
    a0 = _mm_sub_epi32(a0, b0);
    a1 = _mm_sub_epi32(a1, b1);
    a2 = _mm_sub_epi32(a2, b2);
    r0 = _mm_and_si128(_mm_unpacklo_epi64(a0, a1), bot32bitmask.v);
    r1 = _mm_srli_epi64(_mm_unpacklo_epi64(a0, a1), 32);
    r2 = _mm_and_si128(_mm_unpackhi_epi64(a0, a1), bot32bitmask.v);
    r3 = _mm_srli_epi64(_mm_unpackhi_epi64(a0, a1), 32);
    r4 = _mm_and_si128(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), bot32bitmask.v);
    r5 = _mm_srli_epi64(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), 32);
    c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
    c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
    c1 = _mm_srli_epi64(r4, 26); r4 = _mm_and_si128(r4, packedmask26.v); r5 = _mm_add_epi64(r5, c1);
    c1 = _mm_srli_epi64(r5, 25); r5 = _mm_and_si128(r5, packedmask25.v); r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
    c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
    _mm_store_si128((xmmi*)out + 0, _mm_unpacklo_epi64(_mm_unpacklo_epi32(r0, r1), _mm_unpacklo_epi32(r2, r3)));
    _mm_store_si128((xmmi*)out + 1, _mm_unpacklo_epi64(_mm_unpackhi_epi32(r0, r1), _mm_unpackhi_epi32(r2, r3)));
    _mm_store_si128((xmmi*)out + 2, _mm_unpackhi_epi32(r4, r5));
}
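/*
 * Added commentary: curve25519_sub_after_basic biases with 4*p rather than
 * 2*p because its minuend may come straight from an unreduced add (each limb
 * up to one bit over canonical size), so the larger multiple of p is needed
 * to keep every limb of the difference non-negative.
 */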
DONNA_INLINE static void
curve25519_sub_reduce(bignum25519 out, const bignum25519 a, const bignum25519 b) {
    xmmi a0,a1,a2,b0,b1,b2;
    xmmi c1,c2,c3;
    xmmi r0,r1,r2,r3,r4,r5;
    a0 = _mm_load_si128((xmmi*)a + 0);
    a1 = _mm_load_si128((xmmi*)a + 1);
    a2 = _mm_load_si128((xmmi*)a + 2);
    a0 = _mm_add_epi32(a0, packed2p0.v);
    a1 = _mm_add_epi32(a1, packed2p1.v);
    a2 = _mm_add_epi32(a2, packed2p2.v);
    b0 = _mm_load_si128((xmmi*)b + 0);
    b1 = _mm_load_si128((xmmi*)b + 1);
    b2 = _mm_load_si128((xmmi*)b + 2);
    a0 = _mm_sub_epi32(a0, b0);
    a1 = _mm_sub_epi32(a1, b1);
    a2 = _mm_sub_epi32(a2, b2);
    r0 = _mm_and_si128(_mm_unpacklo_epi64(a0, a1), bot32bitmask.v);
    r1 = _mm_srli_epi64(_mm_unpacklo_epi64(a0, a1), 32);
    r2 = _mm_and_si128(_mm_unpackhi_epi64(a0, a1), bot32bitmask.v);
    r3 = _mm_srli_epi64(_mm_unpackhi_epi64(a0, a1), 32);
    r4 = _mm_and_si128(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), bot32bitmask.v);
    r5 = _mm_srli_epi64(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), 32);
    c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
    c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
    c1 = _mm_srli_epi64(r4, 26); r4 = _mm_and_si128(r4, packedmask26.v); r5 = _mm_add_epi64(r5, c1);
    c1 = _mm_srli_epi64(r5, 25); r5 = _mm_and_si128(r5, packedmask25.v); r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
    c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
    _mm_store_si128((xmmi*)out + 0, _mm_unpacklo_epi64(_mm_unpacklo_epi32(r0, r1), _mm_unpacklo_epi32(r2, r3)));
    _mm_store_si128((xmmi*)out + 1, _mm_unpacklo_epi64(_mm_unpackhi_epi32(r0, r1), _mm_unpackhi_epi32(r2, r3)));
    _mm_store_si128((xmmi*)out + 2, _mm_unpackhi_epi32(r4, r5));
}

DONNA_INLINE static void
curve25519_neg(bignum25519 out, const bignum25519 b) {
    xmmi a0,a1,a2,b0,b1,b2;
    xmmi c1,c2,c3;
    xmmi r0,r1,r2,r3,r4,r5;
    a0 = packed2p0.v;
    a1 = packed2p1.v;
    a2 = packed2p2.v;
    b0 = _mm_load_si128((xmmi*)b + 0);
    b1 = _mm_load_si128((xmmi*)b + 1);
    b2 = _mm_load_si128((xmmi*)b + 2);
    a0 = _mm_sub_epi32(a0, b0);
    a1 = _mm_sub_epi32(a1, b1);
    a2 = _mm_sub_epi32(a2, b2);
    r0 = _mm_and_si128(_mm_unpacklo_epi64(a0, a1), bot32bitmask.v);
    r1 = _mm_srli_epi64(_mm_unpacklo_epi64(a0, a1), 32);
    r2 = _mm_and_si128(_mm_unpackhi_epi64(a0, a1), bot32bitmask.v);
    r3 = _mm_srli_epi64(_mm_unpackhi_epi64(a0, a1), 32);
    r4 = _mm_and_si128(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), bot32bitmask.v);
    r5 = _mm_srli_epi64(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), 32);
    c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
    c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
    c1 = _mm_srli_epi64(r4, 26); r4 = _mm_and_si128(r4, packedmask26.v); r5 = _mm_add_epi64(r5, c1);
    c1 = _mm_srli_epi64(r5, 25); r5 = _mm_and_si128(r5, packedmask25.v); r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
    c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
    _mm_store_si128((xmmi*)out + 0, _mm_unpacklo_epi64(_mm_unpacklo_epi32(r0, r1), _mm_unpacklo_epi32(r2, r3)));
    _mm_store_si128((xmmi*)out + 1, _mm_unpacklo_epi64(_mm_unpackhi_epi32(r0, r1), _mm_unpackhi_epi32(r2, r3)));
    _mm_store_si128((xmmi*)out + 2, _mm_unpackhi_epi32(r4, r5));
}
/* Multiply two numbers: out = r * s */
static void
curve25519_mul(bignum25519 out, const bignum25519 r, const bignum25519 s) {
    xmmi m01,m23,m45,m67,m89;
    xmmi m0123,m4567;
    xmmi s0123,s4567;
    xmmi s01,s23,s45,s67,s89;
    xmmi s12,s34,s56,s78,s9;
    xmmi r0,r2,r4,r6,r8;
    xmmi r1,r3,r5,r7,r9;
    xmmi r119,r219,r319,r419,r519,r619,r719,r819,r919;
    xmmi c1,c2,c3;

    s0123 = _mm_load_si128((xmmi*)s + 0);
    s01 = _mm_shuffle_epi32(s0123, _MM_SHUFFLE(3,1,2,0));
    s12 = _mm_shuffle_epi32(s0123, _MM_SHUFFLE(2,2,1,1));
    s23 = _mm_shuffle_epi32(s0123, _MM_SHUFFLE(3,3,2,2));
    s4567 = _mm_load_si128((xmmi*)s + 1);
    s34 = _mm_unpacklo_epi64(_mm_srli_si128(s0123, 12), s4567);
    s45 = _mm_shuffle_epi32(s4567, _MM_SHUFFLE(3,1,2,0));
    s56 = _mm_shuffle_epi32(s4567, _MM_SHUFFLE(2,2,1,1));
    s67 = _mm_shuffle_epi32(s4567, _MM_SHUFFLE(3,3,2,2));
    s89 = _mm_load_si128((xmmi*)s + 2);
    s78 = _mm_unpacklo_epi64(_mm_srli_si128(s4567, 12), s89);
    s89 = _mm_shuffle_epi32(s89, _MM_SHUFFLE(3,1,2,0));
    s9 = _mm_shuffle_epi32(s89, _MM_SHUFFLE(3,3,2,2));

    r0 = _mm_load_si128((xmmi*)r + 0);
    r1 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(1,1,1,1));
    r1 = _mm_add_epi64(r1, _mm_and_si128(r1, top64bitmask.v));
    r2 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(2,2,2,2));
    r3 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(3,3,3,3));
    r3 = _mm_add_epi64(r3, _mm_and_si128(r3, top64bitmask.v));
    r0 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(0,0,0,0));
    r4 = _mm_load_si128((xmmi*)r + 1);
    r5 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(1,1,1,1));
    r5 = _mm_add_epi64(r5, _mm_and_si128(r5, top64bitmask.v));
    r6 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(2,2,2,2));
    r7 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(3,3,3,3));
    r7 = _mm_add_epi64(r7, _mm_and_si128(r7, top64bitmask.v));
    r4 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(0,0,0,0));
    r8 = _mm_load_si128((xmmi*)r + 2);
    r9 = _mm_shuffle_epi32(r8, _MM_SHUFFLE(3,1,3,1));
    r9 = _mm_add_epi64(r9, _mm_and_si128(r9, top64bitmask.v));
    r8 = _mm_shuffle_epi32(r8, _MM_SHUFFLE(3,0,3,0));

    m01 = _mm_mul_epu32(r1, s01);
    m23 = _mm_mul_epu32(r1, s23);
    m45 = _mm_mul_epu32(r1, s45);
    m67 = _mm_mul_epu32(r1, s67);
    m23 = _mm_add_epi64(m23, _mm_mul_epu32(r3, s01));
    m45 = _mm_add_epi64(m45, _mm_mul_epu32(r3, s23));
    m67 = _mm_add_epi64(m67, _mm_mul_epu32(r3, s45));
    m89 = _mm_mul_epu32(r1, s89);
    m45 = _mm_add_epi64(m45, _mm_mul_epu32(r5, s01));
    m67 = _mm_add_epi64(m67, _mm_mul_epu32(r5, s23));
    m89 = _mm_add_epi64(m89, _mm_mul_epu32(r3, s67));
    m67 = _mm_add_epi64(m67, _mm_mul_epu32(r7, s01));
    m89 = _mm_add_epi64(m89, _mm_mul_epu32(r5, s45));
    m89 = _mm_add_epi64(m89, _mm_mul_epu32(r7, s23));
    m89 = _mm_add_epi64(m89, _mm_mul_epu32(r9, s01));

    /* shift up */
    m89 = _mm_unpackhi_epi64(m67, _mm_slli_si128(m89, 8));
    m67 = _mm_unpackhi_epi64(m45, _mm_slli_si128(m67, 8));
    m45 = _mm_unpackhi_epi64(m23, _mm_slli_si128(m45, 8));
    m23 = _mm_unpackhi_epi64(m01, _mm_slli_si128(m23, 8));
    m01 = _mm_unpackhi_epi64(_mm_setzero_si128(), _mm_slli_si128(m01, 8));

    m01 = _mm_add_epi64(m01, _mm_mul_epu32(r0, s01));
    m23 = _mm_add_epi64(m23, _mm_mul_epu32(r0, s23));
    m45 = _mm_add_epi64(m45, _mm_mul_epu32(r0, s45));
    m67 = _mm_add_epi64(m67, _mm_mul_epu32(r0, s67));
    m23 = _mm_add_epi64(m23, _mm_mul_epu32(r2, s01));
    m45 = _mm_add_epi64(m45, _mm_mul_epu32(r2, s23));
    m67 = _mm_add_epi64(m67, _mm_mul_epu32(r4, s23));
    m89 = _mm_add_epi64(m89, _mm_mul_epu32(r0, s89));
    m45 = _mm_add_epi64(m45, _mm_mul_epu32(r4, s01));
    m67 = _mm_add_epi64(m67, _mm_mul_epu32(r2, s45));
    m89 = _mm_add_epi64(m89, _mm_mul_epu32(r2, s67));
    m67 = _mm_add_epi64(m67, _mm_mul_epu32(r6, s01));
    m89 = _mm_add_epi64(m89, _mm_mul_epu32(r4, s45));
    m89 = _mm_add_epi64(m89, _mm_mul_epu32(r6, s23));
    m89 = _mm_add_epi64(m89, _mm_mul_epu32(r8, s01));

    r219 = _mm_mul_epu32(r2, packednineteen.v);
    r419 = _mm_mul_epu32(r4, packednineteen.v);
    r619 = _mm_mul_epu32(r6, packednineteen.v);
    r819 = _mm_mul_epu32(r8, packednineteen.v);
    r119 = _mm_shuffle_epi32(r1, _MM_SHUFFLE(0,0,2,2)); r119 = _mm_mul_epu32(r119, packednineteen.v);
    r319 = _mm_shuffle_epi32(r3, _MM_SHUFFLE(0,0,2,2)); r319 = _mm_mul_epu32(r319, packednineteen.v);
    r519 = _mm_shuffle_epi32(r5, _MM_SHUFFLE(0,0,2,2)); r519 = _mm_mul_epu32(r519, packednineteen.v);
    r719 = _mm_shuffle_epi32(r7, _MM_SHUFFLE(0,0,2,2)); r719 = _mm_mul_epu32(r719, packednineteen.v);
    r919 = _mm_shuffle_epi32(r9, _MM_SHUFFLE(0,0,2,2)); r919 = _mm_mul_epu32(r919, packednineteen.v);

    m01 = _mm_add_epi64(m01, _mm_mul_epu32(r919, s12));
    m23 = _mm_add_epi64(m23, _mm_mul_epu32(r919, s34));
    m45 = _mm_add_epi64(m45, _mm_mul_epu32(r919, s56));
    m67 = _mm_add_epi64(m67, _mm_mul_epu32(r919, s78));
    m01 = _mm_add_epi64(m01, _mm_mul_epu32(r719, s34));
    m23 = _mm_add_epi64(m23, _mm_mul_epu32(r719, s56));
    m45 = _mm_add_epi64(m45, _mm_mul_epu32(r719, s78));
    m67 = _mm_add_epi64(m67, _mm_mul_epu32(r719, s9));
    m01 = _mm_add_epi64(m01, _mm_mul_epu32(r519, s56));
    m23 = _mm_add_epi64(m23, _mm_mul_epu32(r519, s78));
    m45 = _mm_add_epi64(m45, _mm_mul_epu32(r519, s9));
    m67 = _mm_add_epi64(m67, _mm_mul_epu32(r819, s89));
    m01 = _mm_add_epi64(m01, _mm_mul_epu32(r319, s78));
    m23 = _mm_add_epi64(m23, _mm_mul_epu32(r319, s9));
    m45 = _mm_add_epi64(m45, _mm_mul_epu32(r619, s89));
    m89 = _mm_add_epi64(m89, _mm_mul_epu32(r919, s9));
    m01 = _mm_add_epi64(m01, _mm_mul_epu32(r819, s23));
    m23 = _mm_add_epi64(m23, _mm_mul_epu32(r819, s45));
    m45 = _mm_add_epi64(m45, _mm_mul_epu32(r819, s67));
    m01 = _mm_add_epi64(m01, _mm_mul_epu32(r619, s45));
    m23 = _mm_add_epi64(m23, _mm_mul_epu32(r619, s67));
    m01 = _mm_add_epi64(m01, _mm_mul_epu32(r419, s67));
    m23 = _mm_add_epi64(m23, _mm_mul_epu32(r419, s89));
    m01 = _mm_add_epi64(m01, _mm_mul_epu32(r219, s89));
    m01 = _mm_add_epi64(m01, _mm_mul_epu32(r119, s9));

    r0 = _mm_unpacklo_epi64(m01, m45);
    r1 = _mm_unpackhi_epi64(m01, m45);
    r2 = _mm_unpacklo_epi64(m23, m67);
    r3 = _mm_unpackhi_epi64(m23, m67);
    r4 = _mm_unpacklo_epi64(m89, m89);
    r5 = _mm_unpackhi_epi64(m89, m89);
    c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
    c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
    c1 = _mm_srli_epi64(r4, 26); r4 = _mm_and_si128(r4, packedmask26.v); r5 = _mm_add_epi64(r5, c1);
    c1 = _mm_srli_epi64(r5, 25); r5 = _mm_and_si128(r5, packedmask25.v); r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
    c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);

    m0123 = _mm_unpacklo_epi32(r0, r1);
    m4567 = _mm_unpackhi_epi32(r0, r1);
    m0123 = _mm_unpacklo_epi64(m0123, _mm_unpacklo_epi32(r2, r3));
    m4567 = _mm_unpacklo_epi64(m4567, _mm_unpackhi_epi32(r2, r3));
    m89 = _mm_unpackhi_epi32(r4, r5);
    _mm_store_si128((xmmi*)out + 0, m0123);
    _mm_store_si128((xmmi*)out + 1, m4567);
    _mm_store_si128((xmmi*)out + 2, m89);
}
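/*
 * Added commentary: in the ten-limb radix-2^25.5 form, the product folds
 * everything past limb 9 back down with 2^255 = 19 mod p. For the lowest
 * coefficient this gives
 *
 *     out0 = r0*s0 + 19*(2*r1*s9 + r2*s8 + 2*r3*s7 + r4*s6
 *                        + 2*r5*s5 + r6*s4 + 2*r7*s3 + r8*s2 + 2*r9*s1)
 *
 * Products of two odd-index (25-bit) limbs carry the extra factor of 2, and
 * the code folds that in ahead of time by doubling one lane of r1, r3, r5,
 * r7, r9 (the top64bitmask adds above).
 */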
DONNA_NOINLINE static void
curve25519_mul_noinline(bignum25519 out, const bignum25519 r, const bignum25519 s) {
    curve25519_mul(out, r, s);
}

#define curve25519_square(r, n) curve25519_square_times(r, n, 1)
static void
curve25519_square_times(bignum25519 r, const bignum25519 in, int count) {
    xmmi m01,m23,m45,m67,m89;
    xmmi r0,r1,r2,r3,r4,r5,r6,r7,r8,r9;
    xmmi r0a,r1a,r2a,r3a,r7a,r9a;
    xmmi r0123,r4567;
    xmmi r01,r23,r45,r67,r6x,r89,r8x;
    xmmi r12,r34,r56,r78,r9x;
    xmmi r5619;
    xmmi c1,c2,c3;

    r0123 = _mm_load_si128((xmmi*)in + 0);
    r01 = _mm_shuffle_epi32(r0123, _MM_SHUFFLE(3,1,2,0));
    r23 = _mm_shuffle_epi32(r0123, _MM_SHUFFLE(3,3,2,2));
    r4567 = _mm_load_si128((xmmi*)in + 1);
    r45 = _mm_shuffle_epi32(r4567, _MM_SHUFFLE(3,1,2,0));
    r67 = _mm_shuffle_epi32(r4567, _MM_SHUFFLE(3,3,2,2));
    r89 = _mm_load_si128((xmmi*)in + 2);
    r89 = _mm_shuffle_epi32(r89, _MM_SHUFFLE(3,1,2,0));

    do {
        r12 = _mm_unpackhi_epi64(r01, _mm_slli_si128(r23, 8));
        r0 = _mm_shuffle_epi32(r01, _MM_SHUFFLE(0,0,0,0));
        r0 = _mm_add_epi64(r0, _mm_and_si128(r0, top64bitmask.v));
        r0a = _mm_shuffle_epi32(r0, _MM_SHUFFLE(3,2,1,2));
        r1 = _mm_shuffle_epi32(r01, _MM_SHUFFLE(2,2,2,2));
        r2 = _mm_shuffle_epi32(r23, _MM_SHUFFLE(0,0,0,0));
        r2 = _mm_add_epi64(r2, _mm_and_si128(r2, top64bitmask.v));
        r2a = _mm_shuffle_epi32(r2, _MM_SHUFFLE(3,2,1,2));
        r3 = _mm_shuffle_epi32(r23, _MM_SHUFFLE(2,2,2,2));
        r34 = _mm_unpackhi_epi64(r23, _mm_slli_si128(r45, 8));
        r4 = _mm_shuffle_epi32(r45, _MM_SHUFFLE(0,0,0,0));
        r4 = _mm_add_epi64(r4, _mm_and_si128(r4, top64bitmask.v));
        r56 = _mm_unpackhi_epi64(r45, _mm_slli_si128(r67, 8));
        r5619 = _mm_mul_epu32(r56, packednineteen.v);
        r5 = _mm_shuffle_epi32(r5619, _MM_SHUFFLE(1,1,1,0));
        r6 = _mm_shuffle_epi32(r5619, _MM_SHUFFLE(3,2,3,2));
        r78 = _mm_unpackhi_epi64(r67, _mm_slli_si128(r89, 8));
        r6x = _mm_unpacklo_epi64(r67, _mm_setzero_si128());
        r7 = _mm_shuffle_epi32(r67, _MM_SHUFFLE(2,2,2,2));
        r7 = _mm_mul_epu32(r7, packed3819.v);
        r7a = _mm_shuffle_epi32(r7, _MM_SHUFFLE(3,3,3,2));
        r8x = _mm_unpacklo_epi64(r89, _mm_setzero_si128());
        r8 = _mm_shuffle_epi32(r89, _MM_SHUFFLE(0,0,0,0));
        r8 = _mm_mul_epu32(r8, packednineteen.v);
        r9 = _mm_shuffle_epi32(r89, _MM_SHUFFLE(2,2,2,2));
        r9x = _mm_slli_epi32(_mm_shuffle_epi32(r89, _MM_SHUFFLE(3,3,3,2)), 1);
        r9 = _mm_mul_epu32(r9, packed3819.v);
        r9a = _mm_shuffle_epi32(r9, _MM_SHUFFLE(2,2,2,2));

        m01 = _mm_mul_epu32(r01, r0);
        m23 = _mm_mul_epu32(r23, r0a);
        m45 = _mm_mul_epu32(r45, r0a);
        m45 = _mm_add_epi64(m45, _mm_mul_epu32(r23, r2));
        r23 = _mm_slli_epi32(r23, 1);
        m67 = _mm_mul_epu32(r67, r0a);
        m67 = _mm_add_epi64(m67, _mm_mul_epu32(r45, r2a));
        m89 = _mm_mul_epu32(r89, r0a);
        m89 = _mm_add_epi64(m89, _mm_mul_epu32(r67, r2a));
        r67 = _mm_slli_epi32(r67, 1);
        m89 = _mm_add_epi64(m89, _mm_mul_epu32(r45, r4));
        r45 = _mm_slli_epi32(r45, 1);

        r1 = _mm_slli_epi32(r1, 1);
        r3 = _mm_slli_epi32(r3, 1);
        r1a = _mm_add_epi64(r1, _mm_and_si128(r1, bot64bitmask.v));
        r3a = _mm_add_epi64(r3, _mm_and_si128(r3, bot64bitmask.v));
        m23 = _mm_add_epi64(m23, _mm_mul_epu32(r12, r1));
        m45 = _mm_add_epi64(m45, _mm_mul_epu32(r34, r1a));
        m67 = _mm_add_epi64(m67, _mm_mul_epu32(r56, r1a));
        m67 = _mm_add_epi64(m67, _mm_mul_epu32(r34, r3));
        r34 = _mm_slli_epi32(r34, 1);
        m89 = _mm_add_epi64(m89, _mm_mul_epu32(r78, r1a));
        r78 = _mm_slli_epi32(r78, 1);
        m89 = _mm_add_epi64(m89, _mm_mul_epu32(r56, r3a));
        r56 = _mm_slli_epi32(r56, 1);

        m01 = _mm_add_epi64(m01, _mm_mul_epu32(_mm_slli_epi32(r12, 1), r9));
        m01 = _mm_add_epi64(m01, _mm_mul_epu32(r34, r7));
        m23 = _mm_add_epi64(m23, _mm_mul_epu32(r34, r9));
        m01 = _mm_add_epi64(m01, _mm_mul_epu32(r56, r5));
        m23 = _mm_add_epi64(m23, _mm_mul_epu32(r56, r7));
        m45 = _mm_add_epi64(m45, _mm_mul_epu32(r56, r9));
        m01 = _mm_add_epi64(m01, _mm_mul_epu32(r23, r8));
        m01 = _mm_add_epi64(m01, _mm_mul_epu32(r45, r6));
        m23 = _mm_add_epi64(m23, _mm_mul_epu32(r45, r8));
        m23 = _mm_add_epi64(m23, _mm_mul_epu32(r6x, r6));
        m45 = _mm_add_epi64(m45, _mm_mul_epu32(r78, r7a));
        m67 = _mm_add_epi64(m67, _mm_mul_epu32(r78, r9));
        m45 = _mm_add_epi64(m45, _mm_mul_epu32(r67, r8));
        m67 = _mm_add_epi64(m67, _mm_mul_epu32(r8x, r8));
        m89 = _mm_add_epi64(m89, _mm_mul_epu32(r9x, r9a));

        r0 = _mm_unpacklo_epi64(m01, m45);
        r1 = _mm_unpackhi_epi64(m01, m45);
        r2 = _mm_unpacklo_epi64(m23, m67);
        r3 = _mm_unpackhi_epi64(m23, m67);
        r4 = _mm_unpacklo_epi64(m89, m89);
        r5 = _mm_unpackhi_epi64(m89, m89);
        c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
        c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
        c1 = _mm_srli_epi64(r4, 26); r4 = _mm_and_si128(r4, packedmask26.v); r5 = _mm_add_epi64(r5, c1);
        c1 = _mm_srli_epi64(r5, 25); r5 = _mm_and_si128(r5, packedmask25.v); r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
        c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);

        r01 = _mm_unpacklo_epi64(r0, r1);
        r45 = _mm_unpackhi_epi64(r0, r1);
        r23 = _mm_unpacklo_epi64(r2, r3);
        r67 = _mm_unpackhi_epi64(r2, r3);
        r89 = _mm_unpackhi_epi64(r4, r5);
    } while (--count);

    r0123 = _mm_shuffle_epi32(r23, _MM_SHUFFLE(2,0,3,3));
    r4567 = _mm_shuffle_epi32(r67, _MM_SHUFFLE(2,0,3,3));
    r0123 = _mm_or_si128(r0123, _mm_shuffle_epi32(r01, _MM_SHUFFLE(3,3,2,0)));
    r4567 = _mm_or_si128(r4567, _mm_shuffle_epi32(r45, _MM_SHUFFLE(3,3,2,0)));
    r89 = _mm_shuffle_epi32(r89, _MM_SHUFFLE(3,3,2,0));
    _mm_store_si128((xmmi*)r + 0, r0123);
    _mm_store_si128((xmmi*)r + 1, r4567);
    _mm_store_si128((xmmi*)r + 2, r89);
}
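/*
 * Added commentary: squaring keeps the same shape as curve25519_mul but
 * exploits symmetry, r_i*r_j + r_j*r_i = 2*r_i*r_j, by pre-doubling one
 * operand (the _mm_slli_epi32 by 1 calls), which cuts the 32x32 multiply
 * count roughly in half versus a general product; packed3819 = {38, 19}
 * supplies the combined 2*19 and 19 factors for the folded high limbs.
 */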
DONNA_INLINE static void
curve25519_tangle32(packedelem32 *out, const bignum25519 x, const bignum25519 z) {
    xmmi x0,x1,x2,z0,z1,z2;
    x0 = _mm_load_si128((xmmi *)(x + 0));
    x1 = _mm_load_si128((xmmi *)(x + 4));
    x2 = _mm_load_si128((xmmi *)(x + 8));
    z0 = _mm_load_si128((xmmi *)(z + 0));
    z1 = _mm_load_si128((xmmi *)(z + 4));
    z2 = _mm_load_si128((xmmi *)(z + 8));
    out[0].v = _mm_unpacklo_epi32(x0, z0);
    out[1].v = _mm_unpackhi_epi32(x0, z0);
    out[2].v = _mm_unpacklo_epi32(x1, z1);
    out[3].v = _mm_unpackhi_epi32(x1, z1);
    out[4].v = _mm_unpacklo_epi32(x2, z2);
}

DONNA_INLINE static void
curve25519_untangle32(bignum25519 x, bignum25519 z, const packedelem32 *in) {
    xmmi t0,t1,t2,t3,t4,zero;
    t0 = _mm_shuffle_epi32(in[0].v, _MM_SHUFFLE(3,1,2,0));
    t1 = _mm_shuffle_epi32(in[1].v, _MM_SHUFFLE(3,1,2,0));
    t2 = _mm_shuffle_epi32(in[2].v, _MM_SHUFFLE(3,1,2,0));
    t3 = _mm_shuffle_epi32(in[3].v, _MM_SHUFFLE(3,1,2,0));
    t4 = _mm_shuffle_epi32(in[4].v, _MM_SHUFFLE(3,1,2,0));
    zero = _mm_setzero_si128();
    _mm_store_si128((xmmi *)x + 0, _mm_unpacklo_epi64(t0, t1));
    _mm_store_si128((xmmi *)x + 1, _mm_unpacklo_epi64(t2, t3));
    _mm_store_si128((xmmi *)x + 2, _mm_unpacklo_epi64(t4, zero));
    _mm_store_si128((xmmi *)z + 0, _mm_unpackhi_epi64(t0, t1));
    _mm_store_si128((xmmi *)z + 1, _mm_unpackhi_epi64(t2, t3));
    _mm_store_si128((xmmi *)z + 2, _mm_unpackhi_epi64(t4, zero));
}
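/*
 * Added commentary: "tangling" interleaves two field elements limb by limb,
 * x in the even dword lanes and z in the odd ones, so the Montgomery ladder
 * can push both halves of its (x, z) arithmetic through a single stream of
 * SIMD instructions; curve25519_untangle32 undoes the interleave when plain
 * bignum25519 values are needed again.
 */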
DONNA_INLINE static void
curve25519_add_reduce_packed32(packedelem32 *out, const packedelem32 *r, const packedelem32 *s) {
    xmmi r0,r1,r2,r3,r4;
    xmmi s0,s1,s2,s3,s4,s5;
    xmmi c1,c2;
    r0 = _mm_add_epi32(r[0].v, s[0].v);
    r1 = _mm_add_epi32(r[1].v, s[1].v);
    r2 = _mm_add_epi32(r[2].v, s[2].v);
    r3 = _mm_add_epi32(r[3].v, s[3].v);
    r4 = _mm_add_epi32(r[4].v, s[4].v);
    s0 = _mm_unpacklo_epi64(r0, r2); /* 00 44 */
    s1 = _mm_unpackhi_epi64(r0, r2); /* 11 55 */
    s2 = _mm_unpacklo_epi64(r1, r3); /* 22 66 */
    s3 = _mm_unpackhi_epi64(r1, r3); /* 33 77 */
    s4 = _mm_unpacklo_epi64(_mm_setzero_si128(), r4); /* 00 88 */
    s5 = _mm_unpackhi_epi64(_mm_setzero_si128(), r4); /* 00 99 */
    c1 = _mm_srli_epi32(s0, 26); c2 = _mm_srli_epi32(s2, 26); s0 = _mm_and_si128(s0, packedmask26262626.v); s2 = _mm_and_si128(s2, packedmask26262626.v); s1 = _mm_add_epi32(s1, c1); s3 = _mm_add_epi32(s3, c2);
    c1 = _mm_srli_epi32(s1, 25); c2 = _mm_srli_epi32(s3, 25); s1 = _mm_and_si128(s1, packedmask25252525.v); s3 = _mm_and_si128(s3, packedmask25252525.v); s2 = _mm_add_epi32(s2, c1); s4 = _mm_add_epi32(s4, _mm_unpackhi_epi64(_mm_setzero_si128(), c2)); s0 = _mm_add_epi32(s0, _mm_unpacklo_epi64(_mm_setzero_si128(), c2));
    c1 = _mm_srli_epi32(s2, 26); c2 = _mm_srli_epi32(s4, 26); s2 = _mm_and_si128(s2, packedmask26262626.v); s4 = _mm_and_si128(s4, packedmask26262626.v); s3 = _mm_add_epi32(s3, c1); s5 = _mm_add_epi32(s5, c2);
    c1 = _mm_srli_epi32(s3, 25); c2 = _mm_srli_epi32(s5, 25); s3 = _mm_and_si128(s3, packedmask25252525.v); s5 = _mm_and_si128(s5, packedmask25252525.v); s4 = _mm_add_epi32(s4, c1); s0 = _mm_add_epi32(s0, _mm_or_si128(_mm_slli_si128(c1, 8), _mm_srli_si128(_mm_add_epi32(_mm_add_epi32(_mm_slli_epi32(c2, 4), _mm_slli_epi32(c2, 1)), c2), 8)));
    c1 = _mm_srli_epi32(s0, 26); c2 = _mm_srli_epi32(s2, 26); s0 = _mm_and_si128(s0, packedmask26262626.v); s2 = _mm_and_si128(s2, packedmask26262626.v); s1 = _mm_add_epi32(s1, c1); s3 = _mm_add_epi32(s3, c2);
    out[0].v = _mm_unpacklo_epi64(s0, s1); /* 00 11 */
    out[1].v = _mm_unpacklo_epi64(s2, s3); /* 22 33 */
    out[2].v = _mm_unpackhi_epi64(s0, s1); /* 44 55 */
    out[3].v = _mm_unpackhi_epi64(s2, s3); /* 66 77 */
    out[4].v = _mm_unpackhi_epi64(s4, s5); /* 88 99 */
}

DONNA_INLINE static void
curve25519_add_packed32(packedelem32 *out, const packedelem32 *r, const packedelem32 *s) {
    out[0].v = _mm_add_epi32(r[0].v, s[0].v);
    out[1].v = _mm_add_epi32(r[1].v, s[1].v);
    out[2].v = _mm_add_epi32(r[2].v, s[2].v);
    out[3].v = _mm_add_epi32(r[3].v, s[3].v);
    out[4].v = _mm_add_epi32(r[4].v, s[4].v);
}

DONNA_INLINE static void
curve25519_sub_packed32(packedelem32 *out, const packedelem32 *r, const packedelem32 *s) {
    xmmi r0,r1,r2,r3,r4;
    xmmi s0,s1,s2,s3;
    xmmi c1,c2;
    r0 = _mm_add_epi32(r[0].v, packed32packed2p0.v);
    r1 = _mm_add_epi32(r[1].v, packed32packed2p1.v);
    r2 = _mm_add_epi32(r[2].v, packed32packed2p1.v);
    r3 = _mm_add_epi32(r[3].v, packed32packed2p1.v);
    r4 = _mm_add_epi32(r[4].v, packed32packed2p1.v);
    r0 = _mm_sub_epi32(r0, s[0].v); /* 00 11 */
    r1 = _mm_sub_epi32(r1, s[1].v); /* 22 33 */
    r2 = _mm_sub_epi32(r2, s[2].v); /* 44 55 */
    r3 = _mm_sub_epi32(r3, s[3].v); /* 66 77 */
    r4 = _mm_sub_epi32(r4, s[4].v); /* 88 99 */
    s0 = _mm_unpacklo_epi64(r0, r2); /* 00 44 */
    s1 = _mm_unpackhi_epi64(r0, r2); /* 11 55 */
    s2 = _mm_unpacklo_epi64(r1, r3); /* 22 66 */
    s3 = _mm_unpackhi_epi64(r1, r3); /* 33 77 */
    c1 = _mm_srli_epi32(s0, 26); c2 = _mm_srli_epi32(s2, 26); s0 = _mm_and_si128(s0, packedmask26262626.v); s2 = _mm_and_si128(s2, packedmask26262626.v); s1 = _mm_add_epi32(s1, c1); s3 = _mm_add_epi32(s3, c2);
    c1 = _mm_srli_epi32(s1, 25); c2 = _mm_srli_epi32(s3, 25); s1 = _mm_and_si128(s1, packedmask25252525.v); s3 = _mm_and_si128(s3, packedmask25252525.v); s2 = _mm_add_epi32(s2, c1); r4 = _mm_add_epi32(r4, _mm_srli_si128(c2, 8)); s0 = _mm_add_epi32(s0, _mm_slli_si128(c2, 8));
    out[0].v = _mm_unpacklo_epi64(s0, s1); /* 00 11 */
    out[1].v = _mm_unpacklo_epi64(s2, s3); /* 22 33 */
    out[2].v = _mm_unpackhi_epi64(s0, s1); /* 44 55 */
    out[3].v = _mm_unpackhi_epi64(s2, s3); /* 66 77 */
    out[4].v = r4;
}

DONNA_INLINE static void
curve25519_sub_after_basic_packed32(packedelem32 *out, const packedelem32 *r, const packedelem32 *s) {
    xmmi r0,r1,r2,r3,r4;
    xmmi s0,s1,s2,s3,s4,s5;
    xmmi c1,c2;
    r0 = _mm_add_epi32(r[0].v, packed32packed4p0.v);
    r1 = _mm_add_epi32(r[1].v, packed32packed4p1.v);
    r2 = _mm_add_epi32(r[2].v, packed32packed4p1.v);
    r3 = _mm_add_epi32(r[3].v, packed32packed4p1.v);
    r4 = _mm_add_epi32(r[4].v, packed32packed4p1.v);
    r0 = _mm_sub_epi32(r0, s[0].v); /* 00 11 */
    r1 = _mm_sub_epi32(r1, s[1].v); /* 22 33 */
    r2 = _mm_sub_epi32(r2, s[2].v); /* 44 55 */
    r3 = _mm_sub_epi32(r3, s[3].v); /* 66 77 */
    r4 = _mm_sub_epi32(r4, s[4].v); /* 88 99 */
    s0 = _mm_unpacklo_epi64(r0, r2); /* 00 44 */
    s1 = _mm_unpackhi_epi64(r0, r2); /* 11 55 */
    s2 = _mm_unpacklo_epi64(r1, r3); /* 22 66 */
    s3 = _mm_unpackhi_epi64(r1, r3); /* 33 77 */
    s4 = _mm_unpacklo_epi64(_mm_setzero_si128(), r4); /* 00 88 */
    s5 = _mm_unpackhi_epi64(_mm_setzero_si128(), r4); /* 00 99 */
    c1 = _mm_srli_epi32(s0, 26); c2 = _mm_srli_epi32(s2, 26); s0 = _mm_and_si128(s0, packedmask26262626.v); s2 = _mm_and_si128(s2, packedmask26262626.v); s1 = _mm_add_epi32(s1, c1); s3 = _mm_add_epi32(s3, c2);
    c1 = _mm_srli_epi32(s1, 25); c2 = _mm_srli_epi32(s3, 25); s1 = _mm_and_si128(s1, packedmask25252525.v); s3 = _mm_and_si128(s3, packedmask25252525.v); s2 = _mm_add_epi32(s2, c1); s4 = _mm_add_epi32(s4, _mm_unpackhi_epi64(_mm_setzero_si128(), c2)); s0 = _mm_add_epi32(s0, _mm_unpacklo_epi64(_mm_setzero_si128(), c2));
    c1 = _mm_srli_epi32(s2, 26); c2 = _mm_srli_epi32(s4, 26); s2 = _mm_and_si128(s2, packedmask26262626.v); s4 = _mm_and_si128(s4, packedmask26262626.v); s3 = _mm_add_epi32(s3, c1); s5 = _mm_add_epi32(s5, c2);
    c1 = _mm_srli_epi32(s3, 25); c2 = _mm_srli_epi32(s5, 25); s3 = _mm_and_si128(s3, packedmask25252525.v); s5 = _mm_and_si128(s5, packedmask25252525.v); s4 = _mm_add_epi32(s4, c1); s0 = _mm_add_epi32(s0, _mm_or_si128(_mm_slli_si128(c1, 8), _mm_srli_si128(_mm_add_epi32(_mm_add_epi32(_mm_slli_epi32(c2, 4), _mm_slli_epi32(c2, 1)), c2), 8)));
    c1 = _mm_srli_epi32(s0, 26); c2 = _mm_srli_epi32(s2, 26); s0 = _mm_and_si128(s0, packedmask26262626.v); s2 = _mm_and_si128(s2, packedmask26262626.v); s1 = _mm_add_epi32(s1, c1); s3 = _mm_add_epi32(s3, c2);
    out[0].v = _mm_unpacklo_epi64(s0, s1); /* 00 11 */
    out[1].v = _mm_unpacklo_epi64(s2, s3); /* 22 33 */
    out[2].v = _mm_unpackhi_epi64(s0, s1); /* 44 55 */
    out[3].v = _mm_unpackhi_epi64(s2, s3); /* 66 77 */
    out[4].v = _mm_unpackhi_epi64(s4, s5); /* 88 99 */
}

DONNA_INLINE static void
curve25519_tangle64_from32(packedelem64 *a, packedelem64 *b, const packedelem32 *c, const packedelem32 *d) {
    xmmi c0,c1,c2,c3,c4,c5,t;
    xmmi d0,d1,d2,d3,d4,d5;
    xmmi t0,t1,t2,t3,t4,zero;
    t0 = _mm_shuffle_epi32(c[0].v, _MM_SHUFFLE(3,1,2,0));
    t1 = _mm_shuffle_epi32(c[1].v, _MM_SHUFFLE(3,1,2,0));
    t2 = _mm_shuffle_epi32(d[0].v, _MM_SHUFFLE(3,1,2,0));
    t3 = _mm_shuffle_epi32(d[1].v, _MM_SHUFFLE(3,1,2,0));
    c0 = _mm_unpacklo_epi64(t0, t1);
    c3 = _mm_unpackhi_epi64(t0, t1);
    d0 = _mm_unpacklo_epi64(t2, t3);
    d3 = _mm_unpackhi_epi64(t2, t3);
    t = _mm_unpacklo_epi64(c0, d0); a[0].v = t; a[1].v = _mm_srli_epi64(t, 32);
    t = _mm_unpackhi_epi64(c0, d0); a[2].v = t; a[3].v = _mm_srli_epi64(t, 32);
    t = _mm_unpacklo_epi64(c3, d3); b[0].v = t; b[1].v = _mm_srli_epi64(t, 32);
    t = _mm_unpackhi_epi64(c3, d3); b[2].v = t; b[3].v = _mm_srli_epi64(t, 32);
    t0 = _mm_shuffle_epi32(c[2].v, _MM_SHUFFLE(3,1,2,0));
    t1 = _mm_shuffle_epi32(c[3].v, _MM_SHUFFLE(3,1,2,0));
    t2 = _mm_shuffle_epi32(d[2].v, _MM_SHUFFLE(3,1,2,0));
    t3 = _mm_shuffle_epi32(d[3].v, _MM_SHUFFLE(3,1,2,0));
    c1 = _mm_unpacklo_epi64(t0, t1);
    c4 = _mm_unpackhi_epi64(t0, t1);
    d1 = _mm_unpacklo_epi64(t2, t3);
    d4 = _mm_unpackhi_epi64(t2, t3);
    t = _mm_unpacklo_epi64(c1, d1); a[4].v = t; a[5].v = _mm_srli_epi64(t, 32);
    t = _mm_unpackhi_epi64(c1, d1); a[6].v = t; a[7].v = _mm_srli_epi64(t, 32);
    t = _mm_unpacklo_epi64(c4, d4); b[4].v = t; b[5].v = _mm_srli_epi64(t, 32);
    t = _mm_unpackhi_epi64(c4, d4); b[6].v = t; b[7].v = _mm_srli_epi64(t, 32);
    t4 = _mm_shuffle_epi32(c[4].v, _MM_SHUFFLE(3,1,2,0));
    zero = _mm_setzero_si128();
    c2 = _mm_unpacklo_epi64(t4, zero);
    c5 = _mm_unpackhi_epi64(t4, zero);
    t4 = _mm_shuffle_epi32(d[4].v, _MM_SHUFFLE(3,1,2,0));
    d2 = _mm_unpacklo_epi64(t4, zero);
    d5 = _mm_unpackhi_epi64(t4, zero);
    t = _mm_unpacklo_epi64(c2, d2); a[8].v = t; a[9].v = _mm_srli_epi64(t, 32);
    t = _mm_unpacklo_epi64(c5, d5); b[8].v = t; b[9].v = _mm_srli_epi64(t, 32);
}

DONNA_INLINE static void
curve25519_tangle64(packedelem64 *out, const bignum25519 x, const bignum25519 z) {
    xmmi x0,x1,x2,z0,z1,z2,t;
    x0 = _mm_load_si128((xmmi *)x + 0);
    x1 = _mm_load_si128((xmmi *)x + 1);
    x2 = _mm_load_si128((xmmi *)x + 2);
    z0 = _mm_load_si128((xmmi *)z + 0);
    z1 = _mm_load_si128((xmmi *)z + 1);
    z2 = _mm_load_si128((xmmi *)z + 2);
    t = _mm_unpacklo_epi64(x0, z0); out[0].v = t; out[1].v = _mm_srli_epi64(t, 32);
    t = _mm_unpackhi_epi64(x0, z0); out[2].v = t; out[3].v = _mm_srli_epi64(t, 32);
    t = _mm_unpacklo_epi64(x1, z1); out[4].v = t; out[5].v = _mm_srli_epi64(t, 32);
    t = _mm_unpackhi_epi64(x1, z1); out[6].v = t; out[7].v = _mm_srli_epi64(t, 32);
    t = _mm_unpacklo_epi64(x2, z2); out[8].v = t; out[9].v = _mm_srli_epi64(t, 32);
}
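/*
 * Added commentary: in the 64-bit tangled form each packedelem64 holds limb
 * i of x in its low 64-bit lane and limb i of z in its high lane (ten
 * vectors for ten limbs). That is exactly the operand shape _mm_mul_epu32
 * consumes, so one multiply yields an x-product and a z-product side by
 * side.
 */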
DONNA_INLINE static void
curve25519_tangleone64(packedelem64 *out, const bignum25519 x) {
    xmmi x0,x1,x2;
    x0 = _mm_load_si128((xmmi *)(x + 0));
    x1 = _mm_load_si128((xmmi *)(x + 4));
    x2 = _mm_load_si128((xmmi *)(x + 8));
    out[0].v = _mm_shuffle_epi32(x0, _MM_SHUFFLE(0,0,0,0));
    out[1].v = _mm_shuffle_epi32(x0, _MM_SHUFFLE(1,1,1,1));
    out[2].v = _mm_shuffle_epi32(x0, _MM_SHUFFLE(2,2,2,2));
    out[3].v = _mm_shuffle_epi32(x0, _MM_SHUFFLE(3,3,3,3));
    out[4].v = _mm_shuffle_epi32(x1, _MM_SHUFFLE(0,0,0,0));
    out[5].v = _mm_shuffle_epi32(x1, _MM_SHUFFLE(1,1,1,1));
    out[6].v = _mm_shuffle_epi32(x1, _MM_SHUFFLE(2,2,2,2));
    out[7].v = _mm_shuffle_epi32(x1, _MM_SHUFFLE(3,3,3,3));
    out[8].v = _mm_shuffle_epi32(x2, _MM_SHUFFLE(0,0,0,0));
    out[9].v = _mm_shuffle_epi32(x2, _MM_SHUFFLE(1,1,1,1));
}

DONNA_INLINE static void
curve25519_swap64(packedelem64 *out) {
    out[0].v = _mm_shuffle_epi32(out[0].v, _MM_SHUFFLE(1,0,3,2));
    out[1].v = _mm_shuffle_epi32(out[1].v, _MM_SHUFFLE(1,0,3,2));
    out[2].v = _mm_shuffle_epi32(out[2].v, _MM_SHUFFLE(1,0,3,2));
    out[3].v = _mm_shuffle_epi32(out[3].v, _MM_SHUFFLE(1,0,3,2));
    out[4].v = _mm_shuffle_epi32(out[4].v, _MM_SHUFFLE(1,0,3,2));
    out[5].v = _mm_shuffle_epi32(out[5].v, _MM_SHUFFLE(1,0,3,2));
    out[6].v = _mm_shuffle_epi32(out[6].v, _MM_SHUFFLE(1,0,3,2));
    out[7].v = _mm_shuffle_epi32(out[7].v, _MM_SHUFFLE(1,0,3,2));
    out[8].v = _mm_shuffle_epi32(out[8].v, _MM_SHUFFLE(1,0,3,2));
    out[9].v = _mm_shuffle_epi32(out[9].v, _MM_SHUFFLE(1,0,3,2));
}

DONNA_INLINE static void
curve25519_untangle64(bignum25519 x, bignum25519 z, const packedelem64 *in) {
    _mm_store_si128((xmmi *)(x + 0), _mm_unpacklo_epi64(_mm_unpacklo_epi32(in[0].v, in[1].v), _mm_unpacklo_epi32(in[2].v, in[3].v)));
    _mm_store_si128((xmmi *)(x + 4), _mm_unpacklo_epi64(_mm_unpacklo_epi32(in[4].v, in[5].v), _mm_unpacklo_epi32(in[6].v, in[7].v)));
    _mm_store_si128((xmmi *)(x + 8), _mm_unpacklo_epi32(in[8].v, in[9].v));
    _mm_store_si128((xmmi *)(z + 0), _mm_unpacklo_epi64(_mm_unpackhi_epi32(in[0].v, in[1].v), _mm_unpackhi_epi32(in[2].v, in[3].v)));
    _mm_store_si128((xmmi *)(z + 4), _mm_unpacklo_epi64(_mm_unpackhi_epi32(in[4].v, in[5].v), _mm_unpackhi_epi32(in[6].v, in[7].v)));
    _mm_store_si128((xmmi *)(z + 8), _mm_unpackhi_epi32(in[8].v, in[9].v));
}

DONNA_INLINE static void
curve25519_mul_packed64(packedelem64 *out, const packedelem64 *r, const packedelem64 *s) {
    xmmi r1,r2,r3,r4,r5,r6,r7,r8,r9;
    xmmi r1_2,r3_2,r5_2,r7_2,r9_2;
    xmmi c1,c2;
    out[0].v = _mm_mul_epu32(r[0].v, s[0].v);
    out[1].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[1].v), _mm_mul_epu32(r[1].v, s[0].v));
    r1_2 = _mm_slli_epi32(r[1].v, 1);
    out[2].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r1_2, s[1].v), _mm_mul_epu32(r[2].v, s[0].v)));
    out[3].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[1].v), _mm_mul_epu32(r[3].v, s[0].v))));
    r3_2 = _mm_slli_epi32(r[3].v, 1);
    out[4].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r1_2, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r3_2, s[1].v), _mm_mul_epu32(r[4].v, s[0].v)))));
    out[5].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[5].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[3].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[1].v), _mm_mul_epu32(r[5].v, s[0].v))))));
    r5_2 = _mm_slli_epi32(r[5].v, 1);
    out[6].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r1_2, s[5].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r3_2, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r5_2, s[1].v), _mm_mul_epu32(r[6].v, s[0].v)))))));
    out[7].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[7].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[5].v), _mm_add_epi64(_mm_mul_epu32(r[3].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[5].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[6].v, s[1].v), _mm_mul_epu32(r[7].v, s[0].v))))))));
    r7_2 = _mm_slli_epi32(r[7].v, 1);
    out[8].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[8].v), _mm_add_epi64(_mm_mul_epu32(r1_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r3_2, s[5].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r5_2, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[6].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r7_2, s[1].v), _mm_mul_epu32(r[8].v, s[0].v)))))))));
    out[9].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[9].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[8].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[7].v), _mm_add_epi64(_mm_mul_epu32(r[3].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[5].v), _mm_add_epi64(_mm_mul_epu32(r[5].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r[6].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[7].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[8].v, s[1].v), _mm_mul_epu32(r[9].v, s[0].v))))))))));
    r1 = _mm_mul_epu32(r[1].v, packednineteen.v);
    r2 = _mm_mul_epu32(r[2].v, packednineteen.v);
    r1_2 = _mm_slli_epi32(r1, 1);
    r3 = _mm_mul_epu32(r[3].v, packednineteen.v);
    r4 = _mm_mul_epu32(r[4].v, packednineteen.v);
    r3_2 = _mm_slli_epi32(r3, 1);
    r5 = _mm_mul_epu32(r[5].v, packednineteen.v);
    r6 = _mm_mul_epu32(r[6].v, packednineteen.v);
    r5_2 = _mm_slli_epi32(r5, 1);
    r7 = _mm_mul_epu32(r[7].v, packednineteen.v);
    r8 = _mm_mul_epu32(r[8].v, packednineteen.v);
    r7_2 = _mm_slli_epi32(r7, 1);
    r9 = _mm_mul_epu32(r[9].v, packednineteen.v);
    r9_2 = _mm_slli_epi32(r9, 1);
    out[0].v = _mm_add_epi64(out[0].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[1].v), _mm_add_epi64(_mm_mul_epu32(r8, s[2].v), _mm_add_epi64(_mm_mul_epu32(r7_2, s[3].v), _mm_add_epi64(_mm_mul_epu32(r6, s[4].v), _mm_add_epi64(_mm_mul_epu32(r5_2, s[5].v), _mm_add_epi64(_mm_mul_epu32(r4, s[6].v), _mm_add_epi64(_mm_mul_epu32(r3_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r2, s[8].v), _mm_mul_epu32(r1_2, s[9].v))))))))));
    out[1].v = _mm_add_epi64(out[1].v, _mm_add_epi64(_mm_mul_epu32(r9, s[2].v), _mm_add_epi64(_mm_mul_epu32(r8, s[3].v), _mm_add_epi64(_mm_mul_epu32(r7, s[4].v), _mm_add_epi64(_mm_mul_epu32(r6, s[5].v), _mm_add_epi64(_mm_mul_epu32(r5, s[6].v), _mm_add_epi64(_mm_mul_epu32(r4, s[7].v), _mm_add_epi64(_mm_mul_epu32(r3, s[8].v), _mm_mul_epu32(r2, s[9].v)))))))));
    out[2].v = _mm_add_epi64(out[2].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[3].v), _mm_add_epi64(_mm_mul_epu32(r8, s[4].v), _mm_add_epi64(_mm_mul_epu32(r7_2, s[5].v), _mm_add_epi64(_mm_mul_epu32(r6, s[6].v), _mm_add_epi64(_mm_mul_epu32(r5_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r4, s[8].v), _mm_mul_epu32(r3_2, s[9].v))))))));
    out[3].v = _mm_add_epi64(out[3].v, _mm_add_epi64(_mm_mul_epu32(r9, s[4].v), _mm_add_epi64(_mm_mul_epu32(r8, s[5].v), _mm_add_epi64(_mm_mul_epu32(r7, s[6].v), _mm_add_epi64(_mm_mul_epu32(r6, s[7].v), _mm_add_epi64(_mm_mul_epu32(r5, s[8].v), _mm_mul_epu32(r4, s[9].v)))))));
    out[4].v = _mm_add_epi64(out[4].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[5].v), _mm_add_epi64(_mm_mul_epu32(r8, s[6].v), _mm_add_epi64(_mm_mul_epu32(r7_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r6, s[8].v), _mm_mul_epu32(r5_2, s[9].v))))));
    out[5].v = _mm_add_epi64(out[5].v, _mm_add_epi64(_mm_mul_epu32(r9, s[6].v), _mm_add_epi64(_mm_mul_epu32(r8, s[7].v), _mm_add_epi64(_mm_mul_epu32(r7, s[8].v), _mm_mul_epu32(r6, s[9].v)))));
    out[6].v = _mm_add_epi64(out[6].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r8, s[8].v), _mm_mul_epu32(r7_2, s[9].v))));
    out[7].v = _mm_add_epi64(out[7].v, _mm_add_epi64(_mm_mul_epu32(r9, s[8].v), _mm_mul_epu32(r8, s[9].v)));
    out[8].v = _mm_add_epi64(out[8].v, _mm_mul_epu32(r9_2, s[9].v));
    c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
    c1 = _mm_srli_epi64(out[1].v, 25); c2 = _mm_srli_epi64(out[5].v, 25); out[1].v = _mm_and_si128(out[1].v, packedmask25.v); out[5].v = _mm_and_si128(out[5].v, packedmask25.v); out[2].v = _mm_add_epi64(out[2].v, c1); out[6].v = _mm_add_epi64(out[6].v, c2);
    c1 = _mm_srli_epi64(out[2].v, 26); c2 = _mm_srli_epi64(out[6].v, 26); out[2].v = _mm_and_si128(out[2].v, packedmask26.v); out[6].v = _mm_and_si128(out[6].v, packedmask26.v); out[3].v = _mm_add_epi64(out[3].v, c1); out[7].v = _mm_add_epi64(out[7].v, c2);
    c1 = _mm_srli_epi64(out[3].v, 25); c2 = _mm_srli_epi64(out[7].v, 25); out[3].v = _mm_and_si128(out[3].v, packedmask25.v); out[7].v = _mm_and_si128(out[7].v, packedmask25.v); out[4].v = _mm_add_epi64(out[4].v, c1); out[8].v = _mm_add_epi64(out[8].v, c2);
    c2 = _mm_srli_epi64(out[8].v, 26); out[8].v = _mm_and_si128(out[8].v, packedmask26.v); out[9].v = _mm_add_epi64(out[9].v, c2);
    c2 = _mm_srli_epi64(out[9].v, 25); out[9].v = _mm_and_si128(out[9].v, packedmask25.v); out[0].v = _mm_add_epi64(out[0].v, _mm_mul_epu32(c2, packednineteen.v));
    c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
}
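/*
 * Added commentary: because the inputs are tangled, this routine performs
 * two independent field multiplications at once, the x products in the low
 * 64-bit lanes and the z products in the high lanes, using the same
 * schoolbook product with the 19-fold and interleaved carry chain as
 * curve25519_mul above.
 */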
  749. DONNA_INLINE static void
  750. curve25519_square_packed64(packedelem64 *out, const packedelem64 *r) {
  751. xmmi r0,r1,r2,r3;
  752. xmmi r1_2,r3_2,r4_2,r5_2,r6_2,r7_2;
  753. xmmi d5,d6,d7,d8,d9;
  754. xmmi c1,c2;
  755. r0 = r[0].v;
  756. r1 = r[1].v;
  757. r2 = r[2].v;
  758. r3 = r[3].v;
  759. out[0].v = _mm_mul_epu32(r0, r0);
  760. r0 = _mm_slli_epi32(r0, 1);
  761. out[1].v = _mm_mul_epu32(r0, r1);
  762. r1_2 = _mm_slli_epi32(r1, 1);
  763. out[2].v = _mm_add_epi64(_mm_mul_epu32(r0, r2 ), _mm_mul_epu32(r1, r1_2));
  764. r1 = r1_2;
  765. out[3].v = _mm_add_epi64(_mm_mul_epu32(r0, r3 ), _mm_mul_epu32(r1, r2 ));
  766. r3_2 = _mm_slli_epi32(r3, 1);
  767. out[4].v = _mm_add_epi64(_mm_mul_epu32(r0, r[4].v), _mm_add_epi64(_mm_mul_epu32(r1, r3_2 ), _mm_mul_epu32(r2, r2)));
  768. r2 = _mm_slli_epi32(r2, 1);
  769. out[5].v = _mm_add_epi64(_mm_mul_epu32(r0, r[5].v), _mm_add_epi64(_mm_mul_epu32(r1, r[4].v), _mm_mul_epu32(r2, r3)));
  770. r5_2 = _mm_slli_epi32(r[5].v, 1);
  771. out[6].v = _mm_add_epi64(_mm_mul_epu32(r0, r[6].v), _mm_add_epi64(_mm_mul_epu32(r1, r5_2 ), _mm_add_epi64(_mm_mul_epu32(r2, r[4].v), _mm_mul_epu32(r3, r3_2 ))));
  772. r3 = r3_2;
  773. out[7].v = _mm_add_epi64(_mm_mul_epu32(r0, r[7].v), _mm_add_epi64(_mm_mul_epu32(r1, r[6].v), _mm_add_epi64(_mm_mul_epu32(r2, r[5].v), _mm_mul_epu32(r3, r[4].v))));
  774. r7_2 = _mm_slli_epi32(r[7].v, 1);
  775. out[8].v = _mm_add_epi64(_mm_mul_epu32(r0, r[8].v), _mm_add_epi64(_mm_mul_epu32(r1, r7_2 ), _mm_add_epi64(_mm_mul_epu32(r2, r[6].v), _mm_add_epi64(_mm_mul_epu32(r3, r5_2 ), _mm_mul_epu32(r[4].v, r[4].v)))));
  776. out[9].v = _mm_add_epi64(_mm_mul_epu32(r0, r[9].v), _mm_add_epi64(_mm_mul_epu32(r1, r[8].v), _mm_add_epi64(_mm_mul_epu32(r2, r[7].v), _mm_add_epi64(_mm_mul_epu32(r3, r[6].v), _mm_mul_epu32(r[4].v, r5_2 )))));
	d5 = _mm_mul_epu32(r[5].v, packedthirtyeight.v);
	d6 = _mm_mul_epu32(r[6].v, packednineteen.v);
	d7 = _mm_mul_epu32(r[7].v, packedthirtyeight.v);
	d8 = _mm_mul_epu32(r[8].v, packednineteen.v);
	d9 = _mm_mul_epu32(r[9].v, packedthirtyeight.v);
	r4_2 = _mm_slli_epi32(r[4].v, 1);
	r6_2 = _mm_slli_epi32(r[6].v, 1);
	out[0].v = _mm_add_epi64(out[0].v, _mm_add_epi64(_mm_mul_epu32(d9, r1), _mm_add_epi64(_mm_mul_epu32(d8, r2), _mm_add_epi64(_mm_mul_epu32(d7, r3), _mm_add_epi64(_mm_mul_epu32(d6, r4_2), _mm_mul_epu32(d5, r[5].v))))));
	out[1].v = _mm_add_epi64(out[1].v, _mm_add_epi64(_mm_mul_epu32(d9, _mm_srli_epi32(r2, 1)), _mm_add_epi64(_mm_mul_epu32(d8, r3), _mm_add_epi64(_mm_mul_epu32(d7, r[4].v), _mm_mul_epu32(d6, r5_2)))));
	out[2].v = _mm_add_epi64(out[2].v, _mm_add_epi64(_mm_mul_epu32(d9, r3), _mm_add_epi64(_mm_mul_epu32(d8, r4_2), _mm_add_epi64(_mm_mul_epu32(d7, r5_2), _mm_mul_epu32(d6, r[6].v)))));
	out[3].v = _mm_add_epi64(out[3].v, _mm_add_epi64(_mm_mul_epu32(d9, r[4].v), _mm_add_epi64(_mm_mul_epu32(d8, r5_2), _mm_mul_epu32(d7, r[6].v))));
	out[4].v = _mm_add_epi64(out[4].v, _mm_add_epi64(_mm_mul_epu32(d9, r5_2), _mm_add_epi64(_mm_mul_epu32(d8, r6_2), _mm_mul_epu32(d7, r[7].v))));
	out[5].v = _mm_add_epi64(out[5].v, _mm_add_epi64(_mm_mul_epu32(d9, r[6].v), _mm_mul_epu32(d8, r7_2)));
	out[6].v = _mm_add_epi64(out[6].v, _mm_add_epi64(_mm_mul_epu32(d9, r7_2), _mm_mul_epu32(d8, r[8].v)));
	out[7].v = _mm_add_epi64(out[7].v, _mm_mul_epu32(d9, r[8].v));
	out[8].v = _mm_add_epi64(out[8].v, _mm_mul_epu32(d9, r[9].v));
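	/* Editorial note: the reduction below runs two carry chains in
	   parallel (limbs 0..4 and 4..8), wraps the top limb into limb 0
	   scaled by 19, and finishes with one more (0,4) step so every limb
	   is small enough to feed the next multiplication. */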
	c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
	c1 = _mm_srli_epi64(out[1].v, 25); c2 = _mm_srli_epi64(out[5].v, 25); out[1].v = _mm_and_si128(out[1].v, packedmask25.v); out[5].v = _mm_and_si128(out[5].v, packedmask25.v); out[2].v = _mm_add_epi64(out[2].v, c1); out[6].v = _mm_add_epi64(out[6].v, c2);
	c1 = _mm_srli_epi64(out[2].v, 26); c2 = _mm_srli_epi64(out[6].v, 26); out[2].v = _mm_and_si128(out[2].v, packedmask26.v); out[6].v = _mm_and_si128(out[6].v, packedmask26.v); out[3].v = _mm_add_epi64(out[3].v, c1); out[7].v = _mm_add_epi64(out[7].v, c2);
	c1 = _mm_srli_epi64(out[3].v, 25); c2 = _mm_srli_epi64(out[7].v, 25); out[3].v = _mm_and_si128(out[3].v, packedmask25.v); out[7].v = _mm_and_si128(out[7].v, packedmask25.v); out[4].v = _mm_add_epi64(out[4].v, c1); out[8].v = _mm_add_epi64(out[8].v, c2);
	c2 = _mm_srli_epi64(out[8].v, 26); out[8].v = _mm_and_si128(out[8].v, packedmask26.v); out[9].v = _mm_add_epi64(out[9].v, c2);
	c2 = _mm_srli_epi64(out[9].v, 25); out[9].v = _mm_and_si128(out[9].v, packedmask25.v); out[0].v = _mm_add_epi64(out[0].v, _mm_mul_epu32(c2, packednineteen.v));
	c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
}
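
/* Editorial sketch (not part of the original implementation): the routines
   above use the usual donna radix, where limb i carries weight
   2^ceil(25.5*i). The hypothetical helper below only documents that weight
   schedule and is fenced off so it never affects compilation. */
#if 0
static int
curve25519_limb_offset_bits(int i) {
	/* ceil(25.5 * i) = 0, 26, 51, 77, 102, 128, 153, 179, 204, 230, 255 */
	return ((51 * i) + 1) / 2;
}
#endif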

/* Take a little-endian, 32-byte number and expand it into polynomial form */
static void
curve25519_expand(bignum25519 out, const unsigned char in[32]) {
	uint32_t x0,x1,x2,x3,x4,x5,x6,x7;

	x0 = *(uint32_t *)(in + 0);
	x1 = *(uint32_t *)(in + 4);
	x2 = *(uint32_t *)(in + 8);
	x3 = *(uint32_t *)(in + 12);
	x4 = *(uint32_t *)(in + 16);
	x5 = *(uint32_t *)(in + 20);
	x6 = *(uint32_t *)(in + 24);
	x7 = *(uint32_t *)(in + 28);

	out[0] = (                       x0       ) & 0x3ffffff;
	out[1] = ((((uint64_t)x1 << 32) | x0) >> 26) & 0x1ffffff;
	out[2] = ((((uint64_t)x2 << 32) | x1) >> 19) & 0x3ffffff;
	out[3] = ((((uint64_t)x3 << 32) | x2) >> 13) & 0x1ffffff;
	out[4] = ((                      x3) >>  6) & 0x3ffffff;
	out[5] = (                       x4       ) & 0x1ffffff;
	out[6] = ((((uint64_t)x5 << 32) | x4) >> 25) & 0x3ffffff;
	out[7] = ((((uint64_t)x6 << 32) | x5) >> 19) & 0x1ffffff;
	out[8] = ((((uint64_t)x7 << 32) | x6) >> 12) & 0x3ffffff;
	out[9] = ((                      x7) >>  6) & 0x1ffffff;
	out[10] = 0;
	out[11] = 0;
}
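
/* Editorial note: limb i of the expansion holds the input bits starting at
   bit ceil(25.5*i), masked to 26 bits (even limbs) or 25 bits (odd limbs);
   bit 255 is discarded. out[10] and out[11] are zero padding so that a
   bignum25519 fills exactly three 128-bit vectors for the SSE2 loads and
   stores used elsewhere in this file. The word casts assume a little-endian
   target that tolerates unaligned 32-bit loads, as any SSE2-capable x86
   does. */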

/* Take a fully reduced polynomial form number and contract it into a
 * little-endian, 32-byte array
 */
static void
curve25519_contract(unsigned char out[32], const bignum25519 in) {
	bignum25519 ALIGN(16) f;

	curve25519_copy(f, in);

	#define carry_pass() \
		f[1] += f[0] >> 26; f[0] &= 0x3ffffff; \
		f[2] += f[1] >> 25; f[1] &= 0x1ffffff; \
		f[3] += f[2] >> 26; f[2] &= 0x3ffffff; \
		f[4] += f[3] >> 25; f[3] &= 0x1ffffff; \
		f[5] += f[4] >> 26; f[4] &= 0x3ffffff; \
		f[6] += f[5] >> 25; f[5] &= 0x1ffffff; \
		f[7] += f[6] >> 26; f[6] &= 0x3ffffff; \
		f[8] += f[7] >> 25; f[7] &= 0x1ffffff; \
		f[9] += f[8] >> 26; f[8] &= 0x3ffffff;

	#define carry_pass_full() \
		carry_pass() \
		f[0] += 19 * (f[9] >> 25); f[9] &= 0x1ffffff;

	#define carry_pass_final() \
		carry_pass() \
		f[9] &= 0x1ffffff;
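
	/* Editorial note: two full passes bring f below 2^255. Adding 19 and
	   then 2^255-19 (spread across the limbs below) gives f+2^255 when
	   f < p and f-p+2^255 when f >= p, for p = 2^255-19; carry_pass_final
	   then discards bit 255, leaving exactly f mod p with no
	   secret-dependent branches. */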

	carry_pass_full()
	carry_pass_full()

	/* now f is between 0 and 2^255-1, properly carried. */
	/* case 1: between 0 and 2^255-20. case 2: between 2^255-19 and 2^255-1. */
	f[0] += 19;
	carry_pass_full()

	/* now between 19 and 2^255-1 in both cases, and offset by 19. */
	f[0] += (1 << 26) - 19;
	f[1] += (1 << 25) - 1;
	f[2] += (1 << 26) - 1;
	f[3] += (1 << 25) - 1;
	f[4] += (1 << 26) - 1;
	f[5] += (1 << 25) - 1;
	f[6] += (1 << 26) - 1;
	f[7] += (1 << 25) - 1;
	f[8] += (1 << 26) - 1;
	f[9] += (1 << 25) - 1;

	/* now between 2^255 and 2^256-20, and offset by 2^255. */
	carry_pass_final()

	#undef carry_pass
	#undef carry_pass_full
	#undef carry_pass_final

	f[1] <<= 2;
	f[2] <<= 3;
	f[3] <<= 5;
	f[4] <<= 6;
	f[6] <<= 1;
	f[7] <<= 3;
	f[8] <<= 4;
	f[9] <<= 6;
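
	/* Editorial note: limb i begins at bit ceil(25.5*i) of the serialized
	   number, so the shifts above move each limb to its offset within its
	   starting byte (e.g. limb 1 begins at bit 26 = 8*3 + 2: shift left by
	   2, store at byte 3). Bytes 0 and 16 are cleared below because they
	   are the only OR targets not already written by an earlier F(). */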

	#define F(i, s) \
		out[s+0] |= (unsigned char)(f[i] & 0xff); \
		out[s+1] = (unsigned char)((f[i] >> 8) & 0xff); \
		out[s+2] = (unsigned char)((f[i] >> 16) & 0xff); \
		out[s+3] = (unsigned char)((f[i] >> 24) & 0xff);

	out[0] = 0;
	out[16] = 0;
	F(0,0);
	F(1,3);
	F(2,6);
	F(3,9);
	F(4,12);
	F(5,16);
	F(6,19);
	F(7,22);
	F(8,25);
	F(9,28);
	#undef F
}
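
/* Editorial sketch (hypothetical, fenced off from compilation): expand and
   contract invert each other on canonical inputs, so a quick self-test
   could round-trip any 32-byte value that encodes a number below
   2^255-19. */
#if 0
static int
curve25519_expand_contract_selftest(const unsigned char in[32]) {
	bignum25519 ALIGN(16) t;
	unsigned char back[32];
	int i;
	curve25519_expand(t, in);
	curve25519_contract(back, t);
	for (i = 0; i < 32; i++)
		if (back[i] != in[i])
			return 0; /* mismatch: non-canonical input or a bug */
	return 1;
}
#endif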

/* if (iswap) swap(a, b) */
DONNA_INLINE static void
curve25519_swap_conditional(bignum25519 a, bignum25519 b, uint32_t iswap) {
	const uint32_t swap = (uint32_t)(-(int32_t)iswap);
	xmmi a0,a1,a2,b0,b1,b2,x0,x1,x2;
	xmmi mask = _mm_cvtsi32_si128(swap);
	mask = _mm_shuffle_epi32(mask, 0);
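	/* mask is all ones in every lane when iswap is nonzero, all zeros
	   otherwise */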
	a0 = _mm_load_si128((xmmi *)a + 0);
	a1 = _mm_load_si128((xmmi *)a + 1);
	b0 = _mm_load_si128((xmmi *)b + 0);
	b1 = _mm_load_si128((xmmi *)b + 1);
	b0 = _mm_xor_si128(a0, b0);
	b1 = _mm_xor_si128(a1, b1);
	x0 = _mm_and_si128(b0, mask);
	x1 = _mm_and_si128(b1, mask);
	x0 = _mm_xor_si128(x0, a0);
	x1 = _mm_xor_si128(x1, a1);
	a0 = _mm_xor_si128(x0, b0);
	a1 = _mm_xor_si128(x1, b1);
	_mm_store_si128((xmmi *)a + 0, x0);
	_mm_store_si128((xmmi *)a + 1, x1);
	_mm_store_si128((xmmi *)b + 0, a0);
	_mm_store_si128((xmmi *)b + 1, a1);

	a2 = _mm_load_si128((xmmi *)a + 2);
	b2 = _mm_load_si128((xmmi *)b + 2);
	b2 = _mm_xor_si128(a2, b2);
	x2 = _mm_and_si128(b2, mask);
	x2 = _mm_xor_si128(x2, a2);
	a2 = _mm_xor_si128(x2, b2);
	_mm_store_si128((xmmi *)b + 2, a2);
	_mm_store_si128((xmmi *)a + 2, x2);
}
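
/* Editorial note: the conditional swap above is the branch-free identity
   t = (a ^ b) & mask; a' = a ^ t; b' = b ^ t; so the instruction flow and
   memory access pattern are independent of iswap. A caller such as a
   Montgomery ladder would pass the current (secret) scalar bit as iswap. */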
/* out = (flag) ? in : out */
DONNA_INLINE static void
curve25519_move_conditional_bytes(uint8_t out[96], const uint8_t in[96], uint32_t flag) {
	xmmi a0,a1,a2,a3,a4,a5,b0,b1,b2,b3,b4,b5;
	const uint32_t nb = flag - 1;
	xmmi masknb = _mm_shuffle_epi32(_mm_cvtsi32_si128(nb),0);
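	/* nb = flag - 1 is all ones when flag == 0 and all zeros when
	   flag == 1; masknb broadcasts it to every lane */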
	a0 = _mm_load_si128((xmmi *)in + 0);
	a1 = _mm_load_si128((xmmi *)in + 1);
	a2 = _mm_load_si128((xmmi *)in + 2);
	b0 = _mm_load_si128((xmmi *)out + 0);
	b1 = _mm_load_si128((xmmi *)out + 1);
	b2 = _mm_load_si128((xmmi *)out + 2);
	a0 = _mm_andnot_si128(masknb, a0);
	a1 = _mm_andnot_si128(masknb, a1);
	a2 = _mm_andnot_si128(masknb, a2);
	b0 = _mm_and_si128(masknb, b0);
	b1 = _mm_and_si128(masknb, b1);
	b2 = _mm_and_si128(masknb, b2);
	a0 = _mm_or_si128(a0, b0);
	a1 = _mm_or_si128(a1, b1);
	a2 = _mm_or_si128(a2, b2);
	_mm_store_si128((xmmi *)out + 0, a0);
	_mm_store_si128((xmmi *)out + 1, a1);
	_mm_store_si128((xmmi *)out + 2, a2);

	a3 = _mm_load_si128((xmmi *)in + 3);
	a4 = _mm_load_si128((xmmi *)in + 4);
	a5 = _mm_load_si128((xmmi *)in + 5);
	b3 = _mm_load_si128((xmmi *)out + 3);
	b4 = _mm_load_si128((xmmi *)out + 4);
	b5 = _mm_load_si128((xmmi *)out + 5);
	a3 = _mm_andnot_si128(masknb, a3);
	a4 = _mm_andnot_si128(masknb, a4);
	a5 = _mm_andnot_si128(masknb, a5);
	b3 = _mm_and_si128(masknb, b3);
	b4 = _mm_and_si128(masknb, b4);
	b5 = _mm_and_si128(masknb, b5);
	a3 = _mm_or_si128(a3, b3);
	a4 = _mm_or_si128(a4, b4);
	a5 = _mm_or_si128(a5, b5);
	_mm_store_si128((xmmi *)out + 3, a3);
	_mm_store_si128((xmmi *)out + 4, a4);
	_mm_store_si128((xmmi *)out + 5, a5);
}
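
/* Editorial note: with masknb as above, each store computes
   out = (in & ~masknb) | (out & masknb), i.e. out is overwritten with in
   exactly when flag == 1, again without a secret-dependent branch. */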