/*
 * TedKrovetzAesNiWrapperC.cpp
 * Pipelined AES-NI helpers (key expansion + ECB encryption), based on
 * Ted Krovetz's AES-NI reference code.
 */
  1. #include "TedKrovetzAesNiWrapperC.h"
  2. #ifdef USE_PIPELINED_AES_NI
  3. #ifdef _WIN32
  4. #include "StdAfx.h"
  5. #endif
  6. void AES_128_Key_Expansion(const unsigned char *userkey, AES_KEY *aesKey)
  7. {
  8. block x0,x1,x2;
  9. //block *kp = (block *)&aesKey;
  10. aesKey->rd_key[0] = x0 = _mm_loadu_si128((block*)userkey);
  11. x2 = _mm_setzero_si128();
  12. EXPAND_ASSIST(x0, x1, x2, x0, 255, 1); aesKey->rd_key[1] = x0;
  13. EXPAND_ASSIST(x0, x1, x2, x0, 255, 2); aesKey->rd_key[2] = x0;
  14. EXPAND_ASSIST(x0, x1, x2, x0, 255, 4); aesKey->rd_key[3] = x0;
  15. EXPAND_ASSIST(x0, x1, x2, x0, 255, 8); aesKey->rd_key[4] = x0;
  16. EXPAND_ASSIST(x0, x1, x2, x0, 255, 16); aesKey->rd_key[5] = x0;
  17. EXPAND_ASSIST(x0, x1, x2, x0, 255, 32); aesKey->rd_key[6] = x0;
  18. EXPAND_ASSIST(x0, x1, x2, x0, 255, 64); aesKey->rd_key[7] = x0;
  19. EXPAND_ASSIST(x0, x1, x2, x0, 255, 128); aesKey->rd_key[8] = x0;
  20. EXPAND_ASSIST(x0, x1, x2, x0, 255, 27); aesKey->rd_key[9] = x0;
  21. EXPAND_ASSIST(x0, x1, x2, x0, 255, 54); aesKey->rd_key[10] = x0;
  22. }
  23. void AES_192_Key_Expansion(const unsigned char *userkey, AES_KEY *aesKey)
  24. {
  25. __m128i x0,x1,x2,x3,tmp,*kp = (block *)&aesKey;
  26. kp[0] = x0 = _mm_loadu_si128((block*)userkey);
  27. tmp = x3 = _mm_loadu_si128((block*)(userkey+16));
  28. x2 = _mm_setzero_si128();
  29. EXPAND192_STEP(1,1);
  30. EXPAND192_STEP(4,4);
  31. EXPAND192_STEP(7,16);
  32. EXPAND192_STEP(10,64);
  33. }
  34. void AES_256_Key_Expansion(const unsigned char *userkey, AES_KEY *aesKey)
  35. {
  36. __m128i x0, x1, x2, x3;/* , *kp = (block *)&aesKey;*/
  37. aesKey->rd_key[0] = x0 = _mm_loadu_si128((block*)userkey);
  38. aesKey->rd_key[1] = x3 = _mm_loadu_si128((block*)(userkey + 16));
  39. x2 = _mm_setzero_si128();
  40. EXPAND_ASSIST(x0, x1, x2, x3, 255, 1); aesKey->rd_key[2] = x0;
  41. EXPAND_ASSIST(x3, x1, x2, x0, 170, 1); aesKey->rd_key[3] = x3;
  42. EXPAND_ASSIST(x0, x1, x2, x3, 255, 2); aesKey->rd_key[4] = x0;
  43. EXPAND_ASSIST(x3, x1, x2, x0, 170, 2); aesKey->rd_key[5] = x3;
  44. EXPAND_ASSIST(x0, x1, x2, x3, 255, 4); aesKey->rd_key[6] = x0;
  45. EXPAND_ASSIST(x3, x1, x2, x0, 170, 4); aesKey->rd_key[7] = x3;
  46. EXPAND_ASSIST(x0, x1, x2, x3, 255, 8); aesKey->rd_key[8] = x0;
  47. EXPAND_ASSIST(x3, x1, x2, x0, 170, 8); aesKey->rd_key[9] = x3;
  48. EXPAND_ASSIST(x0, x1, x2, x3, 255, 16); aesKey->rd_key[10] = x0;
  49. EXPAND_ASSIST(x3, x1, x2, x0, 170, 16); aesKey->rd_key[11] = x3;
  50. EXPAND_ASSIST(x0, x1, x2, x3, 255, 32); aesKey->rd_key[12] = x0;
  51. EXPAND_ASSIST(x3, x1, x2, x0, 170, 32); aesKey->rd_key[13] = x3;
  52. EXPAND_ASSIST(x0, x1, x2, x3, 255, 64); aesKey->rd_key[14] = x0;
  53. }
  54. void AES_set_encrypt_key(const unsigned char *userKey, const int bits, AES_KEY *aesKey)
  55. {
  56. if (bits == 128) {
  57. AES_128_Key_Expansion(userKey, aesKey);
  58. } else if (bits == 192) {
  59. AES_192_Key_Expansion(userKey, aesKey);
  60. } else if (bits == 256) {
  61. AES_256_Key_Expansion(userKey, aesKey);
  62. }
  63. aesKey->rounds = 6 + bits / 32;
  64. }
  65. void AES_encryptC(block *in, block *out, AES_KEY *aesKey)
  66. {
  67. int j, rnds = ROUNDS(aesKey);
  68. const __m128i *sched = ((__m128i *)(aesKey->rd_key));
  69. __m128i tmp = _mm_load_si128((__m128i*)in);
  70. tmp = _mm_xor_si128(tmp, sched[0]);
  71. for (j = 1; j<rnds; j++) tmp = _mm_aesenc_si128(tmp, sched[j]);
  72. tmp = _mm_aesenclast_si128(tmp, sched[j]);
  73. _mm_store_si128((__m128i*)out, tmp);
  74. }
  75. void AES_ecb_encrypt(block *blk, AES_KEY *aesKey) {
  76. unsigned j, rnds = ROUNDS(aesKey);
  77. const block *sched = ((block *)(aesKey->rd_key));
  78. *blk = _mm_xor_si128(*blk, sched[0]);
  79. for (j = 1; j<rnds; ++j)
  80. *blk = _mm_aesenc_si128(*blk, sched[j]);
  81. *blk = _mm_aesenclast_si128(*blk, sched[j]);
  82. }
  83. void AES_ecb_encrypt_blks(block *blks, unsigned nblks, AES_KEY *aesKey) {
  84. unsigned i,j,rnds=ROUNDS(aesKey);
  85. const block *sched = ((block *)(aesKey->rd_key));
  86. for (i=0; i<nblks; ++i)
  87. blks[i] =_mm_xor_si128(blks[i], sched[0]);
  88. for(j=1; j<rnds; ++j)
  89. for (i=0; i<nblks; ++i)
  90. blks[i] = _mm_aesenc_si128(blks[i], sched[j]);
  91. for (i=0; i<nblks; ++i)
  92. blks[i] =_mm_aesenclast_si128(blks[i], sched[j]);
  93. }
  94. void AES_ecb_encrypt_blks_4(block *blks, AES_KEY *aesKey) {
  95. unsigned j, rnds = ROUNDS(aesKey);
  96. const block *sched = ((block *)(aesKey->rd_key));
  97. blks[0] = _mm_xor_si128(blks[0], sched[0]);
  98. blks[1] = _mm_xor_si128(blks[1], sched[0]);
  99. blks[2] = _mm_xor_si128(blks[2], sched[0]);
  100. blks[3] = _mm_xor_si128(blks[3], sched[0]);
  101. for (j = 1; j < rnds; ++j){
  102. blks[0] = _mm_aesenc_si128(blks[0], sched[j]);
  103. blks[1] = _mm_aesenc_si128(blks[1], sched[j]);
  104. blks[2] = _mm_aesenc_si128(blks[2], sched[j]);
  105. blks[3] = _mm_aesenc_si128(blks[3], sched[j]);
  106. }
  107. blks[0] = _mm_aesenclast_si128(blks[0], sched[j]);
  108. blks[1] = _mm_aesenclast_si128(blks[1], sched[j]);
  109. blks[2] = _mm_aesenclast_si128(blks[2], sched[j]);
  110. blks[3] = _mm_aesenclast_si128(blks[3], sched[j]);
  111. }
  112. void AES_ecb_encrypt_blks_2_in_out(block *in, block *out, AES_KEY *aesKey) {
  113. unsigned j, rnds = ROUNDS(aesKey);
  114. const block *sched = ((block *)(aesKey->rd_key));
  115. out[0] = _mm_xor_si128(in[0], sched[0]);
  116. out[1] = _mm_xor_si128(in[1], sched[0]);
  117. for (j = 1; j < rnds; ++j){
  118. out[0] = _mm_aesenc_si128(out[0], sched[j]);
  119. out[1] = _mm_aesenc_si128(out[1], sched[j]);
  120. }
  121. out[0] = _mm_aesenclast_si128(out[0], sched[j]);
  122. out[1] = _mm_aesenclast_si128(out[1], sched[j]);
  123. }
  124. void AES_ecb_encrypt_blks_4_in_out(block *in, block *out, AES_KEY *aesKey) {
  125. unsigned j, rnds = ROUNDS(aesKey);
  126. const block *sched = ((block *)(aesKey->rd_key));
  127. //block temp[4];
  128. out[0] = _mm_xor_si128(in[0], sched[0]);
  129. out[1] = _mm_xor_si128(in[1], sched[0]);
  130. out[2] = _mm_xor_si128(in[2], sched[0]);
  131. out[3] = _mm_xor_si128(in[3], sched[0]);
  132. for (j = 1; j < rnds; ++j){
  133. out[0] = _mm_aesenc_si128(out[0], sched[j]);
  134. out[1] = _mm_aesenc_si128(out[1], sched[j]);
  135. out[2] = _mm_aesenc_si128(out[2], sched[j]);
  136. out[3] = _mm_aesenc_si128(out[3], sched[j]);
  137. }
  138. out[0] = _mm_aesenclast_si128(out[0], sched[j]);
  139. out[1] = _mm_aesenclast_si128(out[1], sched[j]);
  140. out[2] = _mm_aesenclast_si128(out[2], sched[j]);
  141. out[3] = _mm_aesenclast_si128(out[3], sched[j]);
  142. }
  143. void AES_ecb_encrypt_blks_4_in_out_ind_keys(block *in, block *out, AES_KEY **aesKey, block** sched) {
  144. unsigned j, rnds = ROUNDS(aesKey[0]);
  145. sched[0] = ((block *)(aesKey[0][0].rd_key));
  146. sched[1] = ((block *)(aesKey[0][1].rd_key));
  147. sched[2] = ((block *)(aesKey[0][2].rd_key));
  148. sched[3] = ((block *)(aesKey[0][3].rd_key));
  149. //block temp[4];
  150. out[0] = _mm_xor_si128(in[0], sched[0][0]);
  151. out[1] = _mm_xor_si128(in[1], sched[1][0]);
  152. out[2] = _mm_xor_si128(in[2], sched[2][0]);
  153. out[3] = _mm_xor_si128(in[3], sched[3][0]);
  154. for (j = 1; j < rnds; ++j){
  155. out[0] = _mm_aesenc_si128(out[0], sched[0][j]);
  156. out[1] = _mm_aesenc_si128(out[1], sched[1][j]);
  157. out[2] = _mm_aesenc_si128(out[2], sched[2][j]);
  158. out[3] = _mm_aesenc_si128(out[3], sched[3][j]);
  159. }
  160. out[0] = _mm_aesenclast_si128(out[0], sched[0][j]);
  161. out[1] = _mm_aesenclast_si128(out[1], sched[1][j]);
  162. out[2] = _mm_aesenclast_si128(out[2], sched[2][j]);
  163. out[3] = _mm_aesenclast_si128(out[3], sched[3][j]);
  164. }
  165. void AES_ecb_encrypt_blks_4_in_out_par_ks(block *in, block *out, const unsigned char* userkey) {
  166. unsigned int j, rnds = 10;
  167. block k0, k1, k2, k3, ktmp, k0tmp, k1tmp, k2tmp, k3tmp;
  168. /*aesKey->rd_key[0] = x0 = _mm_loadu_si128((block*)userkey);
  169. x2 = _mm_setzero_si128();
  170. EXPAND_ASSIST(x0, x1, x2, x0, 255, 2); aesKey->rd_key[2] = x0;
  171. EXPAND_ASSIST(x0, x1, x2, x0, 255, 4); aesKey->rd_key[3] = x0;
  172. EXPAND_ASSIST(x0, x1, x2, x0, 255, 8); aesKey->rd_key[4] = x0;
  173. EXPAND_ASSIST(x0, x1, x2, x0, 255, 16); aesKey->rd_key[5] = x0;
  174. EXPAND_ASSIST(x0, x1, x2, x0, 255, 32); aesKey->rd_key[6] = x0;
  175. EXPAND_ASSIST(x0, x1, x2, x0, 255, 64); aesKey->rd_key[7] = x0;
  176. EXPAND_ASSIST(x0, x1, x2, x0, 255, 128); aesKey->rd_key[8] = x0;
  177. EXPAND_ASSIST(x0, x1, x2, x0, 255, 27); aesKey->rd_key[9] = x0;
  178. EXPAND_ASSIST(x0, x1, x2, x0, 255, 54); aesKey->rd_key[10] = x0;*/
  179. /*sched[0] = ((block *)(aesKey[0]->rd_key));
  180. sched[1] = ((block *)(aesKey[1]->rd_key));
  181. sched[2] = ((block *)(aesKey[2]->rd_key));
  182. sched[3] = ((block *)(aesKey[3]->rd_key));*/
  183. k0 = _mm_loadu_si128((block*)userkey);
  184. out[0] = _mm_xor_si128(in[0], k0);
  185. k1 = _mm_loadu_si128((block*)(userkey+16));
  186. out[1] = _mm_xor_si128(in[1], k1);
  187. k2 = _mm_loadu_si128((block*)(userkey+32));
  188. out[2] = _mm_xor_si128(in[2], k2);
  189. k3 = _mm_loadu_si128((block*)(userkey+48));
  190. out[3] = _mm_xor_si128(in[3], k3);
  191. k0tmp = _mm_setzero_si128();
  192. k1tmp = _mm_setzero_si128();
  193. k2tmp = _mm_setzero_si128();
  194. k3tmp = _mm_setzero_si128();
  195. //First Round
  196. EXPAND_ASSIST(k0, ktmp, k0tmp, k0, 255, 1);
  197. out[0] = _mm_aesenc_si128(out[0], k0);
  198. EXPAND_ASSIST(k1, ktmp, k1tmp, k1, 255, 1);
  199. out[1] = _mm_aesenc_si128(out[1], k1);
  200. EXPAND_ASSIST(k2, ktmp, k2tmp, k2, 255, 1);
  201. out[2] = _mm_aesenc_si128(out[2], k2);
  202. EXPAND_ASSIST(k3, ktmp, k3tmp, k3, 255, 1);
  203. out[3] = _mm_aesenc_si128(out[3], k3);
  204. //Second Round
  205. EXPAND_ASSIST(k0, ktmp, k0tmp, k0, 255, 2);
  206. out[0] = _mm_aesenc_si128(out[0], k0);
  207. EXPAND_ASSIST(k1, ktmp, k1tmp, k1, 255, 2);
  208. out[1] = _mm_aesenc_si128(out[1], k1);
  209. EXPAND_ASSIST(k2, ktmp, k2tmp, k2, 255, 2);
  210. out[2] = _mm_aesenc_si128(out[2], k2);
  211. EXPAND_ASSIST(k3, ktmp, k3tmp, k3, 255, 2);
  212. out[3] = _mm_aesenc_si128(out[3], k3);
  213. //Third Round
  214. EXPAND_ASSIST(k0, ktmp, k0tmp, k0, 255, 4);
  215. out[0] = _mm_aesenc_si128(out[0], k0);
  216. EXPAND_ASSIST(k1, ktmp, k1tmp, k1, 255, 4);
  217. out[1] = _mm_aesenc_si128(out[1], k1);
  218. EXPAND_ASSIST(k2, ktmp, k2tmp, k2, 255, 4);
  219. out[2] = _mm_aesenc_si128(out[2], k2);
  220. EXPAND_ASSIST(k3, ktmp, k3tmp, k3, 255, 4);
  221. out[3] = _mm_aesenc_si128(out[3], k3);
  222. //Fourth Round
  223. EXPAND_ASSIST(k0, ktmp, k0tmp, k0, 255, 8);
  224. out[0] = _mm_aesenc_si128(out[0], k0);
  225. EXPAND_ASSIST(k1, ktmp, k1tmp, k1, 255, 8);
  226. out[1] = _mm_aesenc_si128(out[1], k1);
  227. EXPAND_ASSIST(k2, ktmp, k2tmp, k2, 255, 8);
  228. out[2] = _mm_aesenc_si128(out[2], k2);
  229. EXPAND_ASSIST(k3, ktmp, k3tmp, k3, 255, 8);
  230. out[3] = _mm_aesenc_si128(out[3], k3);
  231. //Fifth Round
  232. EXPAND_ASSIST(k0, ktmp, k0tmp, k0, 255, 16);
  233. out[0] = _mm_aesenc_si128(out[0], k0);
  234. EXPAND_ASSIST(k1, ktmp, k1tmp, k1, 255, 16);
  235. out[1] = _mm_aesenc_si128(out[1], k1);
  236. EXPAND_ASSIST(k2, ktmp, k2tmp, k2, 255, 16);
  237. out[2] = _mm_aesenc_si128(out[2], k2);
  238. EXPAND_ASSIST(k3, ktmp, k3tmp, k3, 255, 16);
  239. out[3] = _mm_aesenc_si128(out[3], k3);
  240. //Sixth Round
  241. EXPAND_ASSIST(k0, ktmp, k0tmp, k0, 255, 32);
  242. out[0] = _mm_aesenc_si128(out[0], k0);
  243. EXPAND_ASSIST(k1, ktmp, k1tmp, k1, 255, 32);
  244. out[1] = _mm_aesenc_si128(out[1], k1);
  245. EXPAND_ASSIST(k2, ktmp, k2tmp, k2, 255, 32);
  246. out[2] = _mm_aesenc_si128(out[2], k2);
  247. EXPAND_ASSIST(k3, ktmp, k3tmp, k3, 255, 32);
  248. out[3] = _mm_aesenc_si128(out[3], k3);
  249. //Seventh Round
  250. EXPAND_ASSIST(k0, ktmp, k0tmp, k0, 255, 64);
  251. out[0] = _mm_aesenc_si128(out[0], k0);
  252. EXPAND_ASSIST(k1, ktmp, k1tmp, k1, 255, 64);
  253. out[1] = _mm_aesenc_si128(out[1], k1);
  254. EXPAND_ASSIST(k2, ktmp, k2tmp, k2, 255, 64);
  255. out[2] = _mm_aesenc_si128(out[2], k2);
  256. EXPAND_ASSIST(k3, ktmp, k3tmp, k3, 255, 64);
  257. out[3] = _mm_aesenc_si128(out[3], k3);
  258. //Eight Round
  259. EXPAND_ASSIST(k0, ktmp, k0tmp, k0, 255, 128);
  260. out[0] = _mm_aesenc_si128(out[0], k0);
  261. EXPAND_ASSIST(k1, ktmp, k1tmp, k1, 255, 128);
  262. out[1] = _mm_aesenc_si128(out[1], k1);
  263. EXPAND_ASSIST(k2, ktmp, k2tmp, k2, 255, 128);
  264. out[2] = _mm_aesenc_si128(out[2], k2);
  265. EXPAND_ASSIST(k3, ktmp, k3tmp, k3, 255, 128);
  266. out[3] = _mm_aesenc_si128(out[3], k3);
  267. //Ninth Round
  268. EXPAND_ASSIST(k0, ktmp, k0tmp, k0, 255, 27);
  269. out[0] = _mm_aesenc_si128(out[0], k0);
  270. EXPAND_ASSIST(k1, ktmp, k1tmp, k1, 255, 27);
  271. out[1] = _mm_aesenc_si128(out[1], k1);
  272. EXPAND_ASSIST(k2, ktmp, k2tmp, k2, 255, 27);
  273. out[2] = _mm_aesenc_si128(out[2], k2);
  274. EXPAND_ASSIST(k3, ktmp, k3tmp, k3, 255, 27);
  275. out[3] = _mm_aesenc_si128(out[3], k3);
  276. //Tenth Roundkey
  277. EXPAND_ASSIST(k0, ktmp, k0tmp, k0, 255, 54);
  278. out[0] = _mm_aesenclast_si128(out[0], k0);
  279. EXPAND_ASSIST(k1, ktmp, k1tmp, k1, 255, 54);
  280. out[1] = _mm_aesenclast_si128(out[1], k1);
  281. EXPAND_ASSIST(k2, ktmp, k2tmp, k2, 255, 54);
  282. out[2] = _mm_aesenclast_si128(out[2], k2);
  283. EXPAND_ASSIST(k3, ktmp, k3tmp, k3, 255, 54);
  284. out[3] = _mm_aesenclast_si128(out[3], k3);
  285. }
  286. void AES256_ecb_encrypt_blks_4_in_out_par_ks(block *in, block *out, const unsigned char* userkey) {
  287. unsigned int j, rnds = 14;
  288. //four keys for even and odd-numbered rounds as well as temporary keys
  289. block k0e, k1e, k2e, k3e, k0o, k1o, k2o, k3o, ktmp, k0tmp, k1tmp, k2tmp, k3tmp;
  290. /* __m128i x0, x1, x2, x3;
  291. aesKey->rd_key[0] = x0 = _mm_loadu_si128((block*)userkey);
  292. aesKey->rd_key[1] = x3 = _mm_loadu_si128((block*)(userkey + 16));
  293. x2 = _mm_setzero_si128();
  294. EXPAND_ASSIST(x0, x1, x2, x3, 255, 1); aesKey->rd_key[2] = x0;
  295. EXPAND_ASSIST(x3, x1, x2, x0, 170, 1); aesKey->rd_key[3] = x3;
  296. EXPAND_ASSIST(x0, x1, x2, x3, 255, 2); aesKey->rd_key[4] = x0;
  297. EXPAND_ASSIST(x3, x1, x2, x0, 170, 2); aesKey->rd_key[5] = x3;
  298. EXPAND_ASSIST(x0, x1, x2, x3, 255, 4); aesKey->rd_key[6] = x0;
  299. EXPAND_ASSIST(x3, x1, x2, x0, 170, 4); aesKey->rd_key[7] = x3;
  300. EXPAND_ASSIST(x0, x1, x2, x3, 255, 8); aesKey->rd_key[8] = x0;
  301. EXPAND_ASSIST(x3, x1, x2, x0, 170, 8); aesKey->rd_key[9] = x3;
  302. EXPAND_ASSIST(x0, x1, x2, x3, 255, 16); aesKey->rd_key[10] = x0;
  303. EXPAND_ASSIST(x3, x1, x2, x0, 170, 16); aesKey->rd_key[11] = x3;
  304. EXPAND_ASSIST(x0, x1, x2, x3, 255, 32); aesKey->rd_key[12] = x0;
  305. EXPAND_ASSIST(x3, x1, x2, x0, 170, 32); aesKey->rd_key[13] = x3;
  306. EXPAND_ASSIST(x0, x1, x2, x3, 255, 64); aesKey->rd_key[14] = x0;*/
  307. //Zero-th Round
  308. k0e = _mm_loadu_si128((block*)userkey);
  309. out[0] = _mm_xor_si128(in[0], k0e);
  310. k1e = _mm_loadu_si128((block*)(userkey+32));
  311. out[1] = _mm_xor_si128(in[1], k1e);
  312. k2e = _mm_loadu_si128((block*)(userkey+64));
  313. out[2] = _mm_xor_si128(in[2], k2e);
  314. k3e = _mm_loadu_si128((block*)(userkey+96));
  315. out[3] = _mm_xor_si128(in[3], k3e);
  316. k0tmp = _mm_setzero_si128();
  317. k1tmp = _mm_setzero_si128();
  318. k2tmp = _mm_setzero_si128();
  319. k3tmp = _mm_setzero_si128();
  320. //First Round
  321. k0o = _mm_loadu_si128((block*)(userkey+16));
  322. out[0] = _mm_aesenc_si128(out[0], k0o);
  323. k1o = _mm_loadu_si128((block*)(userkey+48));
  324. out[1] = _mm_aesenc_si128(out[1], k1o);
  325. k2o = _mm_loadu_si128((block*)(userkey+80));
  326. out[2] = _mm_aesenc_si128(out[2], k2o);
  327. k3o = _mm_loadu_si128((block*)(userkey+112));
  328. out[3] = _mm_aesenc_si128(out[3], k3o);
  329. //Second Round; even round: result is written in kie
  330. //EXPAND_ASSIST(x0, x1, x2, x3, 255, 1); aesKey->rd_key[2] = x0;
  331. EXPAND_ASSIST(k0e, ktmp, k0tmp, k0o, 255, 1);
  332. out[0] = _mm_aesenc_si128(out[0], k0e);
  333. EXPAND_ASSIST(k1e, ktmp, k1tmp, k1o, 255, 1);
  334. out[1] = _mm_aesenc_si128(out[1], k1e);
  335. EXPAND_ASSIST(k2e, ktmp, k2tmp, k2o, 255, 1);
  336. out[2] = _mm_aesenc_si128(out[2], k2e);
  337. EXPAND_ASSIST(k3e, ktmp, k3tmp, k3o, 255, 1);
  338. out[3] = _mm_aesenc_si128(out[3], k3e);
  339. //Third Round; odd round: result is written in kio
  340. //EXPAND_ASSIST(x3, x1, x2, x0, 170, 1); aesKey->rd_key[3] = x3;
  341. EXPAND_ASSIST(k0o, ktmp, k0tmp, k0e, 170, 1);
  342. out[0] = _mm_aesenc_si128(out[0], k0o);
  343. EXPAND_ASSIST(k1o, ktmp, k1tmp, k1e, 170, 1);
  344. out[1] = _mm_aesenc_si128(out[1], k1o);
  345. EXPAND_ASSIST(k2o, ktmp, k2tmp, k2e, 170, 1);
  346. out[2] = _mm_aesenc_si128(out[2], k2o);
  347. EXPAND_ASSIST(k3o, ktmp, k3tmp, k3e, 170, 1);
  348. out[3] = _mm_aesenc_si128(out[3], k3o);
  349. //Fourth Round; even round: result is written in kie
  350. //EXPAND_ASSIST(x0, x1, x2, x3, 255, 2); aesKey->rd_key[4] = x0;
  351. EXPAND_ASSIST(k0e, ktmp, k0tmp, k0o, 255, 2);
  352. out[0] = _mm_aesenc_si128(out[0], k0e);
  353. EXPAND_ASSIST(k1e, ktmp, k1tmp, k1o, 255, 2);
  354. out[1] = _mm_aesenc_si128(out[1], k1e);
  355. EXPAND_ASSIST(k2e, ktmp, k2tmp, k2o, 255, 2);
  356. out[2] = _mm_aesenc_si128(out[2], k2e);
  357. EXPAND_ASSIST(k3e, ktmp, k3tmp, k3o, 255, 2);
  358. out[3] = _mm_aesenc_si128(out[3], k3e);
  359. //Fifth Round; odd round: result is written in kio
  360. //EXPAND_ASSIST(x3, x1, x2, x0, 170, 2); aesKey->rd_key[5] = x3;
  361. EXPAND_ASSIST(k0o, ktmp, k0tmp, k0e, 170, 2);
  362. out[0] = _mm_aesenc_si128(out[0], k0o);
  363. EXPAND_ASSIST(k1o, ktmp, k1tmp, k1e, 170, 2);
  364. out[1] = _mm_aesenc_si128(out[1], k1o);
  365. EXPAND_ASSIST(k2o, ktmp, k2tmp, k2e, 170, 2);
  366. out[2] = _mm_aesenc_si128(out[2], k2o);
  367. EXPAND_ASSIST(k3o, ktmp, k3tmp, k3e, 170, 2);
  368. out[3] = _mm_aesenc_si128(out[3], k3o);
  369. //Sixth Round; even round: result is written in kie
  370. //EXPAND_ASSIST(x0, x1, x2, x3, 255, 4); aesKey->rd_key[6] = x0;
  371. EXPAND_ASSIST(k0e, ktmp, k0tmp, k0o, 255, 4);
  372. out[0] = _mm_aesenc_si128(out[0], k0e);
  373. EXPAND_ASSIST(k1e, ktmp, k1tmp, k1o, 255, 4);
  374. out[1] = _mm_aesenc_si128(out[1], k1e);
  375. EXPAND_ASSIST(k2e, ktmp, k2tmp, k2o, 255, 4);
  376. out[2] = _mm_aesenc_si128(out[2], k2e);
  377. EXPAND_ASSIST(k3e, ktmp, k3tmp, k3o, 255, 4);
  378. out[3] = _mm_aesenc_si128(out[3], k3e);
  379. //Seventh Round: result is written in kio
  380. //EXPAND_ASSIST(x3, x1, x2, x0, 170, 4); aesKey->rd_key[7] = x3;
  381. EXPAND_ASSIST(k0o, ktmp, k0tmp, k0e, 170, 4);
  382. out[0] = _mm_aesenc_si128(out[0], k0o);
  383. EXPAND_ASSIST(k1o, ktmp, k1tmp, k1e, 170, 4);
  384. out[1] = _mm_aesenc_si128(out[1], k1o);
  385. EXPAND_ASSIST(k2o, ktmp, k2tmp, k2e, 170, 4);
  386. out[2] = _mm_aesenc_si128(out[2], k2o);
  387. EXPAND_ASSIST(k3o, ktmp, k3tmp, k3e, 170, 4);
  388. out[3] = _mm_aesenc_si128(out[3], k3o);
  389. //Eigth Round; even round: result is written in kie
  390. //EXPAND_ASSIST(x0, x1, x2, x3, 255, 8); aesKey->rd_key[8] = x0;
  391. EXPAND_ASSIST(k0e, ktmp, k0tmp, k0o, 255, 8);
  392. out[0] = _mm_aesenc_si128(out[0], k0e);
  393. EXPAND_ASSIST(k1e, ktmp, k1tmp, k1o, 255, 8);
  394. out[1] = _mm_aesenc_si128(out[1], k1e);
  395. EXPAND_ASSIST(k2e, ktmp, k2tmp, k2o, 255, 8);
  396. out[2] = _mm_aesenc_si128(out[2], k2e);
  397. EXPAND_ASSIST(k3e, ktmp, k3tmp, k3o, 255, 8);
  398. out[3] = _mm_aesenc_si128(out[3], k3e);
  399. //Ninth Round: odd result is written in kio
  400. //EXPAND_ASSIST(x3, x1, x2, x0, 170, 8); aesKey->rd_key[9] = x3;
  401. EXPAND_ASSIST(k0o, ktmp, k0tmp, k0e, 170, 8);
  402. out[0] = _mm_aesenc_si128(out[0], k0o);
  403. EXPAND_ASSIST(k1o, ktmp, k1tmp, k1e, 170, 8);
  404. out[1] = _mm_aesenc_si128(out[1], k1o);
  405. EXPAND_ASSIST(k2o, ktmp, k2tmp, k2e, 170, 8);
  406. out[2] = _mm_aesenc_si128(out[2], k2o);
  407. EXPAND_ASSIST(k3o, ktmp, k3tmp, k3e, 170, 8);
  408. out[3] = _mm_aesenc_si128(out[3], k3o);
  409. //Tenth Round; even round: result is written in kie
  410. //EXPAND_ASSIST(x0, x1, x2, x3, 255, 16); aesKey->rd_key[10] = x0;
  411. EXPAND_ASSIST(k0e, ktmp, k0tmp, k0o, 255, 16);
  412. out[0] = _mm_aesenc_si128(out[0], k0e);
  413. EXPAND_ASSIST(k1e, ktmp, k1tmp, k1o, 255, 16);
  414. out[1] = _mm_aesenc_si128(out[1], k1e);
  415. EXPAND_ASSIST(k2e, ktmp, k2tmp, k2o, 255, 16);
  416. out[2] = _mm_aesenc_si128(out[2], k2e);
  417. EXPAND_ASSIST(k3e, ktmp, k3tmp, k3o, 255, 16);
  418. out[3] = _mm_aesenc_si128(out[3], k3e);
  419. //Eleventh Roundkey: odd result is written in kio
  420. //EXPAND_ASSIST(x3, x1, x2, x0, 170, 16); aesKey->rd_key[11] = x3;
  421. EXPAND_ASSIST(k0o, ktmp, k0tmp, k0e, 170, 16);
  422. out[0] = _mm_aesenc_si128(out[0], k0o);
  423. EXPAND_ASSIST(k1o, ktmp, k1tmp, k1e, 170, 16);
  424. out[1] = _mm_aesenc_si128(out[1], k1o);
  425. EXPAND_ASSIST(k2o, ktmp, k2tmp, k2e, 170, 16);
  426. out[2] = _mm_aesenc_si128(out[2], k2o);
  427. EXPAND_ASSIST(k3o, ktmp, k3tmp, k3e, 170, 16);
  428. out[3] = _mm_aesenc_si128(out[3], k3o);
  429. //Twelvth Roundkey; even round: result is written in kie
  430. //EXPAND_ASSIST(x0, x1, x2, x3, 255, 32); aesKey->rd_key[12] = x0;
  431. EXPAND_ASSIST(k0e, ktmp, k0tmp, k0o, 255, 32);
  432. out[0] = _mm_aesenc_si128(out[0], k0e);
  433. EXPAND_ASSIST(k1e, ktmp, k1tmp, k1o, 255, 32);
  434. out[1] = _mm_aesenc_si128(out[1], k1e);
  435. EXPAND_ASSIST(k2e, ktmp, k2tmp, k2o, 255, 32);
  436. out[2] = _mm_aesenc_si128(out[2], k2e);
  437. EXPAND_ASSIST(k3e, ktmp, k3tmp, k3o, 255, 32);
  438. out[3] = _mm_aesenc_si128(out[3], k3e);
  439. //Thirtheenth Roundkey: odd result is written in kio
  440. //EXPAND_ASSIST(x3, x1, x2, x0, 170, 32); aesKey->rd_key[13] = x3;
  441. EXPAND_ASSIST(k0o, ktmp, k0tmp, k0e, 170, 32);
  442. out[0] = _mm_aesenc_si128(out[0], k0o);
  443. EXPAND_ASSIST(k1o, ktmp, k1tmp, k1e, 170, 32);
  444. out[1] = _mm_aesenc_si128(out[1], k1o);
  445. EXPAND_ASSIST(k2o, ktmp, k2tmp, k2e, 170, 32);
  446. out[2] = _mm_aesenc_si128(out[2], k2o);
  447. EXPAND_ASSIST(k3o, ktmp, k3tmp, k3e, 170, 32);
  448. out[3] = _mm_aesenc_si128(out[3], k3o);
  449. //Fourteenth Roundkey; even round: result is written in kie
  450. //EXPAND_ASSIST(x0, x1, x2, x3, 255, 64); aesKey->rd_key[14] = x0;
  451. EXPAND_ASSIST(k0e, ktmp, k0tmp, k0o, 255, 64);
  452. out[0] = _mm_aesenclast_si128(out[0], k0e);
  453. EXPAND_ASSIST(k1e, ktmp, k1tmp, k1o, 255, 64);
  454. out[1] = _mm_aesenclast_si128(out[1], k1e);
  455. EXPAND_ASSIST(k2e, ktmp, k2tmp, k2o, 255, 64);
  456. out[2] = _mm_aesenclast_si128(out[2], k2e);
  457. EXPAND_ASSIST(k3e, ktmp, k3tmp, k3o, 255, 64);
  458. out[3] = _mm_aesenclast_si128(out[3], k3e);
  459. }
  460. void AES_ecb_encrypt_chunk_in_out(block *in, block *out, unsigned nblks, AES_KEY *aesKey) {
  461. int numberOfLoops = nblks / 8;
  462. int blocksPipeLined = numberOfLoops * 8;
  463. int remainingEncrypts = nblks - blocksPipeLined;
  464. unsigned j, rnds = ROUNDS(aesKey);
  465. const block *sched = ((block *)(aesKey->rd_key));
  466. for (int i = 0; i < numberOfLoops; i++){
  467. out[0 + i * 8] = _mm_xor_si128(in[0 + i * 8], sched[0]);
  468. out[1 + i * 8] = _mm_xor_si128(in[1 + i * 8], sched[0]);
  469. out[2 + i * 8] = _mm_xor_si128(in[2 + i * 8], sched[0]);
  470. out[3 + i * 8] = _mm_xor_si128(in[3 + i * 8], sched[0]);
  471. out[4 + i * 8] = _mm_xor_si128(in[4 + i * 8], sched[0]);
  472. out[5 + i * 8] = _mm_xor_si128(in[5 + i * 8], sched[0]);
  473. out[6 + i * 8] = _mm_xor_si128(in[6 + i * 8], sched[0]);
  474. out[7 + i * 8] = _mm_xor_si128(in[7 + i * 8], sched[0]);
  475. for (j = 1; j < rnds; ++j){
  476. out[0 + i * 8] = _mm_aesenc_si128(out[0 + i * 8], sched[j]);
  477. out[1 + i * 8] = _mm_aesenc_si128(out[1 + i * 8], sched[j]);
  478. out[2 + i * 8] = _mm_aesenc_si128(out[2 + i * 8], sched[j]);
  479. out[3 + i * 8] = _mm_aesenc_si128(out[3 + i * 8], sched[j]);
  480. out[4 + i * 8] = _mm_aesenc_si128(out[4 + i * 8], sched[j]);
  481. out[5 + i * 8] = _mm_aesenc_si128(out[5 + i * 8], sched[j]);
  482. out[6 + i * 8] = _mm_aesenc_si128(out[6 + i * 8], sched[j]);
  483. out[7 + i * 8] = _mm_aesenc_si128(out[7 + i * 8], sched[j]);
  484. }
  485. out[0 + i * 8] = _mm_aesenclast_si128(out[0 + i * 8], sched[j]);
  486. out[1 + i * 8] = _mm_aesenclast_si128(out[1 + i * 8], sched[j]);
  487. out[2 + i * 8] = _mm_aesenclast_si128(out[2 + i * 8], sched[j]);
  488. out[3 + i * 8] = _mm_aesenclast_si128(out[3 + i * 8], sched[j]);
  489. out[4 + i * 8] = _mm_aesenclast_si128(out[4 + i * 8], sched[j]);
  490. out[5 + i * 8] = _mm_aesenclast_si128(out[5 + i * 8], sched[j]);
  491. out[6 + i * 8] = _mm_aesenclast_si128(out[6 + i * 8], sched[j]);
  492. out[7 + i * 8] = _mm_aesenclast_si128(out[7 + i * 8], sched[j]);
  493. }
  494. for (int i = blocksPipeLined; i < blocksPipeLined + remainingEncrypts; ++i){
  495. out[i] = _mm_xor_si128(in[i], sched[0]);
  496. for (j = 1; j < rnds; ++j)
  497. {
  498. out[i] = _mm_aesenc_si128(out[i], sched[j]);
  499. }
  500. out[i] = _mm_aesenclast_si128(out[i], sched[j]);
  501. }
  502. }
  503. #endif