/* intrin_sequential_enc8.cpp */
/*
 * Copied and modified from Shay Gueron's intrin_sequential_ks4_enc8.cpp
 */
/********************************************************************/
/* Copyright(c) 2014, Intel Corp.                                   */
/* Developers and authors: Shay Gueron (1) (2)                      */
/* (1) University of Haifa, Israel                                  */
/* (2) Intel, Israel                                                */
/* IPG, Architecture, Israel Development Center, Haifa, Israel      */
/********************************************************************/

#include "intrin_sequential_enc8.h"

#ifdef USE_PIPELINED_AES_NI
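
/*
 * Key-schedule helpers. The macros below implement the AES-128 key expansion
 * with AES-NI, following Gueron's construction:
 *  - the key is byte-shuffled with `mask` (0x0c0f0e0d in every lane) so that
 *    each 32-bit lane holds RotWord(w[3]); because all four columns are then
 *    identical, the ShiftRows step inside AESENCLAST is a no-op, and
 *    _mm_aesenclast_si128(x2, con) leaves SubWord(RotWord(w[3])) XOR rcon in
 *    every lane;
 *  - KS_BLOCK turns the previous round key [w0,w1,w2,w3] into its prefix XOR
 *    [w0, w0^w1, w0^w1^w2, w0^w1^w2^w3] (the 64-bit shift plus the `con3`
 *    shuffle), then XORs in the lane-replicated value above, which is exactly
 *    the FIPS-197 recurrence for the next round key;
 *  - `con` starts at rcon = 1 and is doubled each round via _mm_slli_epi32;
 *    it is reloaded with 0x1b before round 9 because rcon reduces modulo the
 *    AES polynomial there (rounds 9 and 10 use 0x1b and 0x36).
 * Four keys (A..D) are expanded in parallel to keep the AESENCLAST pipeline busy.
 */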
#define KS_BLOCK(t, reg, reg2) { \
    globAux = _mm_slli_epi64(reg, 32); \
    reg = _mm_xor_si128(globAux, reg); \
    globAux = _mm_shuffle_epi8(reg, con3); \
    reg = _mm_xor_si128(globAux, reg); \
    reg = _mm_xor_si128(reg2, reg); \
}
#define KS_round(i) { \
    x2 = _mm_shuffle_epi8(keyA, mask); \
    keyA_aux = _mm_aesenclast_si128(x2, con); \
    KS_BLOCK(0, keyA, keyA_aux); \
    x2 = _mm_shuffle_epi8(keyB, mask); \
    keyB_aux = _mm_aesenclast_si128(x2, con); \
    KS_BLOCK(1, keyB, keyB_aux); \
    x2 = _mm_shuffle_epi8(keyC, mask); \
    keyC_aux = _mm_aesenclast_si128(x2, con); \
    KS_BLOCK(2, keyC, keyC_aux); \
    x2 = _mm_shuffle_epi8(keyD, mask); \
    keyD_aux = _mm_aesenclast_si128(x2, con); \
    KS_BLOCK(3, keyD, keyD_aux); \
    con = _mm_slli_epi32(con, 1); \
    _mm_storeu_si128((__m128i *)(keyptr[0].KEY + i*16), keyA); \
    _mm_storeu_si128((__m128i *)(keyptr[1].KEY + i*16), keyB); \
    _mm_storeu_si128((__m128i *)(keyptr[2].KEY + i*16), keyC); \
    _mm_storeu_si128((__m128i *)(keyptr[3].KEY + i*16), keyD); \
}
#define KS_round_last(i) { \
    x2 = _mm_shuffle_epi8(keyA, mask); \
    keyA_aux = _mm_aesenclast_si128(x2, con); \
    x2 = _mm_shuffle_epi8(keyB, mask); \
    keyB_aux = _mm_aesenclast_si128(x2, con); \
    x2 = _mm_shuffle_epi8(keyC, mask); \
    keyC_aux = _mm_aesenclast_si128(x2, con); \
    x2 = _mm_shuffle_epi8(keyD, mask); \
    keyD_aux = _mm_aesenclast_si128(x2, con); \
    KS_BLOCK(0, keyA, keyA_aux); \
    KS_BLOCK(1, keyB, keyB_aux); \
    KS_BLOCK(2, keyC, keyC_aux); \
    KS_BLOCK(3, keyD, keyD_aux); \
    _mm_storeu_si128((__m128i *)(keyptr[0].KEY + i*16), keyA); \
    _mm_storeu_si128((__m128i *)(keyptr[1].KEY + i*16), keyB); \
    _mm_storeu_si128((__m128i *)(keyptr[2].KEY + i*16), keyC); \
    _mm_storeu_si128((__m128i *)(keyptr[3].KEY + i*16), keyD); \
}
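
/*
 * Encryption helpers: READ_KEYS/ENC_round process eight independent streams,
 * each block under its own key. AESENC has a multi-cycle latency but sustains
 * at least one instruction per cycle on most cores, so interleaving eight
 * independent blocks hides that latency almost completely. ENC_round_last
 * uses AESENCLAST because the final AES round omits MixColumns.
 */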
#define READ_KEYS(i) { \
    keyA = _mm_loadu_si128((__m128i const*)(keyptr[0].KEY + i*16)); \
    keyB = _mm_loadu_si128((__m128i const*)(keyptr[1].KEY + i*16)); \
    keyC = _mm_loadu_si128((__m128i const*)(keyptr[2].KEY + i*16)); \
    keyD = _mm_loadu_si128((__m128i const*)(keyptr[3].KEY + i*16)); \
    keyE = _mm_loadu_si128((__m128i const*)(keyptr[4].KEY + i*16)); \
    keyF = _mm_loadu_si128((__m128i const*)(keyptr[5].KEY + i*16)); \
    keyG = _mm_loadu_si128((__m128i const*)(keyptr[6].KEY + i*16)); \
    keyH = _mm_loadu_si128((__m128i const*)(keyptr[7].KEY + i*16)); \
}
#define ENC_round(i) { \
    block1 = _mm_aesenc_si128(block1, *(__m128i const*)(keyptr[0].KEY + i*16)); \
    block2 = _mm_aesenc_si128(block2, *(__m128i const*)(keyptr[1].KEY + i*16)); \
    block3 = _mm_aesenc_si128(block3, *(__m128i const*)(keyptr[2].KEY + i*16)); \
    block4 = _mm_aesenc_si128(block4, *(__m128i const*)(keyptr[3].KEY + i*16)); \
    block5 = _mm_aesenc_si128(block5, *(__m128i const*)(keyptr[4].KEY + i*16)); \
    block6 = _mm_aesenc_si128(block6, *(__m128i const*)(keyptr[5].KEY + i*16)); \
    block7 = _mm_aesenc_si128(block7, *(__m128i const*)(keyptr[6].KEY + i*16)); \
    block8 = _mm_aesenc_si128(block8, *(__m128i const*)(keyptr[7].KEY + i*16)); \
}
#define ENC_round_last(i) { \
    block1 = _mm_aesenclast_si128(block1, *(__m128i const*)(keyptr[0].KEY + i*16)); \
    block2 = _mm_aesenclast_si128(block2, *(__m128i const*)(keyptr[1].KEY + i*16)); \
    block3 = _mm_aesenclast_si128(block3, *(__m128i const*)(keyptr[2].KEY + i*16)); \
    block4 = _mm_aesenclast_si128(block4, *(__m128i const*)(keyptr[3].KEY + i*16)); \
    block5 = _mm_aesenclast_si128(block5, *(__m128i const*)(keyptr[4].KEY + i*16)); \
    block6 = _mm_aesenclast_si128(block6, *(__m128i const*)(keyptr[5].KEY + i*16)); \
    block7 = _mm_aesenclast_si128(block7, *(__m128i const*)(keyptr[6].KEY + i*16)); \
    block8 = _mm_aesenclast_si128(block8, *(__m128i const*)(keyptr[7].KEY + i*16)); \
}
/* Single-stream variants of the macros above, used for tail keys when the
   key count is not a multiple of 4 (key schedule) or 8 (encryption). */
#define KS1_BLOCK(t, reg, reg2) { \
    globAux = _mm_slli_epi64(reg, 32); \
    reg = _mm_xor_si128(globAux, reg); \
    globAux = _mm_shuffle_epi8(reg, con3); \
    reg = _mm_xor_si128(globAux, reg); \
    reg = _mm_xor_si128(reg2, reg); \
}
#define KS1_round(i) { \
    x2 = _mm_shuffle_epi8(keyA, mask); \
    keyA_aux = _mm_aesenclast_si128(x2, con); \
    KS1_BLOCK(0, keyA, keyA_aux); \
    con = _mm_slli_epi32(con, 1); \
    _mm_storeu_si128((__m128i *)(keyptr[0].KEY + i*16), keyA); \
}
#define KS1_round_last(i) { \
    x2 = _mm_shuffle_epi8(keyA, mask); \
    keyA_aux = _mm_aesenclast_si128(x2, con); \
    KS1_BLOCK(0, keyA, keyA_aux); \
    _mm_storeu_si128((__m128i *)(keyptr[0].KEY + i*16), keyA); \
}
#define READ_KEYS1(i) { \
    keyA = _mm_loadu_si128((__m128i const*)(keyptr[0].KEY + i*16)); \
}
#define ENC1_round(i) { \
    block1 = _mm_aesenc_si128(block1, *(__m128i const*)(keyptr[0].KEY + i*16)); \
}
#define ENC1_round_last(i) { \
    block1 = _mm_aesenclast_si128(block1, *(__m128i const*)(keyptr[0].KEY + i*16)); \
}
// generates nkeys round keys from the bytes stored in key_bytes
void intrin_sequential_ks4(ROUND_KEYS* ks, unsigned char* key_bytes, int nkeys) {
    ROUND_KEYS *keyptr = (ROUND_KEYS *)ks;
    register __m128i keyA, keyB, keyC, keyD, con, mask, x2, keyA_aux, keyB_aux, keyC_aux, keyD_aux, globAux;
    int i;
    int _con1[4] = {1, 1, 1, 1};
    int _con2[4] = {0x1b, 0x1b, 0x1b, 0x1b};
    int _mask[4] = {0x0c0f0e0d, 0x0c0f0e0d, 0x0c0f0e0d, 0x0c0f0e0d};
    unsigned int _con3[4] = {0xffffffff, 0xffffffff, 0x07060504, 0x07060504};
    __m128i con3 = _mm_loadu_si128((__m128i const*)_con3);
    int lim = (nkeys/4)*4;

    // expand four keys per iteration
    for (i = 0; i < lim; i += 4) {
        keyptr[0].nr = 10;
        keyptr[1].nr = 10;
        keyptr[2].nr = 10;
        keyptr[3].nr = 10;
        keyA = _mm_loadu_si128((__m128i const*)(key_bytes));
        keyB = _mm_loadu_si128((__m128i const*)(key_bytes + 16));
        keyC = _mm_loadu_si128((__m128i const*)(key_bytes + 32));
        keyD = _mm_loadu_si128((__m128i const*)(key_bytes + 48));
        // round key 0 is the key itself
        _mm_storeu_si128((__m128i *)keyptr[0].KEY, keyA);
        _mm_storeu_si128((__m128i *)keyptr[1].KEY, keyB);
        _mm_storeu_si128((__m128i *)keyptr[2].KEY, keyC);
        _mm_storeu_si128((__m128i *)keyptr[3].KEY, keyD);
        con = _mm_loadu_si128((__m128i const*)_con1);
        mask = _mm_loadu_si128((__m128i const*)_mask);
        KS_round(1)
        KS_round(2)
        KS_round(3)
        KS_round(4)
        KS_round(5)
        KS_round(6)
        KS_round(7)
        KS_round(8)
        // rcon wraps modulo the AES polynomial for the last two rounds
        con = _mm_loadu_si128((__m128i const*)_con2);
        KS_round(9)
        KS_round_last(10)
        keyptr += 4;
        key_bytes += 64;
    }
    // tail: expand the remaining keys one at a time
    for (; i < nkeys; i++) {
        keyptr[0].nr = 10;
        keyA = _mm_loadu_si128((__m128i const*)(key_bytes));
        _mm_storeu_si128((__m128i *)keyptr[0].KEY, keyA);
        con = _mm_loadu_si128((__m128i const*)_con1);
        mask = _mm_loadu_si128((__m128i const*)_mask);
        KS1_round(1)
        KS1_round(2)
        KS1_round(3)
        KS1_round(4)
        KS1_round(5)
        KS1_round(6)
        KS1_round(7)
        KS1_round(8)
        con = _mm_loadu_si128((__m128i const*)_con2);
        KS1_round(9)
        KS1_round_last(10)
        keyptr++;
        key_bytes += 16;
    }
}
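
/*
 * Usage sketch (hypothetical caller). ROUND_KEYS must provide at least
 * 11*16 = 176 bytes in KEY (round keys 0..10 are stored above) plus the
 * nr field:
 *
 *   ROUND_KEYS ks[8];
 *   unsigned char key_bytes[8 * 16];   // eight 128-bit keys, back to back
 *   // ... fill key_bytes with key material ...
 *   intrin_sequential_ks4(ks, key_bytes, 8);
 */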
void intrin_sequential_enc8(const unsigned char* PT, unsigned char* CT, int n_aesiters, int nkeys, ROUND_KEYS* ks) {
    ROUND_KEYS *keyptr = (ROUND_KEYS *)ks;
    register __m128i keyA, keyB, keyC, keyD, keyE, keyF, keyG, keyH;
    int i, j;

    // note: nkeys is assumed to be a multiple of 8 here; each pass reads
    // keyptr[0..7] and there is no single-stream tail loop
    for (i = 0; i < nkeys; i += 8) {
        for (j = 0; j < n_aesiters; j++) {
            // eight consecutive 16-byte blocks, one per key stream
            register __m128i block1 = _mm_loadu_si128((__m128i const*)(PT + 0*16));
            register __m128i block2 = _mm_loadu_si128((__m128i const*)(PT + 1*16));
            register __m128i block3 = _mm_loadu_si128((__m128i const*)(PT + 2*16));
            register __m128i block4 = _mm_loadu_si128((__m128i const*)(PT + 3*16));
            register __m128i block5 = _mm_loadu_si128((__m128i const*)(PT + 4*16));
            register __m128i block6 = _mm_loadu_si128((__m128i const*)(PT + 5*16));
            register __m128i block7 = _mm_loadu_si128((__m128i const*)(PT + 6*16));
            register __m128i block8 = _mm_loadu_si128((__m128i const*)(PT + 7*16));
            READ_KEYS(0)
            // whitening: XOR in round key 0
            block1 = _mm_xor_si128(keyA, block1);
            block2 = _mm_xor_si128(keyB, block2);
            block3 = _mm_xor_si128(keyC, block3);
            block4 = _mm_xor_si128(keyD, block4);
            block5 = _mm_xor_si128(keyE, block5);
            block6 = _mm_xor_si128(keyF, block6);
            block7 = _mm_xor_si128(keyG, block7);
            block8 = _mm_xor_si128(keyH, block8);
            ENC_round(1)
            ENC_round(2)
            ENC_round(3)
            ENC_round(4)
            ENC_round(5)
            ENC_round(6)
            ENC_round(7)
            ENC_round(8)
            ENC_round(9)
            ENC_round_last(10)
            _mm_storeu_si128((__m128i *)(CT + 0*16), block1);
            _mm_storeu_si128((__m128i *)(CT + 1*16), block2);
            _mm_storeu_si128((__m128i *)(CT + 2*16), block3);
            _mm_storeu_si128((__m128i *)(CT + 3*16), block4);
            _mm_storeu_si128((__m128i *)(CT + 4*16), block5);
            _mm_storeu_si128((__m128i *)(CT + 5*16), block6);
            _mm_storeu_si128((__m128i *)(CT + 6*16), block7);
            _mm_storeu_si128((__m128i *)(CT + 7*16), block8);
            PT += 128;
            CT += 128;
        }
        keyptr += 8;
    }
}
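
/*
 * Data layout note: the eight streams are interleaved block by block. For
 * nkeys == 8, plaintext block j of stream s (0 <= s < 8) is read from
 * PT + (j*8 + s)*16 and its ciphertext is written to CT + (j*8 + s)*16, so
 * PT and CT must each hold n_aesiters*128 bytes per group of eight keys.
 */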
void intrin_sequential_gen_rnd8(unsigned char* ctr_buf, const unsigned long long ctr, unsigned char* CT,
        int n_aesiters, int nkeys, ROUND_KEYS* ks) {
    ROUND_KEYS *keyptr = (ROUND_KEYS *)ks;
    register __m128i keyA, keyB, keyC, keyD, keyE, keyF, keyG, keyH;
    unsigned char *ctptr;
    int i, j, ctoffset;
    // the counter occupies the first 8 bytes of the 16-byte ctr_buf; the
    // upper 8 bytes stay as the caller initialized them
    unsigned long long* tmpctr = (unsigned long long*)ctr_buf;
    ctoffset = n_aesiters * 16;     // output bytes generated per key
    register __m128i inblock, block1, block2, block3, block4, block5, block6, block7, block8;
    int lim = (nkeys/8)*8;

    // eight keys at a time; every key encrypts the same counter sequence
    // ctr+1 .. ctr+n_aesiters (the counter is incremented before each block)
    for (i = 0; i < lim; i += 8) {
        ctptr = CT + i*ctoffset;
        (*tmpctr) = ctr;
        for (j = 0; j < n_aesiters; j++) {
            (*tmpctr)++;
            inblock = _mm_loadu_si128((__m128i const*)(ctr_buf));
            READ_KEYS(0)
            block1 = _mm_xor_si128(keyA, inblock);
            block2 = _mm_xor_si128(keyB, inblock);
            block3 = _mm_xor_si128(keyC, inblock);
            block4 = _mm_xor_si128(keyD, inblock);
            block5 = _mm_xor_si128(keyE, inblock);
            block6 = _mm_xor_si128(keyF, inblock);
            block7 = _mm_xor_si128(keyG, inblock);
            block8 = _mm_xor_si128(keyH, inblock);
            ENC_round(1)
            ENC_round(2)
            ENC_round(3)
            ENC_round(4)
            ENC_round(5)
            ENC_round(6)
            ENC_round(7)
            ENC_round(8)
            ENC_round(9)
            ENC_round_last(10)
            // key i+k writes its contiguous output stream at CT + (i+k)*ctoffset
            _mm_storeu_si128((__m128i *)(ctptr + 0*ctoffset), block1);
            _mm_storeu_si128((__m128i *)(ctptr + 1*ctoffset), block2);
            _mm_storeu_si128((__m128i *)(ctptr + 2*ctoffset), block3);
            _mm_storeu_si128((__m128i *)(ctptr + 3*ctoffset), block4);
            _mm_storeu_si128((__m128i *)(ctptr + 4*ctoffset), block5);
            _mm_storeu_si128((__m128i *)(ctptr + 5*ctoffset), block6);
            _mm_storeu_si128((__m128i *)(ctptr + 6*ctoffset), block7);
            _mm_storeu_si128((__m128i *)(ctptr + 7*ctoffset), block8);
            ctptr += 16;
        }
        keyptr += 8;
    }
    // tail: remaining keys one stream at a time
    for (; i < nkeys; i++) {
        ctptr = CT + i*ctoffset;
        (*tmpctr) = ctr;
        for (j = 0; j < n_aesiters; j++) {
            (*tmpctr)++;
            inblock = _mm_loadu_si128((__m128i const*)(ctr_buf));
            READ_KEYS1(0)
            block1 = _mm_xor_si128(keyA, inblock);
            ENC1_round(1)
            ENC1_round(2)
            ENC1_round(3)
            ENC1_round(4)
            ENC1_round(5)
            ENC1_round(6)
            ENC1_round(7)
            ENC1_round(8)
            ENC1_round(9)
            ENC1_round_last(10)
            _mm_storeu_si128((__m128i *)(ctptr), block1);
            ctptr += 16;
        }
        keyptr++;
    }
}
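
/*
 * Usage sketch (hypothetical caller; N_ITERS and buffer sizes are
 * illustrative). Generates n_aesiters*16 bytes of AES-CTR output per key;
 * the output of key k is the contiguous region CT + k*n_aesiters*16.
 * ctr_buf is a caller-supplied 16-byte scratch block: the low 8 bytes hold
 * the running counter, the high 8 bytes can carry a per-call nonce.
 *
 *   enum { N_ITERS = 1024 };
 *   ROUND_KEYS ks[4];
 *   // ... expand four keys with intrin_sequential_ks4 ...
 *   unsigned char ctr_buf[16] = {0};
 *   unsigned char out[4 * N_ITERS * 16];
 *   intrin_sequential_gen_rnd8(ctr_buf, 0ULL, out, N_ITERS, 4, ks);
 */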
#endif /* USE_PIPELINED_AES_NI */