123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354 |
#include "intrin_sequential_enc8.h"
#ifdef USE_PIPELINED_AES_NI

/*
 * Pipelined AES-128 primitives built on AES-NI intrinsics.
 *
 * The macros below interleave the same operation across several independent
 * keys/blocks (4 lanes for key expansion, 8 lanes for encryption) so that the
 * latency of aesenc/aesenclast is hidden by instruction-level parallelism.
 * The *1-suffixed variants are single-lane versions used for remainder keys.
 *
 * They deliberately expand in the caller's scope and rely on locals named
 * keyA..keyH, keyA_aux..keyD_aux, con, con3, mask, x2, globAux, keyptr and
 * block1..block8 being declared there.
 */

/* One key-schedule mixing step: XOR-folds the shifted previous round key into
 * the aesenclast-derived word (reg2), leaving the next round key in reg.
 * NOTE(review): the 't' argument is unused. */
#define KS_BLOCK(t, reg, reg2) {globAux=_mm_slli_epi64(reg, 32);\
reg=_mm_xor_si128(globAux, reg);\
globAux=_mm_shuffle_epi8(reg, con3);\
reg=_mm_xor_si128(globAux, reg);\
reg=_mm_xor_si128(reg2, reg);\
}

/* Derive round key i for four keys at once (keyA..keyD): RotWord/SubWord via
 * shuffle + aesenclast, mix with KS_BLOCK, double the round constant, then
 * store each lane's new round key into keyptr[0..3].KEY at offset i*16. */
#define KS_round(i) { x2 =_mm_shuffle_epi8(keyA, mask); \
keyA_aux=_mm_aesenclast_si128 (x2, con); \
KS_BLOCK(0, keyA, keyA_aux);\
x2 =_mm_shuffle_epi8(keyB, mask); \
keyB_aux=_mm_aesenclast_si128 (x2, con); \
KS_BLOCK(1, keyB, keyB_aux);\
x2 =_mm_shuffle_epi8(keyC, mask); \
keyC_aux=_mm_aesenclast_si128 (x2, con); \
KS_BLOCK(2, keyC, keyC_aux);\
x2 =_mm_shuffle_epi8(keyD, mask); \
keyD_aux=_mm_aesenclast_si128 (x2, con); \
KS_BLOCK(3, keyD, keyD_aux);\
con=_mm_slli_epi32(con, 1);\
_mm_storeu_si128((__m128i *)(keyptr[0].KEY+i*16), keyA);\
_mm_storeu_si128((__m128i *)(keyptr[1].KEY+i*16), keyB); \
_mm_storeu_si128((__m128i *)(keyptr[2].KEY+i*16), keyC); \
_mm_storeu_si128((__m128i *)(keyptr[3].KEY+i*16), keyD); \
}

/* Same as KS_round but without doubling the round constant (final round). */
#define KS_round_last(i) { x2 =_mm_shuffle_epi8(keyA, mask); \
keyA_aux=_mm_aesenclast_si128 (x2, con); \
x2 =_mm_shuffle_epi8(keyB, mask); \
keyB_aux=_mm_aesenclast_si128 (x2, con); \
x2 =_mm_shuffle_epi8(keyC, mask); \
keyC_aux=_mm_aesenclast_si128 (x2, con); \
x2 =_mm_shuffle_epi8(keyD, mask); \
keyD_aux=_mm_aesenclast_si128 (x2, con); \
KS_BLOCK(0, keyA, keyA_aux);\
KS_BLOCK(1, keyB, keyB_aux);\
KS_BLOCK(2, keyC, keyC_aux);\
KS_BLOCK(3, keyD, keyD_aux);\
_mm_storeu_si128((__m128i *)(keyptr[0].KEY+i*16), keyA);\
_mm_storeu_si128((__m128i *)(keyptr[1].KEY+i*16), keyB); \
_mm_storeu_si128((__m128i *)(keyptr[2].KEY+i*16), keyC); \
_mm_storeu_si128((__m128i *)(keyptr[3].KEY+i*16), keyD); \
}

/* Load round key i of eight key schedules into keyA..keyH. */
#define READ_KEYS(i) {keyA = _mm_loadu_si128((__m128i const*)(keyptr[0].KEY+i*16));\
keyB = _mm_loadu_si128((__m128i const*)(keyptr[1].KEY+i*16));\
keyC = _mm_loadu_si128((__m128i const*)(keyptr[2].KEY+i*16));\
keyD = _mm_loadu_si128((__m128i const*)(keyptr[3].KEY+i*16));\
keyE = _mm_loadu_si128((__m128i const*)(keyptr[4].KEY+i*16));\
keyF = _mm_loadu_si128((__m128i const*)(keyptr[5].KEY+i*16));\
keyG = _mm_loadu_si128((__m128i const*)(keyptr[6].KEY+i*16));\
keyH = _mm_loadu_si128((__m128i const*)(keyptr[7].KEY+i*16));\
}

/* One AES round on eight blocks, each under its own key schedule's round i. */
#define ENC_round(i) {block1=_mm_aesenc_si128(block1, (*(__m128i const*)(keyptr[0].KEY+i*16))); \
block2=_mm_aesenc_si128(block2, (*(__m128i const*)(keyptr[1].KEY+i*16))); \
block3=_mm_aesenc_si128(block3, (*(__m128i const*)(keyptr[2].KEY+i*16))); \
block4=_mm_aesenc_si128(block4, (*(__m128i const*)(keyptr[3].KEY+i*16))); \
block5=_mm_aesenc_si128(block5, (*(__m128i const*)(keyptr[4].KEY+i*16))); \
block6=_mm_aesenc_si128(block6, (*(__m128i const*)(keyptr[5].KEY+i*16))); \
block7=_mm_aesenc_si128(block7, (*(__m128i const*)(keyptr[6].KEY+i*16))); \
block8=_mm_aesenc_si128(block8, (*(__m128i const*)(keyptr[7].KEY+i*16))); \
}

/* Final AES round (no MixColumns) on eight blocks. */
#define ENC_round_last(i) {block1=_mm_aesenclast_si128(block1, (*(__m128i const*)(keyptr[0].KEY+i*16))); \
block2=_mm_aesenclast_si128(block2, (*(__m128i const*)(keyptr[1].KEY+i*16))); \
block3=_mm_aesenclast_si128(block3, (*(__m128i const*)(keyptr[2].KEY+i*16))); \
block4=_mm_aesenclast_si128(block4, (*(__m128i const*)(keyptr[3].KEY+i*16))); \
block5=_mm_aesenclast_si128(block5, (*(__m128i const*)(keyptr[4].KEY+i*16))); \
block6=_mm_aesenclast_si128(block6, (*(__m128i const*)(keyptr[5].KEY+i*16))); \
block7=_mm_aesenclast_si128(block7, (*(__m128i const*)(keyptr[6].KEY+i*16))); \
block8=_mm_aesenclast_si128(block8, (*(__m128i const*)(keyptr[7].KEY+i*16))); \
}

/* Single-lane versions of the above, used for the remainder keys when the
 * key count is not a multiple of the lane width. */

/* Single-lane key-schedule mixing step (identical to KS_BLOCK). */
#define KS1_BLOCK(t, reg, reg2) {globAux=_mm_slli_epi64(reg, 32);\
reg=_mm_xor_si128(globAux, reg);\
globAux=_mm_shuffle_epi8(reg, con3);\
reg=_mm_xor_si128(globAux, reg);\
reg=_mm_xor_si128(reg2, reg);\
}

/* Derive round key i for a single key (keyA), doubling the round constant. */
#define KS1_round(i) { x2 =_mm_shuffle_epi8(keyA, mask); \
keyA_aux=_mm_aesenclast_si128 (x2, con); \
KS1_BLOCK(0, keyA, keyA_aux);\
con=_mm_slli_epi32(con, 1);\
_mm_storeu_si128((__m128i *)(keyptr[0].KEY+i*16), keyA);\
}

/* Final single-key key-schedule round (round constant left unchanged). */
#define KS1_round_last(i) { x2 =_mm_shuffle_epi8(keyA, mask); \
keyA_aux=_mm_aesenclast_si128 (x2, con); \
KS1_BLOCK(0, keyA, keyA_aux);\
_mm_storeu_si128((__m128i *)(keyptr[0].KEY+i*16), keyA);\
}

/* Load round key i of a single key schedule into keyA. */
#define READ_KEYS1(i) {keyA = _mm_loadu_si128((__m128i const*)(keyptr[0].KEY+i*16));\
}

/* One AES round on a single block. */
#define ENC1_round(i) {block1=_mm_aesenc_si128(block1, (*(__m128i const*)(keyptr[0].KEY+i*16))); \
}

/* Final AES round (no MixColumns) on a single block. */
#define ENC1_round_last(i) {block1=_mm_aesenclast_si128(block1, (*(__m128i const*)(keyptr[0].KEY+i*16))); \
}
/*
 * Expands nkeys raw AES-128 keys into full round-key schedules.
 *
 * key_bytes holds nkeys consecutive 16-byte keys; the expanded schedule for
 * key k is written to ks[k].KEY (11 round keys of 16 bytes each) and
 * ks[k].nr is set to 10.  Keys are expanded four at a time via the KS_*
 * macros; a single-key tail loop handles the remaining nkeys % 4 keys.
 */
void intrin_sequential_ks4(ROUND_KEYS* ks, unsigned char* key_bytes, int nkeys) {
	ROUND_KEYS *keyptr=(ROUND_KEYS *)ks;
	register __m128i keyA, keyB, keyC, keyD, con, mask, x2, keyA_aux, keyB_aux, keyC_aux, keyD_aux, globAux;
	int i;
	/* Round-constant seeds: rounds 1-8 start from 0x01 and are doubled each
	 * round; rounds 9-10 restart from 0x1b (rcon after 0x80 wraps mod the
	 * AES polynomial). */
	int _con1[4]={1,1,1,1};
	int _con2[4]={0x1b,0x1b,0x1b,0x1b};
	/* Byte shuffle implementing RotWord on the high word, replicated so that
	 * aesenclast performs SubWord on it in every lane. */
	int _mask[4]={0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d};
	/* Shuffle control for KS_BLOCK's key folding.  NOTE(review):
	 * 0x0ffffffff does not fit in a 32-bit int; the conversion to int is
	 * implementation-defined (yields 0xffffffff on the targeted
	 * two's-complement compilers) — worth confirming. */
	int _con3[4]={0x0ffffffff, 0x0ffffffff, 0x07060504, 0x07060504};
	__m128i con3=_mm_loadu_si128((__m128i const*)_con3);
	int lim = (nkeys/4)*4;  /* largest multiple of 4 <= nkeys */
	/* Main loop: expand four keys per iteration. */
	for (i=0;i<lim;i+=4){
		keyptr[0].nr=10;  /* AES-128 uses 10 rounds */
		keyptr[1].nr=10;
		keyptr[2].nr=10;
		keyptr[3].nr=10;
		keyA = _mm_loadu_si128((__m128i const*)(key_bytes));
		keyB = _mm_loadu_si128((__m128i const*)(key_bytes+16));
		keyC = _mm_loadu_si128((__m128i const*)(key_bytes+32));
		keyD = _mm_loadu_si128((__m128i const*)(key_bytes+48));
		/* Round key 0 is the raw key itself. */
		_mm_storeu_si128((__m128i *)keyptr[0].KEY, keyA);
		_mm_storeu_si128((__m128i *)keyptr[1].KEY, keyB);
		_mm_storeu_si128((__m128i *)keyptr[2].KEY, keyC);
		_mm_storeu_si128((__m128i *)keyptr[3].KEY, keyD);
		con = _mm_loadu_si128((__m128i const*)_con1);
		mask = _mm_loadu_si128((__m128i const*)_mask);
		KS_round(1)
		KS_round(2)
		KS_round(3)
		KS_round(4)
		KS_round(5)
		KS_round(6)
		KS_round(7)
		KS_round(8)
		/* Switch to the 0x1b round constant for the last two rounds. */
		con = _mm_loadu_si128((__m128i const*)_con2);
		KS_round(9)
		KS_round_last(10)
		keyptr+=4;
		key_bytes+=64;
	}
	/* Tail loop: expand the remaining keys one at a time. */
	for(; i<nkeys; i++) {
		keyptr[0].nr=10;
		keyA = _mm_loadu_si128((__m128i const*)(key_bytes));
		_mm_storeu_si128((__m128i *)keyptr[0].KEY, keyA);
		con = _mm_loadu_si128((__m128i const*)_con1);
		mask = _mm_loadu_si128((__m128i const*)_mask);
		KS1_round(1)
		KS1_round(2)
		KS1_round(3)
		KS1_round(4)
		KS1_round(5)
		KS1_round(6)
		KS1_round(7)
		KS1_round(8)
		con = _mm_loadu_si128((__m128i const*)_con2);
		KS1_round(9)
		KS1_round_last(10)
		keyptr++;
		key_bytes+=16;
	}
}
- void intrin_sequential_enc8(const unsigned char* PT, unsigned char* CT, int n_aesiters, int nkeys, ROUND_KEYS* ks){
- ROUND_KEYS *keyptr=(ROUND_KEYS *)ks;
- register __m128i keyA, keyB, keyC, keyD, keyE, keyF, keyG, keyH, con, mask, x2, keyA_aux, keyB_aux, keyC_aux, keyD_aux, globAux;
- unsigned char *ptptr, ctptr;
- int i, j, ptoffset, ctoffset;
- ctoffset = n_aesiters * 16;
- for (i=0;i<nkeys;i+=8){
- for(j=0;j<n_aesiters; j++) {
- register __m128i block1 = _mm_loadu_si128((__m128i const*)(0*16+PT));
- register __m128i block2 = _mm_loadu_si128((__m128i const*)(1*16+PT));
- register __m128i block3 = _mm_loadu_si128((__m128i const*)(2*16+PT));
- register __m128i block4 = _mm_loadu_si128((__m128i const*)(3*16+PT));
- register __m128i block5 = _mm_loadu_si128((__m128i const*)(4*16+PT));
- register __m128i block6 = _mm_loadu_si128((__m128i const*)(5*16+PT));
- register __m128i block7 = _mm_loadu_si128((__m128i const*)(6*16+PT));
- register __m128i block8 = _mm_loadu_si128((__m128i const*)(7*16+PT));
- READ_KEYS(0)
- block1 = _mm_xor_si128(keyA, block1);
- block2 = _mm_xor_si128(keyB, block2);
- block3 = _mm_xor_si128(keyC, block3);
- block4 = _mm_xor_si128(keyD, block4);
- block5 = _mm_xor_si128(keyE, block5);
- block6 = _mm_xor_si128(keyF, block6);
- block7 = _mm_xor_si128(keyG, block7);
- block8 = _mm_xor_si128(keyH, block8);
- ENC_round(1)
- ENC_round(2)
- ENC_round(3)
- ENC_round(4)
- ENC_round(5)
- ENC_round(6)
- ENC_round(7)
- ENC_round(8)
- ENC_round(9)
- ENC_round_last(10)
- _mm_storeu_si128((__m128i *)(CT+0*16), block1);
- _mm_storeu_si128((__m128i *)(CT+1*16), block2);
- _mm_storeu_si128((__m128i *)(CT+2*16), block3);
- _mm_storeu_si128((__m128i *)(CT+3*16), block4);
- _mm_storeu_si128((__m128i *)(CT+4*16), block5);
- _mm_storeu_si128((__m128i *)(CT+5*16), block6);
- _mm_storeu_si128((__m128i *)(CT+6*16), block7);
- _mm_storeu_si128((__m128i *)(CT+7*16), block8);
- PT+=128;
- CT+=128;
- }
- keyptr+=8;
- }
- }
- void intrin_sequential_gen_rnd8(unsigned char* ctr_buf, const unsigned long long ctr, unsigned char* CT,
- int n_aesiters, int nkeys, ROUND_KEYS* ks){
- ROUND_KEYS *keyptr=(ROUND_KEYS *)ks;
- register __m128i keyA, keyB, keyC, keyD, keyE, keyF, keyG, keyH, con, mask, x2, keyA_aux, keyB_aux, keyC_aux, keyD_aux, globAux;
- unsigned char *ctptr;
- int i, j, ctoffset;
- unsigned long long* tmpctr = (unsigned long long*) ctr_buf;
- ctoffset = n_aesiters * 16;
- register __m128i inblock, block1, block2, block3, block4, block5, block6, block7, block8;
- int lim = (nkeys/8)*8;
- for (i=0;i<lim;i+=8){
- ctptr=CT + i*ctoffset;
- (*tmpctr) = ctr;
- for(j=0;j<n_aesiters; j++) {
- (*tmpctr)++;
- inblock = _mm_loadu_si128((__m128i const*)(ctr_buf));
- READ_KEYS(0)
- block1 = _mm_xor_si128(keyA, inblock);
- block2 = _mm_xor_si128(keyB, inblock);
- block3 = _mm_xor_si128(keyC, inblock);
- block4 = _mm_xor_si128(keyD, inblock);
- block5 = _mm_xor_si128(keyE, inblock);
- block6 = _mm_xor_si128(keyF, inblock);
- block7 = _mm_xor_si128(keyG, inblock);
- block8 = _mm_xor_si128(keyH, inblock);
- ENC_round(1)
- ENC_round(2)
- ENC_round(3)
- ENC_round(4)
- ENC_round(5)
- ENC_round(6)
- ENC_round(7)
- ENC_round(8)
- ENC_round(9)
- ENC_round_last(10)
- _mm_storeu_si128((__m128i *)(ctptr+0*ctoffset), block1);
- _mm_storeu_si128((__m128i *)(ctptr+1*ctoffset), block2);
- _mm_storeu_si128((__m128i *)(ctptr+2*ctoffset), block3);
- _mm_storeu_si128((__m128i *)(ctptr+3*ctoffset), block4);
- _mm_storeu_si128((__m128i *)(ctptr+4*ctoffset), block5);
- _mm_storeu_si128((__m128i *)(ctptr+5*ctoffset), block6);
- _mm_storeu_si128((__m128i *)(ctptr+6*ctoffset), block7);
- _mm_storeu_si128((__m128i *)(ctptr+7*ctoffset), block8);
- ctptr+=16;
- }
- keyptr+=8;
- }
- for (;i<nkeys;i++){
- ctptr=CT + i*ctoffset;
- (*tmpctr) = ctr;
- for(j=0;j<n_aesiters; j++) {
- (*tmpctr)++;
- inblock = _mm_loadu_si128((__m128i const*)(ctr_buf));
- READ_KEYS1(0)
- block1 = _mm_xor_si128(keyA, inblock);
- ENC1_round(1)
- ENC1_round(2)
- ENC1_round(3)
- ENC1_round(4)
- ENC1_round(5)
- ENC1_round(6)
- ENC1_round(7)
- ENC1_round(8)
- ENC1_round(9)
- ENC1_round_last(10)
- _mm_storeu_si128((__m128i *)(ctptr), block1);
- ctptr+=16;
- }
- keyptr++;
- }
- }
- #endif
|