bitutils.h 6.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194
  1. #ifndef DPF_BITUTILS_H__
  2. #define DPF_BITUTILS_H__
#include <array> // std::array
#include <bitset> // std::bitset
#include <cstddef> // size_t
#include <cstdint> // uint8_t
#include <x86intrin.h> // SSE and AVX intrinsics
  5. namespace dpf
  6. {
  // Single-bit selectors for a 128-bit block: entry k has only bit 64*k set.
  static const __m128i bool128_mask[2] = {
      _mm_set_epi64x(0LL, 1LL), // bit 0   (lowest bit of the low 64-bit lane)
      _mm_set_epi64x(1LL, 0LL)  // bit 64  (lowest bit of the high 64-bit lane)
  };
  // Single-bit selectors for a 256-bit block: entry k has only bit 64*k set.
  static const __m256i bool256_mask[4] = {
      _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL), // bit 0
      _mm256_set_epi64x(0LL, 0LL, 1LL, 0LL), // bit 64
      _mm256_set_epi64x(0LL, 1LL, 0LL, 0LL), // bit 128
      _mm256_set_epi64x(1LL, 0LL, 0LL, 0LL)  // bit 192
  };
  // 128-bit constants indexed by a 2-bit value v: entry v holds v in the two
  // least-significant bits and zero everywhere else.
  static const __m128i lsb128_mask[4] = {
      _mm_set_epi64x(0LL, 0LL), // ...00
      _mm_set_epi64x(0LL, 1LL), // ...01
      _mm_set_epi64x(0LL, 2LL), // ...10
      _mm_set_epi64x(0LL, 3LL)  // ...11
  };
  // Bitwise complements of the 2-bit values 0..3 widened to 128 bits:
  // entry v has every bit set except the low bits that are set in v.
  static const __m128i lsb128_mask_inv[4] = {
      _mm_set_epi64x(-1LL, ~0LL), // 11...1111
      _mm_set_epi64x(-1LL, ~1LL), // 11...1110
      _mm_set_epi64x(-1LL, ~2LL), // 11...1101
      _mm_set_epi64x(-1LL, ~3LL)  // 11...1100
  };
  // Boolean-to-mask lookup for 128-bit blocks: index 0 is all zeros,
  // index 1 is all ones.
  static const __m128i if128_mask[2] = {
      _mm_set_epi64x(0LL, 0LL),   // false -> 00...0000
      _mm_set_epi64x(-1LL, -1LL)  // true  -> 11...1111
  };
  // 256-bit constants indexed by a 2-bit value v: entry v holds v in the two
  // least-significant bits and zero everywhere else.
  static const __m256i lsb256_mask[4] = {
      _mm256_set_epi64x(0LL, 0LL, 0LL, 0LL), // ...00
      _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL), // ...01
      _mm256_set_epi64x(0LL, 0LL, 0LL, 2LL), // ...10
      _mm256_set_epi64x(0LL, 0LL, 0LL, 3LL)  // ...11
  };
  // Bitwise complements of the 2-bit values 0..3 widened to 256 bits:
  // entry v has every bit set except the low bits that are set in v.
  static const __m256i lsb256_mask_inv[4] = {
      _mm256_set_epi64x(-1LL, -1LL, -1LL, ~0LL), // 11...1111
      _mm256_set_epi64x(-1LL, -1LL, -1LL, ~1LL), // 11...1110
      _mm256_set_epi64x(-1LL, -1LL, -1LL, ~2LL), // 11...1101
      _mm256_set_epi64x(-1LL, -1LL, -1LL, ~3LL)  // 11...1100
  };
  // Boolean-to-mask lookup for 256-bit blocks: index 0 is all zeros,
  // index 1 is all ones.
  static const __m256i if256_mask[2] = {
      _mm256_set_epi64x(0LL, 0LL, 0LL, 0LL),     // false -> 00...0000
      _mm256_set_epi64x(-1LL, -1LL, -1LL, -1LL)  // true  -> 11...1111
  };
  // Masked XOR on 128-bit blocks: returns block1 ^ (block2 & flag).
  // Bits of block2 are folded into block1 only where the matching bit of
  // `flag` is set.
  inline __m128i xor_if(const __m128i & block1, const __m128i & block2, __m128i flag)
  {
      const __m128i selected = _mm_and_si128(block2, flag);
      return _mm_xor_si128(block1, selected);
  }
  // Masked XOR on 256-bit blocks: returns block1 ^ (block2 & flag).
  // Bits of block2 are folded into block1 only where the matching bit of
  // `flag` is set.
  inline __m256i xor_if(const __m256i & block1, const __m256i & block2, __m256i flag)
  {
      const __m256i selected = _mm256_and_si256(block2, flag);
      return _mm256_xor_si256(block1, selected);
  }
  57. inline __m128i xor_if(const __m128i & block1, const __m128i & block2, bool flag)
  58. {
  59. return _mm_xor_si128(block1, _mm_and_si128(block2, if128_mask[flag ? 1 : 0]));
  60. }
  61. inline __m256i xor_if(const __m256i & block1, const __m256i & block2, bool flag)
  62. {
  63. return _mm256_xor_si256(block1, _mm256_and_si256(block2, if256_mask[flag ? 1 : 0]));
  64. }
  65. inline uint8_t get_lsb(const __m128i & block, uint8_t bits = 0b01)
  66. {
  67. __m128i vcmp = _mm_xor_si128(_mm_and_si128(block, lsb128_mask[bits]), lsb128_mask[bits]);
  68. return static_cast<uint8_t>(_mm_testz_si128(vcmp, vcmp));
  69. }
  70. inline uint8_t get_lsb(const __m256i & block, uint8_t bits = 0b01)
  71. {
  72. __m256i vcmp = _mm256_xor_si256(_mm256_and_si256(block, lsb256_mask[bits]), lsb256_mask[bits]);
  73. return static_cast<uint8_t>(_mm256_testz_si256(vcmp, vcmp));
  74. }
  // Shorthand for get_lsb(block, 0b01): tests the least-significant bit.
  template <typename block_t>
  inline uint8_t get_lsb01(const block_t & block)
  {
      return get_lsb(block, 0b01);
  }
  // Shorthand for get_lsb(block, 0b10): tests the second-lowest bit.
  template <typename block_t>
  inline uint8_t get_lsb10(const block_t & block)
  {
      return get_lsb(block, 0b10);
  }
  79. inline __m128i clear_lsb(const __m128i & block, uint8_t bits = 0b01)
  80. {
  81. return _mm_and_si128(block, lsb128_mask_inv[bits]);
  82. }
  83. inline __m256i clear_lsb(const __m256i & block, uint8_t bits = 0b01)
  84. {
  85. return _mm256_and_si256(block, lsb256_mask_inv[bits]);
  86. }
  87. // template<typename row_t = __m256i >
  88. // inline std::array<row_t, 128> bitsliced_clear_lsb(std::array<row_t, 128>& block, uint8_t bits = 0b11)
  89. // {
  90. // if(bits == 0b11)
  91. // {
  92. // block[0] = _mm_set_epi64x(0, 0);
  93. // block[1] = _mm_set_epi64x(0, 0);
  94. // }
  95. // if(bits == 0b01)
  96. // {
  97. // block[0] = _mm_set_epi64x(0, 0);
  98. // }
  99. // return block;
  100. // }
  // Picks one of the first two rows of a row array.
  //
  // Returns block[1] when `bit` is 0b10 and block[0] for every other value
  // (including the default 0b01). Presumably row k carries bit k of every
  // element in a bit-sliced layout -- confirm against the callers.
  //
  // The array is taken by const reference so that selecting a single row does
  // not copy all `nrows` rows on each call. `nrows` must be at least 1, and
  // at least 2 when `bit` is 0b10; the index is not range-checked.
  template<typename row_t = __m256i, size_t nrows >
  inline row_t bitslicled_get_lsb(const std::array<row_t, nrows> & block, uint8_t bit = 0b01)
  {
      return block[bit == 0b10 ? 1 : 0];
  }
  // Shorthand for clear_lsb(block, 0b01): clears the least-significant bit.
  template <typename block_t>
  inline block_t clear_lsb01(const block_t & block)
  {
      return clear_lsb(block, 0b01);
  }
  // Shorthand for clear_lsb(block, 0b10): clears the second-lowest bit.
  template <typename block_t>
  inline block_t clear_lsb10(const block_t & block)
  {
      return clear_lsb(block, 0b10);
  }
  // Shorthand for clear_lsb(block, 0b11): clears the two lowest bits.
  template <typename block_t>
  inline block_t clear_lsb11(const block_t & block)
  {
      return clear_lsb(block, 0b11);
  }
  // Overwrites `input` with a 128-bit value whose bits are all 1.
  inline void set_ones(__m128i & input)
  {
      const __m128i all_ones = _mm_set1_epi64x(-1LL);
      input = all_ones;
  }
  // Overwrites `input` with a 256-bit value whose bits are all 1.
  inline void set_ones(__m256i & input)
  {
      const __m256i all_ones = _mm256_set1_epi64x(-1LL);
      input = all_ones;
  }
  // Overwrites `input` with a 128-bit value whose bits are all 0.
  inline void set_zeros(__m128i & input)
  {
      const __m128i all_zeros = _mm_setzero_si128();
      input = all_zeros;
  }
  // Overwrites `input` with a 256-bit value whose bits are all 0.
  inline void set_zeros(__m256i & input)
  {
      const __m256i all_zeros = _mm256_setzero_si256();
      input = all_zeros;
  }
  139. // inline void zeros(block<__m128i> & input)
  140. // {
  141. // input = _mm_setzero_si128();
  142. // }
  143. // inline void zeros(block<__m256i> & input)
  144. // {
  145. // input = _mm256_setzero_si256();
  146. // }
  147. inline __m128i set_lsb(const __m128i & block, const bool val = true)
  148. {
  149. return _mm_or_si128(clear_lsb(block, 0b01), lsb128_mask[val ? 0b01 : 0b00]);
  150. }
  151. inline __m256i set_lsb(const __m256i & block, const bool val = true)
  152. {
  153. return _mm256_or_si256(clear_lsb(block, 0b01), lsb256_mask[val ? 0b01 : 0b00]);;
  154. }
  155. inline __m128i set_lsbs(const __m128i & block, const bool bits[2])
  156. {
  157. int i = (bits[0] ? 1 : 0) + 2 * (bits[1] ? 1 : 0);
  158. return _mm_or_si128(clear_lsb(block, 0b11), lsb128_mask[i]);
  159. }
  160. inline __m256i set_lsbs(const __m256i & block, const bool bits[2])
  161. {
  162. int i = (bits[0] ? 1 : 0) + 2 * (bits[1] ? 1 : 0);
  163. return _mm256_or_si256(clear_lsb(block, 0b11), lsb256_mask[i]);
  164. }
} // namespace dpf
  166. #endif // DPF_BITUTILS_H__