// ntt.rs

#[cfg(target_feature = "avx2")]
use std::arch::x86_64::*;

use crate::{
    arith::*,
    number_theory::*,
    params::*,
};
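
/// Computes w^0, w^1, ..., w^(n-1) mod `modulus` for n = 2^poly_len_log2,
/// storing w^i at the bit-reversed index of i, which is the order in which
/// the iterative butterflies below read the twiddles.
///
/// A minimal sketch of the ordering for n = 4 (w and q stand for any valid
/// root/modulus pair; illustrative, not a doc-test):
///
/// ```ignore
/// let p = powers_of_primitive_root(w, q, 2);
/// // p[0] = 1, p[2] = w^1, p[1] = w^2, p[3] = w^3
/// ```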
pub fn powers_of_primitive_root(root: u64, modulus: u64, poly_len_log2: usize) -> Vec<u64> {
    let poly_len = 1usize << poly_len_log2;
    let mut root_powers = vec![0u64; poly_len];
    let mut power = root;
    for i in 1..poly_len {
        // Store w^i at the bit-reversed index of i.
        let idx = reverse_bits(i as u64, poly_len_log2) as usize;
        root_powers[idx] = power;
        power = multiply_uint_mod(power, root, modulus);
    }
    root_powers[0] = 1;
    root_powers
}
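
/// Computes 64-bit Shoup-style constants for each precomputed power w, so a
/// later multiply-high can estimate floor(w * y / q) without a division:
///
/// ```ignore
/// scaled[i] == (((inp[i] as u128) << 64) / (modulus as u128)) as u64
/// ```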
pub fn scale_powers_u64(modulus: u64, poly_len: usize, inp: &[u64]) -> Vec<u64> {
    let mut scaled_powers = vec![0; poly_len];
    for i in 0..poly_len {
        let wide_val = (inp[i] as u128) << 64u128;
        let quotient = wide_val / (modulus as u128);
        scaled_powers[i] = quotient as u64;
    }
    scaled_powers
}
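
/// 32-bit analogue of `scale_powers_u64`: floor(w * 2^32 / q), the constant
/// consumed by the `>> 32` multiply-high in the butterflies below.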
pub fn scale_powers_u32(modulus: u32, poly_len: usize, inp: &[u64]) -> Vec<u64> {
    let mut scaled_powers = vec![0; poly_len];
    for i in 0..poly_len {
        // Inputs are < 2^32, so the shift cannot overflow a u64.
        let wide_val = inp[i] << 32;
        let quotient = wide_val / (modulus as u64);
        scaled_powers[i] = (quotient as u32) as u64;
    }
    scaled_powers
}
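
/// Builds, per modulus, the four tables the transforms index positionally:
/// `[root_powers, scaled_root_powers, inv_root_powers, scaled_inv_root_powers]`.
///
/// A sketch of the layout (indices as exercised by the tests below):
///
/// ```ignore
/// let tables = build_ntt_tables(poly_len, &moduli);
/// let w = tables[m][0][i]; // w^i for modulus m, bit-reversed i
/// let w_shoup = tables[m][1][i]; // floor(w^i * 2^32 / q)
/// ```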
pub fn build_ntt_tables(poly_len: usize, moduli: &[u64]) -> Vec<Vec<Vec<u64>>> {
    let poly_len_log2 = log2(poly_len as u64) as usize;
    let mut output: Vec<Vec<Vec<u64>>> = vec![Vec::new(); moduli.len()];
    for coeff_mod in 0..moduli.len() {
        let modulus = moduli[coeff_mod];
        let modulus_as_u32 = modulus.try_into().unwrap();
        // A primitive 2n-th root of unity gives the negacyclic (x^n + 1) NTT.
        let root = get_minimal_primitive_root(2 * poly_len as u64, modulus).unwrap();
        let inv_root = invert_uint_mod(root, modulus).unwrap();

        let root_powers = powers_of_primitive_root(root, modulus, poly_len_log2);
        let scaled_root_powers = scale_powers_u32(modulus_as_u32, poly_len, root_powers.as_slice());

        let mut inv_root_powers = powers_of_primitive_root(inv_root, modulus, poly_len_log2);
        // Pre-halve the inverse twiddles; together with the explicit halving in
        // the inverse butterflies, this folds n^{-1} into the inverse transform.
        for i in 0..poly_len {
            inv_root_powers[i] = div2_uint_mod(inv_root_powers[i], modulus);
        }
        let scaled_inv_root_powers =
            scale_powers_u32(modulus_as_u32, poly_len, inv_root_powers.as_slice());

        output[coeff_mod] = vec![
            root_powers,
            scaled_root_powers,
            inv_root_powers,
            scaled_inv_root_powers,
        ];
    }
    output
}
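
/// In-place forward negacyclic NTT applied to each CRT component of
/// `operand_overall`, using Harvey-style butterflies with lazy reduction:
/// intermediates stay in [0, 4q) and are only reduced to [0, q) at the end.
/// One butterfly on a pair (x, y) with twiddle w, in sketch form:
///
/// ```ignore
/// let x_red = if x >= 2 * q { x - 2 * q } else { x };
/// let wy = w * y - ((y * w_prime) >> 32) * q; // ≡ w*y (mod q), in [0, 2q)
/// op[j] = x_red + wy;
/// op[t + j] = x_red + 2 * q - wy;
/// ```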
#[cfg(not(target_feature = "avx2"))]
pub fn ntt_forward(params: &Params, operand_overall: &mut [u64]) {
    let log_n = params.poly_len_log2;
    let n = 1 << log_n;
    for coeff_mod in 0..params.crt_count {
        let operand = &mut operand_overall[coeff_mod * n..coeff_mod * n + n];

        let forward_table = params.get_ntt_forward_table(coeff_mod);
        let forward_table_prime = params.get_ntt_forward_prime_table(coeff_mod);
        let modulus_small = params.moduli[coeff_mod] as u32;
        let two_times_modulus_small: u32 = 2 * modulus_small;

        for mm in 0..log_n {
            let m = 1 << mm;
            let t = n >> (mm + 1);

            let mut it = operand.chunks_exact_mut(2 * t);
            for i in 0..m {
                let w = forward_table[m + i];
                let w_prime = forward_table_prime[m + i];

                let op = it.next().unwrap();

                for j in 0..t {
                    let x: u32 = op[j] as u32;
                    let y: u32 = op[t + j] as u32;
                    let curr_x: u32 =
                        x - (two_times_modulus_small * ((x >= two_times_modulus_small) as u32));
                    let q_tmp: u64 = ((y as u64) * (w_prime as u64)) >> 32u64;
                    let q_new = w * (y as u64) - q_tmp * (modulus_small as u64);

                    op[j] = curr_x as u64 + q_new;
                    op[t + j] = curr_x as u64 + ((two_times_modulus_small as u64) - q_new);
                }
            }
        }

        // Final reduction from [0, 4q) down to [0, q).
        for i in 0..n {
            operand[i] -= ((operand[i] >= two_times_modulus_small as u64) as u64)
                * two_times_modulus_small as u64;
            operand[i] -= ((operand[i] >= modulus_small as u64) as u64) * modulus_small as u64;
        }
    }
}
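
/// AVX2 variant of the forward NTT: the same butterflies on four u64 lanes
/// per iteration whenever a level has at least four of them. Note that
/// `_mm256_mul_epu32` multiplies only the low 32 bits of each 64-bit lane,
/// which suffices because the moduli are small enough that every operand
/// fits in 32 bits.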
#[cfg(target_feature = "avx2")]
pub fn ntt_forward(params: &Params, operand_overall: &mut [u64]) {
    let log_n = params.poly_len_log2;
    let n = 1 << log_n;
    for coeff_mod in 0..params.crt_count {
        let operand = &mut operand_overall[coeff_mod * n..coeff_mod * n + n];

        let forward_table = params.get_ntt_forward_table(coeff_mod);
        let forward_table_prime = params.get_ntt_forward_prime_table(coeff_mod);
        let modulus_small = params.moduli[coeff_mod] as u32;
        let two_times_modulus_small: u32 = 2 * modulus_small;

        for mm in 0..log_n {
            let m = 1 << mm;
            let t = n >> (mm + 1);

            let mut it = operand.chunks_exact_mut(2 * t);
            for i in 0..m {
                let w = forward_table[m + i];
                let w_prime = forward_table_prime[m + i];

                let op = it.next().unwrap();

                if t < 4 {
                    // Too few butterflies for a 256-bit vector; scalar fallback.
                    for j in 0..t {
                        let x: u32 = op[j] as u32;
                        let y: u32 = op[t + j] as u32;
                        let curr_x: u32 = x
                            - (two_times_modulus_small * ((x >= two_times_modulus_small) as u32));
                        let q_tmp: u64 = ((y as u64) * (w_prime as u64)) >> 32u64;
                        let q_new = w * (y as u64) - q_tmp * (modulus_small as u64);

                        op[j] = curr_x as u64 + q_new;
                        op[t + j] = curr_x as u64 + ((two_times_modulus_small as u64) - q_new);
                    }
                } else {
                    unsafe {
                        // Same butterfly on four u64 lanes at a time.
                        for j in (0..t).step_by(4) {
                            let p_x = &mut op[j] as *mut u64;
                            let p_y = &mut op[j + t] as *mut u64;
                            let x = _mm256_loadu_si256(p_x as *const __m256i);
                            let y = _mm256_loadu_si256(p_y as *const __m256i);

                            // curr_x = x, minus 2q when x > 2q (lazy reduction)
                            let cmp_val = _mm256_set1_epi64x(two_times_modulus_small as i64);
                            let gt_mask = _mm256_cmpgt_epi64(x, cmp_val);
                            let to_subtract = _mm256_and_si256(gt_mask, cmp_val);
                            let curr_x = _mm256_sub_epi64(x, to_subtract);

                            // q_val = (y * w_prime) >> 32
                            let w_prime_vec = _mm256_set1_epi64x(w_prime as i64);
                            let product = _mm256_mul_epu32(y, w_prime_vec);
                            let q_val = _mm256_srli_epi64(product, 32);

                            // q_val = w * y - q_val * modulus_small
                            let w_vec = _mm256_set1_epi64x(w as i64);
                            let w_times_y = _mm256_mul_epu32(y, w_vec);
                            let modulus_small_vec = _mm256_set1_epi64x(modulus_small as i64);
                            let q_scaled = _mm256_mul_epu32(q_val, modulus_small_vec);
                            let q_final = _mm256_sub_epi64(w_times_y, q_scaled);

                            let new_x = _mm256_add_epi64(curr_x, q_final);
                            let q_final_inverted = _mm256_sub_epi64(cmp_val, q_final);
                            let new_y = _mm256_add_epi64(curr_x, q_final_inverted);

                            _mm256_storeu_si256(p_x as *mut __m256i, new_x);
                            _mm256_storeu_si256(p_y as *mut __m256i, new_y);
                        }
                    }
                }
            }
        }

        // Final vectorized reduction: subtract 2q, then q, where exceeded.
        for i in (0..n).step_by(4) {
            unsafe {
                let p_x = &mut operand[i] as *mut u64;

                let cmp_val1 = _mm256_set1_epi64x(two_times_modulus_small as i64);
                let mut x = _mm256_loadu_si256(p_x as *const __m256i);
                let mut gt_mask = _mm256_cmpgt_epi64(x, cmp_val1);
                let mut to_subtract = _mm256_and_si256(gt_mask, cmp_val1);
                x = _mm256_sub_epi64(x, to_subtract);

                let cmp_val2 = _mm256_set1_epi64x(modulus_small as i64);
                gt_mask = _mm256_cmpgt_epi64(x, cmp_val2);
                to_subtract = _mm256_and_si256(gt_mask, cmp_val2);
                x = _mm256_sub_epi64(x, to_subtract);
                _mm256_storeu_si256(p_x as *mut __m256i, x);
            }
        }
    }
}
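
/// In-place inverse negacyclic NTT over each CRT component, using
/// Gentleman-Sande butterflies. The inverse twiddles were pre-divided by 2
/// in `build_ntt_tables` and `res_x` is halved explicitly, so each of the
/// log2(n) levels divides by 2, folding the usual final n^{-1} scaling into
/// the transform itself.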
#[cfg(not(target_feature = "avx2"))]
pub fn ntt_inverse(params: &Params, operand_overall: &mut [u64]) {
    for coeff_mod in 0..params.crt_count {
        let n = params.poly_len;
        let operand = &mut operand_overall[coeff_mod * n..coeff_mod * n + n];

        let inverse_table = params.get_ntt_inverse_table(coeff_mod);
        let inverse_table_prime = params.get_ntt_inverse_prime_table(coeff_mod);
        let modulus = params.moduli[coeff_mod];
        let two_times_modulus: u64 = 2 * modulus;

        for mm in (0..params.poly_len_log2).rev() {
            let h = 1 << mm;
            let t = n >> (mm + 1);

            let mut it = operand.chunks_exact_mut(2 * t);
            for i in 0..h {
                let w = inverse_table[h + i];
                let w_prime = inverse_table_prime[h + i];

                let op = it.next().unwrap();

                for j in 0..t {
                    let x = op[j];
                    let y = op[t + j];
                    let t_tmp = two_times_modulus - y + x; // ≡ x - y (mod q)
                    // x + y, minus 2q when x + y >= 2q
                    let curr_x = x + y - (two_times_modulus * (((x << 1) >= t_tmp) as u64));
                    let h_tmp = (t_tmp * w_prime) >> 32;
                    // Halve, adding q first when t_tmp (same parity as curr_x) is odd.
                    let res_x = (curr_x + (modulus * ((t_tmp & 1) as u64))) >> 1;
                    let res_y = w * t_tmp - h_tmp * modulus;

                    op[j] = res_x;
                    op[t + j] = res_y;
                }
            }
        }

        // Final reduction into [0, q).
        for i in 0..n {
            operand[i] -= ((operand[i] >= two_times_modulus) as u64) * two_times_modulus;
            operand[i] -= ((operand[i] >= modulus) as u64) * modulus;
        }
    }
}
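
/// AVX2 variant of the inverse NTT: four u64 lanes per iteration, with the
/// odd/even fix-up before the halving step done by masking `t_tmp & 1`
/// against the modulus vector.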
#[cfg(target_feature = "avx2")]
pub fn ntt_inverse(params: &Params, operand_overall: &mut [u64]) {
    for coeff_mod in 0..params.crt_count {
        let n = params.poly_len;
        let operand = &mut operand_overall[coeff_mod * n..coeff_mod * n + n];

        let inverse_table = params.get_ntt_inverse_table(coeff_mod);
        let inverse_table_prime = params.get_ntt_inverse_prime_table(coeff_mod);
        let modulus = params.moduli[coeff_mod];
        let two_times_modulus: u64 = 2 * modulus;

        for mm in (0..params.poly_len_log2).rev() {
            let h = 1 << mm;
            let t = n >> (mm + 1);

            let mut it = operand.chunks_exact_mut(2 * t);
            for i in 0..h {
                let w = inverse_table[h + i];
                let w_prime = inverse_table_prime[h + i];

                let op = it.next().unwrap();

                if t < 4 {
                    // Too few butterflies for a 256-bit vector; scalar fallback.
                    for j in 0..t {
                        let x = op[j];
                        let y = op[t + j];
                        let t_tmp = two_times_modulus - y + x;
                        let curr_x = x + y - (two_times_modulus * (((x << 1) >= t_tmp) as u64));
                        let h_tmp = (t_tmp * w_prime) >> 32;
                        let res_x = (curr_x + (modulus * ((t_tmp & 1) as u64))) >> 1;
                        let res_y = w * t_tmp - h_tmp * modulus;

                        op[j] = res_x;
                        op[t + j] = res_y;
                    }
                } else {
                    unsafe {
                        // Same butterfly on four u64 lanes at a time.
                        for j in (0..t).step_by(4) {
                            let p_x = &mut op[j] as *mut u64;
                            let p_y = &mut op[j + t] as *mut u64;
                            let x = _mm256_loadu_si256(p_x as *const __m256i);
                            let y = _mm256_loadu_si256(p_y as *const __m256i);

                            let modulus_vec = _mm256_set1_epi64x(modulus as i64);
                            let two_times_modulus_vec =
                                _mm256_set1_epi64x(two_times_modulus as i64);

                            // t_tmp = 2q - y + x
                            let mut t_tmp = _mm256_set1_epi64x(two_times_modulus as i64);
                            t_tmp = _mm256_sub_epi64(t_tmp, y);
                            t_tmp = _mm256_add_epi64(t_tmp, x);

                            // curr_x = x + y, minus 2q when 2x > t_tmp
                            let gt_mask = _mm256_cmpgt_epi64(_mm256_slli_epi64(x, 1), t_tmp);
                            let to_subtract = _mm256_and_si256(gt_mask, two_times_modulus_vec);
                            let mut curr_x = _mm256_add_epi64(x, y);
                            curr_x = _mm256_sub_epi64(curr_x, to_subtract);

                            // h_tmp = (t_tmp * w_prime) >> 32
                            let w_prime_vec = _mm256_set1_epi64x(w_prime as i64);
                            let mut h_tmp = _mm256_mul_epu32(t_tmp, w_prime_vec);
                            h_tmp = _mm256_srli_epi64(h_tmp, 32);

                            // new_x = (curr_x + q if t_tmp is odd) >> 1
                            let and_mask = _mm256_set_epi64x(1, 1, 1, 1);
                            let eq_mask =
                                _mm256_cmpeq_epi64(_mm256_and_si256(t_tmp, and_mask), and_mask);
                            let to_add = _mm256_and_si256(eq_mask, modulus_vec);
                            let new_x = _mm256_srli_epi64(_mm256_add_epi64(curr_x, to_add), 1);

                            // new_y = w * t_tmp - h_tmp * q
                            let w_vec = _mm256_set1_epi64x(w as i64);
                            let w_times_t_tmp = _mm256_mul_epu32(t_tmp, w_vec);
                            let h_tmp_times_modulus = _mm256_mul_epu32(h_tmp, modulus_vec);
                            let new_y = _mm256_sub_epi64(w_times_t_tmp, h_tmp_times_modulus);

                            _mm256_storeu_si256(p_x as *mut __m256i, new_x);
                            _mm256_storeu_si256(p_y as *mut __m256i, new_y);
                        }
                    }
                }
            }
        }

        // Final reduction into [0, q).
        for i in 0..n {
            operand[i] -= ((operand[i] >= two_times_modulus) as u64) * two_times_modulus;
            operand[i] -= ((operand[i] >= modulus) as u64) * modulus;
        }
    }
}
#[cfg(test)]
mod test {
    use super::*;
    use rand::Rng;
    use crate::util::*;

    fn get_params() -> Params {
        get_test_params()
    }

    // XOR digest of every entry in the tables built for the moduli below.
    const REF_VAL: u64 = 519370102;

    #[test]
    fn build_ntt_tables_correct() {
        let moduli = [268369921u64, 249561089u64];
        let poly_len = 2048usize;
        let res = build_ntt_tables(poly_len, moduli.as_slice());
        assert_eq!(res.len(), 2);
        assert_eq!(res[0].len(), 4);
        assert_eq!(res[0][0].len(), poly_len);
        assert_eq!(res[0][2][0], 134184961u64);
        assert_eq!(res[0][2][1], 96647580u64);
        let mut x1 = 0u64;
        for i in 0..res.len() {
            for j in 0..res[0].len() {
                for k in 0..res[0][0].len() {
                    x1 ^= res[i][j][k];
                }
            }
        }
        assert_eq!(x1, REF_VAL);
    }

    #[test]
    fn ntt_forward_correct() {
        let params = get_params();
        let mut v1 = vec![0; 2 * 2048];
        v1[0] = 100;
        v1[2048] = 100;
        ntt_forward(&params, v1.as_mut_slice());
        // The NTT of a constant polynomial is that constant in every slot.
        assert_eq!(v1[50], 100);
        assert_eq!(v1[2048 + 50], 100);
    }

    #[test]
    fn ntt_inverse_correct() {
        let params = get_params();
        let mut v1 = vec![100; 2 * 2048];
        ntt_inverse(&params, v1.as_mut_slice());
        assert_eq!(v1[0], 100);
        assert_eq!(v1[2048], 100);
        assert_eq!(v1[50], 0);
        assert_eq!(v1[2048 + 50], 0);
    }

    #[test]
    fn ntt_correct() {
        // Round trip: ntt_inverse(ntt_forward(x)) == x on random input.
        let params = get_params();
        let mut v1 = vec![0; params.crt_count * params.poly_len];
        let mut rng = rand::thread_rng();
        for i in 0..params.crt_count {
            for j in 0..params.poly_len {
                let idx = calc_index(&[i, j], &[params.crt_count, params.poly_len]);
                let val: u64 = rng.gen();
                v1[idx] = val % params.moduli[i];
            }
        }
        let mut v2 = v1.clone();
        ntt_forward(&params, v2.as_mut_slice());
        ntt_inverse(&params, v2.as_mut_slice());
        for i in 0..params.crt_count * params.poly_len {
            assert_eq!(v1[i], v2[i]);
        }
    }

    #[test]
    fn calc_index_correct() {
        assert_eq!(calc_index(&[2, 3, 4], &[10, 10, 100]), 2304);
        assert_eq!(calc_index(&[2, 3, 4], &[3, 5, 7]), 95);
    }
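
    // Illustrative self-check of the Shoup multiplication bound the
    // butterflies rely on (the constants here are arbitrary choices): with
    // w' = floor(w * 2^32 / q) and y < 2^32, ((y * w') >> 32) equals
    // floor(w * y / q) or one less, so w * y - q * ((y * w') >> 32) < 2q.
    #[test]
    fn shoup_constant_error_bound() {
        let q: u64 = 268369921; // first modulus from build_ntt_tables_correct
        let w: u64 = 12345; // arbitrary "twiddle" < q
        let y: u64 = 999_999_937; // arbitrary operand < 2^32
        let w_prime = (((w as u128) << 32) / (q as u128)) as u64; // scale_powers_u32, one value
        let q_tmp = (((y as u128) * (w_prime as u128)) >> 32) as u64;
        let exact = (((w as u128) * (y as u128)) / (q as u128)) as u64;
        assert!(q_tmp == exact || q_tmp + 1 == exact);
        let q_new = w * y - q_tmp * q; // the butterfly's lazily reduced w*y
        assert!(q_new < 2 * q);
    }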
}