use spiral_rs::aligned_memory::*;
use spiral_rs::arith::*;
use spiral_rs::params::*;
use spiral_rs::poly::*;
use spiral_rs::server::*;
use spiral_rs::util::*;

use rayon::scope;

/// Decode one chunk of one database item from `slice` into a 1x1 plaintext
/// polynomial.
///
/// Each item occupies `params.db_item_size` bytes in `slice` and is divided
/// into `params.instances * params.n * params.n` chunks of `bytes_per_chunk`
/// bytes, one chunk per `(instance, trial)` pair.  The selected chunk is
/// unpacked into consecutive `logp`-bit words, where
/// `logp = ceil(log2(params.pt_modulus))`, and word `i` is stored as
/// coefficient `i` of the returned polynomial.  Coefficients beyond the last
/// decoded word keep the value `PolyMatrixRaw::zero` gave them.
///
/// # Arguments
///
/// * `params` - Spiral parameters describing item size and polynomial shape.
/// * `slice` - The raw database bytes, items stored back to back.
/// * `instance`, `trial` - Select which chunk of the item to decode.
/// * `item_idx` - Index of the item within `slice`.
///
/// # Panics
///
/// * If a chunk needs more than `params.poly_len` words.
/// * If `slice` is too short to contain the requested chunk.
/// * If a decoded word is greater than `params.pt_modulus`.
pub fn load_item_from_slice<'a>(
    params: &'a Params,
    slice: &[u8],
    instance: usize,
    trial: usize,
    item_idx: usize,
) -> PolyMatrixRaw<'a> {
    let db_item_size = params.db_item_size;
    let trials = params.n * params.n;
    let chunks = params.instances * trials;

    // Geometry of one chunk: its size in bytes, the width of a plaintext
    // word in bits, and how many such words the chunk unpacks into.
    let bytes_per_chunk = f64::ceil(db_item_size as f64 / chunks as f64) as usize;
    let logp = f64::ceil(f64::log2(params.pt_modulus as f64)) as usize;
    let modp_words_per_chunk = f64::ceil((bytes_per_chunk * 8) as f64 / logp as f64) as usize;
    assert!(modp_words_per_chunk <= params.poly_len);

    // Byte offset of the requested chunk: skip whole items first, then the
    // preceding chunks of this item.
    let idx_chunk = instance * trials + trial;
    let idx_poly_in_file = item_idx * db_item_size + idx_chunk * bytes_per_chunk;

    // The chunk is the same for every word, so slice (and bounds-check) once.
    // NOTE(review): `read_arbitrary_bits` is handed exactly `bytes_per_chunk`
    // bytes; confirm it never reads past the end of its input when the chunk
    // length is not a multiple of its internal word size.
    let chunk = &slice[idx_poly_in_file..idx_poly_in_file + bytes_per_chunk];

    let mut out = PolyMatrixRaw::zero(params, 1, 1);
    for i in 0..modp_words_per_chunk {
        let word = read_arbitrary_bits(chunk, i * logp, logp);
        // NOTE(review): `<=` admits a word equal to `pt_modulus`; verify
        // whether a strict `<` is what downstream consumers require.
        assert!(word <= params.pt_modulus);
        out.data[i] = word;
    }

    out
}

/// Build the packed, NTT-domain database from raw item bytes, splitting the
/// work across up to `num_threads` rayon tasks.
///
/// For every `(instance, trial)` pair, each of the
/// `2^db_dim_1 * 2^db_dim_2` items is decoded with [`load_item_from_slice`],
/// recentred from `params.pt_modulus` to `params.modulus`, transformed with
/// `ntt()`, and written into the output.  Word `z` of the output entry is
/// `ntt[z] | (ntt[poly_len + z] << PACKED_OFFSET_2)`, stored at the position
/// `calc_index` yields for indices `[instance, trial, z, ii, j]` within
/// dimensions `[instances, trials, poly_len, 2^db_dim_2, 2^db_dim_1]`.
///
/// # Arguments
///
/// * `params` - Spiral parameters describing the database shape.
/// * `slice` - The raw database bytes, as consumed by `load_item_from_slice`.
/// * `num_threads` - Number of parallel tasks per `(instance, trial)` pair.
///   A value of `0` is treated as `1`.
///
/// # Panics
///
/// Propagates any panic raised by `load_item_from_slice` (for example when
/// `slice` is too short for the configured database).
pub fn load_db_from_slice_mt(params: &Params, slice: &[u8], num_threads: usize) -> AlignedMemory64 {
    // Zero workers would divide by zero when splitting the items below.
    let num_threads = num_threads.max(1);

    let instances = params.instances;
    let trials = params.n * params.n;
    let dim0 = 1 << params.db_dim_1;
    let num_per = 1 << params.db_dim_2;
    let num_items = dim0 * num_per;
    let db_size_words = instances * trials * num_items * params.poly_len;
    let mut v: AlignedMemory64 = AlignedMemory64::new(db_size_words);

    // Dimensions of the output layout; identical for every write.
    let dims = [instances, trials, params.poly_len, num_per, dim0];

    // Split the items as evenly as possible: every task gets `base` items
    // and the first `extra` tasks get one more.
    let items_per_thread_base = num_items / num_threads;
    let items_per_thread_extra = num_items % num_threads;

    // Get a pointer to the memory pool of the AlignedMemory64.  It is
    // carried as a usize so that the same address can be handed to several
    // tasks, each of which casts it back to a *mut u64 in order to *write*
    // into the pool concurrently.  The caveat is that no two tasks may
    // write to the same memory location.  In Spiral, each polynomial
    // created from the database ends up scattered into noncontiguous words
    // of memory, but any one word still only comes from one polynomial.
    // So different tasks can read different parts of the database, produce
    // different polynomials, and write them into the same pool (but *not*
    // the same locations) at the same time.
    //
    // SAFETY: `v` is owned by this function and outlives every task, since
    // `scope` does not return until all tasks spawned inside it have
    // finished.
    let vptrusize = unsafe { v.as_mut_ptr() as usize };

    for instance in 0..instances {
        for trial in 0..trials {
            scope(|s| {
                let mut item_thread_start = 0usize;
                for thr in 0..num_threads {
                    let items_this_thread =
                        items_per_thread_base + usize::from(thr < items_per_thread_extra);
                    if items_this_thread == 0 {
                        // Shares never increase with `thr`, so all remaining
                        // tasks would be idle as well.
                        break;
                    }
                    let item_thread_end = item_thread_start + items_this_thread;
                    s.spawn(move |_| {
                        let vptr = vptrusize as *mut u64;
                        for i in item_thread_start..item_thread_end {
                            // Swap the halves of the item index so that
                            // the polynomials based on the items are
                            // written to the AlignedMemory64 more
                            // sequentially
                            let ii = i / dim0;
                            let j = i % dim0;
                            let db_idx = j * num_per + ii;

                            let mut db_item =
                                load_item_from_slice(params, slice, instance, trial, db_idx);

                            // Lift each coefficient from the plaintext
                            // modulus to the ciphertext modulus.
                            for z in 0..params.poly_len {
                                db_item.data[z] = recenter_mod(
                                    db_item.data[z],
                                    params.pt_modulus,
                                    params.modulus,
                                );
                            }

                            let db_item_ntt = db_item.ntt();
                            for z in 0..params.poly_len {
                                let idx_dst = calc_index(&[instance, trial, z, ii, j], &dims);
                                debug_assert!(idx_dst < db_size_words);

                                // Pack the two NTT halves of coefficient
                                // `z` into a single 64-bit word.
                                let packed = db_item_ntt.data[z]
                                    | (db_item_ntt.data[params.poly_len + z] << PACKED_OFFSET_2);

                                // SAFETY: `idx_dst` indexes into the
                                // `db_size_words`-word pool behind `vptr`,
                                // and each (instance, trial, z, ii, j)
                                // tuple is visited by exactly one task, so
                                // no two tasks write the same word.
                                unsafe {
                                    vptr.add(idx_dst).write(packed);
                                }
                            }
                        }
                    });
                    item_thread_start = item_thread_end;
                }
            });
        }
    }
    v
}