#include "include/signal_path.h"

#include <limits.h>

/* Global variable declarations */
static int counter=0; // not used by any active code in this file (only by commented-out code in calc())
static int mu; // LMS step size in fixed point; init() sets it to lms_mu * (2^scale_bits - 1)

// Leak factor of the leaky LMS update: adapt_coeffs_lpdsp32_single_leaky() multiplies each
// coefficient by this value (fract_mult(h, leak)) before adding the adaption term.
#ifdef LPDSP16
// Candidate values expressed as a fraction of 2^15
//static int leak=24576; //0.75
//static int leak=29491; //0.9
//static int leak=31129; //0.95 // no effect
static int leak=32735; //0.999 (32735 / 2^15)
//static int leak=32766; //0.99999
#else
// Candidate values expressed as a fraction of 2^31
//static int leak=2145336164; //0.999 (2145336164 / 2^31)
static int leak=2147462173; //0.99999 (2147462173 / 2^31)
#endif


// Storage for the adaptive FIR filter. The delay line layout depends on the processing mode.
#if BLOCK_LEN == 1
// Sample-by-sample processing: delay line of MAX_FIR_COEFFS samples declared with chess_storage(DMB)
int chess_storage(DMB) fir_lms_delay_line[MAX_FIR_COEFFS];
BufferPtrDMB chess_storage(DMB) ptr_fir_lms_delay_line; // circular-buffer state for fir_lms_delay_line (set up in init())
BufferPtr ptr_fir_lms_coeffs; // buffer state for fir_lms_coeffs (set up in init())

#else
// Block processing: the delay line holds one extra block (BLOCK_LEN samples) on top of the filter length.
// NOTE(review): "DMA%(sizeof(long long))" presumably requests long-long alignment so that two ints can be
// accessed through one long long (see the long long casts in the block routines) - confirm in the Chess manual.
int chess_storage(DMA%(sizeof(long long))) fir_lms_delay_line[BLOCK_LEN + MAX_FIR_COEFFS]; // The delay line for the adaptive filter
BufferPtr ptr_fir_lms_delay_line; // circular-buffer state for fir_lms_delay_line (set up in init())
BufferPtr ptr_fir_lms_coeffs; // buffer state for fir_lms_coeffs (set up in init())
#endif

int chess_storage(DMA % (sizeof(long long))) fir_lms_coeffs[MAX_FIR_COEFFS]; // The coefficients for the adaptive filter


#ifdef PLATFORM_GENERIC
    // lpdsp32 functionality modelling functions
    /* Host model of the fractional multiply: returns the full-precision product a * b.
     * The operands are widened to long long because long is only guaranteed to be 32 bits
     * (e.g. on LLP64 or 32-bit hosts), where the product of two 32-bit values would overflow.
     * Assumes accum_t can hold a 64-bit value - TODO confirm in signal_path.h. */
    accum_t fract_mult(int a, int b){
        long long a_wide = a;
        long long b_wide = b;
        return (b_wide * a_wide);
    }
    /* Host model of to_accum: places a in the accumulator scaled by 2^31, the inverse of the
     * ">> 31" done in rnd_saturate()/extract_high().
     * Implemented as a multiplication in long long: left-shifting a negative value is undefined
     * behaviour in C, and long may be only 32 bits wide on some hosts. */
    accum_t to_accum(int a){
        long long a_wide = (long long) a;
        return a_wide * (1LL << 31);
    }
    /* Host model of rnd_saturate: returns the accumulator shifted right by 31 bits.
     * Note that this model neither rounds nor saturates. */
    int rnd_saturate(accum_t a){
        accum_t shifted = a >> 31;
        return shifted;
    }
    /* Host model of extract_high: returns the accumulator shifted right by 31 bits
     * (same arithmetic as the rnd_saturate() model). */
    int extract_high(accum_t a){
        accum_t upper_part = a >> 31;
        return upper_part;
    }
    /* Split a 64-bit value into two ints: *int1 receives the lower 32 bits,
     * *int2 the upper 32 bits. *int2 is written first, then *int1. */
    void lldecompose(unsigned long long l, int* int1, int* int2){
        const unsigned long long upper_word = l >> 32;
        *int2 = (int)(upper_word);
        *int1 = (int)(l);
    }
    /* Pack two ints into one 64-bit value: a becomes the lower 32 bits,
     * b the upper 32 bits (the inverse of lldecompose()). */
    uint64_t llcompose(int a, int b) {
        const uint64_t upper_word = ((uint64_t)b) << 32; // sign bits of b above bit 31 are shifted out
        const uint64_t lower_word = (uint32_t)a;         // zero-extend a so it cannot touch the upper word
        return upper_word | lower_word;
    }
    // unsigned long long llcompose(int a, int b){
    //     unsigned long long l;
    //     l = a << 32;
    //     l |= b;
    //     return l;
    //}
    /* Host model of the cyclic pointer update.
     * Moves ptr by i_pp elements (forward for i_pp > 0, backward for i_pp < 0) one element
     * at a time, wrapping inside the buffer [ptr_start, ptr_start + buffer_len).
     * Returns the moved pointer; ptr is returned unchanged for i_pp == 0. */
    int* cyclic_add(int *ptr, int i_pp, int *ptr_start, int buffer_len){
        int *cursor = ptr;
        int steps_left = abs(i_pp);

        if (i_pp > 0){
            // forward: wrap from the end of the buffer back to its start
            while (steps_left-- > 0){
                ++cursor;
                if (cursor >= ptr_start + buffer_len){
                    cursor = ptr_start;
                }
            }
        }
        else{
            // backward: wrap from the start of the buffer to its last element
            while (steps_left-- > 0){
                --cursor;
                if (cursor < ptr_start){
                    cursor = ptr_start + (buffer_len - 1);
                }
            }
        }
        return cursor;
    }
#endif


/* Round and saturate with a 16-bit return value.
 * Step 1 shifts the accumulator left by 32, passes it through rnd_saturate() and converts the
 * result back to an accumulator with to_accum(); step 2 shifts that accumulator right by 16
 * and passes it through rnd_saturate() again.
 * NOTE(review): the "saturate"/"round" roles of the two steps are taken from the original
 * comments and depend on the target's rnd_saturate intrinsic - confirm against the LPDSP16 manual. */
int static inline rnd_saturate16(accum_t acc){ //maybe int16_fast type?
    acc = to_accum( // saturate
        rnd_saturate(acc << 32)
        );
    return rnd_saturate(acc >> 16); //round
}


/**
 * Initialize a circular buffer descriptor and zero its storage.
 *
 * @param buffer            descriptor to set up (start and current pointer, length)
 * @param buffer_start_add  first element of the backing array
 * @param length            requested number of elements
 * @param max_buffer_len    capacity of the backing array
 * @return 0 if 0 <= length < max_buffer_len, 1 otherwise
 *
 * The length actually stored and zero-filled is limited to [0, max_buffer_len], so a request
 * that is too large (or negative) can no longer write past the backing array; such a request
 * is reported through the return value.
 */
int sig_init_buffer(BufferPtr *buffer, int *buffer_start_add, int length, int max_buffer_len) {
    int usable_len = length;
    if (usable_len > max_buffer_len) {
        usable_len = max_buffer_len;
    }
    if (usable_len < 0) {
        usable_len = 0;
    }

    buffer->buffer_len = usable_len;
    buffer->ptr_start = buffer_start_add;
    buffer->ptr_current = buffer_start_add;
    // initialize delay line with 0
    for (int i = 0; i < usable_len; i++) {
        buffer_start_add[i] = 0;
    }

    // same status as before for valid requests; negative lengths are now reported as well
    if (length >= 0 && length < max_buffer_len) {
        return 0;
    }
    return 1;
}

/**
 * DMB variant of the circular buffer initialization: sets up the descriptor and zeroes
 * the backing storage.
 *
 * @param buffer            descriptor to set up (start and current pointer, length)
 * @param buffer_start_add  first element of the backing array
 * @param length            requested number of elements
 * @param max_buffer_len    capacity of the backing array
 * @return 0 if 0 <= length < max_buffer_len, 1 otherwise
 *
 * The length actually stored and zero-filled is limited to [0, max_buffer_len], so a request
 * that is too large (or negative) can no longer write past the backing array; such a request
 * is reported through the return value.
 */
int sig_init_buffer_DMB(BufferPtrDMB chess_storage(DMB) *buffer, int chess_storage(DMB) *buffer_start_add, int length, int max_buffer_len){
    int usable_len = length;
    if (usable_len > max_buffer_len) {
        usable_len = max_buffer_len;
    }
    if (usable_len < 0) {
        usable_len = 0;
    }

    buffer->buffer_len = usable_len;
    buffer->ptr_start = buffer_start_add;
    buffer->ptr_current = buffer_start_add;
    // initialize delay line with 0
    for (int i = 0; i < usable_len; i++) {
        buffer_start_add[i] = 0;
    }

    // same status as before for valid requests; negative lengths are now reported as well
    if (length >= 0 && length < max_buffer_len) {
        return 0;
    }
    return 1;
}

/* Move the buffer's current pointer by i_incr elements, wrapping inside the buffer. */
void sig_cirular_buffer_ptr_increment(BufferPtr *buffer, int i_incr){
    int *moved = cyclic_add(buffer->ptr_current, i_incr, buffer->ptr_start, buffer->buffer_len);
    buffer->ptr_current = moved;
}

/* DMB variant: move the buffer's current pointer by i_incr elements, wrapping inside the buffer. */
void sig_cirular_buffer_ptr_increment_DMB(BufferPtrDMB *buffer, int i_incr){
    int chess_storage(DMB) *moved = cyclic_add(buffer->ptr_current, i_incr, buffer->ptr_start, buffer->buffer_len);
    buffer->ptr_current = moved;
}

/* Store sample at the current position, then advance the current pointer by one element (with wraparound). */
void sig_cirular_buffer_ptr_put_sample(BufferPtr *buffer, int sample){
    int *slot = buffer->ptr_current;
    *slot = sample;
    buffer->ptr_current = cyclic_add(slot, 1, buffer->ptr_start, buffer->buffer_len);
}

/* DMB variant: store sample at the current position, then advance the current pointer by one element (with wraparound). */
void sig_cirular_buffer_ptr_put_sample_DMB(BufferPtrDMB chess_storage(DMB) *buffer, int sample){
    int chess_storage(DMB) *slot = buffer->ptr_current;
    *slot = sample;
    buffer->ptr_current = cyclic_add(slot, 1, buffer->ptr_start, buffer->buffer_len);
}

/* Copy one block of BLOCK_LEN samples into the circular buffer, two samples per step.
 * Each step writes to ptr_current[0] and ptr_current[1] and then advances ptr_current by two
 * (with wraparound), so the pair itself is written without a wrap check. */
void static inline sig_circular_buffer_ptr_put_block(BufferPtr *buffer, int* block){
    int *src = block;
    for (int copied = 0; copied < BLOCK_LEN; copied += 2, src += 2){
        buffer->ptr_current[0] = src[0]; // TODO: use llcompose
        buffer->ptr_current[1] = src[1];
        buffer->ptr_current = cyclic_add(buffer->ptr_current, 2, buffer->ptr_start, buffer->buffer_len);
    }
}

/* Convert one filter coefficient to fixed point: coef * scale truncated toward zero, saturated
 * to the int range. Converting an out-of-range double to int is undefined behaviour in C, which
 * the plain assignment hit as soon as |coef| exceeded 1 with a 31-bit scale. */
static int sig_preemph_quantize_coef(double coef, int scale){
    const double scaled = coef * scale;
    if (scaled >= (double)INT_MAX) {
        return INT_MAX;
    }
    if (scaled <= (double)INT_MIN) {
        return INT_MIN;
    }
    return (int)scaled;
}

/**
 * Configure the pre-emphasis biquad of a signal path.
 *
 * @param signal      signal path to configure
 * @param b0,b1,b2    coefficients stored in b_preemph[0..2]
 * @param a1,a2       coefficients stored in b_preemph[3..4]
 * @param scale_bits  number of fractional bits; coefficients are multiplied by 2^scale_bits - 1
 *
 * The filter is marked as deactivated (and no coefficient is written) when the coefficients
 * describe the identity filter b0 == 1, b1 == b2 == a1 == a2 == 0.
 * Coefficients that do not fit the int range after scaling are saturated.
 */
void sig_init_preemph_coef(SingleSignalPath *signal, double b0, double b1, double b2, double a1, double a2, int scale_bits) {
    // Check first if filter is actually activated
    if (b0 == 1. && b1 == 0. && b2 == 0. && a1 == 0. && a2 == 0.) {
        signal->preemph_activated = 0;
        return;
    }

    signal->preemph_activated = 1;
    signal->_preemph_scale_nbits = scale_bits;
    int scale = pow(2, scale_bits) - 1;
    signal->b_preemph[0] = sig_preemph_quantize_coef(b0, scale);
    signal->b_preemph[1] = sig_preemph_quantize_coef(b1, scale);
    signal->b_preemph[2] = sig_preemph_quantize_coef(b2, scale);
    signal->b_preemph[3] = sig_preemph_quantize_coef(a1, scale);
    signal->b_preemph[4] = sig_preemph_quantize_coef(a2, scale);
}

/*Initialization functions - make sure all of them were called to ensure functionality*/
/* Set up the delay buffer of a signal path with n_delay samples (capacity MAX_DELAY_SAMPS).
 * Returns the status reported by sig_init_buffer(). */
int sig_init_delay(SingleSignalPath *signal, int n_delay) {
    int status = sig_init_buffer(&signal->delay_buffer, signal->_delay_buffer, n_delay, MAX_DELAY_SAMPS);
    return status;
}

/**
 * Configure the weight (gain) of a signal path.
 *
 * @param signal       signal path to configure
 * @param weight       gain factor; exactly 1.0 deactivates the weighting stage
 * @param scale_nbits  number of fractional bits; the weight is multiplied by 2^scale_nbits - 1
 *
 * The scaled weight is truncated toward zero and saturated to the int range. Converting an
 * out-of-range double to int is undefined behaviour in C, which the plain assignment hit for
 * any weight above 1 with a 31-bit scale.
 */
void sig_init_weight(SingleSignalPath *signal, double weight, int scale_nbits) {
    if (weight == 1.) {
        signal->weight_actived = 0;
        return;
    }

    signal->weight_actived = 1;
    int scale = pow(2, scale_nbits) - 1;
    const double scaled = weight * scale;
    if (scaled >= (double)INT_MAX) {
        signal->weight = INT_MAX;
    }
    else if (scaled <= (double)INT_MIN) {
        signal->weight = INT_MIN;
    }
    else{
        signal->weight = (int)scaled;
    }
    signal->_weight_scale_nbits = scale_nbits;
}

/*Calculator functions for the given signal path*/
/* Calculate one biquad filter element.
 * Returns x unchanged when the pre-emphasis filter is deactivated. Otherwise the output is
 * built from x, the two previous inputs (_xd) and the two previous outputs (_yd), each
 * multiplied by its coefficient from b_preemph[0..4]; afterwards the state is shifted by one sample.
 * All five products are added, so the feedback coefficients b_preemph[3..4] are used with the
 * sign they were stored with. */
int sig_calc_biquad(SingleSignalPath *signal, int x) {
    if (signal->preemph_activated == 0) {
        return x;
    }
    accum_t sum =
        fract_mult(x, signal->b_preemph[0]) + fract_mult(signal->_xd[0], signal->b_preemph[1]) +
        fract_mult(signal->_xd[1], signal->b_preemph[2]) + fract_mult(signal->_yd[0], signal->b_preemph[3]) +
        fract_mult(signal->_yd[1],signal->b_preemph[4]);
    
    // The sum is doubled before it is converted back to a sample.
    // NOTE(review): presumably the coefficients are expected at half scale - confirm with the caller.
    #ifdef LPDSP16
    int y = rnd_saturate16(sum << 1);
    #else
    int y = rnd_saturate(sum << 1);
    #endif
    
    // shift the input/output history by one sample
    signal->_xd[1] = signal->_xd[0];
    signal->_xd[0] = x;
    signal->_yd[1] = signal->_yd[0];
    signal->_yd[0] = y;
    return y;
}
/* Return the sample the delay buffer's current pointer refers to, without modifying the buffer. */
int inline sig_get_delayed_sample(SingleSignalPath *signal) {
    const int delayed = *(signal->delay_buffer.ptr_current);
    return delayed;
}

/* Push x into the delay buffer and return the sample it replaces.
 * With a delay length of 0 the buffer is bypassed and x is returned directly. */
int sig_delay_buffer_load_and_get(SingleSignalPath *signal, int x) {
    int delayed = x;
    if (signal->delay_buffer.buffer_len != 0) {
        // read the sample at the current position, overwrite it with the new one, then advance
        delayed = *signal->delay_buffer.ptr_current;
        *signal->delay_buffer.ptr_current = x;
        sig_cirular_buffer_ptr_increment(&signal->delay_buffer, 1);
    }
    return delayed;
}

/* Apply the signal path's weight to x; x is returned unchanged when weighting is deactivated. */
int sig_calc_weight(SingleSignalPath *signal, int x) {
    if (signal->weight_actived != 0) {
        accum_t weighted = fract_mult(x, signal->weight);
        return rnd_saturate(weighted);
    }
    return x;
}

#if BLOCK_LEN!=1 // Block processing
/*lpdsp32 fir filter example adapted from user guide
#define NS 256 //No. of samples
#define N 64 //No. of filter coefficients or No. of tap weights
int chess_storage(DMB) y[NS]; //Output Signal
int chess_storage(DMA %(sizeof(long long))) x[NS+N-1]; //Input Signal
//Filter coefficients or tap weights
int chess_storage(DMA %(sizeof(long long))) h[N];
 */
/* Block FIR filter using paired (long long) loads; writes two output samples per outer iteration.
 *
 * ptr_fir_lms_delay_line: circular delay line holding the input samples
 * ptr_fir_lms_coeffs:     filter coefficients (read from its ptr_current, buffer_len entries)
 * out:                    receives BLOCK_LEN output samples
 *
 * Data and coefficients are read two ints at a time through a long long cast, so both pointers
 * have to refer to long-long-aligned pairs. The loops step by two, i.e. BLOCK_LEN and the
 * coefficient count are processed as even numbers.
 */
void sig_calc_fir_lpdsp32_block(BufferPtr *ptr_fir_lms_delay_line, BufferPtr *ptr_fir_lms_coeffs, int chess_storage(DMB) *out){
//void fir(int *y, int *x, int *h)
    static int chess_storage(DMA) *p_x; // read pointer into the delay line
    static int chess_storage(DMA) *p_h; // read pointer into the filter coefficients
    static int chess_storage(DMB) *p_y; // write pointer into the output block

    p_y = out;

    int *px_start = ptr_fir_lms_delay_line->ptr_start;
    int *ph_start = ptr_fir_lms_coeffs->ptr_current;
    int delay_line_len = ptr_fir_lms_delay_line->buffer_len;
    int n_coeff = ptr_fir_lms_coeffs->buffer_len;

    int coef1, coef2;
    int dat1, dat2;

    for(unsigned int n=0; n<BLOCK_LEN; n+=2) chess_loop_range(1,){
        //p_x = x + n;
        // NOTE(review): the read position is derived from ptr_start (+ n), not from ptr_current -
        // confirm this matches where sig_circular_buffer_ptr_put_block() stored the newest block.
        p_x = cyclic_add(px_start, n, px_start, delay_line_len);
        p_h = ph_start;

        // load the first coefficient pair (PLATFORM_GENERIC passes pointers, the target build passes the variables)
        #ifdef PLATFORM_GENERIC
            lldecompose(*((long long *)p_h), &coef1, &coef2);
        #else
            lldecompose(*((long long *)p_h), coef1, coef2);
        #endif
        p_h+=2;
        // load the first data pair (note the swapped order: dat2 first, then dat1) and step back by two samples
        #ifdef PLATFORM_GENERIC
            lldecompose(*((long long *)p_x), &dat2, &dat1);
        #else
            lldecompose(*((long long *)p_x), dat2, dat1);
        #endif
        p_x = cyclic_add(p_x, -2, px_start, delay_line_len);

        // two accumulators, one per output sample of this pair
        accum_t sum1 = fract_mult(dat1, coef1);
        accum_t sum2 = fract_mult(dat2, coef1);
        sum1 += fract_mult(dat2 , coef2);
        sum1 = to_accum(rnd_saturate(sum1)); // re-quantize the running sum through rnd_saturate
        // remaining coefficient pairs (n_coeff/2 - 1 iterations)
        for(int k=2; k < n_coeff; k+=2) chess_loop_range(1,){
            #ifdef PLATFORM_GENERIC
                lldecompose(*((long long *)p_x), &dat2, &dat1);
            #else
                lldecompose(*((long long *)p_x), dat2, dat1);
            #endif
            p_x = cyclic_add(p_x, -2, px_start, delay_line_len);

            // sum2 still needs the second coefficient of the previous pair with the newly loaded dat1
            sum2 += fract_mult(dat1, coef2);
            sum2 = to_accum(rnd_saturate(sum2));

            #ifdef PLATFORM_GENERIC
                lldecompose(*((long long *)p_h), &coef1, &coef2);
            #else
                lldecompose(*((long long *)p_h), coef1, coef2);
            #endif
            p_h+=2;

            sum1 += fract_mult(dat1, coef1);
            sum2 += fract_mult(dat2, coef1);
            sum1 += fract_mult(dat2, coef2);
            sum1 = to_accum(rnd_saturate(sum1));
        }
        // one more data pair is loaded to finish sum2 with the last coefficient
        #ifdef PLATFORM_GENERIC
            lldecompose(*((long long *)p_x), &dat2, &dat1);
        #else
            lldecompose(*((long long *)p_x), dat2, dat1);
        #endif
        sum2 += fract_mult(dat1, coef2);
        sum2 = to_accum(rnd_saturate(sum2));

        // store the two results of this pair: sum2 first, then sum1
        *p_y++ = extract_high(sum2);
        *p_y++ = extract_high(sum1);
    }
}
/* Straightforward block FIR variant: for every second sample position of the block it
 * accumulates delay-line samples times coefficients (two products per inner step) and writes
 * one result to out.
 * NOTE(review): the outer loop advances n by two but stores only one output per iteration, so
 * only BLOCK_LEN/2 entries of out are written - confirm whether this variant is complete.
 * It is not called from calc() in this file. */
void sig_calc_fir_generic_block(BufferPtr *ptr_fir_lms_delay_line, BufferPtr *ptr_fir_lms_coeffs, int chess_storage(DMB) *out){
    static int chess_storage(DMA) *p_x; // read pointer into the delay line
    static int chess_storage(DMA) *p_h; // pointer to the start of the filter coefficients
    static int chess_storage(DMB) *p_y; // write pointer into the output block

    static int coef1, coef2; // not used in this function
    static int dat1, dat2;   // not used in this function

    p_x = ptr_fir_lms_delay_line->ptr_current;
    p_h = ptr_fir_lms_coeffs->ptr_current;
    p_y = out;

    for(int n=0; n<BLOCK_LEN; n+=2)
    {
        // start reading n samples after the current delay-line position
        p_x = cyclic_add(ptr_fir_lms_delay_line->ptr_current, n, ptr_fir_lms_delay_line->ptr_start, ptr_fir_lms_delay_line->buffer_len); // can be done in increments of two, assuming the buffer pointer increment is even
        accum_t sum = to_accum(0);
        for(int k=0; k < ptr_fir_lms_coeffs->buffer_len; k+=2) chess_loop_range(1,)
        {
            sum += fract_mult(p_x[0] , p_h[k]);
            sum += fract_mult(p_x[1] , p_h[k+1]);

            sum = to_accum(rnd_saturate(sum)); // re-quantize the running sum through rnd_saturate
            p_x = cyclic_add(p_x, -2, ptr_fir_lms_delay_line->ptr_start, ptr_fir_lms_delay_line->buffer_len); // can be done in increments of two, assuming the buffer pointer increment is even
        }
    *p_y++ = extract_high(sum);
    }
}
/* LMS coefficient update for block processing.
 * "out" is an input of this function: it is the output sample of the fir_lms filter system
 * that is used for the adaption. Each coefficient is updated as
 *     h += rnd_saturate(mu * out) * x
 * where x walks backwards through the delay line, starting at its current pointer.
 * Coefficients and samples are accessed in pairs through a long long cast, so both pointers
 * must refer to long-long-aligned pairs. */
void adapt_coeffs_lpdsp32_block(BufferPtr *ptr_fir_lms_delay_line, BufferPtr *ptr_fir_lms_coeffs, int out){ // only works for even delay line sample pointers!!

    int *p_x = ptr_fir_lms_delay_line->ptr_current; // pointer to the start of the last added block - TODO: doublecheck this - might be wrong because the pointer actually points to the end of the block!
    int *p_x_start = ptr_fir_lms_delay_line->ptr_start;
    int *p_h = ptr_fir_lms_coeffs->ptr_current; // pointer to the start of the filter coefficients
    int delay_line_len = ptr_fir_lms_delay_line->buffer_len;
    int n_coeff = ptr_fir_lms_coeffs->buffer_len;
    int prod0, x0, x1, h0, h1;

    // Calculate the first term of the coefficient adaption (step size times output sample)
    accum_t acc_C = fract_mult(mu, out);
    prod0 = rnd_saturate(acc_C);
    //acc_D = fract_mult(mu, out1);
    //prod1 = rnd_saturate(acc_C);
    for (int i=0; i<n_coeff; i+=2) chess_loop_range(1, ){
        // Calculate the coefficient wise adaption

        // utilize dual load and dual pointer update
        // load the current coefficient pair
        #ifdef PLATFORM_GENERIC
            lldecompose(*((long long *)p_h), &h0, &h1);
        #else
            lldecompose(*((long long *)p_h), h0, h1);
        #endif

        // initialize the accumulators with the old coefficients
        accum_t acc_A = to_accum(h0);
        accum_t acc_B = to_accum(h1);

        // load the sample pair and step two samples back in the delay line
        #ifdef PLATFORM_GENERIC
            lldecompose(*((long long *)p_x), &x0, &x1);
        #else
            lldecompose(*((long long *)p_x), x0, x1);
        #endif
        p_x = cyclic_add(p_x, -2, p_x_start, delay_line_len); // can be done in increments of two, assuming the buffer pointer increment is even

        // calculate the adaptions and accumulate them onto the old coefficients
        acc_A += fract_mult(prod0, x0); // TODO: This can be further optimized by using all 4 available accums!
        acc_B += fract_mult(prod0, x1);
        // update the current filter coefficients - dual rnd_sat; dual store
        *((long long *)p_h) = llcompose(rnd_saturate(acc_A), rnd_saturate(acc_B));// load/store hazard ! - 1nop
        p_h+=2;
    }
}
#else

/* Sample-by-sample FIR filter: returns the sum of delay-line samples times coefficients.
 * The delay line is read backwards (one sample per step, with wraparound) starting at its
 * current pointer, the coefficients forwards starting at the coefficient buffer's current pointer.
 * Two taps are processed per loop iteration into two separate accumulators, so the coefficient
 * count is processed as an even number. */
int inline sig_calc_fir_lpdsp32_single(BufferPtrDMB chess_storage(DMB) *ptr_fir_lms_delay_line, BufferPtr *ptr_fir_lms_coeffs){

    // Calculate the fir filter output on x to get the canceller
    int chess_storage(DMB) *p_x0 = ptr_fir_lms_delay_line->ptr_current; // chess_storage(DMB)
    int chess_storage(DMB) *px_start = ptr_fir_lms_delay_line->ptr_start;
    int *p_h = ptr_fir_lms_coeffs->ptr_current;
    int delay_line_len = ptr_fir_lms_delay_line->buffer_len;
    int n_coeff = ptr_fir_lms_coeffs->buffer_len;

    int d0,d1,h0,h1;
    accum_t acc1_A = to_accum(0);
    accum_t acc1_B = to_accum(0);
    accum_t acc1_C;

    // iterate over the coefficients to calculate the filter on x - the canceller
    /* Estimated cycles per 2 coefficients:
    dual - load : 1
    dual mac and dual load: 1
    -> 48/2 * 2 = 48 cycles for 48 coefficients
    */
    for (int i=0; i < n_coeff; i+=2) chess_loop_range(1,){
        // Use dual load and dual pointer update
        d0 = *p_x0;
        h0 = *p_h;
        p_h++;
        p_x0 = cyclic_add(p_x0, -1, px_start, delay_line_len);

        d1 = *p_x0;
        h1 = *p_h;
        p_h++;
        p_x0 = cyclic_add(p_x0, -1, px_start, delay_line_len);

        acc1_A+=fract_mult(d0, h0);
        acc1_B+=fract_mult(d1, h1);
        // outside LPDSP16 builds both running sums are re-quantized through rnd_saturate after every step
        #ifndef LPDSP16
        acc1_A = to_accum(rnd_saturate(acc1_A));
        acc1_B = to_accum(rnd_saturate(acc1_B));
        #endif
        
    }
    // Calculate the output sample from both partial sums
    acc1_C = acc1_A + acc1_B;
    //out32 = rnd_saturate(acc1_A);
    #ifdef LPDSP16
    return rnd_saturate16(acc1_C);
    #else
    return rnd_saturate(acc1_C);
    #endif 
}

/* LMS coefficient update for sample-by-sample processing.
 * "out" is the output sample of the fir_lms filter system used for the adaption. Each coefficient is updated as
 *     h += prod * x,  prod = mu * out converted back to a sample
 * Coefficients are loaded and stored in pairs through a long long cast (starting at the
 * coefficient buffer's ptr_start); the matching samples are read through two pointers that
 * start at the delay line's current position and one sample before it, each stepping back by
 * two samples per iteration. */
void static inline adapt_coeffs_lpdsp32_single_v1(BufferPtrDMB chess_storage(DMB) *ptr_fir_lms_delay_line, BufferPtr *ptr_fir_lms_coeffs, int out){

    int chess_storage(DMA) *p_h0 = ptr_fir_lms_coeffs->ptr_start; //coeff load pointer
    //int chess_storage(DMA) *p_h1 = ptr_fir_lms_coeffs->ptr_start; //coeff store pointer
    int chess_storage(DMB) *p_x0 = ptr_fir_lms_delay_line->ptr_current; // chess_storage(DMB)
    int chess_storage(DMB) *p_x1 = ptr_fir_lms_delay_line->ptr_current; // chess_storage(DMB)

    // p_x1 runs one sample behind p_x0
    p_x1 = cyclic_add(p_x1, -1, ptr_fir_lms_delay_line->ptr_start, ptr_fir_lms_delay_line->buffer_len);

    int prod, x0, x1, h0, h1; // x0 and x1 are not used in this function
    int chess_storage(DMB) *px_start = ptr_fir_lms_delay_line->ptr_start;
    int delay_line_len = ptr_fir_lms_delay_line->buffer_len;
    int n_coeff = ptr_fir_lms_coeffs->buffer_len;

    accum_t acc_A, acc_B;

    // Calculate the first term of the coefficient adaption (step size times output sample)
    accum_t acc_C = fract_mult(mu, out);
    #ifdef LPDSP16
    prod = rnd_saturate16(acc_C);
    #else
    prod = rnd_saturate(acc_C);
    #endif
    /* Estimated cycles per 2 coefficients:
    dual load coeffs: 1
    single load tab value: 2
    dual mac: 1
    dual rnd_sat - store: 1
    load/store hazard nop: 1
    */
    for (int i=0; i< n_coeff; i+=2) chess_loop_range(1,){
        // Calculate the coefficient wise adaption
        #ifdef PLATFORM_GENERIC
            lldecompose(*((long long *)p_h0), &h0, &h1);
        #else
            lldecompose(*((long long *)p_h0), h0, h1);
        #endif

        // initialize the accumulators with the old coefficients
        acc_A = to_accum(h0);
        acc_B = to_accum(h1);
        
        // on LPDSP16 the product is shifted left by 16 before it is accumulated
        #ifdef LPDSP16
        acc_A += fract_mult(prod, *p_x0) << 16; // TODO: This could be further optimized by using all 4 available accums?
        acc_B += fract_mult(prod, *p_x1) << 16;
        #else
        acc_A += fract_mult(prod, *p_x0); // TODO: This could be further optimized by using all 4 available accums?
        acc_B += fract_mult(prod, *p_x1);
        #endif
         
        p_x0 = cyclic_add(p_x0, -2, px_start, delay_line_len);
        p_x1 = cyclic_add(p_x1, -2, px_start, delay_line_len);

        // update the current filter coefficients - dual sat; dual store
        *((long long *)p_h0) = llcompose(rnd_saturate(acc_A), rnd_saturate(acc_B));//load/store hazard ! - 1 nop is needed
        p_h0+=2;
    }
}

/* Leaky LMS coefficient update for sample-by-sample processing.
 * Same structure as adapt_coeffs_lpdsp32_single_v1(), but every old coefficient is first
 * multiplied by the file-level "leak" factor:
 *     h = h * leak + prod * x,  prod = mu * out converted back to a sample
 * "out" is the output sample of the fir_lms filter system used for the adaption. */
void static inline adapt_coeffs_lpdsp32_single_leaky(BufferPtrDMB chess_storage(DMB) *ptr_fir_lms_delay_line, BufferPtr *ptr_fir_lms_coeffs, int out){

    int chess_storage(DMA) *p_h0 = ptr_fir_lms_coeffs->ptr_start; //coeff load pointer
    //int chess_storage(DMA) *p_h1 = ptr_fir_lms_coeffs->ptr_start; //coeff store pointer
    int chess_storage(DMB) *p_x0 = ptr_fir_lms_delay_line->ptr_current; // chess_storage(DMB)
    int chess_storage(DMB) *p_x1 = ptr_fir_lms_delay_line->ptr_current; // chess_storage(DMB)

    // p_x1 runs one sample behind p_x0
    p_x1 = cyclic_add(p_x1, -1, ptr_fir_lms_delay_line->ptr_start, ptr_fir_lms_delay_line->buffer_len);

    int prod, x0, x1, h0, h1; // x0 and x1 are not used in this function
    int chess_storage(DMB) *px_start = ptr_fir_lms_delay_line->ptr_start;
    int delay_line_len = ptr_fir_lms_delay_line->buffer_len;
    int n_coeff = ptr_fir_lms_coeffs->buffer_len;

    accum_t acc_A, acc_B;

    // Calculate the first term of the coefficient adaption (step size times output sample)
    accum_t acc_C = fract_mult(mu, out);
    #ifdef LPDSP16
    prod = rnd_saturate16(acc_C);
    #else
    prod = rnd_saturate(acc_C);
    #endif

    for (int i=0; i< n_coeff; i+=2) chess_loop_range(1,){
        // Calculate the coefficient wise adaption
        #ifdef PLATFORM_GENERIC
            lldecompose(*((long long *)p_h0), &h0, &h1);
        #else
            lldecompose(*((long long *)p_h0), h0, h1);
        #endif

        // start from the leaked (scaled-down) old coefficients
        acc_A = fract_mult(h0, leak); // leaky
        acc_B = fract_mult(h1, leak);
        
        acc_A += fract_mult(prod, *p_x0); // TODO: This could be further optimized by using all 4 available accums?
        acc_B += fract_mult(prod, *p_x1);
      
        p_x0 = cyclic_add(p_x0, -2, px_start, delay_line_len);
        p_x1 = cyclic_add(p_x1, -2, px_start, delay_line_len);

        // update the current filter coefficients - dual sat; dual store
        // (LPDSP16 converts with rnd_saturate16 here, whereas the non-leaky v1 variant shifts the product instead)
        #ifdef LPDSP16
        *((long long *)p_h0) = llcompose(rnd_saturate16(acc_A), rnd_saturate16(acc_B));//load/store hazard ! - 1 nop is needed
        #else
        *((long long *)p_h0) = llcompose(rnd_saturate(acc_A), rnd_saturate(acc_B));//load/store hazard ! - 1 nop is needed
        #endif
        p_h0+=2;
    }
}

/* Plain (one coefficient per step) LMS update for sample-by-sample processing:
 *     h[i] += rnd_saturate(mu * out) * x
 * with x walking backwards through the delay line from its current pointer.
 * The loop count is taken from the delay line's buffer_len. */
void adapt_coeffs_generic_single(BufferPtrDMB chess_storage(DMB) *ptr_fir_lms_delay_line, BufferPtr *ptr_fir_lms_coeffs, int out){
    int *coeffs = ptr_fir_lms_coeffs->ptr_start; //coeff load pointer
    int chess_storage(DMB) *sample = ptr_fir_lms_delay_line->ptr_current; // chess_storage(DMB)

    // Calculate the first term of the coefficient adaption
    accum_t scaled_error = fract_mult(mu, out);
    int step = rnd_saturate(scaled_error);

    for (int idx=0; idx< ptr_fir_lms_delay_line->buffer_len; idx++){
        // old coefficient plus the adaption for this tap
        accum_t updated = to_accum(coeffs[idx]);
        updated += fract_mult(step, *sample);
        coeffs[idx]=rnd_saturate(updated);
        // move on to the next older sample
        sample = cyclic_add(sample, -1, ptr_fir_lms_delay_line->ptr_start, ptr_fir_lms_delay_line->buffer_len);
    }
}
#endif

/**
 * Initialize both signal paths and the adaptive FIR/LMS filter.
 *
 * @param cSensorSignal       signal path of the cSensor channel
 * @param accSensorSignal     signal path of the accSensor channel
 * @param b_c, b_acc          five pre-emphasis coefficients per channel (b0, b1, b2, a1, a2)
 * @param delay_c, delay_acc  delay in samples per channel
 * @param weight_c, weight_acc  weight per channel (1.0 deactivates the weighting)
 * @param lms_mu              LMS step size, stored in fixed point in the file-level "mu"
 * @param lms_fir_num_coeffs  number of FIR coefficients; limited to [0, MAX_FIR_COEFFS]
 *
 * Both channels use the same fixed-point scale (15 bits on LPDSP16, 31 bits otherwise); the
 * acc weight previously used a hard-coded 31 regardless of the platform.
 * The coefficient count is bounded here because the return codes of the buffer
 * initializations are not propagated, and an oversized count would overrun the static arrays.
 * sig_init_buffer()/sig_init_buffer_DMB() already zero the delay line and the coefficients, so
 * no additional clearing is done.
 */
void init(
    SingleSignalPath *cSensorSignal,
    SingleSignalPath *accSensorSignal,
    //BufferPtrDMB *ptr_fir_lms_delay_line,
    //BufferPtr *ptr_fir_lms_coeffs,
    double *b_c,
    double *b_acc,
    int delay_c,
    int delay_acc,
    double weight_c,
    double weight_acc,
    double lms_mu,
    int lms_fir_num_coeffs
    ){
    #ifdef LPDSP16
    int scale_bits=15;
    #else
    int scale_bits=31;
    #endif

    // keep the filter length inside the statically allocated arrays
    int n_coeffs = lms_fir_num_coeffs;
    if (n_coeffs > MAX_FIR_COEFFS) {
        n_coeffs = MAX_FIR_COEFFS;
    }
    if (n_coeffs < 0) {
        n_coeffs = 0;
    }

    // Initialize the cSensor signal subpath
    sig_init_preemph_coef(cSensorSignal, b_c[0], b_c[1], b_c[2], b_c[3], b_c[4], scale_bits);
    sig_init_delay(cSensorSignal, delay_c);
    sig_init_weight(cSensorSignal, weight_c, scale_bits);

    // Initialize the accSensor signal subpath
    sig_init_preemph_coef(accSensorSignal, b_acc[0], b_acc[1], b_acc[2], b_acc[3], b_acc[4], scale_bits);
    sig_init_delay(accSensorSignal, delay_acc);
    sig_init_weight(accSensorSignal, weight_acc, scale_bits);

    // initialize the lms filter parameters
    int scale = pow(2, scale_bits) - 1;
    mu = lms_mu * scale;
    // initialize the fir_lms buffers (this also zeroes the delay line and the coefficients)
    #if BLOCK_LEN == 1
        sig_init_buffer_DMB(&ptr_fir_lms_delay_line, fir_lms_delay_line, n_coeffs, MAX_FIR_COEFFS);
        sig_init_buffer(&ptr_fir_lms_coeffs, fir_lms_coeffs, n_coeffs, MAX_FIR_COEFFS);
    #else
        sig_init_buffer(&ptr_fir_lms_delay_line, fir_lms_delay_line, n_coeffs + BLOCK_LEN, BLOCK_LEN + MAX_FIR_COEFFS);
        sig_init_buffer(&ptr_fir_lms_coeffs, fir_lms_coeffs, n_coeffs, MAX_FIR_COEFFS);
    #endif
}

// Data d(cSensor) is signal + noise
// x (accSensor) is reference noise signal
/**
 * Process one block of BLOCK_LEN samples.
 *
 * @param cSensorSignal    signal path of the cSensor channel (only used when PRE_FILTER is defined)
 * @param accSensorSignal  signal path of the accSensor channel (only used when PRE_FILTER is defined)
 * @param output_mode      selects what is written to out_16 (see the switch below)
 * @param cSensor          BLOCK_LEN 16-bit input samples: signal + noise
 * @param accSensor        BLOCK_LEN 16-bit input samples: noise reference
 * @param out_16           receives BLOCK_LEN 16-bit output samples
 *
 * Changes compared to the previous revision:
 *  - the PRE_FILTER path passes the i-th sample to sig_calc_biquad() instead of the array itself;
 *  - OUTPUT_MODE_FIR_LMS_LEAKY only uses the sample-by-sample routines when they exist
 *    (BLOCK_LEN == 1); in block builds, where no leaky variant is implemented, the output is muted;
 *  - three unused static pointers were removed.
 */
void calc(
    SingleSignalPath *cSensorSignal,
    SingleSignalPath *accSensorSignal,
    // BufferPtrDMB *ptr_fir_lms_delay_line,
    // BufferPtr *ptr_fir_lms_coeffs,
    OutputMode output_mode,
    #if BLOCK_LEN != 1
    int16_t *cSensor,
    int16_t *accSensor,
    #else
    int16_t volatile chess_storage(DMB) *cSensor,
    int16_t volatile chess_storage(DMB) *accSensor,
    #endif

    int16_t volatile chess_storage(DMB) *out_16 
    
    ){
    static int chess_storage(DMA) c_block_pre[BLOCK_LEN];   // cSensor block after the pre-processing stage
    static int chess_storage(DMA) acc_block_pre[BLOCK_LEN]; // accSensor block after the pre-processing stage
    static int chess_storage(DMA) cSensor_32[BLOCK_LEN];    // cSensor input widened to int
    static int chess_storage(DMA) accSensor_32[BLOCK_LEN];  // accSensor input widened to int
    
    static int chess_storage(DMB) acc_block_filt[BLOCK_LEN]; // FIR-filtered accSensor block (the canceller)
    static int chess_storage(DMB) out_32[BLOCK_LEN];         // output block before the conversion to 16 bit

    // Widen the inputs; outside LPDSP16 builds they are additionally shifted up by BITSHIFT_16_TO_32
    #ifdef LPDSP16
    for (uint32_t i=0; i<BLOCK_LEN; i++) chess_loop_range(1,){
        cSensor_32[i]=  (int) cSensor[i] ;
        accSensor_32[i]= (int) accSensor[i];
    }
    
    #else //LPDSP32
    for (uint32_t i=0; i<BLOCK_LEN; i++) chess_loop_range(1,){
        cSensor_32[i] =  ((int) cSensor[i]) << BITSHIFT_16_TO_32;
        accSensor_32[i] = ((int) accSensor[i]) << BITSHIFT_16_TO_32;
    }    
    #endif 
  
    // Calculate the pre emphasis filter, delay and weight of each channel (only with PRE_FILTER defined)
    //#define PRE_FILTER
    #ifdef PRE_FILTER
    int x_csensor_emph, x_accsensor_emph, x_csensor_emph_delay, x_accsensor_emph_delay;
    for (uint32_t i=0; i<BLOCK_LEN; i++) chess_loop_range(1,){
         x_csensor_emph = sig_calc_biquad(cSensorSignal, cSensor_32[i]);
         x_accsensor_emph = sig_calc_biquad(accSensorSignal, accSensor_32[i]);
         x_csensor_emph_delay = sig_delay_buffer_load_and_get(cSensorSignal, x_csensor_emph);
         x_accsensor_emph_delay = sig_delay_buffer_load_and_get(accSensorSignal, x_accsensor_emph);
         c_block_pre[i] = sig_calc_weight(cSensorSignal, x_csensor_emph_delay);
         acc_block_pre[i] = sig_calc_weight(accSensorSignal, x_accsensor_emph_delay);
    }
    #else
    for (uint32_t i=0; i<BLOCK_LEN; i++) chess_loop_range(1,){
         c_block_pre[i] = cSensor_32[i];
         acc_block_pre[i] = accSensor_32[i];
    }
    #endif
    
    // Calculate the output in dependency of the selected output mode
    switch (output_mode)
    {
    case OUTPUT_MODE_C_SENSOR: // pass the cSensor channel through
        for (uint32_t i=0; i<BLOCK_LEN; i++){
            out_32[i] = c_block_pre[i];
        }
        break;
    case OUTPUT_MODE_ACC_SENSOR: // pass the accSensor channel through
        for (uint32_t i=0; i<BLOCK_LEN; i++){
            out_32[i] = acc_block_pre[i];
        }
        break;
    case OUTPUT_MODE_FIR: //output filtered cSensor signal
        #if BLOCK_LEN == 1
            // Increment the buffer pointer and set the current sample to the delay line
            sig_cirular_buffer_ptr_put_sample_DMB(&ptr_fir_lms_delay_line, c_block_pre[0]);
            out_32[0] = sig_calc_fir_lpdsp32_single(&ptr_fir_lms_delay_line, &ptr_fir_lms_coeffs);

        #else // Block processing
            // Put the next block to the buffer
            sig_circular_buffer_ptr_put_block(&ptr_fir_lms_delay_line, c_block_pre);
            // Calculate the fir filter output
            sig_calc_fir_lpdsp32_block(&ptr_fir_lms_delay_line, &ptr_fir_lms_coeffs, out_32);
        #endif
        break;
    case OUTPUT_MODE_FIR_LMS: // apply lms filter on cSensor signal
        #if BLOCK_LEN == 1
            // Increment the buffer pointer and set the current sample to the delay line
            sig_cirular_buffer_ptr_put_sample_DMB(&ptr_fir_lms_delay_line, acc_block_pre[0]);

            // Calculate the fir filter output on acc to get the canceller
            acc_block_filt[0]= sig_calc_fir_lpdsp32_single(&ptr_fir_lms_delay_line, &ptr_fir_lms_coeffs);
            // Calculate the output signal by c_block_pre - acc_block_filt
            out_32[0] = c_block_pre[0] - acc_block_filt[0];
            // Calculate the coefficient adaptation
            //TODO: only adapt every n-th call (file-level "counter") and make it configurable
            adapt_coeffs_lpdsp32_single_v1(&ptr_fir_lms_delay_line, &ptr_fir_lms_coeffs, out_32[0]);
        #else // Block processing
            // Put the next block to the buffer
            sig_circular_buffer_ptr_put_block(&ptr_fir_lms_delay_line, acc_block_pre);
            // Calculate the fir filter output on acc to get the canceller
            sig_calc_fir_lpdsp32_block(&ptr_fir_lms_delay_line, &ptr_fir_lms_coeffs, acc_block_filt);

            // Calculate the output signal by c_block_pre - acc_block_filt
            for (int i=0; i<BLOCK_LEN; i++) chess_flatten_loop
            {
                out_32[i] = c_block_pre[i] - acc_block_filt[i]; // 15 cycles with 4 samples/block
            }
            // adapt the coefficients once per block, using the first output sample of the block
            adapt_coeffs_lpdsp32_block(&ptr_fir_lms_delay_line, &ptr_fir_lms_coeffs, out_32[0]);
        #endif
        break;
    case OUTPUT_MODE_FIR_LMS_LEAKY: // apply leaky lms filter on cSensor signal
        #if BLOCK_LEN == 1
            // Increment the buffer pointer and set the current sample to the delay line
            sig_cirular_buffer_ptr_put_sample_DMB(&ptr_fir_lms_delay_line, acc_block_pre[0]);

            // Calculate the fir filter output on acc to get the canceller
            acc_block_filt[0]= sig_calc_fir_lpdsp32_single(&ptr_fir_lms_delay_line, &ptr_fir_lms_coeffs);
            // Calculate the output signal by c_block_pre - acc_block_filt
            out_32[0] = c_block_pre[0] - acc_block_filt[0];
            // Calculate the coefficient adaptation
            adapt_coeffs_lpdsp32_single_leaky(&ptr_fir_lms_delay_line, &ptr_fir_lms_coeffs, out_32[0]);
        #else // Block processing
            // The leaky update only exists as a sample-by-sample routine; mute the output in block builds
            for (uint32_t i=0; i<BLOCK_LEN; i++){
                out_32[i] = 0;
            }
        #endif
        break;
    default: // MUTED
        for (uint32_t i=0; i<BLOCK_LEN; i++){
            out_32[i] = 0;
        }
        break;
    }
    // TODO: Add a couple of biquads after ANC
    // Convert the output block back to 16 bit
    for (uint32_t i=0; i<BLOCK_LEN; i++) chess_flatten_loop
    {
        #ifdef LPDSP16
        out_16[i] = (int16_t) out_32[i];
        #else
        out_16[i] = rnd_saturate(to_accum(out_32[i]) >> BITSHIFT_16_TO_32); // 12 cycles for blocksize 4 //TODO: use rnd_saturate(out_32[i] >> input_nbit_bitshift)
        #endif 
    }
    //out_16[0] = cSensor[0];
}