/*  2 stage CIC filter with IIR compensation.  first stage is 2 integrators, with 9 taps on
 *  the differentiators, decimation by 8.  lookup tables are used to do the integrations
 *  quickly.  a lookup table is required per integration stage, if i get around to it I'll
 *  write up a formula for calculating those coefficients.  second stage is 3 integrators with
 *  2 taps on the differentiators, decimation by 8.  total decimation is 64, cutoff frequency
 *  of first stage is Fs/72, second stage is Fs/128.  IIR boosts the high frequencies,
 *  but still has 6dB loss at the high end.  cutoff frequency ends up being around Fs/180.
 *  transfer function is 1/(1+z^-1+0.75z^-2).  SNR tops out around 60dB for very loud signals.
 *  gain is set by the bit shifting at the end.  a higher order first stage integrator or tap
 *  number would be better, but was unstable with my microphone, which had a large DC offset.
 *  maybe different microphones would allow for this, or some sort of DC removal integrated
 *  into the first integration stage.  IIR high pass filters were unstable for this.  full
 *  transfer function is:
 *  -10log((1+cos(2pi*64x)+0.75cos(4pi*64x))^2+(-sin(2pi*64x)-0.75sin(4pi*64x))^2)+3*10log((sin(16pi*8x)/sin(pi*8x))^2)+2*10log((sin(72pi*x)/sin(pi*x))^2)
 *  
 *  lots of places where the code needs to get cleaned up.  lookup tables should be precompiled.
 *  execution time is 3us per sample.
*/
// NOTE(review): SAMD51_PDM is not referenced anywhere else in this file; presumably it is a
// build switch read by an included header or another source file - confirm before removing.
#define SAMD51_PDM 1

#include <Arduino.h>

#include <Adafruit_ZeroI2S.h>
#include <math.h>

/* max volume for 32 bit data (0x7FFFFFFF, the largest positive signed 32 bit value).
 * Not referenced elsewhere in this file. */
#define VOLUME ( (1UL << 31) - 1)

/* BUFSIZE is not referenced elsewhere in this file.
 * left and right receive the two 32 bit words fetched by i2s.read() in loop(); loop()
 * consumes each of them 8 bits at a time, lowest byte first, shifting them down as it goes. */
#define BUFSIZE 1
uint32_t left;
uint32_t right;

// bitsum[b] is the number of 1 bits in the byte value b (0..8).
// this table is extraneous: it is only read by setup() to fill bitsum2, never by loop().
const byte bitsum[256] = { \
0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4, \
1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5, \
1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5, \
2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6, \
1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5, \
2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6, \
2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6, \
3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7, \
1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5, \
2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6, \
2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6, \
3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7, \
2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6, \
3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7, \
3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7, \
4,5,5,6,5,6,6,7,5,6,6,7,6,7,7,8};

// bitsum2[b] = 2*(bitsum[b]-4), i.e. (number of 1 bits) - (number of 0 bits) in byte b.
// Filled in setup(); loop() adds it to runningsum once per input byte.
int8_t bitsum2[256]; // to do: should be stored as const and not compiled in setup
// bitsum3[b] = sum over bit positions p = 1..7 of (-p if bit p of b is set, otherwise +p).
// Filled in setup(); loop() adds it to integrator1 together with (runningsum << 3).
int8_t bitsum3[256] = {0};


// I2S receiver object: started and RX-enabled in setup(), polled and read in loop().
// Use default pins in board variant
Adafruit_ZeroI2S i2s = Adafruit_ZeroI2S();

/* One-time initialisation: builds the two per-byte lookup tables used by the first
 * integrator pair in loop(), then starts the I2S receiver and configures the timing pin.
 *
 *   bitsum2[b] = 2 * (bitsum[b] - 4)  -> ones minus zeros in byte b (range -8..8)
 *   bitsum3[b] = sum over bit positions p = 1..7 of (-p if bit p is set, else +p)
 *                (range -28..28)
 */
void setup() {

  // An int counter covers all 256 byte values in one loop; a byte counter cannot express
  // the end condition and would need index 255 handled separately.
  for (int value = 0; value < 256; value++) {
    // Multiply instead of shifting: the operand is negative for bytes with fewer than
    // four set bits, and left-shifting a negative value is undefined before C++20.
    bitsum2[value] = (bitsum[value] - 4) * 2;

    // Accumulate locally and store once, so the result does not depend on what the
    // table held before this call.
    int weighted = 0;
    for (int bit = 1; bit < 8; bit++) {
      if ((value >> bit) & 1) {
        weighted -= bit;
      }
      else {
        weighted += bit;
      }
    }
    bitsum3[value] = weighted;
  }

  i2s.begin(I2S_32_BIT, 44100);
  i2s.enableRx();
  pinMode(9,OUTPUT); // pin 9 is toggled in loop() as an execution-time probe
}

// Persistent state for loop().  Everything here survives between calls.

// Call toggle: loop() decrements it, runs the filter when the result is non-zero and then
// sets it to 1, so the filter runs on every second call (starting with the first).
byte data = 0;

int32_t runningsum = 0;  // first integrator of stage 1 (updated per byte via bitsum2)
int32_t integrator1 = 0; // second integrator of stage 1 (updated per byte via bitsum3)
int32_t integrator2 = 0; // stage 2 integrators, updated once per input byte
int32_t integrator3 = 0;
int32_t integrator4 = 0;
int32_t integrator5 = 0; // not referenced in this file
byte j = 0;              // index into olddata2/3/4; wraps at 2
int32_t olddata[9] = {0};  // stage 1, first comb delay line (indexed by k)
int32_t olddata2[8] = {0}; // stage 2 comb delay lines (indexed by j); only entries 0..1
int32_t olddata3[8] = {0}; // are ever touched because j wraps at 2
int32_t olddata4[8] = {0};
int32_t olddata5[9] = {0}; // stage 1, second comb delay line (indexed by k)
int32_t olddata6[9] = {0}; // not referenced in this file
int16_t overflow = 0;      // not referenced in this file
int32_t temp;            // scratch: output of the first stage 1 comb
int32_t temp5;           // scratch: output of the second stage 1 comb
int32_t temp6 = 0;       // previous output of the IIR compensation filter
int32_t temp7 = 0;       // output of the IIR compensation filter two steps back
byte k = 0;              // index into olddata/olddata5; wraps at 9

void loop() {

    while(!(i2s.rxReady())) ; // wait for data
    i2s.read(&left, &right); // fetch a word

digitalWrite(9,1); // speed test
    data--;
    if (data == 0) right = left;
    else {
      for(byte i=4; i>0; i--) { // 2 integrator stages done with lookup tables
        runningsum += bitsum2[(right&0xff)];
        integrator1 += (runningsum<<3)+bitsum3[(right&0xff)];
        right >>= 8;
        temp = integrator1 - olddata[k];
        olddata[k] = integrator1;
        temp5 = temp - olddata5[k];
        olddata5[k] = temp;
        k++;
        if (k == 9) k =0; // 9 tap delay line - more would be better, but it gets unstable
        integrator2 +=  temp5;
        integrator3 += integrator2;
        integrator4 += integrator3;
      }
      for(byte i=4; i>0; i--) {
        runningsum += bitsum2[(left&0xff)];
        integrator1 += (runningsum<<3)+bitsum3[(left&0xff)];
        left >>= 8;
        temp = integrator1 - olddata[k];
        olddata[k] = integrator1;
        temp5 = temp - olddata5[k];
        olddata5[k] = temp;
        k++;
        if (k == 9) k =0;
        integrator2 +=  temp5; // 3 more integrator stages
        integrator3 += integrator2;
        integrator4 += integrator3;
      }
      int32_t temp2 = integrator4 - olddata2[j]; // 2 tap delay line
      olddata2[j] = integrator4;
      int32_t temp3 = temp2 - olddata3[j];
      olddata3[j] = temp2;
      int32_t temp4 = temp3 - olddata4[j];
      olddata4[j] = temp3;
      j++;
      if (j == 2) j=0;
      temp4 -= temp6 + (temp7 - (temp7 >> 2)); // simple IIR compensation filter
      temp7 = temp6;
      temp6 = temp4;
 digitalWrite(9,0);
      analogWrite(A0, ((temp4 >> 6) + 0x0800)); // adjust gain here
      data = 1;
    }
 digitalWrite(9,0);
}
