xiph · baillyge · Apr 7, 2019 · Apr 7, 2019 · Apr 7, 2019 · Apr 7, 2019
diff --git a/src/dump_data.c b/src/dump_data.c
@@ -41,6 +41,52 @@
 #include <assert.h>
 #include "lpcnet.h"
 #include "lpcnet_private.h"
+#include "opus.h"
+
+float preemph_offset[NB_BANDS] = {1.772676, 2.937053, 0.278042, 0.299267, 0.126341, 0.060082, 0.019509, -0.017281, 0.000530, -0.000156, -0.007375, -0.010533, -0.002903, -0.005244, -0.003251, -0.000492, -0.000174, -0.004998};
+/* Writes NB_BANDS values to bandE: triangular-weighted sums of 1/(|X|^2+1e-9), X = forward_transform of {1, -lpc[0..LPC_ORDER-1]}, scaled by .2*g*g/WINDOW_SIZE^4. */
+void compute_band_energy_from_lpc(float *bandE, float g, const float *lpc) {
+  int i;
+  float sum[NB_BANDS] = {0};  /* per-band accumulators */
+  float x[WINDOW_SIZE];  /* input buffer handed to forward_transform */
+  kiss_fft_cpx X[FREQ_SIZE];  /* transform output, indexed by bin below */
+  {
+      RNN_CLEAR(x, WINDOW_SIZE);  /* NOTE(review): presumably zero-fills x; macro is defined elsewhere */
+      x[0] = 1;
+      //x[1] = -PREEMPHASIS;
+      for (i=0;i<LPC_ORDER;i++) x[i+1] = -lpc[i];  /* taps 1..LPC_ORDER hold the negated LPC coefficients */
+      forward_transform(X, x);  /* helper defined elsewhere; fills X from x */
+  }
+#if 0  /* disabled debug output: prints 1/(1e-15+|X[i]|^2) for every bin */
+  for (i=0;i<FREQ_SIZE;i++) {
+      float E = SQUARE(X[i].r) + SQUARE(X[i].i);
+      printf("%g ", 1.f/(1e-15+E));
+  }
+  printf("\n");
+#endif
+  for (i=0;i<NB_BANDS-1;i++)  /* each pass handles the bins between band edges i and i+1 */
+  {
+    int j;
+    int band_size;
+    band_size = (eband5ms[i+1]-eband5ms[i])*WINDOW_SIZE_5MS;  /* number of bins between the two edges */
+    for (j=0;j<band_size;j++) {
+      float tmp;
+      float frac = (float)j/band_size;  /* 0 at edge i, rising towards (but not reaching) 1 at edge i+1 */
+      tmp = SQUARE(X[(eband5ms[i]*WINDOW_SIZE_5MS) + j].r);
+      tmp += SQUARE(X[(eband5ms[i]*WINDOW_SIZE_5MS) + j].i);
+      tmp = 1.f/(tmp + 1e-9);  /* inverse of the bin's squared magnitude; 1e-9 keeps the divisor non-zero */
+      sum[i] += (1-frac)*tmp;  /* share of this bin credited to band i */
+      sum[i+1] += frac*tmp;  /* remaining share credited to band i+1 */
+    }
+  }
+  sum[0] *= 2;  /* band 0 only receives the (1-frac) share from i=0; doubling presumably compensates */
+  sum[NB_BANDS-1] *= 2;  /* last band only receives the frac share from i=NB_BANDS-2; same doubling */
+  for (i=0;i<NB_BANDS;i++)
+  {
+    bandE[i] = sum[i];
+  }
+  for (i=0;i<NB_BANDS;i++) bandE[i] *= .2*g*g*(1.f/((float)WINDOW_SIZE*WINDOW_SIZE*WINDOW_SIZE*WINDOW_SIZE));  /* g enters squared, times a constant factor */
+}
 
 
 static void biquad(float *y, float mem[2], const float *x, const float *b, const float *a, int N) {
@@ -76,13 +122,16 @@ void compute_noise(int *noise, float noise_std) {
 
 void write_audio(LPCNetEncState *st, const short *pcm, const int *noise, FILE *file) {
   int i, k;
+  fwrite(pcm, 4*FRAME_SIZE, 2, file);
+  return;
   for (k=0;k<4;k++) {
   unsigned char data[4*FRAME_SIZE];
   for (i=0;i<FRAME_SIZE;i++) {
     float p=0;
     float e;
     int j;
     for (j=0;j<LPC_ORDER;j++) p -= st->features[k][2*NB_BANDS+3+j]*st->sig_mem[j];
+    //printf("%f\n", pcm[k*FRAME_SIZE+i] - p);
     e = lin2ulaw(pcm[k*FRAME_SIZE+i] - p);
     /* Signal. */
     data[4*i] = lin2ulaw(st->sig_mem[0]);
@@ -100,7 +149,7 @@ void write_audio(LPCNetEncState *st, const short *pcm, const int *noise, FILE *f
     st->sig_mem[0] = p + ulaw2lin(e);
     st->exc_mem = e;
   }
-  fwrite(data, 4*FRAME_SIZE, 1, file);
+  //fwrite(data, 4*FRAME_SIZE, 1, file);
   }
 }
 
@@ -128,6 +177,7 @@ int main(int argc, char **argv) {
   FILE *fpcm=NULL;
   short pcm[FRAME_SIZE]={0};
   short pcmbuf[FRAME_SIZE*4]={0};
+  float xbuf[FRAME_SIZE*4]={0};
   int noisebuf[FRAME_SIZE*4]={0};
   short tmp[FRAME_SIZE] = {0};
   float savedX[FRAME_SIZE] = {0};
@@ -140,7 +190,17 @@ int main(int argc, char **argv) {
   int training = -1;
   int encode = 0;
   int decode = 0;
+  int delay = TRAINING_OFFSET;
   int quantize = 0;
+  OpusEncoder *enc;
+  OpusDecoder *dec;
+  enc = opus_encoder_create(16000, 1, OPUS_APPLICATION_VOIP, NULL);
+  opus_encoder_ctl(enc, OPUS_SET_BITRATE(6000));
+  opus_encoder_ctl(enc, OPUS_SET_BANDWIDTH(OPUS_BANDWIDTH_WIDEBAND));
+  opus_encoder_ctl(enc, OPUS_GET_LOOKAHEAD(&delay));
+  delay = 160;
+  fprintf(stderr, "delay is %d\n", delay);
+  dec = opus_decoder_create(16000, 1, NULL);
   st = lpcnet_encoder_create();
   if (argc == 5 && strcmp(argv[1], "-train")==0) training = 1;
   if (argc == 5 && strcmp(argv[1], "-qtrain")==0) {
@@ -242,32 +302,107 @@ int main(int argc, char **argv) {
     }
     biquad(x, mem_hp_x, x, b_hp, a_hp, FRAME_SIZE);
     biquad(x, mem_resp_x, x, b_sig, a_sig, FRAME_SIZE);
-    preemphasis(x, &mem_preemph, x, PREEMPHASIS, FRAME_SIZE);
     for (i=0;i<FRAME_SIZE;i++) {
       float g;
       float f = (float)i/FRAME_SIZE;
       g = f*speech_gain + (1-f)*old_speech_gain;
       x[i] *= g;
     }
+    for (i=0;i<FRAME_SIZE;i++)
+        xbuf[st->pcount*FRAME_SIZE + i] = (1.f/32768.f)*x[i];
+    //preemphasis(x, &mem_preemph, x, PREEMPHASIS, FRAME_SIZE);
     for (i=0;i<FRAME_SIZE;i++) x[i] += rand()/(float)RAND_MAX - .5;
     /* PCM is delayed by 1/2 frame to make the features centered on the frames. */
-    for (i=0;i<FRAME_SIZE-TRAINING_OFFSET;i++) pcm[i+TRAINING_OFFSET] = float2short(x[i]);
-    compute_frame_features(st, x);
+    for (i=0;i<FRAME_SIZE-delay;i++) pcm[i+delay] = float2short(x[i]);
+    //compute_frame_features(st, x);
 
     RNN_COPY(&pcmbuf[st->pcount*FRAME_SIZE], pcm, FRAME_SIZE);
+    if (st->pcount == 1 || st->pcount == 3) {
+        unsigned char bytes[100];
+        float pcm_dec[320];
+        float data[4][19];
+        float bandE[4][NB_BANDS];
+        int nb_bytes;
+        int nb_samples;
+        int pick;
+        static float mem_preemph2 = 0;
+        nb_bytes = opus_encode_float(enc, &xbuf[(st->pcount-1)*FRAME_SIZE], 320, bytes, 100);
+        nb_samples = opus_decode_float(dec, bytes, nb_bytes, pcm_dec, 320, 0);
+        preemphasis(pcm_dec, &mem_preemph2, pcm_dec, PREEMPHASIS, 2*FRAME_SIZE);
+        if (nb_samples != 320) break;
+        for (i=0;i<320;i++) pcm_dec[i] *= 32768;
+        st->pcount--;
+        compute_frame_features(st, pcm_dec);
+        st->pcount++;
+        compute_frame_features(st, pcm_dec+160);
+        get_fdump(data);
+#if 1
+        for (i=0;i<4;i++) compute_band_energy_from_lpc(bandE[i], data[i][18], data[i]);
+        for (i=0;i<NB_BANDS;i++) bandE[0][i] = log10(1e-2+bandE[0][i]+bandE[1][i]);
+        for (i=0;i<NB_BANDS;i++) bandE[2][i] = log10(1e-2+bandE[2][i]+bandE[3][i]);
+        dct(&st->features[st->pcount-1][NB_BANDS], bandE[0]);
+        dct(&st->features[st->pcount][NB_BANDS]  , bandE[2]);
+        st->features[st->pcount-1][NB_BANDS] -= 4;
+        st->features[st->pcount][NB_BANDS] -= 4;
+#endif
+        pick = data[0][17] > data[1][17] ? 0 : 1;
+        st->features[st->pcount-1][36] = .02*(data[pick][16] - 100);
+        st->features[st->pcount-1][37] = data[pick][17] - .5;
+        pick = data[2][17] > data[3][17] ? 2 : 3;
+        st->features[st->pcount][36] = .02*(data[pick][16] - 100);
+        st->features[st->pcount][37] = data[pick][17] - .5;
+
+        for (i=0;i<16;i++) st->features[st->pcount-1][39+i] = -data[0][i];
+        for (i=0;i<16;i++) st->features[st->pcount][39+i] = -data[2][i];
+
+        //lpc_from_cepstrum(&st->features[st->pcount-1][2*NB_BANDS+3], st->features[st->pcount-1]);
+        //lpc_from_cepstrum(&st->features[st->pcount][2*NB_BANDS+3], st->features[st->pcount]);
+        //for (i=0;i<55;i++) printf("%f ", st->features[st->pcount-1][i]);
+        //for (i=0;i<55;i++) printf("%f ", st->features[st->pcount][i]);
+        //printf("\n");
+        //printf("%f %f %f %f %f\n", st->features[st->pcount-1][37], data[1][16], data[3][16], 100+50*st->features[st->pcount-1][36], 100+50*st->features[st->pcount][36]);
+    }
     if (fpcm) {
         compute_noise(&noisebuf[st->pcount*FRAME_SIZE], noise_std);
     }
     st->pcount++;
     /* Running on groups of 4 frames. */
     if (st->pcount == 4) {
+#if 0
       unsigned char buf[8];
       process_superframe(st, buf, ffeat, encode, quantize);
-      if (fpcm) write_audio(st, pcmbuf, noisebuf, fpcm);
+#else
+      float ftemp[55];
+      static float fmem[55] = {0};
+      static float last_pitch = 0;
+      for (i=3;i>=0;i--) {
+        if (st->features[i][36] > -1.99) last_pitch = st->features[i][36];
+        else st->features[i][36] = last_pitch;
+      }
+      last_pitch = st->features[3][36];
+#if 0
+      RNN_COPY(ftemp, &st->features[3][0], 55);
+      for (i=3;i>=1;i--) {
+          RNN_COPY(&st->features[i][NB_BANDS], &st->features[i-1][NB_BANDS], NB_BANDS+2);
+      }
+      RNN_COPY(&st->features[0][NB_BANDS], &fmem[NB_BANDS], NB_BANDS+2);
+      RNN_COPY(fmem, ftemp, 55);
+#endif
+      for (i=0;i<4;i++) {
+          int j;
+          for (j=0;j<NB_BANDS;j++) st->features[i][NB_BANDS+j] -= st->features[i][j];
+      }
+      if (ffeat) {
+        for (i=0;i<4;i++) {
+          fwrite(st->features[i], sizeof(float), 38, ffeat);
+        }
+    }
+#endif
+    if (fpcm) write_audio(st, pcmbuf, noisebuf, fpcm);
       st->pcount = 0;
     }
     //if (fpcm) fwrite(pcm, sizeof(short), FRAME_SIZE, fpcm);
-    for (i=0;i<TRAINING_OFFSET;i++) pcm[i] = float2short(x[i+FRAME_SIZE-TRAINING_OFFSET]);
+    for (i=0;i<delay;i++) pcm[i] = float2short(x[i+FRAME_SIZE-delay]);
     old_speech_gain = speech_gain;
     count++;
   }

diff --git a/src/freq.c b/src/freq.c
@@ -40,9 +40,7 @@
 #include "celt_lpc.h"
 #include <assert.h>
 
-#define SQUARE(x) ((x)*(x))
-
-static const opus_int16 eband5ms[] = {
+const opus_int16 eband5ms[] = {
 /*0  200 400 600 800  1k 1.2 1.4 1.6  2k 2.4 2.8 3.2  4k 4.8 5.6 6.8  8k*/
   0,  1,  2,  3,  4,  5,  6,  7,  8, 10, 12, 14, 16, 20, 24, 28, 34, 40
 };

diff --git a/src/freq.h b/src/freq.h
@@ -43,6 +43,10 @@
 #define NB_BANDS 18
 #define NB_BANDS_1 (NB_BANDS - 1)
 
+#define SQUARE(x) ((x)*(x))
+
+extern const opus_int16 eband5ms[];
+
 void compute_band_energy(float *bandE, const kiss_fft_cpx *X);
 void compute_band_corr(float *bandE, const kiss_fft_cpx *X, const kiss_fft_cpx *P);
 

diff --git a/src/lpcnet.c b/src/lpcnet.c
@@ -129,14 +129,17 @@ LPCNET_EXPORT void lpcnet_synthesize(LPCNetState *lpcnet, const float *features,
     int pitch;
     float pitch_gain;
     /* Matches the Python code -- the 0.1 avoids rounding issues. */
-    pitch = (int)floor(.1 + 50*features[36]+100);
+    pitch = IMIN(255, (int)floor(.1 + 50*features[36]+100));
     pitch_gain = lpcnet->old_gain[FEATURES_DELAY-1];
     memmove(&lpcnet->old_gain[1], &lpcnet->old_gain[0], (FEATURES_DELAY-1)*sizeof(lpcnet->old_gain[0]));
     lpcnet->old_gain[0] = features[PITCH_GAIN_FEATURE];
     run_frame_network(lpcnet, condition, gru_a_condition, features, pitch);
     memcpy(lpc, lpcnet->old_lpc[FEATURES_DELAY-1], LPC_ORDER*sizeof(lpc[0]));
     memmove(lpcnet->old_lpc[1], lpcnet->old_lpc[0], (FEATURES_DELAY-1)*LPC_ORDER*sizeof(lpc[0]));
     lpc_from_cepstrum(lpcnet->old_lpc[0], features);
+    //for (i=0;i<16;i++) printf("%f ", lpcnet->old_lpc[0][i]);
+    //printf("\n");
+
     if (lpcnet->frame_count <= FEATURES_DELAY)
     {
         RNN_CLEAR(output, N);

diff --git a/src/lpcnet_demo.c b/src/lpcnet_demo.c
@@ -109,13 +109,15 @@ int main(int argc, char **argv) {
         LPCNetState *net;
         net = lpcnet_create();
         while (1) {
+            int i;
             float in_features[NB_TOTAL_FEATURES];
             float features[NB_FEATURES];
             short pcm[LPCNET_FRAME_SIZE];
             fread(in_features, sizeof(features[0]), NB_TOTAL_FEATURES, fin);
             if (feof(fin)) break;
             RNN_COPY(features, in_features, NB_FEATURES);
-            RNN_CLEAR(&features[18], 18);
+            //for (i=0;i<16;i++) printf("%f ", in_features[NB_TOTAL_FEATURES-16+i]);
+            //RNN_CLEAR(&features[18], 18);
             lpcnet_synthesize(net, features, pcm, LPCNET_FRAME_SIZE);
             fwrite(pcm, sizeof(pcm[0]), LPCNET_FRAME_SIZE, fout);
         }

diff --git a/src/train_lpcnet.py b/src/train_lpcnet.py
@@ -87,14 +87,15 @@
 
 features = np.reshape(features, (nb_frames, feature_chunk_size, nb_features))
 features = features[:, :, :nb_used_features]
-features[:,:,18:36] = 0
+#features[:,:,18:36] = 0
 
 fpad1 = np.concatenate([features[0:1, 0:2, :], features[:-1, -2:, :]], axis=0)
 fpad2 = np.concatenate([features[1:, :2, :], features[0:1, -2:, :]], axis=0)
 features = np.concatenate([fpad1, features, fpad2], axis=1)
 
 
 periods = (.1 + 50*features[:,:,36:37]+100).astype('int16')
+periods = np.minimum(periods, 255);
 
 in_data = np.concatenate([sig, pred, in_exc], axis=-1)
 
@@ -103,7 +104,7 @@
 del in_exc
 
 # dump models to disk as we go
-checkpoint = ModelCheckpoint('lpcnet24g_384_10_G16_{epoch:02d}.h5')
+checkpoint = ModelCheckpoint('lpcnet27b_384_10_G16_{epoch:02d}.h5')
 
 #Set this to True to adapt an existing model (e.g. on new data)
 adaptation = False
@@ -121,4 +122,5 @@
     decay = 5e-5
 
 model.compile(optimizer=Adam(lr, amsgrad=True, decay=decay), loss='sparse_categorical_crossentropy')
+model.save_weights('lpcnet27b_384_10_G16_00.h5');
 model.fit([in_data, features, periods], out_exc, batch_size=batch_size, epochs=nb_epochs, validation_split=0.0, callbacks=[checkpoint, sparsify])