Skip to content

Parameter 37 - correlation #121

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 10 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
147 changes: 141 additions & 6 deletions src/dump_data.c
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,52 @@
#include <assert.h>
#include "lpcnet.h"
#include "lpcnet_private.h"
#include "opus.h"

/* Table of NB_BANDS constants, one entry per band.
 * NOTE(review): judging by the name these look like per-band offsets related
 * to pre-emphasis, but nothing in the visible part of this file reads the
 * table -- confirm it is still needed and where the values come from. */
float preemph_offset[NB_BANDS] = {1.772676, 2.937053, 0.278042, 0.299267, 0.126341, 0.060082, 0.019509, -0.017281, 0.000530, -0.000156, -0.007375, -0.010533, -0.002903, -0.005244, -0.003251, -0.000492, -0.000174, -0.004998};

/* Derive NB_BANDS band energies from a set of LPC coefficients.
 *
 * The sequence {1, -lpc[0], ..., -lpc[LPC_ORDER-1]} is placed in an otherwise
 * cleared window and run through forward_transform(). For every frequency bin
 * the inverse of its squared magnitude, 1/(|X|^2 + 1e-9), is distributed over
 * the two neighbouring bands with linearly interpolated weights. The first and
 * last bands are then doubled and every band is scaled by
 * .2*g*g/WINDOW_SIZE^4.
 *
 *   bandE  output array, NB_BANDS entries are written
 *   g      scalar multiplier; it enters the result squared
 *   lpc    input coefficients, LPC_ORDER entries are read
 */
void compute_band_energy_from_lpc(float *bandE, float g, const float *lpc) {
  int band;
  int k;
  float acc[NB_BANDS] = {0};
  float impulse[WINDOW_SIZE];
  kiss_fft_cpx spectrum[FREQ_SIZE];
  /* Loop-invariant output scale; kept as the original mixed float/double
     expression so the rounding of the final product is unchanged. */
  const double scale = .2*g*g*(1.f/((float)WINDOW_SIZE*WINDOW_SIZE*WINDOW_SIZE*WINDOW_SIZE));

  /* Build the coefficient sequence and transform it. */
  RNN_CLEAR(impulse, WINDOW_SIZE);
  impulse[0] = 1;
  for (k = 0; k < LPC_ORDER; k++) {
    impulse[k+1] = -lpc[k];
  }
  forward_transform(spectrum, impulse);

  /* Walk the bins between consecutive band edges, splitting each bin's
     contribution between band and band+1 according to its position. */
  for (band = 0; band < NB_BANDS-1; band++) {
    const int first_bin = eband5ms[band]*WINDOW_SIZE_5MS;
    const int width = (eband5ms[band+1]-eband5ms[band])*WINDOW_SIZE_5MS;
    int bin;
    for (bin = 0; bin < width; bin++) {
      const float frac = (float)bin/width;
      float inv_power;
      inv_power = SQUARE(spectrum[first_bin + bin].r);
      inv_power += SQUARE(spectrum[first_bin + bin].i);
      /* The small bias keeps the division finite when the bin is empty. */
      inv_power = 1.f/(inv_power + 1e-9);
      acc[band] += (1-frac)*inv_power;
      acc[band+1] += frac*inv_power;
    }
  }

  /* The outermost bands only collect weight from one side, so double them. */
  acc[0] *= 2;
  acc[NB_BANDS-1] *= 2;

  for (band = 0; band < NB_BANDS; band++) {
    bandE[band] = acc[band];
    bandE[band] *= scale;
  }
}


static void biquad(float *y, float mem[2], const float *x, const float *b, const float *a, int N) {
Expand Down Expand Up @@ -76,13 +122,16 @@ void compute_noise(int *noise, float noise_std) {

void write_audio(LPCNetEncState *st, const short *pcm, const int *noise, FILE *file) {
int i, k;
fwrite(pcm, 4*FRAME_SIZE, 2, file);
return;
for (k=0;k<4;k++) {
unsigned char data[4*FRAME_SIZE];
for (i=0;i<FRAME_SIZE;i++) {
float p=0;
float e;
int j;
for (j=0;j<LPC_ORDER;j++) p -= st->features[k][2*NB_BANDS+3+j]*st->sig_mem[j];
//printf("%f\n", pcm[k*FRAME_SIZE+i] - p);
e = lin2ulaw(pcm[k*FRAME_SIZE+i] - p);
/* Signal. */
data[4*i] = lin2ulaw(st->sig_mem[0]);
Expand All @@ -100,7 +149,7 @@ void write_audio(LPCNetEncState *st, const short *pcm, const int *noise, FILE *f
st->sig_mem[0] = p + ulaw2lin(e);
st->exc_mem = e;
}
fwrite(data, 4*FRAME_SIZE, 1, file);
//fwrite(data, 4*FRAME_SIZE, 1, file);
}
}

Expand Down Expand Up @@ -128,6 +177,7 @@ int main(int argc, char **argv) {
FILE *fpcm=NULL;
short pcm[FRAME_SIZE]={0};
short pcmbuf[FRAME_SIZE*4]={0};
float xbuf[FRAME_SIZE*4]={0};
int noisebuf[FRAME_SIZE*4]={0};
short tmp[FRAME_SIZE] = {0};
float savedX[FRAME_SIZE] = {0};
Expand All @@ -140,7 +190,17 @@ int main(int argc, char **argv) {
int training = -1;
int encode = 0;
int decode = 0;
int delay = TRAINING_OFFSET;
int quantize = 0;
OpusEncoder *enc;
OpusDecoder *dec;
enc = opus_encoder_create(16000, 1, OPUS_APPLICATION_VOIP, NULL);
opus_encoder_ctl(enc, OPUS_SET_BITRATE(6000));
opus_encoder_ctl(enc, OPUS_SET_BANDWIDTH(OPUS_BANDWIDTH_WIDEBAND));
opus_encoder_ctl(enc, OPUS_GET_LOOKAHEAD(&delay));
delay = 160;
fprintf(stderr, "delay is %d\n", delay);
dec = opus_decoder_create(16000, 1, NULL);
st = lpcnet_encoder_create();
if (argc == 5 && strcmp(argv[1], "-train")==0) training = 1;
if (argc == 5 && strcmp(argv[1], "-qtrain")==0) {
Expand Down Expand Up @@ -242,32 +302,107 @@ int main(int argc, char **argv) {
}
biquad(x, mem_hp_x, x, b_hp, a_hp, FRAME_SIZE);
biquad(x, mem_resp_x, x, b_sig, a_sig, FRAME_SIZE);
preemphasis(x, &mem_preemph, x, PREEMPHASIS, FRAME_SIZE);
for (i=0;i<FRAME_SIZE;i++) {
float g;
float f = (float)i/FRAME_SIZE;
g = f*speech_gain + (1-f)*old_speech_gain;
x[i] *= g;
}
for (i=0;i<FRAME_SIZE;i++)
xbuf[st->pcount*FRAME_SIZE + i] = (1.f/32768.f)*x[i];
//preemphasis(x, &mem_preemph, x, PREEMPHASIS, FRAME_SIZE);
for (i=0;i<FRAME_SIZE;i++) x[i] += rand()/(float)RAND_MAX - .5;
/* PCM is delayed by 1/2 frame to make the features centered on the frames. */
for (i=0;i<FRAME_SIZE-TRAINING_OFFSET;i++) pcm[i+TRAINING_OFFSET] = float2short(x[i]);
compute_frame_features(st, x);
for (i=0;i<FRAME_SIZE-delay;i++) pcm[i+delay] = float2short(x[i]);
//compute_frame_features(st, x);

RNN_COPY(&pcmbuf[st->pcount*FRAME_SIZE], pcm, FRAME_SIZE);
if (st->pcount == 1 || st->pcount == 3) {
unsigned char bytes[100];
float pcm_dec[320];
float data[4][19];
float bandE[4][NB_BANDS];
int nb_bytes;
int nb_samples;
int pick;
static float mem_preemph2 = 0;
nb_bytes = opus_encode_float(enc, &xbuf[(st->pcount-1)*FRAME_SIZE], 320, bytes, 100);
nb_samples = opus_decode_float(dec, bytes, nb_bytes, pcm_dec, 320, 0);
preemphasis(pcm_dec, &mem_preemph2, pcm_dec, PREEMPHASIS, 2*FRAME_SIZE);
if (nb_samples != 320) break;
for (i=0;i<320;i++) pcm_dec[i] *= 32768;
st->pcount--;
compute_frame_features(st, pcm_dec);
st->pcount++;
compute_frame_features(st, pcm_dec+160);
get_fdump(data);
#if 1
for (i=0;i<4;i++) compute_band_energy_from_lpc(bandE[i], data[i][18], data[i]);
for (i=0;i<NB_BANDS;i++) bandE[0][i] = log10(1e-2+bandE[0][i]+bandE[1][i]);
for (i=0;i<NB_BANDS;i++) bandE[2][i] = log10(1e-2+bandE[2][i]+bandE[3][i]);
dct(&st->features[st->pcount-1][NB_BANDS], bandE[0]);
dct(&st->features[st->pcount][NB_BANDS] , bandE[2]);
st->features[st->pcount-1][NB_BANDS] -= 4;
st->features[st->pcount][NB_BANDS] -= 4;
#endif
pick = data[0][17] > data[1][17] ? 0 : 1;
st->features[st->pcount-1][36] = .02*(data[pick][16] - 100);
st->features[st->pcount-1][37] = data[pick][17] - .5;
pick = data[2][17] > data[3][17] ? 2 : 3;
st->features[st->pcount][36] = .02*(data[pick][16] - 100);
st->features[st->pcount][37] = data[pick][17] - .5;

for (i=0;i<16;i++) st->features[st->pcount-1][39+i] = -data[0][i];
for (i=0;i<16;i++) st->features[st->pcount][39+i] = -data[2][i];

//lpc_from_cepstrum(&st->features[st->pcount-1][2*NB_BANDS+3], st->features[st->pcount-1]);
//lpc_from_cepstrum(&st->features[st->pcount][2*NB_BANDS+3], st->features[st->pcount]);
//for (i=0;i<55;i++) printf("%f ", st->features[st->pcount-1][i]);
//for (i=0;i<55;i++) printf("%f ", st->features[st->pcount][i]);
//printf("\n");
//printf("%f %f %f %f %f\n", st->features[st->pcount-1][37], data[1][16], data[3][16], 100+50*st->features[st->pcount-1][36], 100+50*st->features[st->pcount][36]);
}
if (fpcm) {
compute_noise(&noisebuf[st->pcount*FRAME_SIZE], noise_std);
}
st->pcount++;
/* Running on groups of 4 frames. */
if (st->pcount == 4) {
#if 0
unsigned char buf[8];
process_superframe(st, buf, ffeat, encode, quantize);
if (fpcm) write_audio(st, pcmbuf, noisebuf, fpcm);
#else
float ftemp[55];
static float fmem[55] = {0};
static float last_pitch = 0;
for (i=3;i>=0;i--) {
if (st->features[i][36] > -1.99) last_pitch = st->features[i][36];
else st->features[i][36] = last_pitch;
}
last_pitch = st->features[3][36];
#if 0
RNN_COPY(ftemp, &st->features[3][0], 55);
for (i=3;i>=1;i--) {
RNN_COPY(&st->features[i][NB_BANDS], &st->features[i-1][NB_BANDS], NB_BANDS+2);
}
RNN_COPY(&st->features[0][NB_BANDS], &fmem[NB_BANDS], NB_BANDS+2);
RNN_COPY(fmem, ftemp, 55);
#endif
for (i=0;i<4;i++) {
int j;
for (j=0;j<NB_BANDS;j++) st->features[i][NB_BANDS+j] -= st->features[i][j];
}
if (ffeat) {
for (i=0;i<4;i++) {
fwrite(st->features[i], sizeof(float), 38, ffeat);
}
}
#endif
if (fpcm) write_audio(st, pcmbuf, noisebuf, fpcm);
st->pcount = 0;
}
//if (fpcm) fwrite(pcm, sizeof(short), FRAME_SIZE, fpcm);
for (i=0;i<TRAINING_OFFSET;i++) pcm[i] = float2short(x[i+FRAME_SIZE-TRAINING_OFFSET]);
for (i=0;i<delay;i++) pcm[i] = float2short(x[i+FRAME_SIZE-delay]);
old_speech_gain = speech_gain;
count++;
}
Expand Down
4 changes: 1 addition & 3 deletions src/freq.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,7 @@
#include "celt_lpc.h"
#include <assert.h>

#define SQUARE(x) ((x)*(x))

static const opus_int16 eband5ms[] = {
const opus_int16 eband5ms[] = {
/*0 200 400 600 800 1k 1.2 1.4 1.6 2k 2.4 2.8 3.2 4k 4.8 5.6 6.8 8k*/
0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 28, 34, 40
};
Expand Down
4 changes: 4 additions & 0 deletions src/freq.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,10 @@
#define NB_BANDS 18
#define NB_BANDS_1 (NB_BANDS - 1)

#define SQUARE(x) ((x)*(x))

extern const opus_int16 eband5ms[];

void compute_band_energy(float *bandE, const kiss_fft_cpx *X);
void compute_band_corr(float *bandE, const kiss_fft_cpx *X, const kiss_fft_cpx *P);

Expand Down
5 changes: 4 additions & 1 deletion src/lpcnet.c
Original file line number Diff line number Diff line change
Expand Up @@ -129,14 +129,17 @@ LPCNET_EXPORT void lpcnet_synthesize(LPCNetState *lpcnet, const float *features,
int pitch;
float pitch_gain;
/* Matches the Python code -- the 0.1 avoids rounding issues. */
pitch = (int)floor(.1 + 50*features[36]+100);
pitch = IMIN(255, (int)floor(.1 + 50*features[36]+100));
pitch_gain = lpcnet->old_gain[FEATURES_DELAY-1];
memmove(&lpcnet->old_gain[1], &lpcnet->old_gain[0], (FEATURES_DELAY-1)*sizeof(lpcnet->old_gain[0]));
lpcnet->old_gain[0] = features[PITCH_GAIN_FEATURE];
run_frame_network(lpcnet, condition, gru_a_condition, features, pitch);
memcpy(lpc, lpcnet->old_lpc[FEATURES_DELAY-1], LPC_ORDER*sizeof(lpc[0]));
memmove(lpcnet->old_lpc[1], lpcnet->old_lpc[0], (FEATURES_DELAY-1)*LPC_ORDER*sizeof(lpc[0]));
lpc_from_cepstrum(lpcnet->old_lpc[0], features);
//for (i=0;i<16;i++) printf("%f ", lpcnet->old_lpc[0][i]);
//printf("\n");

if (lpcnet->frame_count <= FEATURES_DELAY)
{
RNN_CLEAR(output, N);
Expand Down
4 changes: 3 additions & 1 deletion src/lpcnet_demo.c
Original file line number Diff line number Diff line change
Expand Up @@ -109,13 +109,15 @@ int main(int argc, char **argv) {
LPCNetState *net;
net = lpcnet_create();
while (1) {
int i;
float in_features[NB_TOTAL_FEATURES];
float features[NB_FEATURES];
short pcm[LPCNET_FRAME_SIZE];
fread(in_features, sizeof(features[0]), NB_TOTAL_FEATURES, fin);
if (feof(fin)) break;
RNN_COPY(features, in_features, NB_FEATURES);
RNN_CLEAR(&features[18], 18);
//for (i=0;i<16;i++) printf("%f ", in_features[NB_TOTAL_FEATURES-16+i]);
//RNN_CLEAR(&features[18], 18);
lpcnet_synthesize(net, features, pcm, LPCNET_FRAME_SIZE);
fwrite(pcm, sizeof(pcm[0]), LPCNET_FRAME_SIZE, fout);
}
Expand Down
6 changes: 4 additions & 2 deletions src/train_lpcnet.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,14 +87,15 @@

features = np.reshape(features, (nb_frames, feature_chunk_size, nb_features))
features = features[:, :, :nb_used_features]
features[:,:,18:36] = 0
#features[:,:,18:36] = 0

fpad1 = np.concatenate([features[0:1, 0:2, :], features[:-1, -2:, :]], axis=0)
fpad2 = np.concatenate([features[1:, :2, :], features[0:1, -2:, :]], axis=0)
features = np.concatenate([fpad1, features, fpad2], axis=1)


periods = (.1 + 50*features[:,:,36:37]+100).astype('int16')
periods = np.minimum(periods, 255);

in_data = np.concatenate([sig, pred, in_exc], axis=-1)

Expand All @@ -103,7 +104,7 @@
del in_exc

# dump models to disk as we go
checkpoint = ModelCheckpoint('lpcnet24g_384_10_G16_{epoch:02d}.h5')
checkpoint = ModelCheckpoint('lpcnet27b_384_10_G16_{epoch:02d}.h5')

#Set this to True to adapt an existing model (e.g. on new data)
adaptation = False
Expand All @@ -121,4 +122,5 @@
decay = 5e-5

model.compile(optimizer=Adam(lr, amsgrad=True, decay=decay), loss='sparse_categorical_crossentropy')
model.save_weights('lpcnet27b_384_10_G16_00.h5');
model.fit([in_data, features, periods], out_exc, batch_size=batch_size, epochs=nb_epochs, validation_split=0.0, callbacks=[checkpoint, sparsify])