From a1a1355e94570b6d0e244543348703ad40272b06 Mon Sep 17 00:00:00 2001
From: joseph calderon
Date: Fri, 14 Aug 2020 10:19:36 -0700
Subject: [PATCH 1/6] free memory still allocated in common struct when finished

---
 src/denoise.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/denoise.c b/src/denoise.c
index d1c21dc0..500eeb5c 100644
--- a/src/denoise.c
+++ b/src/denoise.c
@@ -277,6 +277,7 @@ DenoiseState *rnnoise_create(RNNModel *model) {
 }
 
 void rnnoise_destroy(DenoiseState *st) {
+  if (common.init) opus_fft_free(common.kfft, 0);
   free(st->rnn.vad_gru_state);
   free(st->rnn.noise_gru_state);
   free(st->rnn.denoise_gru_state);

From 73a482b240992cb7406b9258766f56fb7c1e0bac Mon Sep 17 00:00:00 2001
From: joseph calderon
Date: Fri, 14 Aug 2020 10:30:02 -0700
Subject: [PATCH 2/6] fix static analyzer warnings found by clang analyzer

---
 src/celt_lpc.c | 2 +-
 src/kiss_fft.c | 2 +-
 src/rnn.c      | 2 ++
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/celt_lpc.c b/src/celt_lpc.c
index 521351e9..f86a050c 100644
--- a/src/celt_lpc.c
+++ b/src/celt_lpc.c
@@ -204,7 +204,7 @@ int _celt_autocorr(
                    int          n)
 {
    opus_val32 d;
-   int i, k;
+   int i = 0, k = 0;
    int fastN=n-lag;
    int shift;
    const opus_val16 *xptr;
diff --git a/src/kiss_fft.c b/src/kiss_fft.c
index 922dacc6..8971aa4a 100644
--- a/src/kiss_fft.c
+++ b/src/kiss_fft.c
@@ -447,7 +447,7 @@ kiss_fft_state *opus_fft_alloc_twiddles(int nfft,void * mem,size_t * lenmem,
    if (st) {
       opus_int16 *bitrev;
       kiss_twiddle_cpx *twiddles;
-
+      memset(st, 0, sizeof(kiss_fft_state));
       st->nfft=nfft;
 #ifdef FIXED_POINT
       st->scale_shift = celt_ilog2(st->nfft);
diff --git a/src/rnn.c b/src/rnn.c
index c54958eb..3509e4ae 100644
--- a/src/rnn.c
+++ b/src/rnn.c
@@ -162,6 +162,8 @@ void compute_rnn(RNNState *rnn, float *gains, float *vad, const float *input) {
   float dense_out[MAX_NEURONS];
   float noise_input[MAX_NEURONS*3];
   float denoise_input[MAX_NEURONS*3];
+
+  memset(dense_out, 0, sizeof(dense_out));
   compute_dense(rnn->model->input_dense, dense_out, input);
   compute_gru(rnn->model->vad_gru, rnn->vad_gru_state, dense_out);
   compute_dense(rnn->model->vad_output, vad, rnn->vad_gru_state);
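
PATCH 1/6 matters because the shared `common` struct holds a kiss_fft state (`common.kfft`) that rnnoise allocates on first use and, before this fix, never released. A minimal caller that exercises the create/process/destroy cycle where the leak shows up looks roughly like the sketch below; it assumes the public rnnoise.h API (rnnoise_create, rnnoise_process_frame, rnnoise_destroy) and the library's 480-sample float frames, and is not part of the patch series:

#include <stdio.h>
#include "rnnoise.h"

#define FRAME_SIZE 480   /* rnnoise works on 480-sample (10 ms at 48 kHz) float frames */

int main(void)
{
   float frame[FRAME_SIZE] = {0};
   DenoiseState *st = rnnoise_create(NULL);               /* NULL selects the built-in model */
   if (!st) return 1;
   float vad = rnnoise_process_frame(st, frame, frame);   /* denoise in place, returns VAD probability */
   printf("vad = %f\n", vad);
   rnnoise_destroy(st);                                   /* with PATCH 1/6 this also frees common.kfft */
   return 0;
}
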
From 2a55474765a1598e714da226ffd530536491c89d Mon Sep 17 00:00:00 2001
From: joseph calderon
Date: Fri, 14 Aug 2020 10:35:01 -0700
Subject: [PATCH 3/6] define M_PI if it's not defined, e.g. when compiling
 with -std=c99

---
 src/denoise.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/denoise.c b/src/denoise.c
index 500eeb5c..c777eb97 100644
--- a/src/denoise.c
+++ b/src/denoise.c
@@ -29,6 +29,10 @@
 #include "config.h"
 #endif
 
+#ifndef M_PI
+#define M_PI (3.14159265358979323846)
+#endif
+
 #include <stdlib.h>
 #include <string.h>
 #include <stdio.h>

From de03f80be3316efbe2db7b796777050d8db28f9a Mon Sep 17 00:00:00 2001
From: joseph calderon
Date: Fri, 14 Aug 2020 10:35:23 -0700
Subject: [PATCH 4/6] fix cosmetic typo in rnn.h

---
 src/rnn.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/rnn.h b/src/rnn.h
index 10329f55..b59cc594 100644
--- a/src/rnn.h
+++ b/src/rnn.h
@@ -66,4 +66,4 @@ void compute_gru(const GRULayer *gru, float *state, const float *input);
 
 void compute_rnn(RNNState *rnn, float *gains, float *vad, const float *input);
 
-#endif /* _MLP_H_ */
+#endif /* _RNN_H_ */

From 4a348476eb24fd2e376003e6729f7b9979a8e8bf Mon Sep 17 00:00:00 2001
From: joseph calderon
Date: Fri, 14 Aug 2020 10:47:58 -0700
Subject: [PATCH 5/6] remove custom asserts and default to ACTIVATION_RELU

---
 src/rnn.c | 31 ++++++++++++++++++-------------
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/src/rnn.c b/src/rnn.c
index 3509e4ae..92f00144 100644
--- a/src/rnn.c
+++ b/src/rnn.c
@@ -92,17 +92,20 @@ void compute_dense(const DenseLayer *layer, float *output, const float *input)
          sum += layer->input_weights[j*stride + i]*input[j];
       output[i] = WEIGHTS_SCALE*sum;
    }
-   if (layer->activation == ACTIVATION_SIGMOID) {
+   switch (layer->activation) {
+   case ACTIVATION_SIGMOID:
       for (i=0;i<N;i++)
         output[i] = sigmoid_approx(output[i]);
-   } else if (layer->activation == ACTIVATION_TANH) {
+      break;
+   case ACTIVATION_TANH:
       for (i=0;i<N;i++)
         output[i] = tansig_approx(output[i]);
-   } else if (layer->activation == ACTIVATION_RELU) {
+      break;
+   default:
+   case ACTIVATION_RELU:
       for (i=0;i<N;i++)
         output[i] = relu(output[i]);
-   } else {
-     *(int*)0=0;
+      break;
    }
 }
 
@@ ... @@ void compute_gru(const GRULayer *gru, float *state, const float *input)
          sum += gru->input_weights[2*N + j*stride + i]*input[j];
       for (j=0;j<N;j++)
          sum += gru->recurrent_weights[2*N + j*stride + i]*state[j]*r[j];
-      if (gru->activation == ACTIVATION_SIGMOID) sum = sigmoid_approx(WEIGHTS_SCALE*sum);
-      else if (gru->activation == ACTIVATION_TANH) sum = tansig_approx(WEIGHTS_SCALE*sum);
-      else if (gru->activation == ACTIVATION_RELU) sum = relu(WEIGHTS_SCALE*sum);
-      else *(int*)0=0;
-      h[i] = z[i]*state[i] + (1-z[i])*sum;
-   }
-   for (i=0;i<N;i++)
-      state[i] = h[i];
+      switch (gru->activation) {
+      case ACTIVATION_SIGMOID: sum = sigmoid_approx(WEIGHTS_SCALE*sum);break;
+      case ACTIVATION_TANH: sum = tansig_approx(WEIGHTS_SCALE*sum); break;
+      default:
+      case ACTIVATION_RELU: sum = relu(WEIGHTS_SCALE*sum); break;
+      }
+      h[i] = z[i]*state[i] + (1-z[i])*sum;
+   }
+   for (i=0;i<N;i++)
+      state[i] = h[i];
 }
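
The point of PATCH 5/6 is that the old fallback for an unknown activation id was `*(int*)0=0;`, a deliberate null write (undefined behavior) used as a poor man's assert; the switch form lets `default:` share the ACTIVATION_RELU branch, so an unrecognized id degrades to ReLU instead of crashing. A stand-alone sketch of that pattern follows; the enum and function names are illustrative, not taken from rnn.c:

#include <math.h>
#include <stdio.h>

enum activation { ACT_SIGMOID, ACT_TANH, ACT_RELU };

static float apply_activation(int activation, float x)
{
   switch (activation) {
   case ACT_SIGMOID: return 0.5f + 0.5f*tanhf(0.5f*x);   /* logistic function written via tanh */
   case ACT_TANH:    return tanhf(x);
   default:                                              /* unknown ids fall through to ReLU */
   case ACT_RELU:    return x < 0 ? 0 : x;
   }
}

int main(void)
{
   /* 99 is not a valid activation id; with the default label it behaves like ReLU instead of crashing. */
   printf("%f %f\n", apply_activation(ACT_TANH, -1.0f), apply_activation(99, -1.0f));
   return 0;
}
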
From 3b2ba620f28b5a3d7b46290bd1a3b5e75171a00d Mon Sep 17 00:00:00 2001
From: joseph calderon
Date: Sat, 15 Aug 2020 17:48:31 -0700
Subject: [PATCH 6/6] improve performance by using blas-like ops (e.g.
 faxpy/fma) for gru hadamard product

---
 src/rnn.c | 106 +++++++++++++++++++++++++++++-------------------
 1 file changed, 57 insertions(+), 49 deletions(-)

diff --git a/src/rnn.c b/src/rnn.c
index 92f00144..721c3650 100644
--- a/src/rnn.c
+++ b/src/rnn.c
@@ -76,35 +76,38 @@ static OPUS_INLINE float relu(float x)
    return x < 0 ? 0 : x;
 }
 
+static void faxpy(float *restrict a, const rnn_weight *restrict b, int k, float u)
+{
+   if (u == 0.0) return;
+   for (int idx = 0; idx < k; idx++)
+      a[idx] += b[idx] * u;
+}
+
 void compute_dense(const DenseLayer *layer, float *output, const float *input)
 {
    int i, j;
    int N, M;
-   int stride;
    M = layer->nb_inputs;
    N = layer->nb_neurons;
-   stride = N;
-   for (i=0;i<N;i++)
-   {
-      /* Compute update gate. */
-      float sum = layer->bias[i];
-      for (j=0;j<M;j++)
-         sum += layer->input_weights[j*stride + i]*input[j];
-      output[i] = WEIGHTS_SCALE*sum;
-   }
+   const rnn_weight *ip = layer->input_weights;
+   /* Compute update gate. */
+   for(i = 0; i < N; i++)
+      output[i] = layer->bias[i];
+   for (j=0;j<M;j++)
+      faxpy(output, &ip[j*N], N, input[j]);
    switch (layer->activation) {
    case ACTIVATION_SIGMOID:
       for (i=0;i<N;i++)
-        output[i] = sigmoid_approx(output[i]);
+        output[i] = sigmoid_approx(WEIGHTS_SCALE*output[i]);
       break;
    case ACTIVATION_TANH:
       for (i=0;i<N;i++)
-        output[i] = tansig_approx(output[i]);
+        output[i] = tansig_approx(WEIGHTS_SCALE*output[i]);
       break;
    default:
    case ACTIVATION_RELU:
       for (i=0;i<N;i++)
-        output[i] = relu(output[i]);
+        output[i] = relu(WEIGHTS_SCALE*output[i]);
       break;
    }
 }
@@ ... @@ void compute_gru(const GRULayer *gru, float *state, const float *input)
    M = gru->nb_inputs;
    N = gru->nb_neurons;
    stride = 3*N;
-   for (i=0;i<N;i++)
-   {
-      /* Compute update gate. */
-      float sum = gru->bias[i];
-      for (j=0;j<M;j++)
-         sum += gru->input_weights[j*stride + i]*input[j];
-      for (j=0;j<N;j++)
-         sum += gru->recurrent_weights[j*stride + i]*state[j];
-      z[i] = sigmoid_approx(WEIGHTS_SCALE*sum);
-   }
-   for (i=0;i<N;i++)
-   {
-      /* Compute reset gate. */
-      float sum = gru->bias[N + i];
-      for (j=0;j<M;j++)
-         sum += gru->input_weights[N + j*stride + i]*input[j];
-      for (j=0;j<N;j++)
-         sum += gru->recurrent_weights[N + j*stride + i]*state[j];
-      r[i] = sigmoid_approx(WEIGHTS_SCALE*sum);
+   const rnn_weight *ip = gru->input_weights;
+   const rnn_weight *rp = gru->recurrent_weights;
+   /* Compute update gate. */
+   for(i = 0; i < N; i++)
+      z[i] = gru->bias[i];
+   for (j=0;j<M;j++)
+      faxpy(z, &ip[j*stride], N, input[j]);
+   for (j=0;j<N;j++)
+      faxpy(z, &rp[j*stride], N, state[j]);
+   for(i = 0; i < N; i++)
+      z[i] = sigmoid_approx(WEIGHTS_SCALE*z[i]);
+   /* Compute reset gate. */
+   for(i = 0; i < N; i++)
+      r[i] = gru->bias[N+i];
+   ip = gru->input_weights + N;
+   rp = gru->recurrent_weights + N;
+   for (j=0;j<M;j++)
+      faxpy(r, &ip[j*stride], N, input[j]);
+   for (j=0;j<N;j++)
+      faxpy(r, &rp[j*stride], N, state[j]);
+   for(i = 0; i < N; i++)
+      r[i] = sigmoid_approx(WEIGHTS_SCALE*r[i]);
+   /* Compute output. */
+   for(i = 0; i < N; i++)
+      h[i] = gru->bias[2*N+i];
+   ip = gru->input_weights + 2*N;
+   rp = gru->recurrent_weights + 2*N;
+   for (j=0;j<M;j++)
+      faxpy(h, &ip[j*stride], N, input[j]);
+   for (j=0;j<N;j++)
+      faxpy(h, &rp[j*stride], N, r[j]*state[j]);
+   for (i=0;i<N;i++)
+   {
+      switch (gru->activation) {
+      case ACTIVATION_SIGMOID: h[i] = sigmoid_approx(WEIGHTS_SCALE*h[i]);break;
+      case ACTIVATION_TANH: h[i] = tansig_approx(WEIGHTS_SCALE*h[i]); break;
+      default:
+      case ACTIVATION_RELU: h[i] = relu(WEIGHTS_SCALE*h[i]); break;
+      }
+      h[i] = z[i]*state[i] + (1-z[i])*h[i];
    }
    for (i=0;i<N;i++)
-   {
-      /* Compute output. */
-      float sum = gru->bias[2*N + i];
-      for (j=0;j<M;j++)
-         sum += gru->input_weights[2*N + j*stride + i]*input[j];
-      for (j=0;j<N;j++)
-         sum += gru->recurrent_weights[2*N + j*stride + i]*state[j]*r[j];
-      switch (gru->activation) {
-      case ACTIVATION_SIGMOID: sum = sigmoid_approx(WEIGHTS_SCALE*sum);break;
-      case ACTIVATION_TANH: sum = tansig_approx(WEIGHTS_SCALE*sum); break;
-      default:
-      case ACTIVATION_RELU: sum = relu(WEIGHTS_SCALE*sum); break;
-      }
-      h[i] = z[i]*state[i] + (1-z[i])*sum;
-   }
-   for (i=0;i<N;i++)
       state[i] = h[i];
 }
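
What PATCH 6/6 changes is the loop order of the matrix-vector products: the old code computed each output element as a dot product, reading the weight matrix with a stride of N (or 3*N in the GRU), while faxpy() accumulates one input at a time over a contiguous run of weights, which is friendlier to the cache and to FMA/auto-vectorization, and its early return skips inputs that are exactly zero (common after ReLU). A stand-alone sketch of the two orderings follows; the helper names matvec_dot and matvec_axpy are illustrative, not from the patch:

#include <string.h>

/* y = W*x with W stored column-contiguous per input, i.e. w[j*n_out + i],
   the same layout rnn.c uses for input_weights/recurrent_weights. */

static void matvec_dot(const float *w, const float *x, float *y, int n_in, int n_out)
{
   /* One output at a time: every step reads w with stride n_out, like the old loops. */
   for (int i = 0; i < n_out; i++) {
      float sum = 0;
      for (int j = 0; j < n_in; j++)
         sum += w[j*n_out + i]*x[j];
      y[i] = sum;
   }
}

static void matvec_axpy(const float *w, const float *x, float *y, int n_in, int n_out)
{
   /* One input at a time: contiguous reads of w, easy to auto-vectorize or fuse into FMAs,
      and inputs that are exactly zero are skipped, which is what faxpy() does in the patch. */
   memset(y, 0, n_out*sizeof(*y));
   for (int j = 0; j < n_in; j++) {
      if (x[j] == 0) continue;
      for (int i = 0; i < n_out; i++)
         y[i] += w[j*n_out + i]*x[j];
   }
}
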