Skip to content

Commit 10c09fb

Browse files
authored
[audio utils] fix fft_bin_width computation (#1274)
1 parent 991b259 commit 10c09fb

File tree

4 files changed

+14
-20
lines changed

4 files changed

+14
-20
lines changed

src/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.js

Lines changed: 1 addition & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,7 @@ export class ASTFeatureExtractor extends FeatureExtractor {
1010

1111
const sampling_rate = this.config.sampling_rate;
1212
const mel_filters = mel_filter_bank(
13-
256, // num_frequency_bins
13+
257, // num_frequency_bins
1414
this.config.num_mel_bins, // num_mel_filters
1515
20, // min_frequency
1616
Math.floor(sampling_rate / 2), // max_frequency
@@ -19,11 +19,6 @@ export class ASTFeatureExtractor extends FeatureExtractor {
1919
"kaldi", // mel_scale
2020
true, // triangularize_in_mel_space
2121
);
22-
23-
// Do padding:
24-
for (let i = 0; i < mel_filters.length; ++i) {
25-
mel_filters[i].push(0);
26-
}
2722
this.mel_filters = mel_filters;
2823

2924
this.window = window_function(400, 'hann', {

src/models/seamless_m4t/feature_extraction_seamless_m4t.js

Lines changed: 1 addition & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@ export class SeamlessM4TFeatureExtractor extends FeatureExtractor {
99

1010
const sampling_rate = this.config.sampling_rate;
1111
const mel_filters = mel_filter_bank(
12-
256, // num_frequency_bins
12+
257, // num_frequency_bins
1313
this.config.num_mel_bins, // num_mel_filters
1414
20, // min_frequency
1515
Math.floor(sampling_rate / 2), // max_frequency
@@ -18,11 +18,6 @@ export class SeamlessM4TFeatureExtractor extends FeatureExtractor {
1818
"kaldi", // mel_scale
1919
true, // triangularize_in_mel_space
2020
);
21-
22-
// Do padding:
23-
for (let i = 0; i < mel_filters.length; ++i) {
24-
mel_filters[i].push(0);
25-
}
2621
this.mel_filters = mel_filters;
2722

2823
this.window = window_function(400, 'povey', {

src/models/wespeaker/feature_extraction_wespeaker.js

Lines changed: 1 addition & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,7 @@ export class WeSpeakerFeatureExtractor extends FeatureExtractor {
1010

1111
const sampling_rate = this.config.sampling_rate;
1212
const mel_filters = mel_filter_bank(
13-
256, // num_frequency_bins
13+
257, // num_frequency_bins
1414
this.config.num_mel_bins, // num_mel_filters
1515
20, // min_frequency
1616
Math.floor(sampling_rate / 2), // max_frequency
@@ -19,11 +19,6 @@ export class WeSpeakerFeatureExtractor extends FeatureExtractor {
1919
"kaldi", // mel_scale
2020
true, // triangularize_in_mel_space
2121
);
22-
23-
// Do padding:
24-
for (let i = 0; i < mel_filters.length; ++i) {
25-
mel_filters[i].push(0);
26-
}
2722
this.mel_filters = mel_filters;
2823

2924
this.window = window_function(400, 'hamming', {

src/utils/audio.js

Lines changed: 11 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -235,7 +235,8 @@ function linspace(start, end, num) {
235235
* various implementations exist, which differ in the number of filters, the shape of the filters, the way the filters
236236
* are spaced, the bandwidth of the filters, and the manner in which the spectrum is warped. The goal of these
237237
* features is to approximate the non-linear human perception of the variation in pitch with respect to the frequency.
238-
* @param {number} num_frequency_bins Number of frequencies used to compute the spectrogram (should be the same as in `stft`).
238+
* @param {number} num_frequency_bins Number of frequency bins (should be the same as `n_fft // 2 + 1`
239+
* where `n_fft` is the size of the Fourier Transform used to compute the spectrogram).
239240
* @param {number} num_mel_filters Number of mel filters to generate.
240241
* @param {number} min_frequency Lowest frequency of interest in Hz.
241242
* @param {number} max_frequency Highest frequency of interest in Hz. This should not exceed `sampling_rate / 2`.
@@ -261,6 +262,14 @@ export function mel_filter_bank(
261262
throw new Error('norm must be one of null or "slaney"');
262263
}
263264

265+
if (num_frequency_bins < 2) {
266+
throw new Error(`Require num_frequency_bins: ${num_frequency_bins} >= 2`);
267+
}
268+
269+
if (min_frequency > max_frequency) {
270+
throw new Error(`Require min_frequency: ${min_frequency} <= max_frequency: ${max_frequency}`);
271+
}
272+
264273
const mel_min = hertz_to_mel(min_frequency, mel_scale);
265274
const mel_max = hertz_to_mel(max_frequency, mel_scale);
266275
const mel_freqs = linspace(mel_min, mel_max, num_mel_filters + 2);
@@ -269,7 +278,7 @@ export function mel_filter_bank(
269278
let fft_freqs; // frequencies of FFT bins in Hz
270279

271280
if (triangularize_in_mel_space) {
272-
const fft_bin_width = sampling_rate / (num_frequency_bins * 2);
281+
const fft_bin_width = sampling_rate / ((num_frequency_bins - 1) * 2);
273282
fft_freqs = hertz_to_mel(Float64Array.from({ length: num_frequency_bins }, (_, i) => i * fft_bin_width), mel_scale);
274283
filter_freqs = mel_freqs;
275284
} else {

0 commit comments

Comments
 (0)