Skip to content

[audio utils] fix fft_bin_width computation #1274

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 9, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ export class ASTFeatureExtractor extends FeatureExtractor {

const sampling_rate = this.config.sampling_rate;
const mel_filters = mel_filter_bank(
256, // num_frequency_bins
257, // num_frequency_bins
this.config.num_mel_bins, // num_mel_filters
20, // min_frequency
Math.floor(sampling_rate / 2), // max_frequency
Expand All @@ -19,11 +19,6 @@ export class ASTFeatureExtractor extends FeatureExtractor {
"kaldi", // mel_scale
true, // triangularize_in_mel_space
);

// Do padding:
for (let i = 0; i < mel_filters.length; ++i) {
mel_filters[i].push(0);
}
this.mel_filters = mel_filters;

this.window = window_function(400, 'hann', {
Expand Down
7 changes: 1 addition & 6 deletions src/models/seamless_m4t/feature_extraction_seamless_m4t.js
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ export class SeamlessM4TFeatureExtractor extends FeatureExtractor {

const sampling_rate = this.config.sampling_rate;
const mel_filters = mel_filter_bank(
256, // num_frequency_bins
257, // num_frequency_bins
this.config.num_mel_bins, // num_mel_filters
20, // min_frequency
Math.floor(sampling_rate / 2), // max_frequency
Expand All @@ -18,11 +18,6 @@ export class SeamlessM4TFeatureExtractor extends FeatureExtractor {
"kaldi", // mel_scale
true, // triangularize_in_mel_space
);

// Do padding:
for (let i = 0; i < mel_filters.length; ++i) {
mel_filters[i].push(0);
}
this.mel_filters = mel_filters;

this.window = window_function(400, 'povey', {
Expand Down
7 changes: 1 addition & 6 deletions src/models/wespeaker/feature_extraction_wespeaker.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ export class WeSpeakerFeatureExtractor extends FeatureExtractor {

const sampling_rate = this.config.sampling_rate;
const mel_filters = mel_filter_bank(
256, // num_frequency_bins
257, // num_frequency_bins
this.config.num_mel_bins, // num_mel_filters
20, // min_frequency
Math.floor(sampling_rate / 2), // max_frequency
Expand All @@ -19,11 +19,6 @@ export class WeSpeakerFeatureExtractor extends FeatureExtractor {
"kaldi", // mel_scale
true, // triangularize_in_mel_space
);

// Do padding:
for (let i = 0; i < mel_filters.length; ++i) {
mel_filters[i].push(0);
}
this.mel_filters = mel_filters;

this.window = window_function(400, 'hamming', {
Expand Down
13 changes: 11 additions & 2 deletions src/utils/audio.js
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,8 @@ function linspace(start, end, num) {
* various implementations exist, which differ in the number of filters, the shape of the filters, the way the filters
* are spaced, the bandwidth of the filters, and the manner in which the spectrum is warped. The goal of these
* features is to approximate the non-linear human perception of the variation in pitch with respect to the frequency.
* @param {number} num_frequency_bins Number of frequencies used to compute the spectrogram (should be the same as in `stft`).
* @param {number} num_frequency_bins Number of frequency bins (should be the same as `Math.floor(n_fft / 2) + 1`,
* where `n_fft` is the size of the Fourier Transform used to compute the spectrogram).
* @param {number} num_mel_filters Number of mel filters to generate.
* @param {number} min_frequency Lowest frequency of interest in Hz.
* @param {number} max_frequency Highest frequency of interest in Hz. This should not exceed `sampling_rate / 2`.
Expand All @@ -261,6 +262,14 @@ export function mel_filter_bank(
throw new Error('norm must be one of null or "slaney"');
}

if (num_frequency_bins < 2) {
throw new Error(`Require num_frequency_bins: ${num_frequency_bins} >= 2`);
}

if (min_frequency > max_frequency) {
throw new Error(`Require min_frequency: ${min_frequency} <= max_frequency: ${max_frequency}`);
}

const mel_min = hertz_to_mel(min_frequency, mel_scale);
const mel_max = hertz_to_mel(max_frequency, mel_scale);
const mel_freqs = linspace(mel_min, mel_max, num_mel_filters + 2);
Expand All @@ -269,7 +278,7 @@ export function mel_filter_bank(
let fft_freqs; // frequencies of FFT bins in Hz

if (triangularize_in_mel_space) {
const fft_bin_width = sampling_rate / (num_frequency_bins * 2);
const fft_bin_width = sampling_rate / ((num_frequency_bins - 1) * 2);
fft_freqs = hertz_to_mel(Float64Array.from({ length: num_frequency_bins }, (_, i) => i * fft_bin_width), mel_scale);
filter_freqs = mel_freqs;
} else {
Expand Down