@@ -691,6 +691,12 @@ class _NltkCorpusReaderDataset(CachedDataset2):
691
691
692
692
693
693
class ExtractAudioFeatures :
694
+ """
695
+ Currently uses librosa to extract MFCC features.
696
+ We could also use python_speech_features.
697
+ We could also add support for directly extracting other features, e.g. log-filterbanks.
698
+ """
699
+
694
700
def __init__ (self ,
695
701
window_len = 0.025 , step_len = 0.010 ,
696
702
num_feature_filters = 40 , with_delta = False , norm_mean = None , norm_std_dev = None ,
@@ -1524,9 +1530,6 @@ def __init__(self, path, prefix, bpe, audio, partition_epoch=None, fixed_random_
1524
1530
self .prefix = prefix
1525
1531
assert prefix in ["train" , "dev" , "eval" ]
1526
1532
assert os .path .exists (path + "/train-clean-100" )
1527
- import Util
1528
- Util .monkeyfix_glib ()
1529
- Util .monkeypatch_audioread ()
1530
1533
self .bpe = BytePairEncoding (** bpe )
1531
1534
self .labels = self .bpe .labels
1532
1535
self ._fixed_random_seed = fixed_random_seed
@@ -1619,13 +1622,20 @@ def _collect_single_seq(self, seq_idx):
1619
1622
:param int seq_idx:
1620
1623
:rtype: DatasetSeq
1621
1624
"""
1625
+ # Don't use librosa.load: it internally uses audioread, which would use GStreamer as a backend,
1626
+ # which has multiple issues:
1627
+ # https://github.com/beetbox/audioread/issues/62
1628
+ # https://github.com/beetbox/audioread/issues/63
1629
+ # Instead, use PySoundFile, which is also faster. See here for discussions:
1630
+ # https://github.com/beetbox/audioread/issues/64
1631
+ # https://github.com/librosa/librosa/issues/681
1622
1632
import os
1623
- import librosa
1633
+ import soundfile # pip install pysoundfile
1624
1634
subdir , speaker_id , chapter_id , seq_id = self ._reference_seq_order [self ._get_ref_seq_idx (seq_idx )]
1625
1635
audio_fn = "%(p)s/%(sd)s/%(sp)i/%(ch)i/%(sp)i-%(ch)i-%(i)04i.flac" % {
1626
1636
"p" : self .path , "sd" : subdir , "sp" : speaker_id , "ch" : chapter_id , "i" : seq_id }
1627
1637
assert os .path .exists (audio_fn )
1628
- audio , sample_rate = librosa . load (audio_fn , sr = None )
1638
+ audio , sample_rate = soundfile . read (audio_fn )
1629
1639
features = self .feature_extractor .get_audio_features (audio = audio , sample_rate = sample_rate )
1630
1640
targets_txt = self .transs [(subdir , speaker_id , chapter_id , seq_id )]
1631
1641
targets = numpy .array (self .bpe .get_seq (targets_txt ), dtype = "int32" )
0 commit comments