use PySoundFile instead of librosa.load (audioread)

albertz · Spotlight0xff · commit 98a6faca176d · 2018-03-01T18:12:32.000+01:00
beetbox/audioread#64 librosa/librosa#681 beetbox/audioread#62 beetbox/audioread#63
diff --git a/GeneratingDataset.py b/GeneratingDataset.py
@@ -691,6 +691,12 @@ class _NltkCorpusReaderDataset(CachedDataset2):
 
 
 class ExtractAudioFeatures:
+  """
+  Currently uses librosa to extract MFCC features.
+  We could also use python_speech_features.
+  We could also add support for directly extracting other features, e.g. log-filterbanks.
+  """
+
   def __init__(self,
                window_len=0.025, step_len=0.010,
                num_feature_filters=40, with_delta=False, norm_mean=None, norm_std_dev=None,
@@ -1524,9 +1530,6 @@ def __init__(self, path, prefix, bpe, audio, partition_epoch=None, fixed_random_
     self.prefix = prefix
     assert prefix in ["train", "dev", "eval"]
     assert os.path.exists(path + "/train-clean-100")
-    import Util
-    Util.monkeyfix_glib()
-    Util.monkeypatch_audioread()
     self.bpe = BytePairEncoding(**bpe)
     self.labels = self.bpe.labels
     self._fixed_random_seed = fixed_random_seed
@@ -1619,13 +1622,20 @@ def _collect_single_seq(self, seq_idx):
     :param int seq_idx:
     :rtype: DatasetSeq
     """
+    # Don't use librosa.load: it internally uses audioread, which would use GStreamer as a backend,
+    # and GStreamer has multiple issues:
+    # https://github.com/beetbox/audioread/issues/62
+    # https://github.com/beetbox/audioread/issues/63
+    # Instead, use PySoundFile, which is also faster. See here for discussions:
+    # https://github.com/beetbox/audioread/issues/64
+    # https://github.com/librosa/librosa/issues/681
     import os
-    import librosa
+    import soundfile  # pip install pysoundfile
     subdir, speaker_id, chapter_id, seq_id = self._reference_seq_order[self._get_ref_seq_idx(seq_idx)]
     audio_fn = "%(p)s/%(sd)s/%(sp)i/%(ch)i/%(sp)i-%(ch)i-%(i)04i.flac" % {
       "p": self.path, "sd": subdir, "sp": speaker_id, "ch": chapter_id, "i": seq_id}
     assert os.path.exists(audio_fn)
-    audio, sample_rate = librosa.load(audio_fn, sr=None)
+    audio, sample_rate = soundfile.read(audio_fn)
     features = self.feature_extractor.get_audio_features(audio=audio, sample_rate=sample_rate)
     targets_txt = self.transs[(subdir, speaker_id, chapter_id, seq_id)]
     targets = numpy.array(self.bpe.get_seq(targets_txt), dtype="int32")
diff --git a/Util.py b/Util.py
@@ -2570,8 +2570,8 @@ def monkeyfix_glib():
   """
   Fixes some stupid bugs such that SIGINT is not working.
   This is used by audioread, and indirectly by librosa for loading audio.
-
   https://stackoverflow.com/questions/16410852/
+  See also :func:`monkeypatch_audioread`.
   """
   try:
     import gi
@@ -2593,6 +2593,16 @@ def monkeypatch_audioread():
   audioread does not behave optimal in some cases.
   E.g. each call to _ca_available() takes quite long because of the ctypes.util.find_library usage.
   We will patch this.
+
+  However, the recommendation is to not use audioread (and thus librosa.load) at all.
+  audioread currently uses GStreamer as its default backend (on Linux).
+  GStreamer has multiple issues. See also :func:`monkeyfix_glib`, and here for discussion:
+  https://github.com/beetbox/audioread/issues/62
+  https://github.com/beetbox/audioread/issues/63
+
+  Instead, use PySoundFile, which is also faster. See here for discussions:
+  https://github.com/beetbox/audioread/issues/64
+  https://github.com/librosa/librosa/issues/681
   """
   try:
     import audioread