add rescaling option

r9y9 · r9y9 · commit 09cd766ca027 · 2018-01-27T14:05:37.000+09:00
diff --git a/hparams.py b/hparams.py
@@ -116,6 +116,11 @@
     preemphasis=0.97,
     min_level_db=-100,
     ref_level_db=20,
+    # Whether to rescale the waveform or not.
+    # Let x be an input waveform; the rescaled waveform y is given by:
+    # y = x / np.abs(x).max() * rescaling_max
+    rescaling=False,
+    rescaling_max=0.999,
     # mel-spectrogram is normalized to [0, 1] for each utterance and clipping may
     # happen depends on min_level_db and ref_level_db, causing clipping noise.
     # If False, assertion is added to ensure no clipping happens.
diff --git a/jsut.py b/jsut.py
@@ -44,6 +44,9 @@ def _process_utterance(out_dir, index, wav_path, text):
     else:
         wav, _ = librosa.effects.trim(wav, top_db=30)
 
+    if hparams.rescaling:
+        wav = wav / np.abs(wav).max() * hparams.rescaling_max
+
     # Compute the linear-scale spectrogram from the wav:
     spectrogram = audio.spectrogram(wav).astype(np.float32)
     n_frames = spectrogram.shape[1]
diff --git a/ljspeech.py b/ljspeech.py
@@ -53,6 +53,9 @@ def _process_utterance(out_dir, index, wav_path, text):
     # Load the audio to a numpy array:
     wav = audio.load_wav(wav_path)
 
+    if hparams.rescaling:
+        wav = wav / np.abs(wav).max() * hparams.rescaling_max
+
     # Compute the linear-scale spectrogram from the wav:
     spectrogram = audio.spectrogram(wav).astype(np.float32)
     n_frames = spectrogram.shape[1]
diff --git a/vctk.py b/vctk.py
@@ -67,6 +67,9 @@ def _process_utterance(out_dir, index, speaker_id, wav_path, text):
     else:
         wav, _ = librosa.effects.trim(wav, top_db=15)
 
+    if hparams.rescaling:
+        wav = wav / np.abs(wav).max() * hparams.rescaling_max
+
     # Compute the linear-scale spectrogram from the wav:
     spectrogram = audio.spectrogram(wav).astype(np.float32)
     n_frames = spectrogram.shape[1]