Skip to content

Commit 054fed0

Browse files
alancucki authored and
nv-kkudrynski committed
[FastPitch/PyT] Drop parselmouth dependency
1 parent 61bcab7 commit 054fed0

File tree

7 files changed

+10
-47
lines changed

7 files changed

+10
-47
lines changed

PyTorch/SpeechSynthesis/FastPitch/README.md

+3-8
Original file line numberDiff line numberDiff line change
@@ -363,16 +363,11 @@ FastPitch 1.1 aligns input symbols to output mel-spectrogram frames automaticall
363363
on any external aligning model. FastPitch training can now be started on raw waveforms
364364
without any pre-processing: pitch values and mel-spectrograms will be calculated on-line.
365365

366-
For every mel-spectrogram frame, its fundamental frequency in Hz is estimated with either
367-
the Probabilistic YIN algorithm or [Praat](http://praat.org).
368-
369-
The former is more accurate but time consuming, and we recommend to pre-calculate
370-
pitch during the data processing step. The latter is suitable for on-line pitch calculation.
371-
Pitch values are then averaged over every character, in order to provide sparse
372-
pitch cues for the model.
366+
For every mel-spectrogram frame, its fundamental frequency in Hz is estimated with
367+
the Probabilistic YIN algorithm.
373368

374369
<p align="center">
375-
<img src="./img/pitch.png" alt="Pitch estimates extracted with Praat" />
370+
<img src="./img/pitch.png" alt="Pitch contour estimate" />
376371
</p>
377372
<p align="center">
378373
<em>Figure 2. Pitch estimates for mel-spectrogram frames of phrase "in being comparatively"

PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py

+3-32
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,6 @@
3232

3333
import librosa
3434
import numpy as np
35-
import parselmouth
3635
import torch
3736
import torch.nn.functional as F
3837
from scipy import ndimage
@@ -88,35 +87,7 @@ def estimate_pitch(wav, mel_len, method='pyin', normalize_mean=None,
8887
if type(normalize_std) is float or type(normalize_std) is list:
8988
normalize_std = torch.tensor(normalize_std)
9089

91-
if method == 'praat':
92-
93-
snd = parselmouth.Sound(wav)
94-
pitch_mel = snd.to_pitch(time_step=snd.duration / (mel_len + 3)
95-
).selected_array['frequency']
96-
assert np.abs(mel_len - pitch_mel.shape[0]) <= 1.0
97-
98-
pitch_mel = torch.from_numpy(pitch_mel).unsqueeze(0)
99-
100-
if n_formants > 1:
101-
formant = snd.to_formant_burg(
102-
time_step=snd.duration / (mel_len + 3))
103-
formant_n_frames = formant.get_number_of_frames()
104-
assert np.abs(mel_len - formant_n_frames) <= 1.0
105-
106-
formants_mel = np.zeros((formant_n_frames + 1, n_formants - 1))
107-
for i in range(1, formant_n_frames + 1):
108-
formants_mel[i] = np.asarray([
109-
formant.get_value_at_time(
110-
formant_number=f,
111-
time=formant.get_time_from_frame_number(i))
112-
for f in range(1, n_formants)
113-
])
114-
115-
pitch_mel = torch.cat(
116-
[pitch_mel, torch.from_numpy(formants_mel).permute(1, 0)],
117-
dim=0)
118-
119-
elif method == 'pyin':
90+
if method == 'pyin':
12091

12192
snd, sr = librosa.load(wav)
12293
pitch_mel, voiced_flag, voiced_probs = librosa.pyin(
@@ -181,7 +152,7 @@ def __init__(self,
181152
pitch_online_dir=None,
182153
betabinomial_online_dir=None,
183154
use_betabinomial_interpolator=True,
184-
pitch_online_method='praat',
155+
pitch_online_method='pyin',
185156
**ignored):
186157

187158
# Expect a list of filenames
@@ -338,7 +309,7 @@ def get_pitch(self, index, mel_len=None):
338309
if cached_fpath.is_file():
339310
return torch.load(cached_fpath)
340311

341-
# No luck so far - calculate or replace with praat
312+
# No luck so far - calculate
342313
wav = audiopath
343314
if not wav.endswith('.wav'):
344315
wav = re.sub('/mels/', '/wavs/', wav)

PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ def parse_args(parser):
7373
parser.add_argument('--n-mel-channels', type=int, default=80)
7474
# Pitch extraction
7575
parser.add_argument('--f0-method', default='pyin', type=str,
76-
choices=('pyin', 'praat'), help='F0 estimation method')
76+
choices=['pyin'], help='F0 estimation method')
7777
# Performance
7878
parser.add_argument('-b', '--batch-size', default=1, type=int)
7979
parser.add_argument('--n-workers', type=int, default=16)

PyTorch/SpeechSynthesis/FastPitch/requirements.txt

-1
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,5 @@ inflect
44
librosa==0.8.0
55
scipy
66
Unidecode
7-
praat-parselmouth==0.3.3
87
tensorboardX==2.0
98
git+git://github.com/NVIDIA/dllogger.git@26a0f8f1958de2c0c460925ff6102a4d2486d6cc#egg=dllogger

PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh

+1-2
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
set -e
44

55
: ${DATA_DIR:=LJSpeech-1.1}
6-
: ${F0_METHOD:="pyin"}
76
: ${ARGS="--extract-mels"}
87

98
python prepare_dataset.py \
@@ -12,5 +11,5 @@ python prepare_dataset.py \
1211
--batch-size 1 \
1312
--dataset-path $DATA_DIR \
1413
--extract-pitch \
15-
--f0-method $F0_METHOD \
14+
--f0-method pyin \
1615
$ARGS

PyTorch/SpeechSynthesis/FastPitch/scripts/train_benchmark.sh

-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@ set -a
66
: ${NUM_GPUS_SEQUENCE:="1 4 8"}
77
: ${EPOCHS:=30}
88
: ${OUTPUT_DIR:="./output"}
9-
: ${F0_METHOD:=praat}
109
: ${BATCH_SIZE:=16}
1110

1211
for NUM_GPUS in $NUM_GPUS_SEQUENCE ; do

PyTorch/SpeechSynthesis/FastPitch/train.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -147,8 +147,8 @@ def parse_args(parser):
147147
'n_speakers > 1 enables speaker embeddings')
148148
cond.add_argument('--load-pitch-from-disk', action='store_true',
149149
help='Use pitch cached on disk with prepare_dataset.py')
150-
cond.add_argument('--pitch-online-method', default='praat',
151-
choices=['praat', 'pyin'],
150+
cond.add_argument('--pitch-online-method', default='pyin',
151+
choices=['pyin'],
152152
help='Calculate pitch on the fly during trainig')
153153
cond.add_argument('--pitch-online-dir', type=str, default=None,
154154
help='A directory for storing pitch calculated on-line')

0 commit comments

Comments
 (0)