-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathpreprocess.py
83 lines (70 loc) · 3.42 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import os
import glob
import tqdm
import torch
import argparse
import numpy as np
import hparams as hp
from stft import TacotronSTFT
from utils.utils import read_wav_np
from audio_processing import pitch
from text import phonemes_to_sequence
def _utterance_id(name):
    """Return the utterance id of a wav path or file name.

    The id is the base name up to (not including) its first dot, e.g.
    ``/data/wavs/LJ001-0001.wav`` -> ``LJ001-0001``.
    """
    return os.path.basename(name).split(".")[0]


def _save_acoustic_features(stft, wav_files, mel_path, energy_path, pitch_path):
    """Compute and save mel, energy and pitch for every wav file.

    Args:
        stft: object exposing ``mel_spectrogram(wav)`` that returns ``(mel, mag)``.
        wav_files: iterable of wav file paths.
        mel_path: output directory for mel spectrograms.
        energy_path: output directory for per-frame energy.
        pitch_path: output directory for per-frame pitch.

    Returns:
        The set of utterance ids for which features were written.
    """
    processed_ids = set()
    for wavpath in tqdm.tqdm(wav_files, desc='preprocess wav to mel, energy, and pitch'):
        # The sampling rate returned by the reader is not used here.
        _sr, wav = read_wav_np(wavpath)
        # Pitch is computed on the raw numpy signal, before the tensor conversion.
        p = pitch(wav)  # [T, ] T = number of frames
        wav = torch.from_numpy(wav).unsqueeze(0)  # add a leading batch axis
        mel, mag = stft.mel_spectrogram(wav)  # mel [1, num_mel, T], mag [1, num_mag, T]
        mel = mel.squeeze(0)  # [num_mel, T]
        mag = mag.squeeze(0)  # [num_mag, T]
        # Energy is the norm of the magnitude spectrum over the frequency axis.
        e = torch.norm(mag, dim=0)  # [T, ]
        # Trim pitch so it never has more frames than the mel spectrogram.
        p = np.array(p[:mel.shape[1]], dtype='float32')

        utt_id = _utterance_id(wavpath)
        np.save(os.path.join(mel_path, utt_id + '.npy'), mel.numpy(), allow_pickle=False)
        np.save(os.path.join(energy_path, utt_id + '.npy'), e.numpy(), allow_pickle=False)
        np.save(os.path.join(pitch_path, utt_id + '.npy'), p, allow_pickle=False)
        processed_ids.add(utt_id)
    return processed_ids


def _save_alignment_features(alignment_file, wav_root_path, known_ids,
                             text_path, duration_path, symbol_path):
    """Save text, duration and symbol features listed in the alignment file.

    Each non-blank line is ``|``-separated: field 0 is the text, field 2 the
    whitespace-separated durations, field 3 the phonemes and field 4 starts
    with the wav file name. Only utterances that have a wav are exported.

    Args:
        alignment_file: path of the alignment text file (read as UTF-8).
        wav_root_path: root directory of the wav files.
        known_ids: utterance ids already processed from the wav files.
        text_path: output directory for ``(text, phoneme)`` pairs.
        duration_path: output directory for duration arrays.
        symbol_path: output directory for symbol sequences.

    Raises:
        IndexError: if a non-blank line has fewer than five fields.
    """
    with open(alignment_file, encoding='utf-8') as f:
        for line in f:
            # A blank line (e.g. at the end of the file) carries no entry.
            if not line.strip():
                continue
            content = line.split('|')
            utt_id = content[4].split()[0].split('.')[0]
            # Wavs are collected recursively, so accept any utterance whose wav was
            # processed; the direct check under the root is kept for compatibility.
            has_wav = utt_id in known_ids or os.path.exists(
                os.path.join(wav_root_path, utt_id + '.wav'))
            if not has_wav:
                continue

            text = content[0]
            dur = np.array(content[2].split(), dtype='float32')
            phoneme = content[3]
            symbol_sequence = phonemes_to_sequence(phoneme)

            # Both the raw text and its phoneme string are stored together.
            np.save(os.path.join(text_path, utt_id + '.npy'), (text, phoneme), allow_pickle=False)
            np.save(os.path.join(duration_path, utt_id + '.npy'), dur, allow_pickle=False)
            np.save(os.path.join(symbol_path, utt_id + '.npy'), symbol_sequence, allow_pickle=False)


def main(args):
    """Preprocess a wav corpus into per-utterance ``.npy`` feature files.

    For every ``*.wav`` found recursively under ``args.wav_root_path`` the mel
    spectrogram, energy and pitch are saved under ``hp.data_path``. Then the
    ``alignment.txt`` file in ``hp.filelist_alignment_dir`` is read and, for
    each listed utterance that has a wav, the text, durations and symbol
    sequence are saved as well.

    Args:
        args: parsed command-line arguments; ``args.wav_root_path`` is the
            root directory of the wav files.
    """
    stft = TacotronSTFT(filter_length=hp.n_fft,
                        hop_length=hp.hop_length,
                        win_length=hp.win_length,
                        n_mel_channels=hp.n_mels,
                        sampling_rate=hp.sampling_rate,
                        mel_fmin=hp.fmin,
                        mel_fmax=hp.fmax)

    # Location of the wav files (searched recursively).
    wav_files = glob.glob(os.path.join(args.wav_root_path, '**', '*.wav'), recursive=True)

    # Output directory for each feature type.
    text_path = os.path.join(hp.data_path, 'text')
    mel_path = os.path.join(hp.data_path, 'mels')
    duration_path = os.path.join(hp.data_path, 'alignment')
    energy_path = os.path.join(hp.data_path, 'energy')
    pitch_path = os.path.join(hp.data_path, 'pitch')
    symbol_path = os.path.join(hp.data_path, 'symbol')

    # Create the directories if they do not exist yet.
    for directory in (text_path, duration_path, mel_path,
                      energy_path, pitch_path, symbol_path):
        os.makedirs(directory, exist_ok=True)

    processed_ids = _save_acoustic_features(
        stft, wav_files, mel_path, energy_path, pitch_path)

    # os.path.join works whether or not the configured directory ends with a separator.
    alignment_file = os.path.join(hp.filelist_alignment_dir, "alignment.txt")
    _save_alignment_features(alignment_file, args.wav_root_path, processed_ids,
                             text_path, duration_path, symbol_path)
if __name__ == '__main__':
    # Script entry point: the only option is the (mandatory) wav root directory.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '-d',
        '--wav_root_path',
        required=True,
        type=str,
        help="root directory of wav files",
    )
    main(cli.parse_args())