From 7bbc7c0ccc5bcb9e00a344f55ecd45b57fc57966 Mon Sep 17 00:00:00 2001 From: yerfor Date: Mon, 16 May 2022 21:02:23 +0800 Subject: [PATCH] small updates for libritts --- README.md | 2 +- egs/datasets/audio/libritts/base_text2mel.yaml | 2 +- egs/datasets/audio/lj/base_text2mel.yaml | 3 ++- modules/tts/syntaspeech/syntaspeech.py | 1 + tasks/tts/dataset_utils.py | 13 +++++++++++-- 5 files changed, 16 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 7cfaa35..5390e5a 100644 --- a/README.md +++ b/README.md @@ -79,7 +79,7 @@ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config egs/tts/biaobei/synta.yaml - Audio samples in the paper can be found in our [demo page](https://syntaspeech.github.io/). -We also provide [HuggingFace Demo Page](https://huggingface.co/spaces/NATSpeech/PortaSpeech) for LJSpeech. Try your interesting sentences there! +We also provide [HuggingFace Demo Page](https://huggingface.co/spaces/yerfor/SyntaSpeech) for LJSpeech. Try your interesting sentences there! ## Citation diff --git a/egs/datasets/audio/libritts/base_text2mel.yaml b/egs/datasets/audio/libritts/base_text2mel.yaml index 1a34511..2a1a277 100644 --- a/egs/datasets/audio/libritts/base_text2mel.yaml +++ b/egs/datasets/audio/libritts/base_text2mel.yaml @@ -8,7 +8,7 @@ binarization_args: train_range: [ 871, -1 ] test_range: [ 0, 523 ] valid_range: [ 523, 871 ] - shuffle: false + shuffle: true with_spk_id: true with_spk_embed: false test_ids: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, diff --git a/egs/datasets/audio/lj/base_text2mel.yaml b/egs/datasets/audio/lj/base_text2mel.yaml index 52a91fe..3efe587 100644 --- a/egs/datasets/audio/lj/base_text2mel.yaml +++ b/egs/datasets/audio/lj/base_text2mel.yaml @@ -14,4 +14,5 @@ test_ids: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 316, 324, 402, 422, 485, 500, 505, 508, 509, 519 ] f0_min: 80 f0_max: 600 -vocoder_ckpt: checkpoints/hifi_lj \ No newline at end of file +vocoder_ckpt: checkpoints/hifi_lj +num_valid_plots: 30 \ No newline at end of file diff --git a/modules/tts/syntaspeech/syntaspeech.py b/modules/tts/syntaspeech/syntaspeech.py index 3ac7be6..188f612 100644 --- a/modules/tts/syntaspeech/syntaspeech.py +++ b/modules/tts/syntaspeech/syntaspeech.py @@ -120,6 +120,7 @@ def forward(self, txt_tokens, word_tokens, ph2word, word_len, mel2word=None, mel style_embed = self.forward_style_embed(spk_embed, spk_id) # speaker embedding, [B, 1, C] x, tgt_nonpadding = self.run_text_encoder( txt_tokens, word_tokens, ph2word, word_len, mel2word, mel2ph, style_embed, ret, graph_lst=graph_lst, etypes_lst=etypes_lst) + x = x + style_embed # it maybe necessary to achieve multi-speaker x = x * tgt_nonpadding ret['nonpadding'] = tgt_nonpadding if self.hparams['use_pitch_embed']: diff --git a/tasks/tts/dataset_utils.py b/tasks/tts/dataset_utils.py index 08772c5..19a2c6f 100644 --- a/tasks/tts/dataset_utils.py +++ b/tasks/tts/dataset_utils.py @@ -30,8 +30,17 @@ def __init__(self, prefix, shuffle=False, items=None, data_dir=None): self.avail_idxs = list(range(len(self.sizes))) if prefix == 'train' and hparams['min_frames'] > 0: self.avail_idxs = [x for x in self.avail_idxs if self.sizes[x] >= hparams['min_frames']] - self.sizes = [self.sizes[i] for i in self.avail_idxs] - + try: + self.sizes = [self.sizes[i] for i in self.avail_idxs] + except: + tmp_sizes = [] + for i in self.avail_idxs: + try: + tmp_sizes.append(self.sizes[i]) + except: + continue + self.sizes = tmp_sizes + def _get_item(self, index): if hasattr(self, 'avail_idxs') and self.avail_idxs is not None: index = self.avail_idxs[index]