From 7bbc7c0ccc5bcb9e00a344f55ecd45b57fc57966 Mon Sep 17 00:00:00 2001
From: yerfor <zhenhuiye@zju.edu.cn>
Date: Mon, 16 May 2022 21:02:23 +0800
Subject: [PATCH] small updates for libritts

---
 README.md                                      |  2 +-
 egs/datasets/audio/libritts/base_text2mel.yaml |  2 +-
 egs/datasets/audio/lj/base_text2mel.yaml       |  3 ++-
 modules/tts/syntaspeech/syntaspeech.py         |  1 +
 tasks/tts/dataset_utils.py                     | 13 +++++++++++--
 5 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 7cfaa35..5390e5a 100644
--- a/README.md
+++ b/README.md
@@ -79,7 +79,7 @@ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config egs/tts/biaobei/synta.yaml -
 
 Audio samples in the paper can be found in our [demo page](https://syntaspeech.github.io/).
 
-We also provide [HuggingFace Demo Page](https://huggingface.co/spaces/NATSpeech/PortaSpeech) for LJSpeech. Try your interesting sentences there!
+We also provide a [HuggingFace demo page](https://huggingface.co/spaces/yerfor/SyntaSpeech) for LJSpeech. Try your own sentences there!
 
 ## Citation
 
diff --git a/egs/datasets/audio/libritts/base_text2mel.yaml b/egs/datasets/audio/libritts/base_text2mel.yaml
index 1a34511..2a1a277 100644
--- a/egs/datasets/audio/libritts/base_text2mel.yaml
+++ b/egs/datasets/audio/libritts/base_text2mel.yaml
@@ -8,7 +8,7 @@ binarization_args:
   train_range: [ 871, -1 ]
   test_range: [ 0, 523 ]
   valid_range: [ 523, 871 ]
-  shuffle: false
+  shuffle: true
   with_spk_id: true
   with_spk_embed: false
 test_ids: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
diff --git a/egs/datasets/audio/lj/base_text2mel.yaml b/egs/datasets/audio/lj/base_text2mel.yaml
index 52a91fe..3efe587 100644
--- a/egs/datasets/audio/lj/base_text2mel.yaml
+++ b/egs/datasets/audio/lj/base_text2mel.yaml
@@ -14,4 +14,5 @@ test_ids: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
             316, 324, 402, 422, 485, 500, 505, 508, 509, 519 ]
 f0_min: 80
 f0_max: 600
-vocoder_ckpt: checkpoints/hifi_lj
\ No newline at end of file
+vocoder_ckpt: checkpoints/hifi_lj
+num_valid_plots: 30
\ No newline at end of file
diff --git a/modules/tts/syntaspeech/syntaspeech.py b/modules/tts/syntaspeech/syntaspeech.py
index 3ac7be6..188f612 100644
--- a/modules/tts/syntaspeech/syntaspeech.py
+++ b/modules/tts/syntaspeech/syntaspeech.py
@@ -120,6 +120,7 @@ def forward(self, txt_tokens, word_tokens, ph2word, word_len, mel2word=None, mel
         style_embed = self.forward_style_embed(spk_embed, spk_id) # speaker embedding, [B, 1, C]
         x, tgt_nonpadding = self.run_text_encoder(
             txt_tokens, word_tokens, ph2word, word_len, mel2word, mel2ph, style_embed, ret, graph_lst=graph_lst, etypes_lst=etypes_lst)
+        x = x + style_embed # this may be necessary to achieve multi-speaker support
         x = x * tgt_nonpadding
         ret['nonpadding'] = tgt_nonpadding
         if self.hparams['use_pitch_embed']:
diff --git a/tasks/tts/dataset_utils.py b/tasks/tts/dataset_utils.py
index 08772c5..19a2c6f 100644
--- a/tasks/tts/dataset_utils.py
+++ b/tasks/tts/dataset_utils.py
@@ -30,8 +30,17 @@ def __init__(self, prefix, shuffle=False, items=None, data_dir=None):
                 self.avail_idxs = list(range(len(self.sizes)))
             if prefix == 'train' and hparams['min_frames'] > 0:
                 self.avail_idxs = [x for x in self.avail_idxs if self.sizes[x] >= hparams['min_frames']]
-            self.sizes = [self.sizes[i] for i in self.avail_idxs]
-
+            try:
+                self.sizes = [self.sizes[i] for i in self.avail_idxs]
+            except:
+                tmp_sizes = []
+                for i in self.avail_idxs:
+                    try:
+                        tmp_sizes.append(self.sizes[i])
+                    except:
+                        continue
+                self.sizes = tmp_sizes
+                
     def _get_item(self, index):
         if hasattr(self, 'avail_idxs') and self.avail_idxs is not None:
             index = self.avail_idxs[index]