File tree 3 files changed +9
-7
lines changed
3 files changed +9
-7
lines changed Original file line number Diff line number Diff line change @@ -156,8 +156,8 @@ false \
156
156
sel \
157
157
1.0 \
158
158
100000 \
159
- /mnt/deepseek-datasets/mmap_deepseekv3_datasets_text_document \
160
- /mnt/deepseek-datasets/mmap_deepseekv3_datasets_text_document \
159
+ /mnt/deepseek-datasets/mmap_deepseekv2_datasets_text_document \
160
+ /mnt/deepseek-datasets/mmap_deepseekv2_datasets_text_document \
161
161
/mnt/deepseek-ckpts/DeepSeek-V3-to-mcore-tp8-pp8-ep16 \
162
162
1000000000 \
163
163
10000 \
@@ -192,8 +192,8 @@ true \
192
192
sel \
193
193
1.0 \
194
194
100000 \
195
- /mnt/deepseek-datasets/mmap_deepseekv3_datasets_text_document \
196
- /mnt/deepseek-datasets/mmap_deepseekv3_datasets_text_document \
195
+ /mnt/deepseek-datasets/mmap_deepseekv2_datasets_text_document \
196
+ /mnt/deepseek-datasets/mmap_deepseekv2_datasets_text_document \
197
197
/mnt/deepseek-ckpts/DeepSeek-V3-to-mcore-tp8-pp8-ep16 \
198
198
10000 \
199
199
100 \
Original file line number Diff line number Diff line change @@ -76,7 +76,7 @@ NUM_ATTENTION_HEADS=128
76
76
NUM_LAYERS=61
77
77
INTERMEDIATE_SIZE=18432
78
78
MOE_INTERMEDIATE_SIZE=2048
79
- MAX_POSITION_EMBEDDINGS=163840
79
+ MAX_POSITION_EMBEDDINGS=${SEQ_LEN}
80
80
EXTRA_VOCAB_SIZE=467
81
81
Q_LORA_RANK=1536
82
82
KV_LORA_RANK=512
@@ -106,6 +106,8 @@ moe_options=" \
106
106
--moe-router-enable-expert-bias \
107
107
--mscale 1.0 \
108
108
--mscale-all-dim 1.0 \
109
+ --moe-token-drop-policy probs \
110
+ --moe-router-pre-softmax \
109
111
--moe-router-score-function sigmoid \
110
112
--moe-router-bias-update-rate 0.001 \
111
113
--moe-aux-loss-coeff 0.001 \
Original file line number Diff line number Diff line change @@ -541,9 +541,9 @@ def train_valid_test_dataloaders_provider(train_val_test_num_samples):
541
541
EnergonDataloader (get_loader (valid_ds , worker_config = worker_config ))
542
542
for valid_ds in valid_ds1
543
543
]
544
- test_dataloader = None # NOTE: no test
544
+ test_dataloader = None
545
545
546
- return EnergonDataloader (train_dataloader ), valid_dataloader , None
546
+ return EnergonDataloader (train_dataloader ), valid_dataloader , EnergonDataloader ( test_dataloader )
547
547
548
548
549
549
class EnergonDataloader :
You can’t perform that action at this time.
0 commit comments