Skip to content

Commit 0f87624

Browse files
jerryli1981 authored and 同润 committed
Revert "Remove drop tokens and presoftmax for ds-v3 and remove useless test dataloader …"
1 parent 93dc0af commit 0f87624

File tree

3 files changed

+9
-7
lines changed

3 files changed

+9
-7
lines changed

examples/deepseek_v3/README.md

+4 -4
Original file line number | Diff line number | Diff line change
@@ -156,8 +156,8 @@ false \
156156
sel \
157157
1.0 \
158158
100000 \
159-
/mnt/deepseek-datasets/mmap_deepseekv3_datasets_text_document \
160-
/mnt/deepseek-datasets/mmap_deepseekv3_datasets_text_document \
159+
/mnt/deepseek-datasets/mmap_deepseekv2_datasets_text_document \
160+
/mnt/deepseek-datasets/mmap_deepseekv2_datasets_text_document \
161161
/mnt/deepseek-ckpts/DeepSeek-V3-to-mcore-tp8-pp8-ep16 \
162162
1000000000 \
163163
10000 \
@@ -192,8 +192,8 @@ true \
192192
sel \
193193
1.0 \
194194
100000 \
195-
/mnt/deepseek-datasets/mmap_deepseekv3_datasets_text_document \
196-
/mnt/deepseek-datasets/mmap_deepseekv3_datasets_text_document \
195+
/mnt/deepseek-datasets/mmap_deepseekv2_datasets_text_document \
196+
/mnt/deepseek-datasets/mmap_deepseekv2_datasets_text_document \
197197
/mnt/deepseek-ckpts/DeepSeek-V3-to-mcore-tp8-pp8-ep16 \
198198
10000 \
199199
100 \

examples/deepseek_v3/run_mcore_deepseek.sh

+3 -1
Original file line number | Diff line number | Diff line change
@@ -76,7 +76,7 @@ NUM_ATTENTION_HEADS=128
7676
NUM_LAYERS=61
7777
INTERMEDIATE_SIZE=18432
7878
MOE_INTERMEDIATE_SIZE=2048
79-
MAX_POSITION_EMBEDDINGS=163840
79+
MAX_POSITION_EMBEDDINGS=${SEQ_LEN}
8080
EXTRA_VOCAB_SIZE=467
8181
Q_LORA_RANK=1536
8282
KV_LORA_RANK=512
@@ -106,6 +106,8 @@ moe_options=" \
106106
--moe-router-enable-expert-bias \
107107
--mscale 1.0 \
108108
--mscale-all-dim 1.0 \
109+
--moe-token-drop-policy probs \
110+
--moe-router-pre-softmax \
109111
--moe-router-score-function sigmoid \
110112
--moe-router-bias-update-rate 0.001 \
111113
--moe-aux-loss-coeff 0.001 \

examples/qwen2_vl/pretrain_qwen.py

+2 -2
Original file line number | Diff line number | Diff line change
@@ -541,9 +541,9 @@ def train_valid_test_dataloaders_provider(train_val_test_num_samples):
541541
EnergonDataloader(get_loader(valid_ds, worker_config=worker_config))
542542
for valid_ds in valid_ds1
543543
]
544-
test_dataloader = None # NOTE: no test
544+
test_dataloader = None
545545

546-
return EnergonDataloader(train_dataloader), valid_dataloader, None
546+
return EnergonDataloader(train_dataloader), valid_dataloader, EnergonDataloader(test_dataloader)
547547

548548

549549
class EnergonDataloader:

0 commit comments

Comments
 (0)