Skip to content

Commit 382ff07

Browse files
committed
Update
1 parent bbbeb62 commit 382ff07

File tree

3 files changed

+27
-38
lines changed

3 files changed

+27
-38
lines changed

Diff for: llm/auto_parallel/llama/run_llama2_13b_xpu.sh

+6-4
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ unset PADDLE_TRAINERS_NUM
4343
#export BKCL_RDMA_NICS=xgbe1,xgbe1,xgbe2,xgbe2,xgbe3,xgbe3,xgbe4,xgbe4
4444
#export BKCL_SOCKET_IFNAME=xgbe0
4545
#export BKCL_FORCE_L3_RDMA=0
46+
export LD_LIBRARY_PATH=/usr/local/lib:/usr/lib64
4647
echo "bkcl version:"
4748
strings ${bkcl_location}/libbkcl.so | grep COM
4849

@@ -52,8 +53,8 @@ export CUDA_DEVICE_MAX_CONNECTIONS=8
5253
export PYTHONPATH=../../../:$PYTHONPATH
5354

5455
# for debug
55-
#export GLOG_v=6
56-
#export FLAGS_call_stack_level=2
56+
#export GLOG_v=10
57+
export FLAGS_call_stack_level=2
5758

5859
rm -rf output/$task_name_or_path
5960
PYTHONPATH=../:$PYTHONPATH \
@@ -92,7 +93,7 @@ python -u -m paddle.distributed.launch \
9293
--dataloader_num_workers 4 \
9394
--pipeline_parallel_degree 1 \
9495
--tensor_parallel_degree 1 \
95-
--gradient_accumulation_steps 32 \
96+
--gradient_accumulation_steps 1 \
9697
--eval_steps 1000 \
9798
--report_to "visualdl" \
9899
--disable_tqdm true \
@@ -101,4 +102,5 @@ python -u -m paddle.distributed.launch \
101102
--do_train \
102103
--seed 1026 \
103104
--device "xpu" \
104-
--enable_auto_parallel 1
105+
--enable_auto_parallel 1 \
106+
--to_static 1

Diff for: paddlenlp/trainer/auto_trainer.py

-5
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@
2626
from tqdm.auto import tqdm
2727

2828
from paddlenlp.trainer import Trainer
29-
from paddlenlp.utils.tools import get_env_device
3029

3130
from ..utils.batch_sampler import DistributedBatchSampler as NlpDistributedBatchSampler
3231
from ..utils.log import logger
@@ -523,10 +522,6 @@ def _inner_training_loop(
523522

524523
logger.info("\nTraining completed. \n")
525524

526-
# Hack for XPU that doesn't support Allgather yet. See LlamaPretrainingCriterion3DAuto in modeling_auto.py for details.
527-
if get_env_device() == "xpu":
528-
tr_loss = tr_loss.mean()
529-
530525
self._total_loss_scalar += self._get_item_from_loss(tr_loss)
531526
train_loss = self._total_loss_scalar / self.state.global_step
532527

Diff for: paddlenlp/transformers/llama/modeling_auto.py

+21-29
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,10 @@ def scaled_dot_product_attention(
195195
return (attn_output, attn_weights) if output_attentions else attn_output
196196

197197

198+
colwise_placements = [dist.Replicate(), dist.Shard(1)]
199+
rowise_placement = [dist.Replicate(), dist.Shard(0)]
200+
201+
198202
class LlamaRMSNormAuto(nn.Layer):
199203
def __init__(self, config, ipp):
200204
super().__init__()
@@ -237,16 +241,6 @@ def __init__(self, config, ipp: Optional[int] = None):
237241
self.fuse_attention_ffn = config.fuse_attention_ffn
238242
self.ipp = ipp
239243
self.config = config
240-
colwise_placements = (
241-
[dist.Replicate(), dist.Shard(1)]
242-
if self.config.tensor_parallel_degree > 1
243-
else [dist.Replicate(), dist.Replicate()]
244-
)
245-
rowise_placement = (
246-
[dist.Replicate(), dist.Shard(0)]
247-
if self.config.tensor_parallel_degree > 1
248-
else [dist.Replicate(), dist.Replicate()]
249-
)
250244

251245
if config.fuse_attention_ffn and not enable_fuse_ffn_qkv_pass():
252246
self.gate_up_fused_proj = nn.Linear(self.hidden_size, self.intermediate_size * 2, bias_attr=False)
@@ -316,17 +310,6 @@ def __init__(self, config: LlamaConfig, layerwise_recompute: bool = False, ipp:
316310
self.recompute_granularity = config.recompute_granularity
317311
self.ipp = ipp
318312

319-
colwise_placements = (
320-
[dist.Replicate(), dist.Shard(1)]
321-
if self.config.tensor_parallel_degree > 1
322-
else [dist.Replicate(), dist.Replicate()]
323-
)
324-
rowise_placement = (
325-
[dist.Replicate(), dist.Shard(0)]
326-
if self.config.tensor_parallel_degree > 1
327-
else [dist.Replicate(), dist.Replicate()]
328-
)
329-
330313
self.use_fused_rope = config.use_fused_rope
331314
if self.use_fused_rope and get_env_device() not in ["npu", "mlu", "xpu", "gcu", "intel_hpu"]:
332315
if "gpu" not in paddle.device.get_device() or fused_rotary_position_embedding is None:
@@ -1201,10 +1184,23 @@ def forward(self, prediction_scores, masked_lm_labels):
12011184
masked_lm_labels.unsqueeze(2),
12021185
)
12031186

1204-
# Hack for XPU that doesn't support Allgather yet.
1187+
# XPU does not support allgather on a mask with bool dtype, so we use LocalLayer here.
12051188
if get_env_device() == "xpu":
1206-
# masked_lm_loss = paddle.masked_select(masked_lm_loss, masked_lm_loss > 0).astype("float32")
1207-
loss = paddle.mean(masked_lm_loss, axis=-1)
1189+
1190+
class LocalLossLayer(paddle.distributed.LocalLayer):
1191+
def __init__(self, out_dist_attrs):
1192+
super().__init__(out_dist_attrs)
1193+
1194+
def forward(self, x, mask):
1195+
masked_lm_loss = paddle.masked_select(x, mask).astype("float32")
1196+
loss = paddle.mean(masked_lm_loss)
1197+
return loss
1198+
1199+
out_dist_attrs = [
1200+
(masked_lm_loss.process_mesh, [dist.Partial(dist.ReduceType.kRedSum), dist.Replicate()]),
1201+
]
1202+
loss_func = LocalLossLayer(out_dist_attrs)
1203+
loss = loss_func(masked_lm_loss, masked_lm_loss > 0)
12081204
else:
12091205
masked_lm_loss = paddle.masked_select(masked_lm_loss, masked_lm_loss > 0).astype("float32")
12101206
loss = paddle.mean(masked_lm_loss)
@@ -1216,11 +1212,7 @@ class LlamaLMHeadAuto(nn.Layer):
12161212
def __init__(self, config: LlamaConfig):
12171213
super(LlamaLMHeadAuto, self).__init__()
12181214
self.config = config
1219-
colwise_placements = (
1220-
[dist.Replicate(), dist.Shard(1)]
1221-
if self.config.tensor_parallel_degree > 1
1222-
else [dist.Replicate(), dist.Replicate()]
1223-
)
1215+
12241216
vocab_size = config.vocab_size
12251217
self.weight = self.create_parameter(
12261218
shape=[config.hidden_size, vocab_size],

0 commit comments

Comments
 (0)