-
Notifications
You must be signed in to change notification settings - Fork 103
Open
Description
拓扑生成
python3 ./astra-sim-alibabacloud/inputs/topo/gen_Topo_Template.py -topo Spectrum-X -g 256 -gt A100 -bw 100Gbps -nvbw 2400Gbps
工作负载生成
sh ./scripts/megatron_workload_with_aiob.sh -m 22 --world_size 256 --tensor_model_parallel_size 8 --pipeline_model_parallel 4 --frame Megatron --global_batch 1024 --micro_batch 1 --seq_length 4096 --swiglu --use_flash_attn --aiob_enable
日志错误
运行 SimAI 后,/etc/astra-sim/SimAI.log 日志中会出现如下错误:"No corresponding group information is generated, and there is an error in creating the ring channel."
/**
 * Build the ring channels of the group that `rank` belongs to for `type`.
 *
 * Every local ring returned by gen_local_ring(rank, type) is replicated on
 * each of the group's nNodes nodes (ranks on node i are shifted by
 * `i * delta`, where delta = Ranks[nlocalRanks] - Ranks[0]) and the copies
 * are chained into one closed ring. For each rank r of ring `id`,
 * ringchannels[id][r] holds {prev, next, node_recv, node_send}, where
 * node_recv / node_send are the first / last rank of the local ring on r's
 * node.
 *
 * On success the result is also stored in Allringchannels under the group's
 * index.
 *
 * @param rank  Rank used, together with `type`, as the GroupIndex key.
 * @param type  Group type used, together with `rank`, as the GroupIndex key.
 * @return The generated ring channels. Empty when no group is registered for
 *         (rank, type) or when the group reports an unusable node/rank count;
 *         in both cases an ERROR is logged and no member state is modified.
 */
RingChannels MockNcclGroup::genringchannels(int rank, MockNccl::GroupType type) {
  std::map<int,std::map<int,std::vector<int>>> ringchannels;
  MockNcclLog* NcclLog = MockNcclLog::getInstance();

  // Look the group up without operator[]: operator[] would default-insert a
  // bogus entry for an unknown (rank, type) and silently build rings from an
  // unrelated (or empty) group.
  const auto group_it = GroupIndex.find(std::make_pair(rank, type));
  if (group_it == GroupIndex.end()) {
    NcclLog->writeLog(NcclLogLevel::ERROR,"No corresponding group information is generated, and there is an error in creating the ring channel.");
    return ringchannels;
  }
  const int gp_idx = group_it->second;
  GroupInfo gp_info = AllGroups[gp_idx];

  const int nNodes = gp_info.nNodes;
  // nNodes == 0 would divide by zero below; nRanks < nNodes would give
  // nlocalRanks == 0 and index ring[-1].
  if (nNodes <= 0 || gp_info.nRanks < nNodes) {
    NcclLog->writeLog(NcclLogLevel::ERROR,"Invalid node or rank count in group information, and the ring channel cannot be created.");
    return ringchannels;
  }
  const int nlocalRanks = gp_info.nRanks / nNodes;

  std::map<int,std::vector<int>> localrings = gen_local_ring(rank, type);

  // Rank offset between the same local position on two consecutive nodes.
  const int delta = nNodes > 1 ? gp_info.Ranks[nlocalRanks] - gp_info.Ranks[0] : 0;

  for (const auto& ring_entry : localrings) {
    const std::vector<int>& ring = ring_entry.second;
    std::map<int,std::vector<int>>& channel = ringchannels[ring_entry.first];

    int prev = -1;  // patched for the first rank once the ring is closed
    for (int i = 0; i < nNodes; ++i) {
      const int node_recv = ring[0] + i * delta;
      const int node_send = ring[nlocalRanks - 1] + i * delta;
      for (int j = 0; j < nlocalRanks; ++j) {
        const int current = ring[j] + i * delta;
        // The last local rank hands over to the first rank of the next node.
        const int next = (j == nlocalRanks - 1)
                             ? ring[0] + (i + 1) * delta
                             : ring[j + 1] + i * delta;
        channel[current] = {prev, next, node_recv, node_send};
        prev = current;
      }
    }

    // Close the ring: the first rank's prev is the last rank, and the last
    // rank's next (which pointed one node past the end) wraps to the first.
    const int end_rank = ring[nlocalRanks - 1] + (nNodes - 1) * delta;
    channel[ring[0]][0] = end_rank;
    channel[end_rank][1] = ring[0];
  }

  Allringchannels[gp_idx] = ringchannels;
  return ringchannels;
}
这个错误是什么原因造成的?更换工作负载和拓扑后,该错误有时会出现,有时不出现。
Metadata
Metadata
Assignees
Labels
No labels