Skip to content

Commit bb9ee10

Browse files
committed
Enable libfabric backend plugin with CXI provider for Slingshot network
1 parent d0350d2 commit bb9ee10

File tree

3 files changed

+40
-4
lines changed

3 files changed

+40
-4
lines changed

src/utils/libfabric/libfabric_common.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,9 @@ getAvailableNetworkDevices() {
5050
hints->mode = FI_CONTEXT;
5151
hints->ep_attr->type = FI_EP_RDM;
5252

53+
// Add CXI-compatible memory registration mode
54+
hints->domain_attr->mr_mode = FI_MR_LOCAL | FI_MR_ENDPOINT | FI_MR_ALLOCATED | FI_MR_PROV_KEY;
55+
5356
int ret = fi_getinfo(FI_VERSION(1, 18), NULL, NULL, 0, hints, &info);
5457
if (ret) {
5558
NIXL_ERROR << "fi_getinfo failed " << fi_strerror(-ret);
@@ -85,7 +88,9 @@ getAvailableNetworkDevices() {
8588
}
8689
}
8790

88-
if (provider_device_map.find("efa") != provider_device_map.end()) {
91+
if (provider_device_map.find("cxi") != provider_device_map.end()) {
92+
return {"cxi", provider_device_map["cxi"]};
93+
} else if (provider_device_map.find("efa") != provider_device_map.end()) {
8994
return {"efa", provider_device_map["efa"]};
9095
} else if (provider_device_map.find("sockets") != provider_device_map.end()) {
9196
return {"sockets", {provider_device_map["sockets"][0]}};

src/utils/libfabric/libfabric_rail.cpp

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -421,6 +421,10 @@ nixlLibfabricRail::nixlLibfabricRail(const std::string &device,
421421
// TCP provider doesn't support FI_MR_PROV_KEY or FI_MR_VIRT_ADDR, use basic mode
422422
hints->domain_attr->mr_mode = FI_MR_LOCAL | FI_MR_ALLOCATED;
423423
hints->domain_attr->mr_key_size = 0; // Let provider decide
424+
} else if (provider == "cxi") {
425+
hints->caps |= FI_RMA_EVENT;
426+
hints->domain_attr->mr_mode =
427+
FI_MR_LOCAL | FI_MR_HMEM | FI_MR_VIRT_ADDR | FI_MR_ALLOCATED | FI_MR_PROV_KEY | FI_MR_ENDPOINT;
424428
} else {
425429
// EFA and other providers support advanced memory registration
426430
hints->domain_attr->mr_mode =
@@ -1347,8 +1351,15 @@ nixlLibfabricRail::registerMemory(void *buffer,
13471351
iov.iov_len = length;
13481352
mr_attr.mr_iov = &iov;
13491353
mr_attr.iov_count = 1;
1354+
int ret = 0;
1355+
1356+
if (provider_name == "cxi") {
1357+
ret = fi_mr_regattr(domain, &mr_attr, FI_RMA_EVENT, &mr);
1358+
}
1359+
else {
1360+
ret = fi_mr_regattr(domain, &mr_attr, 0, &mr);
1361+
}
13501362

1351-
int ret = fi_mr_regattr(domain, &mr_attr, 0, &mr);
13521363
if (ret) {
13531364
NIXL_ERROR << "fi_mr_reg failed on rail " << rail_id << ": " << fi_strerror(-ret)
13541365
<< " (buffer=" << buffer << ", length=" << length
@@ -1357,6 +1368,24 @@ nixlLibfabricRail::registerMemory(void *buffer,
13571368
}
13581369

13591370
*mr_out = mr;
1371+
1372+
if (info->domain_attr->mr_mode & FI_MR_ENDPOINT) {
1373+
ret = fi_mr_bind(mr, &endpoint->fid, 0);
1374+
if (ret) {
1375+
NIXL_ERROR << "fi_mr_bind failed on rail " << rail_id << ": " << fi_strerror(-ret);
1376+
fi_close(&mr->fid);
1377+
return NIXL_ERR_BACKEND;
1378+
}
1379+
1380+
ret = fi_mr_enable(mr);
1381+
if (ret) {
1382+
NIXL_ERROR << "fi_mr_enable failed on rail " << rail_id << ": " << fi_strerror(-ret);
1383+
fi_close(&mr->fid);
1384+
return NIXL_ERR_BACKEND;
1385+
}
1386+
}
1387+
1388+
13601389
*key_out = fi_mr_key(mr);
13611390

13621391
NIXL_TRACE << "Memory Registration SUCCESS: rail=" << rail_id << " provider=" << provider_name

src/utils/libfabric/libfabric_rail_manager.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -179,7 +179,8 @@ nixlLibfabricRailManager::prepareAndSubmitTransfer(
179179

180180
// For CXI, TCP, and sockets providers, use offset 0 instead of virtual address
181181
// cxi, tcp, and sockets take the offset-based path; TCP providers don't support FI_MR_VIRT_ADDR
182-
if (data_rails_[rail_id]->provider_name == "tcp" ||
182+
if (data_rails_[rail_id]->provider_name == "cxi" ||
183+
data_rails_[rail_id]->provider_name == "tcp" ||
183184
data_rails_[rail_id]->provider_name == "sockets") {
184185
req->remote_addr = 0; // Use offset 0 for TCP providers
185186
NIXL_DEBUG << "TCP provider detected: using offset 0 instead of virtual address "
@@ -258,7 +259,8 @@ nixlLibfabricRailManager::prepareAndSubmitTransfer(
258259

259260
// For CXI, TCP, and sockets providers, use offset instead of virtual address
260261
// cxi, tcp, and sockets take the offset-based path; TCP providers don't support FI_MR_VIRT_ADDR
261-
if (data_rails_[rail_id]->provider_name == "tcp" ||
262+
if (data_rails_[rail_id]->provider_name == "cxi" ||
263+
data_rails_[rail_id]->provider_name == "tcp" ||
262264
data_rails_[rail_id]->provider_name == "sockets") {
263265
req->remote_addr = chunk_offset; // Use chunk offset for TCP providers
264266
NIXL_DEBUG << "TCP provider detected: using chunk offset " << chunk_offset

0 commit comments

Comments
 (0)