Skip to content

basic CUDA <> CPU or CUDA <> CUDA rdma Support #372

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions cuda-sys/src/wrapper.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#pragma once

#include <cuda.h>
#include <cuda_runtime.h>
1 change: 0 additions & 1 deletion monarch_rdma/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ license = "BSD-3-Clause"
anyhow = "1.0.98"
async-trait = "0.1.86"
hyperactor = { version = "0.0.0", path = "../hyperactor" }
ibverbs = "0.7.1"
rand = { version = "0.8", features = ["small_rng"] }
serde = { version = "1.0.185", features = ["derive", "rc"] }
tracing = { version = "0.1.41", features = ["attributes", "valuable"] }
Expand Down
164 changes: 122 additions & 42 deletions monarch_rdma/src/ibverbs_primitives.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,88 @@ use std::ffi::CStr;
use std::fmt;

use hyperactor::Named;
use ibverbs::Gid;
use serde::Deserialize;
use serde::Serialize;

/// A global identifier (GID) for ibverbs.
///
/// This struct acts as a Rust wrapper for `rdmacore_sys::ibv_gid`. We use it
/// instead of `rdmacore_sys::ibv_gid` because `ibv_gid` is actually an
/// untagged union.
///
/// ```c
/// union ibv_gid {
///     uint8_t raw[16];
///     struct {
///         __be64 subnet_prefix;
///         __be64 interface_id;
///     } global;
/// };
/// ```
///
/// It appears that `global` exists for convenience, but can be safely ignored.
/// For continuity, the methods `subnet_prefix` and `interface_id` are provided.
/// These methods read the array as big endian, regardless of native CPU
/// endianness.
#[derive(
Default,
Copy,
Clone,
Debug,
Eq,
PartialEq,
Hash,
serde::Serialize,
serde::Deserialize
)]
// `repr(transparent)` gives `Gid` exactly the layout of its single
// `[u8; 16]` field.
#[repr(transparent)]
pub struct Gid {
/// The 16 raw GID bytes, copied from / into the `raw` member of
/// `rdmacore_sys::ibv_gid`.
raw: [u8; 16],
}

impl Gid {
/// Expose the subnet_prefix component of the `Gid` as a u64. This is
/// equivalent to accessing the `global.subnet_prefix` component of the
/// `ffi::ibv_gid` union.
#[allow(dead_code)]
fn subnet_prefix(&self) -> u64 {
u64::from_be_bytes(self.raw[..8].try_into().unwrap())
}

/// Expose the interface_id component of the `Gid` as a u64. This is
/// equivalent to accessing the `global.interface_id` component of the
/// `ffi::ibv_gid` union.
#[allow(dead_code)]
fn interface_id(&self) -> u64 {
u64::from_be_bytes(self.raw[8..].try_into().unwrap())
}
}
impl From<rdmacore_sys::ibv_gid> for Gid {
    /// Copies the 16 raw bytes out of the FFI union into a [`Gid`].
    fn from(gid: rdmacore_sys::ibv_gid) -> Self {
        // SAFETY: `raw` is a plain byte array member of the union, so any
        // bit pattern stored in the union is a valid value to read from it.
        let raw = unsafe { gid.raw };
        Self { raw }
    }
}

impl From<Gid> for rdmacore_sys::ibv_gid {
    /// Converts a [`Gid`] into the FFI union by initializing its `raw`
    /// member with the wrapped bytes.
    fn from(gid: Gid) -> Self {
        // Build the union by value instead of reinterpreting the byte array
        // through a pointer cast: `Gid` only has the alignment of
        // `[u8; 16]`, while `ibv_gid` carries 64-bit members and may require
        // stricter alignment, so dereferencing a cast pointer could be a
        // misaligned access. Initializing a union field is safe and needs no
        // alignment assumptions.
        rdmacore_sys::ibv_gid { raw: gid.raw }
    }
}

/// Borrows the `Gid` as the FFI union by reinterpreting the address of its
/// `raw` byte array.
impl AsRef<rdmacore_sys::ibv_gid> for Gid {
fn as_ref(&self) -> &rdmacore_sys::ibv_gid {
// SAFETY: `raw` holds 16 initialized bytes, which is the size of the
// union's `raw` member, and every bit pattern is valid for it.
// NOTE(review): the cast does not establish alignment. `Gid` is
// `repr(transparent)` over `[u8; 16]` (alignment 1), whereas `ibv_gid`
// contains 64-bit members and presumably requires 8-byte alignment --
// confirm, and consider raising the alignment of `Gid` if so.
unsafe { &*self.raw.as_ptr().cast::<rdmacore_sys::ibv_gid>() }
}
}

/// Mutably borrows the `Gid` as the FFI union by reinterpreting the address
/// of its `raw` byte array, so writes through the union land in `raw`.
impl AsMut<rdmacore_sys::ibv_gid> for Gid {
fn as_mut(&mut self) -> &mut rdmacore_sys::ibv_gid {
// SAFETY: `raw` holds 16 initialized bytes, which is the size of the
// union's `raw` member, and every bit pattern is valid for it; the
// exclusive borrow of `self` guarantees uniqueness of the returned
// reference.
// NOTE(review): the cast does not establish alignment. `Gid` is
// `repr(transparent)` over `[u8; 16]` (alignment 1), whereas `ibv_gid`
// contains 64-bit members and presumably requires 8-byte alignment --
// confirm, and consider raising the alignment of `Gid` if so.
unsafe { &mut *self.raw.as_mut_ptr().cast::<rdmacore_sys::ibv_gid>() }
}
}

/// Represents ibverbs specific configurations.
///
/// This struct holds various parameters required to establish and manage an RDMA connection.
Expand Down Expand Up @@ -86,7 +164,7 @@ impl Default for IbverbsConfig {
max_recv_wr: 1,
max_send_sge: 1,
max_recv_sge: 1,
path_mtu: ffi::IBV_MTU_1024,
path_mtu: rdmacore_sys::IBV_MTU_1024,
retry_cnt: 7,
rnr_retry: 7,
qp_timeout: 14, // 4.096 μs * 2^14 = ~67 ms
Expand Down Expand Up @@ -144,7 +222,7 @@ impl std::fmt::Display for IbverbsConfig {
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RdmaDevice {
/// `name` - The name of the RDMA device (e.g., "mlx5_0").
name: String,
pub name: String,
/// `vendor_id` - The vendor ID of the device.
vendor_id: u32,
/// `vendor_part_id` - The vendor part ID of the device.
Expand Down Expand Up @@ -330,10 +408,10 @@ impl fmt::Display for RdmaPort {
/// # Returns
///
/// A string representation of the port state.
pub fn get_port_state_str(state: ffi::ibv_port_state::Type) -> String {
pub fn get_port_state_str(state: rdmacore_sys::ibv_port_state::Type) -> String {
// SAFETY: We are calling a C function that returns a C string.
unsafe {
let c_str = ffi::ibv_port_state_str(state);
let c_str = rdmacore_sys::ibv_port_state_str(state);
if c_str.is_null() {
return "Unknown".to_string();
}
Expand Down Expand Up @@ -428,7 +506,7 @@ pub fn get_all_devices() -> Vec<RdmaDevice> {
// SAFETY: We are calling several C functions from libibverbs.
unsafe {
let mut num_devices = 0;
let device_list = ffi::ibv_get_device_list(&mut num_devices);
let device_list = rdmacore_sys::ibv_get_device_list(&mut num_devices);
if device_list.is_null() || num_devices == 0 {
return devices;
}
Expand All @@ -439,18 +517,18 @@ pub fn get_all_devices() -> Vec<RdmaDevice> {
continue;
}

let context = ffi::ibv_open_device(device);
let context = rdmacore_sys::ibv_open_device(device);
if context.is_null() {
continue;
}

let device_name = CStr::from_ptr(ffi::ibv_get_device_name(device))
let device_name = CStr::from_ptr(rdmacore_sys::ibv_get_device_name(device))
.to_string_lossy()
.into_owned();

let mut device_attr = ffi::ibv_device_attr::default();
if ffi::ibv_query_device(context, &mut device_attr) != 0 {
ffi::ibv_close_device(context);
let mut device_attr = rdmacore_sys::ibv_device_attr::default();
if rdmacore_sys::ibv_query_device(context, &mut device_attr) != 0 {
rdmacore_sys::ibv_close_device(context);
continue;
}

Expand All @@ -475,11 +553,11 @@ pub fn get_all_devices() -> Vec<RdmaDevice> {
};

for port_num in 1..=device_attr.phys_port_cnt {
let mut port_attr = ffi::ibv_port_attr::default();
if ffi::ibv_query_port(
let mut port_attr = rdmacore_sys::ibv_port_attr::default();
if rdmacore_sys::ibv_query_port(
context,
port_num,
&mut port_attr as *mut ffi::ibv_port_attr as *mut _,
&mut port_attr as *mut rdmacore_sys::ibv_port_attr as *mut _,
) != 0
{
continue;
Expand All @@ -489,8 +567,8 @@ pub fn get_all_devices() -> Vec<RdmaDevice> {

let link_layer = get_link_layer_str(port_attr.link_layer);

let mut gid = ffi::ibv_gid::default();
let gid_str = if ffi::ibv_query_gid(context, port_num, 0, &mut gid) == 0 {
let mut gid = rdmacore_sys::ibv_gid::default();
let gid_str = if rdmacore_sys::ibv_query_gid(context, port_num, 0, &mut gid) == 0 {
format_gid(&gid.raw)
} else {
"N/A".to_string()
Expand All @@ -513,10 +591,10 @@ pub fn get_all_devices() -> Vec<RdmaDevice> {
}

devices.push(rdma_device);
ffi::ibv_close_device(context);
rdmacore_sys::ibv_close_device(context);
}

ffi::ibv_free_device_list(device_list);
rdmacore_sys::ibv_free_device_list(device_list);
}

devices
Expand All @@ -535,9 +613,9 @@ pub fn ibverbs_supported() -> bool {
// SAFETY: We are calling a C function from libibverbs.
unsafe {
let mut num_devices = 0;
let device_list = ffi::ibv_get_device_list(&mut num_devices);
let device_list = rdmacore_sys::ibv_get_device_list(&mut num_devices);
if !device_list.is_null() {
ffi::ibv_free_device_list(device_list);
rdmacore_sys::ibv_free_device_list(device_list);
return true;
}
false
Expand All @@ -557,6 +635,7 @@ pub fn ibverbs_supported() -> bool {
/// RDMA operations are in progress.
#[derive(Debug, PartialEq, Eq, std::hash::Hash, Serialize, Deserialize, Clone)]
pub struct RdmaMemoryRegionView {
pub id: u32,
pub addr: usize,
pub size: usize,
pub lkey: u32,
Expand All @@ -582,8 +661,9 @@ unsafe impl Sync for RdmaMemoryRegionView {}

impl RdmaMemoryRegionView {
/// Creates a new `RdmaMemoryRegionView` from the given id, address, size, and local/remote keys.
pub fn new(addr: usize, size: usize, lkey: u32, rkey: u32) -> Self {
pub fn new(id: u32, addr: usize, size: usize, lkey: u32, rkey: u32) -> Self {
Self {
id,
addr,
size,
lkey,
Expand Down Expand Up @@ -612,20 +692,20 @@ pub enum RdmaOperation {
Read,
}

impl From<RdmaOperation> for ffi::ibv_wr_opcode::Type {
impl From<RdmaOperation> for rdmacore_sys::ibv_wr_opcode::Type {
fn from(op: RdmaOperation) -> Self {
match op {
RdmaOperation::Write => ffi::ibv_wr_opcode::IBV_WR_RDMA_WRITE,
RdmaOperation::Read => ffi::ibv_wr_opcode::IBV_WR_RDMA_READ,
RdmaOperation::Write => rdmacore_sys::ibv_wr_opcode::IBV_WR_RDMA_WRITE,
RdmaOperation::Read => rdmacore_sys::ibv_wr_opcode::IBV_WR_RDMA_READ,
}
}
}

impl From<ffi::ibv_wc_opcode::Type> for RdmaOperation {
fn from(op: ffi::ibv_wc_opcode::Type) -> Self {
impl From<rdmacore_sys::ibv_wc_opcode::Type> for RdmaOperation {
fn from(op: rdmacore_sys::ibv_wc_opcode::Type) -> Self {
match op {
ffi::ibv_wc_opcode::IBV_WC_RDMA_WRITE => RdmaOperation::Write,
ffi::ibv_wc_opcode::IBV_WC_RDMA_READ => RdmaOperation::Read,
rdmacore_sys::ibv_wc_opcode::IBV_WC_RDMA_WRITE => RdmaOperation::Write,
rdmacore_sys::ibv_wc_opcode::IBV_WC_RDMA_READ => RdmaOperation::Read,
_ => panic!("Unsupported operation type"),
}
}
Expand Down Expand Up @@ -660,7 +740,7 @@ impl std::fmt::Debug for RdmaQpInfo {

/// Wrapper around ibv_wc (ibverbs work completion).
///
/// This exposes only the public fields of ffi::ibv_wc, allowing us to more easily
/// This exposes only the public fields of rdmacore_sys::ibv_wc, allowing us to more easily
/// interact with it from Rust. Work completions are used to track the status of
/// RDMA operations and are generated when an operation completes.
#[derive(Debug, Named, Clone, serde::Serialize, serde::Deserialize)]
Expand All @@ -672,9 +752,9 @@ pub struct IbvWc {
/// `valid` - Whether the work completion is valid
valid: bool,
/// `error` - Error information if the operation failed
error: Option<(ffi::ibv_wc_status::Type, u32)>,
error: Option<(rdmacore_sys::ibv_wc_status::Type, u32)>,
/// `opcode` - Type of operation that completed (read, write, etc.)
opcode: ffi::ibv_wc_opcode::Type,
opcode: rdmacore_sys::ibv_wc_opcode::Type,
/// `bytes` - Immediate data (if any)
bytes: Option<u32>,
/// `qp_num` - Queue Pair Number
Expand All @@ -691,8 +771,8 @@ pub struct IbvWc {
dlid_path_bits: u8,
}

impl From<ffi::ibv_wc> for IbvWc {
fn from(wc: ffi::ibv_wc) -> Self {
impl From<rdmacore_sys::ibv_wc> for IbvWc {
fn from(wc: rdmacore_sys::ibv_wc) -> Self {
IbvWc {
wr_id: wc.wr_id(),
len: wc.len(),
Expand Down Expand Up @@ -804,21 +884,21 @@ mod tests {
#[test]
fn test_rdma_operation_conversion() {
assert_eq!(
ffi::ibv_wr_opcode::IBV_WR_RDMA_WRITE,
ffi::ibv_wr_opcode::Type::from(RdmaOperation::Write)
rdmacore_sys::ibv_wr_opcode::IBV_WR_RDMA_WRITE,
rdmacore_sys::ibv_wr_opcode::Type::from(RdmaOperation::Write)
);
assert_eq!(
ffi::ibv_wr_opcode::IBV_WR_RDMA_READ,
ffi::ibv_wr_opcode::Type::from(RdmaOperation::Read)
rdmacore_sys::ibv_wr_opcode::IBV_WR_RDMA_READ,
rdmacore_sys::ibv_wr_opcode::Type::from(RdmaOperation::Read)
);

assert_eq!(
RdmaOperation::Write,
RdmaOperation::from(ffi::ibv_wc_opcode::IBV_WC_RDMA_WRITE)
RdmaOperation::from(rdmacore_sys::ibv_wc_opcode::IBV_WC_RDMA_WRITE)
);
assert_eq!(
RdmaOperation::Read,
RdmaOperation::from(ffi::ibv_wc_opcode::IBV_WC_RDMA_READ)
RdmaOperation::from(rdmacore_sys::ibv_wc_opcode::IBV_WC_RDMA_READ)
);
}

Expand All @@ -839,18 +919,18 @@ mod tests {

#[test]
fn test_ibv_wc() {
let mut wc = ffi::ibv_wc::default();
let mut wc = rdmacore_sys::ibv_wc::default();

// SAFETY: modifies private fields through pointer manipulation
unsafe {
// Cast to pointer and modify the fields directly
let wc_ptr = &mut wc as *mut ffi::ibv_wc as *mut u8;
let wc_ptr = &mut wc as *mut rdmacore_sys::ibv_wc as *mut u8;

// Set wr_id (at offset 0, u64)
*(wc_ptr as *mut u64) = 42;

// Set status to SUCCESS (at offset 8, u32)
*(wc_ptr.add(8) as *mut i32) = ffi::ibv_wc_status::IBV_WC_SUCCESS as i32;
*(wc_ptr.add(8) as *mut i32) = rdmacore_sys::ibv_wc_status::IBV_WC_SUCCESS as i32;
}
let ibv_wc = IbvWc::from(wc);
assert_eq!(ibv_wc.wr_id(), 42);
Expand Down
5 changes: 5 additions & 0 deletions monarch_rdma/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,16 @@
* LICENSE file in the root directory of this source tree.
*/

// Silence warnings about unsafe blocks
#[allow(clippy::undocumented_unsafe_blocks)]
mod ibverbs_primitives;
mod rdma_components;
mod rdma_manager_actor;
mod test_utils;

#[macro_use]
mod macros;

pub use ibverbs_primitives::*;
pub use rdma_components::*;
pub use rdma_manager_actor::*;
Loading
Loading