Skip to content

Commit f517516

Browse files
committed
Feat: more bytemuck-based features for devicebuffer/deviceslice
1 parent 3dd86d9 commit f517516

File tree

10 files changed

+449
-57
lines changed

10 files changed

+449
-57
lines changed

crates/cust/CHANGELOG.md

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -52,12 +52,22 @@ Instead you can now use `DeviceSlice::index` which behaves the same.
5252
- Added `cust::memory::LockedBox`, same as `LockedBuffer` except for single elements.
5353
- Added `cust::memory::cuda_malloc_async`.
5454
- Added `cust::memory::cuda_free_async`.
55-
- Added `impl AsyncCopyDestination<LockedBox<T>> for DeviceBox<T>` for async HtoD memcpy.
55+
- Added `impl AsyncCopyDestination<LockedBox<T>> for DeviceBox<T>` for async HtoD/DtoH memcpy.
56+
- Added `DeviceBox::new_async`.
57+
- Added `DeviceBox::drop_async`.
58+
- Added `DeviceBox::zeroed_async`.
59+
- Added `DeviceBox::uninitialized_async`.
60+
- Added `DeviceBuffer::uninitialized_async`.
61+
- Added `DeviceBuffer::drop_async`.
62+
- Added `DeviceBuffer::zeroed`.
63+
- Added `DeviceBuffer::zeroed_async`.
64+
- Added `DeviceBuffer::cast`.
65+
- Added `DeviceBuffer::try_cast`.
66+
- Added `DeviceSlice::set_8` and `DeviceSlice::set_8_async`.
67+
- Added `DeviceSlice::set_16` and `DeviceSlice::set_16_async`.
68+
- Added `DeviceSlice::set_32` and `DeviceSlice::set_32_async`.
69+
- Added `DeviceSlice::set_zero` and `DeviceSlice::set_zero_async`.
5670
- Added the `bytemuck` feature which is enabled by default.
57-
- Added `zeroed_async` to `DeviceBox`.
58-
- Added `drop_async` to `DeviceBox`.
59-
- Added `new_async` to `DeviceBox`.
60-
- Added `DevicePointer::as_ptr` and `DevicePointer::as_mut_ptr` for returning `*const T` or `*mut T`.
6171
- Added mint integration behind `impl_mint`.
6272
- Added half integration behind `impl_half`.
6373
- Added glam integration behind `impl_glam`.
@@ -69,8 +79,8 @@ Instead you can now use `DeviceSlice::index` which behaves the same.
6979
- Added `mem_get_info` to query the amount of free and total memory.
7080
- Added `DevicePointer::as_ptr` and `DevicePointer::as_mut_ptr` for `*const T` and `*mut T`.
7181
- Added `DevicePointer::from_raw` for `CUdeviceptr -> DevicePointer<T>` with a safe function.
82+
- Added `DevicePointer::cast`.
7283
- Added dependency on `cust_core` for `DeviceCopy`.
73-
- Added dependency on `goblin` for verifying cubins and fatbins (impossible to implement safe module loading without it).
7484
- Added `ModuleJitOption`, `JitFallback`, `JitTarget`, and `OptLevel` for specifying options when loading a module. Note that
7585
`ModuleJitOption::MaxRegisters` does not seem to work currently, but NVIDIA is looking into it.
7686
You can achieve the same goal by compiling the ptx to cubin using nvcc then loading that: `nvcc --cubin foo.ptx -maxrregcount=REGS`

crates/cust/Cargo.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@ mint = { version = "^0.5", optional = true }
2222
num-complex = { version = "0.4", optional = true }
2323
vek = { version = "0.15.1", optional = true, default-features = false }
2424
bytemuck = { version = "1.7.3", optional = true }
25-
goblin = { version = "0.4.3", default-features = false, features = ["elf32", "elf64", "std", "endian_fd"] }
2625

2726
[features]
2827
default= ["bytemuck"]

crates/cust/src/memory/device/device_box.rs

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ impl<T: DeviceCopy> DeviceBox<T> {
8484
/// # Ok(())
8585
/// # }
8686
pub unsafe fn new_async(val: &T, stream: &Stream) -> CudaResult<Self> {
87-
let mut dev_box = DeviceBox::uninitialized()?;
87+
let mut dev_box = DeviceBox::uninitialized_async(stream)?;
8888
dev_box.async_copy_from(val, stream)?;
8989
Ok(dev_box)
9090
}
@@ -120,6 +120,9 @@ impl<T: DeviceCopy> DeviceBox<T> {
120120
/// # Ok(())
121121
/// # }
122122
pub fn drop_async(self, stream: &Stream) -> CudaResult<()> {
123+
if self.ptr.is_null() {
124+
return Ok(());
125+
}
123126
// make sure we dont run the normal destructor, otherwise a double drop will happen
124127
let me = ManuallyDrop::new(self);
125128
// SAFETY: we consume the box so its not possible to use the box past its drop point unless
@@ -165,7 +168,7 @@ impl<T: DeviceCopy + bytemuck::Zeroable> DeviceBox<T> {
165168
}
166169
}
167170

168-
/// Allocate device memory asynchronously and asynchronously fills it with zeroes (`0u8`).
171+
/// Allocates device memory asynchronously and asynchronously fills it with zeroes (`0u8`).
169172
///
170173
/// This doesn't actually allocate if `T` is zero-sized.
171174
///

crates/cust/src/memory/device/device_buffer.rs

Lines changed: 198 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,23 @@
11
use crate::error::{CudaResult, DropResult, ToResult};
22
use crate::memory::device::{AsyncCopyDestination, CopyDestination, DeviceSlice};
33
use crate::memory::malloc::{cuda_free, cuda_malloc};
4-
use crate::memory::DeviceCopy;
5-
use crate::memory::DevicePointer;
4+
use crate::memory::{cuda_free_async, DevicePointer};
5+
use crate::memory::{cuda_malloc_async, DeviceCopy};
66
use crate::stream::Stream;
77
use crate::sys as cuda;
8-
use std::mem;
8+
#[cfg(feature = "bytemuck")]
#[cfg_attr(docsrs, doc(cfg(feature = "bytemuck")))]
9+
pub use bytemuck;
10+
#[cfg(feature = "bytemuck")]
11+
use bytemuck::{Pod, PodCastError, Zeroable};
12+
use std::mem::{self, align_of, size_of, transmute, ManuallyDrop};
913
use std::ops::{Deref, DerefMut};
1014

1115
/// Fixed-size device-side buffer. Provides basic access to device memory.
1216
#[derive(Debug)]
1317
#[repr(C)]
1418
pub struct DeviceBuffer<T: DeviceCopy> {
1519
buf: DevicePointer<T>,
16-
capacity: usize,
20+
len: usize,
1721
}
1822

1923
unsafe impl<T: Send + DeviceCopy> Send for DeviceBuffer<T> {}
@@ -42,57 +46,84 @@ impl<T: DeviceCopy> DeviceBuffer<T> {
4246
/// buffer.copy_from(&[0u64, 1, 2, 3, 4]).unwrap();
4347
/// ```
4448
pub unsafe fn uninitialized(size: usize) -> CudaResult<Self> {
45-
let ptr = if size > 0 && mem::size_of::<T>() > 0 {
49+
let ptr = if size > 0 && size_of::<T>() > 0 {
4650
cuda_malloc(size)?
4751
} else {
4852
// FIXME (AL): Do we /really/ want to allow creating an invalid buffer?
4953
DevicePointer::null()
5054
};
5155
Ok(DeviceBuffer {
5256
buf: ptr,
53-
capacity: size,
57+
len: size,
5458
})
5559
}
5660

57-
/// Allocate a new device buffer large enough to hold `size` `T`'s and fill the contents with
58-
/// zeroes (`0u8`).
61+
/// Allocates device memory asynchronously on a stream, without initializing it.
5962
///
60-
/// # Errors
61-
///
62-
/// If the allocation fails, returns the error from CUDA. If `size` is large enough that
63-
/// `size * mem::sizeof::<T>()` overflows usize, then returns InvalidMemoryAllocation.
63+
/// This doesn't actually allocate if `T` is zero-sized.
6464
///
6565
/// # Safety
6666
///
67-
/// The backing memory is zeroed, which may not be a valid bit-pattern for type `T`. The caller
68-
/// must ensure either that all-zeroes is a valid bit-pattern for type `T` or that the backing
69-
/// memory is set to a valid value before it is read.
67+
/// The allocated memory retains all of the unsafety of [`DeviceBuffer::uninitialized`], with
68+
/// the additional consideration that the memory cannot be used until it is actually allocated
69+
/// on the stream. This means proper stream ordering semantics must be followed, such as
70+
/// only enqueueing kernel launches that use the memory AFTER the allocation call.
7071
///
71-
/// # Examples
72-
///
73-
/// ```
74-
/// # let _context = cust::quick_init().unwrap();
75-
/// use cust::memory::*;
76-
/// let buffer = unsafe { DeviceBuffer::zeroed(5).unwrap() };
77-
/// let mut host_values = [1u64, 2, 3, 4, 5];
78-
/// buffer.copy_to(&mut host_values).unwrap();
79-
/// assert_eq!([0u64, 0, 0, 0, 0], host_values);
80-
/// ```
81-
pub unsafe fn zeroed(size: usize) -> CudaResult<Self> {
82-
let ptr = if size > 0 && mem::size_of::<T>() > 0 {
83-
let ptr = cuda_malloc(size)?;
84-
cuda::cuMemsetD8_v2(ptr.as_raw(), 0, size * mem::size_of::<T>()).to_result()?;
85-
ptr
72+
/// You can synchronize the stream to ensure the memory allocation operation is complete.
73+
pub unsafe fn uninitialized_async(size: usize, stream: &Stream) -> CudaResult<Self> {
74+
let ptr = if size > 0 && size_of::<T>() > 0 {
75+
cuda_malloc_async(stream, size)?
8676
} else {
87-
// FIXME (AL): Do we /really/ want to allow creating an invalid buffer?
8877
DevicePointer::null()
8978
};
9079
Ok(DeviceBuffer {
9180
buf: ptr,
92-
capacity: size,
81+
len: size,
9382
})
9483
}
9584

85+
/// Enqueues an operation to free the memory backed by this [`DeviceBuffer`] on a
86+
/// particular stream. The stream will free the allocation as soon as it reaches
87+
/// the operation in the stream. You can ensure the memory is freed by synchronizing
88+
/// the stream.
89+
///
90+
/// This function uses internal memory pool semantics. Async allocations will reserve memory
91+
/// in the default memory pool in the stream, and async frees will release the memory back to the pool
92+
/// for further use by async allocations.
93+
///
94+
/// The memory inside of the pool is all freed back to the OS once the stream is synchronized unless
95+
/// a custom pool is configured to not do so.
96+
///
97+
/// # Examples
98+
///
99+
/// ```
100+
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
101+
/// # let _context = cust::quick_init().unwrap();
102+
/// use cust::{memory::*, stream::*};
103+
/// let stream = Stream::new(StreamFlags::DEFAULT, None)?;
104+
/// let mut host_vals = [1, 2, 3];
105+
/// unsafe {
106+
/// let mut allocated = DeviceBuffer::from_slice_async(&[4u8, 5, 6], &stream)?;
107+
/// allocated.async_copy_to(&mut host_vals, &stream)?;
108+
/// allocated.drop_async(&stream)?;
109+
/// }
110+
/// // ensure all async ops are done before trying to access the value
111+
/// stream.synchronize()?;
112+
/// assert_eq!(host_vals, [4, 5, 6]);
113+
/// # Ok(())
114+
/// # }
115+
/// ```
116+
pub fn drop_async(self, stream: &Stream) -> CudaResult<()> {
117+
if self.buf.is_null() {
118+
return Ok(());
119+
}
120+
// make sure we don't run the normal destructor, otherwise a double drop will happen
121+
let me = ManuallyDrop::new(self);
122+
// SAFETY: we consume the box so its not possible to use the box past its drop point unless
123+
// you keep around a pointer, but in that case, we cannot guarantee safety.
124+
unsafe { cuda_free_async(stream, me.buf) }
125+
}
126+
96127
/// Creates a `DeviceBuffer<T>` directly from the raw components of another device buffer.
97128
///
98129
/// # Safety
@@ -130,7 +161,10 @@ impl<T: DeviceCopy> DeviceBuffer<T> {
130161
/// let buffer = unsafe { DeviceBuffer::from_raw_parts(ptr, size) };
131162
/// ```
132163
pub unsafe fn from_raw_parts(ptr: DevicePointer<T>, capacity: usize) -> DeviceBuffer<T> {
133-
DeviceBuffer { buf: ptr, capacity }
164+
DeviceBuffer {
165+
buf: ptr,
166+
len: capacity,
167+
}
134168
}
135169

136170
/// Destroy a `DeviceBuffer`, returning an error.
@@ -157,8 +191,8 @@ impl<T: DeviceCopy> DeviceBuffer<T> {
157191
return Ok(());
158192
}
159193

160-
if dev_buf.capacity > 0 && mem::size_of::<T>() > 0 {
161-
let capacity = dev_buf.capacity;
194+
if dev_buf.len > 0 && size_of::<T>() > 0 {
195+
let capacity = dev_buf.len;
162196
let ptr = mem::replace(&mut dev_buf.buf, DevicePointer::null());
163197
unsafe {
164198
match cuda_free(ptr) {
@@ -174,6 +208,132 @@ impl<T: DeviceCopy> DeviceBuffer<T> {
174208
}
175209
}
176210
}
211+
212+
#[cfg(feature = "bytemuck")]
213+
impl<T: DeviceCopy + Zeroable> DeviceBuffer<T> {
214+
/// Allocate device memory and fill it with zeroes (`0u8`).
215+
///
216+
/// This doesn't actually allocate if `T` is zero-sized.
217+
///
218+
/// # Examples
219+
///
220+
/// ```
221+
/// # let _context = cust::quick_init().unwrap();
222+
/// use cust::memory::*;
223+
/// let mut zero = DeviceBuffer::zeroed(4).unwrap();
224+
/// let mut values = [1u8, 2, 3, 4];
225+
/// zero.copy_to(&mut values).unwrap();
226+
/// assert_eq!(values, [0; 4]);
227+
/// ```
228+
#[cfg_attr(docsrs, doc(cfg(feature = "bytemuck")))]
229+
pub fn zeroed(size: usize) -> CudaResult<Self> {
230+
unsafe {
231+
let new_buf = DeviceBuffer::uninitialized(size)?;
232+
if size_of::<T>() != 0 {
233+
cuda::cuMemsetD8_v2(new_buf.as_device_ptr().as_raw(), 0, size_of::<T>() * size)
234+
.to_result()?;
235+
}
236+
Ok(new_buf)
237+
}
238+
}
239+
240+
/// Allocates device memory asynchronously and asynchronously fills it with zeroes (`0u8`).
241+
///
242+
/// This doesn't actually allocate if `T` is zero-sized.
243+
///
244+
/// # Safety
245+
///
246+
/// This method enqueues two operations on the stream: An async allocation
247+
/// and an async memset. Because of this, you must ensure that:
248+
/// - The memory is not used in any way before it is actually allocated on the stream. You
249+
/// can ensure this happens by synchronizing the stream explicitly or using events.
250+
///
251+
/// # Examples
252+
///
253+
/// ```
254+
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
255+
/// # let _context = cust::quick_init().unwrap();
256+
/// use cust::{memory::*, stream::*};
257+
/// let stream = Stream::new(StreamFlags::DEFAULT, None)?;
258+
/// let mut values = [1u8, 2, 3, 4];
259+
/// unsafe {
260+
/// let mut zero = DeviceBuffer::zeroed_async(4, &stream)?;
261+
/// zero.async_copy_to(&mut values, &stream)?;
262+
/// zero.drop_async(&stream)?;
263+
/// }
264+
/// stream.synchronize()?;
265+
/// assert_eq!(values, [0; 4]);
266+
/// # Ok(())
267+
/// # }
268+
/// ```
269+
#[cfg_attr(docsrs, doc(cfg(feature = "bytemuck")))]
270+
pub unsafe fn zeroed_async(size: usize, stream: &Stream) -> CudaResult<Self> {
271+
let new_buf = DeviceBuffer::uninitialized_async(size, stream)?;
272+
if size_of::<T>() != 0 {
273+
cuda::cuMemsetD8Async(
274+
new_buf.as_device_ptr().as_raw(),
275+
0,
276+
size_of::<T>() * size,
277+
stream.as_inner(),
278+
)
279+
.to_result()?;
280+
}
281+
Ok(new_buf)
282+
}
283+
}
284+
285+
#[cfg(feature = "bytemuck")]
fn casting_went_wrong(src: &str, err: PodCastError) -> ! {
286+
panic!("{}>{:?}", src, err);
287+
}
288+
289+
#[cfg(feature = "bytemuck")]
290+
impl<A: DeviceCopy + Pod> DeviceBuffer<A> {
291+
/// Same as [`DeviceBuffer::try_cast`] but panics if the cast fails.
292+
///
293+
/// # Panics
294+
///
295+
/// See [`DeviceBuffer::try_cast`].
296+
#[cfg_attr(docsrs, doc(cfg(feature = "bytemuck")))]
297+
pub fn cast<B: Pod + DeviceCopy>(self) -> DeviceBuffer<B> {
298+
match Self::try_cast(self) {
299+
Ok(b) => b,
300+
Err(e) => casting_went_wrong("cast", e),
301+
}
302+
}
303+
304+
/// Tries to convert a [`DeviceBuffer`] of type `A` to a [`DeviceBuffer`] of type `B`. Returning
305+
/// an error if it failed.
306+
///
307+
/// The length of the buffer after the conversion may have changed.
308+
///
309+
/// # Failure
310+
///
311+
/// - If the target type has a greater alignment requirement.
312+
/// - If the target element type is a different size and the output buffer wouldn't have a
313+
/// whole number of elements. Such as `3` x [`u16`] -> `1.5` x [`u32`].
314+
/// - If either type is a ZST (but not both).
315+
#[cfg_attr(docsrs, doc(cfg(feature = "bytemuck")))]
316+
pub fn try_cast<B: Pod + DeviceCopy>(self) -> Result<DeviceBuffer<B>, PodCastError> {
317+
if align_of::<B>() > align_of::<A>() && (self.buf.as_raw() as usize) % align_of::<B>() != 0
318+
{
319+
Err(PodCastError::TargetAlignmentGreaterAndInputNotAligned)
320+
} else if size_of::<B>() == size_of::<A>() {
321+
// SAFETY: we made sure sizes were compatible, and DeviceBuffer is repr(C)
322+
Ok(unsafe { transmute::<_, DeviceBuffer<B>>(self) })
323+
} else if size_of::<A>() == 0 || size_of::<B>() == 0 {
324+
Err(PodCastError::SizeMismatch)
325+
} else if (size_of::<A>() * self.len) % size_of::<B>() == 0 {
326+
let new_len = (size_of::<A>() * self.len) / size_of::<B>();
327+
Ok(DeviceBuffer {
328+
buf: self.buf.cast(),
329+
len: new_len,
330+
})
331+
} else {
332+
Err(PodCastError::OutputSliceWouldHaveSlop)
333+
}
334+
}
335+
}
336+
177337
impl<T: DeviceCopy> DeviceBuffer<T> {
178338
/// Allocate a new device buffer of the same size as `slice`, initialized with a clone of
179339
/// the data in `slice`.
@@ -225,7 +385,7 @@ impl<T: DeviceCopy> DeviceBuffer<T> {
225385
/// }
226386
/// ```
227387
pub unsafe fn from_slice_async(slice: &[T], stream: &Stream) -> CudaResult<Self> {
228-
let mut uninit = DeviceBuffer::uninitialized(slice.len())?;
388+
let mut uninit = DeviceBuffer::uninitialized_async(slice.len(), stream)?;
229389
uninit.async_copy_from(slice, stream)?;
230390
Ok(uninit)
231391
}
@@ -256,13 +416,13 @@ impl<T: DeviceCopy> Drop for DeviceBuffer<T> {
256416
return;
257417
}
258418

259-
if self.capacity > 0 && mem::size_of::<T>() > 0 {
419+
if self.len > 0 && size_of::<T>() > 0 {
260420
let ptr = mem::replace(&mut self.buf, DevicePointer::null());
261421
unsafe {
262422
let _ = cuda_free(ptr);
263423
}
264424
}
265-
self.capacity = 0;
425+
self.len = 0;
266426
}
267427
}
268428

0 commit comments

Comments
 (0)