Skip to content

Commit f517516

Browse files
committed
Feat: more bytemuck-based features for devicebuffer/deviceslice
1 parent 3dd86d9 commit f517516

File tree

10 files changed

+449
-57
lines changed

10 files changed

+449
-57
lines changed

crates/cust/CHANGELOG.md

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -52,12 +52,22 @@ Instead you can now use `DeviceSlice::index` which behaves the same.
5252
- Added `cust::memory::LockedBox`, same as `LockedBuffer` except for single elements.
5353
- Added `cust::memory::cuda_malloc_async`.
5454
- Added `cust::memory::cuda_free_async`.
55-
- Added `impl AsyncCopyDestination<LockedBox<T>> for DeviceBox<T>` for async HtoD memcpy.
55+
- Added `impl AsyncCopyDestination<LockedBox<T>> for DeviceBox<T>` for async HtoD/DtoH memcpy.
56+
- Added `DeviceBox::new_async`.
57+
- Added `DeviceBox::drop_async`.
58+
- Added `DeviceBox::zeroed_async`.
59+
- Added `DeviceBox::uninitialized_async`.
60+
- Added `DeviceBuffer::uninitialized_async`.
61+
- Added `DeviceBuffer::drop_async`.
62+
- Added `DeviceBuffer::zeroed`.
63+
- Added `DeviceBuffer::zeroed_async`.
64+
- Added `DeviceBuffer::cast`.
65+
- Added `DeviceBuffer::try_cast`.
66+
- Added `DeviceSlice::set_8` and `DeviceSlice::set_8_async`.
67+
- Added `DeviceSlice::set_16` and `DeviceSlice::set_16_async`.
68+
- Added `DeviceSlice::set_32` and `DeviceSlice::set_32_async`.
69+
- Added `DeviceSlice::set_zero` and `DeviceSlice::set_zero_async`.
5670
- Added the `bytemuck` feature which is enabled by default.
57-
- Added `zeroed_async` to `DeviceBox`.
58-
- Added `drop_async` to `DeviceBox`.
59-
- Added `new_async` to `DeviceBox`.
60-
- Added `DevicePointer::as_ptr` and `DevicePointer::as_mut_ptr` for returning `*const T` or `*mut T`.
6171
- Added mint integration behind `impl_mint`.
6272
- Added half integration behind `impl_half`.
6373
- Added glam integration behind `impl_glam`.
@@ -69,8 +79,8 @@ Instead you can now use `DeviceSlice::index` which behaves the same.
6979
- Added `mem_get_info` to query the amount of free and total memory.
7080
- Added `DevicePointer::as_ptr` and `DevicePointer::as_mut_ptr` for `*const T` and `*mut T`.
7181
- Added `DevicePointer::from_raw` for `CUdeviceptr -> DevicePointer<T>` with a safe function.
82+
- Added `DevicePointer::cast`.
7283
- Added dependency on `cust_core` for `DeviceCopy`.
73-
- Added dependency on `goblin` for verifying cubins and fatbins (impossible to implement safe module loading without it).
7484
- Added `ModuleJitOption`, `JitFallback`, `JitTarget`, and `OptLevel` for specifying options when loading a module. Note that
7585
`ModuleJitOption::MaxRegisters` does not seem to work currently, but NVIDIA is looking into it.
7686
You can achieve the same goal by compiling the ptx to cubin using nvcc then loading that: `nvcc --cubin foo.ptx -maxrregcount=REGS`

crates/cust/Cargo.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@ mint = { version = "^0.5", optional = true }
2222
num-complex = { version = "0.4", optional = true }
2323
vek = { version = "0.15.1", optional = true, default-features = false }
2424
bytemuck = { version = "1.7.3", optional = true }
25-
goblin = { version = "0.4.3", default-features = false, features = ["elf32", "elf64", "std", "endian_fd"] }
2625

2726
[features]
2827
default= ["bytemuck"]

crates/cust/src/memory/device/device_box.rs

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ impl<T: DeviceCopy> DeviceBox<T> {
8484
/// # Ok(())
8585
/// # }
8686
pub unsafe fn new_async(val: &T, stream: &Stream) -> CudaResult<Self> {
87-
let mut dev_box = DeviceBox::uninitialized()?;
87+
let mut dev_box = DeviceBox::uninitialized_async(stream)?;
8888
dev_box.async_copy_from(val, stream)?;
8989
Ok(dev_box)
9090
}
@@ -120,6 +120,9 @@ impl<T: DeviceCopy> DeviceBox<T> {
120120
/// # Ok(())
121121
/// # }
122122
pub fn drop_async(self, stream: &Stream) -> CudaResult<()> {
123+
if self.ptr.is_null() {
124+
return Ok(());
125+
}
123126
// make sure we dont run the normal destructor, otherwise a double drop will happen
124127
let me = ManuallyDrop::new(self);
125128
// SAFETY: we consume the box so its not possible to use the box past its drop point unless
@@ -165,7 +168,7 @@ impl<T: DeviceCopy + bytemuck::Zeroable> DeviceBox<T> {
165168
}
166169
}
167170

168-
/// Allocate device memory asynchronously and asynchronously fills it with zeroes (`0u8`).
171+
/// Allocates device memory asynchronously and asynchronously fills it with zeroes (`0u8`).
169172
///
170173
/// This doesn't actually allocate if `T` is zero-sized.
171174
///

crates/cust/src/memory/device/device_buffer.rs

Lines changed: 198 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,23 @@
11
use crate::error::{CudaResult, DropResult, ToResult};
22
use crate::memory::device::{AsyncCopyDestination, CopyDestination, DeviceSlice};
33
use crate::memory::malloc::{cuda_free, cuda_malloc};
4-
use crate::memory::DeviceCopy;
5-
use crate::memory::DevicePointer;
4+
use crate::memory::{cuda_free_async, DevicePointer};
5+
use crate::memory::{cuda_malloc_async, DeviceCopy};
66
use crate::stream::Stream;
77
use crate::sys as cuda;
8-
use std::mem;
8+
#[cfg(feature = "bytemuck")]
#[cfg_attr(docsrs, doc(cfg(feature = "bytemuck")))]
9+
pub use bytemuck;
10+
#[cfg(feature = "bytemuck")]
11+
use bytemuck::{Pod, PodCastError, Zeroable};
12+
use std::mem::{self, align_of, size_of, transmute, ManuallyDrop};
913
use std::ops::{Deref, DerefMut};
1014

1115
/// Fixed-size device-side buffer. Provides basic access to device memory.
1216
#[derive(Debug)]
1317
#[repr(C)]
1418
pub struct DeviceBuffer<T: DeviceCopy> {
1519
buf: DevicePointer<T>,
16-
capacity: usize,
20+
len: usize,
1721
}
1822

1923
unsafe impl<T: Send + DeviceCopy> Send for DeviceBuffer<T> {}
@@ -42,57 +46,84 @@ impl<T: DeviceCopy> DeviceBuffer<T> {
4246
/// buffer.copy_from(&[0u64, 1, 2, 3, 4]).unwrap();
4347
/// ```
4448
pub unsafe fn uninitialized(size: usize) -> CudaResult<Self> {
45-
let ptr = if size > 0 && mem::size_of::<T>() > 0 {
49+
let ptr = if size > 0 && size_of::<T>() > 0 {
4650
cuda_malloc(size)?
4751
} else {
4852
// FIXME (AL): Do we /really/ want to allow creating an invalid buffer?
4953
DevicePointer::null()
5054
};
5155
Ok(DeviceBuffer {
5256
buf: ptr,
53-
capacity: size,
57+
len: size,
5458
})
5559
}
5660

57-
/// Allocate a new device buffer large enough to hold `size` `T`'s and fill the contents with
58-
/// zeroes (`0u8`).
61+
/// Allocates device memory asynchronously on a stream, without initializing it.
5962
///
60-
/// # Errors
61-
///
62-
/// If the allocation fails, returns the error from CUDA. If `size` is large enough that
63-
/// `size * mem::sizeof::<T>()` overflows usize, then returns InvalidMemoryAllocation.
63+
/// This doesn't actually allocate if `T` is zero-sized.
6464
///
6565
/// # Safety
6666
///
67-
/// The backing memory is zeroed, which may not be a valid bit-pattern for type `T`. The caller
68-
/// must ensure either that all-zeroes is a valid bit-pattern for type `T` or that the backing
69-
/// memory is set to a valid value before it is read.
67+
/// The allocated memory retains all of the unsafety of [`DeviceBuffer::uninitialized`], with
68+
/// the additional consideration that the memory cannot be used until it is actually allocated
69+
/// on the stream. This means proper stream ordering semantics must be followed, such as
70+
/// only enqueueing kernel launches that use the memory AFTER the allocation call.
7071
///
71-
/// # Examples
72-
///
73-
/// ```
74-
/// # let _context = cust::quick_init().unwrap();
75-
/// use cust::memory::*;
76-
/// let buffer = unsafe { DeviceBuffer::zeroed(5).unwrap() };
77-
/// let mut host_values = [1u64, 2, 3, 4, 5];
78-
/// buffer.copy_to(&mut host_values).unwrap();
79-
/// assert_eq!([0u64, 0, 0, 0, 0], host_values);
80-
/// ```
81-
pub unsafe fn zeroed(size: usize) -> CudaResult<Self> {
82-
let ptr = if size > 0 && mem::size_of::<T>() > 0 {
83-
let ptr = cuda_malloc(size)?;
84-
cuda::cuMemsetD8_v2(ptr.as_raw(), 0, size * mem::size_of::<T>()).to_result()?;
85-
ptr
72+
/// You can synchronize the stream to ensure the memory allocation operation is complete.
73+
pub unsafe fn uninitialized_async(size: usize, stream: &Stream) -> CudaResult<Self> {
74+
let ptr = if size > 0 && size_of::<T>() > 0 {
75+
cuda_malloc_async(stream, size)?
8676
} else {
87-
// FIXME (AL): Do we /really/ want to allow creating an invalid buffer?
8877
DevicePointer::null()
8978
};
9079
Ok(DeviceBuffer {
9180
buf: ptr,
92-
capacity: size,
81+
len: size,
9382
})
9483
}
9584

85+
/// Enqueues an operation to free the memory backed by this [`DeviceBuffer`] on a
86+
/// particular stream. The stream will free the allocation as soon as it reaches
87+
/// the operation in the stream. You can ensure the memory is freed by synchronizing
88+
/// the stream.
89+
///
90+
/// This function uses internal memory pool semantics. Async allocations will reserve memory
91+
/// in the default memory pool in the stream, and async frees will release the memory back to the pool
92+
/// for further use by async allocations.
93+
///
94+
/// The memory inside of the pool is all freed back to the OS once the stream is synchronized unless
95+
/// a custom pool is configured to not do so.
96+
///
97+
/// # Examples
98+
///
99+
/// ```
100+
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
101+
/// # let _context = cust::quick_init().unwrap();
102+
/// use cust::{memory::*, stream::*};
103+
/// let stream = Stream::new(StreamFlags::DEFAULT, None)?;
104+
/// let mut host_vals = [1, 2, 3];
105+
/// unsafe {
106+
/// let mut allocated = DeviceBuffer::from_slice_async(&[4u8, 5, 6], &stream)?;
107+
/// allocated.async_copy_to(&mut host_vals, &stream)?;
108+
/// allocated.drop_async(&stream)?;
109+
/// }
110+
/// // ensure all async ops are done before trying to access the value
111+
/// stream.synchronize()?;
112+
/// assert_eq!(host_vals, [4, 5, 6]);
113+
/// # Ok(())
114+
/// # }
115+
/// ```
116+
pub fn drop_async(self, stream: &Stream) -> CudaResult<()> {
117+
if self.buf.is_null() {
118+
return Ok(());
119+
}
120+
// make sure we don't run the normal destructor, otherwise a double drop will happen
121+
let me = ManuallyDrop::new(self);
122+
// SAFETY: we consume the box so its not possible to use the box past its drop point unless
123+
// you keep around a pointer, but in that case, we cannot guarantee safety.
124+
unsafe { cuda_free_async(stream, me.buf) }
125+
}
126+
96127
/// Creates a `DeviceBuffer<T>` directly from the raw components of another device buffer.
97128
///
98129
/// # Safety
@@ -130,7 +161,10 @@ impl<T: DeviceCopy> DeviceBuffer<T> {
130161
/// let buffer = unsafe { DeviceBuffer::from_raw_parts(ptr, size) };
131162
/// ```
132163
pub unsafe fn from_raw_parts(ptr: DevicePointer<T>, capacity: usize) -> DeviceBuffer<T> {
133-
DeviceBuffer { buf: ptr, capacity }
164+
DeviceBuffer {
165+
buf: ptr,
166+
len: capacity,
167+
}
134168
}
135169

136170
/// Destroy a `DeviceBuffer`, returning an error.
@@ -157,8 +191,8 @@ impl<T: DeviceCopy> DeviceBuffer<T> {
157191
return Ok(());
158192
}
159193

160-
if dev_buf.capacity > 0 && mem::size_of::<T>() > 0 {
161-
let capacity = dev_buf.capacity;
194+
if dev_buf.len > 0 && size_of::<T>() > 0 {
195+
let capacity = dev_buf.len;
162196
let ptr = mem::replace(&mut dev_buf.buf, DevicePointer::null());
163197
unsafe {
164198
match cuda_free(ptr) {
@@ -174,6 +208,132 @@ impl<T: DeviceCopy> DeviceBuffer<T> {
174208
}
175209
}
176210
}
211+
212+
#[cfg(feature = "bytemuck")]
213+
impl<T: DeviceCopy + Zeroable> DeviceBuffer<T> {
214+
/// Allocate device memory and fill it with zeroes (`0u8`).
215+
///
216+
/// This doesn't actually allocate if `T` is zero-sized.
217+
///
218+
/// # Examples
219+
///
220+
/// ```
221+
/// # let _context = cust::quick_init().unwrap();
222+
/// use cust::memory::*;
223+
/// let mut zero = DeviceBuffer::zeroed(4).unwrap();
224+
/// let mut values = [1u8, 2, 3, 4];
225+
/// zero.copy_to(&mut values).unwrap();
226+
/// assert_eq!(values, [0; 4]);
227+
/// ```
228+
#[cfg_attr(docsrs, doc(cfg(feature = "bytemuck")))]
229+
pub fn zeroed(size: usize) -> CudaResult<Self> {
230+
unsafe {
231+
let new_buf = DeviceBuffer::uninitialized(size)?;
232+
if size_of::<T>() != 0 {
233+
cuda::cuMemsetD8_v2(new_buf.as_device_ptr().as_raw(), 0, size_of::<T>() * size)
234+
.to_result()?;
235+
}
236+
Ok(new_buf)
237+
}
238+
}
239+
240+
/// Allocates device memory asynchronously and asynchronously fills it with zeroes (`0u8`).
241+
///
242+
/// This doesn't actually allocate if `T` is zero-sized.
243+
///
244+
/// # Safety
245+
///
246+
/// This method enqueues two operations on the stream: An async allocation
247+
/// and an async memset. Because of this, you must ensure that:
248+
/// - The memory is not used in any way before it is actually allocated on the stream. You
249+
/// can ensure this happens by synchronizing the stream explicitly or using events.
250+
///
251+
/// # Examples
252+
///
253+
/// ```
254+
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
255+
/// # let _context = cust::quick_init().unwrap();
256+
/// use cust::{memory::*, stream::*};
257+
/// let stream = Stream::new(StreamFlags::DEFAULT, None)?;
258+
/// let mut values = [1u8, 2, 3, 4];
259+
/// unsafe {
260+
/// let mut zero = DeviceBuffer::zeroed_async(4, &stream)?;
261+
/// zero.async_copy_to(&mut values, &stream)?;
262+
/// zero.drop_async(&stream)?;
263+
/// }
264+
/// stream.synchronize()?;
265+
/// assert_eq!(values, [0; 4]);
266+
/// # Ok(())
267+
/// # }
268+
/// ```
269+
#[cfg_attr(docsrs, doc(cfg(feature = "bytemuck")))]
270+
pub unsafe fn zeroed_async(size: usize, stream: &Stream) -> CudaResult<Self> {
271+
let new_buf = DeviceBuffer::uninitialized_async(size, stream)?;
272+
if size_of::<T>() != 0 {
273+
cuda::cuMemsetD8Async(
274+
new_buf.as_device_ptr().as_raw(),
275+
0,
276+
size_of::<T>() * size,
277+
stream.as_inner(),
278+
)
279+
.to_result()?;
280+
}
281+
Ok(new_buf)
282+
}
283+
}
284+
285+
#[cfg(feature = "bytemuck")]
fn casting_went_wrong(src: &str, err: PodCastError) -> ! {
286+
panic!("{}>{:?}", src, err);
287+
}
288+
289+
#[cfg(feature = "bytemuck")]
290+
impl<A: DeviceCopy + Pod> DeviceBuffer<A> {
291+
/// Same as [`DeviceBuffer::try_cast`] but panics if the cast fails.
292+
///
293+
/// # Panics
294+
///
295+
/// See [`DeviceBuffer::try_cast`].
296+
#[cfg_attr(docsrs, doc(cfg(feature = "bytemuck")))]
297+
pub fn cast<B: Pod + DeviceCopy>(self) -> DeviceBuffer<B> {
298+
match Self::try_cast(self) {
299+
Ok(b) => b,
300+
Err(e) => casting_went_wrong("cast", e),
301+
}
302+
}
303+
304+
/// Tries to convert a [`DeviceBuffer`] of type `A` to a [`DeviceBuffer`] of type `B`. Returning
305+
/// an error if it failed.
306+
///
307+
/// The length of the buffer after the conversion may have changed.
308+
///
309+
/// # Failure
310+
///
311+
/// - If the target type has a greater alignment requirement.
312+
/// - If the target element type is a different size and the output buffer wouldn't have a
313+
/// whole number of elements. Such as `3` x [`u16`] -> `1.5` x [`u32`].
314+
/// - If either type is a ZST (but not both).
315+
#[cfg_attr(docsrs, doc(cfg(feature = "bytemuck")))]
316+
pub fn try_cast<B: Pod + DeviceCopy>(self) -> Result<DeviceBuffer<B>, PodCastError> {
317+
if align_of::<B>() > align_of::<A>() && (self.buf.as_raw() as usize) % align_of::<B>() != 0
318+
{
319+
Err(PodCastError::TargetAlignmentGreaterAndInputNotAligned)
320+
} else if size_of::<B>() == size_of::<A>() {
321+
// SAFETY: we made sure sizes were compatible, and DeviceBuffer is repr(C)
322+
Ok(unsafe { transmute::<_, DeviceBuffer<B>>(self) })
323+
} else if size_of::<A>() == 0 || size_of::<B>() == 0 {
324+
Err(PodCastError::SizeMismatch)
325+
} else if (size_of::<A>() * self.len) % size_of::<B>() == 0 {
326+
let new_len = (size_of::<A>() * self.len) / size_of::<B>();
327+
Ok(DeviceBuffer {
328+
buf: self.buf.cast(),
329+
len: new_len,
330+
})
331+
} else {
332+
Err(PodCastError::OutputSliceWouldHaveSlop)
333+
}
334+
}
335+
}
336+
177337
impl<T: DeviceCopy> DeviceBuffer<T> {
178338
/// Allocate a new device buffer of the same size as `slice`, initialized with a clone of
179339
/// the data in `slice`.
@@ -225,7 +385,7 @@ impl<T: DeviceCopy> DeviceBuffer<T> {
225385
/// }
226386
/// ```
227387
pub unsafe fn from_slice_async(slice: &[T], stream: &Stream) -> CudaResult<Self> {
228-
let mut uninit = DeviceBuffer::uninitialized(slice.len())?;
388+
let mut uninit = DeviceBuffer::uninitialized_async(slice.len(), stream)?;
229389
uninit.async_copy_from(slice, stream)?;
230390
Ok(uninit)
231391
}
@@ -256,13 +416,13 @@ impl<T: DeviceCopy> Drop for DeviceBuffer<T> {
256416
return;
257417
}
258418

259-
if self.capacity > 0 && mem::size_of::<T>() > 0 {
419+
if self.len > 0 && size_of::<T>() > 0 {
260420
let ptr = mem::replace(&mut self.buf, DevicePointer::null());
261421
unsafe {
262422
let _ = cuda_free(ptr);
263423
}
264424
}
265-
self.capacity = 0;
425+
self.len = 0;
266426
}
267427
}
268428

0 commit comments

Comments
 (0)