1
1
use crate :: error:: { CudaResult , DropResult , ToResult } ;
2
2
use crate :: memory:: device:: { AsyncCopyDestination , CopyDestination , DeviceSlice } ;
3
3
use crate :: memory:: malloc:: { cuda_free, cuda_malloc} ;
4
- use crate :: memory:: DeviceCopy ;
5
- use crate :: memory:: DevicePointer ;
4
+ use crate :: memory:: { cuda_free_async , DevicePointer } ;
5
+ use crate :: memory:: { cuda_malloc_async , DeviceCopy } ;
6
6
use crate :: stream:: Stream ;
7
7
use crate :: sys as cuda;
8
- use std:: mem;
8
+ #[ cfg_attr( docsrs, doc( cfg( feature = "bytemuck" ) ) ) ]
9
+ pub use bytemuck;
10
+ #[ cfg( feature = "bytemuck" ) ]
11
+ use bytemuck:: { Pod , PodCastError , Zeroable } ;
12
+ use std:: mem:: { self , align_of, size_of, transmute, ManuallyDrop } ;
9
13
use std:: ops:: { Deref , DerefMut } ;
10
14
11
15
/// Fixed-size device-side buffer. Provides basic access to device memory.
///
/// The buffer pairs a device pointer with an element count. `#[repr(C)]` pins the field layout,
/// which the same-size branch of `try_cast` relies on when it transmutes a buffer of one element
/// type into a buffer of another.
#[derive(Debug)]
#[repr(C)]
pub struct DeviceBuffer<T: DeviceCopy> {
    // Device pointer to the first element. The allocating constructors store a null pointer
    // here when there is nothing to allocate (zero elements or a zero-sized `T`).
    buf: DevicePointer<T>,
    // Number of `T` elements in the buffer (an element count, not a byte count).
    len: usize,
}
18
22
19
23
// The struct holds nothing but a device pointer and an element count, so the buffer is declared
// sendable whenever its element type is.
unsafe impl<T: Send + DeviceCopy> Send for DeviceBuffer<T> {}
@@ -42,57 +46,84 @@ impl<T: DeviceCopy> DeviceBuffer<T> {
42
46
/// buffer.copy_from(&[0u64, 1, 2, 3, 4]).unwrap();
43
47
/// ```
44
48
pub unsafe fn uninitialized ( size : usize ) -> CudaResult < Self > {
45
- let ptr = if size > 0 && mem :: size_of :: < T > ( ) > 0 {
49
+ let ptr = if size > 0 && size_of :: < T > ( ) > 0 {
46
50
cuda_malloc ( size) ?
47
51
} else {
48
52
// FIXME (AL): Do we /really/ want to allow creating an invalid buffer?
49
53
DevicePointer :: null ( )
50
54
} ;
51
55
Ok ( DeviceBuffer {
52
56
buf : ptr,
53
- capacity : size,
57
+ len : size,
54
58
} )
55
59
}
56
60
57
- /// Allocate a new device buffer large enough to hold `size` `T`'s and fill the contents with
58
- /// zeroes (`0u8`).
61
+ /// Allocates device memory asynchronously on a stream, without initializing it.
59
62
///
60
- /// # Errors
61
- ///
62
- /// If the allocation fails, returns the error from CUDA. If `size` is large enough that
63
- /// `size * mem::sizeof::<T>()` overflows usize, then returns InvalidMemoryAllocation.
63
+ /// This doesn't actually allocate if `T` is zero sized.
64
64
///
65
65
/// # Safety
66
66
///
67
- /// The backing memory is zeroed, which may not be a valid bit-pattern for type `T`. The caller
68
- /// must ensure either that all-zeroes is a valid bit-pattern for type `T` or that the backing
69
- /// memory is set to a valid value before it is read.
67
+ /// The allocated memory retains all of the unsafety of [`DeviceBuffer::uninitialized`], with
68
+ /// the additional consideration that the memory cannot be used until it is actually allocated
69
+ /// on the stream. This means proper stream ordering semantics must be followed, such as
70
+ /// only enqueing kernel launches that use the memory AFTER the allocation call.
70
71
///
71
- /// # Examples
72
- ///
73
- /// ```
74
- /// # let _context = cust::quick_init().unwrap();
75
- /// use cust::memory::*;
76
- /// let buffer = unsafe { DeviceBuffer::zeroed(5).unwrap() };
77
- /// let mut host_values = [1u64, 2, 3, 4, 5];
78
- /// buffer.copy_to(&mut host_values).unwrap();
79
- /// assert_eq!([0u64, 0, 0, 0, 0], host_values);
80
- /// ```
81
- pub unsafe fn zeroed ( size : usize ) -> CudaResult < Self > {
82
- let ptr = if size > 0 && mem:: size_of :: < T > ( ) > 0 {
83
- let ptr = cuda_malloc ( size) ?;
84
- cuda:: cuMemsetD8_v2 ( ptr. as_raw ( ) , 0 , size * mem:: size_of :: < T > ( ) ) . to_result ( ) ?;
85
- ptr
72
+ /// You can synchronize the stream to ensure the memory allocation operation is complete.
73
+ pub unsafe fn uninitialized_async ( size : usize , stream : & Stream ) -> CudaResult < Self > {
74
+ let ptr = if size > 0 && size_of :: < T > ( ) > 0 {
75
+ cuda_malloc_async ( stream, size) ?
86
76
} else {
87
- // FIXME (AL): Do we /really/ want to allow creating an invalid buffer?
88
77
DevicePointer :: null ( )
89
78
} ;
90
79
Ok ( DeviceBuffer {
91
80
buf : ptr,
92
- capacity : size,
81
+ len : size,
93
82
} )
94
83
}
95
84
85
+ /// Enqueues an operation to free the memory backed by this [`DeviceBuffer`] on a
86
+ /// particular stream. The stream will free the allocation as soon as it reaches
87
+ /// the operation in the stream. You can ensure the memory is freed by synchronizing
88
+ /// the stream.
89
+ ///
90
+ /// This function uses internal memory pool semantics. Async allocations will reserve memory
91
+ /// in the default memory pool in the stream, and async frees will release the memory back to the pool
92
+ /// for further use by async allocations.
93
+ ///
94
+ /// The memory inside of the pool is all freed back to the OS once the stream is synchronized unless
95
+ /// a custom pool is configured to not do so.
96
+ ///
97
+ /// # Examples
98
+ ///
99
+ /// ```
100
+ /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
101
+ /// # let _context = cust::quick_init().unwrap();
102
+ /// use cust::{memory::*, stream::*};
103
+ /// let stream = Stream::new(StreamFlags::DEFAULT, None)?;
104
+ /// let mut host_vals = [1, 2, 3];
105
+ /// unsafe {
106
+ /// let mut allocated = DeviceBuffer::from_slice_async(&[4u8, 5, 6], &stream)?;
107
+ /// allocated.async_copy_to(&mut host_vals, &stream)?;
108
+ /// allocated.drop_async(&stream)?;
109
+ /// }
110
+ /// // ensure all async ops are done before trying to access the value
111
+ /// stream.synchronize()?;
112
+ /// assert_eq!(host_vals, [4, 5, 6]);
113
+ /// # Ok(())
114
+ /// # }
115
+ /// ```
116
+ pub fn drop_async ( self , stream : & Stream ) -> CudaResult < ( ) > {
117
+ if self . buf . is_null ( ) {
118
+ return Ok ( ( ) ) ;
119
+ }
120
+ // make sure we dont run the normal destructor, otherwise a double drop will happen
121
+ let me = ManuallyDrop :: new ( self ) ;
122
+ // SAFETY: we consume the box so its not possible to use the box past its drop point unless
123
+ // you keep around a pointer, but in that case, we cannot guarantee safety.
124
+ unsafe { cuda_free_async ( stream, me. buf ) }
125
+ }
126
+
96
127
/// Creates a `DeviceBuffer<T>` directly from the raw components of another device buffer.
97
128
///
98
129
/// # Safety
@@ -130,7 +161,10 @@ impl<T: DeviceCopy> DeviceBuffer<T> {
130
161
/// let buffer = unsafe { DeviceBuffer::from_raw_parts(ptr, size) };
131
162
/// ```
132
163
pub unsafe fn from_raw_parts ( ptr : DevicePointer < T > , capacity : usize ) -> DeviceBuffer < T > {
133
- DeviceBuffer { buf : ptr, capacity }
164
+ DeviceBuffer {
165
+ buf : ptr,
166
+ len : capacity,
167
+ }
134
168
}
135
169
136
170
/// Destroy a `DeviceBuffer`, returning an error.
@@ -157,8 +191,8 @@ impl<T: DeviceCopy> DeviceBuffer<T> {
157
191
return Ok ( ( ) ) ;
158
192
}
159
193
160
- if dev_buf. capacity > 0 && mem :: size_of :: < T > ( ) > 0 {
161
- let capacity = dev_buf. capacity ;
194
+ if dev_buf. len > 0 && size_of :: < T > ( ) > 0 {
195
+ let capacity = dev_buf. len ;
162
196
let ptr = mem:: replace ( & mut dev_buf. buf , DevicePointer :: null ( ) ) ;
163
197
unsafe {
164
198
match cuda_free ( ptr) {
@@ -174,6 +208,132 @@ impl<T: DeviceCopy> DeviceBuffer<T> {
174
208
}
175
209
}
176
210
}
211
+
212
#[cfg(feature = "bytemuck")]
impl<T: DeviceCopy + Zeroable> DeviceBuffer<T> {
    /// Allocates device memory for `size` values of `T` and fills it with zero bytes (`0u8`).
    ///
    /// No allocation happens if `T` is zero-sized.
    ///
    /// # Examples
    ///
    /// ```
    /// # let _context = cust::quick_init().unwrap();
    /// use cust::memory::*;
    /// let mut zero = DeviceBuffer::zeroed(4).unwrap();
    /// let mut values = [1u8, 2, 3, 4];
    /// zero.copy_to(&mut values).unwrap();
    /// assert_eq!(values, [0; 4]);
    /// ```
    #[cfg_attr(docsrs, doc(cfg(feature = "bytemuck")))]
    pub fn zeroed(size: usize) -> CudaResult<Self> {
        // SAFETY: the memory is overwritten with zeroes below before the buffer is returned,
        // and `T: Zeroable` makes the all-zero bit pattern a valid `T`.
        let zeroed_buf = unsafe { DeviceBuffer::uninitialized(size)? };
        if size_of::<T>() == 0 {
            // Zero-sized elements: there are no bytes to clear.
            return Ok(zeroed_buf);
        }
        let byte_len = size_of::<T>() * size;
        // SAFETY: `zeroed_buf` was just allocated for `size` elements, i.e. `byte_len` bytes.
        unsafe {
            cuda::cuMemsetD8_v2(zeroed_buf.as_device_ptr().as_raw(), 0, byte_len).to_result()?;
        }
        Ok(zeroed_buf)
    }

    /// Asynchronously allocates device memory on `stream` and asynchronously fills it with
    /// zero bytes (`0u8`).
    ///
    /// No allocation happens if `T` is zero-sized.
    ///
    /// # Safety
    ///
    /// Two operations are enqueued on the stream: an async allocation followed by an async
    /// memset. Because of this, you must ensure that:
    /// - The memory is not used in any way before it is actually allocated on the stream. You
    ///   can ensure this by synchronizing the stream explicitly or by using events.
    ///
    /// # Examples
    ///
    /// ```
    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
    /// # let _context = cust::quick_init().unwrap();
    /// use cust::{memory::*, stream::*};
    /// let stream = Stream::new(StreamFlags::DEFAULT, None)?;
    /// let mut values = [1u8, 2, 3, 4];
    /// unsafe {
    ///     let mut zero = DeviceBuffer::zeroed_async(4, &stream)?;
    ///     zero.async_copy_to(&mut values, &stream)?;
    ///     zero.drop_async(&stream)?;
    /// }
    /// stream.synchronize()?;
    /// assert_eq!(values, [0; 4]);
    /// # Ok(())
    /// # }
    /// ```
    #[cfg_attr(docsrs, doc(cfg(feature = "bytemuck")))]
    pub unsafe fn zeroed_async(size: usize, stream: &Stream) -> CudaResult<Self> {
        let zeroed_buf = DeviceBuffer::uninitialized_async(size, stream)?;
        if size_of::<T>() == 0 {
            // Zero-sized elements: there are no bytes to clear.
            return Ok(zeroed_buf);
        }
        let byte_len = size_of::<T>() * size;
        // The memset goes onto the same stream, after the allocation enqueued above.
        cuda::cuMemsetD8Async(
            zeroed_buf.as_device_ptr().as_raw(),
            0,
            byte_len,
            stream.as_inner(),
        )
        .to_result()?;
        Ok(zeroed_buf)
    }
}
284
+
285
/// Panics with the message `<src>><err>`, where `err` is printed with its `Debug` form.
///
/// Used by the panicking cast to report a failed conversion. The function is gated on the
/// `bytemuck` feature because `PodCastError` is only imported when that feature is enabled;
/// without the gate, builds that leave the feature off fail on the unresolved type.
#[cfg(feature = "bytemuck")]
#[cold]
#[inline(never)]
fn casting_went_wrong(src: &str, err: PodCastError) -> ! {
    panic!("{}>{:?}", src, err);
}
288
+
289
#[cfg(feature = "bytemuck")]
impl<A: DeviceCopy + Pod> DeviceBuffer<A> {
    /// Same as [`DeviceBuffer::try_cast`] but panics if the cast fails.
    ///
    /// # Panics
    ///
    /// See [`DeviceBuffer::try_cast`].
    #[cfg_attr(docsrs, doc(cfg(feature = "bytemuck")))]
    pub fn cast<B: Pod + DeviceCopy>(self) -> DeviceBuffer<B> {
        match Self::try_cast(self) {
            Ok(b) => b,
            Err(e) => casting_went_wrong("cast", e),
        }
    }

    /// Tries to convert a [`DeviceBuffer`] of type `A` to a [`DeviceBuffer`] of type `B`,
    /// returning an error if the conversion is not possible.
    ///
    /// On success the returned buffer takes over the original allocation; no device memory is
    /// copied. The length of the buffer after the conversion may have changed.
    ///
    /// # Failure
    ///
    /// - If the target type has a greater alignment requirement and the device pointer is not
    ///   aligned to it.
    /// - If the target element type is a different size and the output buffer wouldn't have a
    ///   whole number of elements. Such as `3` x [`u16`] -> `1.5` x [`u32`].
    /// - If either type is a ZST (but not both).
    ///
    /// The buffer is consumed in every case: when an error is returned, the original buffer has
    /// been dropped.
    #[cfg_attr(docsrs, doc(cfg(feature = "bytemuck")))]
    pub fn try_cast<B: Pod + DeviceCopy>(self) -> Result<DeviceBuffer<B>, PodCastError> {
        if align_of::<B>() > align_of::<A>() && (self.buf.as_raw() as usize) % align_of::<B>() != 0
        {
            Err(PodCastError::TargetAlignmentGreaterAndInputNotAligned)
        } else if size_of::<B>() == size_of::<A>() {
            // SAFETY: we made sure sizes were compatible, and DeviceBuffer is repr(C).
            // `transmute` moves `self`, so its destructor does not run.
            Ok(unsafe { transmute::<_, DeviceBuffer<B>>(self) })
        } else if size_of::<A>() == 0 || size_of::<B>() == 0 {
            Err(PodCastError::SizeMismatch)
        } else if (size_of::<A>() * self.len) % size_of::<B>() == 0 {
            let new_len = (size_of::<A>() * self.len) / size_of::<B>();
            // The returned buffer becomes the owner of the allocation. The source must not run
            // its destructor: that would free the memory the new buffer points at, leaving it
            // dangling and causing a second free when the new buffer is dropped.
            let source = ManuallyDrop::new(self);
            Ok(DeviceBuffer {
                buf: source.buf.cast(),
                len: new_len,
            })
        } else {
            Err(PodCastError::OutputSliceWouldHaveSlop)
        }
    }
}
336
+
177
337
impl < T : DeviceCopy > DeviceBuffer < T > {
178
338
/// Allocate a new device buffer of the same size as `slice`, initialized with a clone of
179
339
/// the data in `slice`.
@@ -225,7 +385,7 @@ impl<T: DeviceCopy> DeviceBuffer<T> {
225
385
/// }
226
386
/// ```
227
387
pub unsafe fn from_slice_async ( slice : & [ T ] , stream : & Stream ) -> CudaResult < Self > {
228
- let mut uninit = DeviceBuffer :: uninitialized ( slice. len ( ) ) ?;
388
+ let mut uninit = DeviceBuffer :: uninitialized_async ( slice. len ( ) , stream ) ?;
229
389
uninit. async_copy_from ( slice, stream) ?;
230
390
Ok ( uninit)
231
391
}
@@ -256,13 +416,13 @@ impl<T: DeviceCopy> Drop for DeviceBuffer<T> {
256
416
return ;
257
417
}
258
418
259
- if self . capacity > 0 && mem :: size_of :: < T > ( ) > 0 {
419
+ if self . len > 0 && size_of :: < T > ( ) > 0 {
260
420
let ptr = mem:: replace ( & mut self . buf , DevicePointer :: null ( ) ) ;
261
421
unsafe {
262
422
let _ = cuda_free ( ptr) ;
263
423
}
264
424
}
265
- self . capacity = 0 ;
425
+ self . len = 0 ;
266
426
}
267
427
}
268
428
0 commit comments