Rust-GPU · LegNeato · Apr 1, 2025
diff --git a/crates/cuda_std/Cargo.toml b/crates/cuda_std/Cargo.toml
@@ -8,6 +8,7 @@ repository = "https://github.com/Rust-GPU/Rust-CUDA"
 readme = "../../README.md"
 
 [dependencies]
+glam = { version = ">=0.30.1", default-features = false, features = ["libm", "cuda", "bytemuck"] }
 vek = { version = "0.17.1", default-features = false, features = ["libm"] }
 cuda_std_macros = { version = "0.2", path = "../cuda_std_macros" }
 half = "2.4.1"

diff --git a/crates/cuda_std/src/lib.rs b/crates/cuda_std/src/lib.rs
@@ -49,7 +49,9 @@ mod float_ext;
 pub use cuda_std_macros::*;
 pub use float::GpuFloat;
 pub use float_ext::*;
+pub use glam;
 pub use half;
+#[deprecated(note = "The `vek` module is deprecated, use `glam` instead.")]
 pub use vek;
 
 pub use half::{bf16, f16};

diff --git a/crates/cuda_std/src/rt/mod.rs b/crates/cuda_std/src/rt/mod.rs
@@ -152,23 +152,23 @@ impl<'a> From<&'a GridSize> for GridSize {
         other.clone()
     }
 }
-impl From<vek::Vec2<u32>> for GridSize {
-    fn from(vec: vek::Vec2<u32>) -> Self {
+impl From<glam::UVec2> for GridSize {
+    fn from(vec: glam::UVec2) -> Self {
         GridSize::xy(vec.x, vec.y)
     }
 }
-impl From<vek::Vec3<u32>> for GridSize {
-    fn from(vec: vek::Vec3<u32>) -> Self {
+impl From<glam::UVec3> for GridSize {
+    fn from(vec: glam::UVec3) -> Self {
         GridSize::xyz(vec.x, vec.y, vec.z)
     }
 }
-impl From<vek::Vec2<usize>> for GridSize {
-    fn from(vec: vek::Vec2<usize>) -> Self {
+impl From<glam::USizeVec2> for GridSize {
+    fn from(vec: glam::USizeVec2) -> Self {
         GridSize::xy(vec.x as u32, vec.y as u32)
     }
 }
-impl From<vek::Vec3<usize>> for GridSize {
-    fn from(vec: vek::Vec3<usize>) -> Self {
+impl From<glam::USizeVec3> for GridSize {
+    fn from(vec: glam::USizeVec3) -> Self {
         GridSize::xyz(vec.x as u32, vec.y as u32, vec.z as u32)
     }
 }
@@ -228,23 +228,23 @@ impl<'a> From<&'a BlockSize> for BlockSize {
         other.clone()
     }
 }
-impl From<vek::Vec2<u32>> for BlockSize {
-    fn from(vec: vek::Vec2<u32>) -> Self {
+impl From<glam::UVec2> for BlockSize {
+    fn from(vec: glam::UVec2) -> Self {
         BlockSize::xy(vec.x, vec.y)
     }
 }
-impl From<vek::Vec3<u32>> for BlockSize {
-    fn from(vec: vek::Vec3<u32>) -> Self {
+impl From<glam::UVec3> for BlockSize {
+    fn from(vec: glam::UVec3) -> Self {
         BlockSize::xyz(vec.x, vec.y, vec.z)
     }
 }
-impl From<vek::Vec2<usize>> for BlockSize {
-    fn from(vec: vek::Vec2<usize>) -> Self {
+impl From<glam::USizeVec2> for BlockSize {
+    fn from(vec: glam::USizeVec2) -> Self {
         BlockSize::xy(vec.x as u32, vec.y as u32)
     }
 }
-impl From<vek::Vec3<usize>> for BlockSize {
-    fn from(vec: vek::Vec3<usize>) -> Self {
+impl From<glam::USizeVec3> for BlockSize {
+    fn from(vec: glam::USizeVec3) -> Self {
         BlockSize::xyz(vec.x as u32, vec.y as u32, vec.z as u32)
     }
 }
diff --git a/crates/cuda_std/src/thread.rs b/crates/cuda_std/src/thread.rs
@@ -19,7 +19,7 @@
 // TODO: write some docs about the terms used in this module.
 
 use cuda_std_macros::gpu_only;
-use vek::{Vec2, Vec3};
+use glam::{UVec2, UVec3};
 
 // different calling conventions dont exist in nvptx, so we just use C as a placeholder.
 extern "C" {
@@ -152,7 +152,7 @@ pub fn grid_dim_z() -> u32 {
 /// Gets the 3d index of the thread currently executing the kernel.
 #[gpu_only]
 #[inline(always)]
-pub fn thread_idx() -> Vec3<u32> {
+pub fn thread_idx() -> UVec3 {
     unsafe {
-        Vec3::new(
+        UVec3::new(
             __nvvm_thread_idx_x(),
@@ -165,7 +165,7 @@ pub fn thread_idx() -> Vec3<u32> {
 /// Gets the 3d index of the block that the thread currently executing the kernel is located in.
 #[gpu_only]
 #[inline(always)]
-pub fn block_idx() -> Vec3<u32> {
+pub fn block_idx() -> UVec3 {
     unsafe {
-        Vec3::new(
+        UVec3::new(
             __nvvm_block_idx_x(),
@@ -179,7 +179,7 @@ pub fn block_idx() -> Vec3<u32> {
 /// how many threads exist in each thread block in every direction.
 #[gpu_only]
 #[inline(always)]
-pub fn block_dim() -> Vec3<u32> {
+pub fn block_dim() -> UVec3 {
     unsafe {
-        Vec3::new(
+        UVec3::new(
             __nvvm_block_dim_x(),
@@ -193,7 +193,7 @@ pub fn block_dim() -> Vec3<u32> {
 /// how many thread blocks exist in each grid in every direction.
 #[gpu_only]
 #[inline(always)]
-pub fn grid_dim() -> Vec3<u32> {
+pub fn grid_dim() -> UVec3 {
     unsafe {
-        Vec3::new(
+        UVec3::new(
             __nvvm_grid_dim_x(),
@@ -232,26 +232,26 @@ pub fn index_1d() -> u32 {
 }
 
 #[inline(always)]
-pub fn index_2d() -> Vec2<u32> {
+pub fn index_2d() -> UVec2 {
     let i = thread_idx_x() + block_idx_x() * block_dim_x();
     let j = thread_idx_y() + block_idx_y() * block_dim_y();
-    Vec2::new(i, j)
+    UVec2::new(i, j)
 }
 
 #[inline(always)]
-pub fn index_3d() -> Vec3<u32> {
+pub fn index_3d() -> UVec3 {
     let i = thread_idx_x() + block_idx_x() * block_dim_x();
     let j = thread_idx_y() + block_idx_y() * block_dim_y();
     let k = thread_idx_z() + block_idx_z() * block_dim_z();
-    Vec3::new(i, j, k)
+    UVec3::new(i, j, k)
 }
 
 /// Whether this is the first thread (not the first thread to be executing). This function is guaranteed
 /// to only return true in a single thread that is invoking it. This is useful for only doing something
 /// once.
 #[inline(always)]
 pub fn first() -> bool {
-    block_idx() == Vec3::zero() && thread_idx() == Vec3::zero()
+    block_idx() == UVec3::ZERO && thread_idx() == UVec3::ZERO
 }
 
 /// Gets the number of threads inside of a warp. Currently 32 threads on every GPU architecture.

diff --git a/crates/cust/CHANGELOG.md b/crates/cust/CHANGELOG.md
@@ -4,6 +4,7 @@ Notable changes to this project will be documented in this file.
 
 ## Unreleased
 
+- `cuda_std::vek` is now deprecated. Use `cuda_std::glam`.
 - Add `memory::memcpy_dtoh` to allow copying from device to host.
 - `DeviceSlice` is represented as a slice again, but as `[()]` instead of `[T]`.
 - Reimplemented `Index` and `IndexMut` for `DeviceSlice` and removed `DeviceSlice::index`.

diff --git a/crates/optix_device/Cargo.toml b/crates/optix_device/Cargo.toml
@@ -2,12 +2,17 @@
 name = "optix_device"
 version = "0.1.0"
 edition = "2021"
-authors = ["Anders Langlands <[email protected]>", "Riccardo D'Ambrosio <[email protected]>"]
+authors = [
+    "Anders Langlands <[email protected]>",
+    "Riccardo D'Ambrosio <[email protected]>"
+]
 
 [dependencies]
 bitflags = "2.8"
 cuda_std = { version = "0.2", path = "../cuda_std" }
-glam = { version = "0.29", features=["cuda", "libm"], default-features=false }
 paste = "1.0.15"
 seq-macro = "0.3.5"
 cust_core = { version = "0.1", path = "../cust_core" }
+
+[target.'cfg(not(target_os = "cuda"))'.dependencies]
+glam = { version = "0.29", features = ["cuda"], default-features = false }
diff --git a/crates/optix_device/src/hit.rs b/crates/optix_device/src/hit.rs
@@ -1,6 +1,6 @@
 #[cfg(target_os = "cuda")]
 use core::arch::asm;
-use cuda_std::gpu_only;
+use cuda_std::{glam, gpu_only};
 use glam::Vec3;
 /// The type of primitive that a ray hit.
 #[repr(u32)]

diff --git a/crates/optix_device/src/lib.rs b/crates/optix_device/src/lib.rs
@@ -14,8 +14,7 @@ pub mod trace;
 pub mod transform;
 pub mod util;
 
-use cuda_std::*;
-pub use glam;
+use cuda_std::{glam, *};
 use glam::UVec3;
 pub use misc::*;
 

diff --git a/crates/optix_device/src/ray.rs b/crates/optix_device/src/ray.rs
@@ -1,7 +1,7 @@
 use crate::trace::*;
 #[cfg(target_os = "cuda")]
 use core::arch::asm;
-use cuda_std::gpu_only;
+use cuda_std::{glam, gpu_only};
 use glam::Vec3;
 
 /// Returns the ray origin that was passed into [`trace`] in world-space.

diff --git a/crates/optix_device/src/sys.rs b/crates/optix_device/src/sys.rs
@@ -3,7 +3,7 @@
 use crate::trace::{RayFlags, TraversableHandle};
 #[cfg(target_os = "cuda")]
 use core::arch::asm;
-use cuda_std::gpu_only;
+use cuda_std::{glam, gpu_only};
 use glam::Vec3;
 use paste::paste;
 

diff --git a/examples/cuda/cpu/path_tracer/Cargo.toml b/examples/cuda/cpu/path_tracer/Cargo.toml
@@ -4,9 +4,9 @@ version = "0.1.0"
 edition = "2018"
 
 [dependencies]
-vek = { version = "0.17.1", features = ["bytemuck", "mint"] }
+glam = { version = "0.30.1", features = ["bytemuck", "cuda"] }
 bytemuck = { version = "1.21", features = ["derive"] }
-cust = { version = "0.3", path = "../../../../crates/cust", features = ["impl_vek"] }
+cust = { version = "0.3", path = "../../../../crates/cust", features = ["impl_glam"] }
 image = "0.25.5"
 path_tracer_gpu = { path = "../../gpu/path_tracer_gpu" }
 gpu_rand = { version = "0.1", path = "../../../../crates/gpu_rand" }

diff --git a/examples/cuda/cpu/path_tracer/src/common.rs b/examples/cuda/cpu/path_tracer/src/common.rs
@@ -1,14 +1,14 @@
+use glam::{USizeVec2, Vec2, Vec3};
 use glium::glutin::event::{
     ElementState, Event, MouseButton, MouseScrollDelta, VirtualKeyCode, WindowEvent,
 };
 use path_tracer_gpu::Viewport;
-use vek::{Vec2, Vec3};
 
 #[derive(Debug, Clone, Copy, PartialEq)]
 pub struct Camera {
-    pub origin: Vec3<f32>,
-    pub lookat: Vec3<f32>,
-    pub vup: Vec3<f32>,
+    pub origin: Vec3,
+    pub lookat: Vec3,
+    pub vup: Vec3,
     pub fov: f32,
     pub aspect_ratio: f32,
 }
@@ -43,7 +43,7 @@ pub struct CameraController {
 }
 
 impl CameraController {
-    pub fn new(dimensions: Vec2<usize>) -> Self {
+    pub fn new(dimensions: USizeVec2) -> Self {
         CameraController {
             sensitivity: 0.1,
             last_mouse_pos: dimensions.numcast().unwrap() / 2.0,

diff --git a/examples/cuda/cpu/path_tracer/src/cpu/mod.rs b/examples/cuda/cpu/path_tracer/src/cpu/mod.rs
@@ -1,19 +1,19 @@
 use std::time::Duration;
 
+use glam::{U8Vec3, USizeVec2, Vec2, Vec3};
 use gpu_rand::{DefaultRand, GpuRand};
 use imgui::Ui;
 use path_tracer_gpu::{
     material::MaterialKind, render::generate_ray, scene::Scene, Object, Viewport,
 };
 use rayon::prelude::*;
 use sysinfo::System;
-use vek::{Clamp, Vec2, Vec3};
 
 use crate::{common::Camera, cuda::SEED};
 
 pub struct CpuRenderer {
     // this is basically the cuda buffers but not gpu buffers.
-    accumulated_buffer: Vec<Vec3<f32>>,
-    out_buffer: Vec<Vec3<u8>>,
+    accumulated_buffer: Vec<Vec3>,
+    out_buffer: Vec<U8Vec3>,
 
     viewport: Viewport,
@@ -23,7 +23,7 @@ pub struct CpuRenderer {
 }
 
 impl CpuRenderer {
-    pub fn new(dimensions: Vec2<usize>, camera: &Camera, scene: &Scene) -> Self {
-        let accumulated_buffer = vec![Vec3::zero(); dimensions.product()];
-        let out_buffer = vec![Vec3::zero(); dimensions.product()];
+    pub fn new(dimensions: USizeVec2, camera: &Camera, scene: &Scene) -> Self {
+        let accumulated_buffer = vec![Vec3::ZERO; dimensions.element_product()];
+        let out_buffer = vec![U8Vec3::ZERO; dimensions.element_product()];
 
@@ -67,7 +67,7 @@ impl CpuRenderer {
         new_camera.as_viewport(&mut self.viewport);
     }
 
-    pub fn resize(&mut self, dimensions: Vec2<usize>) {
+    pub fn resize(&mut self, dimensions: USizeVec2) {
         self.accumulated_buffer
-            .resize(dimensions.product(), Vec3::zero());
-        self.out_buffer.resize(dimensions.product(), Vec3::zero());
+            .resize(dimensions.element_product(), Vec3::ZERO);
+        self.out_buffer.resize(dimensions.element_product(), U8Vec3::ZERO);

diff --git a/examples/cuda/cpu/path_tracer/src/cuda/data.rs b/examples/cuda/cpu/path_tracer/src/cuda/data.rs
@@ -5,9 +5,9 @@ use cust::{
     memory::{DeviceBuffer, DeviceCopy, UnifiedBuffer},
     util::SliceExt,
 };
+use glam::{Vec2, Vec3};
 use gpu_rand::DefaultRand;
 use path_tracer_gpu::{material::MaterialKind, scene::Scene, Object, Viewport};
-use vek::{Vec2, Vec3};
 
 use super::SEED;
 

diff --git a/examples/cuda/cpu/path_tracer/src/main.rs b/examples/cuda/cpu/path_tracer/src/main.rs
@@ -6,14 +6,14 @@ pub mod renderer;
 pub mod viewer;
 
 use common::Camera;
+use glam::Vec3;
 use path_tracer_gpu::{
     material::{DielectricMaterial, DiffuseMaterial, MaterialKind, MetallicMaterial},
     scene::Scene,
     sphere::Sphere,
     Object,
 };
 use std::error::Error;
-use vek::Vec3;
 
 pub const WIDTH: u32 = 1920;
 pub const HEIGHT: u32 = 1080;

diff --git a/examples/cuda/gpu/path_tracer_gpu/src/lib.rs b/examples/cuda/gpu/path_tracer_gpu/src/lib.rs
@@ -11,20 +11,19 @@ pub mod render_kernels;
 pub mod scene;
 pub mod sphere;
 
-pub use cuda_std::vek;
+pub use cuda_std::glam;
 use cust_core::DeviceCopy;
 use enum_dispatch::enum_dispatch;
 use hittable::{HitRecord, Hittable};
 use sphere::Sphere;
 
-pub type Vec3<T = f32> = vek::Vec3<T>;
-pub type Point<T = f32> = vek::Vec3<T>;
-pub type Vec2<T = f32> = vek::Vec2<T>;
+use glam::{USizeVec2, Vec2, Vec3};
+pub type Point = Vec3;
 
 #[derive(Default, Clone, Copy, DeviceCopy)]
 #[repr(C)]
 pub struct Viewport {
-    pub bounds: vek::Vec2<usize>,
+    pub bounds: USizeVec2,
     pub lower_left: Vec3,
     pub horizontal: Vec3,
     pub vertical: Vec3,

diff --git a/examples/cuda/gpu/path_tracer_gpu/src/render.rs b/examples/cuda/gpu/path_tracer_gpu/src/render.rs
@@ -8,7 +8,7 @@ pub fn color(ray: Ray) -> Vec3 {
-    (1.0 - t) * Vec3::one() + t * Vec3::new(0.5, 0.7, 1.0)
+    (1.0 - t) * Vec3::ONE + t * Vec3::new(0.5, 0.7, 1.0)
 }
 
-pub fn generate_ray(idx: vek::Vec2<u32>, view: &Viewport, offset: Vec2) -> Ray {
-    let uv = (idx.numcast::<f32>().unwrap() + offset) / view.bounds.numcast().unwrap();
+pub fn generate_ray(idx: glam::UVec2, view: &Viewport, offset: Vec2) -> Ray {
+    let uv = (idx.as_vec2() + offset) / view.bounds.as_vec2();
     Ray {
         origin: view.origin,

diff --git a/examples/cuda/gpu/path_tracer_gpu/src/render_kernels.rs b/examples/cuda/gpu/path_tracer_gpu/src/render_kernels.rs
@@ -1,5 +1,6 @@
 use crate::{render::*, scene::Scene, *};
-use cuda_std::{vek::Clamp, *};
+use cuda_std::*;
+use glam::{U8Vec3, Vec2, Vec3};
 use gpu_rand::{DefaultRand, GpuRand};
 
 #[kernel]
@@ -38,7 +39,7 @@ pub unsafe fn scale_buffer(fb: *const Vec3, out: *mut Vec3, samples: u32, view:
 
 /// Postprocesses a (scaled) buffer into a final u8 buffer.
 #[kernel]
-pub unsafe fn postprocess(fb: *const Vec3, out: *mut vek::Vec3<u8>, view: Viewport) {
+pub unsafe fn postprocess(fb: *const Vec3, out: *mut U8Vec3, view: Viewport) {
     let idx_2d = thread::index_2d();
     if idx_2d.x >= view.bounds.x as u32 || idx_2d.y >= view.bounds.y as u32 {
         return;
@@ -50,7 +51,6 @@ pub unsafe fn postprocess(fb: *const Vec3, out: *mut vek::Vec3<u8>, view: Viewpo
     let gamma_corrected = original.sqrt();
 
     *out = (gamma_corrected * 255.0)
-        .clamped(Vec3::zero(), Vec3::broadcast(255.0))
-        .numcast()
-        .unwrap();
+        .clamp(Vec3::ZERO, Vec3::splat(255.0))
+        .as_u8vec3();
 }