|
| 1 | +use bevy_ecs::prelude::*; |
| 2 | +use core::arch::x86_64::*; |
| 3 | +use glam::*; |
| 4 | +use rand::prelude::*; |
| 5 | + |
| 6 | +use criterion::BenchmarkId; |
| 7 | +use criterion::Criterion; |
| 8 | + |
/// World-space position of an entity undergoing collision tests.
#[derive(Component, Copy, Clone, Default)]
struct Position(Vec3);
| 11 | + |
/// Health pool of an entity (initialized to 100.0 in `Benchmark::new`).
/// `repr(transparent)` guarantees this has the exact layout of a bare `f32`,
/// which the AVX path relies on when reinterpreting a batch of `Health`
/// values as a `*mut f32` for `_mm256_loadu_ps`/`_mm256_storeu_ps`.
#[derive(Component, Copy, Clone, Default)]
#[repr(transparent)]
struct Health(f32);
| 15 | + |
// A hyperplane describing solid geometry: field 0 is the (unit) normal `n`,
// field 1 is the displacement/discriminant `d`. The collision systems below
// treat a point `p` with `n.dot(p) < d` as outside the wall (taking damage).
#[derive(Component, Copy, Clone, Default)]
struct Wall(Vec3, f32);
| 19 | + |
| 20 | +struct Benchmark(World); |
| 21 | + |
| 22 | +fn rnd_vec3(rng: &mut ThreadRng) -> Vec3 { |
| 23 | + let x1 = rng.gen_range(-16.0..=16.0); |
| 24 | + let x2 = rng.gen_range(-16.0..=16.0); |
| 25 | + let x3 = rng.gen_range(-16.0..=16.0); |
| 26 | + |
| 27 | + Vec3::new(x1, x2, x3) |
| 28 | +} |
| 29 | + |
| 30 | +fn rnd_wall(rng: &mut ThreadRng) -> Wall { |
| 31 | + let d = rng.gen_range(-16.0..=16.0); |
| 32 | + |
| 33 | + Wall(rnd_vec3(rng).normalize_or_zero(), d) |
| 34 | +} |
| 35 | + |
// AoS to SoA data layout conversion for x86 AVX: converts 8 packed Vec3s
// (x0 y0 z0 x1 y1 z1 ...) into three 8-lane registers [all x's, all y's, all z's].
// This code has been adapted from:
// https://www.intel.com/content/dam/develop/external/us/en/documents/normvec-181650.pdf
#[inline(always)]
// This example is written in a way that benefits from inlined data layout conversion.
fn aos_to_soa_83(aos_inner: &[Vec3; 8]) -> [__m256; 3] {
    unsafe {
        // # SAFETY: Vec3 is repr(C) for x86_64, so the 8 Vec3s are 24
        // contiguous f32s; the six 4-wide unaligned loads below read floats
        // 0..24 and stay within the array's bounds.
        let mx0 = _mm_loadu_ps((aos_inner as *const Vec3 as *const f32).offset(0));
        let mx1 = _mm_loadu_ps((aos_inner as *const Vec3 as *const f32).offset(4));
        let mx2 = _mm_loadu_ps((aos_inner as *const Vec3 as *const f32).offset(8));
        let mx3 = _mm_loadu_ps((aos_inner as *const Vec3 as *const f32).offset(12));
        let mx4 = _mm_loadu_ps((aos_inner as *const Vec3 as *const f32).offset(16));
        let mx5 = _mm_loadu_ps((aos_inner as *const Vec3 as *const f32).offset(20));

        // Combine the six 128-bit loads into three 256-bit registers.
        let mut m03 = _mm256_castps128_ps256(mx0); // load lower halves
        let mut m14 = _mm256_castps128_ps256(mx1);
        let mut m25 = _mm256_castps128_ps256(mx2);
        m03 = _mm256_insertf128_ps(m03, mx3, 1); // load upper halves
        m14 = _mm256_insertf128_ps(m14, mx4, 1);
        m25 = _mm256_insertf128_ps(m25, mx5, 1);

        // Shuffle constants follow the Intel paper linked above; they select
        // and interleave lanes so each output register holds one coordinate.
        let xy = _mm256_shuffle_ps::<0b10011110>(m14, m25); // upper x's and y's
        let yz = _mm256_shuffle_ps::<0b01001001>(m03, m14); // lower y's and z's
        let x = _mm256_shuffle_ps::<0b10001100>(m03, xy);
        let y = _mm256_shuffle_ps::<0b11011000>(yz, xy);
        let z = _mm256_shuffle_ps::<0b11001101>(yz, m25);
        [x, y, z]
    }
}
| 66 | + |
impl Benchmark {
    /// Builds a world containing `size` (Position, Health) entities at random
    /// positions (health 100.0) and 2^12 - 1 randomly oriented walls.
    fn new(size: i32) -> Benchmark {
        let mut world = World::new();

        let mut rng = rand::thread_rng();

        world.spawn_batch((0..size).map(|_| (Position(rnd_vec3(&mut rng)), Health(100.0))));
        world.spawn_batch((0..(2_i32.pow(12) - 1)).map(|_| (rnd_wall(&mut rng))));

        Self(world)
    }

    /// Scalar baseline system: tests every (Position, Health) pair against
    /// every Wall one entity at a time, decrementing health by 1.0 per wall
    /// the entity lies outside of.
    fn scalar(mut pos_healths: Query<(&Position, &mut Health)>, walls: Query<&Wall>) {
        pos_healths.for_each_mut(|(position, mut health)| {
            // This forms the scalar path: it behaves just like `for_each_mut`.

            // Optional: disable change detection for more performance.
            let health = &mut health.bypass_change_detection().0;

            // Test each (Position,Health) against each Wall.
            walls.for_each(|wall| {
                let plane = wall.0;

                // Test which side of the wall we are on
                let dotproj = plane.dot(position.0);

                // Test against the Wall's displacement/discriminant value
                if dotproj < wall.1 {
                    //Ouch! Take damage!
                    *health -= 1.0;
                }
            });
        });
    }

    // Perform collision detection against a set of Walls, forming a convex polygon.
    // Each entity has a Position and some Health (initialized to 100.0).
    // If the position of an entity is found to be outside of a Wall, decrement its "health" by 1.0.
    // The effect is cumulative based on the number of walls.
    // An entity entirely inside the convex polygon will have its health remain unchanged.
    fn batched_avx(mut pos_healths: Query<(&Position, &mut Health)>, walls: Query<&Wall>) {
        // Conceptually, this system is executed using two loops: the outer "batched" loop receiving
        // batches of 8 Positions and Health components at a time, and the inner loop iterating over
        // the Walls.

        // There's more than one way to vectorize this system -- this example may not be optimal.
        pos_healths.for_each_mut_batched::<8>(
            |(position, mut health)| {
                // This forms the scalar path: it behaves just like `for_each_mut`.
                // It handles any remainder entities that don't fill a full batch of 8.

                // Optional: disable change detection for more performance.
                let health = &mut health.bypass_change_detection().0;

                // Test each (Position,Health) against each Wall.
                walls.for_each(|wall| {
                    let plane = wall.0;

                    // Test which side of the wall we are on
                    let dotproj = plane.dot(position.0);

                    // Test against the Wall's displacement/discriminant value
                    if dotproj < wall.1 {
                        //Ouch! Take damage!
                        *health -= 1.0;
                    }
                });
            },
            |(positions, mut healths)| {
                // This forms the vector path: the closure receives a batch of
                // 8 Positions and 8 Healths as arrays.

                // Optional: disable change detection for more performance.
                let healths = healths.bypass_change_detection();

                // Treat the Health batch as a batch of 8 f32s.
                unsafe {
                    // # SAFETY: Health is repr(transparent) over f32, so 8
                    // consecutive Healths are 8 consecutive f32s.
                    let healths_raw = healths as *mut Health as *mut f32;
                    let mut healths = _mm256_loadu_ps(healths_raw);

                    // NOTE: array::map optimizes poorly -- it is recommended to unpack your arrays
                    // manually as shown to avoid spurious copies which will impact your performance.
                    let [p0, p1, p2, p3, p4, p5, p6, p7] = positions;

                    // Perform data layout conversion from AoS to SoA.
                    // ps_x will receive all of the X components of the positions,
                    // ps_y will receive all of the Y components
                    // and ps_z will receive all of the Z's.
                    let [ps_x, ps_y, ps_z] =
                        aos_to_soa_83(&[p0.0, p1.0, p2.0, p3.0, p4.0, p5.0, p6.0, p7.0]);

                    // Iterate over each wall without batching.
                    walls.for_each(|wall| {
                        // Test each wall against all 8 positions at once. The "broadcast" intrinsic
                        // helps us achieve this by duplicating the Wall's X coordinate over an entire
                        // vector register, e.g., [X X ... X]. The same goes for the Wall's Y and Z
                        // coordinates.

                        // This is the exact same formula as implemented in the scalar path, but
                        // modified to be calculated in parallel across each lane.

                        // Multiply all of the X coordinates of each Position against Wall's Normal X
                        let xs_dot = _mm256_mul_ps(ps_x, _mm256_broadcast_ss(&wall.0.x));
                        // Multiply all of the Y coordinates of each Position against Wall's Normal Y
                        let ys_dot = _mm256_mul_ps(ps_y, _mm256_broadcast_ss(&wall.0.y));
                        // Multiply all of the Z coordinates of each Position against Wall's Normal Z
                        let zs_dot = _mm256_mul_ps(ps_z, _mm256_broadcast_ss(&wall.0.z));

                        // Now add them together: the result is a vector register containing the dot
                        // product of each Position against the Wall's Normal vector.
                        let dotprojs = _mm256_add_ps(_mm256_add_ps(xs_dot, ys_dot), zs_dot);

                        // Take the Wall's discriminant/displacement value and broadcast it like before.
                        let wall_d = _mm256_broadcast_ss(&wall.1);

                        // Compare each dot product against the Wall's discriminant, using the
                        // "Less Than" relation as we did in the scalar code.
                        // The result will be either all-ones or all-zeros per lane, which
                        // reinterprets *as an integer* as -1 or 0.
                        let cmp = _mm256_cmp_ps::<_CMP_LT_OS>(dotprojs, wall_d);

                        // Convert the integer values back to f32 values (-1.0 or 0.0).
                        // These form the damage values for each entity.
                        let damages = _mm256_cvtepi32_ps(_mm256_castps_si256(cmp)); //-1.0 or 0.0

                        // Update the healths of each entity being processed with the results of the
                        // collision detection.
                        healths = _mm256_add_ps(healths, damages);
                    });
                    // Now that all Walls have been processed, write the final updated Health values
                    // for this batch of entities back to main memory.
                    _mm256_storeu_ps(healths_raw, healths);
                }
            },
        );
    }
}
| 203 | + |
| 204 | +pub fn batched_compute(c: &mut Criterion) { |
| 205 | + let mut group = c.benchmark_group("batched_compute"); |
| 206 | + group.warm_up_time(std::time::Duration::from_secs(1)); |
| 207 | + group.measurement_time(std::time::Duration::from_secs(9)); |
| 208 | + |
| 209 | + for exp in 14..17 { |
| 210 | + let size = 2_i32.pow(exp) - 1; //Ensure scalar path gets run too (incomplete batch at end) |
| 211 | + |
| 212 | + group.bench_with_input( |
| 213 | + BenchmarkId::new("autovectorized", size), |
| 214 | + &size, |
| 215 | + |b, &size| { |
| 216 | + let Benchmark(mut world) = Benchmark::new(size); |
| 217 | + |
| 218 | + let mut system = IntoSystem::into_system(Benchmark::scalar); |
| 219 | + system.initialize(&mut world); |
| 220 | + system.update_archetype_component_access(&world); |
| 221 | + |
| 222 | + b.iter(move || system.run((), &mut world)); |
| 223 | + }, |
| 224 | + ); |
| 225 | + |
| 226 | + group.bench_with_input(BenchmarkId::new("batched_avx", size), &size, |b, &size| { |
| 227 | + let Benchmark(mut world) = Benchmark::new(size); |
| 228 | + |
| 229 | + let mut system = IntoSystem::into_system(Benchmark::batched_avx); |
| 230 | + system.initialize(&mut world); |
| 231 | + system.update_archetype_component_access(&world); |
| 232 | + |
| 233 | + b.iter(move || system.run((), &mut world)); |
| 234 | + }); |
| 235 | + } |
| 236 | + |
| 237 | + group.finish(); |
| 238 | +} |
0 commit comments