Commit 35ab799
Implement batched query support
PR #6161, Issue #1990

* Only Dense queries are accelerated currently
* Code refactored to use GATs
* Batch type does not encode alignment as per discussion
* Simplified calling for_each_{mut_}batched (no longer need _ arguments)
* Documentation about SIMD and batching
* Added an example and an AVX-specific benchmark
1 parent 39e14a4 commit 35ab799
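
For orientation, the new for_each_mut_batched API takes a compile-time batch width and two closures: a scalar path, used for entities that do not form a complete batch (e.g. the incomplete batch at the end), and a vector path, called with one full batch at a time. A minimal sketch of the calling shape, distilled from the benchmark added below (the system name my_system is illustrative and the closure bodies are elided):

fn my_system(mut query: Query<(&Position, &mut Health)>) {
    // Illustrative system; a batch width of 8 matches one AVX register of eight f32s,
    // as in the benchmark below.
    query.for_each_mut_batched::<8>(
        // Scalar path: behaves just like a plain `for_each_mut` closure.
        |(position, mut health)| {
            // ... per-entity logic ...
        },
        // Vector path: receives a batch of 8 Positions and 8 Healths at a time.
        |(positions, mut healths)| {
            // ... process the whole batch, e.g. with SIMD intrinsics ...
        },
    );
}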

18 files changed: +1660 -43 lines changed
benches/benches/bevy_ecs/iteration/batched_compute.rs

+238
@@ -0,0 +1,238 @@
use bevy_ecs::prelude::*;
use core::arch::x86_64::*;
use glam::*;
use rand::prelude::*;

use criterion::BenchmarkId;
use criterion::Criterion;

#[derive(Component, Copy, Clone, Default)]
struct Position(Vec3);

#[derive(Component, Copy, Clone, Default)]
#[repr(transparent)]
struct Health(f32);

// A hyperplane describing solid geometry: a normal n = (x, y, z) and a displacement d such that dot(n, x) + d = 0
#[derive(Component, Copy, Clone, Default)]
struct Wall(Vec3, f32);

struct Benchmark(World);

fn rnd_vec3(rng: &mut ThreadRng) -> Vec3 {
    let x1 = rng.gen_range(-16.0..=16.0);
    let x2 = rng.gen_range(-16.0..=16.0);
    let x3 = rng.gen_range(-16.0..=16.0);

    Vec3::new(x1, x2, x3)
}

fn rnd_wall(rng: &mut ThreadRng) -> Wall {
    let d = rng.gen_range(-16.0..=16.0);

    Wall(rnd_vec3(rng).normalize_or_zero(), d)
}

// AoS to SoA data layout conversion for x86 AVX.
// This code has been adapted from:
// https://www.intel.com/content/dam/develop/external/us/en/documents/normvec-181650.pdf
#[inline(always)]
// This example is written in a way that benefits from inlined data layout conversion.
fn aos_to_soa_83(aos_inner: &[Vec3; 8]) -> [__m256; 3] {
    unsafe {
        // # SAFETY: Vec3 is repr(C) for x86_64
        let mx0 = _mm_loadu_ps((aos_inner as *const Vec3 as *const f32).offset(0));
        let mx1 = _mm_loadu_ps((aos_inner as *const Vec3 as *const f32).offset(4));
        let mx2 = _mm_loadu_ps((aos_inner as *const Vec3 as *const f32).offset(8));
        let mx3 = _mm_loadu_ps((aos_inner as *const Vec3 as *const f32).offset(12));
        let mx4 = _mm_loadu_ps((aos_inner as *const Vec3 as *const f32).offset(16));
        let mx5 = _mm_loadu_ps((aos_inner as *const Vec3 as *const f32).offset(20));

        let mut m03 = _mm256_castps128_ps256(mx0); // load lower halves
        let mut m14 = _mm256_castps128_ps256(mx1);
        let mut m25 = _mm256_castps128_ps256(mx2);
        m03 = _mm256_insertf128_ps(m03, mx3, 1); // load upper halves
        m14 = _mm256_insertf128_ps(m14, mx4, 1);
        m25 = _mm256_insertf128_ps(m25, mx5, 1);

        let xy = _mm256_shuffle_ps::<0b10011110>(m14, m25); // upper x's and y's
        let yz = _mm256_shuffle_ps::<0b01001001>(m03, m14); // lower y's and z's
        let x = _mm256_shuffle_ps::<0b10001100>(m03, xy);
        let y = _mm256_shuffle_ps::<0b11011000>(yz, xy);
        let z = _mm256_shuffle_ps::<0b11001101>(yz, m25);
        [x, y, z]
    }
}

impl Benchmark {
    fn new(size: i32) -> Benchmark {
        let mut world = World::new();

        let mut rng = rand::thread_rng();

        world.spawn_batch((0..size).map(|_| (Position(rnd_vec3(&mut rng)), Health(100.0))));
        world.spawn_batch((0..(2_i32.pow(12) - 1)).map(|_| (rnd_wall(&mut rng))));

        Self(world)
    }

    fn scalar(mut pos_healths: Query<(&Position, &mut Health)>, walls: Query<&Wall>) {
        pos_healths.for_each_mut(|(position, mut health)| {
            // This forms the scalar path: it behaves just like `for_each_mut`.

            // Optional: disable change detection for more performance.
            let health = &mut health.bypass_change_detection().0;

            // Test each (Position, Health) against each Wall.
            walls.for_each(|wall| {
                let plane = wall.0;

                // Test which side of the wall we are on
                let dotproj = plane.dot(position.0);

                // Test against the Wall's displacement/discriminant value
                if dotproj < wall.1 {
                    // Ouch! Take damage!
                    *health -= 1.0;
                }
            });
        });
    }

    // Perform collision detection against a set of Walls, forming a convex polygon.
    // Each entity has a Position and some Health (initialized to 100.0).
    // If the position of an entity is found to be outside of a Wall, decrement its "health" by 1.0.
    // The effect is cumulative based on the number of walls.
    // An entity entirely inside the convex polygon will have its health remain unchanged.
    fn batched_avx(mut pos_healths: Query<(&Position, &mut Health)>, walls: Query<&Wall>) {
        // Conceptually, this system is executed using two loops: the outer "batched" loop receiving
        // batches of 8 Positions and Health components at a time, and the inner loop iterating over
        // the Walls.

        // There's more than one way to vectorize this system -- this example may not be optimal.
        pos_healths.for_each_mut_batched::<8>(
            |(position, mut health)| {
                // This forms the scalar path: it behaves just like `for_each_mut`.

                // Optional: disable change detection for more performance.
                let health = &mut health.bypass_change_detection().0;

                // Test each (Position, Health) against each Wall.
                walls.for_each(|wall| {
                    let plane = wall.0;

                    // Test which side of the wall we are on
                    let dotproj = plane.dot(position.0);

                    // Test against the Wall's displacement/discriminant value
                    if dotproj < wall.1 {
                        // Ouch! Take damage!
                        *health -= 1.0;
                    }
                });
            },
            |(positions, mut healths)| {
                // This forms the vector path: the closure receives a batch of
                // 8 Positions and 8 Healths as arrays.

                // Optional: disable change detection for more performance.
                let healths = healths.bypass_change_detection();

                // Treat the Health batch as a batch of 8 f32s.
                unsafe {
                    // # SAFETY: Health is repr(transparent)!
                    let healths_raw = healths as *mut Health as *mut f32;
                    let mut healths = _mm256_loadu_ps(healths_raw);

                    // NOTE: array::map optimizes poorly -- it is recommended to unpack your arrays
                    // manually as shown to avoid spurious copies which will impact your performance.
                    let [p0, p1, p2, p3, p4, p5, p6, p7] = positions;

                    // Perform data layout conversion from AoS to SoA.
                    // ps_x will receive all of the X components of the positions,
                    // ps_y will receive all of the Y components
                    // and ps_z will receive all of the Z's.
                    let [ps_x, ps_y, ps_z] =
                        aos_to_soa_83(&[p0.0, p1.0, p2.0, p3.0, p4.0, p5.0, p6.0, p7.0]);

                    // Iterate over each wall without batching.
                    walls.for_each(|wall| {
                        // Test each wall against all 8 positions at once. The "broadcast" intrinsic
                        // helps us achieve this by duplicating the Wall's X coordinate over an entire
                        // vector register, e.g., [X X ... X]. The same goes for the Wall's Y and Z
                        // coordinates.

                        // This is the exact same formula as implemented in the scalar path, but
                        // modified to be calculated in parallel across each lane.

                        // Multiply all of the X coordinates of each Position against Wall's Normal X
                        let xs_dot = _mm256_mul_ps(ps_x, _mm256_broadcast_ss(&wall.0.x));
                        // Multiply all of the Y coordinates of each Position against Wall's Normal Y
                        let ys_dot = _mm256_mul_ps(ps_y, _mm256_broadcast_ss(&wall.0.y));
                        // Multiply all of the Z coordinates of each Position against Wall's Normal Z
                        let zs_dot = _mm256_mul_ps(ps_z, _mm256_broadcast_ss(&wall.0.z));

                        // Now add them together: the result is a vector register containing the dot
                        // product of each Position against the Wall's Normal vector.
                        let dotprojs = _mm256_add_ps(_mm256_add_ps(xs_dot, ys_dot), zs_dot);

                        // Take the Wall's discriminant/displacement value and broadcast it like before.
                        let wall_d = _mm256_broadcast_ss(&wall.1);

                        // Compare each dot product against the Wall's discriminant, using the
                        // "Less Than" relation as we did in the scalar code.
                        // The result will be either -1 or zero *as an integer*.
                        let cmp = _mm256_cmp_ps::<_CMP_LT_OS>(dotprojs, wall_d);

                        // Convert the integer values back to f32 values (-1.0 or 0.0).
                        // These form the damage values for each entity.
                        let damages = _mm256_cvtepi32_ps(_mm256_castps_si256(cmp)); // -1.0 or 0.0

                        // Update the healths of each entity being processed with the results of the
                        // collision detection.
                        healths = _mm256_add_ps(healths, damages);
                    });
                    // Now that all Walls have been processed, write the final updated Health values
                    // for this batch of entities back to main memory.
                    _mm256_storeu_ps(healths_raw, healths);
                }
            },
        );
    }
}

pub fn batched_compute(c: &mut Criterion) {
    let mut group = c.benchmark_group("batched_compute");
    group.warm_up_time(std::time::Duration::from_secs(1));
    group.measurement_time(std::time::Duration::from_secs(9));

    for exp in 14..17 {
        let size = 2_i32.pow(exp) - 1; // Ensure the scalar path gets run too (incomplete batch at end)

        group.bench_with_input(
            BenchmarkId::new("autovectorized", size),
            &size,
            |b, &size| {
                let Benchmark(mut world) = Benchmark::new(size);

                let mut system = IntoSystem::into_system(Benchmark::scalar);
                system.initialize(&mut world);
                system.update_archetype_component_access(&world);

                b.iter(move || system.run((), &mut world));
            },
        );

        group.bench_with_input(BenchmarkId::new("batched_avx", size), &size, |b, &size| {
            let Benchmark(mut world) = Benchmark::new(size);

            let mut system = IntoSystem::into_system(Benchmark::batched_avx);
            system.initialize(&mut world);
            system.update_archetype_component_access(&world);

            b.iter(move || system.run((), &mut world));
        });
    }

    group.finish();
}
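
For reference, the per-lane effect of the compare/convert/add sequence in the vector path above is the same damage rule as the scalar system; a restatement of the benchmark's own comments as scalar code (the function per_lane is purely illustrative):

// Per lane: the comparison produces an all-ones bit pattern (-1 as an i32) when the
// position is outside the wall, or all zeros otherwise. Converting that integer to f32
// yields -1.0 or 0.0, which is then added to the running health value.
fn per_lane(dotproj: f32, wall_d: f32, health: f32) -> f32 {
    let damage = if dotproj < wall_d { -1.0 } else { 0.0 };
    health + damage
}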

benches/benches/bevy_ecs/iteration/mod.rs

+16
@@ -1,5 +1,8 @@
 use criterion::*;
 
+#[cfg(target_feature = "avx")]
+mod batched_compute;
+
 mod heavy_compute;
 mod iter_frag;
 mod iter_frag_foreach;
@@ -19,8 +22,21 @@ mod iter_simple_system;
 mod iter_simple_wide;
 mod iter_simple_wide_sparse_set;
 
+#[cfg(target_feature = "avx")]
+use batched_compute::batched_compute;
+
 use heavy_compute::*;
 
+#[cfg(target_feature = "avx")]
+criterion_group!(
+    iterations_benches,
+    iter_frag,
+    iter_frag_sparse,
+    iter_simple,
+    heavy_compute,
+    batched_compute,
+);
+#[cfg(not(target_feature = "avx"))]
 criterion_group!(
     iterations_benches,
     iter_frag,

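Note: batched_compute is compiled and registered in the benchmark group only when the avx target feature is enabled at build time, for example by running the benches with RUSTFLAGS="-C target-feature=+avx" (or "-C target-cpu=native" on an AVX-capable machine); otherwise the pre-existing criterion_group without it is used.
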
crates/bevy_ecs/Cargo.toml

+1
@@ -30,6 +30,7 @@ serde = { version = "1", features = ["derive"] }
 
 [dev-dependencies]
 rand = "0.8"
+bevy_math = { path = "../bevy_math", version = "0.9.0-dev" }
 
 [[example]]
 name = "events"

crates/bevy_ecs/src/archetype.rs

+6 -3
@@ -23,7 +23,10 @@ use crate::{
     bundle::BundleId,
     component::{ComponentId, StorageType},
     entity::{Entity, EntityLocation},
-    storage::{ImmutableSparseSet, SparseArray, SparseSet, SparseSetIndex, TableId, TableRow},
+    storage::{
+        aligned_vec::SimdAlignedVec, ImmutableSparseSet, SparseArray, SparseSet, SparseSetIndex,
+        TableId, TableRow,
+    },
 };
 use std::{
     collections::HashMap,
@@ -297,7 +300,7 @@
     id: ArchetypeId,
     table_id: TableId,
     edges: Edges,
-    entities: Vec<ArchetypeEntity>,
+    entities: SimdAlignedVec<ArchetypeEntity>,
     components: ImmutableSparseSet<ComponentId, ArchetypeComponentInfo>,
 }
 
@@ -333,7 +336,7 @@
         Self {
             id,
             table_id,
-            entities: Vec::new(),
+            entities: SimdAlignedVec::new(),
             components: components.into_immutable(),
             edges: Default::default(),
         }
