Skip to content

Commit 50aa66a

Browse files
cwabbott0 authored and Marge Bot committed
tu: Rewrite visibility stream allocation
The mechanism implemented in the hardware to synchronize against write-after-read (WAR) hazards on the visibility stream during concurrent binning is for BV and BR to each keep track of the number of render passes they have finished; BV then waits until BR_count >= BV_count - vis stream count. For example, if there are two visibility streams and the user submits three renderpasses, before starting renderpass #3 BV will wait for BR to finish renderpass #1. It's assumed that renderpass #3 and #2 use different visibility streams, so it's safe to start working on #3 once #2 is done. This mechanism is assumed to work across renderpasses and even submits, and the only way to reset the BR/BV counts is via CP_RESET_CONTEXT_STATE, which is only done by the kernel when switching contexts. This vastly complicates things for Vulkan, where we have no idea in what order command buffers will be submitted. This means that we have to defer emitting the actual pointers until submission time and create patchpoints instead. Unfortunately, this gets very complicated with SIMULTANEOUS_USE_BIT, where we have to update the patchpoints on the GPU. I've taken the liberty of also deferring the allocation of the visibility stream until submit time. This will help us later move to per-queue visibility streams, which will be necessary for supporting multiple simultaneous queues. Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36590>
1 parent 416dc87 commit 50aa66a

File tree

5 files changed

+358
-23
lines changed

5 files changed

+358
-23
lines changed

src/freedreno/vulkan/tu_cmd_buffer.cc

Lines changed: 145 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -201,42 +201,53 @@ tu6_lazy_init_vsc(struct tu_cmd_buffer *cmd)
201201

202202
mtx_unlock(&dev->mutex);
203203

204-
struct tu_bo *vsc_bo;
205204
uint32_t prim_strm_size = cmd->vsc_prim_strm_pitch * num_vsc_pipes;
206205
uint32_t draw_strm_size = cmd->vsc_draw_strm_pitch * num_vsc_pipes;
207206
uint32_t draw_strm_size_size = 4 * num_vsc_pipes;
208207
uint32_t state_size = 4 * num_vsc_pipes;
209208

210-
tu_get_scratch_bo(dev,
211-
prim_strm_size + draw_strm_size + draw_strm_size_size +
212-
state_size,
213-
&vsc_bo);
209+
cmd->vsc_size =
210+
prim_strm_size + draw_strm_size + draw_strm_size_size + state_size;
214211

215-
cmd->vsc_prim_strm_va = vsc_bo->iova;
216-
cmd->vsc_draw_strm_va = vsc_bo->iova + prim_strm_size;
217-
cmd->vsc_draw_strm_size_va = cmd->vsc_draw_strm_va + draw_strm_size;
218-
cmd->vsc_state_va = cmd->vsc_draw_strm_size_va + draw_strm_size_size;
212+
cmd->vsc_prim_strm_offset = 0;
213+
cmd->vsc_draw_strm_offset = prim_strm_size;
214+
cmd->vsc_draw_strm_size_offset = cmd->vsc_draw_strm_offset + draw_strm_size;
215+
cmd->vsc_state_offset = cmd->vsc_draw_strm_size_offset + draw_strm_size_size;
216+
}
217+
218+
static void
219+
tu_emit_vis_stream_patchpoint(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
220+
uint32_t offset)
221+
{
222+
struct tu_vis_stream_patchpoint patchpoint = {
223+
.data = cs->cur,
224+
.iova = tu_cs_get_cur_iova(cs),
225+
.offset = offset,
226+
};
227+
228+
util_dynarray_append(&cmd->vis_stream_patchpoints, patchpoint);
229+
tu_cs_emit_qw(cs, offset);
219230
}
220231

221232
template <chip CHIP>
222233
static void
223234
tu_emit_vsc(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
224235
{
225236
if (CHIP == A6XX) {
226-
tu_cs_emit_regs(cs,
227-
A6XX_VSC_SIZE_BASE(.qword = cmd->vsc_draw_strm_size_va));
228-
tu_cs_emit_regs(cs,
229-
A6XX_VSC_PIPE_DATA_PRIM_BASE(.qword = cmd->vsc_prim_strm_va));
230-
tu_cs_emit_regs(
231-
cs, A6XX_VSC_PIPE_DATA_DRAW_BASE(.qword = cmd->vsc_draw_strm_va));
237+
tu_cs_emit_pkt4(cs, REG_A6XX_VSC_SIZE_BASE, 2);
238+
tu_emit_vis_stream_patchpoint(cmd, cs, cmd->vsc_draw_strm_size_offset);
239+
tu_cs_emit_pkt4(cs, REG_A6XX_VSC_PIPE_DATA_PRIM_BASE, 2);
240+
tu_emit_vis_stream_patchpoint(cmd, cs, cmd->vsc_prim_strm_offset);
241+
tu_cs_emit_pkt4(cs, REG_A6XX_VSC_PIPE_DATA_DRAW_BASE, 2);
242+
tu_emit_vis_stream_patchpoint(cmd, cs, cmd->vsc_draw_strm_offset);
232243
} else {
233244
tu_cs_emit_pkt7(cs, CP_SET_PSEUDO_REG, 3 * 3);
234245
tu_cs_emit(cs, A6XX_CP_SET_PSEUDO_REG__0_PSEUDO_REG(VSC_PIPE_DATA_DRAW_BASE));
235-
tu_cs_emit_qw(cs, cmd->vsc_draw_strm_va);
246+
tu_emit_vis_stream_patchpoint(cmd, cs, cmd->vsc_draw_strm_offset);
236247
tu_cs_emit(cs, A6XX_CP_SET_PSEUDO_REG__0_PSEUDO_REG(VSC_SIZE_BASE));
237-
tu_cs_emit_qw(cs, cmd->vsc_draw_strm_size_va);
248+
tu_emit_vis_stream_patchpoint(cmd, cs, cmd->vsc_draw_strm_size_offset);
238249
tu_cs_emit(cs, A6XX_CP_SET_PSEUDO_REG__0_PSEUDO_REG(VSC_PIPE_DATA_PRIM_BASE));
239-
tu_cs_emit_qw(cs, cmd->vsc_prim_strm_va);
250+
tu_emit_vis_stream_patchpoint(cmd, cs, cmd->vsc_prim_strm_offset);
240251
}
241252

242253
cmd->vsc_initialized = true;
@@ -1278,7 +1289,13 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
12781289
A6XX_CP_SET_MARKER_0_USES_GMEM);
12791290

12801291
if (CHIP == A6XX && cmd->device->physical_device->has_preemption) {
1292+
if (cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)
1293+
tu_cs_set_writeable(cs, true);
1294+
12811295
tu_emit_vsc<CHIP>(cmd, &cmd->cs);
1296+
1297+
if (cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)
1298+
tu_cs_set_writeable(cs, false);
12821299
}
12831300

12841301
unsigned views = tu_fdm_num_layers(cmd);
@@ -2798,8 +2815,14 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
27982815
* emits the preamble lazily. We chose the per-bin approach but blob's
27992816
* should be a better one.
28002817
*/
2818+
if (cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)
2819+
tu_cs_set_writeable(cs, true);
2820+
28012821
tu_emit_vsc<CHIP>(cmd, cs);
28022822

2823+
if (cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)
2824+
tu_cs_set_writeable(cs, false);
2825+
28032826
tu6_emit_bin_size<CHIP>(cs, tiling->tile0.width, tiling->tile0.height,
28042827
{
28052828
.render_mode = BINNING_PASS,
@@ -2855,13 +2878,18 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
28552878
tu6_lazy_init_vsc(cmd);
28562879

28572880
/* Upload state regs to memory to be restored on skipsaverestore
2858-
* preemption.
2881+
* preemption. On a7xx this is considered part of the vis stream that
2882+
* requires a patchpoint.
28592883
*/
2884+
if (CHIP >= A7XX &&
2885+
(cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT))
2886+
tu_cs_set_writeable(cs, true);
2887+
28602888
tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
28612889
tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_VSC_CHANNEL_VISIBILITY(0)) |
28622890
CP_REG_TO_MEM_0_CNT(32));
28632891
if (CHIP >= A7XX)
2864-
tu_cs_emit_qw(cs, cmd->vsc_state_va);
2892+
tu_emit_vis_stream_patchpoint(cmd, cs, cmd->vsc_state_offset);
28652893
else
28662894
tu_cs_emit_qw(cs, global_iova(cmd, vsc_state));
28672895

@@ -2874,8 +2902,12 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
28742902
tu_cs_emit_pkt7(cs, CP_MEM_TO_SCRATCH_MEM, 4);
28752903
tu_cs_emit(cs, num_vsc_pipes); /* count */
28762904
tu_cs_emit(cs, 0); /* offset */
2877-
tu_cs_emit_qw(cs, cmd->vsc_state_va);
2905+
tu_emit_vis_stream_patchpoint(cmd, cs, cmd->vsc_state_offset);
28782906
}
2907+
2908+
if (CHIP >= A7XX &&
2909+
(cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT))
2910+
tu_cs_set_writeable(cs, false);
28792911
}
28802912

28812913
tu_autotune_begin_renderpass<CHIP>(cmd, cs, autotune_result);
@@ -3573,6 +3605,26 @@ tu_cmd_buffer_destroy(struct vk_command_buffer *vk_cmd_buffer)
35733605
ralloc_free(cmd_buffer->pre_chain.patchpoints_ctx);
35743606
util_dynarray_fini(&cmd_buffer->fdm_bin_patchpoints);
35753607
util_dynarray_fini(&cmd_buffer->pre_chain.fdm_bin_patchpoints);
3608+
util_dynarray_fini(&cmd_buffer->vis_stream_patchpoints);
3609+
3610+
util_dynarray_foreach (&cmd_buffer->vis_stream_bos, struct tu_bo *,
3611+
bo) {
3612+
tu_bo_finish(cmd_buffer->device, *bo);
3613+
}
3614+
3615+
mtx_lock(&cmd_buffer->device->vis_stream_suballocator_mtx);
3616+
util_dynarray_foreach (&cmd_buffer->vis_stream_cs_bos,
3617+
struct tu_vis_stream_patchpoint_cs,
3618+
bo) {
3619+
tu_suballoc_bo_free(&cmd_buffer->device->vis_stream_suballocator,
3620+
&bo->cs_bo);
3621+
tu_suballoc_bo_free(&cmd_buffer->device->vis_stream_suballocator,
3622+
&bo->fence_bo);
3623+
}
3624+
mtx_unlock(&cmd_buffer->device->vis_stream_suballocator_mtx);
3625+
3626+
util_dynarray_fini(&cmd_buffer->vis_stream_bos);
3627+
util_dynarray_fini(&cmd_buffer->vis_stream_cs_bos);
35763628

35773629
vk_command_buffer_finish(&cmd_buffer->vk);
35783630
vk_free2(&cmd_buffer->device->vk.alloc, &cmd_buffer->vk.pool->alloc,
@@ -3649,6 +3701,26 @@ tu_reset_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer,
36493701
cmd_buffer->pre_chain.patchpoints_ctx = NULL;
36503702
util_dynarray_clear(&cmd_buffer->fdm_bin_patchpoints);
36513703
util_dynarray_clear(&cmd_buffer->pre_chain.fdm_bin_patchpoints);
3704+
util_dynarray_clear(&cmd_buffer->vis_stream_patchpoints);
3705+
3706+
util_dynarray_foreach (&cmd_buffer->vis_stream_bos, struct tu_bo *,
3707+
bo) {
3708+
tu_bo_finish(cmd_buffer->device, *bo);
3709+
}
3710+
3711+
mtx_lock(&cmd_buffer->device->vis_stream_suballocator_mtx);
3712+
util_dynarray_foreach (&cmd_buffer->vis_stream_cs_bos,
3713+
struct tu_vis_stream_patchpoint_cs,
3714+
bo) {
3715+
tu_suballoc_bo_free(&cmd_buffer->device->vis_stream_suballocator,
3716+
&bo->cs_bo);
3717+
tu_suballoc_bo_free(&cmd_buffer->device->vis_stream_suballocator,
3718+
&bo->fence_bo);
3719+
}
3720+
mtx_unlock(&cmd_buffer->device->vis_stream_suballocator_mtx);
3721+
3722+
util_dynarray_clear(&cmd_buffer->vis_stream_bos);
3723+
util_dynarray_clear(&cmd_buffer->vis_stream_cs_bos);
36523724
}
36533725

36543726
const struct vk_command_buffer_ops tu_cmd_buffer_ops = {
@@ -5562,6 +5634,58 @@ tu_CmdExecuteCommands(VkCommandBuffer commandBuffer,
55625634
util_dynarray_append_dynarray(&cmd->fdm_bin_patchpoints,
55635635
&secondary->fdm_bin_patchpoints);
55645636
} else {
5637+
struct tu_cs *cs = &cmd->cs;
5638+
5639+
/* If the secondary can be used multiple times, we have to set its
5640+
* patchpoints on the GPU. Set them here, and create a new
5641+
* patchpoint pointing to the CP_MEM_WRITE packet. Otherwise just
5642+
* copy them over adjusting the index.
5643+
*/
5644+
bool simultaneous_use = secondary->usage_flags &
5645+
VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;
5646+
5647+
/* If this cmdbuf itself can be used multiple times in a submit then
5648+
* its patchpoint will also be updated on the GPU.
5649+
*/
5650+
if (cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)
5651+
tu_cs_set_writeable(cs, true);
5652+
5653+
util_dynarray_foreach (&secondary->vis_stream_patchpoints,
5654+
struct tu_vis_stream_patchpoint,
5655+
secondary_patchpoint) {
5656+
struct tu_vis_stream_patchpoint patchpoint =
5657+
*secondary_patchpoint;
5658+
5659+
if (simultaneous_use) {
5660+
tu_cs_reserve_space(cs, 5);
5661+
tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
5662+
tu_cs_emit_qw(cs, patchpoint.iova);
5663+
patchpoint.iova = tu_cs_get_cur_iova(cs);
5664+
patchpoint.data = cs->cur;
5665+
tu_cs_emit_qw(cs, 0);
5666+
}
5667+
5668+
util_dynarray_append(&cmd->vis_stream_patchpoints,
5669+
patchpoint);
5670+
}
5671+
5672+
if (cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)
5673+
tu_cs_set_writeable(cs, false);
5674+
5675+
if (simultaneous_use) {
5676+
tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
5677+
tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
5678+
5679+
/* Make BV wait for updates on BR to land */
5680+
if (cmd->device->physical_device->info->chip >= 7) {
5681+
tu_cs_emit_pkt7(cs, CP_THREAD_CONTROL, 1);
5682+
tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BR) |
5683+
CP_THREAD_CONTROL_0_SYNC_THREADS);
5684+
}
5685+
}
5686+
5687+
cmd->vsc_size = MAX2(cmd->vsc_size, secondary->vsc_size);
5688+
55655689
switch (secondary->state.suspend_resume) {
55665690
case SR_NONE:
55675691
assert(tu_cs_is_empty(&secondary->draw_cs));

src/freedreno/vulkan/tu_cmd_buffer.h

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -618,6 +618,10 @@ struct tu_cmd_buffer
618618
void *patchpoints_ctx;
619619
struct util_dynarray fdm_bin_patchpoints;
620620

621+
struct util_dynarray vis_stream_patchpoints;
622+
struct util_dynarray vis_stream_bos;
623+
struct util_dynarray vis_stream_cs_bos;
624+
621625
VkCommandBufferUsageFlags usage_flags;
622626

623627
VkQueryPipelineStatisticFlags inherited_pipeline_statistics;
@@ -686,8 +690,9 @@ struct tu_cmd_buffer
686690

687691
uint32_t vsc_draw_strm_pitch;
688692
uint32_t vsc_prim_strm_pitch;
689-
uint64_t vsc_draw_strm_va, vsc_draw_strm_size_va, vsc_prim_strm_va;
690-
uint64_t vsc_state_va;
693+
uint32_t vsc_draw_strm_offset, vsc_draw_strm_size_offset;
694+
uint32_t vsc_prim_strm_offset, vsc_state_offset;
695+
uint64_t vsc_size;
691696
bool vsc_initialized;
692697

693698
bool prev_fsr_is_null;
@@ -833,6 +838,16 @@ struct tu_fdm_bin_patchpoint {
833838
tu_fdm_bin_apply_t apply;
834839
};
835840

841+
struct tu_vis_stream_patchpoint {
842+
uint32_t *data;
843+
uint64_t iova;
844+
uint32_t offset;
845+
};
846+
847+
struct tu_vis_stream_patchpoint_cs {
848+
struct tu_suballoc_bo cs_bo;
849+
struct tu_suballoc_bo fence_bo;
850+
};
836851

837852
void
838853
tu_barrier(struct tu_cmd_buffer *cmd,

src/freedreno/vulkan/tu_device.cc

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2725,6 +2725,8 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
27252725
mtx_init(&device->radix_sort_mutex, mtx_plain);
27262726
mtx_init(&device->fiber_pvtmem_bo.mtx, mtx_plain);
27272727
mtx_init(&device->wave_pvtmem_bo.mtx, mtx_plain);
2728+
mtx_init(&device->vis_stream_mtx, mtx_plain);
2729+
mtx_init(&device->vis_stream_suballocator_mtx, mtx_plain);
27282730
mtx_init(&device->mutex, mtx_plain);
27292731
mtx_init(&device->copy_timestamp_cs_pool_mutex, mtx_plain);
27302732
#ifdef HAVE_PERFETTO
@@ -2853,6 +2855,13 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
28532855
getpagesize(), TU_BO_ALLOC_INTERNAL_RESOURCE,
28542856
"event_suballoc");
28552857

2858+
tu_bo_suballocator_init(
2859+
&device->vis_stream_suballocator, device,
2860+
getpagesize(),
2861+
(enum tu_bo_alloc_flags)(TU_BO_ALLOC_INTERNAL_RESOURCE |
2862+
TU_BO_ALLOC_ALLOW_DUMP),
2863+
"vis_stream_suballoc");
2864+
28562865
result = tu_bo_init_new(
28572866
device, NULL, &device->global_bo, global_size,
28582867
(enum tu_bo_alloc_flags) (TU_BO_ALLOC_ALLOW_DUMP |
@@ -3146,12 +3155,16 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
31463155
tu_bo_suballocator_finish(&device->autotune_suballoc);
31473156
tu_bo_suballocator_finish(&device->kgsl_profiling_suballoc);
31483157
tu_bo_suballocator_finish(&device->event_suballoc);
3158+
tu_bo_suballocator_finish(&device->vis_stream_suballocator);
31493159

31503160
tu_bo_finish(device, device->global_bo);
31513161

31523162
if (device->vm_bind_fence_fd != -1)
31533163
close(device->vm_bind_fence_fd);
31543164

3165+
if (device->vis_stream_bo)
3166+
tu_bo_finish(device, device->vis_stream_bo);
3167+
31553168
if (device->null_accel_struct_bo)
31563169
tu_bo_finish(device, device->null_accel_struct_bo);
31573170

src/freedreno/vulkan/tu_device.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -363,6 +363,11 @@ struct tu_device
363363
struct tu_suballocator *trace_suballoc;
364364
mtx_t trace_mutex;
365365

366+
/* VSC patchpoint BO suballocator.
367+
*/
368+
struct tu_suballocator vis_stream_suballocator;
369+
mtx_t vis_stream_suballocator_mtx;
370+
366371
/* the blob seems to always use 8K factor and 128K param sizes, copy them */
367372
#define TU_TESS_FACTOR_SIZE (8 * 1024)
368373
#define TU_TESS_PARAM_SIZE (128 * 1024)
@@ -433,6 +438,9 @@ struct tu_device
433438

434439
struct tu_cs_entry bin_preamble_entry, bin_preamble_bv_entry;
435440

441+
struct tu_bo *vis_stream_bo;
442+
mtx_t vis_stream_mtx;
443+
436444
struct util_dynarray dynamic_rendering_pending;
437445
VkCommandPool dynamic_rendering_pool;
438446
uint32_t dynamic_rendering_fence;

0 commit comments

Comments
 (0)