@@ -201,42 +201,53 @@ tu6_lazy_init_vsc(struct tu_cmd_buffer *cmd)
201201
202202 mtx_unlock (&dev->mutex );
203203
204- struct tu_bo *vsc_bo;
205204 uint32_t prim_strm_size = cmd->vsc_prim_strm_pitch * num_vsc_pipes;
206205 uint32_t draw_strm_size = cmd->vsc_draw_strm_pitch * num_vsc_pipes;
207206 uint32_t draw_strm_size_size = 4 * num_vsc_pipes;
208207 uint32_t state_size = 4 * num_vsc_pipes;
209208
210- tu_get_scratch_bo (dev,
211- prim_strm_size + draw_strm_size + draw_strm_size_size +
212- state_size,
213- &vsc_bo);
209+ cmd->vsc_size =
210+ prim_strm_size + draw_strm_size + draw_strm_size_size + state_size;
214211
215- cmd->vsc_prim_strm_va = vsc_bo->iova ;
216- cmd->vsc_draw_strm_va = vsc_bo->iova + prim_strm_size;
217- cmd->vsc_draw_strm_size_va = cmd->vsc_draw_strm_va + draw_strm_size;
218- cmd->vsc_state_va = cmd->vsc_draw_strm_size_va + draw_strm_size_size;
212+ cmd->vsc_prim_strm_offset = 0 ;
213+ cmd->vsc_draw_strm_offset = prim_strm_size;
214+ cmd->vsc_draw_strm_size_offset = cmd->vsc_draw_strm_offset + draw_strm_size;
215+ cmd->vsc_state_offset = cmd->vsc_draw_strm_size_offset + draw_strm_size_size;
216+ }
217+
/* Record a visibility-stream patchpoint at the current write position of
 * `cs` and emit a 64-bit placeholder there.
 *
 * cmd:    command buffer whose vis_stream_patchpoints array receives the
 *         new entry.
 * cs:     command stream being written; its current CPU write pointer
 *         (cs->cur) and current GPU address (tu_cs_get_cur_iova) are stored
 *         in the patchpoint.
 * offset: value stored in the patchpoint and also emitted as the
 *         placeholder qword. Callers visible in this diff pass the
 *         cmd->vsc_*_offset fields.
 *
 * NOTE(review): presumably the placeholder is later overwritten with the
 * address of the real visibility-stream buffer plus `offset`; the code that
 * resolves patchpoints is not visible here -- confirm.
 */
218+ static void
219+ tu_emit_vis_stream_patchpoint (struct tu_cmd_buffer *cmd, struct tu_cs *cs,
220+ uint32_t offset)
221+ {
/* Capture the location before emitting, so .data and .iova refer to the
 * qword written by tu_cs_emit_qw() below.
 */
222+ struct tu_vis_stream_patchpoint patchpoint = {
223+ .data = cs->cur ,
224+ .iova = tu_cs_get_cur_iova (cs),
225+ .offset = offset,
226+ };
227+
228+ util_dynarray_append (&cmd->vis_stream_patchpoints , patchpoint);
/* Emit the offset itself as the 64-bit placeholder value. */
229+ tu_cs_emit_qw (cs, offset);
219230}
220231
221232template <chip CHIP>
222233static void
223234tu_emit_vsc (struct tu_cmd_buffer *cmd, struct tu_cs *cs)
224235{
225236 if (CHIP == A6XX) {
226- tu_cs_emit_regs (cs,
227- A6XX_VSC_SIZE_BASE (. qword = cmd->vsc_draw_strm_size_va ) );
228- tu_cs_emit_regs (cs,
229- A6XX_VSC_PIPE_DATA_PRIM_BASE (. qword = cmd->vsc_prim_strm_va ) );
230- tu_cs_emit_regs (
231- cs, A6XX_VSC_PIPE_DATA_DRAW_BASE (. qword = cmd->vsc_draw_strm_va ) );
237+ tu_cs_emit_pkt4 (cs, REG_A6XX_VSC_SIZE_BASE, 2 );
238+ tu_emit_vis_stream_patchpoint (cmd, cs, cmd->vsc_draw_strm_size_offset );
239+ tu_cs_emit_pkt4 (cs, REG_A6XX_VSC_PIPE_DATA_PRIM_BASE, 2 );
240+ tu_emit_vis_stream_patchpoint (cmd, cs, cmd->vsc_prim_strm_offset );
241+ tu_cs_emit_pkt4 (cs, REG_A6XX_VSC_PIPE_DATA_DRAW_BASE, 2 );
242+ tu_emit_vis_stream_patchpoint (cmd, cs, cmd->vsc_draw_strm_offset );
232243 } else {
233244 tu_cs_emit_pkt7 (cs, CP_SET_PSEUDO_REG, 3 * 3 );
234245 tu_cs_emit (cs, A6XX_CP_SET_PSEUDO_REG__0_PSEUDO_REG (VSC_PIPE_DATA_DRAW_BASE));
235- tu_cs_emit_qw ( cs, cmd->vsc_draw_strm_va );
246+ tu_emit_vis_stream_patchpoint (cmd, cs, cmd->vsc_draw_strm_offset );
236247 tu_cs_emit (cs, A6XX_CP_SET_PSEUDO_REG__0_PSEUDO_REG (VSC_SIZE_BASE));
237- tu_cs_emit_qw ( cs, cmd->vsc_draw_strm_size_va );
248+ tu_emit_vis_stream_patchpoint (cmd, cs, cmd->vsc_draw_strm_size_offset );
238249 tu_cs_emit (cs, A6XX_CP_SET_PSEUDO_REG__0_PSEUDO_REG (VSC_PIPE_DATA_PRIM_BASE));
239- tu_cs_emit_qw ( cs, cmd->vsc_prim_strm_va );
250+ tu_emit_vis_stream_patchpoint (cmd, cs, cmd->vsc_prim_strm_offset );
240251 }
241252
242253 cmd->vsc_initialized = true ;
@@ -1278,7 +1289,13 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
12781289 A6XX_CP_SET_MARKER_0_USES_GMEM);
12791290
12801291 if (CHIP == A6XX && cmd->device ->physical_device ->has_preemption ) {
1292+ if (cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)
1293+ tu_cs_set_writeable (cs, true );
1294+
12811295 tu_emit_vsc<CHIP>(cmd, &cmd->cs );
1296+
1297+ if (cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)
1298+ tu_cs_set_writeable (cs, false );
12821299 }
12831300
12841301 unsigned views = tu_fdm_num_layers (cmd);
@@ -2798,8 +2815,14 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
27982815 * emits the preamble lazily. We chose the per-bin approach but blob's
27992816 * should be a better one.
28002817 */
2818+ if (cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)
2819+ tu_cs_set_writeable (cs, true );
2820+
28012821 tu_emit_vsc<CHIP>(cmd, cs);
28022822
2823+ if (cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)
2824+ tu_cs_set_writeable (cs, false );
2825+
28032826 tu6_emit_bin_size<CHIP>(cs, tiling->tile0 .width , tiling->tile0 .height ,
28042827 {
28052828 .render_mode = BINNING_PASS,
@@ -2855,13 +2878,18 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
28552878 tu6_lazy_init_vsc (cmd);
28562879
28572880 /* Upload state regs to memory to be restored on skipsaverestore
2858- * preemption.
2881+ * preemption. On a7xx this is considered part of the vis stream that
2882+ * requires a patchpoint.
28592883 */
2884+ if (CHIP >= A7XX &&
2885+ (cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT))
2886+ tu_cs_set_writeable (cs, true );
2887+
28602888 tu_cs_emit_pkt7 (cs, CP_REG_TO_MEM, 3 );
28612889 tu_cs_emit (cs, CP_REG_TO_MEM_0_REG (REG_A6XX_VSC_CHANNEL_VISIBILITY (0 )) |
28622890 CP_REG_TO_MEM_0_CNT (32 ));
28632891 if (CHIP >= A7XX)
2864- tu_cs_emit_qw ( cs, cmd->vsc_state_va );
2892+ tu_emit_vis_stream_patchpoint (cmd, cs, cmd->vsc_state_offset );
28652893 else
28662894 tu_cs_emit_qw (cs, global_iova (cmd, vsc_state));
28672895
@@ -2874,8 +2902,12 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
28742902 tu_cs_emit_pkt7 (cs, CP_MEM_TO_SCRATCH_MEM, 4 );
28752903 tu_cs_emit (cs, num_vsc_pipes); /* count */
28762904 tu_cs_emit (cs, 0 ); /* offset */
2877- tu_cs_emit_qw ( cs, cmd->vsc_state_va );
2905+ tu_emit_vis_stream_patchpoint (cmd, cs, cmd->vsc_state_offset );
28782906 }
2907+
2908+ if (CHIP >= A7XX &&
2909+ (cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT))
2910+ tu_cs_set_writeable (cs, false );
28792911 }
28802912
28812913 tu_autotune_begin_renderpass<CHIP>(cmd, cs, autotune_result);
@@ -3573,6 +3605,26 @@ tu_cmd_buffer_destroy(struct vk_command_buffer *vk_cmd_buffer)
35733605 ralloc_free (cmd_buffer->pre_chain .patchpoints_ctx );
35743606 util_dynarray_fini (&cmd_buffer->fdm_bin_patchpoints );
35753607 util_dynarray_fini (&cmd_buffer->pre_chain .fdm_bin_patchpoints );
3608+ util_dynarray_fini (&cmd_buffer->vis_stream_patchpoints );
3609+
3610+ util_dynarray_foreach (&cmd_buffer->vis_stream_bos , struct tu_bo *,
3611+ bo) {
3612+ tu_bo_finish (cmd_buffer->device , *bo);
3613+ }
3614+
3615+ mtx_lock (&cmd_buffer->device ->vis_stream_suballocator_mtx );
3616+ util_dynarray_foreach (&cmd_buffer->vis_stream_cs_bos ,
3617+ struct tu_vis_stream_patchpoint_cs ,
3618+ bo) {
3619+ tu_suballoc_bo_free (&cmd_buffer->device ->vis_stream_suballocator ,
3620+ &bo->cs_bo );
3621+ tu_suballoc_bo_free (&cmd_buffer->device ->vis_stream_suballocator ,
3622+ &bo->fence_bo );
3623+ }
3624+ mtx_unlock (&cmd_buffer->device ->vis_stream_suballocator_mtx );
3625+
3626+ util_dynarray_fini (&cmd_buffer->vis_stream_bos );
3627+ util_dynarray_fini (&cmd_buffer->vis_stream_cs_bos );
35763628
35773629 vk_command_buffer_finish (&cmd_buffer->vk );
35783630 vk_free2 (&cmd_buffer->device ->vk .alloc , &cmd_buffer->vk .pool ->alloc ,
@@ -3649,6 +3701,26 @@ tu_reset_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer,
36493701 cmd_buffer->pre_chain .patchpoints_ctx = NULL ;
36503702 util_dynarray_clear (&cmd_buffer->fdm_bin_patchpoints );
36513703 util_dynarray_clear (&cmd_buffer->pre_chain .fdm_bin_patchpoints );
3704+ util_dynarray_clear (&cmd_buffer->vis_stream_patchpoints );
3705+
3706+ util_dynarray_foreach (&cmd_buffer->vis_stream_bos , struct tu_bo *,
3707+ bo) {
3708+ tu_bo_finish (cmd_buffer->device , *bo);
3709+ }
3710+
3711+ mtx_lock (&cmd_buffer->device ->vis_stream_suballocator_mtx );
3712+ util_dynarray_foreach (&cmd_buffer->vis_stream_cs_bos ,
3713+ struct tu_vis_stream_patchpoint_cs ,
3714+ bo) {
3715+ tu_suballoc_bo_free (&cmd_buffer->device ->vis_stream_suballocator ,
3716+ &bo->cs_bo );
3717+ tu_suballoc_bo_free (&cmd_buffer->device ->vis_stream_suballocator ,
3718+ &bo->fence_bo );
3719+ }
3720+ mtx_unlock (&cmd_buffer->device ->vis_stream_suballocator_mtx );
3721+
3722+ util_dynarray_clear (&cmd_buffer->vis_stream_bos );
3723+ util_dynarray_clear (&cmd_buffer->vis_stream_cs_bos );
36523724}
36533725
36543726const struct vk_command_buffer_ops tu_cmd_buffer_ops = {
@@ -5562,6 +5634,58 @@ tu_CmdExecuteCommands(VkCommandBuffer commandBuffer,
55625634 util_dynarray_append_dynarray (&cmd->fdm_bin_patchpoints ,
55635635 &secondary->fdm_bin_patchpoints );
55645636 } else {
5637+ struct tu_cs *cs = &cmd->cs ;
5638+
5639+ /* If the secondary can be used multiple times, we have to set its
5640+ * patchpoints on the GPU. Set them here, and create a new
5641+ * patchpoint pointing to the CP_MEM_WRITE packet. Otherwise just
5642+ * copy them over adjusting the index.
5643+ */
5644+ bool simultaneous_use = secondary->usage_flags &
5645+ VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;
5646+
5647+ /* If this cmdbuf itself can be used multiple times in a submit then
5648+ * its patchpoint will also be updated on the GPU.
5649+ */
5650+ if (cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)
5651+ tu_cs_set_writeable (cs, true );
5652+
5653+ util_dynarray_foreach (&secondary->vis_stream_patchpoints ,
5654+ struct tu_vis_stream_patchpoint ,
5655+ secondary_patchpoint) {
5656+ struct tu_vis_stream_patchpoint patchpoint =
5657+ *secondary_patchpoint;
5658+
5659+ if (simultaneous_use) {
5660+ tu_cs_reserve_space (cs, 5 );
5661+ tu_cs_emit_pkt7 (cs, CP_MEM_WRITE, 4 );
5662+ tu_cs_emit_qw (cs, patchpoint.iova );
5663+ patchpoint.iova = tu_cs_get_cur_iova (cs);
5664+ patchpoint.data = cs->cur ;
5665+ tu_cs_emit_qw (cs, 0 );
5666+ }
5667+
5668+ util_dynarray_append (&cmd->vis_stream_patchpoints ,
5669+ patchpoint);
5670+ }
5671+
5672+ if (cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)
5673+ tu_cs_set_writeable (cs, false );
5674+
5675+ if (simultaneous_use) {
5676+ tu_cs_emit_pkt7 (cs, CP_WAIT_MEM_WRITES, 0 );
5677+ tu_cs_emit_pkt7 (cs, CP_WAIT_FOR_ME, 0 );
5678+
5679+ /* Make BV wait for updates on BR to land */
5680+ if (cmd->device ->physical_device ->info ->chip >= 7 ) {
5681+ tu_cs_emit_pkt7 (cs, CP_THREAD_CONTROL, 1 );
5682+ tu_cs_emit (cs, CP_THREAD_CONTROL_0_THREAD (CP_SET_THREAD_BR) |
5683+ CP_THREAD_CONTROL_0_SYNC_THREADS);
5684+ }
5685+ }
5686+
5687+ cmd->vsc_size = MAX2 (cmd->vsc_size , secondary->vsc_size );
5688+
55655689 switch (secondary->state .suspend_resume ) {
55665690 case SR_NONE:
55675691 assert (tu_cs_is_empty (&secondary->draw_cs ));
0 commit comments