/*
 * Record a copy of `size` bytes from src_bo (starting at src_offset) to
 * dst_bo (starting at dst_offset) into cmdbuf->cs.
 *
 * The bytes are described to the 2D blitter as a one-row, linear R8_UNORM
 * surface and moved with CP_BLIT, one chunk per loop pass.  On every pass
 * both GPU addresses are rounded down to a multiple of 64 and the remainder
 * becomes the X origin of the source / destination rectangle, so a pass
 * covers at most 0x4000 minus the larger of the two remainders.
 *
 * Both BOs are added to the command buffer's BO list (source for read,
 * destination for write) before anything is emitted.  A size of zero emits
 * only the setup and the trailing event writes.
 */
static void
tu_copy_buffer(struct tu_cmd_buffer *cmdbuf,
               struct tu_bo *src_bo,
               uint64_t src_offset,
               struct tu_bo *dst_bo,
               uint64_t dst_offset,
               uint64_t size)
{
   /* Conservative byte count per pass; only used to bound the number of
    * passes when sizing the command-stream reservation below. */
   const unsigned pass_bytes_bound = 0x4000 - 0x40;
   const unsigned pass_count_bound =
      (size + pass_bytes_bound) / pass_bytes_bound;

   tu_bo_list_add(&cmdbuf->bo_list, src_bo, MSM_SUBMIT_BO_READ);
   tu_bo_list_add(&cmdbuf->bo_list, dst_bo, MSM_SUBMIT_BO_WRITE);

   tu_dma_prepare(cmdbuf);

   /* 21 for setup/teardown plus 48 per pass. */
   tu_cs_reserve_space(cmdbuf->device, &cmdbuf->cs,
                       21 + 48 * pass_count_bound);

   /* One-time blit setup: marker plus identical RB/GRAS blit control. */
   tu_cs_emit_pkt7(&cmdbuf->cs, CP_SET_MARKER, 1);
   tu_cs_emit(&cmdbuf->cs, A2XX_CP_SET_MARKER_0_MODE(RM6_BLIT2DSCALE));

   const uint32_t blit_cntl = blit_control(RB6_R8_UNORM) | 0x20000000;

   tu_cs_emit_pkt4(&cmdbuf->cs, REG_A6XX_RB_2D_BLIT_CNTL, 1);
   tu_cs_emit(&cmdbuf->cs, blit_cntl);

   tu_cs_emit_pkt4(&cmdbuf->cs, REG_A6XX_GRAS_2D_BLIT_CNTL, 1);
   tu_cs_emit(&cmdbuf->cs, blit_cntl);

   while (size != 0) {
      const uint64_t src_addr = src_bo->iova + src_offset;
      const uint64_t dst_addr = dst_bo->iova + dst_offset;

      /* Byte distance of each address from the preceding 64-byte boundary. */
      const unsigned src_shift = src_addr & 0x3f;
      const unsigned dst_shift = dst_addr & 0x3f;
      const unsigned max_shift = MAX2(src_shift, dst_shift);

      /* Bytes handled by this pass, and the 64-aligned row pitch that
       * holds them together with the larger leading offset. */
      const uint32_t chunk = MIN2(0x4000 - max_shift, size);
      const unsigned pitch = (chunk + max_shift + 63) & ~63;

      /* Source surface: 13 payload entries, the last 8 of them zero. */
      tu_cs_emit_pkt4(&cmdbuf->cs, REG_A6XX_SP_PS_2D_SRC_INFO, 13);
      tu_cs_emit(&cmdbuf->cs,
                 A6XX_SP_PS_2D_SRC_INFO_COLOR_FORMAT(RB6_R8_UNORM) |
                    A6XX_SP_PS_2D_SRC_INFO_TILE_MODE(TILE6_LINEAR) |
                    A6XX_SP_PS_2D_SRC_INFO_COLOR_SWAP(WZYX) | 0x500000);
      tu_cs_emit(&cmdbuf->cs,
                 A6XX_SP_PS_2D_SRC_SIZE_WIDTH(src_shift + chunk) |
                    A6XX_SP_PS_2D_SRC_SIZE_HEIGHT(1)); /* SP_PS_2D_SRC_SIZE */
      tu_cs_emit_qw(&cmdbuf->cs, src_addr - src_shift);
      tu_cs_emit(&cmdbuf->cs, A6XX_SP_PS_2D_SRC_PITCH_PITCH(pitch));
      for (unsigned i = 0; i < 8; i++) {
         tu_cs_emit(&cmdbuf->cs, 0x00000000);
      }

      /* Destination surface: 9 payload entries, the last 5 of them zero. */
      tu_cs_emit_pkt4(&cmdbuf->cs, REG_A6XX_RB_2D_DST_INFO, 9);
      tu_cs_emit(&cmdbuf->cs, A6XX_RB_2D_DST_INFO_COLOR_FORMAT(RB6_R8_UNORM) |
                                 A6XX_RB_2D_DST_INFO_TILE_MODE(TILE6_LINEAR) |
                                 A6XX_RB_2D_DST_INFO_COLOR_SWAP(WZYX));
      tu_cs_emit_qw(&cmdbuf->cs, dst_addr - dst_shift);
      tu_cs_emit(&cmdbuf->cs, A6XX_RB_2D_DST_SIZE_PITCH(pitch));
      for (unsigned i = 0; i < 5; i++) {
         tu_cs_emit(&cmdbuf->cs, 0x00000000);
      }

      /* Source rectangle: row 0, columns [src_shift, src_shift + chunk - 1]. */
      tu_cs_emit_pkt4(&cmdbuf->cs, REG_A6XX_GRAS_2D_SRC_TL_X, 4);
      tu_cs_emit(&cmdbuf->cs, A6XX_GRAS_2D_SRC_TL_X_X(src_shift));
      tu_cs_emit(&cmdbuf->cs,
                 A6XX_GRAS_2D_SRC_BR_X_X(src_shift + chunk - 1));
      tu_cs_emit(&cmdbuf->cs, A6XX_GRAS_2D_SRC_TL_Y_Y(0));
      tu_cs_emit(&cmdbuf->cs, A6XX_GRAS_2D_SRC_BR_Y_Y(0));

      /* Destination rectangle: row 0, columns
       * [dst_shift, dst_shift + chunk - 1]. */
      tu_cs_emit_pkt4(&cmdbuf->cs, REG_A6XX_GRAS_2D_DST_TL, 2);
      tu_cs_emit(&cmdbuf->cs,
                 A6XX_GRAS_2D_DST_TL_X(dst_shift) | A6XX_GRAS_2D_DST_TL_Y(0));
      tu_cs_emit(&cmdbuf->cs,
                 A6XX_GRAS_2D_DST_BR_X(dst_shift + chunk - 1) |
                    A6XX_GRAS_2D_DST_BR_Y(0));

      /* Event 0x3f followed by a wait-for-idle ahead of the blit. */
      tu_cs_emit_pkt7(&cmdbuf->cs, CP_EVENT_WRITE, 1);
      tu_cs_emit(&cmdbuf->cs, 0x3f);
      tu_cs_emit_wfi(&cmdbuf->cs);

      tu_cs_emit_pkt4(&cmdbuf->cs, REG_A6XX_RB_UNKNOWN_8C01, 1);
      tu_cs_emit(&cmdbuf->cs, 0);

      tu_cs_emit_pkt4(&cmdbuf->cs, REG_A6XX_SP_2D_SRC_FORMAT, 1);
      tu_cs_emit(&cmdbuf->cs, 0xf180);

      /* RB_UNKNOWN_8E04 is set around the blit and cleared right after. */
      tu_cs_emit_pkt4(&cmdbuf->cs, REG_A6XX_RB_UNKNOWN_8E04, 1);
      tu_cs_emit(&cmdbuf->cs, 0x01000000);

      tu_cs_emit_pkt7(&cmdbuf->cs, CP_BLIT, 1);
      tu_cs_emit(&cmdbuf->cs, CP_BLIT_0_OP(BLIT_OP_SCALE));

      tu_cs_emit_wfi(&cmdbuf->cs);

      tu_cs_emit_pkt4(&cmdbuf->cs, REG_A6XX_RB_UNKNOWN_8E04, 1);
      tu_cs_emit(&cmdbuf->cs, 0);

      /* Advance both cursors past the bytes just scheduled. */
      size -= chunk;
      src_offset += chunk;
      dst_offset += chunk;
   }

   /* Trailing events emitted once after all passes. */
   tu6_emit_event_write(cmdbuf, &cmdbuf->cs, 0x1d, true);
   tu6_emit_event_write(cmdbuf, &cmdbuf->cs, FACENESS_FLUSH, true);
   tu6_emit_event_write(cmdbuf, &cmdbuf->cs, CACHE_FLUSH_TS, true);
}
/*
 * Emit a 2D-blit based fill of zsbuf's LRZ buffer into the batch's
 * lrz_clear ring, creating that ring (parented to batch->gmem) on first use.
 *
 * The fill value is fui(depth) written to RB_2D_SRC_SOLID_C0; the
 * destination is zsbuf->lrz, described as a linear R16_UNORM surface with a
 * pitch of lrz_pitch * 2, and the destination rectangle spans
 * (0, 0) .. (lrz_width - 1, lrz_height - 1).
 */
static void
fd6_clear_lrz(struct fd_batch *batch, struct fd_resource *zsbuf, double depth)
{
	// TODO: mid-frame clears (i.e. an app doing crazy stuff)?  It may be
	// worth splitting both the clear and the lrz clear out into their own
	// rb's and simply throwing away any draws prior to the clear.
	// (Anything that is not a fullscreen clear would just fall back to the
	// generic path that treats it as a normal draw.)

	/* Lazily allocate the dedicated ring the first time it is needed. */
	if (!batch->lrz_clear) {
		batch->lrz_clear = fd_ringbuffer_new(batch->ctx->pipe, 0x1000);
		fd_ringbuffer_set_parent(batch->lrz_clear, batch->gmem);
	}

	struct fd_ringbuffer *ring = batch->lrz_clear;

	/* First marker: RM6_BYPASS mode, bracketed by emit_marker6(). */
	emit_marker6(ring, 7);
	OUT_PKT7(ring, CP_SET_MARKER, 1);
	OUT_RING(ring, A2XX_CP_SET_MARKER_0_MODE(RM6_BYPASS));
	emit_marker6(ring, 7);

	OUT_PKT4(ring, REG_A6XX_RB_CCU_CNTL, 1);
	OUT_RING(ring, 0x10000000);

	OUT_PKT4(ring, REG_A6XX_HLSQ_UPDATE_CNTL, 1);
	OUT_RING(ring, 0x7ffff);

	/* Second marker: raw mode value 0xc, bracketed the same way. */
	emit_marker6(ring, 7);
	OUT_PKT7(ring, CP_SET_MARKER, 1);
	OUT_RING(ring, A2XX_CP_SET_MARKER_0_MODE(0xc));
	emit_marker6(ring, 7);

	OUT_PKT4(ring, REG_A6XX_RB_UNKNOWN_8C01, 1);
	OUT_RING(ring, 0x0);

	/* The whole 13-entry source description is zeroed. */
	OUT_PKT4(ring, REG_A6XX_SP_PS_2D_SRC_INFO, 13);
	for (unsigned i = 0; i < 13; i++) {
		OUT_RING(ring, 0x00000000);
	}

	OUT_PKT4(ring, REG_A6XX_SP_UNKNOWN_ACC0, 1);
	OUT_RING(ring, 0x0000f410);

	/* GRAS and RB blit control get the same R16_UNORM setup. */
	OUT_PKT4(ring, REG_A6XX_GRAS_2D_BLIT_CNTL, 1);
	OUT_RING(ring, A6XX_GRAS_2D_BLIT_CNTL_COLOR_FORMAT(RB6_R16_UNORM) |
			0x4f00080);

	OUT_PKT4(ring, REG_A6XX_RB_2D_BLIT_CNTL, 1);
	OUT_RING(ring, A6XX_RB_2D_BLIT_CNTL_COLOR_FORMAT(RB6_R16_UNORM) |
			0x4f00080);

	fd6_event_write(batch, ring, UNK_1D, true);
	fd6_event_write(batch, ring, PC_CCU_INVALIDATE_COLOR, false);

	/* Solid source colour: first component carries the depth value,
	 * the remaining three are zero. */
	OUT_PKT4(ring, REG_A6XX_RB_2D_SRC_SOLID_C0, 4);
	OUT_RING(ring, fui(depth));
	for (unsigned i = 0; i < 3; i++) {
		OUT_RING(ring, 0x00000000);
	}

	/* Destination surface: info, relocated address of zsbuf->lrz, pitch,
	 * then five zero entries (9 payload entries in total). */
	OUT_PKT4(ring, REG_A6XX_RB_2D_DST_INFO, 9);
	OUT_RING(ring, A6XX_RB_2D_DST_INFO_COLOR_FORMAT(RB6_R16_UNORM) |
			A6XX_RB_2D_DST_INFO_TILE_MODE(TILE6_LINEAR) |
			A6XX_RB_2D_DST_INFO_COLOR_SWAP(WZYX));
	OUT_RELOCW(ring, zsbuf->lrz, 0, 0, 0);
	OUT_RING(ring, A6XX_RB_2D_DST_SIZE_PITCH(zsbuf->lrz_pitch * 2));
	for (unsigned i = 0; i < 5; i++) {
		OUT_RING(ring, 0x00000000);
	}

	/* Source rectangle collapses to the single point (0, 0). */
	OUT_PKT4(ring, REG_A6XX_GRAS_2D_SRC_TL_X, 4);
	OUT_RING(ring, A6XX_GRAS_2D_SRC_TL_X_X(0));
	OUT_RING(ring, A6XX_GRAS_2D_SRC_BR_X_X(0));
	OUT_RING(ring, A6XX_GRAS_2D_SRC_TL_Y_Y(0));
	OUT_RING(ring, A6XX_GRAS_2D_SRC_BR_Y_Y(0));

	/* Destination rectangle covers the full lrz_width x lrz_height area. */
	OUT_PKT4(ring, REG_A6XX_GRAS_2D_DST_TL, 2);
	OUT_RING(ring, A6XX_GRAS_2D_DST_TL_X(0) |
			A6XX_GRAS_2D_DST_TL_Y(0));
	OUT_RING(ring, A6XX_GRAS_2D_DST_BR_X(zsbuf->lrz_width - 1) |
			A6XX_GRAS_2D_DST_BR_Y(zsbuf->lrz_height - 1));

	fd6_event_write(batch, ring, 0x3f, false);

	OUT_WFI5(ring);

	/* RB_UNKNOWN_8E04 is set around the blit and cleared right after. */
	OUT_PKT4(ring, REG_A6XX_RB_UNKNOWN_8E04, 1);
	OUT_RING(ring, 0x1000000);

	OUT_PKT7(ring, CP_BLIT, 1);
	OUT_RING(ring, CP_BLIT_0_OP(BLIT_OP_SCALE));

	OUT_WFI5(ring);

	OUT_PKT4(ring, REG_A6XX_RB_UNKNOWN_8E04, 1);
	OUT_RING(ring, 0x0);

	/* Trailing events, then the cache flush helper. */
	fd6_event_write(batch, ring, UNK_1D, true);
	fd6_event_write(batch, ring, FACENESS_FLUSH, true);
	fd6_event_write(batch, ring, CACHE_FLUSH_TS, true);

	fd6_cache_flush(batch, ring);
}
/*
 * Emit one CP_BLIT that copies the region described by copy_info from one
 * layer / mip level of src_image to linear memory at dst_va.
 *
 * Source address = image BO address + bo_offset + layer_size * layer +
 * the selected mip level's offset; source pitch = that level's pitch times
 * the format block size.
 *
 * Destination layout depends on the copy height:
 *  - height == 1: dst_va only has to be a multiple of the block size.  It is
 *    rounded down to a multiple of 64 and the remainder, in blocks, becomes
 *    the X origin of the destination rectangle.
 *  - otherwise: both dst_va and the row pitch (bufferRowLength, or the copy
 *    width when that is 0, times the block size) are asserted to be
 *    multiples of 64.
 *
 * dst_buffer is not referenced here.
 */
static void
tu_copy_image_to_buffer_step(struct tu_cmd_buffer *cmdbuf,
                             struct tu_image *src_image,
                             struct tu_buffer *dst_buffer,
                             const VkBufferImageCopy *copy_info,
                             VkFormat format,
                             uint32_t layer,
                             uint64_t dst_va)
{
   const enum a6xx_color_fmt rb_fmt = tu6_get_native_format(format)->rb;

   /* Values that are used repeatedly below. */
   const unsigned blocksize = vk_format_get_blocksize(format);
   const uint32_t mip = copy_info->imageSubresource.mipLevel;
   const uint32_t width = copy_info->imageExtent.width;
   const uint32_t height = copy_info->imageExtent.height;

   uint64_t src_va = src_image->bo->iova + src_image->bo_offset +
                     src_image->layer_size * layer +
                     src_image->levels[mip].offset;
   unsigned src_pitch = src_image->levels[mip].pitch * blocksize;

   unsigned dst_pitch;
   unsigned dst_offset = 0;
   if (height == 1) {
      /* Can't find this in the spec, but not having it is sort of insane? */
      assert(dst_va % blocksize == 0);

      /* Fold the sub-64-byte part of the address into an X offset
       * measured in blocks. */
      dst_offset = (dst_va & 63) / blocksize;
      dst_va &= ~63;

      dst_pitch = align((dst_offset + width) * blocksize, 64);
   } else {
      unsigned dst_pixel_stride;
      if (copy_info->bufferRowLength) {
         dst_pixel_stride = copy_info->bufferRowLength;
      } else {
         dst_pixel_stride = width;
      }

      dst_pitch = dst_pixel_stride * blocksize;
      assert(!(dst_pitch & 63));
      assert(!(dst_va & 63));
   }

   tu_cs_reserve_space(cmdbuf->device, &cmdbuf->cs, 48);

   /* Source surface: 13 payload entries, the last 8 of them zero. */
   tu_cs_emit_pkt4(&cmdbuf->cs, REG_A6XX_SP_PS_2D_SRC_INFO, 13);
   tu_cs_emit(&cmdbuf->cs,
              A6XX_SP_PS_2D_SRC_INFO_COLOR_FORMAT(rb_fmt) |
                 A6XX_SP_PS_2D_SRC_INFO_TILE_MODE(src_image->tile_mode) |
                 A6XX_SP_PS_2D_SRC_INFO_COLOR_SWAP(WZYX) | 0x500000);
   tu_cs_emit(&cmdbuf->cs,
              A6XX_SP_PS_2D_SRC_SIZE_WIDTH(src_image->extent.width) |
                 A6XX_SP_PS_2D_SRC_SIZE_HEIGHT(
                    src_image->extent.height)); /* SP_PS_2D_SRC_SIZE */
   tu_cs_emit_qw(&cmdbuf->cs, src_va);
   tu_cs_emit(&cmdbuf->cs, A6XX_SP_PS_2D_SRC_PITCH_PITCH(src_pitch));
   for (unsigned i = 0; i < 8; i++) {
      tu_cs_emit(&cmdbuf->cs, 0x00000000);
   }

   /* Destination surface: 9 payload entries, the last 5 of them zero. */
   tu_cs_emit_pkt4(&cmdbuf->cs, REG_A6XX_RB_2D_DST_INFO, 9);
   tu_cs_emit(&cmdbuf->cs, A6XX_RB_2D_DST_INFO_COLOR_FORMAT(rb_fmt) |
                              A6XX_RB_2D_DST_INFO_TILE_MODE(TILE6_LINEAR) |
                              A6XX_RB_2D_DST_INFO_COLOR_SWAP(WZYX));
   tu_cs_emit_qw(&cmdbuf->cs, dst_va);
   tu_cs_emit(&cmdbuf->cs, A6XX_RB_2D_DST_SIZE_PITCH(dst_pitch));
   for (unsigned i = 0; i < 5; i++) {
      tu_cs_emit(&cmdbuf->cs, 0x00000000);
   }

   /* Source rectangle: the image region named by copy_info. */
   tu_cs_emit_pkt4(&cmdbuf->cs, REG_A6XX_GRAS_2D_SRC_TL_X, 4);
   tu_cs_emit(&cmdbuf->cs, A6XX_GRAS_2D_SRC_TL_X_X(copy_info->imageOffset.x));
   tu_cs_emit(&cmdbuf->cs,
              A6XX_GRAS_2D_SRC_BR_X_X(copy_info->imageOffset.x + width - 1));
   tu_cs_emit(&cmdbuf->cs, A6XX_GRAS_2D_SRC_TL_Y_Y(copy_info->imageOffset.y));
   tu_cs_emit(&cmdbuf->cs,
              A6XX_GRAS_2D_SRC_BR_Y_Y(copy_info->imageOffset.y + height - 1));

   /* Destination rectangle: same extent, starting at (dst_offset, 0). */
   tu_cs_emit_pkt4(&cmdbuf->cs, REG_A6XX_GRAS_2D_DST_TL, 2);
   tu_cs_emit(&cmdbuf->cs,
              A6XX_GRAS_2D_DST_TL_X(dst_offset) | A6XX_GRAS_2D_DST_TL_Y(0));
   tu_cs_emit(&cmdbuf->cs,
              A6XX_GRAS_2D_DST_BR_X(dst_offset + width - 1) |
                 A6XX_GRAS_2D_DST_BR_Y(height - 1));

   /* Event 0x3f followed by a wait-for-idle ahead of the blit. */
   tu_cs_emit_pkt7(&cmdbuf->cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(&cmdbuf->cs, 0x3f);
   tu_cs_emit_wfi(&cmdbuf->cs);

   tu_cs_emit_pkt4(&cmdbuf->cs, REG_A6XX_RB_UNKNOWN_8C01, 1);
   tu_cs_emit(&cmdbuf->cs, 0);

   tu_cs_emit_pkt4(&cmdbuf->cs, REG_A6XX_SP_2D_SRC_FORMAT, 1);
   tu_cs_emit(&cmdbuf->cs, tu6_sp_2d_src_format(format));

   /* RB_UNKNOWN_8E04 is set around the blit and cleared right after. */
   tu_cs_emit_pkt4(&cmdbuf->cs, REG_A6XX_RB_UNKNOWN_8E04, 1);
   tu_cs_emit(&cmdbuf->cs, 0x01000000);

   tu_cs_emit_pkt7(&cmdbuf->cs, CP_BLIT, 1);
   tu_cs_emit(&cmdbuf->cs, CP_BLIT_0_OP(BLIT_OP_SCALE));

   tu_cs_emit_wfi(&cmdbuf->cs);

   tu_cs_emit_pkt4(&cmdbuf->cs, REG_A6XX_RB_UNKNOWN_8E04, 1);
   tu_cs_emit(&cmdbuf->cs, 0);
}