void r600_cp_dma_copy_buffer(struct r600_context *rctx, struct pipe_resource *dst, uint64_t dst_offset, struct pipe_resource *src, uint64_t src_offset, unsigned size) { struct radeon_winsys_cs *cs = rctx->b.gfx.cs; assert(size); assert(rctx->screen->b.has_cp_dma); /* Mark the buffer range of destination as valid (initialized), * so that transfer_map knows it should wait for the GPU when mapping * that range. */ util_range_add(&r600_resource(dst)->valid_buffer_range, dst_offset, dst_offset + size); dst_offset += r600_resource(dst)->gpu_address; src_offset += r600_resource(src)->gpu_address; /* Flush the caches where the resources are bound. */ rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE | R600_CONTEXT_INV_VERTEX_CACHE | R600_CONTEXT_INV_TEX_CACHE | R600_CONTEXT_FLUSH_AND_INV | R600_CONTEXT_FLUSH_AND_INV_CB | R600_CONTEXT_FLUSH_AND_INV_DB | R600_CONTEXT_FLUSH_AND_INV_CB_META | R600_CONTEXT_FLUSH_AND_INV_DB_META | R600_CONTEXT_STREAMOUT_FLUSH | R600_CONTEXT_WAIT_3D_IDLE; /* There are differences between R700 and EG in CP DMA, * but we only use the common bits here. */ while (size) { unsigned sync = 0; unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT); unsigned src_reloc, dst_reloc; r600_need_cs_space(rctx, 10 + (rctx->b.flags ? R600_MAX_FLUSH_CS_DWORDS : 0), FALSE); /* Flush the caches for the first copy only. */ if (rctx->b.flags) { r600_flush_emit(rctx); } /* Do the synchronization after the last copy, so that all data is written to memory. */ if (size == byte_count) { sync = PKT3_CP_DMA_CP_SYNC; } /* This must be done after r600_need_cs_space. */ src_reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, (struct r600_resource*)src, RADEON_USAGE_READ, RADEON_PRIO_CP_DMA); dst_reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, (struct r600_resource*)dst, RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA); radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0)); radeon_emit(cs, src_offset); /* SRC_ADDR_LO [31:0] */ radeon_emit(cs, sync | ((src_offset >> 32) & 0xff)); /* CP_SYNC [31] | SRC_ADDR_HI [7:0] */ radeon_emit(cs, dst_offset); /* DST_ADDR_LO [31:0] */ radeon_emit(cs, (dst_offset >> 32) & 0xff); /* DST_ADDR_HI [7:0] */ radeon_emit(cs, byte_count); /* COMMAND [29:22] | BYTE_COUNT [20:0] */ radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); radeon_emit(cs, src_reloc); radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); radeon_emit(cs, dst_reloc); size -= byte_count; src_offset += byte_count; dst_offset += byte_count; } /* Invalidate the read caches. */ rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE | R600_CONTEXT_INV_VERTEX_CACHE | R600_CONTEXT_INV_TEX_CACHE; }
void r600_cp_dma_copy_buffer(struct r600_context *rctx, struct pipe_resource *dst, uint64_t dst_offset, struct pipe_resource *src, uint64_t src_offset, unsigned size) { struct radeon_winsys_cs *cs = rctx->b.gfx.cs; assert(size); assert(rctx->screen->b.has_cp_dma); /* Mark the buffer range of destination as valid (initialized), * so that transfer_map knows it should wait for the GPU when mapping * that range. */ util_range_add(&r600_resource(dst)->valid_buffer_range, dst_offset, dst_offset + size); dst_offset += r600_resource(dst)->gpu_address; src_offset += r600_resource(src)->gpu_address; /* Flush the caches where the resources are bound. */ rctx->b.flags |= r600_get_flush_flags(R600_COHERENCY_SHADER) | R600_CONTEXT_WAIT_3D_IDLE; /* There are differences between R700 and EG in CP DMA, * but we only use the common bits here. */ while (size) { unsigned sync = 0; unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT); unsigned src_reloc, dst_reloc; r600_need_cs_space(rctx, 10 + (rctx->b.flags ? R600_MAX_FLUSH_CS_DWORDS : 0) + 3 + R600_MAX_PFP_SYNC_ME_DWORDS, FALSE); /* Flush the caches for the first copy only. */ if (rctx->b.flags) { r600_flush_emit(rctx); } /* Do the synchronization after the last copy, so that all data is written to memory. */ if (size == byte_count) { sync = PKT3_CP_DMA_CP_SYNC; } /* This must be done after r600_need_cs_space. */ src_reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, (struct r600_resource*)src, RADEON_USAGE_READ, RADEON_PRIO_CP_DMA); dst_reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, (struct r600_resource*)dst, RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA); radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0)); radeon_emit(cs, src_offset); /* SRC_ADDR_LO [31:0] */ radeon_emit(cs, sync | ((src_offset >> 32) & 0xff)); /* CP_SYNC [31] | SRC_ADDR_HI [7:0] */ radeon_emit(cs, dst_offset); /* DST_ADDR_LO [31:0] */ radeon_emit(cs, (dst_offset >> 32) & 0xff); /* DST_ADDR_HI [7:0] */ radeon_emit(cs, byte_count); /* COMMAND [29:22] | BYTE_COUNT [20:0] */ radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); radeon_emit(cs, src_reloc); radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); radeon_emit(cs, dst_reloc); size -= byte_count; src_offset += byte_count; dst_offset += byte_count; } /* CP_DMA_CP_SYNC doesn't wait for idle on R6xx, but this does. */ if (rctx->b.chip_class == R600) radeon_set_config_reg(cs, R_008040_WAIT_UNTIL, S_008040_WAIT_CP_DMA_IDLE(1)); /* CP DMA is executed in ME, but index buffers are read by PFP. * This ensures that ME (CP DMA) is idle before PFP starts fetching * indices. If we wanted to execute CP DMA in PFP, this packet * should precede it. */ r600_emit_pfp_sync_me(rctx); }