void r600_dma_copy_buffer(struct r600_context *rctx, struct pipe_resource *dst, struct pipe_resource *src, uint64_t dst_offset, uint64_t src_offset, uint64_t size) { struct radeon_winsys_cs *cs = rctx->b.dma.cs; unsigned i, ncopy, csize; struct r600_resource *rdst = (struct r600_resource*)dst; struct r600_resource *rsrc = (struct r600_resource*)src; /* Mark the buffer range of destination as valid (initialized), * so that transfer_map knows it should wait for the GPU when mapping * that range. */ util_range_add(&rdst->valid_buffer_range, dst_offset, dst_offset + size); size >>= 2; /* convert to dwords */ ncopy = (size / R600_DMA_COPY_MAX_SIZE_DW) + !!(size % R600_DMA_COPY_MAX_SIZE_DW); r600_need_dma_space(&rctx->b, ncopy * 5, rdst, rsrc); for (i = 0; i < ncopy; i++) { csize = size < R600_DMA_COPY_MAX_SIZE_DW ? size : R600_DMA_COPY_MAX_SIZE_DW; /* emit reloc before writing cs so that cs is always in consistent state */ radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, rsrc, RADEON_USAGE_READ, RADEON_PRIO_SDMA_BUFFER); radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, rdst, RADEON_USAGE_WRITE, RADEON_PRIO_SDMA_BUFFER); radeon_emit(cs, DMA_PACKET(DMA_PACKET_COPY, 0, 0, csize)); radeon_emit(cs, dst_offset & 0xfffffffc); radeon_emit(cs, src_offset & 0xfffffffc); radeon_emit(cs, (dst_offset >> 32UL) & 0xff); radeon_emit(cs, (src_offset >> 32UL) & 0xff); dst_offset += csize << 2; src_offset += csize << 2; size -= csize; } r600_dma_emit_wait_idle(&rctx->b); }
static void cik_sdma_do_copy_buffer(struct si_context *ctx, struct pipe_resource *dst, struct pipe_resource *src, uint64_t dst_offset, uint64_t src_offset, uint64_t size) { struct radeon_winsys_cs *cs = ctx->b.dma.cs; unsigned i, ncopy, csize; struct r600_resource *rdst = (struct r600_resource*)dst; struct r600_resource *rsrc = (struct r600_resource*)src; dst_offset += r600_resource(dst)->gpu_address; src_offset += r600_resource(src)->gpu_address; ncopy = (size + CIK_SDMA_COPY_MAX_SIZE - 1) / CIK_SDMA_COPY_MAX_SIZE; r600_need_dma_space(&ctx->b, ncopy * 7); radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, rsrc, RADEON_USAGE_READ, RADEON_PRIO_SDMA_BUFFER); radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, rdst, RADEON_USAGE_WRITE, RADEON_PRIO_SDMA_BUFFER); for (i = 0; i < ncopy; i++) { csize = size < CIK_SDMA_COPY_MAX_SIZE ? size : CIK_SDMA_COPY_MAX_SIZE; cs->buf[cs->cdw++] = CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, CIK_SDMA_COPY_SUB_OPCODE_LINEAR, 0); cs->buf[cs->cdw++] = csize; cs->buf[cs->cdw++] = 0; /* src/dst endian swap */ cs->buf[cs->cdw++] = src_offset; cs->buf[cs->cdw++] = src_offset >> 32; cs->buf[cs->cdw++] = dst_offset; cs->buf[cs->cdw++] = dst_offset >> 32; dst_offset += csize; src_offset += csize; size -= csize; } }
/* This is required to prevent read-after-write hazards. */ void r600_dma_emit_wait_idle(struct r600_common_context *rctx) { struct radeon_winsys_cs *cs = rctx->dma.cs; /* done at the end of DMA calls, so increment this. */ rctx->num_dma_calls++; /* IBs using too little memory are limited by the IB submission overhead. * IBs using too much memory are limited by the kernel/TTM overhead. * Too long IBs create CPU-GPU pipeline bubbles and add latency. * * This heuristic makes sure that DMA requests are executed * very soon after the call is made and lowers memory usage. * It improves texture upload performance by keeping the DMA * engine busy while uploads are being submitted. */ if (rctx->ws->cs_query_memory_usage(rctx->dma.cs) > 64 * 1024 * 1024) { rctx->dma.flush(rctx, RADEON_FLUSH_ASYNC, NULL); return; } r600_need_dma_space(rctx, 1, NULL, NULL); if (!radeon_emitted(cs, 0)) /* empty queue */ return; /* NOP waits for idle on Evergreen and later. */ if (rctx->chip_class >= CIK) radeon_emit(cs, 0x00000000); /* NOP */ else if (rctx->chip_class >= EVERGREEN) radeon_emit(cs, 0xf0000000); /* NOP */ else { /* TODO: R600-R700 should use the FENCE packet. * CS checker support is required. */ } }
static void cik_sdma_copy_tile(struct si_context *ctx, struct pipe_resource *dst, unsigned dst_level, struct pipe_resource *src, unsigned src_level, unsigned y, unsigned copy_height, unsigned y_align, unsigned pitch, unsigned bpe) { struct radeon_winsys_cs *cs = ctx->b.dma.cs; struct si_screen *sscreen = ctx->screen; struct r600_texture *rsrc = (struct r600_texture*)src; struct r600_texture *rdst = (struct r600_texture*)dst; struct r600_texture *rlinear, *rtiled; unsigned linear_lvl, tiled_lvl; unsigned array_mode, lbpe, pitch_tile_max, slice_tile_max, size; unsigned ncopy, height, cheight, detile, i, src_mode, dst_mode; unsigned sub_op, bank_h, bank_w, mt_aspect, nbanks, tile_split, mt; uint64_t base, addr; unsigned pipe_config, tile_mode_index; dst_mode = rdst->surface.level[dst_level].mode; src_mode = rsrc->surface.level[src_level].mode; assert(dst_mode != src_mode); assert(src_mode == RADEON_SURF_MODE_LINEAR_ALIGNED || dst_mode == RADEON_SURF_MODE_LINEAR_ALIGNED); sub_op = CIK_SDMA_COPY_SUB_OPCODE_TILED; lbpe = util_logbase2(bpe); pitch_tile_max = ((pitch / bpe) / 8) - 1; detile = dst_mode == RADEON_SURF_MODE_LINEAR_ALIGNED; rlinear = detile ? rdst : rsrc; rtiled = detile ? rsrc : rdst; linear_lvl = detile ? dst_level : src_level; tiled_lvl = detile ? src_level : dst_level; assert(!util_format_is_depth_and_stencil(rtiled->resource.b.b.format)); array_mode = si_array_mode(rtiled->surface.level[tiled_lvl].mode); slice_tile_max = (rtiled->surface.level[tiled_lvl].nblk_x * rtiled->surface.level[tiled_lvl].nblk_y) / (8*8) - 1; height = rlinear->surface.level[linear_lvl].nblk_y; base = rtiled->surface.level[tiled_lvl].offset; addr = rlinear->surface.level[linear_lvl].offset; bank_h = cik_bank_wh(rtiled->surface.bankh); bank_w = cik_bank_wh(rtiled->surface.bankw); mt_aspect = cik_macro_tile_aspect(rtiled->surface.mtilea); tile_split = cik_tile_split(rtiled->surface.tile_split); tile_mode_index = si_tile_mode_index(rtiled, tiled_lvl, false); nbanks = si_num_banks(sscreen, rtiled); base += rtiled->resource.gpu_address; addr += rlinear->resource.gpu_address; pipe_config = cik_db_pipe_config(sscreen, tile_mode_index); mt = cik_micro_tile_mode(sscreen, tile_mode_index); size = (copy_height * pitch) / 4; cheight = copy_height; if (((cheight * pitch) / 4) > CIK_SDMA_COPY_MAX_SIZE) { cheight = (CIK_SDMA_COPY_MAX_SIZE * 4) / pitch; cheight &= ~(y_align - 1); } ncopy = (copy_height + cheight - 1) / cheight; r600_need_dma_space(&ctx->b, ncopy * 12); radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, &rsrc->resource, RADEON_USAGE_READ, RADEON_PRIO_SDMA_TEXTURE); radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, &rdst->resource, RADEON_USAGE_WRITE, RADEON_PRIO_SDMA_TEXTURE); copy_height = size * 4 / pitch; for (i = 0; i < ncopy; i++) { cheight = copy_height; if (((cheight * pitch) / 4) > CIK_SDMA_COPY_MAX_SIZE) { cheight = (CIK_SDMA_COPY_MAX_SIZE * 4) / pitch; cheight &= ~(y_align - 1); } size = (cheight * pitch) / 4; cs->buf[cs->cdw++] = CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, sub_op, detile << 15); cs->buf[cs->cdw++] = base; cs->buf[cs->cdw++] = base >> 32; cs->buf[cs->cdw++] = ((height - 1) << 16) | pitch_tile_max; cs->buf[cs->cdw++] = slice_tile_max; cs->buf[cs->cdw++] = (pipe_config << 26) | (mt_aspect << 24) | (nbanks << 21) | (bank_h << 18) | (bank_w << 15) | (tile_split << 11) | (mt << 8) | (array_mode << 3) | lbpe; cs->buf[cs->cdw++] = y << 16; /* | x */ cs->buf[cs->cdw++] = 0; /* z */ cs->buf[cs->cdw++] = addr & 0xfffffffc; cs->buf[cs->cdw++] = addr >> 32; cs->buf[cs->cdw++] = (pitch / bpe) - 1; cs->buf[cs->cdw++] = size; copy_height -= cheight; y += cheight; } }