Example #1
static void r600_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
			      unsigned offset, unsigned size, unsigned value)
{
	struct r600_context *rctx = (struct r600_context*)ctx;

	if (rctx->screen->b.has_cp_dma &&
	    rctx->b.chip_class >= EVERGREEN &&
	    offset % 4 == 0 && size % 4 == 0) {
		evergreen_cp_dma_clear_buffer(rctx, dst, offset, size, value);
	} else if (rctx->screen->b.has_streamout && offset % 4 == 0 && size % 4 == 0) {
		union pipe_color_union clear_value;
		clear_value.ui[0] = value;

		r600_blitter_begin(ctx, R600_DISABLE_RENDER_COND);
		util_blitter_clear_buffer(rctx->blitter, dst, offset, size,
					  1, &clear_value);
		r600_blitter_end(ctx);
	} else {
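		/* Fallback: map the buffer and clear it with the CPU, one dword at a time. */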
		uint32_t *map = r600_buffer_map_sync_with_rings(&rctx->b, r600_resource(dst),
								 PIPE_TRANSFER_WRITE);
		size /= 4;
		for (unsigned i = 0; i < size; i++)
			*map++ = value;
	}
}
Example #2
static void *evergreen_create_compute_state(struct pipe_context *ctx,
					    const struct pipe_compute_state *cso)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
#ifdef HAVE_OPENCL
	const struct pipe_llvm_program_header *header;
	const char *code;
	void *p;
	boolean use_kill;

	COMPUTE_DBG(rctx->screen, "*** evergreen_create_compute_state\n");
	header = cso->prog;
	code = cso->prog + sizeof(struct pipe_llvm_program_header);
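	/* Read the ELF binary produced by the LLVM backend and build the r600 bytecode. */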
	radeon_shader_binary_init(&shader->binary);
	radeon_elf_read(code, header->num_bytes, &shader->binary);
	r600_create_shader(&shader->bc, &shader->binary, &use_kill);

	/* Upload code + ROdata */
	shader->code_bo = r600_compute_buffer_alloc_vram(rctx->screen,
							shader->bc.ndw * 4);
	p = r600_buffer_map_sync_with_rings(&rctx->b, shader->code_bo, PIPE_TRANSFER_WRITE);
	//TODO: use util_memcpy_cpu_to_le32 ?
	memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
	rctx->b.ws->buffer_unmap(shader->code_bo->buf);
#endif

	shader->ctx = rctx;
	shader->local_size = cso->req_local_mem;
	shader->private_size = cso->req_private_mem;
	shader->input_size = cso->req_input_mem;

	return shader;
}
Example #3
static void evergreen_launch_grid(
		struct pipe_context *ctx_,
		const uint *block_layout, const uint *grid_layout,
		uint32_t pc, const void *input)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
#ifdef HAVE_OPENCL
	struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
	boolean use_kill;

#if HAVE_LLVM < 0x0306
	struct r600_kernel *kernel = &shader->kernels[pc];
	(void)use_kill;
	if (!kernel->code_bo) {
		void *p;
		struct r600_bytecode *bc = &kernel->bc;
		LLVMModuleRef mod = kernel->llvm_module;
		boolean use_kill = false;
		bool dump = (ctx->screen->b.debug_flags & DBG_CS) != 0;
		unsigned use_sb = ctx->screen->b.debug_flags & DBG_SB_CS;
		unsigned sb_disasm = use_sb ||
			(ctx->screen->b.debug_flags & DBG_SB_DISASM);

		r600_bytecode_init(bc, ctx->b.chip_class, ctx->b.family,
				   ctx->screen->has_compressed_msaa_texturing);
		bc->type = TGSI_PROCESSOR_COMPUTE;
		bc->isa = ctx->isa;
		r600_llvm_compile(mod, ctx->b.family, bc, &use_kill, dump, &ctx->b.debug);

		if (dump && !sb_disasm) {
			r600_bytecode_disasm(bc);
		} else if ((dump && sb_disasm) || use_sb) {
			if (r600_sb_bytecode_process(ctx, bc, NULL, dump, use_sb))
				R600_ERR("r600_sb_bytecode_process failed!\n");
		}

		kernel->code_bo = r600_compute_buffer_alloc_vram(ctx->screen,
								 kernel->bc.ndw * 4);
		p = r600_buffer_map_sync_with_rings(&ctx->b, kernel->code_bo, PIPE_TRANSFER_WRITE);
		memcpy(p, kernel->bc.bytecode, kernel->bc.ndw * 4);
		ctx->b.ws->buffer_unmap(kernel->code_bo->buf);
	}
	shader->active_kernel = kernel;
	ctx->cs_shader_state.kernel_index = pc;
#else
	ctx->cs_shader_state.pc = pc;
	/* Get the config information for this kernel. */
	r600_shader_binary_read_config(&shader->binary, &shader->bc, pc, &use_kill);
#endif
#endif

	COMPUTE_DBG(ctx->screen, "*** evergreen_launch_grid: pc = %u\n", pc);

	evergreen_compute_upload_input(ctx_, block_layout, grid_layout, input);
	compute_emit_cs(ctx, block_layout, grid_layout);
}
Example #4
void *evergreen_create_compute_state(
	struct pipe_context *ctx_,
	const struct pipe_compute_state *cso)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
#ifdef HAVE_OPENCL
	const struct pipe_llvm_program_header * header;
	const char *code;
	void *p;
	boolean use_kill;

	COMPUTE_DBG(ctx->screen, "*** evergreen_create_compute_state\n");
	header = cso->prog;
	code = cso->prog + sizeof(struct pipe_llvm_program_header);
#if HAVE_LLVM < 0x0306
	(void)use_kill;
	(void)p;
	shader->llvm_ctx = LLVMContextCreate();
	shader->num_kernels = radeon_llvm_get_num_kernels(shader->llvm_ctx,
				code, header->num_bytes);
	shader->kernels = CALLOC(sizeof(struct r600_kernel),
				shader->num_kernels);
	{
		unsigned i;
		for (i = 0; i < shader->num_kernels; i++) {
			struct r600_kernel *kernel = &shader->kernels[i];
			kernel->llvm_module = radeon_llvm_get_kernel_module(
				shader->llvm_ctx, i, code, header->num_bytes);
		}
	}
#else
	radeon_shader_binary_init(&shader->binary);
	radeon_elf_read(code, header->num_bytes, &shader->binary);
	r600_create_shader(&shader->bc, &shader->binary, &use_kill);

	shader->code_bo = r600_compute_buffer_alloc_vram(ctx->screen,
							shader->bc.ndw * 4);
	p = r600_buffer_map_sync_with_rings(&ctx->b, shader->code_bo, PIPE_TRANSFER_WRITE);
	memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
	ctx->b.ws->buffer_unmap(shader->code_bo->buf);
#endif
#endif

	shader->ctx = ctx;
	shader->local_size = cso->req_local_mem;
	shader->private_size = cso->req_private_mem;
	shader->input_size = cso->req_input_mem;

	return shader;
}
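Example #5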
static void *r600_buffer_transfer_map(struct pipe_context *ctx,
                                      struct pipe_resource *resource,
                                      unsigned level,
                                      unsigned usage,
                                      const struct pipe_box *box,
                                      struct pipe_transfer **ptransfer)
{
	struct r600_common_context *rctx = (struct r600_common_context*)ctx;
	struct r600_common_screen *rscreen = (struct r600_common_screen*)ctx->screen;
	struct r600_resource *rbuffer = r600_resource(resource);
	uint8_t *data;

	assert(box->x + box->width <= resource->width0);

	/* See if the buffer range being mapped has never been initialized,
	 * in which case it can be mapped unsynchronized. */
	if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED) &&
	    usage & PIPE_TRANSFER_WRITE &&
	    !util_ranges_intersect(&rbuffer->valid_buffer_range, box->x, box->x + box->width)) {
		usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
	}

	/* If discarding the entire range, discard the whole resource instead. */
	if (usage & PIPE_TRANSFER_DISCARD_RANGE &&
	    box->x == 0 && box->width == resource->width0) {
		usage |= PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE;
	}

	if (usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE &&
	    !(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) {
		assert(usage & PIPE_TRANSFER_WRITE);

		if (r600_invalidate_buffer(rctx, rbuffer)) {
			/* At this point, the buffer is always idle. */
			usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
		}
	}
	else if ((usage & PIPE_TRANSFER_DISCARD_RANGE) &&
		 !(usage & PIPE_TRANSFER_UNSYNCHRONIZED) &&
		 !(rscreen->debug_flags & DBG_NO_DISCARD_RANGE) &&
		 r600_can_dma_copy_buffer(rctx, box->x, 0, box->width)) {
		assert(usage & PIPE_TRANSFER_WRITE);

		/* Check if mapping this buffer would cause waiting for the GPU. */
		if (r600_rings_is_buffer_referenced(rctx, rbuffer->buf, RADEON_USAGE_READWRITE) ||
		    !rctx->ws->buffer_wait(rbuffer->buf, 0, RADEON_USAGE_READWRITE)) {
			/* Do a wait-free write-only transfer using a temporary buffer. */
			unsigned offset;
			struct r600_resource *staging = NULL;

			u_upload_alloc(rctx->uploader, 0, box->width + (box->x % R600_MAP_BUFFER_ALIGNMENT),
				       256, &offset, (struct pipe_resource**)&staging, (void**)&data);

			if (staging) {
				data += box->x % R600_MAP_BUFFER_ALIGNMENT;
				return r600_buffer_get_transfer(ctx, resource, level, usage, box,
								ptransfer, data, staging, offset);
			}
		} else {
			/* At this point, the buffer is always idle (we checked it above). */
			usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
		}
	}
	/* Using a staging buffer in GTT for larger reads is much faster. */
	else if ((usage & PIPE_TRANSFER_READ) &&
		 !(usage & PIPE_TRANSFER_WRITE) &&
		 rbuffer->domains == RADEON_DOMAIN_VRAM &&
		 r600_can_dma_copy_buffer(rctx, 0, box->x, box->width)) {
		struct r600_resource *staging;

		staging = (struct r600_resource*) pipe_buffer_create(
				ctx->screen, PIPE_BIND_TRANSFER_READ, PIPE_USAGE_STAGING,
				box->width + (box->x % R600_MAP_BUFFER_ALIGNMENT));
		if (staging) {
			/* Copy the VRAM buffer to the staging buffer. */
			rctx->dma_copy(ctx, &staging->b.b, 0,
				       box->x % R600_MAP_BUFFER_ALIGNMENT,
				       0, 0, resource, level, box);

			data = r600_buffer_map_sync_with_rings(rctx, staging, PIPE_TRANSFER_READ);
			data += box->x % R600_MAP_BUFFER_ALIGNMENT;

			return r600_buffer_get_transfer(ctx, resource, level, usage, box,
							ptransfer, data, staging, 0);
		}
	}

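	/* mmap and synchronize with rings */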
	data = r600_buffer_map_sync_with_rings(rctx, rbuffer, usage);
	if (!data) {
		return NULL;
	}
	data += box->x;

	return r600_buffer_get_transfer(ctx, resource, level, usage, box,
					ptransfer, data, NULL, 0);
}
Example #6
static void *r600_buffer_transfer_map(struct pipe_context *ctx,
					struct pipe_resource *resource,
					unsigned level,
					unsigned usage,
					const struct pipe_box *box,
					struct pipe_transfer **ptransfer)
{
	struct r600_context *rctx = (struct r600_context*)ctx;
	struct r600_resource *rbuffer = r600_resource(resource);
	uint8_t *data;

	assert(box->x + box->width <= resource->width0);

	/* See if the buffer range being mapped has never been initialized,
	 * in which case it can be mapped unsynchronized. */
	if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED) &&
	    usage & PIPE_TRANSFER_WRITE &&
	    !util_ranges_intersect(&rbuffer->valid_buffer_range, box->x, box->x + box->width)) {
		usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
	}

	if (usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE &&
	    !(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) {
		assert(usage & PIPE_TRANSFER_WRITE);

		/* Check if mapping this buffer would cause waiting for the GPU. */
		if (r600_rings_is_buffer_referenced(&rctx->b, rbuffer->cs_buf, RADEON_USAGE_READWRITE) ||
		    rctx->b.ws->buffer_is_busy(rbuffer->buf, RADEON_USAGE_READWRITE)) {
			unsigned i, mask;

			/* Discard the buffer. */
			pb_reference(&rbuffer->buf, NULL);

			/* Create a new one in the same pipe_resource. */
			/* XXX We probably want a different alignment for buffers and textures. */
			r600_init_resource(&rctx->screen->b, rbuffer, rbuffer->b.b.width0, 4096,
					   TRUE, rbuffer->b.b.usage);

			/* We changed the buffer, now we need to bind it where the old one was bound. */
			/* Vertex buffers. */
			mask = rctx->vertex_buffer_state.enabled_mask;
			while (mask) {
				i = u_bit_scan(&mask);
				if (rctx->vertex_buffer_state.vb[i].buffer == &rbuffer->b.b) {
					rctx->vertex_buffer_state.dirty_mask |= 1 << i;
					r600_vertex_buffers_dirty(rctx);
				}
			}
			/* Streamout buffers. */
			for (i = 0; i < rctx->b.streamout.num_targets; i++) {
				if (rctx->b.streamout.targets[i]->b.buffer == &rbuffer->b.b) {
					if (rctx->b.streamout.begin_emitted) {
						r600_emit_streamout_end(&rctx->b);
					}
					rctx->b.streamout.append_bitmask = rctx->b.streamout.enabled_mask;
					r600_streamout_buffers_dirty(&rctx->b);
				}
			}
			/* Constant buffers. */
			r600_set_constants_dirty_if_bound(rctx, rbuffer);
		}
	}
	else if ((usage & PIPE_TRANSFER_DISCARD_RANGE) &&
		 !(usage & PIPE_TRANSFER_UNSYNCHRONIZED) &&
		 !(rctx->screen->b.debug_flags & DBG_NO_DISCARD_RANGE) &&
		 (rctx->screen->has_cp_dma ||
		  (rctx->screen->has_streamout &&
		   /* The buffer range must be aligned to 4 with streamout. */
		   box->x % 4 == 0 && box->width % 4 == 0))) {
		assert(usage & PIPE_TRANSFER_WRITE);

		/* Check if mapping this buffer would cause waiting for the GPU. */
		if (r600_rings_is_buffer_referenced(&rctx->b, rbuffer->cs_buf, RADEON_USAGE_READWRITE) ||
		    rctx->b.ws->buffer_is_busy(rbuffer->buf, RADEON_USAGE_READWRITE)) {
			/* Do a wait-free write-only transfer using a temporary buffer. */
			unsigned offset;
			struct r600_resource *staging = NULL;

			u_upload_alloc(rctx->uploader, 0, box->width + (box->x % R600_MAP_BUFFER_ALIGNMENT),
				       &offset, (struct pipe_resource**)&staging, (void**)&data);

			if (staging) {
				data += box->x % R600_MAP_BUFFER_ALIGNMENT;
				return r600_buffer_get_transfer(ctx, resource, level, usage, box,
								ptransfer, data, staging, offset);
			}
		}
	}

	/* mmap and synchronize with rings */
	data = r600_buffer_map_sync_with_rings(&rctx->b, rbuffer, usage);
	if (!data) {
		return NULL;
	}
	data += box->x;

	return r600_buffer_get_transfer(ctx, resource, level, usage, box,
					ptransfer, data, NULL, 0);
}
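Example #7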
static void *r600_texture_transfer_map(struct pipe_context *ctx,
				       struct pipe_resource *texture,
				       unsigned level,
				       unsigned usage,
				       const struct pipe_box *box,
				       struct pipe_transfer **ptransfer)
{
	struct r600_common_context *rctx = (struct r600_common_context*)ctx;
	struct r600_texture *rtex = (struct r600_texture*)texture;
	struct r600_transfer *trans;
	boolean use_staging_texture = FALSE;
	struct r600_resource *buf;
	unsigned offset = 0;
	char *map;

	/* We cannot map a tiled texture directly because the data is
	 * in a different order, therefore we do detiling using a blit.
	 *
	 * Also, use a temporary in GTT memory for read transfers, as
	 * the CPU is much happier reading out of cached system memory
	 * than uncached VRAM.
	 */
	if (rtex->surface.level[0].mode >= RADEON_SURF_MODE_1D) {
		use_staging_texture = TRUE;
	} else if ((usage & PIPE_TRANSFER_READ) && !(usage & PIPE_TRANSFER_MAP_DIRECTLY) &&
	    (rtex->resource.domains == RADEON_DOMAIN_VRAM)) {
		/* Untiled buffers in VRAM, which are slow for CPU reads */
		use_staging_texture = TRUE;
	} else if (!(usage & PIPE_TRANSFER_READ) &&
	    (r600_rings_is_buffer_referenced(rctx, rtex->resource.cs_buf, RADEON_USAGE_READWRITE) ||
	     !rctx->ws->buffer_wait(rtex->resource.buf, 0, RADEON_USAGE_READWRITE))) {
		/* Use a staging texture for uploads if the underlying BO is busy. */
		use_staging_texture = TRUE;
	}

	if (texture->flags & R600_RESOURCE_FLAG_TRANSFER) {
		use_staging_texture = FALSE;
	}

	if (use_staging_texture && (usage & PIPE_TRANSFER_MAP_DIRECTLY)) {
		return NULL;
	}

	trans = CALLOC_STRUCT(r600_transfer);
	if (trans == NULL)
		return NULL;
	trans->transfer.resource = texture;
	trans->transfer.level = level;
	trans->transfer.usage = usage;
	trans->transfer.box = *box;

	if (rtex->is_depth) {
		struct r600_texture *staging_depth;

		if (rtex->resource.b.b.nr_samples > 1) {
			/* MSAA depth buffers need to be converted to single sample buffers.
			 *
			 * Mapping MSAA depth buffers can occur if ReadPixels is called
			 * with a multisample GLX visual.
			 *
			 * First downsample the depth buffer to a temporary texture,
			 * then decompress the temporary one to staging.
			 *
			 * Only the region being mapped is transferred.
			 */
			struct pipe_resource resource;

			r600_init_temp_resource_from_box(&resource, texture, box, level, 0);

			if (!r600_init_flushed_depth_texture(ctx, &resource, &staging_depth)) {
				R600_ERR("failed to create temporary texture to hold untiled copy\n");
				FREE(trans);
				return NULL;
			}

			if (usage & PIPE_TRANSFER_READ) {
				struct pipe_resource *temp = ctx->screen->resource_create(ctx->screen, &resource);

				r600_copy_region_with_blit(ctx, temp, 0, 0, 0, 0, texture, level, box);
				rctx->blit_decompress_depth(ctx, (struct r600_texture*)temp, staging_depth,
							    0, 0, 0, box->depth, 0, 0);
				pipe_resource_reference((struct pipe_resource**)&temp, NULL);
			}
		}
		else {
			/* XXX: only readback the rectangle which is being mapped? */
			/* XXX: when discard is true, no need to read back from depth texture */
			if (!r600_init_flushed_depth_texture(ctx, texture, &staging_depth)) {
				R600_ERR("failed to create temporary texture to hold untiled copy\n");
				FREE(trans);
				return NULL;
			}

			rctx->blit_decompress_depth(ctx, rtex, staging_depth,
						    level, level,
						    box->z, box->z + box->depth - 1,
						    0, 0);

			offset = r600_texture_get_offset(staging_depth, level, box);
		}

		trans->transfer.stride = staging_depth->surface.level[level].pitch_bytes;
		trans->transfer.layer_stride = staging_depth->surface.level[level].slice_size;
		trans->staging = (struct r600_resource*)staging_depth;
	} else if (use_staging_texture) {
		struct pipe_resource resource;
		struct r600_texture *staging;

		r600_init_temp_resource_from_box(&resource, texture, box, level,
						 R600_RESOURCE_FLAG_TRANSFER);
		resource.usage = (usage & PIPE_TRANSFER_READ) ?
			PIPE_USAGE_STAGING : PIPE_USAGE_STREAM;

		/* Create the temporary texture. */
		staging = (struct r600_texture*)ctx->screen->resource_create(ctx->screen, &resource);
		if (staging == NULL) {
			R600_ERR("failed to create temporary texture to hold untiled copy\n");
			FREE(trans);
			return NULL;
		}
		trans->staging = &staging->resource;
		trans->transfer.stride = staging->surface.level[0].pitch_bytes;
		trans->transfer.layer_stride = staging->surface.level[0].slice_size;
		if (usage & PIPE_TRANSFER_READ) {
			r600_copy_to_staging_texture(ctx, trans);
		}
	} else {
		/* the resource is mapped directly */
		trans->transfer.stride = rtex->surface.level[level].pitch_bytes;
		trans->transfer.layer_stride = rtex->surface.level[level].slice_size;
		offset = r600_texture_get_offset(rtex, level, box);
	}

	if (trans->staging) {
		buf = trans->staging;
		if (!rtex->is_depth && !(usage & PIPE_TRANSFER_READ))
			usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
	} else {
		buf = &rtex->resource;
	}

	if (!(map = r600_buffer_map_sync_with_rings(rctx, buf, usage))) {
		pipe_resource_reference((struct pipe_resource**)&trans->staging, NULL);
		FREE(trans);
		return NULL;
	}

	*ptransfer = &trans->transfer;
	return map + offset;
}
Example #8
static void *r600_buffer_transfer_map(struct pipe_context *ctx,
                                      struct pipe_resource *resource,
                                      unsigned level,
                                      unsigned usage,
                                      const struct pipe_box *box,
                                      struct pipe_transfer **ptransfer)
{
    struct r600_common_context *rctx = (struct r600_common_context*)ctx;
    struct r600_common_screen *rscreen = (struct r600_common_screen*)ctx->screen;
    struct r600_resource *rbuffer = r600_resource(resource);
    uint8_t *data;

    assert(box->x + box->width <= resource->width0);

    /* See if the buffer range being mapped has never been initialized,
     * in which case it can be mapped unsynchronized. */
    if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED) &&
            usage & PIPE_TRANSFER_WRITE &&
            !util_ranges_intersect(&rbuffer->valid_buffer_range, box->x, box->x + box->width)) {
        usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
    }

    /* If discarding the entire range, discard the whole resource instead. */
    if (usage & PIPE_TRANSFER_DISCARD_RANGE &&
            box->x == 0 && box->width == resource->width0) {
        usage |= PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE;
    }

    if (usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE &&
            !(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) {
        assert(usage & PIPE_TRANSFER_WRITE);

        /* Check if mapping this buffer would cause waiting for the GPU. */
        if (r600_rings_is_buffer_referenced(rctx, rbuffer->cs_buf, RADEON_USAGE_READWRITE) ||
                rctx->ws->buffer_is_busy(rbuffer->buf, RADEON_USAGE_READWRITE)) {
            rctx->invalidate_buffer(&rctx->b, &rbuffer->b.b);
        }
        /* At this point, the buffer is always idle. */
        usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
    }
    else if ((usage & PIPE_TRANSFER_DISCARD_RANGE) &&
             !(usage & PIPE_TRANSFER_UNSYNCHRONIZED) &&
             !(rscreen->debug_flags & DBG_NO_DISCARD_RANGE) &&
             (rscreen->has_cp_dma ||
              (rscreen->has_streamout &&
               /* The buffer range must be aligned to 4 with streamout. */
               box->x % 4 == 0 && box->width % 4 == 0))) {
        assert(usage & PIPE_TRANSFER_WRITE);

        /* Check if mapping this buffer would cause waiting for the GPU. */
        if (r600_rings_is_buffer_referenced(rctx, rbuffer->cs_buf, RADEON_USAGE_READWRITE) ||
                rctx->ws->buffer_is_busy(rbuffer->buf, RADEON_USAGE_READWRITE)) {
            /* Do a wait-free write-only transfer using a temporary buffer. */
            unsigned offset;
            struct r600_resource *staging = NULL;

            u_upload_alloc(rctx->uploader, 0, box->width + (box->x % R600_MAP_BUFFER_ALIGNMENT),
                           &offset, (struct pipe_resource**)&staging, (void**)&data);

            if (staging) {
                data += box->x % R600_MAP_BUFFER_ALIGNMENT;
                return r600_buffer_get_transfer(ctx, resource, level, usage, box,
                                                ptransfer, data, staging, offset);
            } else {
                return NULL; /* error, shouldn't occur though */
            }
        }
        /* At this point, the buffer is always idle (we checked it above). */
        usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
    }

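    /* mmap and synchronize with rings */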
    data = r600_buffer_map_sync_with_rings(rctx, rbuffer, usage);
    if (!data) {
        return NULL;
    }
    data += box->x;

    return r600_buffer_get_transfer(ctx, resource, level, usage, box,
                                    ptransfer, data, NULL, 0);
}