static boolean radeon_drm_cs_validate(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    boolean status =
        cs->csc->used_gart < cs->ws->info.gart_size * 0.8 &&
        cs->csc->used_vram < cs->ws->info.vram_size * 0.8;

    if (status) {
        cs->csc->validated_crelocs = cs->csc->crelocs;
    } else {
        /* Remove lately-added relocations. The validation failed with them
         * and the CS is about to be flushed because of that. Keep only
         * the already-validated relocations. */
        unsigned i;

        for (i = cs->csc->validated_crelocs; i < cs->csc->crelocs; i++) {
            p_atomic_dec(&cs->csc->relocs_bo[i]->num_cs_references);
            radeon_bo_reference(&cs->csc->relocs_bo[i], NULL);
        }
        cs->csc->crelocs = cs->csc->validated_crelocs;

        /* Flush if there are any relocs. Clean up otherwise. */
        if (cs->csc->crelocs) {
            cs->flush_cs(cs->flush_data, RADEON_FLUSH_ASYNC);
        } else {
            radeon_cs_context_cleanup(cs->csc);

            assert(cs->base.cdw == 0);
            if (cs->base.cdw != 0) {
                fprintf(stderr, "radeon: Unexpected error in %s.\n", __func__);
            }
        }
    }
    return status;
}
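The 80% GART/VRAM watermark above is what triggers an early flush: when it is exceeded, the winsys drops every relocation added since the last successful validation and either flushes asynchronously or cleans up the context. A minimal sketch of how a caller might drive such a validate callback, using hypothetical "example_" names rather than the actual driver code:

/* Hypothetical illustration only (not taken from the driver): how a caller
 * might combine a validate callback like radeon_drm_cs_validate() with its
 * own buffer setup. All "example_" names are assumptions. */
#include <stdbool.h>

struct example_cs {
    bool (*validate)(struct example_cs *cs);    /* e.g. wraps radeon_drm_cs_validate */
    void (*add_buffers)(struct example_cs *cs); /* adds the relocations for the next draw */
};

static void example_prepare_draw(struct example_cs *cs)
{
    cs->add_buffers(cs);

    if (!cs->validate(cs)) {
        /* Validation failed: the winsys dropped the relocations added since
         * the last successful validation and, if the CS still held validated
         * work, flushed it asynchronously. The caller starts over and re-adds
         * its buffers to the now-empty CS. */
        cs->add_buffers(cs);
    }
}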
static unsigned radeon_lookup_or_add_real_buffer(struct radeon_drm_cs *cs,
                                                 struct radeon_bo *bo)
{
    struct radeon_cs_context *csc = cs->csc;
    struct drm_radeon_cs_reloc *reloc;
    unsigned hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
    int i = -1;

    i = radeon_lookup_buffer(csc, bo);

    if (i >= 0) {
        /* For async DMA, every add_buffer call must add a buffer to the list
         * no matter how many duplicates there are. This is due to the fact
         * the DMA CS checker doesn't use NOP packets for offset patching,
         * but always uses the i-th buffer from the list to patch the i-th
         * offset. If there are N offsets in a DMA CS, there must also be N
         * buffers in the relocation list.
         *
         * This doesn't have to be done if virtual memory is enabled,
         * because there is no offset patching with virtual memory.
         */
        if (cs->ring_type != RING_DMA || cs->ws->info.r600_has_virtual_memory) {
            return i;
        }
    }

    /* New relocation, check if the backing array is large enough. */
    if (csc->num_relocs >= csc->max_relocs) {
        uint32_t size;
        csc->max_relocs = MAX2(csc->max_relocs + 16,
                               (unsigned)(csc->max_relocs * 1.3));

        size = csc->max_relocs * sizeof(csc->relocs_bo[0]);
        csc->relocs_bo = realloc(csc->relocs_bo, size);

        size = csc->max_relocs * sizeof(struct drm_radeon_cs_reloc);
        csc->relocs = realloc(csc->relocs, size);

        csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
    }

    /* Initialize the new relocation. */
    csc->relocs_bo[csc->num_relocs].bo = NULL;
    csc->relocs_bo[csc->num_relocs].u.real.priority_usage = 0;
    radeon_bo_reference(&csc->relocs_bo[csc->num_relocs].bo, bo);
    p_atomic_inc(&bo->num_cs_references);
    reloc = &csc->relocs[csc->num_relocs];
    reloc->handle = bo->handle;
    reloc->read_domains = 0;
    reloc->write_domain = 0;
    reloc->flags = 0;

    csc->reloc_indices_hashlist[hash] = csc->num_relocs;

    csc->chunks[1].length_dw += RELOC_DWORDS;

    return csc->num_relocs++;
}
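radeon_lookup_buffer() is referenced above but not shown in this section. A hedged, standalone sketch of the kind of lookup it implies: a single-slot hash table caches the most recent index for each bucket (slots initialized to -1, as the cleanup functions below do for reloc_indices_hashlist), and a backwards linear scan resolves collisions. All "example_" names and sizes are illustrative, not the real implementation:

/* Hypothetical sketch of a single-slot hash lookup with linear fallback. */
#define EXAMPLE_HASHLIST_SIZE 256

struct example_list {
    int      hashlist[EXAMPLE_HASHLIST_SIZE]; /* -1 means "empty slot" */
    unsigned count;                           /* number of valid entries */
    unsigned handles[1024];                   /* handle of entry i */
};

static int example_lookup(struct example_list *list, unsigned handle)
{
    unsigned hash = handle & (EXAMPLE_HASHLIST_SIZE - 1);
    int i = list->hashlist[hash];

    /* Fast path: the cached index is valid and matches this handle. */
    if (i >= 0 && (unsigned)i < list->count && list->handles[i] == handle)
        return i;

    /* Collision or stale slot: scan from the end and refresh the hash slot,
     * so repeated lookups of the same buffer stay on the fast path. */
    for (i = (int)list->count - 1; i >= 0; i--) {
        if (list->handles[i] == handle) {
            list->hashlist[hash] = i;
            return i;
        }
    }
    return -1;
}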
/* Add the given fence to a slab buffer fence list.
 *
 * There is a potential race condition when bo participates in submissions on
 * two or more threads simultaneously. Since we do not know which of the
 * submissions will be sent to the GPU first, we have to keep the fences
 * of all submissions.
 *
 * However, fences that belong to submissions that have already returned from
 * their respective ioctl do not have to be kept, because we know that they
 * will signal earlier.
 */
static void radeon_bo_slab_fence(struct radeon_bo *bo, struct radeon_bo *fence)
{
    unsigned dst;

    assert(fence->num_cs_references);

    /* Cleanup older fences */
    dst = 0;
    for (unsigned src = 0; src < bo->u.slab.num_fences; ++src) {
        if (bo->u.slab.fences[src]->num_cs_references) {
            bo->u.slab.fences[dst] = bo->u.slab.fences[src];
            dst++;
        } else {
            radeon_bo_reference(&bo->u.slab.fences[src], NULL);
        }
    }
    bo->u.slab.num_fences = dst;

    /* Check available space for the new fence */
    if (bo->u.slab.num_fences >= bo->u.slab.max_fences) {
        unsigned new_max_fences = bo->u.slab.max_fences + 1;
        struct radeon_bo **new_fences =
            REALLOC(bo->u.slab.fences,
                    bo->u.slab.max_fences * sizeof(*new_fences),
                    new_max_fences * sizeof(*new_fences));
        if (!new_fences) {
            fprintf(stderr, "radeon_bo_slab_fence: allocation failure, dropping fence\n");
            return;
        }

        bo->u.slab.fences = new_fences;
        bo->u.slab.max_fences = new_max_fences;
    }

    /* Add the new fence */
    bo->u.slab.fences[bo->u.slab.num_fences] = NULL;
    radeon_bo_reference(&bo->u.slab.fences[bo->u.slab.num_fences], fence);
    bo->u.slab.num_fences++;
}
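A hedged sketch of how the fence list built above might be consumed: a wait path walks the recorded fences and blocks on any submission that is still referenced by a CS, then drops the references. example_wait_idle() and example_wait_slab_entry_idle() are assumed names, not functions from this file, and the real driver additionally serializes this under a fence lock:

/* Assumed helper: block until the given fence buffer is idle. Stubbed here;
 * a real implementation would wait for the submission that owns the fence. */
static void example_wait_idle(struct radeon_bo *fence)
{
    (void)fence;
}

/* Hypothetical consumer of the per-slab-entry fence list. */
static void example_wait_slab_entry_idle(struct radeon_bo *bo)
{
    for (unsigned i = 0; i < bo->u.slab.num_fences; ++i) {
        if (bo->u.slab.fences[i]->num_cs_references) {
            /* The submission owning this fence may still be in flight. */
            example_wait_idle(bo->u.slab.fences[i]);
        }
        radeon_bo_reference(&bo->u.slab.fences[i], NULL);
    }
    bo->u.slab.num_fences = 0;
}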
static void radeon_cs_context_cleanup(struct radeon_cs_context *csc)
{
    unsigned i;

    for (i = 0; i < csc->num_relocs; i++) {
        p_atomic_dec(&csc->relocs_bo[i].bo->num_cs_references);
        radeon_bo_reference(&csc->relocs_bo[i].bo, NULL);
    }
    for (i = 0; i < csc->num_slab_buffers; ++i) {
        p_atomic_dec(&csc->slab_buffers[i].bo->num_cs_references);
        radeon_bo_reference(&csc->slab_buffers[i].bo, NULL);
    }

    csc->num_relocs = 0;
    csc->num_validated_relocs = 0;
    csc->num_slab_buffers = 0;
    csc->chunks[0].length_dw = 0;
    csc->chunks[1].length_dw = 0;

    for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
        csc->reloc_indices_hashlist[i] = -1;
    }
}
static void radeon_cs_context_cleanup(struct radeon_cs_context *csc)
{
    unsigned i;

    for (i = 0; i < csc->crelocs; i++) {
        p_atomic_dec(&csc->relocs_bo[i]->num_cs_references);
        radeon_bo_reference(&csc->relocs_bo[i], NULL);
    }

    csc->crelocs = 0;
    csc->validated_crelocs = 0;
    csc->chunks[0].length_dw = 0;
    csc->chunks[1].length_dw = 0;
    csc->used_gart = 0;
    csc->used_vram = 0;

    memset(csc->is_handle_added, 0, sizeof(csc->is_handle_added));
}
static int radeon_lookup_or_add_slab_buffer(struct radeon_drm_cs *cs,
                                            struct radeon_bo *bo)
{
    struct radeon_cs_context *csc = cs->csc;
    unsigned hash;
    struct radeon_bo_item *item;
    int idx;
    int real_idx;

    idx = radeon_lookup_buffer(csc, bo);
    if (idx >= 0)
        return idx;

    real_idx = radeon_lookup_or_add_real_buffer(cs, bo->u.slab.real);

    /* Check if the backing array is large enough. */
    if (csc->num_slab_buffers >= csc->max_slab_buffers) {
        unsigned new_max = MAX2(csc->max_slab_buffers + 16,
                                (unsigned)(csc->max_slab_buffers * 1.3));
        struct radeon_bo_item *new_buffers =
            REALLOC(csc->slab_buffers,
                    csc->max_slab_buffers * sizeof(*new_buffers),
                    new_max * sizeof(*new_buffers));
        if (!new_buffers) {
            fprintf(stderr, "radeon_lookup_or_add_slab_buffer: allocation failure\n");
            return -1;
        }

        csc->max_slab_buffers = new_max;
        csc->slab_buffers = new_buffers;
    }

    /* Initialize the new relocation. */
    idx = csc->num_slab_buffers++;
    item = &csc->slab_buffers[idx];

    item->bo = NULL;
    item->u.slab.real_idx = real_idx;
    radeon_bo_reference(&item->bo, bo);
    p_atomic_inc(&bo->num_cs_references);

    hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
    csc->reloc_indices_hashlist[hash] = idx;

    return idx;
}
static void radeon_cs_context_cleanup(struct radeon_cs_context *csc)
{
    unsigned i;

    for (i = 0; i < csc->crelocs; i++) {
        p_atomic_dec(&csc->relocs_bo[i]->num_cs_references);
        radeon_bo_reference(&csc->relocs_bo[i], NULL);
    }

    csc->crelocs = 0;
    csc->validated_crelocs = 0;
    csc->chunks[0].length_dw = 0;
    csc->chunks[1].length_dw = 0;
    csc->used_gart = 0;
    csc->used_vram = 0;

    for (i = 0; i < Elements(csc->reloc_indices_hashlist); i++) {
        csc->reloc_indices_hashlist[i] = -1;
    }
}
static unsigned radeon_add_reloc(struct radeon_drm_cs *cs,
                                 struct radeon_bo *bo,
                                 enum radeon_bo_usage usage,
                                 enum radeon_bo_domain domains,
                                 enum radeon_bo_domain *added_domains)
{
    struct radeon_cs_context *csc = cs->csc;
    struct drm_radeon_cs_reloc *reloc;
    unsigned hash = bo->handle & (sizeof(csc->is_handle_added)-1);
    enum radeon_bo_domain rd = usage & RADEON_USAGE_READ ? domains : 0;
    enum radeon_bo_domain wd = usage & RADEON_USAGE_WRITE ? domains : 0;
    bool update_hash = TRUE;
    int i;

    *added_domains = 0;
    if (csc->is_handle_added[hash]) {
        i = csc->reloc_indices_hashlist[hash];
        reloc = &csc->relocs[i];
        if (reloc->handle != bo->handle) {
            /* Hash collision, look for the BO in the list of relocs linearly. */
            for (i = csc->crelocs - 1; i >= 0; i--) {
                reloc = &csc->relocs[i];
                if (reloc->handle == bo->handle) {
                    /*printf("write_reloc collision, hash: %i, handle: %i\n", hash, bo->handle);*/
                    break;
                }
            }
        }

        if (i >= 0) {
            /* On the DMA ring we must emit as many relocations as there are
             * uses of the buffer, so each call to this function has to add
             * the buffer to the relocation list again.
             *
             * Do not update the hash table for the DMA ring, so that the hash
             * always points to the first relocation of the buffer, which is
             * the one the kernel uses for memory placement. The following
             * relocations are ignored for memory placement, but the kernel
             * still uses them to patch the command stream with the proper
             * buffer offsets.
             */
            update_hash = FALSE;
            update_reloc_domains(reloc, rd, wd, added_domains);
            if (cs->base.ring_type != RING_DMA) {
                csc->reloc_indices_hashlist[hash] = i;
                return i;
            }
        }
    }

    /* New relocation, check if the backing array is large enough. */
    if (csc->crelocs >= csc->nrelocs) {
        uint32_t size;
        csc->nrelocs += 10;

        size = csc->nrelocs * sizeof(struct radeon_bo*);
        csc->relocs_bo = realloc(csc->relocs_bo, size);

        size = csc->nrelocs * sizeof(struct drm_radeon_cs_reloc);
        csc->relocs = realloc(csc->relocs, size);

        csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
    }

    /* Initialize the new relocation. */
    csc->relocs_bo[csc->crelocs] = NULL;
    radeon_bo_reference(&csc->relocs_bo[csc->crelocs], bo);
    p_atomic_inc(&bo->num_cs_references);
    reloc = &csc->relocs[csc->crelocs];
    reloc->handle = bo->handle;
    reloc->read_domains = rd;
    reloc->write_domain = wd;
    reloc->flags = 0;

    csc->is_handle_added[hash] = TRUE;
    if (update_hash) {
        csc->reloc_indices_hashlist[hash] = csc->crelocs;
    }

    csc->chunks[1].length_dw += RELOC_DWORDS;

    *added_domains = rd | wd;
    return csc->crelocs++;
}
static unsigned radeon_add_reloc(struct radeon_cs_context *csc,
                                 struct radeon_bo *bo,
                                 enum radeon_bo_usage usage,
                                 enum radeon_bo_domain domains,
                                 enum radeon_bo_domain *added_domains)
{
    struct drm_radeon_cs_reloc *reloc;
    unsigned i;
    unsigned hash = bo->handle & (sizeof(csc->is_handle_added)-1);
    enum radeon_bo_domain rd = usage & RADEON_USAGE_READ ? domains : 0;
    enum radeon_bo_domain wd = usage & RADEON_USAGE_WRITE ? domains : 0;

    if (csc->is_handle_added[hash]) {
        i = csc->reloc_indices_hashlist[hash];
        reloc = &csc->relocs[i];
        if (reloc->handle == bo->handle) {
            update_reloc_domains(reloc, rd, wd, added_domains);
            return i;
        }

        /* Hash collision, look for the BO in the list of relocs linearly. */
        for (i = csc->crelocs; i != 0;) {
            --i;
            reloc = &csc->relocs[i];
            if (reloc->handle == bo->handle) {
                update_reloc_domains(reloc, rd, wd, added_domains);
                csc->reloc_indices_hashlist[hash] = i;
                /*printf("write_reloc collision, hash: %i, handle: %i\n", hash, bo->handle);*/
                return i;
            }
        }
    }

    /* New relocation, check if the backing array is large enough. */
    if (csc->crelocs >= csc->nrelocs) {
        uint32_t size;
        csc->nrelocs += 10;

        size = csc->nrelocs * sizeof(struct radeon_bo*);
        csc->relocs_bo = realloc(csc->relocs_bo, size);

        size = csc->nrelocs * sizeof(struct drm_radeon_cs_reloc);
        csc->relocs = realloc(csc->relocs, size);

        csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
    }

    /* Initialize the new relocation. */
    csc->relocs_bo[csc->crelocs] = NULL;
    radeon_bo_reference(&csc->relocs_bo[csc->crelocs], bo);
    p_atomic_inc(&bo->num_cs_references);
    reloc = &csc->relocs[csc->crelocs];
    reloc->handle = bo->handle;
    reloc->read_domains = rd;
    reloc->write_domain = wd;
    reloc->flags = 0;

    csc->is_handle_added[hash] = TRUE;
    csc->reloc_indices_hashlist[hash] = csc->crelocs;

    csc->chunks[1].length_dw += RELOC_DWORDS;

    *added_domains = rd | wd;
    return csc->crelocs++;
}
static unsigned radeon_add_buffer(struct radeon_drm_cs *cs,
                                  struct radeon_bo *bo,
                                  enum radeon_bo_usage usage,
                                  enum radeon_bo_domain domains,
                                  unsigned priority,
                                  enum radeon_bo_domain *added_domains)
{
    struct radeon_cs_context *csc = cs->csc;
    struct drm_radeon_cs_reloc *reloc;
    unsigned hash = bo->handle & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
    enum radeon_bo_domain rd = usage & RADEON_USAGE_READ ? domains : 0;
    enum radeon_bo_domain wd = usage & RADEON_USAGE_WRITE ? domains : 0;
    int i = -1;

    assert(priority < 64);
    *added_domains = 0;

    i = radeon_lookup_buffer(csc, bo);

    if (i >= 0) {
        reloc = &csc->relocs[i];
        update_reloc(reloc, rd, wd, priority / 4, added_domains);
        csc->relocs_bo[i].priority_usage |= 1llu << priority;

        /* For async DMA, every add_buffer call must add a buffer to the list
         * no matter how many duplicates there are. This is due to the fact
         * the DMA CS checker doesn't use NOP packets for offset patching,
         * but always uses the i-th buffer from the list to patch the i-th
         * offset. If there are N offsets in a DMA CS, there must also be N
         * buffers in the relocation list.
         *
         * This doesn't have to be done if virtual memory is enabled,
         * because there is no offset patching with virtual memory.
         */
        if (cs->ring_type != RING_DMA || cs->ws->info.has_virtual_memory) {
            return i;
        }
    }

    /* New relocation, check if the backing array is large enough. */
    if (csc->crelocs >= csc->nrelocs) {
        uint32_t size;
        csc->nrelocs += 10;

        size = csc->nrelocs * sizeof(csc->relocs_bo[0]);
        csc->relocs_bo = realloc(csc->relocs_bo, size);

        size = csc->nrelocs * sizeof(struct drm_radeon_cs_reloc);
        csc->relocs = realloc(csc->relocs, size);

        csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
    }

    /* Initialize the new relocation. */
    csc->relocs_bo[csc->crelocs].bo = NULL;
    csc->relocs_bo[csc->crelocs].priority_usage = 1llu << priority;
    radeon_bo_reference(&csc->relocs_bo[csc->crelocs].bo, bo);
    p_atomic_inc(&bo->num_cs_references);
    reloc = &csc->relocs[csc->crelocs];
    reloc->handle = bo->handle;
    reloc->read_domains = rd;
    reloc->write_domain = wd;
    reloc->flags = priority / 4;

    csc->reloc_indices_hashlist[hash] = csc->crelocs;

    csc->chunks[1].length_dw += RELOC_DWORDS;

    *added_domains = rd | wd;
    return csc->crelocs++;
}
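The priority handling above splits one value two ways: the full driver-level priority (0..63) is recorded as a bit in priority_usage, while the kernel-visible reloc->flags only receives 16 levels, hence the division by 4 (the variant shown after it clamps with MIN2(priority, 15) and passes the clamped value directly). A standalone restatement with illustrative names, not driver code:

/* Hypothetical restatement of the priority encoding used above. */
#include <assert.h>
#include <stdint.h>

static void example_encode_priority(unsigned priority,
                                    uint64_t *priority_usage,
                                    uint32_t *reloc_flags)
{
    assert(priority < 64);
    *priority_usage |= 1llu << priority; /* one bit per driver-level priority */
    *reloc_flags = priority / 4;         /* 64 driver levels -> 16 kernel levels */
}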
static unsigned radeon_add_reloc(struct radeon_drm_cs *cs,
                                 struct radeon_bo *bo,
                                 enum radeon_bo_usage usage,
                                 enum radeon_bo_domain domains,
                                 unsigned priority,
                                 enum radeon_bo_domain *added_domains)
{
    struct radeon_cs_context *csc = cs->csc;
    struct drm_radeon_cs_reloc *reloc;
    unsigned hash = bo->handle & (sizeof(csc->is_handle_added)-1);
    enum radeon_bo_domain rd = usage & RADEON_USAGE_READ ? domains : 0;
    enum radeon_bo_domain wd = usage & RADEON_USAGE_WRITE ? domains : 0;
    bool update_hash = TRUE;
    int i;

    priority = MIN2(priority, 15);
    *added_domains = 0;

    if (csc->is_handle_added[hash]) {
        i = csc->reloc_indices_hashlist[hash];
        reloc = &csc->relocs[i];
        if (reloc->handle != bo->handle) {
            /* Hash collision, look for the BO in the list of relocs linearly. */
            for (i = csc->crelocs - 1; i >= 0; i--) {
                reloc = &csc->relocs[i];
                if (reloc->handle == bo->handle) {
                    /*printf("write_reloc collision, hash: %i, handle: %i\n", hash, bo->handle);*/
                    break;
                }
            }
        }

        if (i >= 0) {
            update_reloc(reloc, rd, wd, priority, added_domains);

            /* For async DMA, every add_reloc call must add a buffer to the list
             * no matter how many duplicates there are. This is due to the fact
             * the DMA CS checker doesn't use NOP packets for offset patching,
             * but always uses the i-th buffer from the list to patch the i-th
             * offset. If there are N offsets in a DMA CS, there must also be N
             * buffers in the relocation list.
             *
             * This doesn't have to be done if virtual memory is enabled,
             * because there is no offset patching with virtual memory.
             */
            if (cs->base.ring_type != RING_DMA || cs->ws->info.r600_virtual_address) {
                csc->reloc_indices_hashlist[hash] = i;
                return i;
            }
            update_hash = FALSE;
        }
    }

    /* New relocation, check if the backing array is large enough. */
    if (csc->crelocs >= csc->nrelocs) {
        uint32_t size;
        csc->nrelocs += 10;

        size = csc->nrelocs * sizeof(struct radeon_bo*);
        csc->relocs_bo = realloc(csc->relocs_bo, size);

        size = csc->nrelocs * sizeof(struct drm_radeon_cs_reloc);
        csc->relocs = realloc(csc->relocs, size);

        csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
    }

    /* Initialize the new relocation. */
    csc->relocs_bo[csc->crelocs] = NULL;
    radeon_bo_reference(&csc->relocs_bo[csc->crelocs], bo);
    p_atomic_inc(&bo->num_cs_references);
    reloc = &csc->relocs[csc->crelocs];
    reloc->handle = bo->handle;
    reloc->read_domains = rd;
    reloc->write_domain = wd;
    reloc->flags = priority;

    csc->is_handle_added[hash] = TRUE;
    if (update_hash) {
        csc->reloc_indices_hashlist[hash] = csc->crelocs;
    }

    csc->chunks[1].length_dw += RELOC_DWORDS;

    *added_domains = rd | wd;
    return csc->crelocs++;
}