Example #1
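/* Return the reloc index of this buffer in the current CS context, or -1 if it
 * is not referenced. */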
static int radeon_drm_cs_get_reloc(struct radeon_winsys_cs *rcs,
                                   struct radeon_winsys_cs_handle *buf)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

    return radeon_get_reloc(cs->csc, (struct radeon_bo*)buf);
}
Example #2
static bool radeon_bo_is_referenced(struct radeon_cmdbuf *rcs,
                                    struct pb_buffer *_buf,
                                    enum radeon_bo_usage usage)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_bo *bo = (struct radeon_bo*)_buf;
    int index;

    if (!bo->num_cs_references)
        return false;

    index = radeon_lookup_buffer(cs->csc, bo);
    if (index == -1)
        return false;

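    /* Suballocated (slab) buffers have no GEM handle of their own; they share
     * the reloc entry of their backing real buffer. */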
    if (!bo->handle)
        index = cs->csc->slab_buffers[index].u.slab.real_idx;

    if ((usage & RADEON_USAGE_WRITE) && cs->csc->relocs[index].write_domain)
        return true;
    if ((usage & RADEON_USAGE_READ) && cs->csc->relocs[index].read_domains)
        return true;

    return false;
}
Example #3
static boolean radeon_cs_request_feature(struct radeon_winsys_cs *rcs,
                                         enum radeon_feature_id fid,
                                         boolean enable)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

    switch (fid) {
    case RADEON_FID_R300_HYPERZ_ACCESS:
        if (debug_get_bool_option("RADEON_HYPERZ", FALSE)) {
            return radeon_set_fd_access(cs, &cs->ws->hyperz_owner,
                                        &cs->ws->hyperz_owner_mutex,
                                        RADEON_INFO_WANT_HYPERZ, enable);
        } else {
            return FALSE;
        }

    case RADEON_FID_R300_CMASK_ACCESS:
        if (debug_get_bool_option("RADEON_CMASK", FALSE)) {
            return radeon_set_fd_access(cs, &cs->ws->cmask_owner,
                                        &cs->ws->cmask_owner_mutex,
                                        RADEON_INFO_WANT_CMASK, enable);
        } else {
            return FALSE;
        }
    }
    return FALSE;
}
Example #4
static boolean radeon_drm_cs_validate(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    boolean status =
        cs->csc->used_gart < cs->ws->info.gart_size * 0.8 &&
        cs->csc->used_vram < cs->ws->info.vram_size * 0.8;

    if (status) {
        cs->csc->validated_crelocs = cs->csc->crelocs;
    } else {
        /* Remove the recently added relocations. Validation failed with them,
         * and the CS is about to be flushed because of that. Keep only
         * the already-validated relocations. */
        unsigned i;

        for (i = cs->csc->validated_crelocs; i < cs->csc->crelocs; i++) {
            p_atomic_dec(&cs->csc->relocs_bo[i]->num_cs_references);
            radeon_bo_reference(&cs->csc->relocs_bo[i], NULL);
        }
        cs->csc->crelocs = cs->csc->validated_crelocs;

        /* Flush if there are any relocs. Clean up otherwise. */
        if (cs->csc->crelocs) {
            cs->flush_cs(cs->flush_data, RADEON_FLUSH_ASYNC);
        } else {
            radeon_cs_context_cleanup(cs->csc);

            assert(cs->base.cdw == 0);
            if (cs->base.cdw != 0) {
                fprintf(stderr, "radeon: Unexpected error in %s.\n", __func__);
            }
        }
    }
    return status;
}
Example #5
static int radeon_drm_cs_lookup_buffer(struct radeon_cmdbuf *rcs,
                                       struct pb_buffer *buf)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

    return radeon_lookup_buffer(cs->csc, (struct radeon_bo*)buf);
}
Example #6
static int radeon_drm_cs_lookup_buffer(struct radeon_winsys_cs *rcs,
                                       struct radeon_winsys_cs_handle *buf)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

    return radeon_lookup_buffer(cs->csc, (struct radeon_bo*)buf);
}
Example #7
static boolean radeon_bo_is_referenced(struct radeon_winsys_cs *rcs,
                                       struct radeon_winsys_cs_handle *_buf)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_bo *bo = (struct radeon_bo*)_buf;

    return radeon_bo_is_referenced_by_cs(cs, bo);
}
Example #8
/*
 * Make sure previous submissions of this CS have completed
 */
void radeon_drm_cs_sync_flush(struct radeon_cmdbuf *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

    /* Wait for any pending ioctl of this CS to complete. */
    if (util_queue_is_initialized(&cs->ws->cs_queue))
        util_queue_fence_wait(&cs->flush_completed);
}
Example #9
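/* Record the callback and user data used to flush this CS. */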
static void radeon_drm_cs_set_flush(struct radeon_winsys_cs *rcs,
                                    void (*flush)(void *ctx, unsigned flags),
                                    void *user)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    cs->flush_cs = flush;
    cs->flush_data = user;
}
Example #10
static void radeon_drm_cs_flush(struct radeon_winsys_cs *rcs, unsigned flags)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_cs_context *tmp;

    if (rcs->cdw > RADEON_MAX_CMDBUF_DWORDS) {
       fprintf(stderr, "radeon: command stream overflowed\n");
    }

    radeon_drm_cs_sync_flush(cs);

    /* Flip command streams. */
    tmp = cs->csc;
    cs->csc = cs->cst;
    cs->cst = tmp;

    /* If the CS is not empty and has not overflowed, emit it, possibly in a separate thread. */
    if (cs->base.cdw && cs->base.cdw <= RADEON_MAX_CMDBUF_DWORDS) {
        unsigned i, crelocs = cs->cst->crelocs;

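        /* Chunk 0 is the command buffer (IB); its length is the dword count. */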
        cs->cst->chunks[0].length_dw = cs->base.cdw;

        for (i = 0; i < crelocs; i++) {
            /* Update the number of active asynchronous CS ioctls for the buffer. */
            p_atomic_inc(&cs->cst->relocs_bo[i]->num_active_ioctls);
        }

        cs->cst->flags[0] = 0;
        cs->cst->flags[1] = RADEON_CS_RING_GFX;
        cs->cst->cs.num_chunks = 2;
        if (flags & RADEON_FLUSH_KEEP_TILING_FLAGS) {
            cs->cst->flags[0] |= RADEON_CS_KEEP_TILING_FLAGS;
            cs->cst->cs.num_chunks = 3;
        }
        if (cs->ws->info.r600_virtual_address) {
            cs->cst->flags[0] |= RADEON_CS_USE_VM;
            cs->cst->cs.num_chunks = 3;
        }
        if (flags & RADEON_FLUSH_COMPUTE) {
            cs->cst->flags[1] = RADEON_CS_RING_COMPUTE;
            cs->cst->cs.num_chunks = 3;
        }

        if (cs->thread &&
            (flags & RADEON_FLUSH_ASYNC)) {
            cs->flush_started = 1;
            pipe_semaphore_signal(&cs->flush_queued);
        } else {
            radeon_drm_cs_emit_ioctl_oneshot(cs->cst);
        }
    } else {
        radeon_cs_context_cleanup(cs->cst);
    }

    /* Prepare a new CS. */
    cs->base.buf = cs->csc->buf;
    cs->base.cdw = 0;
}
Example #11
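/* Check that adding the given VRAM and GTT amounts keeps usage below ~70% of
 * each heap. */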
static boolean radeon_drm_cs_memory_below_limit(struct radeon_winsys_cs *rcs, uint64_t vram, uint64_t gtt)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    boolean status =
        (cs->csc->used_gart + gtt) < cs->ws->info.gart_size * 0.7 &&
        (cs->csc->used_vram + vram) < cs->ws->info.vram_size * 0.7;

    return status;
}
Example #12
/*
 * Make sure previous submissions of this CS have completed
 */
void radeon_drm_cs_sync_flush(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

    /* Wait for any pending ioctl to complete. */
    if (cs->ws->thread && cs->flush_started) {
        pipe_semaphore_wait(&cs->flush_completed);
        cs->flush_started = 0;
    }
}
Example #13
static void radeon_drm_cs_destroy(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

    radeon_drm_cs_sync_flush(rcs);
    pipe_semaphore_destroy(&cs->flush_completed);
    radeon_cs_context_cleanup(&cs->csc1);
    radeon_cs_context_cleanup(&cs->csc2);
    p_atomic_dec(&cs->ws->num_cs);
    radeon_destroy_cs_context(&cs->csc1);
    radeon_destroy_cs_context(&cs->csc2);
    FREE(cs);
}
Example #14
static bool radeon_drm_cs_memory_below_limit(struct radeon_winsys_cs *rcs, uint64_t vram, uint64_t gtt)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

    vram += cs->csc->used_vram;
    gtt += cs->csc->used_gart;

    /* Anything that goes above the VRAM size should go to GTT. */
    if (vram > cs->ws->info.vram_size)
        gtt += vram - cs->ws->info.vram_size;

    /* Now we just need to check if we have enough GTT. */
    return gtt < cs->ws->info.gart_size * 0.7;
}
Example #15
static void radeon_drm_cs_destroy(struct radeon_cmdbuf *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

    radeon_drm_cs_sync_flush(rcs);
    util_queue_fence_destroy(&cs->flush_completed);
    radeon_cs_context_cleanup(&cs->csc1);
    radeon_cs_context_cleanup(&cs->csc2);
    p_atomic_dec(&cs->ws->num_cs);
    radeon_destroy_cs_context(&cs->csc1);
    radeon_destroy_cs_context(&cs->csc2);
    radeon_fence_reference(&cs->next_fence, NULL);
    FREE(cs);
}
Example #16
static void radeon_drm_cs_write_reloc(struct radeon_winsys_cs *rcs,
                                      struct radeon_winsys_cs_handle *buf)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_bo *bo = (struct radeon_bo*)buf;
    int index = radeon_get_reloc(cs->csc, bo);

    if (index == -1) {
        fprintf(stderr, "radeon: Cannot get a relocation in %s.\n", __func__);
        return;
    }

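    /* Emit a type-3 NOP packet whose payload is the dword offset of the
     * relocation entry in the relocation chunk. */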
    OUT_CS(&cs->base, 0xc0001000);
    OUT_CS(&cs->base, index * RELOC_DWORDS);
}
Example #17
static unsigned radeon_drm_cs_get_buffer_list(struct radeon_winsys_cs *rcs,
                                              struct radeon_bo_list_item *list)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    int i;

    if (list) {
        for (i = 0; i < cs->csc->crelocs; i++) {
            pb_reference(&list[i].buf, &cs->csc->relocs_bo[i].bo->base);
            list[i].vm_address = cs->csc->relocs_bo[i].bo->va;
            list[i].priority_usage = cs->csc->relocs_bo[i].priority_usage;
        }
    }
    return cs->csc->crelocs;
}
Example #18
static unsigned radeon_drm_cs_get_buffer_list(struct radeon_cmdbuf *rcs,
                                              struct radeon_bo_list_item *list)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    int i;

    if (list) {
        for (i = 0; i < cs->csc->num_relocs; i++) {
            list[i].bo_size = cs->csc->relocs_bo[i].bo->base.size;
            list[i].vm_address = cs->csc->relocs_bo[i].bo->va;
            list[i].priority_usage = cs->csc->relocs_bo[i].u.real.priority_usage;
        }
    }
    return cs->csc->num_relocs;
}
Example #19
static struct pipe_fence_handle *
radeon_cs_create_fence(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct pb_buffer *fence;

    /* Create a fence, which is a dummy BO. */
    fence = cs->ws->base.buffer_create(&cs->ws->base, 1, 1,
                                       RADEON_DOMAIN_GTT, 0);
    /* Add the fence as a dummy relocation. */
    cs->ws->base.cs_add_buffer(rcs, fence,
                              RADEON_USAGE_READWRITE, RADEON_DOMAIN_GTT,
                              RADEON_PRIO_FENCE);
    return (struct pipe_fence_handle*)fence;
}
Example #20
static unsigned radeon_drm_cs_add_buffer(struct radeon_cmdbuf *rcs,
                                        struct pb_buffer *buf,
                                        enum radeon_bo_usage usage,
                                        enum radeon_bo_domain domains,
                                        enum radeon_bo_priority priority)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_bo *bo = (struct radeon_bo*)buf;
    enum radeon_bo_domain added_domains;

    /* If VRAM is just stolen system memory, allow both VRAM and
     * GTT, whichever has free space. If a buffer is evicted from
     * VRAM to GTT, it will stay there.
     */
    if (!cs->ws->info.has_dedicated_vram)
        domains |= RADEON_DOMAIN_GTT;

    enum radeon_bo_domain rd = usage & RADEON_USAGE_READ ? domains : 0;
    enum radeon_bo_domain wd = usage & RADEON_USAGE_WRITE ? domains : 0;
    struct drm_radeon_cs_reloc *reloc;
    int index;

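    /* Buffers without a GEM handle are slab suballocations; they are tracked
     * separately and use the reloc entry of their backing real buffer. */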
    if (!bo->handle) {
        index = radeon_lookup_or_add_slab_buffer(cs, bo);
        if (index < 0)
            return 0;

        index = cs->csc->slab_buffers[index].u.slab.real_idx;
    } else {
        index = radeon_lookup_or_add_real_buffer(cs, bo);
    }

    reloc = &cs->csc->relocs[index];
    added_domains = (rd | wd) & ~(reloc->read_domains | reloc->write_domain);
    reloc->read_domains |= rd;
    reloc->write_domain |= wd;
    reloc->flags = MAX2(reloc->flags, priority);
    cs->csc->relocs_bo[index].u.real.priority_usage |= 1llu << priority;

    if (added_domains & RADEON_DOMAIN_VRAM)
        cs->base.used_vram += bo->base.size;
    else if (added_domains & RADEON_DOMAIN_GTT)
        cs->base.used_gart += bo->base.size;

    return index;
}
Example #21
static unsigned radeon_drm_cs_add_reloc(struct radeon_winsys_cs *rcs,
                                        struct radeon_winsys_cs_handle *buf,
                                        enum radeon_bo_usage usage,
                                        enum radeon_bo_domain domains)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_bo *bo = (struct radeon_bo*)buf;
    enum radeon_bo_domain added_domains;
    unsigned index = radeon_add_reloc(cs, bo, usage, domains, &added_domains);

    if (added_domains & RADEON_DOMAIN_GTT)
        cs->csc->used_gart += bo->base.size;
    if (added_domains & RADEON_DOMAIN_VRAM)
        cs->csc->used_vram += bo->base.size;

    return index;
}
Example #22
static void radeon_drm_cs_flush(struct radeon_winsys_cs *rcs, unsigned flags)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_cs_context *tmp;

    radeon_drm_cs_sync_flush(cs);

    /* Flip command streams. */
    tmp = cs->csc;
    cs->csc = cs->cst;
    cs->cst = tmp;

    /* If the CS is not empty, emit it in a separate thread. */
    if (cs->base.cdw) {
        unsigned i, crelocs = cs->cst->crelocs;

        cs->cst->chunks[0].length_dw = cs->base.cdw;

        for (i = 0; i < crelocs; i++) {
            /* Update the number of active asynchronous CS ioctls for the buffer. */
            p_atomic_inc(&cs->cst->relocs_bo[i]->num_active_ioctls);
        }

        if (flags & RADEON_FLUSH_KEEP_TILING_FLAGS) {
            cs->cst->cs.num_chunks = 3;
            cs->cst->flags = RADEON_CS_KEEP_TILING_FLAGS;
        } else {
            cs->cst->cs.num_chunks = 2;
        }

        if (cs->thread &&
            (flags & RADEON_FLUSH_ASYNC)) {
            cs->flush_started = 1;
            pipe_semaphore_signal(&cs->flush_queued);
        } else {
            radeon_drm_cs_emit_ioctl_oneshot(cs->cst);
        }
    } else {
        radeon_cs_context_cleanup(cs->cst);
    }

    /* Prepare a new CS. */
    cs->base.buf = cs->csc->buf;
    cs->base.cdw = 0;
}
Example #23
static struct pipe_fence_handle *
radeon_drm_cs_get_next_fence(struct radeon_cmdbuf *rcs)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
   struct pipe_fence_handle *fence = NULL;

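   /* Reuse the fence already created for the current IB, if any. */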
   if (cs->next_fence) {
      radeon_fence_reference(&fence, cs->next_fence);
      return fence;
   }

   fence = radeon_cs_create_fence(rcs);
   if (!fence)
      return NULL;

   radeon_fence_reference(&cs->next_fence, fence);
   return fence;
}
Example #24
static void radeon_drm_cs_destroy(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    radeon_drm_cs_sync_flush(cs);
    if (cs->thread) {
        cs->kill_thread = 1;
        pipe_semaphore_signal(&cs->flush_queued);
        pipe_semaphore_wait(&cs->flush_completed);
        pipe_thread_wait(cs->thread);
    }
    pipe_semaphore_destroy(&cs->flush_queued);
    pipe_semaphore_destroy(&cs->flush_completed);
    radeon_cs_context_cleanup(&cs->csc1);
    radeon_cs_context_cleanup(&cs->csc2);
    p_atomic_dec(&cs->ws->num_cs);
    radeon_destroy_cs_context(&cs->csc1);
    radeon_destroy_cs_context(&cs->csc2);
    FREE(cs);
}
Example #25
static unsigned radeon_drm_cs_add_buffer(struct radeon_winsys_cs *rcs,
                                        struct pb_buffer *buf,
                                        enum radeon_bo_usage usage,
                                        enum radeon_bo_domain domains,
                                        enum radeon_bo_priority priority)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_bo *bo = (struct radeon_bo*)buf;
    enum radeon_bo_domain added_domains;
    unsigned index = radeon_add_buffer(cs, bo, usage, domains, priority,
                                       &added_domains);

    if (added_domains & RADEON_DOMAIN_VRAM)
        cs->base.used_vram += bo->base.size;
    else if (added_domains & RADEON_DOMAIN_GTT)
        cs->base.used_gart += bo->base.size;

    return index;
}
Example #26
static boolean radeon_cs_request_feature(struct radeon_winsys_cs *rcs,
                                         enum radeon_feature_id fid,
                                         boolean enable)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

    switch (fid) {
    case RADEON_FID_R300_HYPERZ_ACCESS:
        return radeon_set_fd_access(cs, &cs->ws->hyperz_owner,
                                    &cs->ws->hyperz_owner_mutex,
                                    RADEON_INFO_WANT_HYPERZ, "Hyper-Z",
                                    enable);

    case RADEON_FID_R300_CMASK_ACCESS:
        return radeon_set_fd_access(cs, &cs->ws->cmask_owner,
                                    &cs->ws->cmask_owner_mutex,
                                    RADEON_INFO_WANT_CMASK, "AA optimizations",
                                    enable);
    }
    return FALSE;
}
Example #27
static boolean radeon_bo_is_referenced(struct radeon_winsys_cs *rcs,
                                       struct radeon_winsys_cs_handle *_buf,
                                       enum radeon_bo_usage usage)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_bo *bo = (struct radeon_bo*)_buf;
    int index;

    if (!bo->num_cs_references)
        return FALSE;

    index = radeon_get_reloc(cs->csc, bo);
    if (index == -1)
        return FALSE;

    if ((usage & RADEON_USAGE_WRITE) && cs->csc->relocs[index].write_domain)
        return TRUE;
    if ((usage & RADEON_USAGE_READ) && cs->csc->relocs[index].read_domains)
        return TRUE;

    return FALSE;
}
Example #28
static unsigned radeon_drm_cs_add_buffer(struct radeon_winsys_cs *rcs,
                                        struct pb_buffer *buf,
                                        enum radeon_bo_usage usage,
                                        enum radeon_bo_domain domains,
                                        enum radeon_bo_priority priority)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_bo *bo = (struct radeon_bo*)buf;
    enum radeon_bo_domain added_domains;
    enum radeon_bo_domain rd = usage & RADEON_USAGE_READ ? domains : 0;
    enum radeon_bo_domain wd = usage & RADEON_USAGE_WRITE ? domains : 0;
    struct drm_radeon_cs_reloc *reloc;
    int index;

    if (!bo->handle) {
        index = radeon_lookup_or_add_slab_buffer(cs, bo);
        if (index < 0)
            return 0;

        index = cs->csc->slab_buffers[index].u.slab.real_idx;
    } else {
        index = radeon_lookup_or_add_real_buffer(cs, bo);
    }

    reloc = &cs->csc->relocs[index];
    added_domains = (rd | wd) & ~(reloc->read_domains | reloc->write_domain);
    reloc->read_domains |= rd;
    reloc->write_domain |= wd;
    reloc->flags = MAX2(reloc->flags, priority);
    cs->csc->relocs_bo[index].u.real.priority_usage |= 1llu << priority;

    if (added_domains & RADEON_DOMAIN_VRAM)
        cs->base.used_vram += bo->base.size;
    else if (added_domains & RADEON_DOMAIN_GTT)
        cs->base.used_gart += bo->base.size;

    return index;
}
Example #29
static void radeon_bo_set_tiling(struct pb_buffer *_buf,
                                 struct radeon_winsys_cs *rcs,
                                 enum radeon_bo_layout microtiled,
                                 enum radeon_bo_layout macrotiled,
                                 uint32_t pitch)
{
    struct radeon_bo *bo = get_radeon_bo(_buf);
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct drm_radeon_gem_set_tiling args = {};

    /* Tiling determines how the DRM treats the buffer data.
     * We must flush the CS when changing the tiling if the buffer is referenced by it. */
    if (cs && radeon_bo_is_referenced_by_cs(cs, bo)) {
        cs->flush_cs(cs->flush_data, 0);
    }

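    /* Wait for all in-flight asynchronous CS ioctls that reference this BO. */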
    while (p_atomic_read(&bo->num_active_ioctls)) {
        sched_yield();
    }

    if (microtiled == RADEON_LAYOUT_TILED)
        args.tiling_flags |= RADEON_BO_FLAGS_MICRO_TILE;
    else if (microtiled == RADEON_LAYOUT_SQUARETILED)
        args.tiling_flags |= RADEON_BO_FLAGS_MICRO_TILE_SQUARE;

    if (macrotiled == RADEON_LAYOUT_TILED)
        args.tiling_flags |= RADEON_BO_FLAGS_MACRO_TILE;

    args.handle = bo->handle;
    args.pitch = pitch;

    drmCommandWriteRead(bo->rws->fd,
                        DRM_RADEON_GEM_SET_TILING,
                        &args,
                        sizeof(args));
}
Example #30
static void radeon_drm_cs_flush(struct radeon_winsys_cs *rcs, unsigned flags, uint32_t cs_trace_id)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_cs_context *tmp;

    switch (cs->base.ring_type) {
    case RING_DMA:
        /* pad DMA ring to 8 DWs */
        if (cs->ws->info.chip_class <= SI) {
            while (rcs->cdw & 7)
                OUT_CS(&cs->base, 0xf0000000); /* NOP packet */
        } else {
            while (rcs->cdw & 7)
                OUT_CS(&cs->base, 0x00000000); /* NOP packet */
        }
        break;
    case RING_GFX:
        /* Pad the GFX ring to 8 DWs to meet CP fetch alignment requirements.
         * r6xx requires at least 4 DW alignment to avoid a hw bug.
         */
        if (flags & RADEON_FLUSH_COMPUTE) {
            if (cs->ws->info.chip_class <= SI) {
                while (rcs->cdw & 7)
                    OUT_CS(&cs->base, 0x80000000); /* type2 nop packet */
            } else {
                while (rcs->cdw & 7)
                    OUT_CS(&cs->base, 0xffff1000); /* type3 nop packet */
            }
        } else {
            while (rcs->cdw & 7)
                OUT_CS(&cs->base, 0x80000000); /* type2 nop packet */
        }
        break;
    }

    if (rcs->cdw > RADEON_MAX_CMDBUF_DWORDS) {
        fprintf(stderr, "radeon: command stream overflowed\n");
    }

    radeon_drm_cs_sync_flush(rcs);

    /* Flip command streams. */
    tmp = cs->csc;
    cs->csc = cs->cst;
    cs->cst = tmp;

    cs->cst->cs_trace_id = cs_trace_id;

    /* If the CS is not empty and has not overflowed, emit it, possibly in a separate thread. */
    if (cs->base.cdw && cs->base.cdw <= RADEON_MAX_CMDBUF_DWORDS && !debug_get_option_noop()) {
        unsigned i, crelocs = cs->cst->crelocs;

        cs->cst->chunks[0].length_dw = cs->base.cdw;

        for (i = 0; i < crelocs; i++) {
            /* Update the number of active asynchronous CS ioctls for the buffer. */
            p_atomic_inc(&cs->cst->relocs_bo[i]->num_active_ioctls);
        }

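        /* flags[0] carries the CS flag bits, flags[1] selects the target ring. */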
        switch (cs->base.ring_type) {
        case RING_DMA:
            cs->cst->flags[0] = 0;
            cs->cst->flags[1] = RADEON_CS_RING_DMA;
            cs->cst->cs.num_chunks = 3;
            if (cs->ws->info.r600_virtual_address) {
                cs->cst->flags[0] |= RADEON_CS_USE_VM;
            }
            break;

        case RING_UVD:
            cs->cst->flags[0] = 0;
            cs->cst->flags[1] = RADEON_CS_RING_UVD;
            cs->cst->cs.num_chunks = 3;
            break;

        default:
        case RING_GFX:
            cs->cst->flags[0] = 0;
            cs->cst->flags[1] = RADEON_CS_RING_GFX;
            cs->cst->cs.num_chunks = 2;
            if (flags & RADEON_FLUSH_KEEP_TILING_FLAGS) {
                cs->cst->flags[0] |= RADEON_CS_KEEP_TILING_FLAGS;
                cs->cst->cs.num_chunks = 3;
            }
            if (cs->ws->info.r600_virtual_address) {
                cs->cst->flags[0] |= RADEON_CS_USE_VM;
                cs->cst->cs.num_chunks = 3;
            }
            if (flags & RADEON_FLUSH_END_OF_FRAME) {
                cs->cst->flags[0] |= RADEON_CS_END_OF_FRAME;
                cs->cst->cs.num_chunks = 3;
            }
            if (flags & RADEON_FLUSH_COMPUTE) {
                cs->cst->flags[1] = RADEON_CS_RING_COMPUTE;
                cs->cst->cs.num_chunks = 3;
            }
            break;
        }

        if (cs->ws->thread && (flags & RADEON_FLUSH_ASYNC)) {
            cs->flush_started = 1;
            radeon_drm_ws_queue_cs(cs->ws, cs);
        } else {
            pipe_mutex_lock(cs->ws->cs_stack_lock);
            if (cs->ws->thread) {
                while (p_atomic_read(&cs->ws->ncs)) {
                    pipe_condvar_wait(cs->ws->cs_queue_empty, cs->ws->cs_stack_lock);
                }
            }
            pipe_mutex_unlock(cs->ws->cs_stack_lock);
            radeon_drm_cs_emit_ioctl_oneshot(cs, cs->cst);
        }
    } else {
        radeon_cs_context_cleanup(cs->cst);
    }

    /* Prepare a new CS. */
    cs->base.buf = cs->csc->buf;
    cs->base.cdw = 0;
}