static void nv50_compute_upload_input(struct nv50_context *nv50, const uint32_t *input) { struct nv50_screen *screen = nv50->screen; struct nouveau_pushbuf *push = screen->base.pushbuf; unsigned size = align(nv50->compprog->parm_size, 0x4); BEGIN_NV04(push, NV50_COMPUTE(USER_PARAM_COUNT), 1); PUSH_DATA (push, (size / 4) << 8); if (size) { struct nouveau_mm_allocation *mm; struct nouveau_bo *bo = NULL; unsigned offset; mm = nouveau_mm_allocate(screen->base.mm_GART, size, &bo, &offset); assert(mm); nouveau_bo_map(bo, 0, screen->base.client); memcpy(bo->map + offset, input, size); nouveau_bufctx_refn(nv50->bufctx, 0, bo, NOUVEAU_BO_GART | NOUVEAU_BO_RD); nouveau_pushbuf_bufctx(push, nv50->bufctx); nouveau_pushbuf_validate(push); BEGIN_NV04(push, NV50_COMPUTE(USER_PARAM(0)), size / 4); nouveau_pushbuf_data(push, bo, offset, size); nouveau_fence_work(screen->base.fence.current, nouveau_mm_free_work, mm); nouveau_bo_ref(NULL, &bo); nouveau_bufctx_reset(nv50->bufctx, 0); } }
static boolean nv50_hw_sm_begin_query(struct nv50_context *nv50, struct nv50_hw_query *hq) { struct nv50_screen *screen = nv50->screen; struct nouveau_pushbuf *push = nv50->base.pushbuf; struct nv50_hw_sm_query *hsq = nv50_hw_sm_query(hq); const struct nv50_hw_sm_query_cfg *cfg; uint16_t func; int i, c; cfg = nv50_hw_sm_query_get_cfg(nv50, hq); /* check if we have enough free counter slots */ if (screen->pm.num_hw_sm_active + cfg->num_counters > 4) { NOUVEAU_ERR("Not enough free MP counter slots !\n"); return false; } assert(cfg->num_counters <= 4); PUSH_SPACE(push, 4 * 4); /* set sequence field to 0 (used to check if result is available) */ for (i = 0; i < screen->MPsInTP; ++i) { const unsigned b = (0x14 / 4) * i; hq->data[b + 16] = 0; } hq->sequence++; for (i = 0; i < cfg->num_counters; i++) { screen->pm.num_hw_sm_active++; /* find free counter slots */ for (c = 0; c < 4; ++c) { if (!screen->pm.mp_counter[c]) { hsq->ctr[i] = c; screen->pm.mp_counter[c] = hsq; break; } } /* select func to aggregate counters */ func = nv50_hw_sm_get_func(c); /* configure and reset the counter(s) */ BEGIN_NV04(push, NV50_COMPUTE(MP_PM_CONTROL(c)), 1); PUSH_DATA (push, (cfg->ctr[i].sig << 24) | (func << 8) | cfg->ctr[i].unit | cfg->ctr[i].mode); BEGIN_NV04(push, NV50_COMPUTE(MP_PM_SET(c)), 1); PUSH_DATA (push, 0); } return true; }
static bool nv50_compute_validate_program(struct nv50_context *nv50) { struct nv50_program *prog = nv50->compprog; if (prog->mem) return true; if (!prog->translated) { prog->translated = nv50_program_translate( prog, nv50->screen->base.device->chipset, &nv50->base.debug); if (!prog->translated) return false; } if (unlikely(!prog->code_size)) return false; if (likely(prog->code_size)) { if (nv50_program_upload_code(nv50, prog)) { struct nouveau_pushbuf *push = nv50->base.pushbuf; BEGIN_NV04(push, NV50_COMPUTE(CODE_CB_FLUSH), 1); PUSH_DATA (push, 0); return true; } } return false; }
void nv50_launch_grid(struct pipe_context *pipe, const uint *block_layout, const uint *grid_layout, uint32_t label, const void *input) { struct nv50_context *nv50 = nv50_context(pipe); struct nouveau_pushbuf *push = nv50->base.pushbuf; unsigned block_size = block_layout[0] * block_layout[1] * block_layout[2]; struct nv50_program *cp = nv50->compprog; bool ret; ret = !nv50_compute_state_validate(nv50); if (ret) { NOUVEAU_ERR("Failed to launch grid !\n"); return; } nv50_compute_upload_input(nv50, input); BEGIN_NV04(push, NV50_COMPUTE(CP_START_ID), 1); PUSH_DATA (push, nv50_compute_find_symbol(nv50, label)); BEGIN_NV04(push, NV50_COMPUTE(SHARED_SIZE), 1); PUSH_DATA (push, align(cp->cp.smem_size + cp->parm_size + 0x10, 0x40)); BEGIN_NV04(push, NV50_COMPUTE(CP_REG_ALLOC_TEMP), 1); PUSH_DATA (push, cp->max_gpr); /* grid/block setup */ BEGIN_NV04(push, NV50_COMPUTE(BLOCKDIM_XY), 2); PUSH_DATA (push, block_layout[1] << 16 | block_layout[0]); PUSH_DATA (push, block_layout[2]); BEGIN_NV04(push, NV50_COMPUTE(BLOCK_ALLOC), 1); PUSH_DATA (push, 1 << 16 | block_size); BEGIN_NV04(push, NV50_COMPUTE(BLOCKDIM_LATCH), 1); PUSH_DATA (push, 1); BEGIN_NV04(push, NV50_COMPUTE(GRIDDIM), 1); PUSH_DATA (push, grid_layout[1] << 16 | grid_layout[0]); BEGIN_NV04(push, NV50_COMPUTE(GRIDID), 1); PUSH_DATA (push, 1); /* kernel launching */ BEGIN_NV04(push, NV50_COMPUTE(LAUNCH), 1); PUSH_DATA (push, 0); BEGIN_NV04(push, SUBC_COMPUTE(NV50_GRAPH_SERIALIZE), 1); PUSH_DATA (push, 0); /* bind a compute shader clobbers fragment shader state */ nv50->dirty |= NV50_NEW_FRAGPROG; }
int nv50_screen_compute_setup(struct nv50_screen *screen, struct nouveau_pushbuf *push) { struct nouveau_device *dev = screen->base.device; struct nouveau_object *chan = screen->base.channel; struct nv04_fifo *fifo = (struct nv04_fifo *)chan->data; unsigned obj_class; int i, ret; switch (dev->chipset & 0xf0) { case 0x50: case 0x80: case 0x90: obj_class = NV50_COMPUTE_CLASS; break; case 0xa0: switch (dev->chipset) { case 0xa3: case 0xa5: case 0xa8: obj_class = NVA3_COMPUTE_CLASS; break; default: obj_class = NV50_COMPUTE_CLASS; break; } break; default: NOUVEAU_ERR("unsupported chipset: NV%02x\n", dev->chipset); return -1; } ret = nouveau_object_new(chan, 0xbeef50c0, obj_class, NULL, 0, &screen->compute); if (ret) return ret; BEGIN_NV04(push, SUBC_COMPUTE(NV01_SUBCHAN_OBJECT), 1); PUSH_DATA (push, screen->compute->handle); BEGIN_NV04(push, NV50_COMPUTE(UNK02A0), 1); PUSH_DATA (push, 1); BEGIN_NV04(push, NV50_COMPUTE(DMA_STACK), 1); PUSH_DATA (push, fifo->vram); BEGIN_NV04(push, NV50_COMPUTE(STACK_ADDRESS_HIGH), 2); PUSH_DATAh(push, screen->stack_bo->offset); PUSH_DATA (push, screen->stack_bo->offset); BEGIN_NV04(push, NV50_COMPUTE(STACK_SIZE_LOG), 1); PUSH_DATA (push, 4); BEGIN_NV04(push, NV50_COMPUTE(UNK0290), 1); PUSH_DATA (push, 1); BEGIN_NV04(push, NV50_COMPUTE(LANES32_ENABLE), 1); PUSH_DATA (push, 1); BEGIN_NV04(push, NV50_COMPUTE(REG_MODE), 1); PUSH_DATA (push, NV50_COMPUTE_REG_MODE_STRIPED); BEGIN_NV04(push, NV50_COMPUTE(UNK0384), 1); PUSH_DATA (push, 0x100); BEGIN_NV04(push, NV50_COMPUTE(DMA_GLOBAL), 1); PUSH_DATA (push, fifo->vram); for (i = 0; i < 15; i++) { BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_ADDRESS_HIGH(i)), 2); PUSH_DATA (push, 0); PUSH_DATA (push, 0); BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_LIMIT(i)), 1); PUSH_DATA (push, 0); BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_MODE(i)), 1); PUSH_DATA (push, NV50_COMPUTE_GLOBAL_MODE_LINEAR); } BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_ADDRESS_HIGH(15)), 2); PUSH_DATA (push, 0); PUSH_DATA (push, 0); BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_LIMIT(15)), 1); PUSH_DATA (push, ~0); BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_MODE(15)), 1); PUSH_DATA (push, NV50_COMPUTE_GLOBAL_MODE_LINEAR); BEGIN_NV04(push, NV50_COMPUTE(LOCAL_WARPS_LOG_ALLOC), 1); PUSH_DATA (push, 7); BEGIN_NV04(push, NV50_COMPUTE(LOCAL_WARPS_NO_CLAMP), 1); PUSH_DATA (push, 1); BEGIN_NV04(push, NV50_COMPUTE(STACK_WARPS_LOG_ALLOC), 1); PUSH_DATA (push, 7); BEGIN_NV04(push, NV50_COMPUTE(STACK_WARPS_NO_CLAMP), 1); PUSH_DATA (push, 1); BEGIN_NV04(push, NV50_COMPUTE(USER_PARAM_COUNT), 1); PUSH_DATA (push, 0); BEGIN_NV04(push, NV50_COMPUTE(DMA_TEXTURE), 1); PUSH_DATA (push, fifo->vram); BEGIN_NV04(push, NV50_COMPUTE(TEX_LIMITS), 1); PUSH_DATA (push, 0x54); BEGIN_NV04(push, NV50_COMPUTE(LINKED_TSC), 1); PUSH_DATA (push, 0); BEGIN_NV04(push, NV50_COMPUTE(DMA_TIC), 1); PUSH_DATA (push, fifo->vram); BEGIN_NV04(push, NV50_COMPUTE(TIC_ADDRESS_HIGH), 3); PUSH_DATAh(push, screen->txc->offset); PUSH_DATA (push, screen->txc->offset); PUSH_DATA (push, NV50_TIC_MAX_ENTRIES - 1); BEGIN_NV04(push, NV50_COMPUTE(DMA_TSC), 1); PUSH_DATA (push, fifo->vram); BEGIN_NV04(push, NV50_COMPUTE(TSC_ADDRESS_HIGH), 3); PUSH_DATAh(push, screen->txc->offset + 65536); PUSH_DATA (push, screen->txc->offset + 65536); PUSH_DATA (push, NV50_TSC_MAX_ENTRIES - 1); BEGIN_NV04(push, NV50_COMPUTE(DMA_CODE_CB), 1); PUSH_DATA (push, fifo->vram); BEGIN_NV04(push, NV50_COMPUTE(DMA_LOCAL), 1); PUSH_DATA (push, fifo->vram); BEGIN_NV04(push, NV50_COMPUTE(LOCAL_ADDRESS_HIGH), 2); PUSH_DATAh(push, screen->tls_bo->offset + 65536); PUSH_DATA (push, screen->tls_bo->offset + 65536); BEGIN_NV04(push, NV50_COMPUTE(LOCAL_SIZE_LOG), 1); PUSH_DATA (push, util_logbase2((screen->max_tls_space / ONE_TEMP_SIZE) * 2)); return 0; }
static void nv50_hw_sm_end_query(struct nv50_context *nv50, struct nv50_hw_query *hq) { struct nv50_screen *screen = nv50->screen; struct pipe_context *pipe = &nv50->base.pipe; struct nouveau_pushbuf *push = nv50->base.pushbuf; struct nv50_hw_sm_query *hsq = nv50_hw_sm_query(hq); struct pipe_grid_info info = {}; uint32_t mask; uint32_t input[3]; const uint block[3] = { 32, 1, 1 }; const uint grid[3] = { screen->MPsInTP, screen->TPs, 1 }; int c, i; if (unlikely(!screen->pm.prog)) { struct nv50_program *prog = CALLOC_STRUCT(nv50_program); prog->type = PIPE_SHADER_COMPUTE; prog->translated = true; prog->max_gpr = 7; prog->parm_size = 8; prog->code = (uint32_t *)nv50_read_hw_sm_counters_code; prog->code_size = sizeof(nv50_read_hw_sm_counters_code); screen->pm.prog = prog; } /* disable all counting */ PUSH_SPACE(push, 8); for (c = 0; c < 4; c++) { if (screen->pm.mp_counter[c]) { BEGIN_NV04(push, NV50_COMPUTE(MP_PM_CONTROL(c)), 1); PUSH_DATA (push, 0); } } /* release counters for this query */ for (c = 0; c < 4; c++) { if (screen->pm.mp_counter[c] == hsq) { screen->pm.num_hw_sm_active--; screen->pm.mp_counter[c] = NULL; } } BCTX_REFN_bo(nv50->bufctx_cp, CP_QUERY, NOUVEAU_BO_GART | NOUVEAU_BO_WR, hq->bo); PUSH_SPACE(push, 2); BEGIN_NV04(push, SUBC_COMPUTE(NV50_GRAPH_SERIALIZE), 1); PUSH_DATA (push, 0); pipe->bind_compute_state(pipe, screen->pm.prog); input[0] = hq->bo->offset + hq->base_offset; input[1] = hq->sequence; for (i = 0; i < 3; i++) { info.block[i] = block[i]; info.grid[i] = grid[i]; } info.pc = 0; info.input = input; pipe->launch_grid(pipe, &info); nouveau_bufctx_reset(nv50->bufctx_cp, NV50_BIND_CP_QUERY); /* re-active other counters */ PUSH_SPACE(push, 8); mask = 0; for (c = 0; c < 4; c++) { const struct nv50_hw_sm_query_cfg *cfg; unsigned i; hsq = screen->pm.mp_counter[c]; if (!hsq) continue; cfg = nv50_hw_sm_query_get_cfg(nv50, &hsq->base); for (i = 0; i < cfg->num_counters; i++) { uint16_t func; if (mask & (1 << hsq->ctr[i])) break; mask |= 1 << hsq->ctr[i]; func = nv50_hw_sm_get_func(hsq->ctr[i]); BEGIN_NV04(push, NV50_COMPUTE(MP_PM_CONTROL(hsq->ctr[i])), 1); PUSH_DATA (push, (cfg->ctr[i].sig << 24) | (func << 8) | cfg->ctr[i].unit | cfg->ctr[i].mode); } } }