int nvc0_screen_compute_setup(struct nvc0_screen *screen, struct nouveau_pushbuf *push) { struct nouveau_object *chan = screen->base.channel; struct nouveau_device *dev = screen->base.device; uint32_t obj_class; int ret; int i; switch (dev->chipset & ~0xf) { case 0xc0: if (dev->chipset == 0xc8) obj_class = NVC8_COMPUTE_CLASS; else obj_class = NVC0_COMPUTE_CLASS; break; case 0xd0: obj_class = NVC0_COMPUTE_CLASS; break; default: NOUVEAU_ERR("unsupported chipset: NV%02x\n", dev->chipset); return -1; } ret = nouveau_object_new(chan, 0xbeef90c0, obj_class, NULL, 0, &screen->compute); if (ret) { NOUVEAU_ERR("Failed to allocate compute object: %d\n", ret); return ret; } ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 0, 1 << 12, NULL, &screen->parm); if (ret) return ret; BEGIN_NVC0(push, SUBC_COMPUTE(NV01_SUBCHAN_OBJECT), 1); PUSH_DATA (push, screen->compute->oclass); /* hardware limit */ BEGIN_NVC0(push, NVC0_COMPUTE(MP_LIMIT), 1); PUSH_DATA (push, screen->mp_count); BEGIN_NVC0(push, NVC0_COMPUTE(CALL_LIMIT_LOG), 1); PUSH_DATA (push, 0xf); BEGIN_NVC0(push, SUBC_COMPUTE(0x02a0), 1); PUSH_DATA (push, 0x8000); /* global memory setup */ BEGIN_NVC0(push, SUBC_COMPUTE(0x02c4), 1); PUSH_DATA (push, 0); BEGIN_NIC0(push, NVC0_COMPUTE(GLOBAL_BASE), 0x100); for (i = 0; i <= 0xff; i++) PUSH_DATA (push, (0xc << 28) | (i << 16) | i); BEGIN_NVC0(push, SUBC_COMPUTE(0x02c4), 1); PUSH_DATA (push, 1); /* local memory and cstack setup */ BEGIN_NVC0(push, NVC0_COMPUTE(TEMP_ADDRESS_HIGH), 2); PUSH_DATAh(push, screen->tls->offset); PUSH_DATA (push, screen->tls->offset); BEGIN_NVC0(push, NVC0_COMPUTE(TEMP_SIZE_HIGH), 2); PUSH_DATAh(push, screen->tls->size); PUSH_DATA (push, screen->tls->size); BEGIN_NVC0(push, NVC0_COMPUTE(WARP_TEMP_ALLOC), 1); PUSH_DATA (push, 0); BEGIN_NVC0(push, NVC0_COMPUTE(LOCAL_BASE), 1); PUSH_DATA (push, 1 << 24); /* shared memory setup */ BEGIN_NVC0(push, NVC0_COMPUTE(CACHE_SPLIT), 1); PUSH_DATA (push, NVC0_COMPUTE_CACHE_SPLIT_48K_SHARED_16K_L1); BEGIN_NVC0(push, NVC0_COMPUTE(SHARED_BASE), 1); PUSH_DATA (push, 2 << 24); BEGIN_NVC0(push, NVC0_COMPUTE(SHARED_SIZE), 1); PUSH_DATA (push, 0); /* code segment setup */ BEGIN_NVC0(push, NVC0_COMPUTE(CODE_ADDRESS_HIGH), 2); PUSH_DATAh(push, screen->text->offset); PUSH_DATA (push, screen->text->offset); /* bind parameters buffer */ BEGIN_NVC0(push, NVC0_COMPUTE(CB_SIZE), 3); PUSH_DATA (push, screen->parm->size); PUSH_DATAh(push, screen->parm->offset); PUSH_DATA (push, screen->parm->offset); BEGIN_NVC0(push, NVC0_COMPUTE(CB_BIND), 1); PUSH_DATA (push, (0 << 8) | 1); /* TODO: textures & samplers */ return 0; }
void nvc0_launch_grid(struct pipe_context *pipe, const uint *block_layout, const uint *grid_layout, uint32_t label, const void *input) { struct nvc0_context *nvc0 = nvc0_context(pipe); struct nouveau_pushbuf *push = nvc0->base.pushbuf; struct nvc0_program *cp = nvc0->compprog; unsigned s, i; int ret; ret = !nvc0_compute_state_validate(nvc0); if (ret) goto out; nvc0_compute_upload_input(nvc0, input); BEGIN_NVC0(push, NVC0_COMPUTE(CP_START_ID), 1); PUSH_DATA (push, nvc0_program_symbol_offset(cp, label)); BEGIN_NVC0(push, NVC0_COMPUTE(LOCAL_POS_ALLOC), 3); PUSH_DATA (push, align(cp->cp.lmem_size, 0x10)); PUSH_DATA (push, 0); PUSH_DATA (push, 0x800); /* WARP_CSTACK_SIZE */ BEGIN_NVC0(push, NVC0_COMPUTE(SHARED_SIZE), 3); PUSH_DATA (push, align(cp->cp.smem_size, 0x100)); PUSH_DATA (push, block_layout[0] * block_layout[1] * block_layout[2]); PUSH_DATA (push, cp->num_barriers); BEGIN_NVC0(push, NVC0_COMPUTE(CP_GPR_ALLOC), 1); PUSH_DATA (push, cp->num_gprs); /* grid/block setup */ BEGIN_NVC0(push, NVC0_COMPUTE(GRIDDIM_YX), 2); PUSH_DATA (push, (grid_layout[1] << 16) | grid_layout[0]); PUSH_DATA (push, grid_layout[2]); BEGIN_NVC0(push, NVC0_COMPUTE(BLOCKDIM_YX), 2); PUSH_DATA (push, (block_layout[1] << 16) | block_layout[0]); PUSH_DATA (push, block_layout[2]); /* launch preliminary setup */ BEGIN_NVC0(push, NVC0_COMPUTE(GRIDID), 1); PUSH_DATA (push, 0x1); BEGIN_NVC0(push, SUBC_COMPUTE(0x036c), 1); PUSH_DATA (push, 0); BEGIN_NVC0(push, NVC0_COMPUTE(FLUSH), 1); PUSH_DATA (push, NVC0_COMPUTE_FLUSH_GLOBAL | NVC0_COMPUTE_FLUSH_UNK8); /* kernel launching */ BEGIN_NVC0(push, NVC0_COMPUTE(COMPUTE_BEGIN), 1); PUSH_DATA (push, 0); BEGIN_NVC0(push, SUBC_COMPUTE(0x0a08), 1); PUSH_DATA (push, 0); BEGIN_NVC0(push, NVC0_COMPUTE(LAUNCH), 1); PUSH_DATA (push, 0x1000); BEGIN_NVC0(push, NVC0_COMPUTE(COMPUTE_END), 1); PUSH_DATA (push, 0); BEGIN_NVC0(push, SUBC_COMPUTE(0x0360), 1); PUSH_DATA (push, 0x1); /* rebind all the 3D constant buffers * (looks like binding a CB on COMPUTE clobbers 3D state) */ nvc0->dirty |= NVC0_NEW_CONSTBUF; for (s = 0; s < 6; s++) { for (i = 0; i < NVC0_MAX_PIPE_CONSTBUFS; i++) if (nvc0->constbuf[s][i].u.buf) nvc0->constbuf_dirty[s] |= 1 << i; } memset(nvc0->state.uniform_buffer_bound, 0, sizeof(nvc0->state.uniform_buffer_bound)); out: if (ret) NOUVEAU_ERR("Failed to launch grid !\n"); }
static void nv50_hw_sm_end_query(struct nv50_context *nv50, struct nv50_hw_query *hq) { struct nv50_screen *screen = nv50->screen; struct pipe_context *pipe = &nv50->base.pipe; struct nouveau_pushbuf *push = nv50->base.pushbuf; struct nv50_hw_sm_query *hsq = nv50_hw_sm_query(hq); struct pipe_grid_info info = {}; uint32_t mask; uint32_t input[3]; const uint block[3] = { 32, 1, 1 }; const uint grid[3] = { screen->MPsInTP, screen->TPs, 1 }; int c, i; if (unlikely(!screen->pm.prog)) { struct nv50_program *prog = CALLOC_STRUCT(nv50_program); prog->type = PIPE_SHADER_COMPUTE; prog->translated = true; prog->max_gpr = 7; prog->parm_size = 8; prog->code = (uint32_t *)nv50_read_hw_sm_counters_code; prog->code_size = sizeof(nv50_read_hw_sm_counters_code); screen->pm.prog = prog; } /* disable all counting */ PUSH_SPACE(push, 8); for (c = 0; c < 4; c++) { if (screen->pm.mp_counter[c]) { BEGIN_NV04(push, NV50_COMPUTE(MP_PM_CONTROL(c)), 1); PUSH_DATA (push, 0); } } /* release counters for this query */ for (c = 0; c < 4; c++) { if (screen->pm.mp_counter[c] == hsq) { screen->pm.num_hw_sm_active--; screen->pm.mp_counter[c] = NULL; } } BCTX_REFN_bo(nv50->bufctx_cp, CP_QUERY, NOUVEAU_BO_GART | NOUVEAU_BO_WR, hq->bo); PUSH_SPACE(push, 2); BEGIN_NV04(push, SUBC_COMPUTE(NV50_GRAPH_SERIALIZE), 1); PUSH_DATA (push, 0); pipe->bind_compute_state(pipe, screen->pm.prog); input[0] = hq->bo->offset + hq->base_offset; input[1] = hq->sequence; for (i = 0; i < 3; i++) { info.block[i] = block[i]; info.grid[i] = grid[i]; } info.pc = 0; info.input = input; pipe->launch_grid(pipe, &info); nouveau_bufctx_reset(nv50->bufctx_cp, NV50_BIND_CP_QUERY); /* re-active other counters */ PUSH_SPACE(push, 8); mask = 0; for (c = 0; c < 4; c++) { const struct nv50_hw_sm_query_cfg *cfg; unsigned i; hsq = screen->pm.mp_counter[c]; if (!hsq) continue; cfg = nv50_hw_sm_query_get_cfg(nv50, &hsq->base); for (i = 0; i < cfg->num_counters; i++) { uint16_t func; if (mask & (1 << hsq->ctr[i])) break; mask |= 1 << hsq->ctr[i]; func = nv50_hw_sm_get_func(hsq->ctr[i]); BEGIN_NV04(push, NV50_COMPUTE(MP_PM_CONTROL(hsq->ctr[i])), 1); PUSH_DATA (push, (cfg->ctr[i].sig << 24) | (func << 8) | cfg->ctr[i].unit | cfg->ctr[i].mode); } } }
int nv50_screen_compute_setup(struct nv50_screen *screen, struct nouveau_pushbuf *push) { struct nouveau_device *dev = screen->base.device; struct nouveau_object *chan = screen->base.channel; struct nv04_fifo *fifo = (struct nv04_fifo *)chan->data; unsigned obj_class; int i, ret; switch (dev->chipset & 0xf0) { case 0x50: case 0x80: case 0x90: obj_class = NV50_COMPUTE_CLASS; break; case 0xa0: switch (dev->chipset) { case 0xa3: case 0xa5: case 0xa8: obj_class = NVA3_COMPUTE_CLASS; break; default: obj_class = NV50_COMPUTE_CLASS; break; } break; default: NOUVEAU_ERR("unsupported chipset: NV%02x\n", dev->chipset); return -1; } ret = nouveau_object_new(chan, 0xbeef50c0, obj_class, NULL, 0, &screen->compute); if (ret) return ret; BEGIN_NV04(push, SUBC_COMPUTE(NV01_SUBCHAN_OBJECT), 1); PUSH_DATA (push, screen->compute->handle); BEGIN_NV04(push, NV50_COMPUTE(UNK02A0), 1); PUSH_DATA (push, 1); BEGIN_NV04(push, NV50_COMPUTE(DMA_STACK), 1); PUSH_DATA (push, fifo->vram); BEGIN_NV04(push, NV50_COMPUTE(STACK_ADDRESS_HIGH), 2); PUSH_DATAh(push, screen->stack_bo->offset); PUSH_DATA (push, screen->stack_bo->offset); BEGIN_NV04(push, NV50_COMPUTE(STACK_SIZE_LOG), 1); PUSH_DATA (push, 4); BEGIN_NV04(push, NV50_COMPUTE(UNK0290), 1); PUSH_DATA (push, 1); BEGIN_NV04(push, NV50_COMPUTE(LANES32_ENABLE), 1); PUSH_DATA (push, 1); BEGIN_NV04(push, NV50_COMPUTE(REG_MODE), 1); PUSH_DATA (push, NV50_COMPUTE_REG_MODE_STRIPED); BEGIN_NV04(push, NV50_COMPUTE(UNK0384), 1); PUSH_DATA (push, 0x100); BEGIN_NV04(push, NV50_COMPUTE(DMA_GLOBAL), 1); PUSH_DATA (push, fifo->vram); for (i = 0; i < 15; i++) { BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_ADDRESS_HIGH(i)), 2); PUSH_DATA (push, 0); PUSH_DATA (push, 0); BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_LIMIT(i)), 1); PUSH_DATA (push, 0); BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_MODE(i)), 1); PUSH_DATA (push, NV50_COMPUTE_GLOBAL_MODE_LINEAR); } BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_ADDRESS_HIGH(15)), 2); PUSH_DATA (push, 0); PUSH_DATA (push, 0); BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_LIMIT(15)), 1); PUSH_DATA (push, ~0); BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_MODE(15)), 1); PUSH_DATA (push, NV50_COMPUTE_GLOBAL_MODE_LINEAR); BEGIN_NV04(push, NV50_COMPUTE(LOCAL_WARPS_LOG_ALLOC), 1); PUSH_DATA (push, 7); BEGIN_NV04(push, NV50_COMPUTE(LOCAL_WARPS_NO_CLAMP), 1); PUSH_DATA (push, 1); BEGIN_NV04(push, NV50_COMPUTE(STACK_WARPS_LOG_ALLOC), 1); PUSH_DATA (push, 7); BEGIN_NV04(push, NV50_COMPUTE(STACK_WARPS_NO_CLAMP), 1); PUSH_DATA (push, 1); BEGIN_NV04(push, NV50_COMPUTE(USER_PARAM_COUNT), 1); PUSH_DATA (push, 0); BEGIN_NV04(push, NV50_COMPUTE(DMA_TEXTURE), 1); PUSH_DATA (push, fifo->vram); BEGIN_NV04(push, NV50_COMPUTE(TEX_LIMITS), 1); PUSH_DATA (push, 0x54); BEGIN_NV04(push, NV50_COMPUTE(LINKED_TSC), 1); PUSH_DATA (push, 0); BEGIN_NV04(push, NV50_COMPUTE(DMA_TIC), 1); PUSH_DATA (push, fifo->vram); BEGIN_NV04(push, NV50_COMPUTE(TIC_ADDRESS_HIGH), 3); PUSH_DATAh(push, screen->txc->offset); PUSH_DATA (push, screen->txc->offset); PUSH_DATA (push, NV50_TIC_MAX_ENTRIES - 1); BEGIN_NV04(push, NV50_COMPUTE(DMA_TSC), 1); PUSH_DATA (push, fifo->vram); BEGIN_NV04(push, NV50_COMPUTE(TSC_ADDRESS_HIGH), 3); PUSH_DATAh(push, screen->txc->offset + 65536); PUSH_DATA (push, screen->txc->offset + 65536); PUSH_DATA (push, NV50_TSC_MAX_ENTRIES - 1); BEGIN_NV04(push, NV50_COMPUTE(DMA_CODE_CB), 1); PUSH_DATA (push, fifo->vram); BEGIN_NV04(push, NV50_COMPUTE(DMA_LOCAL), 1); PUSH_DATA (push, fifo->vram); BEGIN_NV04(push, NV50_COMPUTE(LOCAL_ADDRESS_HIGH), 2); PUSH_DATAh(push, screen->tls_bo->offset + 65536); PUSH_DATA (push, screen->tls_bo->offset + 65536); BEGIN_NV04(push, NV50_COMPUTE(LOCAL_SIZE_LOG), 1); PUSH_DATA (push, util_logbase2((screen->max_tls_space / ONE_TEMP_SIZE) * 2)); return 0; }
static void nve4_mp_pm_query_end(struct nvc0_context *nvc0, struct nvc0_query *q) { struct nvc0_screen *screen = nvc0->screen; struct pipe_context *pipe = &nvc0->base.pipe; struct nouveau_pushbuf *push = nvc0->base.pushbuf; uint32_t mask; uint32_t input[3]; const uint block[3] = { 32, 4, 1 }; const uint grid[3] = { screen->mp_count, 1, 1 }; unsigned c; const struct nve4_mp_pm_query_cfg *cfg; cfg = &nve4_mp_pm_queries[q->type - PIPE_QUERY_DRIVER_SPECIFIC]; if (unlikely(!screen->pm.prog)) { struct nvc0_program *prog = CALLOC_STRUCT(nvc0_program); prog->type = PIPE_SHADER_COMPUTE; prog->translated = TRUE; prog->num_gprs = 14; prog->code = (uint32_t *)nve4_read_mp_pm_counters_code; prog->code_size = sizeof(nve4_read_mp_pm_counters_code); prog->parm_size = 12; screen->pm.prog = prog; } /* disable all counting */ PUSH_SPACE(push, 8); for (c = 0; c < 8; ++c) if (screen->pm.mp_counter[c]) IMMED_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(c)), 0); /* release counters for this query */ for (c = 0; c < 8; ++c) { if (nvc0_query(screen->pm.mp_counter[c]) == q) { screen->pm.num_mp_pm_active[c / 4]--; screen->pm.mp_counter[c] = NULL; } } BCTX_REFN_bo(nvc0->bufctx_cp, CP_QUERY, NOUVEAU_BO_GART | NOUVEAU_BO_WR, q->bo); PUSH_SPACE(push, 1); IMMED_NVC0(push, SUBC_COMPUTE(NV50_GRAPH_SERIALIZE), 0); pipe->bind_compute_state(pipe, screen->pm.prog); input[0] = (q->bo->offset + q->base); input[1] = (q->bo->offset + q->base) >> 32; input[2] = q->sequence; pipe->launch_grid(pipe, block, grid, 0, input); nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_QUERY); /* re-activate other counters */ PUSH_SPACE(push, 16); mask = 0; for (c = 0; c < 8; ++c) { unsigned i; q = nvc0_query(screen->pm.mp_counter[c]); if (!q) continue; cfg = &nve4_mp_pm_queries[q->type - PIPE_QUERY_DRIVER_SPECIFIC]; for (i = 0; i < cfg->num_counters; ++i) { if (mask & (1 << q->ctr[i])) break; mask |= 1 << q->ctr[i]; BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(q->ctr[i])), 1); PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode); } } }