Ejemplo n.º 1
0
/*
 * Queue a fence (query) write of @sequence on subchannel @subch and fire
 * the ring.
 *
 * The target address is ctx->fence.addr plus one struct gdev_nvc0_query
 * sized slot per sequence number. COMPUTE first pushes a SERIALIZE method
 * and then a 4-word query (address high/low, sequence, QUERY_GET); M2MF and
 * the PCOPY engines push a 3-word query (address high/low, sequence).
 * For any other subchannel nothing is pushed, but the ring is still fired.
 */
static void nvc0_fence_write(struct gdev_ctx *ctx, int subch, uint32_t sequence)
{
	const uint32_t slot_offset = sequence * sizeof(struct gdev_nvc0_query);
	const uint64_t query_addr = ctx->fence.addr + slot_offset;
	const int raise_intr = 0; /* 1 would request an interrupt as well. */
	int three_word_query;

	/* subchannels that take the short (address + sequence) form. */
	three_word_query = (subch == GDEV_SUBCH_NV_M2MF) ||
		(subch == GDEV_SUBCH_NV_PCOPY0);
#ifdef GDEV_NVIDIA_USE_PCOPY1
	if (subch == GDEV_SUBCH_NV_PCOPY1)
		three_word_query = 1;
#endif

	if (subch == GDEV_SUBCH_NV_COMPUTE) {
		__gdev_begin_ring_nvc0(ctx, subch, 0x110, 1);
		__gdev_out_ring(ctx, 0); /* SERIALIZE */
		__gdev_begin_ring_nvc0(ctx, subch, 0x1b00, 4);
		__gdev_out_ring(ctx, query_addr >> 32); /* QUERY_ADDRESS HIGH */
		__gdev_out_ring(ctx, query_addr); /* QUERY_ADDRESS LOW */
		__gdev_out_ring(ctx, sequence); /* QUERY_SEQUENCE */
		__gdev_out_ring(ctx, raise_intr << 20); /* QUERY_GET */
	} else if (three_word_query) {
		/* M2MF uses method 0x32c, both PCOPY engines use 0x338. */
		const int query_mthd = (subch == GDEV_SUBCH_NV_M2MF) ? 0x32c : 0x338;

		__gdev_begin_ring_nvc0(ctx, subch, query_mthd, 3);
		__gdev_out_ring(ctx, query_addr >> 32); /* QUERY_ADDRESS HIGH */
		__gdev_out_ring(ctx, query_addr); /* QUERY_ADDRESS LOW */
		__gdev_out_ring(ctx, sequence); /* QUERY_SEQUENCE / QUERY_COUNTER */
	}

	__gdev_fire_ring(ctx);
}
Ejemplo n.º 2
0
/*
 * Push one linear M2MF transfer: @lines lines of @pitch bytes each from
 * @src to @dst, followed by EXEC with the given @exec_mode word.
 * Does not fire the ring.
 */
static void nvc0_m2mf_push_linear(struct gdev_ctx *ctx, uint64_t dst, uint64_t src, uint32_t pitch, uint32_t lines, uint32_t exec_mode)
{
	__gdev_begin_ring_nvc0(ctx, GDEV_SUBCH_NV_M2MF, 0x238, 2);
	__gdev_out_ring(ctx, dst >> 32); /* OFFSET_OUT_HIGH */
	__gdev_out_ring(ctx, dst); /* OFFSET_OUT_LOW */

	__gdev_begin_ring_nvc0(ctx, GDEV_SUBCH_NV_M2MF, 0x30c, 6);
	__gdev_out_ring(ctx, src >> 32); /* OFFSET_IN_HIGH */
	__gdev_out_ring(ctx, src); /* OFFSET_IN_LOW */
	__gdev_out_ring(ctx, pitch); /* SRC_PITCH_IN */
	__gdev_out_ring(ctx, pitch); /* DST_PITCH_IN */
	__gdev_out_ring(ctx, pitch); /* LINE_LENGTH_IN */
	__gdev_out_ring(ctx, lines); /* LINE_COUNT */

	__gdev_begin_ring_nvc0(ctx, GDEV_SUBCH_NV_M2MF, 0x300, 1);
	__gdev_out_ring(ctx, exec_mode); /* EXEC */
}

/*
 * Copy @size bytes from @src_addr to @dst_addr using the M2MF subchannel.
 *
 * The copy is split into transfers of at most 2047 lines of 0x1000 bytes,
 * plus one single-line transfer for the remaining (size % 0x1000) bytes.
 * Only the last transfer pushed uses the EXEC word with the query bit
 * (0x102110); all earlier ones use 0x100110. The ring is fired once at the
 * end, even when @size is 0 and nothing was pushed.
 */
static void nvc0_memcpy_m2mf(struct gdev_ctx *ctx, uint64_t dst_addr, uint64_t src_addr, uint32_t size)
{
	const uint32_t exec_with_query = 0x102110; /* QUERY_SHORT|QUERY_YES|SRC_LINEAR|DST_LINEAR */
	const uint32_t exec_no_query = 0x100110; /* QUERY_SHORT|SRC_LINEAR|DST_LINEAR */
	const uint32_t page_bytes = 0x1000;
	const uint32_t max_lines = 2047;
	const uint32_t tail_bytes = size % page_bytes;
	uint32_t pages_left = size / page_bytes;

	while (pages_left > 0) {
		const uint32_t lines = (pages_left < max_lines) ? pages_left : max_lines;
		const uint32_t chunk_bytes = page_bytes * lines;
		/* the query is requested only on the very last transfer. */
		const int is_final = (lines == pages_left) && (tail_bytes == 0);

		nvc0_m2mf_push_linear(ctx, dst_addr, src_addr, page_bytes, lines,
				      is_final ? exec_with_query : exec_no_query);

		pages_left -= lines;
		dst_addr += chunk_bytes;
		src_addr += chunk_bytes;
	}

	if (tail_bytes != 0)
		nvc0_m2mf_push_linear(ctx, dst_addr, src_addr, tail_bytes, 1, exec_with_query);

	__gdev_fire_ring(ctx);
}
Ejemplo n.º 3
0
/*
 * Push one PCOPY0 transfer of @ycnt rows of @xcnt bytes, using @pitch as
 * both source and destination pitch, then EXEC with @mode and fire the ring.
 */
static void nvc0_pcopy0_push_and_fire(struct gdev_ctx *ctx, uint64_t dst, uint64_t src, uint32_t pitch, uint32_t xcnt, uint32_t ycnt, uint32_t mode)
{
	__gdev_begin_ring_nvc0(ctx, GDEV_SUBCH_NV_PCOPY0, 0x30c, 6);
	__gdev_out_ring(ctx, src >> 32); /* SRC_ADDRESS_HIGH */
	__gdev_out_ring(ctx, src); /* SRC_ADDRESS_LOW */
	__gdev_out_ring(ctx, dst >> 32); /* DST_ADDRESS_HIGH */
	__gdev_out_ring(ctx, dst); /* DST_ADDRESS_LOW */
	__gdev_out_ring(ctx, pitch); /* SRC_PITCH */
	__gdev_out_ring(ctx, pitch); /* DST_PITCH */

	__gdev_begin_ring_nvc0(ctx, GDEV_SUBCH_NV_PCOPY0, 0x324, 2);
	__gdev_out_ring(ctx, xcnt); /* XCNT */
	__gdev_out_ring(ctx, ycnt); /* YCNT */

	__gdev_begin_ring_nvc0(ctx, GDEV_SUBCH_NV_PCOPY0, 0x300, 1);
	__gdev_out_ring(ctx, mode); /* EXEC */

	__gdev_fire_ring(ctx);
}

/*
 * Copy @size bytes from @src_addr to @dst_addr using the PCOPY0 subchannel.
 *
 * The part of @size that is a whole multiple of the 0x8000-byte pitch is
 * issued as one 2D transfer (pitch x rows); any remaining bytes are issued
 * as a second, single-row transfer with pitch 0. The ring is fired after
 * each transfer that is issued; nothing is pushed or fired when @size is 0.
 */
static void nvc0_memcpy_pcopy0(struct gdev_ctx *ctx, uint64_t dst_addr, uint64_t src_addr, uint32_t size)
{
	const uint32_t exec_mode = 0x3110; /* QUERY_SHORT|QUERY|SRC_LINEAR|DST_LINEAR */
	const uint32_t row_bytes = 0x8000; /* make it configurable... */
	const uint32_t rows = size / row_bytes;
	const uint32_t tail_bytes = size % row_bytes;
	const uint32_t bulk_bytes = size - tail_bytes;

	if (bulk_bytes != 0)
		nvc0_pcopy0_push_and_fire(ctx, dst_addr, src_addr,
					  row_bytes, row_bytes, rows, exec_mode);

	if (tail_bytes != 0) {
		/* the tail starts right after the bulk region. */
		nvc0_pcopy0_push_and_fire(ctx, dst_addr + bulk_bytes, src_addr + bulk_bytes,
					  0, tail_bytes, 1, exec_mode);
	}
}
Ejemplo n.º 4
0
/*
 * Push the full method stream that launches kernel @k on the compute
 * subchannel of @ctx, then fire the ring.
 *
 * In order, the stream sets up: local (temp) memory, shared memory and the
 * cache split, code address and entry point, each populated constant
 * memory segment, grid/block dimensions, register/barrier allocation, and
 * finally the launch methods themselves.
 *
 * Side effect: when k->param_size / 4 is at least 8, the first eight words
 * of k->param_buf are overwritten with the shared/local bases and the
 * block/grid dimensions before the parameters are uploaded.
 *
 * Always returns 0.
 */
static int nvc0_launch(struct gdev_ctx *ctx, struct gdev_kernel *k)
{
	int x;
	uint32_t cache_split;

	/* setup cache_split so that it'll allow 3 blocks (16 warps each) per
	   SM for maximum occupancy: 3 when more than 16KB of shared memory is
	   requested, 1 otherwise. */
	cache_split = k->smem_size > 16 * 1024 ? 3 : 1;

	/* local (temp) memory setup. */
	__gdev_begin_ring_nvc0(ctx, GDEV_SUBCH_NV_COMPUTE, 0x790, 5);
	__gdev_out_ring(ctx, k->lmem_addr >> 32); /* TEMP_ADDRESS_HIGH */
	__gdev_out_ring(ctx, k->lmem_addr); /* TEMP_ADDRESS_LOW */
	/* NOTE(review): the shift below assumes lmem_size_total is a 64-bit
	   field -- confirm against struct gdev_kernel. */
	__gdev_out_ring(ctx, k->lmem_size_total >> 32); /* TEMP_SIZE_HIGH */
	__gdev_out_ring(ctx, k->lmem_size_total); /* TEMP_SIZE_LOW */
	__gdev_out_ring(ctx, k->warp_lmem_size); /* WARP_TEMP_ALLOC */

	/* local memory base. */
	__gdev_begin_ring_nvc0(ctx, GDEV_SUBCH_NV_COMPUTE, 0x77c, 1);
	__gdev_out_ring(ctx, k->lmem_base); /* LOCAL_BASE */

	/* local memory size per warp */
	__gdev_begin_ring_nvc0(ctx, GDEV_SUBCH_NV_COMPUTE, 0x204, 3);
	__gdev_out_ring(ctx, k->lmem_size); /* LOCAL_POS_ALLOC */
	__gdev_out_ring(ctx, k->lmem_size_neg); /* LOCAL_NEG_ALLOC */
	__gdev_out_ring(ctx, k->warp_stack_size); /* WARP_CSTACK_SIZE */

	/* shared memory setup. */
	__gdev_begin_ring_nvc0(ctx, GDEV_SUBCH_NV_COMPUTE, 0x308, 1);
	__gdev_out_ring(ctx, cache_split); /* CACHE_SPLIT */
	__gdev_begin_ring_nvc0(ctx, GDEV_SUBCH_NV_COMPUTE, 0x214, 1);
	__gdev_out_ring(ctx, k->smem_base); /* SHARED_BASE */
	__gdev_begin_ring_nvc0(ctx, GDEV_SUBCH_NV_COMPUTE, 0x24c, 1);
	__gdev_out_ring(ctx, k->smem_size); /* SHARED_SIZE */

	/* code flush, i.e., code needs to be uploaded in advance. */
	__gdev_begin_ring_nvc0(ctx, GDEV_SUBCH_NV_COMPUTE, 0x1698, 1);
	__gdev_out_ring(ctx, 0x0001); /* FLUSH: 0x0001 = FLUSH_CODE */

	/* code setup. */
	__gdev_begin_ring_nvc0(ctx, GDEV_SUBCH_NV_COMPUTE, 0x1608, 2);
	__gdev_out_ring(ctx, k->code_addr >> 32); /* CODE_ADDRESS_HIGH */
	__gdev_out_ring(ctx, k->code_addr); /* CODE_ADDRESS_LOW */
	__gdev_begin_ring_nvc0(ctx, GDEV_SUBCH_NV_COMPUTE, 0x3b4, 1);
	__gdev_out_ring(ctx, k->code_pc); /* CP_START_ID */

	/* constant memory setup. this is a bit tricky:
	   we set the constant memory size and address first. we next set
	   which const memory segment (cX[]) to be used via CB_BIND method.
	   CB_DATA will then send data (e.g., kernel parameters) to the offset
	   (CB_POS) from the constant memory address at cX[]. CB_DATA seem
	   to have 16 sockets, but not really sure how to use them...
	   just CB_DATA#0 (0x2390) with non-increment method works here. */
	for (x = 0; x < k->cmem_count; x++) {
		/* segments without an address or a size are not bound at all. */
		if (!k->cmem[x].addr || !k->cmem[x].size)
			continue;
		__gdev_begin_ring_nvc0(ctx, GDEV_SUBCH_NV_COMPUTE, 0x2380, 3);
		__gdev_out_ring(ctx, k->cmem[x].size); /* CB_SIZE */
		__gdev_out_ring(ctx, k->cmem[x].addr >> 32); /* CB_ADDRESS_HIGH */
		__gdev_out_ring(ctx, k->cmem[x].addr); /* CB_ADDRESS_LOW */
		__gdev_begin_ring_nvc0(ctx, GDEV_SUBCH_NV_COMPUTE, 0x1694, 1);
		__gdev_out_ring(ctx, (x << 8) | 1); /* CB_BIND */
		/* send kernel parameters to a specific constant memory space. */
		if (x == 0) {
			int i;
			int n = k->param_size / 4; /* each param is integer size. */
			/* the following is the nvcc protocol: the first eight
			   parameter words are replaced in place with the
			   memory bases and launch dimensions. */
			if (n >= 8) {
				k->param_buf[0] = k->smem_base;
				k->param_buf[1] = k->lmem_base;
				k->param_buf[2] = k->block_x;
				k->param_buf[3] = k->block_y;
				k->param_buf[4] = k->block_z;
				k->param_buf[5] = k->grid_x;
				k->param_buf[6] = k->grid_y;
				k->param_buf[7] = k->grid_z;
			}
			__gdev_begin_ring_nvc0(ctx, GDEV_SUBCH_NV_COMPUTE, 0x238c, 1);
			__gdev_out_ring(ctx, k->cmem[x].offset); /* CB_POS */
			__gdev_begin_ring_nvc0_const(ctx, GDEV_SUBCH_NV_COMPUTE, 0x2390, n);
			for (i = 0; i < n; i++) {
				__gdev_out_ring(ctx, k->param_buf[i]); /* CB_DATA#0 */
			}
		}
		/* nvcc uses c1[], but what is this? */
		else if (x == 1) {
			int i;
			/* zero the first 0x20 words of c1[]. */
			__gdev_begin_ring_nvc0(ctx, GDEV_SUBCH_NV_COMPUTE, 0x238c, 1);
			__gdev_out_ring(ctx, 0); /* CB_POS */
			__gdev_begin_ring_nvc0_const(ctx, GDEV_SUBCH_NV_COMPUTE, 0x2390, 0x20);
			for (i = 0; i < 0x20; i++) {
				__gdev_out_ring(ctx, 0); /* CB_DATA#0 */
			}
			/* then write one fixed word at position 0x100; its
			   meaning is not known from this code. */
			__gdev_begin_ring_nvc0(ctx, GDEV_SUBCH_NV_COMPUTE, 0x238c, 1);
			__gdev_out_ring(ctx, 0x100); /* CB_POS */
			__gdev_begin_ring_nvc0_const(ctx, GDEV_SUBCH_NV_COMPUTE, 0x2390, 1);
			__gdev_out_ring(ctx, 0x00fffc40); /* CB_DATA#0 */
		}
	}

	/* constant memory flush. */
	__gdev_begin_ring_nvc0(ctx, GDEV_SUBCH_NV_COMPUTE, 0x1698, 1);
	__gdev_out_ring(ctx, 0x1000); /* FLUSH: 0x1000 = FLUSH_CB */

	/* grid/block setup. */
	__gdev_begin_ring_nvc0(ctx, GDEV_SUBCH_NV_COMPUTE, 0x238, 2);
	__gdev_out_ring(ctx, (k->grid_y << 16) | k->grid_x); /* GRIDDIM_YX */
	__gdev_out_ring(ctx, k->grid_z); /* GRIDDIM_Z */
	__gdev_begin_ring_nvc0(ctx, GDEV_SUBCH_NV_COMPUTE, 0x3ac, 2);
	__gdev_out_ring(ctx, (k->block_y << 16) | k->block_x); /* BLOCKDIM_YX */
	__gdev_out_ring(ctx, k->block_z); /* BLOCKDIM_Z */
	__gdev_begin_ring_nvc0(ctx, GDEV_SUBCH_NV_COMPUTE, 0x250, 1);
	__gdev_out_ring(ctx, k->block_x * k->block_y * k->block_z); /* TH_ALLOC */

	/* barriers/registers setup. */
	__gdev_begin_ring_nvc0(ctx, GDEV_SUBCH_NV_COMPUTE, 0x2c0, 1);
	__gdev_out_ring(ctx, k->reg_count); /* CP_GPR_ALLOC */
	__gdev_begin_ring_nvc0(ctx, GDEV_SUBCH_NV_COMPUTE, 0x254, 1);
	__gdev_out_ring(ctx, k->bar_count); /* BARRIER_ALLOC */
	
	/* launch preliminary setup. */
	__gdev_begin_ring_nvc0(ctx, GDEV_SUBCH_NV_COMPUTE, 0x780, 1);
	__gdev_out_ring(ctx, k->grid_id); /* GRIDID */
	__gdev_begin_ring_nvc0(ctx, GDEV_SUBCH_NV_COMPUTE, 0x36c, 1);
	__gdev_out_ring(ctx, 0); /* ??? */
	__gdev_begin_ring_nvc0(ctx, GDEV_SUBCH_NV_COMPUTE, 0x1698, 1);
	__gdev_out_ring(ctx, 0x0110); /* FLUSH: 0x110 = FLUSH_UNK8 | FLUSH_GLOBAL */
	__gdev_begin_ring_nvc0(ctx, GDEV_SUBCH_NV_COMPUTE, 0x29c, 1);
	__gdev_out_ring(ctx, 0); /* BEGIN */
	__gdev_begin_ring_nvc0(ctx, GDEV_SUBCH_NV_COMPUTE, 0xa08, 1);
	__gdev_out_ring(ctx, 0); /* ??? */

	/* kernel launching. */
	__gdev_begin_ring_nvc0(ctx, GDEV_SUBCH_NV_COMPUTE, 0x368, 1);
	__gdev_out_ring(ctx, 0x1000 /* 0x0 */); /* LAUNCH */
	__gdev_begin_ring_nvc0(ctx, GDEV_SUBCH_NV_COMPUTE, 0xa04, 1);
	__gdev_out_ring(ctx, 0); /* END */
	__gdev_begin_ring_nvc0(ctx, GDEV_SUBCH_NV_COMPUTE, 0x360, 1);
	__gdev_out_ring(ctx, 1); /* ??? */

	/* submit everything queued above in one go. */
	__gdev_fire_ring(ctx);

#ifdef GDEV_DEBUG
	__nvc0_launch_debug_print(k);
#endif

	return 0;
}
Ejemplo n.º 5
0
/*
 * One-time setup of the command stream for context @ctx.
 *
 * Resets every fence slot, pushes 128 bytes of zeros to clean the FIFO,
 * binds the engine classes to their subchannels (PCOPY only when built
 * for the kernel), and then programs the compute defaults: MP limit as
 * reported by gdev_query(), call limit, 1x1x1 grid/block dimensions and
 * the global memory window table. The ring is fired after the FIFO clean,
 * after the subchannel binding, and once more at the end.
 */
static void nvc0_init(struct gdev_ctx *ctx)
{
	int i;
	/* NOTE(review): gdev_query()'s result is not checked below; starting
	   from zero keeps the pushed MP_LIMIT deterministic should the query
	   leave this untouched. */
	uint64_t mp_limit = 0;
	struct gdev_vas *vas = ctx->vas;
	struct gdev_device *gdev = vas->gdev;

	/* initialize the fence values. */
	for (i = 0; i < GDEV_FENCE_COUNT; i++)
		nvc0_fence_reset(ctx, i);

	/* clean the FIFO: 128 bytes worth of zero words. */
	for (i = 0; i < 128/4; i++)
		__gdev_out_ring(ctx, 0);
	__gdev_fire_ring(ctx);

	/* setup subchannels. */
	__gdev_begin_ring_nvc0(ctx, GDEV_SUBCH_NV_M2MF, 0, 1);
	__gdev_out_ring(ctx, 0x9039); /* M2MF */
	__gdev_begin_ring_nvc0(ctx, GDEV_SUBCH_NV_COMPUTE, 0, 1);
	__gdev_out_ring(ctx, 0x90c0); /* COMPUTE */

	/* enable PCOPY only when we are in the kernel atm... */
#ifdef __KERNEL__
	__gdev_begin_ring_nvc0(ctx, GDEV_SUBCH_NV_PCOPY0, 0, 1);
	__gdev_out_ring(ctx, 0x490b5); /* PCOPY0 */
#ifdef GDEV_NVIDIA_USE_PCOPY1
	__gdev_begin_ring_nvc0(ctx, GDEV_SUBCH_NV_PCOPY1, 0, 1);
	__gdev_out_ring(ctx, 0x590b8); /* PCOPY1 */
#endif
#endif
	__gdev_fire_ring(ctx);

	/* the blob places NOP at the beginning. */
	__gdev_begin_ring_nvc0(ctx, GDEV_SUBCH_NV_COMPUTE, 0x100, 1);
	__gdev_out_ring(ctx, 0); /* GRAPH_NOP */

	/* hardware limit. */
	gdev_query(gdev, GDEV_NVIDIA_QUERY_MP_COUNT, &mp_limit);
	__gdev_begin_ring_nvc0(ctx, GDEV_SUBCH_NV_COMPUTE, 0x758, 1);
	__gdev_out_ring(ctx, (uint32_t) mp_limit); /* MP_LIMIT */
	__gdev_begin_ring_nvc0(ctx, GDEV_SUBCH_NV_COMPUTE, 0xd64, 1);
	__gdev_out_ring(ctx, 0xf); /* CALL_LIMIT_LOG: hardcoded for now */

	/* grid/block initialization. the blob does the following, but not
	   really sure if they are necessary... */
	__gdev_begin_ring_nvc0(ctx, GDEV_SUBCH_NV_COMPUTE, 0x2a0, 1);
	__gdev_out_ring(ctx, 0x8000); /* ??? */
	__gdev_begin_ring_nvc0(ctx, GDEV_SUBCH_NV_COMPUTE, 0x238, 2);
	__gdev_out_ring(ctx, (1 << 16) | 1); /* GRIDDIM_YX */
	__gdev_out_ring(ctx, 1); /* GRIDDIM_Z */
	__gdev_begin_ring_nvc0(ctx, GDEV_SUBCH_NV_COMPUTE, 0x3ac, 2);
	__gdev_out_ring(ctx, (1 << 16) | 1); /* BLOCKDIM_YX */
	__gdev_out_ring(ctx, 1); /* BLOCKDIM_Z */

	/* global memory setup: 0xc << 28 = read_ok & write_ok.
	   HIGH_MASK = 0x000000ff (x << 0) and INDEX_MASK = 0x00ff0000 (x << 16).
	   this will remap high bytes of g[], to the actual global memory address.
	   e.g., if INDEX = 0xff and HIGH = 0x00, g[0xff000004] in the kernel
	   program will reference address 0x4. */
	__gdev_begin_ring_nvc0(ctx, GDEV_SUBCH_NV_COMPUTE, 0x2c4, 1);
	__gdev_out_ring(ctx, 0); /* ???: UNK2C4 <- FALSE */
	for (i = 0; i < 0xff; i++) {
		/* build the word with unsigned operands: 0xc << 28 does not
		   fit in a signed int, so shifting the plain int constant
		   would be undefined behavior. */
		const uint32_t idx = (uint32_t) i;

		__gdev_begin_ring_nvc0(ctx, GDEV_SUBCH_NV_COMPUTE, 0x2c8, 1);
		__gdev_out_ring(ctx, (0xcU << 28) | (idx << 16) | idx); /* GLOBAL_BASE */
	}
	__gdev_begin_ring_nvc0(ctx, GDEV_SUBCH_NV_COMPUTE, 0x2c4, 1);
	__gdev_out_ring(ctx, 1); /* ???: UNK2C4 <- TRUE */

#ifdef GDEV_TEXTURE_SUPPORT /* not supported now... */
	/* texture setup. hardcode samp_log2 = tex_log2 = 3... FIXME!!! */
	__gdev_begin_ring_nvc0(ctx, GDEV_SUBCH_NV_COMPUTE, 0x210, 1);
	__gdev_out_ring(ctx, 0x33); /* TEX_LIMITS */
	__gdev_begin_ring_nvc0(ctx, GDEV_SUBCH_NV_COMPUTE, 0x1234, 1);
	__gdev_out_ring(ctx, 1); /* LINKED_TSC */
	__gdev_begin_ring_nvc0(ctx, GDEV_SUBCH_NV_COMPUTE, 0x1578, 3);
	__gdev_out_ring(ctx, 0); /* TIC_ADDRESS_HIGH */
	__gdev_out_ring(ctx, 0); /* TIC_ADDRESS_LOW */
	__gdev_out_ring(ctx, 0x3ff); /* TIC_LIMIT */
	__gdev_begin_ring_nvc0(ctx, GDEV_SUBCH_NV_COMPUTE, 0x155c, 3);
	__gdev_out_ring(ctx, 0); /* TSC_ADDRESS_HIGH */
	__gdev_out_ring(ctx, 0); /* TSC_ADDRESS_LOW */
	__gdev_out_ring(ctx, 0x3ff); /* TSC_LIMIT */
#endif

	__gdev_fire_ring(ctx);
}