Ejemplo n.º 1
0
/**
 * Host-to-device memory copy (documented by the driver API as synchronous).
 *
 * Transfers ByteCount bytes starting at the host address srcHost to the
 * device address dstDevice, using the Gdev handle of the current context.
 *
 * Parameters:
 * dstDevice - Destination device pointer (must be non-zero)
 * srcHost - Source host pointer (must be non-NULL)
 * ByteCount - Size of memory copy in bytes (must be non-zero)
 *
 * Returns:
 * CUDA_SUCCESS on success,
 * CUDA_ERROR_NOT_INITIALIZED if the driver has not been initialized,
 * the error reported by cuCtxGetCurrent() if no context can be obtained,
 * CUDA_ERROR_INVALID_VALUE if any argument is zero/NULL,
 * CUDA_ERROR_UNKNOWN if the underlying transfer reports a failure.
 */
CUresult cuMemcpyHtoD_v2(CUdeviceptr dstDevice, const void *srcHost, unsigned int ByteCount)
{
	struct CUctx_st *cur_ctx;
	CUresult status;
	const uint64_t dev_addr = dstDevice;
	const uint32_t nbytes = ByteCount;

	if (!gdev_initialized)
		return CUDA_ERROR_NOT_INITIALIZED;

	/* the context lookup comes before argument validation, so a missing
	   context takes precedence over bad arguments. */
	status = cuCtxGetCurrent(&cur_ctx);
	if (status != CUDA_SUCCESS)
		return status;

	if (!srcHost)
		return CUDA_ERROR_INVALID_VALUE;
	if (!dev_addr)
		return CUDA_ERROR_INVALID_VALUE;
	if (!nbytes)
		return CUDA_ERROR_INVALID_VALUE;

	/* any non-zero result from the transfer is reported as an unknown error. */
	return gmemcpy_to_device(cur_ctx->gdev_handle, dev_addr, srcHost, nbytes)
		? CUDA_ERROR_UNKNOWN
		: CUDA_SUCCESS;
}
Ejemplo n.º 2
0
/**
 * Takes a filename fname and loads the corresponding module module into the
 * current context. The CUDA driver API does not attempt to lazily allocate 
 * the resources needed by a module; if the memory for functions and data 
 * (constant and global) needed by the module cannot be allocated, 
 * cuModuleLoad() fails. The file should be a cubin file as output by nvcc 
 * or a PTX file, either as output by nvcc or handwritten.
 *
 * On any failure after argument validation and context lookup, every
 * resource acquired so far is released and *module is set to NULL.
 *
 * Parameters:
 * module - Returned module
 * fname - Filename of module to load
 *
 * Returns:
 * CUDA_SUCCESS on success,
 * CUDA_ERROR_NOT_INITIALIZED if the driver has not been initialized,
 * CUDA_ERROR_INVALID_VALUE if module or fname is NULL,
 * the error reported by cuCtxGetCurrent() if no context can be obtained,
 * CUDA_ERROR_OUT_OF_MEMORY if a host or device allocation fails,
 * CUDA_ERROR_INVALID_SOURCE if the cubin architecture does not match the
 * device chipset family,
 * CUDA_ERROR_UNKNOWN if uploading the code to the device fails,
 * or the error reported by one of the gdev_cuda_* helper steps.
 */
CUresult cuModuleLoad(CUmodule *module, const char *fname)
{
	CUresult res;
	struct CUmod_st *mod;
	struct CUctx_st *ctx;
	void *bnc_buf;
	Ghandle handle;

	if (!gdev_initialized)
		return CUDA_ERROR_NOT_INITIALIZED;
	if (!module || !fname)
		return CUDA_ERROR_INVALID_VALUE;

	res = cuCtxGetCurrent(&ctx);
	if (res != CUDA_SUCCESS)
		return res;

	handle = ctx->gdev_handle;

	if (!(mod = MALLOC(sizeof(*mod)))) {
		GDEV_PRINT("Failed to allocate memory for module\n");
		res = CUDA_ERROR_OUT_OF_MEMORY;
		goto fail_malloc_mod;
	}

	/* load the cubin image from the given object file. */
	if ((res = gdev_cuda_load_cubin_file(mod, fname)) != CUDA_SUCCESS) {
		GDEV_PRINT("Failed to load cubin\n");
		goto fail_load_cubin;
	}

	/* check compatibility of code and device. a mismatch is tolerated
	   when the chipset family is 0xe0 or 0xf0 (TODO: fix this special
	   case). the cubin is already loaded at this point, so the failure
	   path must unload it rather than only freeing the module. */
	if ((ctx->cuda_info.chipset & 0xf0) != mod->arch &&
	    (ctx->cuda_info.chipset & 0xf0) != 0xe0 &&
	    (ctx->cuda_info.chipset & 0xf0) != 0xf0) {
		res = CUDA_ERROR_INVALID_SOURCE;
		goto fail_check_arch;
	}

	/* construct the kernels based on the cubin data. */
	if ((res = gdev_cuda_construct_kernels(mod, &ctx->cuda_info)) 
		!= CUDA_SUCCESS) {
		GDEV_PRINT("Failed to construct kernels\n");
		goto fail_construct_kernels;
	}

	/* allocate (local) static data memory. */
	if (mod->sdata_size > 0) {
		if (!(mod->sdata_addr = gmalloc(handle, mod->sdata_size))) {
			GDEV_PRINT("Failed to allocate device memory for static data\n");
			res = CUDA_ERROR_OUT_OF_MEMORY;
			goto fail_gmalloc_sdata;
		}
	}

	/* locate the static data information for each kernel. */
	if ((res = gdev_cuda_locate_sdata(mod)) != CUDA_SUCCESS) {
		GDEV_PRINT("Failed to locate static data\n");
		goto fail_locate_sdata;
	}

	/* allocate code and constant memory. res still holds CUDA_SUCCESS
	   from the previous step, so it must be set explicitly here; otherwise
	   the caller would see success together with a NULL module. */
	if (!(mod->code_addr = gmalloc(handle, mod->code_size))) {
		GDEV_PRINT("Failed to allocate device memory for code\n");
		res = CUDA_ERROR_OUT_OF_MEMORY;
		goto fail_gmalloc_code;
	}

	/* locate the code information for each kernel. */
	if ((res = gdev_cuda_locate_code(mod)) != CUDA_SUCCESS) {
		GDEV_PRINT("Failed to locate code\n");
		goto fail_locate_code;
	}

	/* the following malloc() and memcpy() for bounce buffer could be 
	   removed if we use gmalloc_host() here, but they are just an easy 
	   implementation, and don't really affect performance anyway. */
	if (!(bnc_buf = MALLOC(mod->code_size))) {
		GDEV_PRINT("Failed to allocate host memory for code\n");
		res = CUDA_ERROR_OUT_OF_MEMORY;
		goto fail_malloc_code;
	}
	memset(bnc_buf, 0, mod->code_size);

	if ((res = gdev_cuda_memcpy_code(mod, bnc_buf)) 
		!= CUDA_SUCCESS) {
		GDEV_PRINT("Failed to copy code to host\n");
		goto fail_memcpy_code;
	}

	/* transfer the code and constant memory onto the device. */
	if (gmemcpy_to_device(handle, mod->code_addr, bnc_buf, mod->code_size)) {
		GDEV_PRINT("Failed to copy code to device\n");
		res = CUDA_ERROR_UNKNOWN;
		goto fail_gmemcpy_code;
	}

	/* free the bounce buffer now. */
	FREE(bnc_buf);

	mod->ctx = ctx;
	*module = mod;

	return CUDA_SUCCESS;

	/* error unwinding: each label releases what was acquired before the
	   step that failed, then falls through to the earlier stages. */
fail_gmemcpy_code:
fail_memcpy_code:
	FREE(bnc_buf);
fail_malloc_code:
fail_locate_code:
	gfree(handle, mod->code_addr);
fail_gmalloc_code:
fail_locate_sdata:
	if (mod->sdata_size > 0)
		gfree(handle, mod->sdata_addr);
fail_gmalloc_sdata:
	gdev_cuda_destruct_kernels(mod);
fail_construct_kernels:
fail_check_arch:
	gdev_cuda_unload_cubin(mod);
fail_load_cubin:
	FREE(mod);
fail_malloc_mod:
	*module = NULL;
	return res;
}
Ejemplo n.º 3
0
/**
 * Self-test that adds two n x n matrices on the device through the Gdev API
 * and checks the result on the host.
 *
 * a and b are filled by this function (a[i*n+j] = i, b[i*n+j] = j), uploaded
 * together with the kernel image kcode, the kernel is launched, and the
 * result is read back into c. Every element of c is then compared against
 * a + b.
 *
 * Parameters:
 * a, b - host buffers of n*n elements, overwritten with the test inputs
 * c - host buffer of n*n elements receiving the device result
 * n - matrix dimension; must be positive and n*n*4 must fit in 32 bits
 *
 * Returns:
 * 0 if the device result matches, -1 on invalid arguments, on a device
 * open/allocation/upload failure, or on the first mismatching element
 * (which is also printed).
 */
int gdev_test_matrixadd(uint32_t *a, uint32_t *b, uint32_t *c, int n)
{
	int i, j, idx;
	int ret = -1;
	uint32_t id;
	uint32_t mp_count;
	uint64_t mp_query = 0;
	uint64_t mat_bytes;
	uint32_t code_size, a_size, b_size, c_size;
	uint64_t a_addr, b_addr, c_addr;

	Ghandle handle;
	struct gdev_kernel k;

	/* n == 0 would divide by zero when computing the grid, and a large n
	   would overflow the 32-bit buffer sizes used below. */
	if (!a || !b || !c || n <= 0)
		return -1;
	mat_bytes = (uint64_t)n * (uint64_t)n * sizeof(uint32_t);
	if (mat_bytes > 0xffffffffULL)
		return -1;

	/* initialize A[] & B[] */
	for (i = 0; i < n; i++) {
		for (j = 0; j < n; j++) {
			idx = i * n + j;
			a[idx] = i;
			b[idx] = j;
		}
	}

	if (!(handle = gopen(0)))
		return -1;

	a_size = (uint32_t)mat_bytes;
	b_size = (uint32_t)mat_bytes;
	c_size = (uint32_t)mat_bytes;

	if (!(a_addr = gmalloc(handle, a_size)))
		goto out_close;
	if (!(b_addr = gmalloc(handle, b_size)))
		goto out_free_a;
	if (!(c_addr = gmalloc(handle, c_size)))
		goto out_free_b;

	/* device code size is padded up to a multiple of 0x100; it must be
	   set even when sizeof(kcode) is already aligned. */
	code_size = sizeof(kcode);
	k.code_size = code_size;
	if (code_size & 0xff)
		k.code_size = (code_size + 0x100) & ~0xff;
	if (!(k.code_addr = gmalloc(handle, k.code_size)))
		goto out_free_c;
	k.code_pc = 0;

	/* constant segment 0 holds the parameters; the others are unused. */
	k.cmem[0].size = PARAM_SIZE;
	if (k.cmem[0].size == 0 || (k.cmem[0].size & 0xff))
		k.cmem[0].size = (k.cmem[0].size + 0x100) & ~0xff;
	if (!(k.cmem[0].addr = gmalloc(handle, k.cmem[0].size)))
		goto out_free_code;
	k.cmem[0].offset = 0;
	for (i = 1; i < GDEV_NVIDIA_CONST_SEGMENT_MAX_COUNT; i++) {
		k.cmem[i].addr = 0;
		k.cmem[i].size = 0;
		k.cmem[i].offset = 0;
	}
	k.cmem_count = GDEV_NVIDIA_CONST_SEGMENT_MAX_COUNT;

	/* kernel parameters: the three 64-bit device addresses split into
	   low/high 32-bit words, followed by n. */
	k.param_size = PARAM_SIZE;
	k.param_buf = c0;
	k.param_buf[NVCC_PARAM_OFFSET/4 + 0] = a_addr;
	k.param_buf[NVCC_PARAM_OFFSET/4 + 1] = a_addr >> 32;
	k.param_buf[NVCC_PARAM_OFFSET/4 + 2] = b_addr;
	k.param_buf[NVCC_PARAM_OFFSET/4 + 3] = b_addr >> 32;
	k.param_buf[NVCC_PARAM_OFFSET/4 + 4] = c_addr;
	k.param_buf[NVCC_PARAM_OFFSET/4 + 5] = c_addr >> 32;
	k.param_buf[NVCC_PARAM_OFFSET/4 + 6] = n;

	k.lmem_size = LOCAL_SIZE;
	if (k.lmem_size & 0xf)
		k.lmem_size = (k.lmem_size + 0x10) & ~0xf;
	k.lmem_size_neg = 0; /* just random */
	if (k.lmem_size_neg & 0xf)
		k.lmem_size_neg = (k.lmem_size_neg + 0x10) & ~0xf;
	k.lmem_base = 0x01000000;
	k.smem_size = SHARED_SIZE;
	if (k.smem_size & 0x7f)
		k.smem_size = (k.smem_size + 0x80) & (~0x7f);
	k.smem_base = 0x0;

	k.warp_stack_size = (STACK_DEPTH + 0x1000) & (~0xfff);

	/* FIXME: per-thread warp size may differ from 32. */
	k.warp_lmem_size = 32 * (k.lmem_size + k.lmem_size_neg) + k.warp_stack_size;

	/* FIXME: the number of active warps may differ from 48. */
	/* gquery() is handed a uint64_t pointer, so the result must land in a
	   real 64-bit object rather than a cast 32-bit one. */
	/* NOTE(review): gquery()'s return convention is not visible here; a
	   zero MP count is treated as failure instead. */
	gquery(handle, GDEV_NVIDIA_QUERY_MP_COUNT, &mp_query);
	mp_count = (uint32_t)mp_query;
	if (mp_count == 0)
		goto out_free_cmem;
	k.lmem_size_total = 48 * mp_count * k.warp_lmem_size;
	k.lmem_size_total = __round_up_pow2(k.lmem_size_total);
	if (!(k.lmem_addr = gmalloc(handle, k.lmem_size_total)))
		goto out_free_cmem;

	k.reg_count = REG_COUNT;
	k.bar_count = BARRIER_COUNT;
	k.grid_id = 1;

	/* up to 16x16 threads per block; the grid is rounded up to cover n. */
	k.block_x = n < 16 ? n : 16;
	k.block_y = n < 16 ? n : 16;
	k.block_z = 1;
	k.grid_x = n / k.block_x;
	if (n % k.block_x != 0)
		k.grid_x++;
	k.grid_y = n / k.block_y;
	if (n % k.block_y != 0)
		k.grid_y++;
	k.grid_z = 1;

	/* only sizeof(kcode) bytes exist on the host; copying the padded
	   k.code_size would read past the end of kcode. */
	if (gmemcpy_to_device(handle, k.code_addr, kcode, code_size))
		goto out_free_lmem;
	if (gmemcpy_to_device(handle, a_addr, a, a_size))
		goto out_free_lmem;
	if (gmemcpy_to_device(handle, b_addr, b, b_size))
		goto out_free_lmem;

	/* NOTE(review): the return conventions of glaunch(), gsync() and
	   gmemcpy_from_device() are not visible here, so their results are
	   left unchecked; a failed run is caught by the verification below. */
	glaunch(handle, &k, &id);
	gsync(handle, id, NULL);

	gmemcpy_from_device(handle, c, c_addr, c_size);
	ret = 0;

	/* release device resources in reverse order of acquisition; the
	   success path falls through the same labels. */
out_free_lmem:
	gfree(handle, k.lmem_addr);
out_free_cmem:
	gfree(handle, k.cmem[0].addr);
out_free_code:
	gfree(handle, k.code_addr);
out_free_c:
	gfree(handle, c_addr);
out_free_b:
	gfree(handle, b_addr);
out_free_a:
	gfree(handle, a_addr);
out_close:
	gclose(handle);

	if (ret != 0)
		return ret;

	/* verify every element; j restarts at 0 for each row. */
	for (i = 0; i < n; i++) {
		for (j = 0; j < n; j++) {
			idx = i * n + j;
			if (c[idx] != a[idx] + b[idx]) {
				printf("c[%d] = %u\n", idx, (unsigned int)c[idx]);
				printf("a[%d]+b[%d] = %u\n", idx, idx,
				       (unsigned int)(a[idx] + b[idx]));
				return -1;
			}
		}
	}

	return 0;
}