/**
 * Copies from host memory to device memory. dstDevice and srcHost are the
 * base addresses of the destination and source, respectively. ByteCount
 * specifies the number of bytes to copy. Note that this function is
 * synchronous.
 *
 * Parameters:
 * dstDevice - Destination device pointer
 * srcHost - Source host pointer
 * ByteCount - Size of memory copy in bytes
 *
 * Returns:
 * CUDA_SUCCESS, CUDA_ERROR_DEINITIALIZED, CUDA_ERROR_NOT_INITIALIZED,
 * CUDA_ERROR_INVALID_CONTEXT, CUDA_ERROR_INVALID_VALUE
 */
CUresult cuMemcpyHtoD_v2(CUdeviceptr dstDevice, const void *srcHost, unsigned int ByteCount)
{
	struct CUctx_st *ctx;
	CUresult res;

	if (!gdev_initialized)
		return CUDA_ERROR_NOT_INITIALIZED;

	if ((res = cuCtxGetCurrent(&ctx)) != CUDA_SUCCESS)
		return res;

	/* reject NULL pointers and zero-length copies up front. */
	if (!srcHost || !dstDevice || !ByteCount)
		return CUDA_ERROR_INVALID_VALUE;

	/* synchronous copy through the Gdev API; non-zero means failure. */
	if (gmemcpy_to_device(ctx->gdev_handle, (uint64_t)dstDevice,
						  srcHost, (uint32_t)ByteCount))
		return CUDA_ERROR_UNKNOWN;

	return CUDA_SUCCESS;
}
/**
 * Takes a filename fname and loads the corresponding module module into the
 * current context. The CUDA driver API does not attempt to lazily allocate
 * the resources needed by a module; if the memory for functions and data
 * (constant and global) needed by the module cannot be allocated,
 * cuModuleLoad() fails. The file should be a cubin file as output by nvcc
 * or a PTX file, either as output by nvcc or handwritten.
 *
 * Parameters:
 * module - Returned module
 * fname - Filename of module to load
 *
 * Returns:
 * CUDA_SUCCESS, CUDA_ERROR_DEINITIALIZED, CUDA_ERROR_NOT_INITIALIZED,
 * CUDA_ERROR_INVALID_CONTEXT, CUDA_ERROR_INVALID_VALUE, CUDA_ERROR_NOT_FOUND,
 * CUDA_ERROR_OUT_OF_MEMORY, CUDA_ERROR_FILE_NOT_FOUND
 */
CUresult cuModuleLoad(CUmodule *module, const char *fname)
{
	CUresult res;
	struct CUmod_st *mod;
	struct CUctx_st *ctx;
	void *bnc_buf;
	Ghandle handle;

	if (!gdev_initialized)
		return CUDA_ERROR_NOT_INITIALIZED;
	if (!module || !fname)
		return CUDA_ERROR_INVALID_VALUE;

	res = cuCtxGetCurrent(&ctx);
	if (res != CUDA_SUCCESS)
		return res;

	handle = ctx->gdev_handle;

	if (!(mod = MALLOC(sizeof(*mod)))) {
		GDEV_PRINT("Failed to allocate memory for module\n");
		res = CUDA_ERROR_OUT_OF_MEMORY;
		goto fail_malloc_mod;
	}

	/* load the cubin image from the given object file. */
	if ((res = gdev_cuda_load_cubin_file(mod, fname)) != CUDA_SUCCESS) {
		GDEV_PRINT("Failed to load cubin\n");
		goto fail_load_cubin;
	}

	/* check compatibility of code and device. */
	if ((ctx->cuda_info.chipset & 0xf0) != mod->arch) {
		if ((ctx->cuda_info.chipset & 0xf0) != 0xe0 &&
			(ctx->cuda_info.chipset & 0xf0) != 0xf0) { /* fix this */
			res = CUDA_ERROR_INVALID_SOURCE;
			goto fail_load_cubin;
		}
	}

	/* construct the kernels based on the cubin data. */
	if ((res = gdev_cuda_construct_kernels(mod, &ctx->cuda_info))
		!= CUDA_SUCCESS) {
		GDEV_PRINT("Failed to construct kernels\n");
		goto fail_construct_kernels;
	}

	/* allocate (local) static data memory. */
	if (mod->sdata_size > 0) {
		if (!(mod->sdata_addr = gmalloc(handle, mod->sdata_size))) {
			GDEV_PRINT("Failed to allocate device memory for static data\n");
			res = CUDA_ERROR_OUT_OF_MEMORY;
			goto fail_gmalloc_sdata;
		}
	}

	/* locate the static data information for each kernel. */
	if ((res = gdev_cuda_locate_sdata(mod)) != CUDA_SUCCESS) {
		GDEV_PRINT("Failed to locate static data\n");
		goto fail_locate_sdata;
	}

	/* allocate code and constant memory. */
	if (!(mod->code_addr = gmalloc(handle, mod->code_size))) {
		GDEV_PRINT("Failed to allocate device memory for code\n");
		/* BUGFIX: res was previously left at CUDA_SUCCESS here, so a
		   failed allocation reported success to the caller. */
		res = CUDA_ERROR_OUT_OF_MEMORY;
		goto fail_gmalloc_code;
	}

	/* locate the code information for each kernel. */
	if ((res = gdev_cuda_locate_code(mod)) != CUDA_SUCCESS) {
		GDEV_PRINT("Failed to locate code\n");
		goto fail_locate_code;
	}

	/* the following malloc() and memcpy() for bounce buffer could be
	   removed if we use gmalloc_host() here, but they are just an easy
	   implementation, and don't really affect performance anyway. */
	if (!(bnc_buf = MALLOC(mod->code_size))) {
		GDEV_PRINT("Failed to allocate host memory for code\n");
		res = CUDA_ERROR_OUT_OF_MEMORY;
		goto fail_malloc_code;
	}
	memset(bnc_buf, 0, mod->code_size);

	if ((res = gdev_cuda_memcpy_code(mod, bnc_buf)) != CUDA_SUCCESS) {
		GDEV_PRINT("Failed to copy code to host\n");
		goto fail_memcpy_code;
	}

	/* transfer the code and constant memory onto the device. */
	if (gmemcpy_to_device(handle, mod->code_addr, bnc_buf, mod->code_size)) {
		GDEV_PRINT("Failed to copy code to device\n");
		res = CUDA_ERROR_UNKNOWN;
		goto fail_gmemcpy_code;
	}

	/* free the bounce buffer now. */
	FREE(bnc_buf);

	mod->ctx = ctx;
	*module = mod;

	return CUDA_SUCCESS;

	/* unwind in reverse order of construction; labels fall through so each
	   failure point releases everything acquired before it. */
fail_gmemcpy_code:
fail_memcpy_code:
	FREE(bnc_buf);
fail_malloc_code:
fail_locate_code:
	gfree(handle, mod->code_addr);
fail_gmalloc_code:
fail_locate_sdata:
	if (mod->sdata_size > 0)
		gfree(handle, mod->sdata_addr);
fail_gmalloc_sdata:
	gdev_cuda_destruct_kernels(mod);
fail_construct_kernels:
	gdev_cuda_unload_cubin(mod);
fail_load_cubin:
	FREE(mod);
fail_malloc_mod:
	*module = NULL;
	return res;
}
int gdev_test_matrixadd(uint32_t *a, uint32_t *b, uint32_t *c, int n) { int i, j, idx; uint32_t id; uint32_t mp_count; uint32_t code_size, a_size, b_size, c_size; uint32_t param_buf[PARAM_SIZE]; uint64_t a_addr, b_addr, c_addr; uint64_t result[3]; Ghandle handle; struct gdev_kernel k; /* initialize A[] & B[] */ for (i = 0; i < n; i++) { for(j = 0; j < n; j++) { idx = i * n + j; a[idx] = i; b[idx] = j; } } if (!(handle = gopen(0))) { return -1; } a_size = n * n * sizeof(uint32_t); b_size = n * n * sizeof(uint32_t); c_size = n * n * sizeof(uint32_t); if (!(a_addr = gmalloc(handle, a_size))) return -1; if (!(b_addr = gmalloc(handle, b_size))) return -1; if (!(c_addr = gmalloc(handle, c_size))) return -1; code_size = sizeof(kcode); if (code_size & 0xff) k.code_size = (code_size + 0x100) & ~0xff; if (!(k.code_addr = gmalloc(handle, k.code_size))) return -1; k.code_pc = 0; k.cmem[0].size = PARAM_SIZE; if (k.cmem[0].size == 0 || k.cmem[0].size & 0xff) k.cmem[0].size = (k.cmem[0].size + 0x100) & ~0xff; if (!(k.cmem[0].addr = gmalloc(handle, k.cmem[0].size))) return -1; k.cmem[0].offset = 0; for (i = 1; i < GDEV_NVIDIA_CONST_SEGMENT_MAX_COUNT; i++) { k.cmem[i].addr = 0; k.cmem[i].size = 0; k.cmem[i].offset = 0; } k.cmem_count = GDEV_NVIDIA_CONST_SEGMENT_MAX_COUNT; k.param_size = PARAM_SIZE; k.param_buf = c0; k.param_buf[NVCC_PARAM_OFFSET/4 + 0] = a_addr; k.param_buf[NVCC_PARAM_OFFSET/4 + 1] = a_addr >> 32; k.param_buf[NVCC_PARAM_OFFSET/4 + 2] = b_addr; k.param_buf[NVCC_PARAM_OFFSET/4 + 3] = b_addr >> 32; k.param_buf[NVCC_PARAM_OFFSET/4 + 4] = c_addr; k.param_buf[NVCC_PARAM_OFFSET/4 + 5] = c_addr >> 32; k.param_buf[NVCC_PARAM_OFFSET/4 + 6] = n; k.lmem_size = LOCAL_SIZE; if (k.lmem_size & 0xf) k.lmem_size = (k.lmem_size + 0x10) & ~0xf; k.lmem_size_neg = 0; /* just random */ if (k.lmem_size_neg & 0xf) k.lmem_size_neg = (k.lmem_size_neg + 0x10) & ~0xf; k.lmem_base = 0x01000000; k.smem_size = SHARED_SIZE; if (k.smem_size & 0x7f) k.smem_size = (k.smem_size + 0x80) & (~0x7f); 
k.smem_base = 0x0; k.warp_stack_size = (STACK_DEPTH + 0x1000) & (~0xfff); /* FIXME: per-thread warp size may differ from 32. */ k.warp_lmem_size = 32 * (k.lmem_size + k.lmem_size_neg) + k.warp_stack_size; /* FIXME: the number of active warps may differ from 48. */ gquery(handle, GDEV_NVIDIA_QUERY_MP_COUNT, (uint64_t *)&mp_count); k.lmem_size_total = 48 * mp_count * k.warp_lmem_size; k.lmem_size_total = __round_up_pow2(k.lmem_size_total); if (!(k.lmem_addr = gmalloc(handle, k.lmem_size_total))) return -1; k.reg_count = REG_COUNT; k.bar_count = BARRIER_COUNT; k.grid_id = 1; k.block_x = n < 16 ? n : 16; k.block_y = n < 16 ? n : 16; k.block_z = 1; k.grid_x = n / k.block_x; if (n % k.block_x != 0) k.grid_x++; k.grid_y = n / k.block_y; if (n % k.block_y != 0) k.grid_y++; k.grid_z = 1; gmemcpy_to_device(handle, k.code_addr, kcode, k.code_size); gmemcpy_to_device(handle, a_addr, a, a_size); gmemcpy_to_device(handle, b_addr, b, b_size); glaunch(handle, &k, &id); gsync(handle, id, NULL); gmemcpy_from_device(handle, c, c_addr, c_size); gfree(handle, a_addr); gfree(handle, b_addr); gfree(handle, c_addr); gfree(handle, k.code_addr); gfree(handle, k.cmem[0].addr); gfree(handle, k.lmem_addr); gclose(handle); i = j = idx = 0; while (i < n) { while (j < n) { idx = i * n + j; if (c[idx] != a[idx] + b[idx]) { printf("c[%d] = %d\n", idx, c[idx]); printf("a[%d]+b[%d] = %d\n", idx, idx, a[idx]+b[idx]); return -1; } j++; } i++; } return 0; }