// Destructor: calls DeallocateResources(), then unloads the module and
// finally destroys the context.
CUDARunner::~CUDARunner()
{
    DeallocateResources();

    // The driver return codes are not inspected here.
    cuModuleUnload(m_module);
    cuCtxDestroy(m_context);
}
// Tear down the CUDA state held in this file's globals: synchronize, destroy
// the timing events, unload the module and, if we own it, destroy the context.
WEAK void halide_release() {
    // CUcontext ignore;
    // TODO: this is for timing; bad for release-mode performance
    CHECK_CALL( cuCtxSynchronize(), "cuCtxSynchronize on exit" );

    // Destroy the events
    if (__start) {
        cuEventDestroy(__start);
        cuEventDestroy(__end);
        __start = __end = 0;
    }

    // Unload the module
    if (__mod) {
        CHECK_CALL( cuModuleUnload(__mod), "cuModuleUnload" );
        __mod = 0;
    }

    // Only destroy the context if we own it. This must come last: destroying
    // a context releases the resources associated with it, so the event and
    // module handles above have to be released while the context still exists.
    if (weak_cuda_ctx) {
        CHECK_CALL( cuCtxDestroy(weak_cuda_ctx), "cuCtxDestroy on exit" );
        weak_cuda_ctx = 0;
    }

    //CHECK_CALL( cuCtxPopCurrent(&ignore), "cuCtxPopCurrent" );
}
bool GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data) { struct ptx_image_data *image, **prev_p; struct ptx_device *dev = ptx_devices[ord]; if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX) { GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin" " (expected %u, received %u)", GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version)); return false; } bool ret = true; pthread_mutex_lock (&dev->image_lock); for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next) if (image->target_data == target_data) { *prev_p = image->next; if (cuModuleUnload (image->module) != CUDA_SUCCESS) ret = false; free (image->fns); free (image); break; } pthread_mutex_unlock (&dev->image_lock); return ret; }
void clean_cuda(void) { CUresult res; for (int i = 0; i < device_num; i++) { res = cuModuleUnload(module[i]); CUDA_CHECK(res, "cuModuleUnload()"); } for (int i = 0; i < device_num; i++) { res = cuCtxDestroy(ctx[i]); CUDA_CHECK(res, "cuCtxDestroy()"); } free(NR_MAXTHREADS_X); free(NR_MAXTHREADS_Y); free(ConvolutionKernel_func); free(DistanceTransformTwoDimensionalProblem_func); free(BilinearKernelTex32F_func); free(calculateHistogram_func); free(getFeatureMaps_func); free(calculateNorm_func); free(normalizeAndTruncate_func); free(PCAFeatureMapsAddNullableBorder_func); free(module); free(dev); free(ctx); }
int main(){ init_test(); const std::string source = ".version 4.2\n" ".target sm_20\n" ".address_size 64\n" ".visible .entry kernel(.param .u64 kernel_param_0) {\n" ".reg .s32 %r<2>;\n" ".reg .s64 %rd<3>;\n" "bra BB1_2;\n" "ld.param.u64 %rd1, [kernel_param_0];\n" "cvta.to.global.u64 %rd2, %rd1;\n" "mov.u32 %r1, 5;\n" "st.global.u32 [%rd2], %r1;\n" "BB1_2: ret;\n" "}\n"; CUmodule modId = 0; CUfunction funcHandle = 0; cu_assert(cuModuleLoadData(&modId, source.c_str())); cu_assert(cuModuleGetFunction(&funcHandle, modId, "kernel")); CUdeviceptr devValue; int hostValue = 10; cu_assert(cuMemAlloc(&devValue, sizeof(int))); cu_assert(cuMemcpyHtoD(devValue, &hostValue, sizeof(hostValue))); void * params[] = {&devValue}; cu_assert(cuLaunchKernel(funcHandle, 1,1,1, 1,1,1, 0,0, params, nullptr)); cu_assert(cuMemcpyDtoH(&hostValue, devValue, sizeof(hostValue))); assert(hostValue == 10); std::cout << hostValue << "\n"; cu_assert(cuMemFree(devValue)); cu_assert(cuModuleUnload(modId)); return 0; }
// Tear down the CUDA state held in this file's globals: synchronize, destroy
// the timing events, unload the module and, if we own it, destroy the context.
WEAK void halide_release() {
    // It's possible that this is being called from the destructor of
    // a static variable, in which case the driver may already be
    // shutting down. For this reason we allow the deinitialized
    // error.
    CHECK_CALL_DEINIT_OK( cuCtxSynchronize(), "cuCtxSynchronize on exit" );

    // Destroy the events
    if (__start) {
        cuEventDestroy(__start);
        cuEventDestroy(__end);
        __start = __end = 0;
    }

    // Unload the module
    if (__mod) {
        CHECK_CALL_DEINIT_OK( cuModuleUnload(__mod), "cuModuleUnload" );
        __mod = 0;
    }

    // Only destroy the context if we own it. This must come last: destroying
    // a context releases the resources associated with it, so the event and
    // module handles above have to be released while the context still exists.
    if (weak_cuda_ctx) {
        CHECK_CALL_DEINIT_OK( cuCtxDestroy(weak_cuda_ctx), "cuCtxDestroy on exit" );
        weak_cuda_ctx = 0;
    }

    //CHECK_CALL( cuCtxPopCurrent(&ignore), "cuCtxPopCurrent" );
}
// Calls Reset(), then unloads the module if one is held and clears the handle.
void KernelRT::Release()
{
    Reset();

    if (!hModule)
        return;

    // The unload result is not inspected.
    cuModuleUnload(hModule);
    hModule = NULL;
}
// Release the events, the module and (when owned) the context, then clear
// cuda_ctx_ptr. Does nothing if cuda_ctx_ptr is NULL.
WEAK void halide_release(void *user_context) {
    // Do not do any of this if there is not context set. E.g.
    // if halide_release is called and no CUDA calls have been made.
    if (cuda_ctx_ptr == NULL) {
        return;
    }

    // It's possible that this is being called from the destructor of
    // a static variable, in which case the driver may already be
    // shutting down. For this reason we allow the deinitialized
    // error.
    CHECK_CALL_DEINIT_OK( cuCtxSynchronize(), "cuCtxSynchronize on exit" );

    // Destroy the events
    if (__start) {
        cuEventDestroy(__start);
        cuEventDestroy(__end);
        __start = __end = 0;
    }

    // Unload the module
    if (__mod) {
        CHECK_CALL_DEINIT_OK( cuModuleUnload(__mod), "cuModuleUnload" );
        __mod = 0;
    }

    // Only destroy the context if we own it
    if (weak_cuda_ctx) {
        CHECK_CALL_DEINIT_OK( cuCtxDestroy(weak_cuda_ctx), "cuCtxDestroy on exit" );
        weak_cuda_ctx = 0;
    }

    cuda_ctx_ptr = NULL;

    //CHECK_CALL( cuCtxPopCurrent(&ignore), "cuCtxPopCurrent" );
}
/** * This measures the overhead in launching a kernel function on each GPU in the * system. * * It does this by executing a small kernel (copying 1 value in global memory) a * very large number of times and taking the average execution time. This * program uses the CUDA driver API. */ int main() { CU_ERROR_CHECK(cuInit(0)); int count; CU_ERROR_CHECK(cuDeviceGetCount(&count)); float x = 5.0f; for (int d = 0; d < count; d++) { CUdevice device; CU_ERROR_CHECK(cuDeviceGet(&device, d)); CUcontext context; CU_ERROR_CHECK(cuCtxCreate(&context, 0, device)); CUdeviceptr in, out; CU_ERROR_CHECK(cuMemAlloc(&in, sizeof(float))); CU_ERROR_CHECK(cuMemAlloc(&out, sizeof(float))); CU_ERROR_CHECK(cuMemcpyHtoD(in, &x, sizeof(float))); CUmodule module; CU_ERROR_CHECK(cuModuleLoadData(&module, imageBytes)); CUfunction function; CU_ERROR_CHECK(cuModuleGetFunction(&function, module, "kernel")); void * params[] = { &in, &out }; CUevent start, stop; CU_ERROR_CHECK(cuEventCreate(&start, 0)); CU_ERROR_CHECK(cuEventCreate(&stop, 0)); CU_ERROR_CHECK(cuEventRecord(start, 0)); for (int i = 0; i < ITERATIONS; i++) CU_ERROR_CHECK(cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, params, NULL)); CU_ERROR_CHECK(cuEventRecord(stop, 0)); CU_ERROR_CHECK(cuEventSynchronize(stop)); float time; CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop)); CU_ERROR_CHECK(cuEventDestroy(start)); CU_ERROR_CHECK(cuEventDestroy(stop)); CU_ERROR_CHECK(cuMemFree(in)); CU_ERROR_CHECK(cuMemFree(out)); fprintf(stdout, "Device %d: %fms\n", d, (time / (double)ITERATIONS)); CU_ERROR_CHECK(cuModuleUnload(module)); CU_ERROR_CHECK(cuCtxDestroy(context)); } return 0; }
// Unloads module_ after making device_id_'s context current. Does nothing
// when built without HAVE_CUDA.
GpuCompilationContext::~GpuCompilationContext() {
#ifdef HAVE_CUDA
  const auto* mgr = static_cast<const CudaMgr_Namespace::CudaMgr*>(cuda_mgr_);
  mgr->setContext(device_id_);

  auto status = cuModuleUnload(module_);

  // TODO(alex): handle this race better
  // A deinitialized result from the driver is tolerated silently; any other
  // status is passed to checkCudaErrors.
  if (status != CUDA_ERROR_DEINITIALIZED) {
    checkCudaErrors(status);
  }
#endif
}
/* R binding for cuModuleUnload: resolves the module handle from r_hmod,
 * unloads it, and returns the CUresult converted by Renum_convert_CUresult. */
SEXP
R_auto_cuModuleUnload(SEXP r_hmod)
{
    CUmodule hmod = (CUmodule) getRReference(r_hmod);
    CUresult status = cuModuleUnload(hmod);

    return Renum_convert_CUresult(status);
}
//----------------------------------------------------------------------------// bool CUDAImpl::_UnloadModule(std::string * err) { CUresult c_err; if (_cudaBuild) { _cudaKernels.clear(); _cudaBuild = false; c_err = cuModuleUnload(_cudaModule); if (_cudaErrorLoadModule(c_err, err)) { return false; } } return true; }
int main() { CUresult result; result = cuInit(0); CUdevice device; result = cuDeviceGet(&device, 0); CUcontext ctx; result = cuCtxCreate(&ctx, 0, device); CUmodule module; result = cuModuleLoad(&module, "cuda-shift-throughput.cubin"); CUfunction kernel; result = cuModuleGetFunction(&kernel, module, "kernel"); int block; result = cuFuncGetAttribute(&block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, kernel); int grid = 1024 * 1024; CUevent event[2]; for (ptrdiff_t i = 0; i < 2; ++i) { result = cuEventCreate(&event[i], 0); } result = cuEventRecord(event[0], 0); result = cuLaunchKernel(kernel, grid, 1, 1, block, 1, 1, 0, 0, 0, 0); result = cuEventRecord(event[1], 0); result = cuEventSynchronize(event[1]); float time; result = cuEventElapsedTime(&time, event[0], event[1]); int gpuclock; result = cuDeviceGetAttribute(&gpuclock, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device); int gpump; result = cuDeviceGetAttribute(&gpump, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device); std::printf("Clock: %d KHz, # of MPs: %d\n", gpuclock, gpump); std::printf("Elapsed Time: %f milliseconds\n", time); std::printf("# of Threads: %d, # of SHLs : %lld\n", block, 1024ll * block * grid); std::printf("Throughput: %f\n", 1024.0 * block * grid / ((double) gpump * gpuclock * time)); for (ptrdiff_t i = 0; i < 2; ++i) { result = cuEventDestroy(event[i]); } result = cuModuleUnload(module); result = cuCtxDestroy(ctx); return 0; }
/*
 * module unload and destroy context
 *
 * Operates on the file-scope res, module and ctx. Prints a message and
 * exits with status 1 if either driver call fails.
 */
void clean_cuda(void)
{
    if ((res = cuModuleUnload(module)) != CUDA_SUCCESS) {
        printf("cuModuleUnload failed: res = %s\n", conv(res));
        exit(1);
    }

    if ((res = cuCtxDestroy(ctx)) != CUDA_SUCCESS) {
        printf("cuCtxDestroy failed: res = %s\n", conv(res));
        exit(1);
    }
}
/* Destroy the GPU side of context c: make its CUDA context current, run
 * gib_cpu_destroy(c), then free the device buffers (unless GIB_USE_MMAP),
 * unload the module and destroy the CUDA context. Exits the process if
 * gib_cpu_destroy does not return GIB_SUC; otherwise returns GIB_SUC.
 */
int gib_destroy ( gib_context c )
{
	/* TODO: Make sure everything created in gib_init is destroyed
	   here. */
	ERROR_CHECK_FAIL(cuCtxPushCurrent(((gpu_context)(c->acc_context))->pCtx));

	int cpu_status = gib_cpu_destroy(c);
	if (cpu_status != GIB_SUC) {
		printf("gib_cpu_destroy returned %i\n", cpu_status);
		exit(EXIT_FAILURE);
	}

	/* NOTE(review): acc_context is deliberately re-read here, after
	   gib_cpu_destroy, exactly as before -- confirm that c is still valid
	   at this point. */
	gpu_context gpu_c = (gpu_context) c->acc_context;

#if !GIB_USE_MMAP
	ERROR_CHECK_FAIL(cuMemFree(gpu_c->buffers));
#endif
	ERROR_CHECK_FAIL(cuModuleUnload(gpu_c->module));
	ERROR_CHECK_FAIL(cuCtxDestroy(gpu_c->pCtx));

	return GIB_SUC;
}
/* Drop one reference to k. When the count reaches zero, unload its module
 * (if it has a context), release the context reference, and free the kernel
 * together with the args, bin and types buffers it owns.
 */
static void _cuda_freekernel(gpukernel *k) {
  k->refcnt--;
  if (k->refcnt != 0)
    return;

  if (k->ctx != NULL) {
    cuda_enter(k->ctx);
    cuModuleUnload(k->m);
    cuda_exit(k->ctx);
    cuda_free_ctx(k->ctx);
  }

  /* Free the owned buffers BEFORE CLEAR(k). CLEAR is defined outside this
   * block; assuming it wipes *k (TODO confirm), running it first would turn
   * the three frees below into free(NULL) and leak the buffers. */
  free(k->args);
  free(k->bin);
  free(k->types);
  CLEAR(k);
  free(k);
}
int main(){ init_test(); const std::string test_source = ".version 4.2\n" ".target sm_20\n" ".address_size 64\n" ".visible .entry _Z6kernelPfi(\n" ".param .u64 _Z6kernelPfi_param_0,\n" ".param .u32 _Z6kernelPfi_param_1){\n" ".reg .pred %p<2>;\n" ".reg .f32 %f<3>;\n" ".reg .s32 %r<3>;\n" ".reg .s64 %rd<5>;\n" "ld.param.u64 %rd1, [_Z6kernelPfi_param_0];\n" "ld.param.u32 %r2, [_Z6kernelPfi_param_1];\n" "mov.u32 %r1, %tid.x;\n" "setp.ge.u32 %p1, %r1, %r2;\n" "@%p1 bra BB0_2;\n" "cvta.to.global.u64 %rd2, %rd1;\n" "cvt.rn.f32.u32 %f1, %r1;\n" "mul.f32 %f2, %f1, 0f3FC00000;\n" "mul.wide.u32 %rd3, %r1, 4;\n" "add.s64 %rd4, %rd2, %rd3;\n" "st.global.f32 [%rd4], %f2;\n" "BB0_2:\n" "ret;\n" "}"; CUmodule modId = 0; CUfunction funcHandle = 0; cu_assert(cuModuleLoadData(&modId, test_source.c_str())); cu_assert(cuModuleGetFunction(&funcHandle, modId, "_Z6kernelPfi")); CUdeviceptr devArray; int size = 10; float hostArray[size]; memset(hostArray, 0, size * sizeof(hostArray[0])); cu_assert(cuMemAlloc(&devArray, sizeof(float) * size)); void * params[] = {&devArray, &size}; auto result = cuLaunchKernel(funcHandle, 1,1,1, size*2,1,1, 0,0, params, nullptr); cu_assert(result); cu_assert(cuMemcpyDtoH(&hostArray, devArray, sizeof(hostArray[0])*size)); cu_assert(cuMemFree(devArray)); cu_assert(cuModuleUnload(modId)); for (int i=0 ; i<size ; ++i) std::cout << hostArray[i] << '\n'; return 0; }
void setupModuleResource(const char* kernelFileName) { CUmodule newModule = createModuleFromFile(kernelFileName); if (newModule != NULL) { if (module != NULL) cuModuleUnload(module); module = newModule; } checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "mainImage")); // TODO: take care of bytes size_t bytes; checkCudaErrors(cuModuleGetGlobal(&d_iResolution, &bytes, module, "iResolution")); checkCudaErrors(cuModuleGetGlobal(&d_iGlobalTime, &bytes, module, "iGlobalTime")); checkCudaErrors(cuModuleGetGlobal(&d_iMouse, &bytes, module, "iMouse")); checkCudaErrors(cuModuleGetGlobal(&d_fragColor, &d_fragColor_bytes, module, "fragColor")); }
/* Unload mod, then destroy ctx. On the first failure a message is printed
 * and that CUresult is returned without attempting the remaining step;
 * returns CUDA_SUCCESS when both calls succeed. */
CUresult cuda_driver_api_exit(CUcontext ctx, CUmodule mod)
{
	CUresult status = cuModuleUnload(mod);

	if (status == CUDA_SUCCESS) {
		status = cuCtxDestroy(ctx);
		if (status != CUDA_SUCCESS)
			printf("cuCtxDestroy failed: res = %lu\n", (unsigned long)status);
	} else {
		printf("cuModuleUnload failed: res = %lu\n", (unsigned long)status);
	}

	return status;
}
/* Unload device_info->module, then destroy device_info->context.
 * Returns 0 when both calls succeed; on the first failure prints a message
 * and returns -1 without attempting the remaining step. */
int mmult_gpu_close(struct device_info *device_info)
{
	CUresult status = cuModuleUnload(device_info->module);

	if (status == CUDA_SUCCESS) {
		status = cuCtxDestroy(device_info->context);
		if (status == CUDA_SUCCESS)
			return 0;
		printf("cuCtxDestroy failed: res = %lu\n", (unsigned long)status);
	} else {
		printf("cuModuleUnload failed: res = %lu\n", (unsigned long)status);
	}

	return -1;
}
// Synchronize, unload every module recorded in state_list, and destroy the
// context if it is the one held in weak_cuda_ctx. Returns immediately if no
// context could be acquired.
WEAK void halide_release(void *user_context) {
    DEBUG_PRINTF( user_context, "CUDA: halide_release (user_context: %p)\n", user_context );

    CUcontext ctx;
    int err = halide_acquire_cuda_context(user_context, &ctx);
    if (err != CUDA_SUCCESS || !ctx) {
        return;
    }

    // It's possible that this is being called from the destructor of
    // a static variable, in which case the driver may already be
    // shutting down.
    err = cuCtxSynchronize();
    halide_assert(user_context, err == CUDA_SUCCESS || err == CUDA_ERROR_DEINITIALIZED);

    // Unload the modules attached to this context. Note that the list
    // nodes themselves are not freed, only the module objects are
    // released. Subsequent calls to halide_init_kernels might re-create
    // the program object using the same list node to store the module
    // object.
    for (module_state *state = state_list; state; state = state->next) {
        if (!state->module) {
            continue;
        }
        DEBUG_PRINTF(user_context, " cuModuleUnload %p\n", state->module);
        err = cuModuleUnload(state->module);
        halide_assert(user_context, err == CUDA_SUCCESS || err == CUDA_ERROR_DEINITIALIZED);
        state->module = 0;
    }

    // Only destroy the context if we own it
    if (ctx == weak_cuda_ctx) {
        DEBUG_PRINTF(user_context, " cuCtxDestroy %p\n", weak_cuda_ctx);
        err = cuCtxDestroy(weak_cuda_ctx);
        halide_assert(user_context, err == CUDA_SUCCESS || err == CUDA_ERROR_DEINITIALIZED);
        weak_cuda_ctx = NULL;
    }

    halide_release_cuda_context(user_context);
}
/* Remove the image registered for TARGET_DATA from device ORD's image list,
   unload its module and free its bookkeeping.  Does nothing when VERSION is
   newer than this plugin supports or when no matching image is found.  The
   result of cuModuleUnload is not inspected.  */
void
GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
{
  struct ptx_device *dev = ptx_devices[ord];
  struct ptx_image_data **link;

  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
    return;

  pthread_mutex_lock (&dev->image_lock);

  /* LINK always points at the pointer that refers to the current node, so
     unlinking is a single store.  */
  link = &dev->images;
  while (*link != 0)
    {
      struct ptx_image_data *image = *link;

      if (image->target_data != target_data)
	{
	  link = &image->next;
	  continue;
	}

      *link = image->next;
      cuModuleUnload (image->module);
      free (image->fns);
      free (image);
      break;
    }

  pthread_mutex_unlock (&dev->image_lock);
}
int main(){ init_test(); const std::string source = ".version 4.2\n" ".target sm_20\n" ".address_size 64\n" ".visible .entry kernel_4(\n" ".param .u32 kernel_4_param_0,\n" ".param .u64 kernel_4_param_1\n" ")\n" "{\n" ".reg .s32 %r<3>;\n" ".reg .s64 %rd<3>;\n" "ld.param.u32 %r1, [kernel_4_param_0];\n" "ld.param.u64 %rd1, [kernel_4_param_1];\n" "cvta.to.global.u64 %rd2, %rd1;\n" "add.s32 %r2, %r1, 7;\n" "st.global.u32 [%rd2], %r2;\n" "ret;\n" "}"; CUmodule modId = 0; CUfunction funcHandle = 0; cu_assert(cuModuleLoadData(&modId, source.c_str())); cu_assert(cuModuleGetFunction(&funcHandle, modId, "kernel_4")); CUdeviceptr devValue; int hostValue = 10; cu_assert(cuMemAlloc(&devValue, sizeof(int))); void * params[] = {&hostValue, &devValue}; cu_assert(cuLaunchKernel(funcHandle, 1,1,1, 1,1,1, 0,0, params, nullptr)); int result = 0; cu_assert(cuMemcpyDtoH(&result, devValue, sizeof(result))); assert(result == hostValue + 7); std::cout << result << "\n"; cu_assert(cuMemFree(devValue)); cu_assert(cuModuleUnload(modId)); return 0; }
/// main - Program entry point int main(int argc, char** argv) { if (argc != 3) { printf("Usage: %s dataCount blockSize\n", argv[0]); exit(1); } CUdevice device; CUmodule cudaModule; CUcontext context; CUfunction function; CUlinkState linker; int devCount; // CUDA initialization checkCudaErrors(cuInit(0)); checkCudaErrors(cuDeviceGetCount(&devCount)); checkCudaErrors(cuDeviceGet(&device, 0)); char name[128]; checkCudaErrors(cuDeviceGetName(name, 128, device)); std::cout << "Using CUDA Device [0]: " << name << "\n"; int devMajor, devMinor; checkCudaErrors(cuDeviceComputeCapability(&devMajor, &devMinor, device)); std::cout << "Device Compute Capability: " << devMajor << "." << devMinor << "\n"; if (devMajor < 2) { std::cerr << "ERROR: Device 0 is not SM 2.0 or greater\n"; return 1; } std::ifstream t("kernel.ptx"); if (!t.is_open()) { std::cerr << "kernel.ptx not found\n"; return 1; } std::string str((std::istreambuf_iterator<char>(t)), std::istreambuf_iterator<char>()); // Create driver context checkCudaErrors(cuCtxCreate(&context, 0, device)); // Create module for object checkCudaErrors(cuModuleLoadDataEx(&cudaModule, str.c_str(), 0, 0, 0)); // Get kernel function checkCudaErrors(cuModuleGetFunction(&function, cudaModule, "kernel")); // Device data CUdeviceptr devBufferA; CUdeviceptr devBufferB; CUdeviceptr devBufferC; CUdeviceptr devBufferSMid; // Size unsigned dataCount = atoi(argv[1]); checkCudaErrors(cuMemAlloc(&devBufferA, sizeof(float) * dataCount)); checkCudaErrors(cuMemAlloc(&devBufferB, sizeof(float) * dataCount)); checkCudaErrors(cuMemAlloc(&devBufferC, sizeof(float) * dataCount)); checkCudaErrors(cuMemAlloc(&devBufferSMid, sizeof(int) * dataCount)); float* hostA = new float[dataCount]; float* hostB = new float[dataCount]; float* hostC = new float[dataCount]; int* hostSMid = new int[dataCount]; // Populate input for (unsigned i = 0; i != dataCount; ++i) { hostA[i] = (float)i; hostB[i] = (float)(2 * i); hostC[i] = 2.0f; hostSMid[i] = 0; } checkCudaErrors( 
cuMemcpyHtoD(devBufferA, &hostA[0], sizeof(float) * dataCount)); checkCudaErrors( cuMemcpyHtoD(devBufferB, &hostB[0], sizeof(float) * dataCount)); unsigned blockSizeX = atoi(argv[2]); unsigned blockSizeY = 1; unsigned blockSizeZ = 1; unsigned gridSizeX = (dataCount + blockSizeX - 1) / blockSizeX; unsigned gridSizeY = 1; unsigned gridSizeZ = 1; // Kernel parameters void* KernelParams[] = {&devBufferA, &devBufferB, &devBufferC, &devBufferSMid}; std::cout << "Launching kernel\n"; // Kernel launch checkCudaErrors(cuLaunchKernel(function, gridSizeX, gridSizeY, gridSizeZ, blockSizeX, blockSizeY, blockSizeZ, 0, NULL, KernelParams, NULL)); // Retrieve device data checkCudaErrors( cuMemcpyDtoH(&hostC[0], devBufferC, sizeof(float) * dataCount)); checkCudaErrors( cuMemcpyDtoH(&hostSMid[0], devBufferSMid, sizeof(int) * dataCount)); std::cout << "Results:\n"; std::cout << "SM " << hostSMid[0] << ":" << hostA[0] << " + " << hostB[0] << " = " << hostC[0] << "\n"; for (unsigned i = 1; i != dataCount; i++) { if (hostSMid[i] != hostSMid[i - 1]) std::cout << "SM " << hostSMid[i] << ":" << hostA[i] << " + " << hostB[i] << " = " << hostC[i] << "\n"; } // Clean up after ourselves delete[] hostA; delete[] hostB; delete[] hostC; delete[] hostSMid; // Clean-up checkCudaErrors(cuMemFree(devBufferA)); checkCudaErrors(cuMemFree(devBufferB)); checkCudaErrors(cuMemFree(devBufferC)); checkCudaErrors(cuMemFree(devBufferSMid)); checkCudaErrors(cuModuleUnload(cudaModule)); checkCudaErrors(cuCtxDestroy(context)); return 0; }
//------------------------------------------------------------------------------ void mat_mul_test(const std::vector< std::string >& file_paths, const char* kernel_name, int size, int grid_dim_x, int grid_dim_y, int grid_dim_z, int block_dim_x, int block_dim_y, int block_dim_z) { const int MATRIX_WIDTH = size; const int MATRIX_HEIGHT = MATRIX_WIDTH; const int VECTOR_SIZE = MATRIX_WIDTH; const int MATRIX_SIZE = MATRIX_WIDTH * MATRIX_HEIGHT; const int MATRIX_BYTE_SIZE = sizeof(real_t) * MATRIX_SIZE; const int VECTOR_BYTE_SIZE = sizeof(real_t) * VECTOR_SIZE; CCHECK(cuInit(0)); array_t in_matrix_h(MATRIX_SIZE, real_t(1)); array_t in_vector_h(VECTOR_SIZE, real_t(1)); array_t out_vector_h(VECTOR_SIZE, real_t(0)); CUdeviceptr in_matrix_d = 0; CUdeviceptr in_vector_d = 0; CUdeviceptr out_vector_d = 0; CUdevice device = CUdevice(); CUcontext ctx = CUcontext(); CCHECK(cuCtxCreate(&ctx, 0, device)); CCHECK(cuMemAlloc(&in_matrix_d, MATRIX_BYTE_SIZE)); assert(in_matrix_d); CCHECK(cuMemAlloc(&in_vector_d, VECTOR_BYTE_SIZE)); assert(in_vector_d); CCHECK(cuMemAlloc(&out_vector_d, VECTOR_BYTE_SIZE)); assert(out_vector_d); CCHECK(cuMemcpy(in_matrix_d, CUdeviceptr(&in_matrix_h[0]), MATRIX_BYTE_SIZE)); CCHECK(cuMemcpy(in_vector_d, CUdeviceptr(&in_vector_h[0]), VECTOR_BYTE_SIZE)); CCHECK(cuMemcpy(out_vector_d, CUdeviceptr(&out_vector_h[0]), VECTOR_BYTE_SIZE)); CUmodule module = CUmodule(); CUfunction fun = CUfunction(); build(module, fun, file_paths, kernel_name); void* kernel_params[] = {&in_matrix_d, (void *)(&MATRIX_WIDTH), (void *)(&MATRIX_HEIGHT), &in_vector_d, &out_vector_d}; CCHECK(cuLaunchKernel(fun, grid_dim_x, grid_dim_y, grid_dim_z, block_dim_x, block_dim_y, block_dim_z, 0,//shared_mem_bytes 0,//stream kernel_params, 0)); CCHECK(cuMemcpy(CUdeviceptr(&out_vector_h[0]), out_vector_d, VECTOR_BYTE_SIZE)); // print first two and last elements std::cout << "vector[0] = " << out_vector_h[ 0 ] << '\n'; std::cout << "vector[1] = " << out_vector_h[ 1 ] << '\n'; std::cout << 
"vector[last] = " << out_vector_h.back() << std::endl; CCHECK(cuMemFree(in_matrix_d)); CCHECK(cuMemFree(in_vector_d)); CCHECK(cuMemFree(out_vector_d)); CCHECK(cuModuleUnload(module)); CCHECK(cuCtxDestroy(ctx)); }
int main() { CU_ERROR_CHECK(cuInit(0)); int count; CU_ERROR_CHECK(cuDeviceGetCount(&count)); count = (count > 2) ? 2 : count; CUdevice devices[count]; for (int i = 0; i < count; i++) CU_ERROR_CHECK(cuDeviceGet(&devices[i], i)); // Question 1: Can you create multiple contexts on the same device? { fprintf(stderr, "Attempting to create multiple contexts on each device...\n"); CUcontext contexts[count * N]; size_t j = 0; for (int i = 0; i < count; i++) { CUresult error = CUDA_SUCCESS; size_t k; for (k = 0; k < N && error == CUDA_SUCCESS; k++) { error = cuCtxCreate(&contexts[j], CU_CTX_SCHED_AUTO, devices[i]); if (error == CUDA_SUCCESS) CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[j++])); } fprintf(stderr, " created %zu contexts on device %d before cuCtxCreate returned \"%s\"\n", (k - 1), i, cuGetErrorString(error)); } CUresult error = CUDA_SUCCESS; size_t k; for (k = 0; k < j && error == CUDA_SUCCESS; k++) error = cuCtxPushCurrent(contexts[k]); if (error == CUDA_SUCCESS) fprintf(stderr, " successfully pushed %zu contexts with cuCtxPushCurrent\n", k); else fprintf(stderr, " pushed %zu contexts before cuCtxPushCurrent returned \"%s\"\n", (k - 1), cuGetErrorString(error)); for (size_t k = 0; k < j; k++) CU_ERROR_CHECK(cuCtxDestroy(contexts[k])); fprintf(stderr, "\n"); } CUcontext contexts[count][2]; for (int i = 0; i < count; i++) { for (size_t j = 0; j < 2; j++) { CU_ERROR_CHECK(cuCtxCreate(&contexts[i][j], CU_CTX_SCHED_AUTO, devices[i])); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[i][j])); } } // Question 2: Can you access a host pointer in a different context from // which it was created? // Question 3: Can you free a host pointer in a different context from which // it was created? 
{ void * hPtr; CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0])); CU_ERROR_CHECK(cuMemAllocHost(&hPtr, 1024)); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0])); CUdeviceptr dPtr[count]; CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1])); CU_ERROR_CHECK(cuMemAlloc(&dPtr[0], 1024)); // Different context, same device fprintf(stderr, "Accessing a host pointer from a different context to which it was allocated (on the same device) returns \"%s\"\n", cuGetErrorString(cuMemcpyHtoD(dPtr[0], hPtr, 1024))); CU_ERROR_CHECK(cuMemFree(dPtr[0])); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1])); if (count > 1) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0])); CU_ERROR_CHECK(cuMemAlloc(&dPtr[1], 1024)); // Different context, different device fprintf(stderr, "Accessing a host pointer from a different context to which it was allocated (on a different device) returns \"%s\"\n", cuGetErrorString(cuMemcpyHtoD(dPtr[1], hPtr, 1024))); CU_ERROR_CHECK(cuMemFree(dPtr[1])); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0])); } fprintf(stderr, "\n"); CUresult error = CUDA_ERROR_UNKNOWN; if (count > 1) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0])); error = cuMemFreeHost(hPtr); fprintf(stderr, "Freeing a host pointer from a different context to which it was allocated (on a different device) returns \"%s\"\n", cuGetErrorString(error)); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0])); } if (error != CUDA_SUCCESS) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1])); error = cuMemFreeHost(hPtr); fprintf(stderr, "Freeing a host pointer from a different context to which it was allocated (on the same device) returns \"%s\"\n", cuGetErrorString(error)); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1])); } if (error != CUDA_SUCCESS) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0])); error = cuMemFreeHost(hPtr); fprintf(stderr, "Freeing a host pointer from the same context to which it was allocated returns \"%s\"\n", cuGetErrorString(error)); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0])); } 
fprintf(stderr, "\n"); } // Question 4: Can you access a device pointer in a different context from // which it was created? // Question 5: Can you free a device pointer in a different context from which // it was created? { CUdeviceptr dPtr[count][2]; CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0])); CU_ERROR_CHECK(cuMemAlloc(&dPtr[0][0], 1024)); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0])); CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1])); CU_ERROR_CHECK(cuMemAlloc(&dPtr[0][1], 1024)); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1])); CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1])); fprintf(stderr, "Accessing a device pointer from a different context to which it was allocated (on the same device) returns \"%s\"\n", cuGetErrorString(cuMemcpyDtoD(dPtr[0][0], dPtr[0][1], 1024))); CU_ERROR_CHECK(cuMemFree(dPtr[0][1])); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1])); if (count > 1) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0])); CU_ERROR_CHECK(cuMemAlloc(&dPtr[1][0], 1024)); // Different context, different device fprintf(stderr, "Accessing a device pointer from a different context to which it was allocated (on a different device) returns \"%s\"\n", cuGetErrorString(cuMemcpyDtoD(dPtr[0][0], dPtr[1][0], 1024))); CU_ERROR_CHECK(cuMemFree(dPtr[1][0])); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0])); } fprintf(stderr, "\n"); CUresult error = CUDA_ERROR_UNKNOWN; if (count > 1) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0])); error = cuMemFree(dPtr[0][0]); fprintf(stderr, "Freeing a device pointer from a different context to which it was allocated (on a different device) returns \"%s\"\n", cuGetErrorString(error)); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0])); } if (error != CUDA_SUCCESS) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1])); error = cuMemFree(dPtr[0][0]); fprintf(stderr, "Freeing a device pointer from a different context to which it was allocated (on the same device) returns \"%s\"\n", cuGetErrorString(error)); 
CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1])); } if (error != CUDA_SUCCESS) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0])); error = cuMemFree(dPtr[0][0]); fprintf(stderr, "Freeing a device pointer from the same context to which it was allocated returns \"%s\"\n", cuGetErrorString(error)); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0])); } fprintf(stderr, "\n"); } // Question 6: Can you access a module in a different context from which it // was loaded? // Question 7: Can you unload a module in a different context from which it // was loaded? { CUmodule module; CUdeviceptr ptr; CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0])); CU_ERROR_CHECK(cuModuleLoad(&module, "kernel-test.ptx")); CU_ERROR_CHECK(cuMemAlloc(&ptr, sizeof(float))); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0])); CUfunction function = 0; if (count > 0) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0])); fprintf(stderr, "Getting a function pointer from a different context to which it was loaded (on a different device) returns \"%s\"\n", cuGetErrorString(cuModuleGetFunction(&function, module, "kernel"))); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0])); } if (function == 0) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1])); fprintf(stderr, "Getting a function pointer from a different context to which it was loaded (on the same device) returns \"%s\"\n", cuGetErrorString(cuModuleGetFunction(&function, module, "kernel"))); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1])); } if (function == 0) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0])); fprintf(stderr, "Getting a function pointer from the same context to which it was loaded returns \"%s\"\n", cuGetErrorString(cuModuleGetFunction(&function, module, "kernel"))); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0])); } fprintf(stderr, "\n"); CUdeviceptr a, b; CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0])); CU_ERROR_CHECK(cuMemAlloc(&a, sizeof(float))); CU_ERROR_CHECK(cuMemAlloc(&b, sizeof(float))); 
CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0])); void * params[] = { &a, & b }; CUresult error = CUDA_ERROR_UNKNOWN; if (count > 0) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0])); fprintf(stderr, "Launching a function from a different context to which it was loaded (on a different device) returns \"%s\"\n", cuGetErrorString(error = cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, params, NULL))); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0])); } if (error != CUDA_SUCCESS) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1])); fprintf(stderr, "Launching a function from a different context to which it was loaded (on the same device) returns \"%s\"\n", cuGetErrorString(error = cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, params, NULL))); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1])); } if (error != CUDA_SUCCESS) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0])); fprintf(stderr, "Launching a function from the same context to which it was loaded returns \"%s\"\n", cuGetErrorString(error = cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, params, NULL))); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0])); } fprintf(stderr, "\n"); error = CUDA_ERROR_UNKNOWN; if (count > 0) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0])); fprintf(stderr, "Unloading a module from a different context to which it was loaded (on a different device) returns \"%s\"\n", cuGetErrorString(error = cuModuleUnload(module))); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0])); } if (error != CUDA_SUCCESS) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1])); fprintf(stderr, "Unloading a module from a different context to which it was loaded (on the same device) returns \"%s\"\n", cuGetErrorString(error = cuModuleUnload(module))); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1])); } if (error != CUDA_SUCCESS) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0])); fprintf(stderr, "Unloading a module from the same context to which it was loaded returns \"%s\"\n", cuGetErrorString(error = 
cuModuleUnload(module))); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0])); } CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0])); CU_ERROR_CHECK(cuMemFree(a)); CU_ERROR_CHECK(cuMemFree(b)); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0])); } for (int i = 0; i < count; i++) { for (size_t j = 0; j < 2; j++) CU_ERROR_CHECK(cuCtxDestroy(contexts[i][j])); } return 0; }
int cuda_test_madd_vmmap_hybrid(unsigned int n, char *path) { int i, j, idx; CUresult res; CUdevice dev; CUcontext ctx; CUfunction function; CUmodule module; CUdeviceptr a_dev, b_dev, c_dev; unsigned int *a_buf, *b_buf, *c_buf; unsigned long long int a_phys, b_phys, c_phys; unsigned int *c = (unsigned int *) malloc (n*n * sizeof(unsigned int)); int block_x, block_y, grid_x, grid_y; char fname[256]; int ret = 0; struct timeval tv; struct timeval tv_total_start, tv_total_end; float total; struct timeval tv_h2d_start, tv_h2d_end; float h2d; struct timeval tv_d2h_start, tv_d2h_end; float d2h; struct timeval tv_exec_start, tv_exec_end; struct timeval tv_mem_alloc_start; struct timeval tv_data_init_start; float data_init; struct timeval tv_conf_kern_start; struct timeval tv_close_start; float mem_alloc; float exec; float init_gpu; float configure_kernel; float close_gpu; float data_read; unsigned int dummy_b, dummy_c; /* block_x * block_y should not exceed 512. */ block_x = n < 16 ? n : 16; block_y = n < 16 ? 
n : 16; grid_x = n / block_x; if (n % block_x != 0) grid_x++; grid_y = n / block_y; if (n % block_y != 0) grid_y++; gettimeofday(&tv_total_start, NULL); res = cuInit(0); if (res != CUDA_SUCCESS) { printf("cuInit failed: res = %lu\n", (unsigned long)res); return -1; } res = cuDeviceGet(&dev, 0); if (res != CUDA_SUCCESS) { printf("cuDeviceGet failed: res = %lu\n", (unsigned long)res); return -1; } res = cuCtxCreate(&ctx, 0, dev); if (res != CUDA_SUCCESS) { printf("cuCtxCreate failed: res = %lu\n", (unsigned long)res); return -1; } sprintf(fname, "%s/madd_gpu.cubin", path); res = cuModuleLoad(&module, fname); if (res != CUDA_SUCCESS) { printf("cuModuleLoad() failed\n"); return -1; } res = cuModuleGetFunction(&function, module, "_Z3addPjS_S_j"); if (res != CUDA_SUCCESS) { printf("cuModuleGetFunction() failed\n"); return -1; } res = cuFuncSetBlockShape(function, block_x, block_y, 1); if (res != CUDA_SUCCESS) { printf("cuFuncSetBlockShape() failed\n"); return -1; } gettimeofday(&tv_mem_alloc_start, NULL); /* a[] */ res = cuMemAlloc(&a_dev, n*n * sizeof(unsigned int)); if (res != CUDA_SUCCESS) { printf("cuMemAlloc (a) failed\n"); return -1; } res = cuMemMap((void**)&a_buf, a_dev, n*n * sizeof(unsigned int)); if (res != CUDA_SUCCESS) { printf("cuMemMap (a) failed\n"); return -1; } res = cuMemGetPhysAddr(&a_phys, (void*)a_buf); if (res != CUDA_SUCCESS) { printf("cuMemGetPhysAddress (a) failed\n"); return -1; } /*printf("a[]: Physical Address 0x%llx\n", a_phys);*/ /* b[] */ res = cuMemAlloc(&b_dev, n*n * sizeof(unsigned int)); if (res != CUDA_SUCCESS) { printf("cuMemAlloc (b) failed\n"); return -1; } res = cuMemMap((void**)&b_buf, b_dev, n*n * sizeof(unsigned int)); if (res != CUDA_SUCCESS) { printf("cuMemMap (b) failed\n"); return -1; } res = cuMemGetPhysAddr(&b_phys, (void*)b_buf); if (res != CUDA_SUCCESS) { printf("cuMemGetPhysAddress (b) failed\n"); return -1; } /*printf("b[]: Physical Address 0x%llx\n", b_phys);*/ /* c[] */ res = cuMemAlloc(&c_dev, n*n * sizeof(unsigned 
int)); if (res != CUDA_SUCCESS) { printf("cuMemAlloc (c) failed\n"); return -1; } res = cuMemMap((void**)&c_buf, c_dev, n*n * sizeof(unsigned int)); if (res != CUDA_SUCCESS) { printf("cuMemMap (c) failed\n"); return -1; } res = cuMemGetPhysAddr(&c_phys, (void*)c_buf); if (res != CUDA_SUCCESS) { printf("cuMemGetPhysAddress (c) failed\n"); return -1; } /*printf("c[]: Physical Address 0x%llx\n", c_phys);*/ gettimeofday(&tv_data_init_start, NULL); /* initialize A[] & B[] */ for (i = 0; i < n; i++) { idx = i*n; for(j = 0; j < n; j++) { a_buf[idx++] = i; } } for (i = 0; i < n; i++) { idx = i*n; for(j = 0; j < n; j++) { b_buf[idx++] = i; } } gettimeofday(&tv_h2d_start, NULL); gettimeofday(&tv_h2d_end, NULL); gettimeofday(&tv_conf_kern_start, NULL); /* set kernel parameters */ res = cuParamSeti(function, 0, a_dev); if (res != CUDA_SUCCESS) { printf("cuParamSeti (a) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuParamSeti(function, 4, a_dev >> 32); if (res != CUDA_SUCCESS) { printf("cuParamSeti (a) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuParamSeti(function, 8, b_dev); if (res != CUDA_SUCCESS) { printf("cuParamSeti (b) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuParamSeti(function, 12, b_dev >> 32); if (res != CUDA_SUCCESS) { printf("cuParamSeti (b) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuParamSeti(function, 16, c_dev); if (res != CUDA_SUCCESS) { printf("cuParamSeti (c) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuParamSeti(function, 20, c_dev >> 32); if (res != CUDA_SUCCESS) { printf("cuParamSeti (c) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuParamSeti(function, 24, n); if (res != CUDA_SUCCESS) { printf("cuParamSeti (c) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuParamSetSize(function, 28); if (res != CUDA_SUCCESS) { printf("cuParamSetSize failed: res = %lu\n", (unsigned long)res); return -1; } gettimeofday(&tv_exec_start, NULL); 
/* launch the kernel */ res = cuLaunchGrid(function, grid_x, grid_y); if (res != CUDA_SUCCESS) { printf("cuLaunchGrid failed: res = %lu\n", (unsigned long)res); return -1; } cuCtxSynchronize(); gettimeofday(&tv_exec_end, NULL); gettimeofday(&tv_d2h_start, NULL); /* download c[] */ memcpy(c, c_buf, n*n*sizeof(unsigned int)); gettimeofday(&tv_d2h_end, NULL); /* Read back */ for (i = 0; i < n; i++) { idx = i*n; for(j = 0; j < n; j++) { dummy_c = c[idx++]; } } gettimeofday(&tv_close_start, NULL); res = cuMemUnmap((void*)a_buf); if (res != CUDA_SUCCESS) { printf("cuMemUnmap (a) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuMemFree(a_dev); if (res != CUDA_SUCCESS) { printf("cuMemFree (a) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuMemUnmap((void*)b_buf); if (res != CUDA_SUCCESS) { printf("cuMemUnmap (b) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuMemFree(b_dev); if (res != CUDA_SUCCESS) { printf("cuMemFree (b) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuMemUnmap((void*)c_buf); if (res != CUDA_SUCCESS) { printf("cuMemUnmap (c) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuMemFree(c_dev); if (res != CUDA_SUCCESS) { printf("cuMemFree (c) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuModuleUnload(module); if (res != CUDA_SUCCESS) { printf("cuModuleUnload failed: res = %lu\n", (unsigned long)res); return -1; } res = cuCtxDestroy(ctx); if (res != CUDA_SUCCESS) { printf("cuCtxDestroy failed: res = %lu\n", (unsigned long)res); return -1; } gettimeofday(&tv_total_end, NULL); tvsub(&tv_mem_alloc_start, &tv_total_start, &tv); init_gpu = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; tvsub(&tv_data_init_start, &tv_mem_alloc_start, &tv); mem_alloc = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; tvsub(&tv_h2d_start, &tv_data_init_start, &tv); data_init = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; tvsub(&tv_h2d_end, &tv_h2d_start, &tv); h2d = tv.tv_sec * 
1000.0 + (float) tv.tv_usec / 1000.0; tvsub(&tv_exec_start, &tv_conf_kern_start, &tv); configure_kernel = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; tvsub(&tv_exec_end, &tv_exec_start, &tv); exec = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; tvsub(&tv_d2h_end, &tv_d2h_start, &tv); d2h = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; tvsub(&tv_close_start, &tv_d2h_end, &tv); data_read = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; tvsub(&tv_total_end, &tv_close_start, &tv); close_gpu = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; tvsub(&tv_total_end, &tv_total_start, &tv); total = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; printf("Init: %f\n", init_gpu); printf("MemAlloc: %f\n", mem_alloc); printf("DataInit: %f\n", data_init); printf("HtoD: %f\n", h2d); printf("KernConf: %f\n", configure_kernel); printf("Exec: %f\n", exec); printf("DtoH: %f\n", d2h); printf("DataRead: %f\n", data_read); printf("Close: %f\n", close_gpu); printf("Total: %f\n", total); return ret; }
int cuda_test_fmadd(unsigned int n, char *path) { int i, j, idx; CUresult res; CUdevice dev; CUcontext ctx; CUfunction function; CUmodule module; CUdeviceptr a_dev, b_dev, c_dev; float *a = (float *) malloc (n*n * sizeof(float)); float *b = (float *) malloc (n*n * sizeof(float)); float *c = (float *) malloc (n*n * sizeof(float)); int block_x, block_y, grid_x, grid_y; int offset; char fname[256]; struct timeval tv; struct timeval tv_total_start, tv_total_end; float total; struct timeval tv_h2d_start, tv_h2d_end; float h2d; struct timeval tv_d2h_start, tv_d2h_end; float d2h; struct timeval tv_exec_start, tv_exec_end; float exec; /* initialize A[] & B[] */ for (i = 0; i < n; i++) { for(j = 0; j < n; j++) { idx = i * n + j; a[idx] = i + 0.1; b[idx] = i + 0.1; } } /* block_x * block_y should not exceed 512. */ block_x = n < 16 ? n : 16; block_y = n < 16 ? n : 16; grid_x = n / block_x; if (n % block_x != 0) grid_x++; grid_y = n / block_y; if (n % block_y != 0) grid_y++; printf("block = (%d, %d)\n", block_x, block_y); printf("grid = (%d, %d)\n", grid_x, grid_y); gettimeofday(&tv_total_start, NULL); res = cuInit(0); if (res != CUDA_SUCCESS) { printf("cuInit failed: res = %lu\n", (unsigned long)res); return -1; } res = cuDeviceGet(&dev, 0); if (res != CUDA_SUCCESS) { printf("cuDeviceGet failed: res = %lu\n", (unsigned long)res); return -1; } res = cuCtxCreate(&ctx, 0, dev); if (res != CUDA_SUCCESS) { printf("cuCtxCreate failed: res = %lu\n", (unsigned long)res); return -1; } sprintf(fname, "%s/fmadd_gpu.cubin", path); res = cuModuleLoad(&module, fname); if (res != CUDA_SUCCESS) { printf("cuModuleLoad() failed\n"); return -1; } res = cuModuleGetFunction(&function, module, "_Z3addPfS_S_i"); if (res != CUDA_SUCCESS) { printf("cuModuleGetFunction() failed\n"); return -1; } res = cuFuncSetSharedSize(function, 0x40); /* just random */ if (res != CUDA_SUCCESS) { printf("cuFuncSetSharedSize() failed\n"); return -1; } res = cuFuncSetBlockShape(function, block_x, block_y, 1); if (res 
!= CUDA_SUCCESS) { printf("cuFuncSetBlockShape() failed\n"); return -1; } /* a[] */ res = cuMemAlloc(&a_dev, n*n * sizeof(float)); if (res != CUDA_SUCCESS) { printf("cuMemAlloc (a) failed\n"); return -1; } /* b[] */ res = cuMemAlloc(&b_dev, n*n * sizeof(float)); if (res != CUDA_SUCCESS) { printf("cuMemAlloc (b) failed\n"); return -1; } /* c[] */ res = cuMemAlloc(&c_dev, n*n * sizeof(float)); if (res != CUDA_SUCCESS) { printf("cuMemAlloc (c) failed\n"); return -1; } gettimeofday(&tv_h2d_start, NULL); /* upload a[] and b[] */ res = cuMemcpyHtoD(a_dev, a, n*n * sizeof(float)); if (res != CUDA_SUCCESS) { printf("cuMemcpyHtoD (a) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuMemcpyHtoD(b_dev, b, n*n * sizeof(float)); if (res != CUDA_SUCCESS) { printf("cuMemcpyHtoD (b) failed: res = %lu\n", (unsigned long)res); return -1; } gettimeofday(&tv_h2d_end, NULL); /* set kernel parameters */ offset = 0; res = cuParamSetv(function, offset, &a_dev, sizeof(a_dev)); if (res != CUDA_SUCCESS) { printf("cuParamSeti (a) failed: res = %lu\n", (unsigned long)res); return -1; } offset += sizeof(a_dev); res = cuParamSetv(function, offset, &b_dev, sizeof(b_dev)); if (res != CUDA_SUCCESS) { printf("cuParamSeti (b) failed: res = %lu\n", (unsigned long)res); return -1; } offset += sizeof(b_dev); res = cuParamSetv(function, offset, &c_dev, sizeof(c_dev)); if (res != CUDA_SUCCESS) { printf("cuParamSeti (c) failed: res = %lu\n", (unsigned long)res); return -1; } offset += sizeof(c_dev); res = cuParamSetv(function, offset, &n, sizeof(n)); if (res != CUDA_SUCCESS) { printf("cuParamSeti (c) failed: res = %lu\n", (unsigned long)res); return -1; } offset += sizeof(n); res = cuParamSetSize(function, offset); if (res != CUDA_SUCCESS) { printf("cuParamSetSize failed: res = %lu\n", (unsigned long)res); return -1; } gettimeofday(&tv_exec_start, NULL); /* launch the kernel */ res = cuLaunchGrid(function, grid_x, grid_y); if (res != CUDA_SUCCESS) { printf("cuLaunchGrid failed: res = %lu\n", 
(unsigned long)res); return -1; } cuCtxSynchronize(); gettimeofday(&tv_exec_end, NULL); gettimeofday(&tv_d2h_start, NULL); /* download c[] */ res = cuMemcpyDtoH(c, c_dev, n*n * sizeof(float)); if (res != CUDA_SUCCESS) { printf("cuMemcpyDtoH (c) failed: res = %lu\n", (unsigned long)res); return -1; } gettimeofday(&tv_d2h_end, NULL); res = cuMemFree(a_dev); if (res != CUDA_SUCCESS) { printf("cuMemFree (a) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuMemFree(b_dev); if (res != CUDA_SUCCESS) { printf("cuMemFree (b) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuMemFree(c_dev); if (res != CUDA_SUCCESS) { printf("cuMemFree (c) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuModuleUnload(module); if (res != CUDA_SUCCESS) { printf("cuModuleUnload failed: res = %lu\n", (unsigned long)res); return -1; } res = cuCtxDestroy(ctx); if (res != CUDA_SUCCESS) { printf("cuCtxDestroy failed: res = %lu\n", (unsigned long)res); return -1; } gettimeofday(&tv_total_end, NULL); /* check the results */ i = j = idx = 0; while (i < n) { while (j < n) { idx = i * n + j; if (c[idx] != a[idx] + b[idx]) { printf("c[%d] = %f\n", idx, c[idx]); printf("a[%d]+b[%d] = %f\n", idx, idx, a[idx]+b[idx]); return -1; } j++; } i++; } free(a); free(b); free(c); tvsub(&tv_h2d_end, &tv_h2d_start, &tv); h2d = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; tvsub(&tv_d2h_end, &tv_d2h_start, &tv); d2h = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; tvsub(&tv_exec_end, &tv_exec_start, &tv); exec = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; tvsub(&tv_total_end, &tv_total_start, &tv); total = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; printf("HtoD: %f\n", h2d); printf("DtoH: %f\n", d2h); printf("Exec: %f\n", exec); printf("Time (Memcpy + Launch): %f\n", h2d + d2h + exec); printf("Total: %f\n", total); return 0; }
int main( int argc, char **argv ) { CUdevice main_device = 0; CUcontext main_context = nullptr; CUmodule mod_vectorAdd = nullptr; CUfunction fun_vectorAdd = nullptr; std::string path_vectorAdd( "D:/devel/vectoradd-cuda-driverAPI/vectorAdd.cu" ); std::string ptx_vectorAdd ( "D:/devel/vectoradd-cuda-driverAPI/vectorAdd.ptx" ); CUdeviceptr input_data = 0u; CUdeviceptr output_data = 0u; std::size_t problem_size = 1024; try { //Initialize the driver API check_error( cuInit( 0u ) ); { int device_count = 0u; check_error( cuDeviceGetCount( &device_count ) ); if( ! device_count ) { std::cerr << "No CUDA devices available" << std::endl; throw CUDA_ERROR_NO_DEVICE; } } check_error( cuDeviceGet( &main_device, 0 ) ); check_error( cuCtxCreate( &main_context, 0, main_device ) ); //Try to manually compile the source file { std::stringstream build_command; build_command << "nvcc " "-ptx " "-o " << ptx_vectorAdd << " " << path_vectorAdd; if( int build_status = system( build_command.str( ).c_str( ) ) ) { std::cerr << "Failed to compile source cuda file into a ptx assembly" << std::endl; throw CUDA_ERROR_UNKNOWN; } //Find module entry with assembly std::string str_assembly; { std::ifstream fassembly( ptx_vectorAdd ); if( !fassembly.is_open( ) ) { std::cerr << "'Vector Add' assembly unavailable" << std::endl; throw CUDA_ERROR_FILE_NOT_FOUND; } fassembly.seekg (0, std::ios::end); str_assembly.resize( std::string::size_type( fassembly.tellg() ) ); fassembly.seekg (0, std::ios::beg); fassembly.read( &str_assembly[0], str_assembly.size( ) ); fassembly.close( ); } auto entry_pos = str_assembly.find( ".entry" ); if( entry_pos == std::string::npos ) { std::cerr << "No entry point in 'Vector Add'" << std::endl; throw CUDA_ERROR_INVALID_SOURCE; } entry_pos += 6u; //".entry".size( ) auto search_limit = str_assembly.find_first_of( " (", entry_pos ); if( search_limit == std::string::npos ) { std::cerr << "No entry point in 'Vector Add'" << std::endl; throw CUDA_ERROR_INVALID_SOURCE; } std::string 
funcName( str_assembly.substr( entry_pos, search_limit ) ); check_error( cuModuleLoad ( &mod_vectorAdd, ptx_vectorAdd.c_str( ) ) ); check_error( cuModuleGetFunction ( &fun_vectorAdd, mod_vectorAdd, funcName.c_str( ) ) ); } //Play with buffer cuMemAlloc( &input_data, problem_size * sizeof( float ) ); cuMemAlloc( &output_data, problem_size * sizeof( float ) ); { int threadsPerBlock = 256; int blocksPerGrid = (problem_size + threadsPerBlock - 1) / threadsPerBlock; void* args[] = { &input_data, &output_data, &problem_size }; cuLaunchKernel( fun_vectorAdd, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, 0, args, nullptr); } float* result = new float[problem_size]; cuMemcpyDtoH( result, output_data, problem_size * sizeof( float ) ); std::copy( result, result + problem_size, std::ostream_iterator<float>(std::cout, ", ") ); delete[] result; if( output_data ) cuMemFree ( output_data ); if( input_data ) cuMemFree ( input_data ); if( mod_vectorAdd ) cuModuleUnload ( mod_vectorAdd ); if( main_context ) cuCtxDestroy ( main_context ); } catch( int return_code ) { if( output_data ) cuMemFree ( output_data ); if( input_data ) cuMemFree ( input_data ); if( mod_vectorAdd ) cuModuleUnload ( mod_vectorAdd ); if( main_context ) cuCtxDestroy ( main_context ); system("PAUSE"); return return_code;
int main(int argc, char **argv) { //data CUdeviceptr d_data0 = 0; CUdeviceptr d_data1 = 0; DataStruct *h_data0 = 0; DataStruct *h_data1 = 0; DataStruct h_data_reference0; DataStruct h_data_reference1; unsigned int memSize = sizeof(DataStruct); //device references CUcontext hContext = 0; CUdevice hDevice = 0; CUmodule hModule = 0; CUstream hStream = 0; // Initialize the device and get a handle to the kernel CUresult status = initialize(0, &hContext, &hDevice, &hModule, &hStream); // Allocate memory on host and device if ((h_data0 = (DataStruct *)malloc(memSize)) == NULL) { std::cerr << "Could not allocate host memory" << std::endl; exit(-1); } status = cuMemAlloc(&d_data0, memSize); if ((h_data1 = (DataStruct *)malloc(memSize)) == NULL) { std::cerr << "Could not allocate host memory" << std::endl; exit(-1); } status = cuMemAlloc(&d_data1, memSize); if (status != CUDA_SUCCESS) printf("ERROR: during cuMemAlloc\n"); /////////////////////////////////////////////////////////////////////////////// //======================= test cases ========================================// /////////////////////////////////////////////////////////////////////////////// std::string name = ""; unsigned int testnum=0; unsigned int passed=0; //++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ /////////////////////// Ralf /////////////////////////////////////////////////// //++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ if(runRalfFunction("test_phi_scalar", test_phi_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize)) passed++; testnum++; if(runRalfFunction("test_phi2_scalar", test_phi2_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize)) passed++; testnum++; if(runRalfFunction("test_phi3_scalar", test_phi3_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize)) passed++; testnum++; if(runRalfFunction("test_phi4_scalar", test_phi4_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize)) 
passed++; testnum++; if(runRalfFunction("test_phi5_scalar", test_phi5_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize)) passed++; testnum++; if(runRalfFunction("test_phi6_scalar", test_phi6_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize)) passed++; testnum++; if(runRalfFunction("test_phi7_scalar", test_phi7_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize)) passed++; testnum++; if(runRalfFunction("test_phi8_scalar", test_phi8_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize)) passed++; testnum++; if(runRalfFunction("test_phi9_scalar", test_phi9_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize)) passed++; testnum++; if(runRalfFunction("test_loopbad_scalar", test_loopbad_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize)) passed++; testnum++; if(runRalfFunction("test_loop23_scalar", test_loop23_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize)) passed++; testnum++; if(runRalfFunction("test_loop13_scalar", test_loop13_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize)) passed++; testnum++; //////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_GetElementPointer_constant"; ///////////////////// setZero(h_data0,&h_data_reference0); std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_GetElementPointer_constant(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; /////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_calculate"; ///////////////////// setZero(h_data0,&h_data_reference0); h_data0->i = h_data_reference0.i = 3; h_data0->f = h_data_reference0.f = 3.2; std::cout << 
"=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_calculate(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; /////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_parquetShader"; ///////////////////// setZero(h_data0,&h_data_reference0); h_data0->f = h_data_reference0.f = 1; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_parquetShader(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; /////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_GetElementPointer_dyn"; ///////////////////// setZero(h_data0,&h_data_reference0); h_data0->i = h_data_reference0.i = 3; h_data0->u = h_data_reference0.u = 7; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_GetElementPointer_dyn(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; /////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_branch_simple"; // Branch 1 ///////////////// setZero(h_data0,&h_data_reference0); h_data0->f = h_data_reference0.f = -4; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, 
h_data0, memSize); //run device function test_branch_simple(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; /////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_branch_simple"; // Branch 2 ///////////////// setZero(h_data0,&h_data_reference0); h_data0->f = h_data_reference0.f = 8; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_branch_simple(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_branch_simplePHI"; // Branch 1 ///////////////// setZero(h_data0,&h_data_reference0); h_data0->f = h_data_reference0.f = -10; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_branch_simplePHI(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_branch_loop"; ////////////////////////////////// setZero(h_data0,&h_data_reference0); h_data0->i = h_data_reference0.i = 100; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_branch_loop(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) 
//compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_math"; ////////////////////////////////////////// setZero(h_data0,&h_data_reference0); h_data0->f = h_data_reference0.f = 1.4; h_data0->i = h_data_reference0.i = 3; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_math(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_signedOperands"; ////////////////////////////////////////// setZero(h_data0,&h_data_reference0); h_data0->i = h_data_reference0.i = 3; h_data0->f = h_data_reference0.f = -7; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_signedOperands(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_constantOperands"; ////////////////////////////////////////// setZero(h_data0,&h_data_reference0); h_data0->i = h_data_reference0.i = 3; h_data0->f = h_data_reference0.f = -1.44; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_constantOperands(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) 
//compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_branch_loop_semihard"; ///////////////////////// setZero(h_data0,&h_data_reference0); h_data0->i = h_data_reference0.i = 10; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_branch_loop_semihard(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_branch_loop_hard"; // Branch 1 ///////////////// setZero(h_data0,&h_data_reference0); h_data0->i = h_data_reference0.i = 1; h_data0->u = h_data_reference0.u = 3; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_branch_loop_hard(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*////////////*/ name = "test_branch_loop_hard"; // Branch 2 ///////////////// setZero(h_data0,&h_data_reference0); h_data0->i = h_data_reference0.i = 7; h_data0->u = h_data_reference0.u = 10; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_branch_loop_hard(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test 
passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_binaryInst"; ///////////////////////// setZero(h_data0,&h_data_reference0); h_data0->i = h_data_reference0.i = 5; h_data0->f = h_data_reference0.f = -121.23; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_binaryInst(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_selp"; ///////////////////////// setZero(h_data0,&h_data_reference0); h_data0->i = h_data_reference0.i = -15; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_selp(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_GetElementPointer_complicated"; ///////////////////////// setZero(h_data0,&h_data_reference0); h_data0->i = h_data_reference0.i = 1; h_data_reference0.s.s.f = h_data0->s.s.f = 3.11; h_data_reference0.s.sa[2].f = h_data0->s.sa[2].f = -4.32; h_data_reference0.s.sa[h_data0->i].f = h_data0->s.sa[h_data0->i].f = 111.3; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_GetElementPointer_complicated(&h_data_reference0); //run host reference 
if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_call"; ///////////////////////// setZero(h_data0,&h_data_reference0); h_data0->i = h_data_reference0.i = 10; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_call(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*/////////////*/ name = "test_alloca"; ///////////////////////// setZero(h_data0,&h_data_reference0); h_data0->i = h_data_reference0.i = 1; h_data0->f = h_data_reference0.f = -3.23; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_alloca(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_alloca_complicated"; ///////////////////////// setZero(h_data0,&h_data_reference0); h_data0->i = h_data_reference0.i = 1; h_data0->f = h_data_reference0.f = 23.213; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_alloca_complicated(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test 
passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_globalVariables"; ///////////////////////// setZero(h_data0,&h_data_reference0); std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_globalVariables(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_specialRegisters_x"; ///////////////////////// setZero(h_data0,&h_data_reference0); std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize, 2,3,4, 2,3); //run device function runHostTestFunction(test_specialRegisters_x, &h_data_reference0, 2,3,4, 2,3); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_specialRegisters_y"; ///////////////////////// setZero(h_data0,&h_data_reference0); std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize, 2,3,4, 2,3); //run device function runHostTestFunction(test_specialRegisters_x, &h_data_reference0, 2,3,4, 2,3); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = 
"test_dualArgument"; ///////////////////////// setZero(h_data0,&h_data_reference0); setZero(h_data1,&h_data_reference1); std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunDualTestFunction(&hModule, name, d_data0, d_data1, h_data0, h_data1, memSize); //run device function test_dualArgument(&h_data_reference0,&h_data_reference1); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} if(compareData(h_data1,&h_data_reference1)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_vector"; ///////////////////////// setZero(h_data0,&h_data_reference0); h_data0->fa[0] = h_data_reference0.fa[0] = 0.43f; h_data0->fa[1] = h_data_reference0.fa[1] = 0.234f; h_data0->fa[2] = h_data_reference0.fa[2] = 12893.f; h_data0->fa[3] = h_data_reference0.fa[3] = 13.33f; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_vector(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_reg2Const"; ///////////////////////// setZero(h_data0,&h_data_reference0); /* unsigned int bytes; //size of constant CUdeviceptr devptr_const=0; status = cuModuleGetGlobal(&devptr_const, &bytes, hModule, "__ptx_constant_data_global"); cuMemcpyHtoD(devptr_const, h_data0, memSize); */ std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device 
function test_reg2Const(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_constantMemory"; ///////////////////////// setZero(h_data0,&h_data_reference0); h_data0->fa[0] = __ptx_constant_data_global.fa[0] = 0.2348f; unsigned int bytes; //size of constant CUdeviceptr devptr_const=0; status = cuModuleGetGlobal(&devptr_const, &bytes, hModule, "__ptx_constant_data_global"); cuMemcpyHtoD(devptr_const, h_data0, memSize); setZero(h_data0,&h_data_reference0); std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_constantMemory(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_sharedMemory"; ///////////////////////// setZero(h_data0,&h_data_reference0); for(int i = 0; i < ARRAY_N/2; i++) h_data0->fa[i*2] = i; for(int i = 0; i < ARRAY_N/2; i++) h_data0->fa[i*2+1] = -i; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize, 32,1,1, 1,1); //run device function for(int i = 0; i < ARRAY_N/2; i++) h_data_reference0.fa[i] = i; for(int i = 0; i < ARRAY_N/2; i++) h_data_reference0.fa[i+32] = -i; // runHostTestFunction(test_sharedMemory, &h_data_reference0, 16,1,1, 1,1); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; 
////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_lightShader"; ///////////////////////// setZero(h_data0,&h_data_reference0); /* unsigned int bytes; //size of constant CUdeviceptr devptr_const=0; status = cuModuleGetGlobal(&devptr_const, &bytes, hModule, "__ptx_constant_data_global"); cuMemcpyHtoD(devptr_const, h_data0, memSize); */ std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function /* test_lightShader(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; */ /////////////////////////////////////////////////////////////////////////////// //======================= test cases END ====================================// /////////////////////////////////////////////////////////////////////////////// // Check the result std::cout << "\nPASSED " << passed << " tests" << std::endl; std::cout << "FAILED " << (testnum-passed) << " tests" << std::endl; // Cleanup if (d_data0) { cuMemFree(d_data0); d_data0 = 0; } if (d_data1) { cuMemFree(d_data1); d_data1 = 0; } if (h_data0) { free(h_data0); h_data0 = 0; } if (h_data1) { free(h_data1); h_data1 = 0; } if (hModule) { cuModuleUnload(hModule); hModule = 0; } if (hStream) { cuStreamDestroy(hStream); hStream = 0; } if (hContext) { cuCtxDestroy(hContext); hContext = 0; } return 0; }