// Uploads the current work unit, runs one round of the hashing kernel over
// m_numb blocks of m_numt threads, and scans the result buffer for a winning
// nonce.  Returns the byte-reversed nonce on a hit, 0 when this step found
// nothing.  NOTE(review): none of the driver-API calls below are checked for
// errors, so a failed launch is indistinguishable from "no hash found".
const unsigned long CUDARunner::RunStep()
{
	//unsigned int best=0;
	//unsigned int bestg=~0;
	int offset=0;

	// Lazily (re)allocate host and device buffers on first use.
	if(m_in==0 || m_out==0 || m_devin==0 || m_devout==0)
	{
		AllocateResources(m_numb,m_numt);
	}

	// Clear the result slot so a stale nonce from a previous step cannot be
	// mistaken for a fresh hit; only the first cuda_out element is uploaded.
	m_out[0].m_bestnonce=0;
	cuMemcpyHtoD(m_devout,m_out,/*m_numb*m_numt*/sizeof(cuda_out));

	cuMemcpyHtoD(m_devin,m_in,sizeof(cuda_in));

	int loops=GetStepIterations();
	int bits=GetStepBitShift()-1;

	// Marshal kernel parameters with the pre-CUDA-4.0 driver API:
	// two device pointers followed by two ints, each placed at its natural
	// alignment via ALIGN_UP.
	void *ptr=(void *)(size_t)m_devin;
	ALIGN_UP(offset, __alignof(ptr));
	cuParamSetv(m_function,offset,&ptr,sizeof(ptr));
	offset+=sizeof(ptr);

	ptr=(void *)(size_t)m_devout;
	ALIGN_UP(offset, __alignof(ptr));
	cuParamSetv(m_function,offset,&ptr,sizeof(ptr));
	offset+=sizeof(ptr);

	ALIGN_UP(offset, __alignof(loops));
	cuParamSeti(m_function,offset,loops);
	offset+=sizeof(loops);

	ALIGN_UP(offset, __alignof(bits));
	cuParamSeti(m_function,offset,bits);
	offset+=sizeof(bits);

	cuParamSetSize(m_function,offset);

	cuFuncSetBlockShape(m_function,m_numt,1,1);
	cuLaunchGrid(m_function,m_numb,1);

	// The synchronous copy-back doubles as the wait for kernel completion;
	// only the first result element is read back (see loop comment below).
	cuMemcpyDtoH(m_out,m_devout,/*m_numb*m_numt*/sizeof(cuda_out));

	// very unlikely that we will find more than 1 hash with H=0
	// so we'll just return the first one and not even worry about G
	for(int i=0; i<1/*m_numb*m_numt*/; i++)
	{
		if(m_out[i].m_bestnonce!=0)// && m_out[i].m_bestg<bestg)
		{
			return CryptoPP::ByteReverse(m_out[i].m_bestnonce);
			//best=m_out[i].m_bestnonce;
			//bestg=m_out[i].m_bestg;
		}
	}

	return 0;
}
//host driver void hostDriver(CUfunction drvfun, dim3 grid, dim3 threads, int imgSize, int numRegionsY, int shmemX, int shmem, int nrhs, hostdrv_pars_t *prhs) { //mexPrintf("threads.x: %d threads.y: %d threads.z %d\n",threads.x,threads.y,threads.z); CUresult err = CUDA_SUCCESS; // setup execution parameters if (CUDA_SUCCESS != (err = cuFuncSetBlockShape(drvfun, threads.x, threads.y, threads.z))) { mexErrMsgTxt("Error in cuFuncSetBlockShape"); } if (CUDA_SUCCESS != cuFuncSetSharedSize(drvfun, shmem)) { mexErrMsgTxt("Error in cuFuncSetSharedSize"); } //mexPrintf("block shape ok\n"); // add parameters int poffset = 0; // CUDA kernels interface // N: number of elements for (int p=0;p<nrhs;p++) { if (CUDA_SUCCESS != cuParamSetv(drvfun, poffset, prhs[p].par, prhs[p].psize)) { mexErrMsgTxt("Error in cuParamSetv"); } poffset += prhs[p].psize; } if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, imgSize)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(imgSize); if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, numRegionsY)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(numRegionsY); if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, shmemX)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(shmemX); if (CUDA_SUCCESS != cuParamSetSize(drvfun, poffset)) { mexErrMsgTxt("Error in cuParamSetSize"); } err = cuLaunchGridAsync(drvfun, grid.x, grid.y, 0); if (CUDA_SUCCESS != err) { mexErrMsgTxt("Error running kernel"); } }
/* Driver */
/* Launches drvfun over a 1-D grid sized so that each BLOCK_DIM1D-thread
 * block covers one linear slice of the n-element target, with m floats of
 * dynamic shared memory per block.  The nrhs pointer arguments in prhs are
 * packed first, followed by the scalar ints n and m.  Driver-API failures
 * abort the MEX call via mexErrMsgTxt. */
void hostGPUPdist(CUfunction drvfun, int nrhs, hostdrv_pars_t *prhs, int n, int m) {

  /* Each thread block computes a linear block of the target */
  int gridx = (n + BLOCK_DIM1D - 1) / BLOCK_DIM1D; //BLOCK_DIM1D set in GPUkernel.hh

  CUresult err = CUDA_SUCCESS;

  /* execution configuration */
  err = cuFuncSetBlockShape(drvfun, BLOCK_DIM1D, 1, 1);
  if (err != CUDA_SUCCESS) {
    mexErrMsgTxt("Error in cuFuncSetBlockShape");
  }
  if (cuFuncSetSharedSize(drvfun, m*sizeof(float)) != CUDA_SUCCESS) {
    mexErrMsgTxt("Error in cuFuncSetSharedSize");
  }

  /* pointer arguments first; each advances the offset by its own size */
  int poffset = 0;
  for (int p = 0; p < nrhs; ++p) {
    if (cuParamSetv(drvfun, poffset, prhs[p].par, prhs[p].psize) != CUDA_SUCCESS) {
      mexErrMsgTxt("Error in cuParamSetv");
    }
    poffset += prhs[p].psize;
  }

  /* then the two scalar ints, in kernel-signature order */
  int scalars[2];
  scalars[0] = n;
  scalars[1] = m;
  for (int s = 0; s < 2; ++s) {
    if (cuParamSeti(drvfun, poffset, scalars[s]) != CUDA_SUCCESS) {
      mexErrMsgTxt("Error in cuParamSeti");
    }
    poffset += sizeof(int);
  }

  if (cuParamSetSize(drvfun, poffset) != CUDA_SUCCESS) {
    mexErrMsgTxt("Error in cuParamSetSize");
  }

  /* asynchronous launch on the default (0) stream */
  err = cuLaunchGridAsync(drvfun, gridx, 1, 0);
  if (err != CUDA_SUCCESS) {
    mexErrMsgTxt("Error running kernel");
  }
}
// Writes one 32-bit integer kernel parameter at the given byte offset.
// A null kernel handle is silently ignored.  Always returns the byte size
// of the parameter so callers can advance their running offset.
int CudaModule::setParami(CUfunction kernel, int offset, S32 value)
{
    if (!kernel)
        return sizeof(S32);

    checkError("cuParamSeti", cuParamSeti(kernel, offset, value));
    return sizeof(S32);
}
/*
 * Class:     edu_syr_pcpratts_rootbeer_runtime2_cuda_CudaRuntime2
 * Method:    loadFunction
 * Signature: ()V
 *
 * Loads the generated Rootbeer entry kernel from the cubin file named by
 * 'filename' and pre-binds its parameter list: seven device pointers held
 * in file-scope globals, followed by the int num_blocks.  CHECK_STATUS is
 * expected to raise a Java exception on any driver-API failure.
 */
JNIEXPORT void JNICALL Java_edu_syr_pcpratts_rootbeer_runtime2_cuda_CudaRuntime2_loadFunction
  (JNIEnv *env, jobject this_obj, jlong heap_end_ptr, jstring filename, jint num_blocks){

  void * cubin_file;      /* NOTE(review): unused in this overload */
  int offset;
  CUresult status;
  char * native_filename;
  heapEndPtr = heap_end_ptr;

  /* load the module from the cubin file on disk */
  native_filename = (*env)->GetStringUTFChars(env, filename, 0);
  status = cuModuleLoad(&cuModule, native_filename);
  CHECK_STATUS(env, "error in cuModuleLoad", status);
  (*env)->ReleaseStringUTFChars(env, filename, native_filename);

  /* mangled name of the generated entry kernel */
  status = cuModuleGetFunction(&cuFunction, cuModule, "_Z5entryPcS_PiPxS1_S0_S0_i");
  CHECK_STATUS(env,"error in cuModuleGetFunction",status)

  status = cuFuncSetCacheConfig(cuFunction, CU_FUNC_CACHE_PREFER_L1);
  CHECK_STATUS(env,"error in cuFuncSetCacheConfig",status)

  /* total parameter block: 7 device pointers + 1 int */
  status = cuParamSetSize(cuFunction, (7 * sizeof(CUdeviceptr) + sizeof(int)));
  CHECK_STATUS(env,"error in cuParamSetSize",status)

  /* pack the seven device pointers back-to-back, then the int tail */
  offset = 0;
  status = cuParamSetv(cuFunction, offset, (void *) &gcInfoSpace,
    sizeof(CUdeviceptr));
  CHECK_STATUS(env,"error in cuParamSetv gcInfoSpace",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(cuFunction, offset, (void *) &gpuToSpace,
    sizeof(CUdeviceptr));
  CHECK_STATUS(env,"error in cuParamSetv gpuToSpace",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(cuFunction, offset, (void *) &gpuHandlesMemory,
    sizeof(CUdeviceptr));
  CHECK_STATUS(env,"error in cuParamSetv gpuHandlesMemory %",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(cuFunction, offset, (void *) &gpuHeapEndPtr,
    sizeof(CUdeviceptr));
  CHECK_STATUS(env,"error in cuParamSetv gpuHeapEndPtr",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(cuFunction, offset, (void *) &gpuBufferSize,
    sizeof(CUdeviceptr));
  CHECK_STATUS(env,"error in cuParamSetv gpuBufferSize",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(cuFunction, offset, (void *) &gpuExceptionsMemory,
    sizeof(CUdeviceptr));
  CHECK_STATUS(env,"error in cuParamSetv gpuExceptionsMemory",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(cuFunction, offset, (void *) &gpuClassMemory,
    sizeof(CUdeviceptr));
  CHECK_STATUS(env,"error in cuParamSetv gpuClassMemory",status)
  offset += sizeof(CUdeviceptr);

  /* scalar tail parameter (NOTE(review): the error message says
     cuParamSetv but the call is cuParamSeti) */
  status = cuParamSeti(cuFunction, offset, num_blocks);
  CHECK_STATUS(env,"error in cuParamSetv num_blocks",status)
  offset += sizeof(int);
}
/*
 * Class:     edu_syr_pcpratts_rootbeer_runtime2_cuda_CudaRuntime2
 * Method:    loadFunction
 * Signature: ()V
 *
 * Variant that loads the Rootbeer entry kernel from an in-memory cubin
 * assembled from Java-side buffers, then pre-binds its parameter list:
 * six device pointers held in file-scope globals, followed by the int
 * num_blocks.  CHECK_STATUS is expected to raise a Java exception on any
 * driver-API failure.
 */
JNIEXPORT void JNICALL Java_edu_syr_pcpratts_rootbeer_runtime2_cuda_CudaRuntime2_loadFunction
  (JNIEnv *env, jobject this_obj, jlong heap_end_ptr, jobject buffers,
   jint size, jint total_size, jint num_blocks){

  void * cubin_file;
  int offset;
  CUresult status;
  heapEndPtr = heap_end_ptr;

  /* assemble the cubin image in host memory, load it, then release it */
  //void * cubin_file = readCubinFile("code_file.cubin");
  cubin_file = readCubinFileFromBuffers(env, buffers, size, total_size);
  status = cuModuleLoadData(&cuModule, cubin_file);
  CHECK_STATUS(env,"error in cuModuleLoad",status)
  free(cubin_file);

  /* mangled name of the generated entry kernel (one parameter fewer than
     the file-based overload above) */
  status = cuModuleGetFunction(&cuFunction, cuModule, "_Z5entryPcS_PiPxS1_S0_i");
  CHECK_STATUS(env,"error in cuModuleGetFunction",status)

  status = cuFuncSetCacheConfig(cuFunction, CU_FUNC_CACHE_PREFER_L1);
  CHECK_STATUS(env,"error in cuFuncSetCacheConfig",status)

  /* total parameter block: 6 device pointers + 1 int */
  status = cuParamSetSize(cuFunction, (6 * sizeof(CUdeviceptr) + sizeof(int)));
  CHECK_STATUS(env,"error in cuParamSetSize",status)

  /* pack the six device pointers back-to-back, then the int tail */
  offset = 0;
  status = cuParamSetv(cuFunction, offset, (void *) &gcInfoSpace,
    sizeof(CUdeviceptr));
  CHECK_STATUS(env,"error in cuParamSetv gcInfoSpace",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(cuFunction, offset, (void *) &gpuToSpace,
    sizeof(CUdeviceptr));
  CHECK_STATUS(env,"error in cuParamSetv gpuToSpace",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(cuFunction, offset, (void *) &gpuHandlesMemory,
    sizeof(CUdeviceptr));
  CHECK_STATUS(env,"error in cuParamSetv gpuHandlesMemory %",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(cuFunction, offset, (void *) &gpuHeapEndPtr,
    sizeof(CUdeviceptr));
  CHECK_STATUS(env,"error in cuParamSetv gpuHeapEndPtr",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(cuFunction, offset, (void *) &gpuBufferSize,
    sizeof(CUdeviceptr));
  CHECK_STATUS(env,"error in cuParamSetv gpuBufferSize",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(cuFunction, offset, (void *)
    &gpuExceptionsMemory, sizeof(CUdeviceptr));
  CHECK_STATUS(env,"error in cuParamSetv gpuExceptionsMemory",status)
  offset += sizeof(CUdeviceptr);

  /* scalar tail parameter (NOTE(review): the error message says
     cuParamSetv but the call is cuParamSeti) */
  status = cuParamSeti(cuFunction, offset, num_blocks);
  CHECK_STATUS(env,"error in cuParamSetv num_blocks",status)
  offset += sizeof(int);
}
void GPUInterface::LaunchKernel(GPUFunction deviceFunction, Dim3Int block, Dim3Int grid, int parameterCountV, int totalParameterCount, ...) { // parameters #ifdef BEAGLE_DEBUG_FLOW fprintf(stderr,"\t\t\tEntering GPUInterface::LaunchKernel\n"); #endif SAFE_CUDA(cuCtxPushCurrent(cudaContext)); SAFE_CUDA(cuFuncSetBlockShape(deviceFunction, block.x, block.y, block.z)); int offset = 0; va_list parameters; va_start(parameters, totalParameterCount); for(int i = 0; i < parameterCountV; i++) { void* param = (void*)(size_t)va_arg(parameters, GPUPtr); // adjust offset alignment requirements offset = (offset + __alignof(param) - 1) & ~(__alignof(param) - 1); SAFE_CUDA(cuParamSetv(deviceFunction, offset, ¶m, sizeof(param))); offset += sizeof(void*); } for(int i = parameterCountV; i < totalParameterCount; i++) { unsigned int param = va_arg(parameters, unsigned int); // adjust offset alignment requirements offset = (offset + __alignof(param) - 1) & ~(__alignof(param) - 1); SAFE_CUDA(cuParamSeti(deviceFunction, offset, param)); offset += sizeof(param); } va_end(parameters); SAFE_CUDA(cuParamSetSize(deviceFunction, offset)); SAFE_CUDA(cuLaunchGrid(deviceFunction, grid.x, grid.y)); SAFE_CUDA(cuCtxPopCurrent(&cudaContext)); #ifdef BEAGLE_DEBUG_FLOW fprintf(stderr,"\t\t\tLeaving GPUInterface::LaunchKernel\n"); #endif }
int main(int argc, char ** argv) { int dev_count = 0; CUdevice device; CUcontext context; CUmodule module; CUfunction function; cuInit(0); cuDeviceGetCount(&dev_count); if (dev_count < 1) return -1; cuDeviceGet( &device, 0 ); cuCtxCreate( &context, 0, device ); cuModuleLoad( &module, "hello.cuda_runtime.ptx" ); cuModuleGetFunction( &function, module, "_Z6kernelPf" ); int N = 512; CUdeviceptr pData; cuMemAlloc( &pData, N * sizeof(float) ); cuFuncSetBlockShape( function, N, 1, 1 ); cuParamSeti( function, 0, pData ); cuParamSetSize( function, 4 ); cuLaunchGrid( function, 1, 1 ); float * pHostData = new float[N]; cuMemcpyDtoH( pHostData, pData, N * sizeof( float) ); cuMemFree( pData ); delete [] pHostData; return 0; }
// Launches the NV12 -> ARGB conversion kernel fpFunc on streamID.
// Grid x is halved because each thread writes two output pixels.
// Builds the argument list with cuLaunchKernel on CUDA >= 4.0, or with the
// legacy cuParamSetv/cuParamSeti + cuLaunchGridAsync path otherwise, and
// returns the launch status.
CUresult cudaLaunchNV12toARGBDrv(CUdeviceptr d_srcNV12, size_t nSourcePitch,
                                 CUdeviceptr d_dstARGB, size_t nDestPitch,
                                 uint32 width, uint32 height,
                                 CUfunction fpFunc, CUstream streamID)
{
    CUresult status;
    // Each thread will output 2 pixels at a time. The grid size width is half
    // as large because of this
    dim3 block(32,16,1);
    dim3 grid((width+(2*block.x-1))/(2*block.x), (height+(block.y-1))/block.y, 1);
#if CUDA_VERSION >= 4000
    // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel Launching (simpler method)
    void *args[] = { &d_srcNV12, &nSourcePitch, &d_dstARGB, &nDestPitch, &width, &height };

    // new CUDA 4.0 Driver API Kernel launch call
    status = cuLaunchKernel(fpFunc, grid.x, grid.y, grid.z,
                            block.x, block.y, block.z,
                            0, streamID,
                            args, NULL);
#else
    // This is the older Driver API launch method from CUDA (V1.0 to V3.2)
    cutilDrvSafeCall(cuFuncSetBlockShape(fpFunc, block.x, block.y, 1));
    int offset = 0;

    // This method calls cuParamSetv() to pass device pointers also allows the ability to pass 64-bit device pointers

    // device pointer for Source Surface
    cutilDrvSafeCall(cuParamSetv(fpFunc, offset, &d_srcNV12, sizeof(d_srcNV12)));
    offset += sizeof(d_srcNV12);

    // set the Source pitch
    cutilDrvSafeCall(cuParamSetv(fpFunc, offset, &nSourcePitch, sizeof(nSourcePitch)));
    offset += sizeof(nSourcePitch);

    // device pointer for Destination Surface
    cutilDrvSafeCall(cuParamSetv(fpFunc, offset, &d_dstARGB, sizeof(d_dstARGB)));
    offset += sizeof(d_dstARGB);

    // set the Destination Pitch
    cutilDrvSafeCall(cuParamSetv(fpFunc, offset, &nDestPitch, sizeof(nDestPitch)));
    offset += sizeof(nDestPitch);

    // set the width of the image
    ALIGN_OFFSET(offset, __alignof(width));
    cutilDrvSafeCall(cuParamSeti(fpFunc, offset, width));
    offset += sizeof(width);

    // set the height of the image
    ALIGN_OFFSET(offset, __alignof(height));
    cutilDrvSafeCall(cuParamSeti(fpFunc, offset, height));
    offset += sizeof(height);

    cutilDrvSafeCall(cuParamSetSize(fpFunc, offset));

    // Launching the kernel, we need to pass in the grid dimensions
    status = cuLaunchGridAsync(fpFunc, grid.x, grid.y, streamID);
#endif

    if (CUDA_SUCCESS != status)
    {
        // NOTE(review): casting a CUfunction handle to unsigned int for %08x
        // truncates it on 64-bit builds — cosmetic only, the status is what
        // matters here.
        fprintf(stderr, "cudaLaunchNV12toARGBDrv() failed to launch Kernel Function %08x, retval = %d\n", (unsigned int)fpFunc, status);
        return status;
    }

    return status;
}
/*
 * Matrix-add micro-benchmark using hybrid vmmap buffers: device memory is
 * allocated with cuMemAlloc and then mapped into the host address space
 * with cuMemMap, so the host writes a[]/b[] and reads c[] directly through
 * the mapping instead of via explicit cuMemcpy (the H2D timestamps are
 * taken back-to-back for exactly that reason).  Launches the
 * "_Z3addPjS_S_j" kernel — add(unsigned int*, unsigned int*,
 * unsigned int*, unsigned int) — over an n x n problem and prints a
 * per-phase timing breakdown.  Returns 0 on success, -1 on any failure.
 *
 * NOTE(review): cuMemMap/cuMemGetPhysAddr with these signatures look like
 * Gdev-style extensions rather than the stock CUDA driver API — confirm.
 * NOTE(review): the malloc'd host buffer c is never freed.
 */
int cuda_test_madd_vmmap_hybrid(unsigned int n, char *path)
{
	int i, j, idx;
	CUresult res;
	CUdevice dev;
	CUcontext ctx;
	CUfunction function;
	CUmodule module;
	CUdeviceptr a_dev, b_dev, c_dev;
	unsigned int *a_buf, *b_buf, *c_buf;       /* host-mapped views of the device buffers */
	unsigned long long int a_phys, b_phys, c_phys;
	unsigned int *c = (unsigned int *) malloc (n*n * sizeof(unsigned int));
	int block_x, block_y, grid_x, grid_y;
	char fname[256];
	int ret = 0;
	struct timeval tv;
	struct timeval tv_total_start, tv_total_end;
	float total;
	struct timeval tv_h2d_start, tv_h2d_end;
	float h2d;
	struct timeval tv_d2h_start, tv_d2h_end;
	float d2h;
	struct timeval tv_exec_start, tv_exec_end;
	struct timeval tv_mem_alloc_start;
	struct timeval tv_data_init_start;
	float data_init;
	struct timeval tv_conf_kern_start;
	struct timeval tv_close_start;
	float mem_alloc;
	float exec;
	float init_gpu;
	float configure_kernel;
	float close_gpu;
	float data_read;
	unsigned int dummy_b, dummy_c;             /* sinks for the read-back loop; dummy_b is unused */

	/* block_x * block_y should not exceed 512. */
	block_x = n < 16 ? n : 16;
	block_y = n < 16 ? n : 16;
	grid_x = n / block_x;
	if (n % block_x != 0)
		grid_x++;
	grid_y = n / block_y;
	if (n % block_y != 0)
		grid_y++;

	gettimeofday(&tv_total_start, NULL);

	/* driver, device, context, module, kernel */
	res = cuInit(0);
	if (res != CUDA_SUCCESS) {
		printf("cuInit failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	res = cuDeviceGet(&dev, 0);
	if (res != CUDA_SUCCESS) {
		printf("cuDeviceGet failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	res = cuCtxCreate(&ctx, 0, dev);
	if (res != CUDA_SUCCESS) {
		printf("cuCtxCreate failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	sprintf(fname, "%s/madd_gpu.cubin", path);
	res = cuModuleLoad(&module, fname);
	if (res != CUDA_SUCCESS) {
		printf("cuModuleLoad() failed\n");
		return -1;
	}
	res = cuModuleGetFunction(&function, module, "_Z3addPjS_S_j");
	if (res != CUDA_SUCCESS) {
		printf("cuModuleGetFunction() failed\n");
		return -1;
	}
	res = cuFuncSetBlockShape(function, block_x, block_y, 1);
	if (res != CUDA_SUCCESS) {
		printf("cuFuncSetBlockShape() failed\n");
		return -1;
	}

	gettimeofday(&tv_mem_alloc_start, NULL);

	/* a[]: allocate on device, map into host space, query physical address */
	res = cuMemAlloc(&a_dev, n*n * sizeof(unsigned int));
	if (res != CUDA_SUCCESS) {
		printf("cuMemAlloc (a) failed\n");
		return -1;
	}
	res = cuMemMap((void**)&a_buf, a_dev, n*n * sizeof(unsigned int));
	if (res != CUDA_SUCCESS) {
		printf("cuMemMap (a) failed\n");
		return -1;
	}
	res = cuMemGetPhysAddr(&a_phys, (void*)a_buf);
	if (res != CUDA_SUCCESS) {
		printf("cuMemGetPhysAddress (a) failed\n");
		return -1;
	}
	/*printf("a[]: Physical Address 0x%llx\n", a_phys);*/

	/* b[] */
	res = cuMemAlloc(&b_dev, n*n * sizeof(unsigned int));
	if (res != CUDA_SUCCESS) {
		printf("cuMemAlloc (b) failed\n");
		return -1;
	}
	res = cuMemMap((void**)&b_buf, b_dev, n*n * sizeof(unsigned int));
	if (res != CUDA_SUCCESS) {
		printf("cuMemMap (b) failed\n");
		return -1;
	}
	res = cuMemGetPhysAddr(&b_phys, (void*)b_buf);
	if (res != CUDA_SUCCESS) {
		printf("cuMemGetPhysAddress (b) failed\n");
		return -1;
	}
	/*printf("b[]: Physical Address 0x%llx\n", b_phys);*/

	/* c[] */
	res = cuMemAlloc(&c_dev, n*n * sizeof(unsigned int));
	if (res != CUDA_SUCCESS) {
		printf("cuMemAlloc (c) failed\n");
		return -1;
	}
	res = cuMemMap((void**)&c_buf, c_dev, n*n * sizeof(unsigned int));
	if (res != CUDA_SUCCESS) {
		printf("cuMemMap (c) failed\n");
		return -1;
	}
	res = cuMemGetPhysAddr(&c_phys, (void*)c_buf);
	if (res != CUDA_SUCCESS) {
		printf("cuMemGetPhysAddress (c) failed\n");
		return -1;
	}
	/*printf("c[]: Physical Address 0x%llx\n", c_phys);*/

	gettimeofday(&tv_data_init_start, NULL);

	/* initialize A[] & B[] directly through the host mapping */
	for (i = 0; i < n; i++) {
		idx = i*n;
		for(j = 0; j < n; j++) {
			a_buf[idx++] = i;
		}
	}

	for (i = 0; i < n; i++) {
		idx = i*n;
		for(j = 0; j < n; j++) {
			b_buf[idx++] = i;
		}
	}

	/* no explicit H2D copy happens — the buffers are mapped — so the H2D
	   phase is timed as (effectively) zero */
	gettimeofday(&tv_h2d_start, NULL);
	gettimeofday(&tv_h2d_end, NULL);

	gettimeofday(&tv_conf_kern_start, NULL);

	/* set kernel parameters: each 64-bit device pointer is passed as two
	   32-bit halves at fixed offsets (lo, hi), then n at offset 24, for a
	   28-byte parameter block */
	res = cuParamSeti(function, 0, a_dev);
	if (res != CUDA_SUCCESS) {
		printf("cuParamSeti (a) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	res = cuParamSeti(function, 4, a_dev >> 32);
	if (res != CUDA_SUCCESS) {
		printf("cuParamSeti (a) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	res = cuParamSeti(function, 8, b_dev);
	if (res != CUDA_SUCCESS) {
		printf("cuParamSeti (b) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	res = cuParamSeti(function, 12, b_dev >> 32);
	if (res != CUDA_SUCCESS) {
		printf("cuParamSeti (b) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	res = cuParamSeti(function, 16, c_dev);
	if (res != CUDA_SUCCESS) {
		printf("cuParamSeti (c) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	res = cuParamSeti(function, 20, c_dev >> 32);
	if (res != CUDA_SUCCESS) {
		printf("cuParamSeti (c) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	/* NOTE(review): this sets n but the message is mislabeled "(c)" */
	res = cuParamSeti(function, 24, n);
	if (res != CUDA_SUCCESS) {
		printf("cuParamSeti (c) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	res = cuParamSetSize(function, 28);
	if (res != CUDA_SUCCESS) {
		printf("cuParamSetSize failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	gettimeofday(&tv_exec_start, NULL);
	/* launch the kernel */
	res = cuLaunchGrid(function, grid_x, grid_y);
	if (res != CUDA_SUCCESS) {
		printf("cuLaunchGrid failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	cuCtxSynchronize();
	gettimeofday(&tv_exec_end, NULL);

	gettimeofday(&tv_d2h_start, NULL);
	/* download c[] — a plain memcpy through the host mapping */
	memcpy(c, c_buf, n*n*sizeof(unsigned int));
	gettimeofday(&tv_d2h_end, NULL);

	/* Read back (touch every element so the read cannot be elided) */
	for (i = 0; i < n; i++) {
		idx = i*n;
		for(j = 0; j < n; j++) {
			dummy_c = c[idx++];
		}
	}

	gettimeofday(&tv_close_start, NULL);

	/* unmap and free all three buffers, then tear down module/context */
	res = cuMemUnmap((void*)a_buf);
	if (res != CUDA_SUCCESS) {
		printf("cuMemUnmap (a) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	res = cuMemFree(a_dev);
	if (res != CUDA_SUCCESS) {
		printf("cuMemFree (a) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	res = cuMemUnmap((void*)b_buf);
	if (res != CUDA_SUCCESS) {
		printf("cuMemUnmap (b) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	res = cuMemFree(b_dev);
	if (res != CUDA_SUCCESS) {
		printf("cuMemFree (b) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	res = cuMemUnmap((void*)c_buf);
	if (res != CUDA_SUCCESS) {
		printf("cuMemUnmap (c) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	res = cuMemFree(c_dev);
	if (res != CUDA_SUCCESS) {
		printf("cuMemFree (c) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	res = cuModuleUnload(module);
	if (res != CUDA_SUCCESS) {
		printf("cuModuleUnload failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	res = cuCtxDestroy(ctx);
	if (res != CUDA_SUCCESS) {
		printf("cuCtxDestroy failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	gettimeofday(&tv_total_end, NULL);

	/* convert each pair of adjacent timestamps into a phase duration (ms) */
	tvsub(&tv_mem_alloc_start, &tv_total_start, &tv);
	init_gpu = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
	tvsub(&tv_data_init_start, &tv_mem_alloc_start, &tv);
	mem_alloc = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
	tvsub(&tv_h2d_start, &tv_data_init_start, &tv);
	data_init = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
	tvsub(&tv_h2d_end, &tv_h2d_start, &tv);
	h2d = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
	tvsub(&tv_exec_start, &tv_conf_kern_start, &tv);
	configure_kernel = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
	tvsub(&tv_exec_end, &tv_exec_start, &tv);
	exec = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
	tvsub(&tv_d2h_end, &tv_d2h_start, &tv);
	d2h = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
	tvsub(&tv_close_start, &tv_d2h_end, &tv);
	data_read = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
	tvsub(&tv_total_end, &tv_close_start, &tv);
	close_gpu = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
	tvsub(&tv_total_end, &tv_total_start, &tv);
	total = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;

	printf("Init: %f\n", init_gpu);
	printf("MemAlloc: %f\n", mem_alloc);
	printf("DataInit: %f\n", data_init);
	printf("HtoD: %f\n", h2d);
	printf("KernConf: %f\n", configure_kernel);
	printf("Exec: %f\n", exec);
	printf("DtoH: %f\n", d2h);
	printf("DataRead: %f\n", data_read);
	printf("Close: %f\n", close_gpu);
	printf("Total: %f\n", total);

	return ret;
}
/*
 * Runs one Rootbeer kernel invocation end-to-end on the given device:
 * creates a context, loads the fatbin passed from Java, mirrors the four
 * FixedMemory regions (object/handles/exceptions/class) into device
 * buffers, binds the 7-pointer + int parameter list, launches, waits, and
 * copies the object/exception regions back before tearing everything down.
 * CHECK_STATUS is expected to raise a Java exception on any driver failure.
 */
JNIEXPORT void JNICALL Java_org_trifort_rootbeer_runtime_CUDAContext_cudaRun
  (JNIEnv *env, jobject this_ref, jint device_index, jbyteArray cubin_file,
   jint cubin_length, jint block_shape_x, jint grid_shape_x, jint num_threads,
   jobject object_mem, jobject handles_mem, jobject exceptions_mem,
   jobject class_mem)
{
  CUresult status;
  CUdevice device;
  CUcontext context;
  CUmodule module;
  CUfunction function;
  void * fatcubin;
  int offset;
  int info_space_size;

  CUdeviceptr gpu_info_space;
  CUdeviceptr gpu_object_mem;
  CUdeviceptr gpu_handles_mem;
  CUdeviceptr gpu_exceptions_mem;
  CUdeviceptr gpu_class_mem;
  CUdeviceptr gpu_heap_end;
  CUdeviceptr gpu_buffer_size;

  void * cpu_object_mem;
  void * cpu_handles_mem;
  void * cpu_exceptions_mem;
  void * cpu_class_mem;
  jlong cpu_object_mem_size;
  jlong cpu_handles_mem_size;
  jlong cpu_exceptions_mem_size;
  jlong cpu_class_mem_size;
  jlong cpu_heap_end;

  jclass cuda_memory_class;
  jmethodID get_address_method;
  jmethodID get_size_method;
  jmethodID get_heap_end_method;
  jlong * info_space;

  //----------------------------------------------------------------------------
  //init device and function
  //----------------------------------------------------------------------------
  status = cuDeviceGet(&device, device_index);
  CHECK_STATUS(env, "Error in cuDeviceGet", status, device)

  status = cuCtxCreate(&context, CU_CTX_MAP_HOST, device);
  CHECK_STATUS(env,"Error in cuCtxCreate", status, device)

  /* copy the fatbin bytes out of the Java array, load, then free the copy */
  fatcubin = malloc(cubin_length);
  (*env)->GetByteArrayRegion(env, cubin_file, 0, cubin_length, fatcubin);

  status = cuModuleLoadFatBinary(&module, fatcubin);
  CHECK_STATUS(env, "Error in cuModuleLoad", status, device)
  free(fatcubin);

  /* mangled name of the generated entry kernel */
  status = cuModuleGetFunction(&function, module, "_Z5entryPcS_PiPxS1_S0_S0_i");
  CHECK_STATUS(env, "Error in cuModuleGetFunction", status, device)

  //----------------------------------------------------------------------------
  //get handles from java
  //----------------------------------------------------------------------------
  /* pull base address / size / heap-end out of each FixedMemory object */
  cuda_memory_class = (*env)->FindClass(env, "org/trifort/rootbeer/runtime/FixedMemory");
  get_address_method = (*env)->GetMethodID(env, cuda_memory_class, "getAddress", "()J");
  get_size_method = (*env)->GetMethodID(env, cuda_memory_class, "getSize", "()J");
  get_heap_end_method = (*env)->GetMethodID(env, cuda_memory_class, "getHeapEndPtr", "()J");

  cpu_object_mem = (void *) (*env)->CallLongMethod(env, object_mem, get_address_method);
  cpu_object_mem_size = (*env)->CallLongMethod(env, object_mem, get_size_method);
  cpu_heap_end = (*env)->CallLongMethod(env, object_mem, get_heap_end_method);

  cpu_handles_mem = (void *) (*env)->CallLongMethod(env, handles_mem, get_address_method);
  cpu_handles_mem_size = (*env)->CallLongMethod(env, handles_mem, get_size_method);

  cpu_exceptions_mem = (void *) (*env)->CallLongMethod(env, exceptions_mem, get_address_method);
  cpu_exceptions_mem_size = (*env)->CallLongMethod(env, exceptions_mem, get_size_method);

  cpu_class_mem = (void *) (*env)->CallLongMethod(env, class_mem, get_address_method);
  cpu_class_mem_size = (*env)->CallLongMethod(env, class_mem, get_size_method);

  /* scratch block shared with the kernel; slot [1] carries the heap end.
     NOTE(review): only info_space[1] is initialized before the whole 1024
     bytes are uploaded — the remainder is uninitialized malloc memory. */
  info_space_size = 1024;
  info_space = (jlong *) malloc(info_space_size);
  info_space[1] = (*env)->CallLongMethod(env, object_mem, get_heap_end_method);

  //----------------------------------------------------------------------------
  //allocate mem
  //----------------------------------------------------------------------------
  status = cuMemAlloc(&gpu_info_space, info_space_size);
  CHECK_STATUS(env, "Error in cuMemAlloc: gpu_info_mem", status, device)

  status = cuMemAlloc(&gpu_object_mem, cpu_object_mem_size);
  CHECK_STATUS(env, "Error in cuMemAlloc: gpu_object_mem", status, device)

  status = cuMemAlloc(&gpu_handles_mem, cpu_handles_mem_size);
  CHECK_STATUS(env, "Error in cuMemAlloc: gpu_handles_mem", status, device)

  status = cuMemAlloc(&gpu_exceptions_mem, cpu_exceptions_mem_size);
  CHECK_STATUS(env, "Error in cuMemAlloc: gpu_exceptions_mem", status, device)

  status = cuMemAlloc(&gpu_class_mem, cpu_class_mem_size);
  CHECK_STATUS(env, "Error in cuMemAlloc: gpu_class_mem", status, device)

  status = cuMemAlloc(&gpu_heap_end, 8);
  CHECK_STATUS(env, "Error in cuMemAlloc: gpu_heap_end", status, device)

  status = cuMemAlloc(&gpu_buffer_size, 8);
  CHECK_STATUS(env, "Error in cuMemAlloc: gpu_buffer_size", status, device)

  //----------------------------------------------------------------------------
  //set function parameters
  //----------------------------------------------------------------------------
  /* parameter block: 7 device pointers packed back-to-back, then an int */
  status = cuParamSetSize(function, (7 * sizeof(CUdeviceptr) + sizeof(int)));
  CHECK_STATUS(env, "Error in cuParamSetSize", status, device)

  offset = 0;
  status = cuParamSetv(function, offset, (void *) &gpu_info_space, sizeof(CUdeviceptr));
  CHECK_STATUS(env, "Error in cuParamSetv gpu_info_space", status, device)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(function, offset, (void *) &gpu_object_mem, sizeof(CUdeviceptr));
  CHECK_STATUS(env, "Error in cuParamSetv: gpu_object_mem", status, device)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(function, offset, (void *) &gpu_handles_mem, sizeof(CUdeviceptr));
  CHECK_STATUS(env, "Error in cuParamSetv: gpu_handles_mem %", status, device)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(function, offset, (void *) &gpu_heap_end, sizeof(CUdeviceptr));
  CHECK_STATUS(env, "Error in cuParamSetv: gpu_heap_end", status, device)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(function, offset, (void *) &gpu_buffer_size, sizeof(CUdeviceptr));
  CHECK_STATUS(env, "Error in cuParamSetv: gpu_buffer_size", status, device)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(function, offset, (void *) &gpu_exceptions_mem, sizeof(CUdeviceptr));
  CHECK_STATUS(env, "Error in cuParamSetv: gpu_exceptions_mem", status, device)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(function, offset, (void *) &gpu_class_mem, sizeof(CUdeviceptr));
  CHECK_STATUS(env, "Error in cuParamSetv: gpu_class_mem", status, device)
  offset += sizeof(CUdeviceptr);

  status = cuParamSeti(function, offset, num_threads);
  CHECK_STATUS(env, "Error in cuParamSetv: num_threads", status, device)
  offset += sizeof(int);

  //----------------------------------------------------------------------------
  //copy data
  //----------------------------------------------------------------------------
  status = cuMemcpyHtoD(gpu_info_space, info_space, info_space_size);
  CHECK_STATUS(env, "Error in cuMemcpyHtoD: info_space", status, device)

  status = cuMemcpyHtoD(gpu_object_mem, cpu_object_mem, cpu_object_mem_size);
  CHECK_STATUS(env, "Error in cuMemcpyHtoD: gpu_object_mem", status, device)

  status = cuMemcpyHtoD(gpu_handles_mem, cpu_handles_mem, cpu_handles_mem_size);
  CHECK_STATUS(env, "Error in cuMemcpyHtoD: gpu_handles_mem", status, device)

  status = cuMemcpyHtoD(gpu_class_mem, cpu_class_mem, cpu_class_mem_size);
  CHECK_STATUS(env, "Error in cuMemcpyHtoD: gpu_class_mem", status, device)

  status = cuMemcpyHtoD(gpu_heap_end, &cpu_heap_end, sizeof(jlong));
  CHECK_STATUS(env, "Error in cuMemcpyHtoD: gpu_heap_end", status, device)

  status = cuMemcpyHtoD(gpu_buffer_size, &cpu_object_mem_size, sizeof(jlong));
  CHECK_STATUS(env, "Error in cuMemcpyHtoD: gpu_buffer_size", status, device)

  /* NOTE(review): the message says DtoH but this is an HtoD upload */
  status = cuMemcpyHtoD(gpu_exceptions_mem, cpu_exceptions_mem, cpu_exceptions_mem_size);
  CHECK_STATUS(env, "Error in cuMemcpyDtoH: gpu_exceptions_mem", status, device)

  //----------------------------------------------------------------------------
  //launch
  //----------------------------------------------------------------------------
  status = cuFuncSetBlockShape(function, block_shape_x, 1, 1);
  CHECK_STATUS(env, "Error in cuFuncSetBlockShape", status, device);

  status = cuLaunchGrid(function, grid_shape_x, 1);
  CHECK_STATUS(env, "Error in cuLaunchGrid", status, device)

  status = cuCtxSynchronize();
  CHECK_STATUS(env, "Error in cuCtxSynchronize", status, device)

  //----------------------------------------------------------------------------
  //copy data back
  //----------------------------------------------------------------------------
  status = cuMemcpyDtoH(info_space, gpu_info_space, info_space_size);
  CHECK_STATUS(env, "Error in cuMemcpyDtoH: gpu_info_space", status, device)

  /* the kernel publishes the new heap end in slot [1]; only that many
     bytes of the object region need to come back */
  cpu_heap_end = info_space[1];

  status = cuMemcpyDtoH(cpu_object_mem, gpu_object_mem, cpu_heap_end);
  CHECK_STATUS(env, "Error in cuMemcpyDtoH: gpu_object_mem", status, device)

  status = cuMemcpyDtoH(cpu_exceptions_mem, gpu_exceptions_mem, cpu_exceptions_mem_size);
  CHECK_STATUS(env, "Error in cuMemcpyDtoH: gpu_exceptions_mem", status, device)

  //----------------------------------------------------------------------------
  //free resources
  //----------------------------------------------------------------------------
  free(info_space);

  cuMemFree(gpu_info_space);
  cuMemFree(gpu_object_mem);
  cuMemFree(gpu_handles_mem);
  cuMemFree(gpu_exceptions_mem);
  cuMemFree(gpu_class_mem);
  cuMemFree(gpu_heap_end);
  cuMemFree(gpu_buffer_size);

  cuCtxDestroy(context);
}
void CUDARunner::FindBestConfiguration() { unsigned long lowb=16; unsigned long highb=128; unsigned long lowt=16; unsigned long hight=256; unsigned long bestb=16; unsigned long bestt=16; int offset=0; void *ptr=0; int64 besttime=std::numeric_limits<int64>::max(); if(m_requestedgrid>0 && m_requestedgrid<=65536) { lowb=m_requestedgrid; highb=m_requestedgrid; } if(m_requestedthreads>0 && m_requestedthreads<=65536) { lowt=m_requestedthreads; hight=m_requestedthreads; } for(int numb=lowb; numb<=highb; numb*=2) { for(int numt=lowt; numt<=hight; numt*=2) { if(AllocateResources(numb,numt)==true) { // clear out any existing error CUresult err=CUDA_SUCCESS; int64 st=GetTimeMillis(); for(int it=0; it<128*256*2 && err==CUDA_SUCCESS; it+=(numb*numt)) { cuMemcpyHtoD(m_devin,m_in,sizeof(cuda_in)); offset=0; int loops=64; int bits=5; ptr=(void *)(size_t)m_devin; ALIGN_UP(offset, __alignof(ptr)); cuParamSetv(m_function,offset,&ptr,sizeof(ptr)); offset+=sizeof(ptr); ptr=(void *)(size_t)m_devout; ALIGN_UP(offset, __alignof(ptr)); cuParamSetv(m_function,offset,&ptr,sizeof(ptr)); offset+=sizeof(ptr); ALIGN_UP(offset, __alignof(loops)); cuParamSeti(m_function,offset,loops); offset+=sizeof(loops); ALIGN_UP(offset, __alignof(bits)); cuParamSeti(m_function,offset,bits); offset+=sizeof(bits); cuParamSetSize(m_function,offset); err=cuFuncSetBlockShape(m_function,numt,1,1); if(err!=CUDA_SUCCESS) { printf("cuFuncSetBlockShape error %d\n",err); continue; } err=cuLaunchGrid(m_function,numb,1); if(err!=CUDA_SUCCESS) { printf("cuLaunchGrid error %d\n",err); continue; } cuMemcpyDtoH(m_out,m_devout,numt*numb*sizeof(cuda_out)); if(err!=CUDA_SUCCESS) { printf("CUDA error %d\n",err); } } int64 et=GetTimeMillis(); printf("Finding best configuration step end (%d,%d) %"PRI64d"ms prev best=%"PRI64d"ms\n",numb,numt,et-st,besttime); if((et-st)<besttime && err==CUDA_SUCCESS) { bestb=numb; bestt=numt; besttime=et-st; } } } } m_numb=bestb; m_numt=bestt; AllocateResources(m_numb,m_numt); }
/*
 * Entry point for the GPU bitonic sort driver.
 * Parses options, acquires the input list, sorts MAX_THREADS_PER_BLOCK-sized
 * chunks on the GPU (phase 1), then iteratively merges the sorted chunks
 * (phase 2).  NOTE(review): this chunk is truncated — main's epilogue
 * (timing end, output, cleanup, return) is not visible here.
 */
int main( int argc, char** argv)
{
	uint num_threads;               /* threads per block for the current launch */
	uint num_blocks, block_size;
	uint length;                    /* number of integers to sort */
	uint nBytes;                    /* input array size in bytes */
	int *list;
	int status, verbose, c, i, j, logBlocks;
	int read_stdin;
	struct timeval start_time, end_time;
	unsigned long total_time;
	CUdevice hDevice;
	CUcontext hContext;
	CUmodule hModule;
	CUfunction bitonicBlockFn;
	CUfunction mergeBlocksFn;
	CUdeviceptr pDeviceArrayA;
	CUdeviceptr pDeviceArrayB;

	status = SUCCESS;               /* NOTE(review): not used in the visible span */
	verbose = 0;
	read_stdin = FALSE;
	length = 0;

	/* Parse command-line flags. */
	while ((c = getopt (argc, argv, "dip:vO")) != -1) {
		switch (c) {
		case 'd':
			verbose |= GROSS_DEBUG;
			break;
		case 'i':
			read_stdin = TRUE;
			/* NOTE(review): no break — '-i' falls through and also sets
			 * OUTPUT; confirm the fall-through is intentional. */
		case 'O':
			verbose |= OUTPUT;
			break;
		case 'p':
			length = 1 << atoi(optarg);  /* -p k means 2^k elements */
			break;
		case 'v':
			verbose |= DEBUG;
			break;
		case '?':
		default:
			print_usage();
			return FAILURE;
		}
	}

	if ( read_stdin == TRUE ) {
		/* Read sequence of integers from stdin */
		list = (int*) malloc (INIT_INPUT_SIZE * sizeof(int) );
		length = readIntegers(list, INIT_INPUT_SIZE);
	} else if ( length > 0 ) {
		/* Size came from -p: fill with random integers. */
		list = (int*) malloc (length * sizeof(int) );
		randomInts(list, length);
	} else if (optind >= argc) {
		/* No size was given */
		print_usage();
		return FAILURE;
	} else {
		/* Generate our own integers */
		length = atoi(argv[optind]);
		list = (int*) malloc (length * sizeof(int) );
		randomInts(list, length);
	}

	/*
	 * Phase 1:
	 * There will be one thread for each element to be sorted. Each
	 * block will perform bitonic sort on MAX_THREADS_PER_BLOCK elements.
	 */
	/* Initialize sizes */
	num_threads = _min(length, MAX_THREADS_PER_BLOCK );
	num_blocks = (length-1) / MAX_THREADS_PER_BLOCK + 1;   /* ceil-division */
	nBytes = length * sizeof(int);

	if (verbose & DEBUG) printf("Initializing GPU.\n");

	/* Start timing */
	gettimeofday(&start_time, NULL);

	/* Initialize GPU */
	cutilDrvSafeCall( cuInit(0) );
	cutilDrvSafeCall( cuDeviceGet(&hDevice, 0) );
	cutilDrvSafeCall( cuCtxCreate(&hContext, 0, hDevice) );
	cutilDrvSafeCall( cuModuleLoad(&hModule, MODULE_FILE) );
	cutilDrvSafeCall( cuModuleGetFunction(&bitonicBlockFn, hModule, BITONIC_BLOCK_FN) );

	/* Allocate memory on the device */
	cutilDrvSafeCall( cuMemAlloc(&pDeviceArrayA, nBytes) );
	cutilDrvSafeCall( cuMemAlloc(&pDeviceArrayB, nBytes) );
	cutilDrvSafeCall( cuMemcpyHtoD(pDeviceArrayA, list, nBytes) );
	cutilDrvSafeCall( cuFuncSetBlockShape(bitonicBlockFn, num_threads, 1, 1));
	/* NOTE(review): cuParamSeti takes a 32-bit int, but pDeviceArrayA is a
	 * CUdeviceptr — on a 64-bit build this truncates the pointer (and the
	 * 4-byte cuParamSetSize matches only the 32-bit case).  Verify. */
	cutilDrvSafeCall( cuParamSeti(bitonicBlockFn, 0, pDeviceArrayA) );
	cutilDrvSafeCall( cuParamSetSize(bitonicBlockFn, 4) );

	/* Execute the kernel on the GPU */
	if ( verbose & DEBUG ) printf("Launching bitonic sort kernel with %d blocks and %d threads per block.\n", num_blocks, num_threads);
	cutilDrvSafeCall( cuLaunchGrid(bitonicBlockFn, num_blocks, 1) );

	/*
	 * Phase 2:
	 * At this point each block is a sorted list. Now it's time to merge them.
	 */
	/* TODO This should go away after development */
	if ( verbose & GROSS_DEBUG ) {
		cuMemcpyDtoH(list, pDeviceArrayA, nBytes);
		for (i=0; i<num_blocks; ++i) {
			printf("### Block %d:\n", i);
			for (j=0; j<num_threads; ++j) {
				printf("%d\n", list[i*num_threads + j]);
			}
		}
	}

	i=0;
	/* Do we need to merge blocks? */
	if ( num_blocks > 1 ) {
		/* There will be Log_2(num_blocks) merge steps. */
		logBlocks = 0;
		for (i=1; i<num_blocks; i *= 2) ++logBlocks;
		if ( verbose & DEBUG ) printf("There will be %d merge steps.\n", logBlocks);
		block_size = num_threads;       /* How big the blocks were in the last grid launch. */
		num_threads = num_blocks >> 1;  /* Start with blocks/2 threads */
		num_blocks = (num_threads-1) / MAX_THREADS_PER_BLOCK + 1;
		cutilDrvSafeCall( cuModuleGetFunction(&mergeBlocksFn, hModule, MERGE_BLOCKS_FN) );
		/* Merge kernel argument layout (manual offsets, 32-bit slots):
		 * [0]=merge level, [4]=block_size, [8]=src array, [12]=dst array. */
		cuParamSeti(mergeBlocksFn, 4, block_size);
		cuParamSetSize(mergeBlocksFn, 16);
		for (i=0; i < logBlocks; ++i) {
			cuFuncSetBlockShape(mergeBlocksFn, num_threads, 1, 1);
			cuParamSeti(mergeBlocksFn, 0, i); /* set merge level */
			/* Merging uses a source array and destination array, the gpu has 2 arrays allocated
			 * so we swap which is the source and which is the destination for each iteration. */
			if ( i%2 == 0 ) {
				cuParamSeti(mergeBlocksFn, 8, pDeviceArrayA);
				cuParamSeti(mergeBlocksFn, 12, pDeviceArrayB);
			} else {
				cuParamSeti(mergeBlocksFn, 8, pDeviceArrayB);
				cuParamSeti(mergeBlocksFn, 12, pDeviceArrayA);
			}
			if ( verbose & DEBUG ) {
				printf("Launching block merge kernel with %d blocks and %d threads per block\n", num_blocks, num_threads/num_blocks);
			}
			cutilDrvSafeCall( cuLaunchGrid(mergeBlocksFn, num_blocks, 1) );
			/* Each merge level halves the number of worker threads. */
			num_threads = num_threads >> 1;
			num_blocks = (num_threads-1) / MAX_THREADS_PER_BLOCK + 1;
		}
	}
//host driver //void hostDriver(CUfunction drvfun, dim3 grid, dim3 threads, int shmem, int imgSizeX, int imgSizeY, int shmemX, int nrhs, hostdrv_pars_t *prhs) { void hostDriver(CUfunction drvfun, int N, int nrhs, hostdrv_pars_t *prhs, int imx, int imy, int outx, int outy, int poolx, int pooly){ //mexPrintf("threads.x: %d threads.y: %d threads.z %d\n",threads.x,threads.y,threads.z); unsigned int maxthreads = 65000; // Set threads per block here. unsigned int blocksdim1d = 256; dim3 threads(blocksdim1d, 1, 1); int nstreams = iDivUp(N, maxthreads*blocksdim1d); CUresult err = CUDA_SUCCESS; for (int str = 0; str < nstreams; str++) { int offset = str * maxthreads * blocksdim1d; int size = 0; if (str == (nstreams - 1)) size = N - str * maxthreads * blocksdim1d; else size = maxthreads * blocksdim1d; int gridx = iDivUp(size, blocksdim1d); // number of x blocks // setup execution parameters if (CUDA_SUCCESS != (err = cuFuncSetBlockShape(drvfun, threads.x, threads.y, threads.y))) { mexErrMsgTxt("Error in cuFuncSetBlockShape"); } if (CUDA_SUCCESS != cuFuncSetSharedSize(drvfun, 0)) { mexErrMsgTxt("Error in cuFuncSetSharedSize"); } //mexPrintf("block shape ok\n"); // add parameters int poffset = 0; // CUDA kernels interface // N: number of elements for (int p=0;p<nrhs;p++) { ALIGN_UP(poffset, prhs[p].align); if (CUDA_SUCCESS != cuParamSetv(drvfun, poffset, prhs[p].par, prhs[p].psize)) { mexErrMsgTxt("Error in cuParamSetv"); } poffset += prhs[p].psize; } ALIGN_UP(poffset, __alignof(size)); if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, size)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(size); ALIGN_UP(poffset, __alignof(offset)); if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, offset)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(offset); ALIGN_UP(poffset, __alignof(imx)); if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, imx)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(imx); ALIGN_UP(poffset, __alignof(imy)); if (CUDA_SUCCESS != 
cuParamSeti(drvfun, poffset, imy)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(imy); ALIGN_UP(poffset, __alignof(outx)); if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, outx)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(outx); ALIGN_UP(poffset, __alignof(outy)); if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, outy)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(outy); ALIGN_UP(poffset, __alignof(poolx)); if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, poolx)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(poolx); ALIGN_UP(poffset, __alignof(pooly)); if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, pooly)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(pooly); // if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, shmemX)) { // mexErrMsgTxt("Error in cuParamSeti"); // } // poffset += sizeof(shmemX); if (CUDA_SUCCESS != cuParamSetSize(drvfun, poffset)) { mexErrMsgTxt("Error in cuParamSetSize"); } err = cuLaunchGridAsync(drvfun, gridx, 1, 0); if (CUDA_SUCCESS != err) { mexErrMsgTxt("Error running kernel"); } } }
int main(int argc, char *argv[]) { srand(time(NULL)); for(int k=0;k<4;k++) { int n = 30*(k+1); float x = ((float) rand()) / (float) RAND_MAX; float *a = new float[n+1]; float resultGPU; for(int i = 0; i < n + 1; i++) a[i] = i * 0.5*((float) rand()) / (float) RAND_MAX; int blocks = (n + 1) / BLK_SZ; if((n + 1) % BLK_SZ) blocks++; CUdevice hDevice; CUcontext hContext; CUmodule hModule; CUfunction hFunction; CALL( cuInit(0) ); CALL( cuDeviceGet(&hDevice, 0) ); CALL( cuCtxCreate(&hContext, 0, hDevice) ); CALL( cuModuleLoad(&hModule, "kernel.cubin") ); CALL( cuModuleGetFunction(&hFunction, hModule, "Polynomial") ); //dane wejsciowe - kopiowanie CUdeviceptr DevA, DevResult; CALL( cuMemAlloc(&DevA, (n+1)*sizeof(float) ) ); CALL( cuMemAlloc(&DevResult, sizeof(float) ) ); CALL( cuMemcpyHtoD(DevA, a, (n+1)*sizeof(float) ) ); CALL( cuFuncSetBlockShape(hFunction, BLK_SZ, 1, 1) ); //przekazanie parametrow do kernela int offset = 0; void *ptr; ptr = (void*)(size_t)DevResult; ALIGN_UP(offset, __alignof(ptr)); CALL( cuParamSetv(hFunction, offset, &ptr, sizeof(ptr)) ); offset += sizeof(ptr); ptr = (void*)(size_t)DevA; ALIGN_UP(offset, __alignof(ptr)); CALL( cuParamSetv(hFunction, offset, &ptr, sizeof(ptr)) ); offset += sizeof(ptr); ALIGN_UP(offset, __alignof(float)); CALL( cuParamSetf(hFunction, offset, x) ); offset += sizeof(float); ALIGN_UP(offset, __alignof(int)); CALL( cuParamSeti(hFunction, offset, n) ); offset += sizeof(int); CALL( cuParamSetSize(hFunction, offset) ); CALL( cuLaunchGrid(hFunction, blocks, 1) ); //kopiowanie wyniku na hosta CALL( cuMemcpyDtoH((void *) &resultGPU, DevResult, sizeof(float) ) ); //zwalnianie pamieci na urzadzeniu CALL( cuMemFree(DevA) ); CALL( cuMemFree(DevResult) ); //obliczenia na CPU float resultCPU = PolynomialCPU(a, x, n); std::cout << "GPU:\t" << resultGPU << std::endl; std::cout << "CPU:\t" << resultCPU << std::endl; std::cout << "roznica:\t" << fabs(resultGPU - resultCPU) << std::endl; delete [] a; } return 0; }
/* * parameter setting * provide kernel with needed parameter when it be launched */ void parameter_set(void){ res = cuParamSeti(function, 0, x_dev); if(res != CUDA_SUCCESS){ printf("cuParamSeti(x) failed: res = %s\n", conv(res)); exit(1); } res = cuParamSeti(function, 4, x_dev >> 32); if(res != CUDA_SUCCESS){ printf("cuParamSeti(x) failed: res = %s\n", conv(res)); exit(1); } res = cuParamSeti(function, 8, v_dev); if(res != CUDA_SUCCESS){ printf("cuParamSeti(v) failed: res = %s\n", conv(res)); exit(1); } res = cuParamSeti(function, 12, v_dev >> 32); if(res != CUDA_SUCCESS){ printf("cuParamSeti(v) failed: res = %s\n", conv(res)); exit(1); } res = cuParamSetv(function, 16, &a, 8); if(res != CUDA_SUCCESS){ printf("cuParamSetv(a) failed: res = %s\n", conv(res)); exit(1); } res = cuParamSeti(function, 24, error_dev); if(res != CUDA_SUCCESS){ printf("cuParamSeti(error) failed: res = %s\n", conv(res)); exit(1); } res = cuParamSeti(function, 28, error_dev >> 32); if(res != CUDA_SUCCESS){ printf("cuParamSeti(error) failed: res = %s\n", conv(res)); exit(1); } res = cuParamSeti(function, 32, s_time_dev); if(res != CUDA_SUCCESS){ printf("cuParamSeti(error) failed: res = %s\n", conv(res)); exit(1); } res = cuParamSeti(function, 36, s_time_dev >> 32); if(res != CUDA_SUCCESS){ printf("cuParamSeti(error) failed: res = %s\n", conv(res)); exit(1); } res = cuParamSetSize(function, 40); if(res != CUDA_SUCCESS){ printf("cuParaMSetSize() failed: res = %s\n", conv(res)); exit(1); } }
void Function::setParameter(int offset, int value) const { detail::error_check(cuParamSeti(impl->func, offset, value), "Can't set Cuda function parameter (int)"); }
/************************************************* * HOST DRIVERS *************************************************/ void hostGPUDRV(CUfunction drvfun, int N, int nrhs, hostdrv_pars_t *prhs) { unsigned int maxthreads = MAXTHREADS_STREAM; int nstreams = iDivUp(N, maxthreads*BLOCK_DIM1D); CUresult err = CUDA_SUCCESS; for (int str = 0; str < nstreams; str++) { int offset = str * maxthreads * BLOCK_DIM1D; int size = 0; if (str == (nstreams - 1)) size = N - str * maxthreads * BLOCK_DIM1D; else size = maxthreads * BLOCK_DIM1D; int gridx = iDivUp(size, BLOCK_DIM1D); // number of x blocks // setup execution parameters if (CUDA_SUCCESS != (err = cuFuncSetBlockShape(drvfun, BLOCK_DIM1D, 1, 1))) { mexErrMsgTxt("Error in cuFuncSetBlockShape"); } if (CUDA_SUCCESS != cuFuncSetSharedSize(drvfun, 0)) { mexErrMsgTxt("Error in cuFuncSetSharedSize"); } // add parameters int poffset = 0; // CUDA kernels interface // N: number of elements // offset: used for streams ALIGN_UP(poffset, __alignof(size)); if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, size)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(size); ALIGN_UP(poffset, __alignof(offset)); if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, offset)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(offset); for (int p=0;p<nrhs;p++) { ALIGN_UP(poffset, prhs[p].align); if (CUDA_SUCCESS != cuParamSetv(drvfun, poffset, prhs[p].par, prhs[p].psize)) { mexErrMsgTxt("Error in cuParamSetv"); } poffset += prhs[p].psize; } if (CUDA_SUCCESS != cuParamSetSize(drvfun, poffset)) { mexErrMsgTxt("Error in cuParamSetSize"); } err = cuLaunchGridAsync(drvfun, gridx, 1, 0); if (CUDA_SUCCESS != err) { mexErrMsgTxt("Error running kernel"); } } }
//////////////////////////////////////////////////////////////////////////////// //! Run a simple test for CUDA //////////////////////////////////////////////////////////////////////////////// void runTest(int argc, char** argv) { CUcontext cuContext; // initialize CUDA CUfunction pk = NULL; const char cubin_name [] = "pass_kernel.cubin"; const char kernel_name [] = "pass_kernel"; CU_SAFE_CALL(initCuda(cuContext, argv[0], &pk, argc, argv, cubin_name, kernel_name)); printf("initCuda-returned CUfunction:\n"); // cuParamSetx, x=i f v // http://visionexperts.blogspot.com/2010/07/cuda-parameter-alignment.html - check alignment #define ALIGN_UP(offset, alignment) \ (offset) = ((offset) + (alignment) - 1) & ~((alignment) - 1) size_t offset = 0; // input integers // CU paramset i. for(int i = 0 ; i < NUM_ARG ; i++) { int align = __alignof(int); ALIGN_UP(offset, align); cuParamSeti(pk, offset, i); printf ("offset %d = %d\n", i, offset); offset += sizeof(int); } // return array for updated inputs int size_int = sizeof(int); int size_array = size_int * NUM_ARG; CUdeviceptr d_return_values; cuMemAlloc (&d_return_values, size_array); void* ptr = (void*)(size_t)d_return_values; int align = __alignof(ptr); ALIGN_UP(offset, align); cuParamSetv(pk, offset, &ptr, sizeof(ptr)); printf("return values offset:%d\n", offset); offset += sizeof(ptr); CUdeviceptr d_return_N; cuMemAlloc(&d_return_N, size_int); void* ptrN = (void*)(size_t)d_return_N; int alignN = __alignof(ptrN); ALIGN_UP(offset, alignN); cuParamSetv(pk, offset, &ptrN, sizeof(ptr)); printf("return int offset:%d\n", offset); offset += sizeof(ptrN); // Calling kernel int BLOCK_SIZE_X = NUM_ARG; int BLOCK_SIZE_Y = 1; int BLOCK_SIZE_Z = 1; int GRID_SIZE = 1; cutilDrvSafeCallNoSync(cuFuncSetBlockShape(pk, BLOCK_SIZE_X, BLOCK_SIZE_Y, BLOCK_SIZE_Z)); printf("paramsetsize:%d\n", offset); CU_SAFE_CALL(cuParamSetSize(pk, offset)); CU_SAFE_CALL(cuLaunchGrid(pk, GRID_SIZE, GRID_SIZE)); int* h_return_values = (int*)malloc(NUM_ARG * 
sizeof(int)); CU_SAFE_CALL(cuMemcpyDtoH((void*)h_return_values, d_return_values, size_array)); CU_SAFE_CALL(cuMemFree(d_return_values)); for(int i=0;i<NUM_ARG;i++) printf("%dth value = %d\n", i, h_return_values[i]); free(h_return_values); int* h_return_N = (int*)malloc(sizeof(int)); CU_SAFE_CALL(cuMemcpyDtoH((void*)h_return_N, d_return_N, size_int)); CU_SAFE_CALL(cuMemFree(d_return_N)); printf("%d sizeof array\n", *h_return_N); if(cuContext !=NULL) cuCtxDetach(cuContext); }