void initGLBuffers() { if (pbo) { // delete old buffer checkCudaErrors(cudaGraphicsUnregisterResource(cuda_pbo_resource)); glDeleteBuffersARB(1, &pbo); } // create pixel buffer object for display glGenBuffersARB(1, &pbo); glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); glBufferDataARB(GL_PIXEL_UNPACK_BUFFER_ARB, width*height*sizeof(uchar4), 0, GL_STREAM_DRAW_ARB); glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, 0); checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_pbo_resource, pbo, cudaGraphicsMapFlagsWriteDiscard)); #if USE_BUFFER_TEX // create buffer texture, attach to pbo if (bufferTex) { glDeleteTextures(1, &bufferTex); } glGenTextures(1, &bufferTex); glBindTexture(GL_TEXTURE_BUFFER_EXT, bufferTex); glTexBufferEXT(GL_TEXTURE_BUFFER_EXT, GL_RGBA8, pbo); glBindTexture(GL_TEXTURE_BUFFER_EXT, 0); #else // create texture for display if (displayTex) { glDeleteTextures(1, &displayTex); } glGenTextures(1, &displayTex); glBindTexture(GL_TEXTURE_TYPE, displayTex); glTexImage2D(GL_TEXTURE_TYPE, 0, GL_RGBA8, width, height, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL); glTexParameteri(GL_TEXTURE_TYPE, GL_TEXTURE_MIN_FILTER, GL_NEAREST); glTexParameteri(GL_TEXTURE_TYPE, GL_TEXTURE_MAG_FILTER, GL_NEAREST); glBindTexture(GL_TEXTURE_TYPE, 0); #endif // calculate new grid size gridSize = dim3(iDivUp(width, blockSize.x), iDivUp(height, blockSize.y)); }
void CudaNarrowphase::squeezeContacts(void * overlappingPairs, unsigned numOverlappingPairs) { void * srcContact = m_contact[0]->bufferOnDevice(); const unsigned scanValidPairLength = iDivUp(numOverlappingPairs, 1024) * 1024; m_validCounts->create(scanValidPairLength * 4); void * counts = m_validCounts->bufferOnDevice(); narrowphaseComputeValidPairs((uint *)counts, (ContactData *)srcContact, numOverlappingPairs, scanValidPairLength); m_scanValidContacts[0]->create(scanValidPairLength * 4); m_scanValidContacts[1]->create(scanValidPairLength * 4); void * scanResult = m_scanValidContacts[0]->bufferOnDevice(); void * scanIntermediate = m_scanValidContacts[1]->bufferOnDevice(); scanExclusive((uint *)scanResult, (uint *)counts, (uint *)scanIntermediate, scanValidPairLength / 1024, 1024); m_numContacts = ScanUtil::getScanResult(m_validCounts, m_scanValidContacts[0], scanValidPairLength); if(m_numContacts < 1) return; m_contactPairs->create(numOverlappingPairs * 8); void * dstPairs = m_contactPairs->bufferOnDevice(); void * dstContact = m_contact[1]->bufferOnDevice(); narrowphaseSqueezeContactPairs((uint2 *)dstPairs, (uint2 *)overlappingPairs, (ContactData *)dstContact, (ContactData *)srcContact, (uint *)counts, (uint *)scanResult, numOverlappingPairs); }
void reshape(int w, int h) { width = w; height = h; initPixelBuffer(); // calculate new grid size gridSize = dim3(iDivUp(width, blockSize.x), iDivUp(height, blockSize.y)); glViewport(0, 0, w, h); glMatrixMode(GL_MODELVIEW); glLoadIdentity(); glMatrixMode(GL_PROJECTION); glLoadIdentity(); glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0); }
extern "C" size_t histogram64( cl_command_queue cqCommandQueue, cl_mem d_Histogram, cl_mem d_Data, uint byteCount ){ cl_int ciErrNum; uint histogramCount; size_t localWorkSize, globalWorkSize; if(!cqCommandQueue) cqCommandQueue = cqDefaultCommandQue; { histogramCount = iDivUp(byteCount, HISTOGRAM64_WORKGROUP_SIZE * iSnapDown(255, 16)); shrCheckError( (byteCount % 16 == 0), shrTRUE ); shrCheckError( (histogramCount <= MAX_PARTIAL_HISTOGRAM64_COUNT), shrTRUE ); cl_uint dataCount = byteCount / 16; ciErrNum = clSetKernelArg(ckHistogram64, 0, sizeof(cl_mem), (void *)&d_PartialHistograms); ciErrNum |= clSetKernelArg(ckHistogram64, 1, sizeof(cl_mem), (void *)&d_Data); ciErrNum |= clSetKernelArg(ckHistogram64, 2, sizeof(cl_uint), (void *)&dataCount); shrCheckError(ciErrNum, CL_SUCCESS); localWorkSize = HISTOGRAM64_WORKGROUP_SIZE; globalWorkSize = histogramCount * HISTOGRAM64_WORKGROUP_SIZE; ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckHistogram64, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL); shrCheckError(ciErrNum, CL_SUCCESS); } { ciErrNum = clSetKernelArg(ckMergeHistogram64, 0, sizeof(cl_mem), (void *)&d_Histogram); ciErrNum |= clSetKernelArg(ckMergeHistogram64, 1, sizeof(cl_mem), (void *)&d_PartialHistograms); ciErrNum |= clSetKernelArg(ckMergeHistogram64, 2, sizeof(cl_uint), (void *)&histogramCount); shrCheckError(ciErrNum, CL_SUCCESS); localWorkSize = MERGE_WORKGROUP_SIZE; globalWorkSize = HISTOGRAM64_BIN_COUNT * MERGE_WORKGROUP_SIZE; ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckMergeHistogram64, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL); shrCheckError(ciErrNum, CL_SUCCESS); return HISTOGRAM64_WORKGROUP_SIZE; } }
/************************************************* * HOST DRIVERS *************************************************/ void hostGPUDRV(CUfunction drvfun, int N, int nrhs, hostdrv_pars_t *prhs) { unsigned int maxthreads = MAXTHREADS_STREAM; int nstreams = iDivUp(N, maxthreads*BLOCK_DIM1D); CUresult err = CUDA_SUCCESS; for (int str = 0; str < nstreams; str++) { int offset = str * maxthreads * BLOCK_DIM1D; int size = 0; if (str == (nstreams - 1)) size = N - str * maxthreads * BLOCK_DIM1D; else size = maxthreads * BLOCK_DIM1D; int gridx = iDivUp(size, BLOCK_DIM1D); // number of x blocks // setup execution parameters if (CUDA_SUCCESS != (err = cuFuncSetBlockShape(drvfun, BLOCK_DIM1D, 1, 1))) { mexErrMsgTxt("Error in cuFuncSetBlockShape"); } if (CUDA_SUCCESS != cuFuncSetSharedSize(drvfun, 0)) { mexErrMsgTxt("Error in cuFuncSetSharedSize"); } // add parameters int poffset = 0; // CUDA kernels interface // N: number of elements // offset: used for streams ALIGN_UP(poffset, __alignof(size)); if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, size)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(size); ALIGN_UP(poffset, __alignof(offset)); if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, offset)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(offset); for (int p=0;p<nrhs;p++) { ALIGN_UP(poffset, prhs[p].align); if (CUDA_SUCCESS != cuParamSetv(drvfun, poffset, prhs[p].par, prhs[p].psize)) { mexErrMsgTxt("Error in cuParamSetv"); } poffset += prhs[p].psize; } if (CUDA_SUCCESS != cuParamSetSize(drvfun, poffset)) { mexErrMsgTxt("Error in cuParamSetSize"); } err = cuLaunchGridAsync(drvfun, gridx, 1, 0); if (CUDA_SUCCESS != err) { mexErrMsgTxt("Error running kernel"); } } }
//host driver //void hostDriver(CUfunction drvfun, dim3 grid, dim3 threads, int shmem, int imgSizeX, int imgSizeY, int shmemX, int nrhs, hostdrv_pars_t *prhs) { void hostDriver(CUfunction drvfun, int N, int nrhs, hostdrv_pars_t *prhs, int imx, int imy, int outx, int outy, int poolx, int pooly){ //mexPrintf("threads.x: %d threads.y: %d threads.z %d\n",threads.x,threads.y,threads.z); unsigned int maxthreads = 65000; // Set threads per block here. unsigned int blocksdim1d = 256; dim3 threads(blocksdim1d, 1, 1); int nstreams = iDivUp(N, maxthreads*blocksdim1d); CUresult err = CUDA_SUCCESS; for (int str = 0; str < nstreams; str++) { int offset = str * maxthreads * blocksdim1d; int size = 0; if (str == (nstreams - 1)) size = N - str * maxthreads * blocksdim1d; else size = maxthreads * blocksdim1d; int gridx = iDivUp(size, blocksdim1d); // number of x blocks // setup execution parameters if (CUDA_SUCCESS != (err = cuFuncSetBlockShape(drvfun, threads.x, threads.y, threads.y))) { mexErrMsgTxt("Error in cuFuncSetBlockShape"); } if (CUDA_SUCCESS != cuFuncSetSharedSize(drvfun, 0)) { mexErrMsgTxt("Error in cuFuncSetSharedSize"); } //mexPrintf("block shape ok\n"); // add parameters int poffset = 0; // CUDA kernels interface // N: number of elements for (int p=0;p<nrhs;p++) { ALIGN_UP(poffset, prhs[p].align); if (CUDA_SUCCESS != cuParamSetv(drvfun, poffset, prhs[p].par, prhs[p].psize)) { mexErrMsgTxt("Error in cuParamSetv"); } poffset += prhs[p].psize; } ALIGN_UP(poffset, __alignof(size)); if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, size)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(size); ALIGN_UP(poffset, __alignof(offset)); if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, offset)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(offset); ALIGN_UP(poffset, __alignof(imx)); if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, imx)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(imx); ALIGN_UP(poffset, __alignof(imy)); if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, imy)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(imy); ALIGN_UP(poffset, __alignof(outx)); if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, outx)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(outx); ALIGN_UP(poffset, __alignof(outy)); if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, outy)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(outy); ALIGN_UP(poffset, __alignof(poolx)); if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, poolx)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(poolx); ALIGN_UP(poffset, __alignof(pooly)); if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, pooly)) { mexErrMsgTxt("Error in cuParamSeti"); } poffset += sizeof(pooly); // if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, shmemX)) { // mexErrMsgTxt("Error in cuParamSeti"); // } // poffset += sizeof(shmemX); if (CUDA_SUCCESS != cuParamSetSize(drvfun, poffset)) { mexErrMsgTxt("Error in cuParamSetSize"); } err = cuLaunchGridAsync(drvfun, gridx, 1, 0); if (CUDA_SUCCESS != err) { mexErrMsgTxt("Error running kernel"); } } }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main( int argc, char** argv) { //start logs shrSetLogFileName ("volumeRender.txt"); shrLog("%s Starting...\n\n", argv[0]); if (cutCheckCmdLineFlag(argc, (const char **)argv, "qatest") || cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt")) { g_bQAReadback = true; fpsLimit = frameCheckNumber; } if (cutCheckCmdLineFlag(argc, (const char **)argv, "glverify")) { g_bQAGLVerify = true; fpsLimit = frameCheckNumber; } if (g_bQAReadback) { // use command-line specified CUDA device, otherwise use device with highest Gflops/s if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) { cutilDeviceInit(argc, argv); } else { cudaSetDevice( cutGetMaxGflopsDeviceId() ); } } else { // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. initGL( &argc, argv ); // use command-line specified CUDA device, otherwise use device with highest Gflops/s if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) { cutilGLDeviceInit(argc, argv); } else { cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() ); } /* int device; struct cudaDeviceProp prop; cudaGetDevice( &device ); cudaGetDeviceProperties( &prop, device ); if( !strncmp( "Tesla", prop.name, 5 ) ) { shrLog("This sample needs a card capable of OpenGL and display.\n"); shrLog("Please choose a different device with the -device=x argument.\n"); cutilExit(argc, argv); } */ } // parse arguments char *filename; if (cutGetCmdLineArgumentstr( argc, (const char**) argv, "file", &filename)) { volumeFilename = filename; } int n; if (cutGetCmdLineArgumenti( argc, (const char**) argv, "size", &n)) { volumeSize.width = volumeSize.height = volumeSize.depth = n; } if (cutGetCmdLineArgumenti( argc, (const char**) argv, "xsize", &n)) { volumeSize.width = n; } if (cutGetCmdLineArgumenti( argc, (const char**) argv, "ysize", &n)) { volumeSize.height = n; } if (cutGetCmdLineArgumenti( argc, (const char**) argv, "zsize", &n)) { volumeSize.depth = n; } // load volume data char* path = shrFindFilePath(volumeFilename, argv[0]); if (path == 0) { shrLog("Error finding file '%s'\n", volumeFilename); exit(EXIT_FAILURE); } size_t size = volumeSize.width*volumeSize.height*volumeSize.depth*sizeof(VolumeType); void *h_volume = loadRawFile(path, size); initCuda(h_volume, volumeSize); free(h_volume); cutilCheckError( cutCreateTimer( &timer)); shrLog("Press '=' and '-' to change density\n" " ']' and '[' to change brightness\n" " ';' and ''' to modify transfer function offset\n" " '.' and ',' to modify transfer function scale\n\n"); // calculate new grid size gridSize = dim3(iDivUp(width, blockSize.x), iDivUp(height, blockSize.y)); if (g_bQAReadback) { g_CheckRender = new CheckBackBuffer(width, height, 4, false); g_CheckRender->setPixelFormat(GL_RGBA); g_CheckRender->setExecPath(argv[0]); g_CheckRender->EnableQAReadback(true); uint *d_output; cutilSafeCall(cudaMalloc((void**)&d_output, width*height*sizeof(uint))); cutilSafeCall(cudaMemset(d_output, 0, width*height*sizeof(uint))); float modelView[16] = { 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 4.0f, 1.0f }; invViewMatrix[0] = modelView[0]; invViewMatrix[1] = modelView[4]; invViewMatrix[2] = modelView[8]; invViewMatrix[3] = modelView[12]; invViewMatrix[4] = modelView[1]; invViewMatrix[5] = modelView[5]; invViewMatrix[6] = modelView[9]; invViewMatrix[7] = modelView[13]; invViewMatrix[8] = modelView[2]; invViewMatrix[9] = modelView[6]; invViewMatrix[10] = modelView[10]; invViewMatrix[11] = modelView[14]; // call CUDA kernel, writing results to PBO copyInvViewMatrix(invViewMatrix, sizeof(float4)*3); // Start timer 0 and process n loops on the GPU int nIter = 10; for (int i = -1; i < nIter; i++) { if( i == 0 ) { cudaThreadSynchronize(); cutStartTimer(timer); } render_kernel(gridSize, blockSize, d_output, width, height, density, brightness, transferOffset, transferScale); } cudaThreadSynchronize(); cutStopTimer(timer); // Get elapsed time and throughput, then log to sample and master logs double dAvgTime = cutGetTimerValue(timer)/(nIter * 1000.0); shrLogEx(LOGBOTH | MASTER, 0, "volumeRender, Throughput = %.4f MTexels/s, Time = %.5f s, Size = %u Texels, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * width * height)/dAvgTime, dAvgTime, (width * height), 1, blockSize.x * blockSize.y); cutilCheckMsg("Error: render_kernel() execution FAILED"); cutilSafeCall( cudaThreadSynchronize() ); cutilSafeCall( cudaMemcpy(g_CheckRender->imageData(), d_output, width*height*4, cudaMemcpyDeviceToHost) ); g_CheckRender->savePPM(sOriginal[g_Index], true, NULL); if (!g_CheckRender->PPMvsPPM(sOriginal[g_Index], sReference[g_Index], MAX_EPSILON_ERROR, THRESHOLD)) { shrLog("\nFAILED\n\n"); } else { shrLog("\nPASSED\n\n"); } cudaFree(d_output); freeCudaBuffers(); if (g_CheckRender) { delete g_CheckRender; g_CheckRender = NULL; } } else { // This is the normal rendering path for VolumeRender glutDisplayFunc(display); glutKeyboardFunc(keyboard); glutMouseFunc(mouse); glutMotionFunc(motion); glutReshapeFunc(reshape); glutIdleFunc(idle); initPixelBuffer(); if (g_bQAGLVerify) { g_CheckRender = new CheckBackBuffer(width, height, 4); g_CheckRender->setPixelFormat(GL_RGBA); g_CheckRender->setExecPath(argv[0]); g_CheckRender->EnableQAReadback(true); } atexit(cleanup); glutMainLoop(); } cudaThreadExit(); shrEXIT(argc, (const char**)argv); }
void initData(int argc, char **argv) { // parse arguments char *filename; if (getCmdLineArgumentString(argc, (const char **) argv, "file", &filename)) { volumeFilename = filename; } int n; if (checkCmdLineFlag(argc, (const char **) argv, "size")) { n = getCmdLineArgumentInt(argc, (const char **) argv, "size"); volumeSize.width = volumeSize.height = volumeSize.depth = n; } if (checkCmdLineFlag(argc, (const char **) argv, "xsize")) { n = getCmdLineArgumentInt(argc, (const char **) argv, "xsize"); volumeSize.width = n; } if (checkCmdLineFlag(argc, (const char **) argv, "ysize")) { n = getCmdLineArgumentInt(argc, (const char **) argv, "ysize"); volumeSize.height = n; } if (checkCmdLineFlag(argc, (const char **) argv, "zsize")) { n = getCmdLineArgumentInt(argc, (const char **) argv, "zsize"); volumeSize.depth = n; } char *path = sdkFindFilePath(volumeFilename, argv[0]); if (path == 0) { printf("Error finding file '%s'\n", volumeFilename); exit(EXIT_FAILURE); } size_t size = volumeSize.width*volumeSize.height*volumeSize.depth*sizeof(VolumeType); void *h_volume = loadRawFile(path, size); FilterKernel_init(); Volume_init(&volumeOriginal,volumeSize, h_volume, 0); free(h_volume); Volume_init(&volumeFilter0, volumeSize, NULL, 1); Volume_init(&volumeFilter1, volumeSize, NULL, 1); VolumeRender_init(); VolumeRender_setPreIntegrated(preIntegrated); VolumeRender_setVolume(&volumeOriginal); sdkCreateTimer(&timer); sdkCreateTimer(&animationTimer); sdkStartTimer(&animationTimer); // calculate new grid size gridSize = dim3(iDivUp(width, blockSize.x), iDivUp(height, blockSize.y)); }
int main(int argc, char *argv[]) { QCoreApplication a(argc, argv); int size = 20000000; int vt = 5; //example // size =9; int* vector = (int *) malloc (size * sizeof(int)); int* vectorCheck = (int *) malloc (size * sizeof(int)); /* //example vector[0] = 1; vector[1] = 3; vector[2] = 5; vector[3] = 2; vector[4] = 7; vector[5] = 9; vector[6] = 6; vector[7] = 2; vector[8] = 3; */ int number = 0; for (int i = 0; i < size; i++){ if (i % (vt * 128) == 0) number++; vector[i] = rand() % 10; } /* for (int i=0; i<size; i++) printf(" %d ", vector[i]); printf("\n"); */ pickCudaDevice(); checkCudaError(); int* d_vector; cudaMalloc((void **) &d_vector, size * sizeof(int)); checkCudaError(); int* d_result; cudaMalloc((void **) &d_result, size * sizeof(int)); checkCudaError(); int* d_vectorCheck; cudaMalloc((void **) &d_vectorCheck, size * sizeof(int)); checkCudaError(); int* d_resultCheck; cudaMalloc((void **) &d_resultCheck, size * sizeof(int)); checkCudaError(); uint numThreads, numBlocks; cudaMemcpy(d_vector,vector,size * sizeof(int), cudaMemcpyHostToDevice); cudaMemcpy(d_vectorCheck,vector,size * sizeof(int), cudaMemcpyHostToDevice); computeGridSize(iDivUp(size,VT),NTHREADS,numBlocks,numThreads); printf("Start kernel\n"); //reduce_wrapper(numBlocks,numThreads,d_result,d_vector,size, vt); //checkCudaError(); //gpu time measurement cudaEvent_t gstart_exScan,gstop_exScan; cudaEventCreate(&gstart_exScan); cudaEventCreate(&gstop_exScan); cudaEventRecord(gstart_exScan, 0); exclusiveScan_wrapper2(numBlocks, numThreads, d_result, d_vector, size, VT); cudaEventRecord(gstop_exScan, 0); cudaEventSynchronize(gstop_exScan); float gpu_time_exScan; cudaEventElapsedTime(&gpu_time_exScan, gstart_exScan, gstop_exScan); printf("Our GPU version has finished, it took %f ms\n",gpu_time_exScan ); cudaEventDestroy(gstart_exScan); //cleaning up a bit cudaEventDestroy(gstop_exScan); checkCudaError(); //gpu time measurement cudaEvent_t gstart,gstop; cudaEventCreate(&gstart); cudaEventCreate(&gstop); cudaEventRecord(gstart, 0); exclusiveScan_thrust(d_vectorCheck, d_vectorCheck + size, d_resultCheck, 0); cudaEventRecord(gstop, 0); cudaEventSynchronize(gstop); float gpu_time; cudaEventElapsedTime(&gpu_time, gstart, gstop); printf("Thrust version has finished, it took %f ms\n",gpu_time ); cudaEventDestroy(gstart); //cleaning up a bit cudaEventDestroy(gstop); checkCudaError(); printf("End kernel\n"); cudaMemcpy(vector,d_result,size * sizeof(int), cudaMemcpyDeviceToHost); cudaMemcpy(vectorCheck,d_resultCheck,size * sizeof(int), cudaMemcpyDeviceToHost); /* for (int i=0; i<size; i++) printf(" %d ", vectorCheck[i]); printf("\n"); */ /* for (int i=0; i<size; i++) printf(" %d ", vector[i]); printf("\n"); */ printf("Difference %d\n", vectorsDifference(vector,vectorCheck,size)); if(areVectorsEqual(vector,vectorCheck,size) == 0) printf("Vectors are equal!!\n"); else printf("Vectors are NOT equal :( \n"); checkCudaError(); cudaFree(d_vector); free(vector); cudaFree(d_result); cudaFree(d_resultCheck); cudaFree(d_vectorCheck); free(vectorCheck); return 0; }
CUdeviceptr presum(CUdeviceptr *d_Input, uint arrayLength) { uint N = 0; CUdeviceptr d_Output; struct timeval start,stop; gettimeofday(&start, NULL); initScan(); gettimeofday(&stop, NULL); if(arrayLength <= MAX_SHORT_ARRAY_SIZE && arrayLength > MIN_SHORT_ARRAY_SIZE) { for(uint i = 4; i<=MAX_SHORT_ARRAY_SIZE ; i<<=1){ if(arrayLength <= i){ N = i; break; } } checkCudaErrors(cudaMalloc((void **)&d_Output, N * sizeof(uint))); checkCudaErrors(cudaDeviceSynchronize()); scanExclusiveShort((uint *)d_Output, (uint *)(*d_Input), N); //szWorkgroup = scanExclusiveShort((uint *)d_Output, (uint *)d_Input, 1, N); checkCudaErrors(cudaDeviceSynchronize()); }else if(arrayLength <= MAX_LARGE_ARRAY_SIZE) { N = MAX_SHORT_ARRAY_SIZE * iDivUp(arrayLength,MAX_SHORT_ARRAY_SIZE); checkCudaErrors(cudaMalloc((void **)&d_Output, N * sizeof(uint))); checkCudaErrors(cudaDeviceSynchronize()); scanExclusiveLarge((uint *)d_Output, (uint *)(*d_Input), N); checkCudaErrors(cudaDeviceSynchronize()); }else if(arrayLength <= MAX_LL_SIZE) { N = MAX_LARGE_ARRAY_SIZE * iDivUp(arrayLength,MAX_LARGE_ARRAY_SIZE); printf("N = %d\n",N); checkCudaErrors(cudaMalloc((void **)&d_Output, N * sizeof(uint))); checkCudaErrors(cudaDeviceSynchronize()); scanExclusiveLL((uint *)d_Output, (uint *)(*d_Input), N); checkCudaErrors(cudaDeviceSynchronize()); }else{ cuMemFree(d_Output); closeScan(); return NULL; } closeScan(); cuMemFree(*d_Input); *d_Input = d_Output; printf("inside scan time:\n"); printDiff(start,stop); return d_Output; }