void CRoutine_Sum_NVidia::BuildKernels()
{
    int whichKernel = 6;
    int numBlocks = 0;
    int numThreads = 0;
#ifdef __APPLE__
    int maxThreads = 64;
#else
    int maxThreads = 128;
#endif
    int maxBlocks = 64;
    int cpuFinalThreshold = 1;

    getNumBlocksAndThreads(whichKernel, mBufferSize, maxBlocks, maxThreads, numBlocks, numThreads);
    BuildReductionKernel(whichKernel, numThreads, isPow2(mBufferSize));
    mBlocks.push_back(numBlocks);
    mThreads.push_back(numThreads);
    mReductionPasses += 1;

    int s = numBlocks;
    int threads = 0, blocks = 0;
    int kernel = (whichKernel == 6) ? 5 : whichKernel;

    while (s > cpuFinalThreshold)
    {
        getNumBlocksAndThreads(kernel, s, maxBlocks, maxThreads, blocks, threads);
        BuildReductionKernel(kernel, threads, isPow2(s));
        mBlocks.push_back(blocks);
        mThreads.push_back(threads);

        s = (s + (threads * 2 - 1)) / (threads * 2);
        mReductionPasses += 1;
    }

    mFinalS = s;
}
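// The helpers used throughout this section (getNumBlocksAndThreads, isPow2,
// nextPow2, MIN) are defined elsewhere. A minimal sketch follows, assuming
// they match the NVIDIA reduction sample this code derives from: kernels 0-2
// consume one element per thread, kernels 3+ consume two (first add during
// load), and kernel 6 additionally caps the block count at maxBlocks. This
// is why the size recurrences below divide by threads or threads*2.
unsigned int nextPow2(unsigned int x)
{
    --x;
    x |= x >> 1;  x |= x >> 2;  x |= x >> 4;
    x |= x >> 8;  x |= x >> 16;
    return ++x;
}

bool isPow2(unsigned int x)
{
    return ((x & (x - 1)) == 0);
}

void getNumBlocksAndThreads(int whichKernel, int n, int maxBlocks,
                            int maxThreads, int &blocks, int &threads)
{
    if (whichKernel < 3)
    {
        // one element per thread
        threads = (n < maxThreads) ? nextPow2(n) : maxThreads;
        blocks  = (n + threads - 1) / threads;
    }
    else
    {
        // two elements per thread: first add performed during the global load
        threads = (n < maxThreads * 2) ? nextPow2((n + 1) / 2) : maxThreads;
        blocks  = (n + (threads * 2 - 1)) / (threads * 2);
    }

    if (whichKernel == 6)
    {
        blocks = MIN(maxBlocks, blocks);
    }
}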
bool runTest(int argc, char **argv, ReduceType datatype)
{
    int size = 1 << 24;    // number of elements to reduce
    int maxThreads = 256;  // number of threads per block
    int whichKernel = 6;
    int maxBlocks = 64;
    bool cpuFinalReduction = false;
    int cpuFinalThreshold = 1;

    if (checkCmdLineFlag(argc, (const char **) argv, "n"))
    {
        size = getCmdLineArgumentInt(argc, (const char **) argv, "n");
    }

    if (checkCmdLineFlag(argc, (const char **) argv, "threads"))
    {
        maxThreads = getCmdLineArgumentInt(argc, (const char **) argv, "threads");
    }

    if (checkCmdLineFlag(argc, (const char **) argv, "kernel"))
    {
        whichKernel = getCmdLineArgumentInt(argc, (const char **) argv, "kernel");
    }

    if (checkCmdLineFlag(argc, (const char **) argv, "maxblocks"))
    {
        maxBlocks = getCmdLineArgumentInt(argc, (const char **) argv, "maxblocks");
    }

    printf("%d elements\n", size);
    printf("%d threads (max)\n", maxThreads);

    cpuFinalReduction = checkCmdLineFlag(argc, (const char **) argv, "cpufinal");

    if (checkCmdLineFlag(argc, (const char **) argv, "cputhresh"))
    {
        cpuFinalThreshold = getCmdLineArgumentInt(argc, (const char **) argv, "cputhresh");
    }

    bool runShmoo = checkCmdLineFlag(argc, (const char **) argv, "shmoo");

    if (runShmoo)
    {
        shmoo<T>(1, 33554432, maxThreads, maxBlocks, datatype);
    }
    else
    {
        // create random input data on CPU
        unsigned int bytes = size * sizeof(T);
        T *h_idata = (T *) malloc(bytes);

        for (int i = 0; i < size; i++)
        {
            // Keep the numbers small so we don't get truncation error in the sum
            if (datatype == REDUCE_INT)
            {
                h_idata[i] = (T)(rand() & 0xFF);
            }
            else
            {
                h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX;
            }
        }

        int numBlocks = 0;
        int numThreads = 0;
        getNumBlocksAndThreads(whichKernel, size, maxBlocks, maxThreads, numBlocks, numThreads);

        if (numBlocks == 1)
        {
            cpuFinalThreshold = 1;
        }

        // allocate mem for the result on host side
        T *h_odata = (T *) malloc(numBlocks * sizeof(T));

        printf("%d blocks\n\n", numBlocks);

        // allocate device memory and data
        T *d_idata = NULL;
        T *d_odata = NULL;
        checkCudaErrors(cudaMalloc((void **) &d_idata, bytes));
        checkCudaErrors(cudaMalloc((void **) &d_odata, numBlocks * sizeof(T)));

        // copy data directly to device memory
        checkCudaErrors(cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice));
        checkCudaErrors(cudaMemcpy(d_odata, h_idata, numBlocks * sizeof(T), cudaMemcpyHostToDevice));

        // warm-up
        reduce<T>(size, numThreads, numBlocks, whichKernel, d_idata, d_odata);

        int testIterations = 100;

        StopWatchInterface *timer = 0;
        sdkCreateTimer(&timer);

        T gpu_result = 0;
        gpu_result = benchmarkReduce<T>(size, numThreads, numBlocks, maxThreads, maxBlocks,
                                        whichKernel, testIterations, cpuFinalReduction,
                                        cpuFinalThreshold, timer, h_odata, d_idata, d_odata);

        double reduceTime = sdkGetAverageTimerValue(&timer) * 1e-3;
        printf("Reduction, Throughput = %.4f GB/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %d, Workgroup = %u\n",
               1.0e-9 * ((double)bytes) / reduceTime, reduceTime, size, 1, numThreads);

        // compute reference solution
        T cpu_result = reduceCPU<T>(h_idata, size);

        int precision = 0;
        double threshold = 0;
        double diff = 0;

        if (datatype == REDUCE_INT)
        {
            printf("\nGPU result = %d\n", (int)gpu_result);
            printf("CPU result = %d\n\n", (int)cpu_result);
        }
        else
        {
            if (datatype == REDUCE_FLOAT)
            {
                precision = 8;
                threshold = 1e-8 * size;
            }
            else
            {
                precision = 12;
                threshold = 1e-12 * size;
            }

            printf("\nGPU result = %.*f\n", precision, (double)gpu_result);
            printf("CPU result = %.*f\n\n", precision, (double)cpu_result);

            diff = fabs((double)gpu_result - (double)cpu_result);
        }

        // cleanup
        sdkDeleteTimer(&timer);
        free(h_idata);
        free(h_odata);
        checkCudaErrors(cudaFree(d_idata));
        checkCudaErrors(cudaFree(d_odata));

        if (datatype == REDUCE_INT)
        {
            return (gpu_result == cpu_result);
        }
        else
        {
            return (diff < threshold);
        }
    }

    return true;
}
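// reduceCPU() is referenced above but not listed in this section. A minimal
// sketch, assuming it matches the NVIDIA SDK reference implementation: a
// Kahan (compensated) summation, so the CPU baseline stays accurate even for
// large float arrays where a naive running sum would lose low-order bits.
template <class T>
T reduceCPU(T *data, int size)
{
    T sum = data[0];
    T c = (T)0.0;  // running compensation for lost low-order bits

    for (int i = 1; i < size; i++)
    {
        T y = data[i] - c;
        T t = sum + y;
        c = (t - sum) - y;  // (t - sum) recovers the high part; subtracting y leaves the error
        sum = t;
    }

    return sum;
}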
void shmoo(int minN, int maxN, int maxThreads, int maxBlocks, ReduceType datatype)
{
    // create random input data on CPU
    unsigned int bytes = maxN * sizeof(T);
    T *h_idata = (T *) malloc(bytes);

    for (int i = 0; i < maxN; i++)
    {
        // Keep the numbers small so we don't get truncation error in the sum
        if (datatype == REDUCE_INT)
        {
            h_idata[i] = (T)(rand() & 0xFF);
        }
        else
        {
            h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX;
        }
    }

    int maxNumBlocks = MIN(maxN / maxThreads, MAX_BLOCK_DIM_SIZE);

    // allocate mem for the result on host side
    T *h_odata = (T *) malloc(maxNumBlocks * sizeof(T));

    // allocate device memory and data
    T *d_idata = NULL;
    T *d_odata = NULL;
    checkCudaErrors(cudaMalloc((void **) &d_idata, bytes));
    checkCudaErrors(cudaMalloc((void **) &d_odata, maxNumBlocks * sizeof(T)));

    // copy data directly to device memory
    checkCudaErrors(cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_odata, h_idata, maxNumBlocks * sizeof(T), cudaMemcpyHostToDevice));

    // warm-up
    for (int kernel = 0; kernel < 7; kernel++)
    {
        reduce<T>(maxN, maxThreads, maxNumBlocks, kernel, d_idata, d_odata);
    }

    int testIterations = 100;

    StopWatchInterface *timer = 0;
    sdkCreateTimer(&timer);

    // print headers
    printf("Time in milliseconds for various numbers of elements for each kernel\n\n\n");
    printf("Kernel");

    for (int i = minN; i <= maxN; i *= 2)
    {
        printf(", %d", i);
    }

    for (int kernel = 0; kernel < 7; kernel++)
    {
        printf("\n%d", kernel);

        for (int i = minN; i <= maxN; i *= 2)
        {
            sdkResetTimer(&timer);

            int numBlocks = 0;
            int numThreads = 0;
            getNumBlocksAndThreads(kernel, i, maxBlocks, maxThreads, numBlocks, numThreads);

            float reduceTime;

            if (numBlocks <= MAX_BLOCK_DIM_SIZE)
            {
                benchmarkReduce(i, numThreads, numBlocks, maxThreads, maxBlocks, kernel,
                                testIterations, false, 1, timer, h_odata, d_idata, d_odata);
                reduceTime = sdkGetAverageTimerValue(&timer);
            }
            else
            {
                reduceTime = -1.0;
            }

            printf(", %.5f", reduceTime);
        }
    }

    // cleanup
    sdkDeleteTimer(&timer);
    free(h_idata);
    free(h_odata);
    checkCudaErrors(cudaFree(d_idata));
    checkCudaErrors(cudaFree(d_odata));
}
T benchmarkReduce(int n, int numThreads, int numBlocks, int maxThreads, int maxBlocks,
                  int whichKernel, int testIterations, bool cpuFinalReduction,
                  int cpuFinalThreshold, StopWatchInterface *timer,
                  T *h_odata, T *d_idata, T *d_odata)
{
    T gpu_result = 0;
    bool needReadBack = true;

    for (int i = 0; i < testIterations; ++i)
    {
        gpu_result = 0;

        cudaDeviceSynchronize();
        sdkStartTimer(&timer);

        // execute the kernel
        reduce<T>(n, numThreads, numBlocks, whichKernel, d_idata, d_odata);

        // check if kernel execution generated an error
        getLastCudaError("Kernel execution failed");

        if (cpuFinalReduction)
        {
            // sum partial sums from each block on CPU

            // copy result from device to host
            checkCudaErrors(cudaMemcpy(h_odata, d_odata, numBlocks * sizeof(T), cudaMemcpyDeviceToHost));

            for (int j = 0; j < numBlocks; j++)
            {
                gpu_result += h_odata[j];
            }

            needReadBack = false;
        }
        else
        {
            // sum partial block sums on GPU, in place over d_odata
            int s = numBlocks;
            int kernel = whichKernel;

            while (s > cpuFinalThreshold)
            {
                int threads = 0, blocks = 0;
                getNumBlocksAndThreads(kernel, s, maxBlocks, maxThreads, blocks, threads);

                reduce<T>(s, threads, blocks, kernel, d_odata, d_odata);

                // kernels 0-2 consume one element per thread; kernels 3+ consume two
                if (kernel < 3)
                {
                    s = (s + threads - 1) / threads;
                }
                else
                {
                    s = (s + (threads * 2 - 1)) / (threads * 2);
                }
            }

            if (s > 1)
            {
                // copy result from device to host
                checkCudaErrors(cudaMemcpy(h_odata, d_odata, s * sizeof(T), cudaMemcpyDeviceToHost));

                for (int j = 0; j < s; j++)
                {
                    gpu_result += h_odata[j];
                }

                needReadBack = false;
            }
        }

        cudaDeviceSynchronize();
        sdkStopTimer(&timer);
    }

    if (needReadBack)
    {
        // copy final sum from device to host
        checkCudaErrors(cudaMemcpy(&gpu_result, d_odata, sizeof(T), cudaMemcpyDeviceToHost));
    }

    return gpu_result;
}
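// The device kernels launched through reduce<T>() are not listed in this
// section. A minimal sketch of the kernel-6-style strategy they implement
// (the name reduceSketch and the non-templated form are illustrative, not
// the SDK's): each thread grid-strides over the input, adding pairs during
// the load, then the block performs a shared-memory tree reduction and
// writes one partial sum, which the host passes above then fold further.
__global__ void reduceSketch(const int *g_idata, int *g_odata, unsigned int n)
{
    extern __shared__ int sdata[];  // sized at launch: threads * sizeof(int)

    unsigned int tid      = threadIdx.x;
    unsigned int i        = blockIdx.x * blockDim.x * 2 + threadIdx.x;
    unsigned int gridSize = blockDim.x * 2 * gridDim.x;

    // grid-stride load with "first add during load"; this halving of the
    // element count is why the host divides s by threads*2 for kernels >= 3
    int mySum = 0;
    while (i < n)
    {
        mySum += g_idata[i];
        if (i + blockDim.x < n)
            mySum += g_idata[i + blockDim.x];
        i += gridSize;
    }
    sdata[tid] = mySum;
    __syncthreads();

    // shared-memory tree reduction within the block
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
    {
        if (tid < s)
            sdata[tid] += sdata[tid + s];
        __syncthreads();
    }

    // thread 0 writes this block's partial sum
    if (tid == 0)
        g_odata[blockIdx.x] = sdata[0];
}

// illustrative launch, mirroring the host code above:
//   reduceSketch<<<numBlocks, numThreads, numThreads * sizeof(int)>>>(d_idata, d_odata, n);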
bool runTestMax(int argc, char **argv, ReduceType datatype)
{
    int size = 1 << 24;    // number of elements to reduce
    int maxThreads = 256;  // number of threads per block
    int whichKernel = 6;
    int maxBlocks = 64;
    bool cpuFinalReduction = false;
    int cpuFinalThreshold = 1;

    cutGetCmdLineArgumenti(argc, (const char **) argv, "n", &size);
    cutGetCmdLineArgumenti(argc, (const char **) argv, "threads", &maxThreads);
    cutGetCmdLineArgumenti(argc, (const char **) argv, "kernel", &whichKernel);
    cutGetCmdLineArgumenti(argc, (const char **) argv, "maxblocks", &maxBlocks);

    shrLog("METHOD: MAX\n");
    shrLog("%d elements\n", size);
    shrLog("%d threads (max)\n", maxThreads);

    cpuFinalReduction = (cutCheckCmdLineFlag(argc, (const char **) argv, "cpufinal") == CUTTrue);
    cutGetCmdLineArgumenti(argc, (const char **) argv, "cputhresh", &cpuFinalThreshold);

    bool runShmoo = (cutCheckCmdLineFlag(argc, (const char **) argv, "shmoo") == CUTTrue);

    if (runShmoo)
    {
        shmoo<T>(1, 33554432, maxThreads, maxBlocks, datatype);
    }
    else
    {
        // create random input data on CPU
        unsigned int bytes = size * sizeof(T);
        T *h_idata = (T *) malloc(bytes);

        for (int i = 0; i < size; i++)
        {
            // Keep the numbers small so we don't get truncation error in the sum
            if (datatype == REDUCE_INT)
                h_idata[i] = (T)(rand() & 0xFF);
            else
                h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX;
        }

        int numBlocks = 0;
        int numThreads = 0;
        getNumBlocksAndThreads(whichKernel, size, maxBlocks, maxThreads, numBlocks, numThreads);

        if (numBlocks == 1)
            cpuFinalThreshold = 1;

        // allocate mem for the result on host side
        T *h_odata = (T *) malloc(numBlocks * sizeof(T));

        shrLog("%d blocks\n\n", numBlocks);

        // allocate device memory and data
        T *d_idata = NULL;
        T *d_odata = NULL;
        cutilSafeCallNoSync(cudaMalloc((void **) &d_idata, bytes));
        cutilSafeCallNoSync(cudaMalloc((void **) &d_odata, numBlocks * sizeof(T)));

        // copy data directly to device memory
        cutilSafeCallNoSync(cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice));
        cutilSafeCallNoSync(cudaMemcpy(d_odata, h_idata, numBlocks * sizeof(T), cudaMemcpyHostToDevice));

        // warm-up
        maxreduce<T>(size, numThreads, numBlocks, whichKernel, d_idata, d_odata);

        int testIterations = 100;

        unsigned int timer = 0;
        cutilCheckError(cutCreateTimer(&timer));

        T gpu_result = 0;
        gpu_result = benchmarkReduceMax<T>(size, numThreads, numBlocks, maxThreads, maxBlocks,
                                           whichKernel, testIterations, cpuFinalReduction,
                                           cpuFinalThreshold, timer, h_odata, d_idata, d_odata);

        double reduceTime = cutGetAverageTimerValue(timer) * 1e-3;
        shrLogEx(LOGBOTH | MASTER, 0, "Reduction, Throughput = %.4f GB/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %d, Workgroup = %u\n",
                 1.0e-9 * ((double)bytes) / reduceTime, reduceTime, size, 1, numThreads);

        // compute reference solution
        T cpu_result = maxreduceCPU<T>(h_idata, size);

        double threshold = 1e-12;
        double diff = 0;

        if (datatype == REDUCE_INT)
        {
            shrLog("\nGPU result = %d\n", gpu_result);
            shrLog("CPU result = %d\n\n", cpu_result);
        }
        else
        {
            shrLog("\nGPU result = %f\n", gpu_result);
            shrLog("CPU result = %f\n\n", cpu_result);

            if (datatype == REDUCE_FLOAT)
                threshold = 1e-8 * size;

            diff = fabs((double)gpu_result - (double)cpu_result);
        }

        // cleanup
        cutilCheckError(cutDeleteTimer(timer));
        free(h_idata);
        free(h_odata);

        cutilSafeCallNoSync(cudaFree(d_idata));
        cutilSafeCallNoSync(cudaFree(d_odata));

        if (datatype == REDUCE_INT)
        {
            return (gpu_result == cpu_result);
        }
        else
        {
            return (diff < threshold);
        }
    }

    return true;
}
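// maxreduceCPU() is the host reference for the MAX method above and is not
// listed in this section. A minimal sketch, assumed by analogy with
// reduceCPU (no compensation is needed since max is exact):
template <class T>
T maxreduceCPU(T *data, int size)
{
    T result = data[0];

    for (int i = 1; i < size; i++)
    {
        if (data[i] > result)
            result = data[i];
    }

    return result;
}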
void shmoo(int minN, int maxN, int maxThreads, int maxBlocks, ReduceType datatype)
{
    fprintf(stderr, "Shmoo wasn't implemented in this modified kernel!\n");
    exit(1);

    // NOTE: everything below is unreachable; it is the stock shmoo body kept
    // from the original sample (it still calls sumreduce/benchmarkReduceSum).

    // create random input data on CPU
    unsigned int bytes = maxN * sizeof(T);
    T *h_idata = (T *) malloc(bytes);

    for (int i = 0; i < maxN; i++)
    {
        // Keep the numbers small so we don't get truncation error in the sum
        if (datatype == REDUCE_INT)
            h_idata[i] = (T)(rand() & 0xFF);
        else
            h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX;
    }

    int maxNumBlocks = MIN(maxN / maxThreads, MAX_BLOCK_DIM_SIZE);

    // allocate mem for the result on host side
    T *h_odata = (T *) malloc(maxNumBlocks * sizeof(T));

    // allocate device memory and data
    T *d_idata = NULL;
    T *d_odata = NULL;
    cutilSafeCallNoSync(cudaMalloc((void **) &d_idata, bytes));
    cutilSafeCallNoSync(cudaMalloc((void **) &d_odata, maxNumBlocks * sizeof(T)));

    // copy data directly to device memory
    cutilSafeCallNoSync(cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice));
    cutilSafeCallNoSync(cudaMemcpy(d_odata, h_idata, maxNumBlocks * sizeof(T), cudaMemcpyHostToDevice));

    // warm-up
    for (int kernel = 0; kernel < 7; kernel++)
    {
        sumreduce<T>(maxN, maxThreads, maxNumBlocks, kernel, d_idata, d_odata);
    }

    int testIterations = 100;

    unsigned int timer = 0;
    cutilCheckError(cutCreateTimer(&timer));

    // print headers
    shrLog("Time in milliseconds for various numbers of elements for each kernel\n\n\n");
    shrLog("Kernel");

    for (int i = minN; i <= maxN; i *= 2)
    {
        shrLog(", %d", i);
    }

    for (int kernel = 0; kernel < 7; kernel++)
    {
        shrLog("\n%d", kernel);

        for (int i = minN; i <= maxN; i *= 2)
        {
            cutResetTimer(timer);

            int numBlocks = 0;
            int numThreads = 0;
            getNumBlocksAndThreads(kernel, i, maxBlocks, maxThreads, numBlocks, numThreads);

            float reduceTime;

            if (numBlocks <= MAX_BLOCK_DIM_SIZE)
            {
                benchmarkReduceSum(i, numThreads, numBlocks, maxThreads, maxBlocks, kernel,
                                   testIterations, false, 1, timer, h_odata, d_idata, d_odata);
                reduceTime = cutGetAverageTimerValue(timer);
            }
            else
            {
                reduceTime = -1.0;
            }

            shrLog(", %.5f", reduceTime);
        }
    }

    // cleanup
    cutilCheckError(cutDeleteTimer(timer));
    free(h_idata);
    free(h_odata);
    cutilSafeCallNoSync(cudaFree(d_idata));
    cutilSafeCallNoSync(cudaFree(d_odata));
}
////////////////////////////////////////////////////////////////////////////////
// The main function which runs the reduction test.
////////////////////////////////////////////////////////////////////////////////
bool runTest(int argc, char **argv, ReduceType datatype)
{
    //int size = 1<<24;    // number of elements to reduce
    int size = 64 * 256;   // number of elements to reduce
    int maxThreads = 256;  // number of threads per block
    int whichKernel = 6;
    int maxBlocks = 64;
    bool cpuFinalReduction = false;
    int cpuFinalThreshold = 1;

    // Command-line handling is disabled in this symbolic-execution build.
    /*
    if (checkCmdLineFlag(argc, (const char **) argv, "n"))
    {
        size = getCmdLineArgumentInt(argc, (const char **) argv, "n");
    }

    if (checkCmdLineFlag(argc, (const char **) argv, "threads"))
    {
        maxThreads = getCmdLineArgumentInt(argc, (const char **) argv, "threads");
    }

    if (checkCmdLineFlag(argc, (const char **) argv, "kernel"))
    {
        whichKernel = getCmdLineArgumentInt(argc, (const char **) argv, "kernel");
    }

    if (checkCmdLineFlag(argc, (const char **) argv, "maxblocks"))
    {
        maxBlocks = getCmdLineArgumentInt(argc, (const char **) argv, "maxblocks");
    }
    */

    printf("%d elements\n", size);
    printf("%d threads (max)\n", maxThreads);

    //cpuFinalReduction = checkCmdLineFlag(argc, (const char **) argv, "cpufinal");
    /*
    if (checkCmdLineFlag(argc, (const char **) argv, "cputhresh"))
    {
        cpuFinalThreshold = getCmdLineArgumentInt(argc, (const char **) argv, "cputhresh");
    }
    */

    //bool runShmoo = checkCmdLineFlag(argc, (const char **) argv, "shmoo");
    /*
    if (runShmoo)
    {
        shmoo<T>(1, 33554432, maxThreads, maxBlocks, datatype);
    }
    else
    {
    */

    // create random input data on CPU
    unsigned int bytes = size * sizeof(int);
    int *h_idata = (int *) malloc(bytes);

#ifdef _SYM
    klee_make_symbolic(h_idata, bytes, "h_idata_input");
#else
    for (int i = 0; i < size; i++)
    {
        // Keep the numbers small so we don't get truncation error in the sum
        h_idata[i] = (int)(rand() & 0xFF);
    }
#endif

    int numBlocks = maxBlocks;
    int numThreads = maxThreads;

    if (numBlocks == 1)
    {
        cpuFinalThreshold = 1;
    }

    // allocate mem for the result on host side
    int *h_odata = (int *) malloc(numBlocks * sizeof(int));

    printf("%d blocks\n\n", numBlocks);

    // allocate device memory and data
    int *d_idata = NULL;
    int *d_odata = NULL;

    //checkCudaErrors(cudaMalloc((void **) &d_idata, bytes));
    //checkCudaErrors(cudaMalloc((void **) &d_odata, numBlocks*sizeof(T)));
    cudaMalloc((void **) &d_idata, bytes);
    cudaMalloc((void **) &d_odata, numBlocks * sizeof(int));

    // copy data directly to device memory
    //checkCudaErrors(cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice));
    //checkCudaErrors(cudaMemcpy(d_odata, h_idata, numBlocks*sizeof(T), cudaMemcpyHostToDevice));
    cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_odata, h_idata, numBlocks * sizeof(int), cudaMemcpyHostToDevice);

    unsigned int i = 0;

    // Exactly one reduction kernel is selected at compile time via -D_REDn.
#ifdef _RED0
    // warm-up
    i = 0;
    numBlocks = maxBlocks;
    numThreads = maxThreads;
    //getNumBlocksAndThreads(i, size, maxBlocks, maxThreads, numBlocks, numThreads);
    reduce<int>(size, numThreads, numBlocks, i, d_idata, d_odata);
#elif defined _RED1
    i = 1;
    numBlocks = maxBlocks;
    numThreads = maxThreads;
    //getNumBlocksAndThreads(i, size, maxBlocks, maxThreads, numBlocks, numThreads);
    reduce<int>(size, numThreads, numBlocks, i, d_idata, d_odata);
#elif defined _RED2
    i = 2;
    numBlocks = maxBlocks;
    numThreads = maxThreads;
    //getNumBlocksAndThreads(i, size, maxBlocks, maxThreads, numBlocks, numThreads);
    reduce<int>(size, numThreads, numBlocks, i, d_idata, d_odata);
#elif defined _RED3
    i = 3;
    numBlocks = maxBlocks / 2;
    numThreads = maxThreads;
    //getNumBlocksAndThreads(i, size, maxBlocks, maxThreads, numBlocks, numThreads);
    reduce<int>(size, numThreads, numBlocks, i, d_idata, d_odata);
#elif defined _RED4
    i = 4;
    numBlocks = maxBlocks / 2;
    numThreads = maxThreads;
    getNumBlocksAndThreads(i, size, maxBlocks, maxThreads, numBlocks, numThreads);
    reduce<int>(size, numThreads, numBlocks, i, d_idata, d_odata);
#elif defined _RED5
    i = 5;
    numBlocks = maxBlocks / 2;
    numThreads = maxThreads;
    //getNumBlocksAndThreads(i, size, maxBlocks, maxThreads, numBlocks, numThreads);
    reduce<int>(size, numThreads, numBlocks, i, d_idata, d_odata);
#else
    i = 6;
    numBlocks = maxBlocks / 2;
    numThreads = maxThreads;
    //getNumBlocksAndThreads(i, size, maxBlocks, maxThreads, numBlocks, numThreads);
    reduce<int>(size, numThreads, numBlocks, i, d_idata, d_odata);
#endif

    // Timing and the CPU reference check are disabled in this build.
    //StopWatchInterface *timer = 0;
    //sdkCreateTimer(&timer);

    //T gpu_result = 0;
    //gpu_result = benchmarkReduce<T>(size, numThreads, numBlocks, maxThreads, maxBlocks,
    //                                whichKernel, testIterations, cpuFinalReduction,
    //                                cpuFinalThreshold, timer,
    //                                h_odata, d_idata, d_odata);

    //double reduceTime = sdkGetAverageTimerValue(&timer) * 1e-3;
    //printf("Reduction, Throughput = %.4f GB/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %d, Workgroup = %u\n",
    //       1.0e-9 * ((double)bytes)/reduceTime, reduceTime, size, 1, numThreads);

    // compute reference solution
    //T cpu_result = reduceCPU<T>(h_idata, size);

    double threshold = 1e-12;
    double diff = 0;

    /*
    if (datatype == REDUCE_INT)
    {
        printf("\nGPU result = %d\n", gpu_result);
        //printf("CPU result = %d\n\n", cpu_result);
    }
    else
    {
        printf("\nGPU result = %f\n", gpu_result);
        //printf("CPU result = %f\n\n", cpu_result);

        if (datatype == REDUCE_FLOAT)
        {
            threshold = 1e-8 * size;
        }

        diff = fabs((double)gpu_result - (double)cpu_result);
    }
    */

    // cleanup
    //sdkDeleteTimer(&timer);
    free(h_idata);
    free(h_odata);

    //checkCudaErrors(cudaFree(d_idata));
    //checkCudaErrors(cudaFree(d_odata));
    cudaFree(d_idata);
    cudaFree(d_odata);

    /*
    if (datatype == REDUCE_INT)
    {
        return (gpu_result == cpu_result);
    }
    else
    {
        return (diff < threshold);
    }
    */
    //}

    return true;
}
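// Under -D_SYM the harness above replaces the random input with a symbolic
// buffer so a symbolic-execution engine such as KLEE can explore the
// reduction over all possible inputs. The klee_make_symbolic declaration
// below is KLEE's actual API; the tiny standalone driver is an illustrative
// sketch (not part of this file) of how such a harness checks a property.
#include <assert.h>
#include <stddef.h>

void klee_make_symbolic(void *addr, size_t nbytes, const char *name);

int main(void)
{
    int h_idata[8];
    klee_make_symbolic(h_idata, sizeof(h_idata), "h_idata_input");

    // property: a two-block partial-sum reduction matches a straight sum
    int straight = 0, partial[2] = {0, 0};
    for (int i = 0; i < 8; i++)
        straight += h_idata[i];
    for (int i = 0; i < 8; i++)
        partial[i / 4] += h_idata[i];

    // KLEE explores all paths and reports any input violating the assertion
    assert(straight == partial[0] + partial[1]);
    return 0;
}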
bool runTest(int argc, const char **argv, ReduceType datatype)
{
    int size = 1 << 24;  // number of elements to reduce
    int maxThreads;

    // Build (and immediately discard) one kernel; compiling it sets the
    // smallBlock flag consulted below to pick the per-block thread count.
    cl_kernel reductionKernel = getReductionKernel(datatype, 0, 64, 1);
    clReleaseKernel(reductionKernel);

    if (smallBlock)
        maxThreads = 64;   // number of threads per block
    else
        maxThreads = 128;

    int whichKernel = 6;
    int maxBlocks = 64;
    bool cpuFinalReduction = false;
    int cpuFinalThreshold = 1;

    shrGetCmdLineArgumenti(argc, (const char **) argv, "n", &size);
    shrGetCmdLineArgumenti(argc, (const char **) argv, "threads", &maxThreads);
    shrGetCmdLineArgumenti(argc, (const char **) argv, "kernel", &whichKernel);
    shrGetCmdLineArgumenti(argc, (const char **) argv, "maxblocks", &maxBlocks);

    shrLog(" %d elements\n", size);
    shrLog(" %d threads (max)\n", maxThreads);

    cpuFinalReduction = (shrCheckCmdLineFlag(argc, (const char **) argv, "cpufinal") == shrTRUE);
    shrGetCmdLineArgumenti(argc, (const char **) argv, "cputhresh", &cpuFinalThreshold);

    bool runShmoo = (shrCheckCmdLineFlag(argc, (const char **) argv, "shmoo") == shrTRUE);

#ifdef GPU_PROFILING
    if (runShmoo)
    {
        shmoo<T>(1, 33554432, maxThreads, maxBlocks, datatype);
        return true;
    }
    else
#endif
    {
        // create random input data on CPU
        unsigned int bytes = size * sizeof(T);
        T *h_idata = (T *) malloc(bytes);

        for (int i = 0; i < size; i++)
        {
            // Keep the numbers small so we don't get truncation error in the sum
            if (datatype == REDUCE_INT)
                h_idata[i] = (T)(rand() & 0xFF);
            else
                h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX;
        }

        int numBlocks = 0;
        int numThreads = 0;
        getNumBlocksAndThreads(whichKernel, size, maxBlocks, maxThreads, numBlocks, numThreads);

        if (numBlocks == 1)
            cpuFinalThreshold = 1;

        shrLog(" %d blocks\n\n", numBlocks);

        // allocate mem for the result on host side
        T *h_odata = (T *) malloc(numBlocks * sizeof(T));

        // allocate device memory and data
        cl_mem d_idata = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                        bytes, h_idata, NULL);
        cl_mem d_odata = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE,
                                        numBlocks * sizeof(T), NULL, NULL);

        int testIterations = 100;
        double dTotalTime = 0.0;

        T gpu_result = 0;
        gpu_result = profileReduce<T>(datatype, size, numThreads, numBlocks, maxThreads, maxBlocks,
                                      whichKernel, testIterations, cpuFinalReduction,
                                      cpuFinalThreshold, &dTotalTime, h_odata, d_idata, d_odata);

#ifdef GPU_PROFILING
        double reduceTime = dTotalTime / (double)testIterations;
        shrLogEx(LOGBOTH | MASTER, 0, "oclReduction, Throughput = %.4f GB/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %d, Workgroup = %u\n",
                 1.0e-9 * ((double)bytes) / reduceTime, reduceTime, size, 1, numThreads);
#endif

        // compute reference solution
        shrLog("\nComparing against Host/C++ computation...\n");
        T cpu_result = reduceCPU<T>(h_idata, size);

        bool passed;

        if (datatype == REDUCE_INT)
        {
            shrLog(" GPU result = %d\n", gpu_result);
            shrLog(" CPU result = %d\n\n", cpu_result);
            passed = (gpu_result == cpu_result);
        }
        else
        {
            shrLog(" GPU result = %.9f\n", gpu_result);
            shrLog(" CPU result = %.9f\n\n", cpu_result);

            double threshold = (datatype == REDUCE_FLOAT) ? 1e-8 * size : 1e-12;
            double diff = fabs((double)gpu_result - (double)cpu_result);  // fabs, not integer abs
            passed = (diff < threshold);
        }

        shrLog("%s\n\n", passed ? "PASSED" : "FAILED");

        // cleanup
        free(h_idata);
        free(h_odata);
        clReleaseMemObject(d_idata);
        clReleaseMemObject(d_odata);

        // return the same criterion that was logged (for floating-point
        // types, exact equality with the CPU sum is too strict)
        return passed;
    }
}
void shmoo(int minN, int maxN, int maxThreads, int maxBlocks, ReduceType datatype)
{
    // create random input data on CPU
    unsigned int bytes = maxN * sizeof(T);
    T *h_idata = (T *) malloc(bytes);

    for (int i = 0; i < maxN; i++)
    {
        // Keep the numbers small so we don't get truncation error in the sum
        if (datatype == REDUCE_INT)
            h_idata[i] = (T)(rand() & 0xFF);
        else
            h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX;
    }

    int maxNumBlocks = MIN(maxN / maxThreads, MAX_BLOCK_DIM_SIZE);

    // allocate mem for the result on host side
    T *h_odata = (T *) malloc(maxNumBlocks * sizeof(T));

    // allocate device memory and data
    cl_mem d_idata = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                    bytes, h_idata, NULL);
    cl_mem d_odata = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE,
                                    maxNumBlocks * sizeof(T), NULL, NULL);

    int testIterations = 100;
    double dTotalTime = 0.0;

    // print headers
    shrLog("Time in seconds for various numbers of elements for each kernel\n");
    shrLog("\n\n");
    shrLog("Kernel");

    for (int i = minN; i <= maxN; i *= 2)
    {
        shrLog(", %d", i);
    }

    for (int kernel = 0; kernel < 7; kernel++)
    {
        shrLog("\n");
        shrLog("%d", kernel);

        for (int i = minN; i <= maxN; i *= 2)
        {
            int numBlocks = 0;
            int numThreads = 0;
            getNumBlocksAndThreads(kernel, i, maxBlocks, maxThreads, numBlocks, numThreads);

            double reduceTime;

            if (numBlocks <= MAX_BLOCK_DIM_SIZE)
            {
                profileReduce(datatype, i, numThreads, numBlocks, maxThreads, maxBlocks,
                              kernel, testIterations, false, 1, &dTotalTime,
                              h_odata, d_idata, d_odata);
                reduceTime = dTotalTime / (double)testIterations;
            }
            else
            {
                reduceTime = -1.0;
            }

            shrLog(", %.4f", reduceTime);
        }
    }

    // cleanup
    free(h_idata);
    free(h_odata);
    clReleaseMemObject(d_idata);
    clReleaseMemObject(d_odata);
}
T profileReduce(ReduceType datatype, cl_int n, int numThreads, int numBlocks,
                int maxThreads, int maxBlocks, int whichKernel, int testIterations,
                bool cpuFinalReduction, int cpuFinalThreshold, double *dTotalTime,
                T *h_odata, cl_mem d_idata, cl_mem d_odata)
{
    T gpu_result = 0;
    bool needReadBack = true;

    cl_kernel finalReductionKernel[10];
    int finalReductionIterations = 0;

    //shrLog("Profile Kernel %d\n", whichKernel);

    cl_kernel reductionKernel = getReductionKernel(datatype, whichKernel, numThreads, isPow2(n));
    clSetKernelArg(reductionKernel, 0, sizeof(cl_mem), (void *) &d_idata);
    clSetKernelArg(reductionKernel, 1, sizeof(cl_mem), (void *) &d_odata);
    clSetKernelArg(reductionKernel, 2, sizeof(cl_int), &n);
    clSetKernelArg(reductionKernel, 3, sizeof(T) * numThreads, NULL);  // per-block local scratch

    if (!cpuFinalReduction)
    {
        // pre-build one kernel per follow-up pass
        int s = numBlocks;
        int threads = 0, blocks = 0;
        int kernel = (whichKernel == 6) ? 5 : whichKernel;

        while (s > cpuFinalThreshold)
        {
            getNumBlocksAndThreads(kernel, s, maxBlocks, maxThreads, blocks, threads);

            finalReductionKernel[finalReductionIterations] = getReductionKernel(datatype, kernel, threads, isPow2(s));
            clSetKernelArg(finalReductionKernel[finalReductionIterations], 0, sizeof(cl_mem), (void *) &d_odata);
            clSetKernelArg(finalReductionKernel[finalReductionIterations], 1, sizeof(cl_mem), (void *) &d_odata);
            // note: as in the original sample, these passes receive the initial
            // n and numThreads rather than the per-pass s and threads
            clSetKernelArg(finalReductionKernel[finalReductionIterations], 2, sizeof(cl_int), &n);
            clSetKernelArg(finalReductionKernel[finalReductionIterations], 3, sizeof(T) * numThreads, NULL);

            if (kernel < 3)
                s = (s + threads - 1) / threads;
            else
                s = (s + (threads * 2 - 1)) / (threads * 2);

            finalReductionIterations++;
        }
    }

    size_t globalWorkSize[1];
    size_t localWorkSize[1];

    for (int i = 0; i < testIterations; ++i)
    {
        gpu_result = 0;
        clFinish(cqCommandQueue);

        if (i > 0)
            shrDeltaT(1);  // restart the interval timer; the first iteration is a warm-up

        // execute the kernel
        globalWorkSize[0] = numBlocks * numThreads;
        localWorkSize[0] = numThreads;

        ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, reductionKernel, 1, 0,
                                          globalWorkSize, localWorkSize, 0, NULL, NULL);

        // check if kernel execution generated an error
        oclCheckError(ciErrNum, CL_SUCCESS);

        if (cpuFinalReduction)
        {
            // sum partial sums from each block on CPU

            // copy result from device to host
            clEnqueueReadBuffer(cqCommandQueue, d_odata, CL_TRUE, 0,
                                numBlocks * sizeof(T), h_odata, 0, NULL, NULL);

            for (int j = 0; j < numBlocks; j++)
            {
                gpu_result += h_odata[j];
            }

            needReadBack = false;
        }
        else
        {
            // sum partial block sums on GPU, in place over d_odata
            int s = numBlocks;
            int kernel = (whichKernel == 6) ? 5 : whichKernel;
            int it = 0;

            while (s > cpuFinalThreshold)
            {
                int threads = 0, blocks = 0;
                getNumBlocksAndThreads(kernel, s, maxBlocks, maxThreads, blocks, threads);

                globalWorkSize[0] = threads * blocks;
                localWorkSize[0] = threads;

                ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, finalReductionKernel[it], 1, 0,
                                                  globalWorkSize, localWorkSize, 0, NULL, NULL);
                oclCheckError(ciErrNum, CL_SUCCESS);

                if (kernel < 3)
                    s = (s + threads - 1) / threads;
                else
                    s = (s + (threads * 2 - 1)) / (threads * 2);

                it++;
            }

            if (s > 1)
            {
                // copy result from device to host
                clEnqueueReadBuffer(cqCommandQueue, d_odata, CL_TRUE, 0,
                                    s * sizeof(T), h_odata, 0, NULL, NULL);

                for (int j = 0; j < s; j++)
                {
                    gpu_result += h_odata[j];
                }

                needReadBack = false;
            }
        }

        clFinish(cqCommandQueue);

        if (i > 0)
            *dTotalTime += shrDeltaT(1);
    }

    if (needReadBack)
    {
        // copy final sum from device to host
        clEnqueueReadBuffer(cqCommandQueue, d_odata, CL_TRUE, 0,
                            sizeof(T), &gpu_result, 0, NULL, NULL);
    }

    // Release the kernels
    clReleaseKernel(reductionKernel);

    if (!cpuFinalReduction)
    {
        for (int it = 0; it < finalReductionIterations; ++it)
        {
            clReleaseKernel(finalReductionKernel[it]);
        }
    }

    return gpu_result;
}
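// getReductionKernel() is defined elsewhere in the sample. A minimal sketch
// of what such a factory plausibly does (the function name suffix, build
// options, and "reduce%d" kernel naming here are assumptions, not the
// sample's verified API): pick the kernel variant by index, and bake the
// block size and pow2 flag into the build so the device code can unroll.
#include <stdio.h>
#include <CL/cl.h>

cl_kernel getReductionKernelSketch(int whichKernel, int blockSize, int nIsPow2,
                                   cl_context ctx, cl_device_id dev,
                                   const char *source)
{
    char options[128];
    sprintf(options, "-D blockSize=%d -D nIsPow2=%d", blockSize, nIsPow2);

    cl_int err;
    cl_program prog = clCreateProgramWithSource(ctx, 1, &source, NULL, &err);
    clBuildProgram(prog, 1, &dev, options, NULL, NULL);

    char kernelName[32];
    sprintf(kernelName, "reduce%d", whichKernel);
    cl_kernel k = clCreateKernel(prog, kernelName, &err);

    // the kernel retains a reference to its program, so releasing here is safe
    clReleaseProgram(prog);
    return k;
}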