int main(int argc, char **argv) { printf("%s Starting...\n\n", argv[0]); //Use command-line specified CUDA device, otherwise use device with highest Gflops/s findCudaDevice(argc, (const char **)argv); uint *d_Input, *d_Output; uint *h_Input, *h_OutputCPU, *h_OutputGPU; StopWatchInterface *hTimer = NULL; const uint N = 13 * 1048576 / 2; printf("Allocating and initializing host arrays...\n"); sdkCreateTimer(&hTimer); h_Input = (uint *)malloc(N * sizeof(uint)); h_OutputCPU = (uint *)malloc(N * sizeof(uint)); h_OutputGPU = (uint *)malloc(N * sizeof(uint)); srand(2009); for (uint i = 0; i < N; i++) { h_Input[i] = rand(); } printf("Allocating and initializing CUDA arrays...\n"); checkCudaErrors(cudaMalloc((void **)&d_Input, N * sizeof(uint))); checkCudaErrors(cudaMalloc((void **)&d_Output, N * sizeof(uint))); checkCudaErrors(cudaMemcpy(d_Input, h_Input, N * sizeof(uint), cudaMemcpyHostToDevice)); printf("Initializing CUDA-C scan...\n\n"); initScan(); int globalFlag = 1; size_t szWorkgroup; const int iCycles = 100; printf("*** Running GPU scan for short arrays (%d identical iterations)...\n\n", iCycles); for (uint arrayLength = MIN_SHORT_ARRAY_SIZE; arrayLength <= MAX_SHORT_ARRAY_SIZE; arrayLength <<= 1) { printf("Running scan for %u elements (%u arrays)...\n", arrayLength, N / arrayLength); checkCudaErrors(cudaDeviceSynchronize()); sdkResetTimer(&hTimer); sdkStartTimer(&hTimer); for (int i = 0; i < iCycles; i++) { szWorkgroup = scanExclusiveShort(d_Output, d_Input, N / arrayLength, arrayLength); } checkCudaErrors(cudaDeviceSynchronize()); sdkStopTimer(&hTimer); double timerValue = 1.0e-3 * sdkGetTimerValue(&hTimer) / iCycles; printf("Validating the results...\n"); printf("...reading back GPU results\n"); checkCudaErrors(cudaMemcpy(h_OutputGPU, d_Output, N * sizeof(uint), cudaMemcpyDeviceToHost)); printf(" ...scanExclusiveHost()\n"); scanExclusiveHost(h_OutputCPU, h_Input, N / arrayLength, arrayLength); // Compare GPU results with CPU results and accumulate error for this test printf(" ...comparing the results\n"); int localFlag = 1; for (uint i = 0; i < N; i++) { if (h_OutputCPU[i] != h_OutputGPU[i]) { localFlag = 0; break; } } // Log message on individual test result, then accumulate to global flag printf(" ...Results %s\n\n", (localFlag == 1) ? "Match" : "DON'T Match !!!"); globalFlag = globalFlag && localFlag; // Data log if (arrayLength == MAX_SHORT_ARRAY_SIZE) { printf("\n"); printf("scan-Short, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * (double)arrayLength/timerValue), timerValue, (unsigned int)arrayLength, 1, (unsigned int)szWorkgroup); printf("\n"); } } printf("***Running GPU scan for large arrays (%u identical iterations)...\n\n", iCycles); for (uint arrayLength = MIN_LARGE_ARRAY_SIZE; arrayLength <= MAX_LARGE_ARRAY_SIZE; arrayLength <<= 1) { printf("Running scan for %u elements (%u arrays)...\n", arrayLength, N / arrayLength); checkCudaErrors(cudaDeviceSynchronize()); sdkResetTimer(&hTimer); sdkStartTimer(&hTimer); for (int i = 0; i < iCycles; i++) { szWorkgroup = scanExclusiveLarge(d_Output, d_Input, N / arrayLength, arrayLength); } checkCudaErrors(cudaDeviceSynchronize()); sdkStopTimer(&hTimer); double timerValue = 1.0e-3 * sdkGetTimerValue(&hTimer) / iCycles; printf("Validating the results...\n"); printf("...reading back GPU results\n"); checkCudaErrors(cudaMemcpy(h_OutputGPU, d_Output, N * sizeof(uint), cudaMemcpyDeviceToHost)); printf("...scanExclusiveHost()\n"); scanExclusiveHost(h_OutputCPU, h_Input, N / arrayLength, arrayLength); // Compare GPU results with CPU results and accumulate error for this test printf(" ...comparing the results\n"); int localFlag = 1; for (uint i = 0; i < N; i++) { if (h_OutputCPU[i] != h_OutputGPU[i]) { localFlag = 0; break; } } // Log message on individual test result, then accumulate to global flag printf(" ...Results %s\n\n", (localFlag == 1) ? "Match" : "DON'T Match !!!"); globalFlag = globalFlag && localFlag; // Data log if (arrayLength == MAX_LARGE_ARRAY_SIZE) { printf("\n"); printf("scan-Large, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * (double)arrayLength/timerValue), timerValue, (unsigned int)arrayLength, 1, (unsigned int)szWorkgroup); printf("\n"); } } printf("Shutting down...\n"); closeScan(); checkCudaErrors(cudaFree(d_Output)); checkCudaErrors(cudaFree(d_Input)); sdkDeleteTimer(&hTimer); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); // pass or fail (cumulative... all tests in the loop) exit(globalFlag ? EXIT_SUCCESS : EXIT_FAILURE); }
int main(int argc, char **argv) { // Start logs //shrSetLogFileName ("scan.txt"); printf("%s Starting...\n\n", argv[0]); //Use command-line specified CUDA device, otherwise use device with highest Gflops/s /* if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) cutilDeviceInit(argc, argv); else cudaSetDevice( cutGetMaxGflopsDeviceId() ); */ uint *d_Input, *d_Output; uint *h_Input, *h_OutputCPU, *h_OutputGPU; uint hTimer; //const uint N = 13 * 1048576 / 2; const uint N = 2048; printf("Allocating and initializing host arrays...\n"); //cutCreateTimer(&hTimer); h_Input = (uint *)malloc(N * sizeof(uint)); h_OutputCPU = (uint *)malloc(N * sizeof(uint)); h_OutputGPU = (uint *)malloc(N * sizeof(uint)); //srand(2009); //for(uint i = 0; i < N; i++) // h_Input[i] = rand(); klee_make_symbolic(h_Input, sizeof(uint) * N, "input"); printf("Allocating and initializing CUDA arrays...\n"); cudaMalloc((void **)&d_Input, N * sizeof(uint)); cudaMalloc((void **)&d_Output, N * sizeof(uint)); cudaMemcpy(d_Input, h_Input, N * sizeof(uint), cudaMemcpyHostToDevice); printf("Initializing CUDA-C scan...\n\n"); initScan(); int globalFlag = 1; size_t szWorkgroup; //const int iCycles = 100; const int iCycles = 5; printf("*** Running GPU scan for short arrays (%d identical iterations)...\n\n", iCycles); for(uint arrayLength = MIN_SHORT_ARRAY_SIZE; arrayLength <= MAX_SHORT_ARRAY_SIZE; arrayLength <<= 1){ printf("Running scan for %u elements (%u arrays)...\n", arrayLength, N / arrayLength); //cutilSafeCall( cudaDeviceSynchronize() ); //cutResetTimer(hTimer); //cutStartTimer(hTimer); //for(int i = 0; i < iCycles; i++) //{ //printf("The arrayLength in scanExclusiveShort: %d, the i: %d\n", arrayLength, i); szWorkgroup = scanExclusiveShort(d_Output, d_Input, N / arrayLength, arrayLength); //} //cutilSafeCall( cudaDeviceSynchronize()); //cutStopTimer(hTimer); //double timerValue = 1.0e-3 * cutGetTimerValue(hTimer) / iCycles; printf("Validating the results...\n"); printf("...reading back GPU results\n"); cudaMemcpy(h_OutputGPU, d_Output, N * sizeof(uint), cudaMemcpyDeviceToHost); printf(" ...scanExclusiveHost()\n"); scanExclusiveHost(h_OutputCPU, d_Input, N / arrayLength, arrayLength); // Compare GPU results with CPU results and accumulate error for this test printf(" ...comparing the results\n"); int localFlag = 1; #ifndef _SYM for(uint i = 0; i < N; i++) { if(h_OutputCPU[i] != h_OutputGPU[i]) { localFlag = 0; break; } } #endif // Log message on individual test result, then accumulate to global flag printf(" ...Results %s\n\n", (localFlag == 1) ? "Match" : "DON'T Match !!!"); globalFlag = globalFlag && localFlag; // Data log if (arrayLength == MAX_SHORT_ARRAY_SIZE) { printf("\n"); //printfEx(LOGBOTH | MASTER, 0, "scan-Short, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %u, Workgroup = %u\n", // (1.0e-6 * (double)arrayLength/timerValue), timerValue, arrayLength, 1, szWorkgroup); printf("\n"); } } printf("***Running GPU scan for large arrays (%u identical iterations)...\n\n", iCycles); for(uint arrayLength = MIN_LARGE_ARRAY_SIZE; arrayLength <= MAX_LARGE_ARRAY_SIZE; arrayLength <<= 1){ printf("Running scan for %u elements (%u arrays)...\n", arrayLength, N / arrayLength); //for(int i = 0; i < iCycles; i++) //{ printf("The arrayLength in scanExclusiveLarge: %d\n", arrayLength); szWorkgroup = scanExclusiveLarge(d_Output, d_Input, N / arrayLength, arrayLength); //} printf("Validating the results...\n"); printf("...reading back GPU results\n"); cudaMemcpy(h_OutputGPU, d_Output, N * sizeof(uint), cudaMemcpyDeviceToHost); printf("...scanExclusiveHost()\n"); scanExclusiveHost(h_OutputCPU, d_Input, N / arrayLength, arrayLength); // Compare GPU results with CPU results and accumulate error for this test printf(" ...comparing the results\n"); int localFlag = 1; #ifndef _SYM for(uint i = 0; i < N; i++) { if(h_OutputCPU[i] != h_OutputGPU[i]) { localFlag = 0; break; } } #endif // Log message on individual test result, then accumulate to global flag printf(" ...Results %s\n\n", (localFlag == 1) ? "Match" : "DON'T Match !!!"); globalFlag = globalFlag && localFlag; // Data log if (arrayLength == MAX_LARGE_ARRAY_SIZE) { printf("\n"); //printfEx(LOGBOTH | MASTER, 0, "scan-Large, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %u, Workgroup = %u\n", // (1.0e-6 * (double)arrayLength/timerValue), timerValue, arrayLength, 1, szWorkgroup); printf("\n"); } } // pass or fail (cumulative... all tests in the loop) printf(globalFlag ? "PASSED\n\n" : "FAILED\n\n"); printf("Shutting down...\n"); closeScan(); cudaFree(d_Output); cudaFree(d_Input); free(h_Input); free(h_OutputCPU); free(h_OutputGPU); }
CUdeviceptr presum(CUdeviceptr *d_Input, uint arrayLength) { uint N = 0; CUdeviceptr d_Output; struct timeval start,stop; gettimeofday(&start, NULL); initScan(); gettimeofday(&stop, NULL); if(arrayLength <= MAX_SHORT_ARRAY_SIZE && arrayLength > MIN_SHORT_ARRAY_SIZE) { for(uint i = 4; i<=MAX_SHORT_ARRAY_SIZE ; i<<=1){ if(arrayLength <= i){ N = i; break; } } checkCudaErrors(cudaMalloc((void **)&d_Output, N * sizeof(uint))); checkCudaErrors(cudaDeviceSynchronize()); scanExclusiveShort((uint *)d_Output, (uint *)(*d_Input), N); //szWorkgroup = scanExclusiveShort((uint *)d_Output, (uint *)d_Input, 1, N); checkCudaErrors(cudaDeviceSynchronize()); }else if(arrayLength <= MAX_LARGE_ARRAY_SIZE) { N = MAX_SHORT_ARRAY_SIZE * iDivUp(arrayLength,MAX_SHORT_ARRAY_SIZE); checkCudaErrors(cudaMalloc((void **)&d_Output, N * sizeof(uint))); checkCudaErrors(cudaDeviceSynchronize()); scanExclusiveLarge((uint *)d_Output, (uint *)(*d_Input), N); checkCudaErrors(cudaDeviceSynchronize()); }else if(arrayLength <= MAX_LL_SIZE) { N = MAX_LARGE_ARRAY_SIZE * iDivUp(arrayLength,MAX_LARGE_ARRAY_SIZE); printf("N = %d\n",N); checkCudaErrors(cudaMalloc((void **)&d_Output, N * sizeof(uint))); checkCudaErrors(cudaDeviceSynchronize()); scanExclusiveLL((uint *)d_Output, (uint *)(*d_Input), N); checkCudaErrors(cudaDeviceSynchronize()); }else{ cuMemFree(d_Output); closeScan(); return NULL; } closeScan(); cuMemFree(*d_Input); *d_Input = d_Output; printf("inside scan time:\n"); printDiff(start,stop); return d_Output; }