/**
 * Calculates the feature histogram.
 *
 * If the feature has fewer than 64 distinct values, histogram64 is used;
 * otherwise histogramNaive is used (see the CUDA kernels).
 *
 * @param index The index of the feature.
 * @return The host-side histogram buffer (h_acum) with the bin counts.
 */
t_histogram Histogram::getHistogram(uint index) {
    uint vr = rawData.getValuesRange(index);
    t_histogram d_acum = rawData.getAcum();
    t_feature d_vector = rawData.getFeatureGPU(index);

    if (vr < 64) {
        histogram64(d_acum, d_vector, rawData.getDataSize());
    } else {
        histogramNaive(d_vector, d_acum, rawData.getDataSize(), 240);
    }

    // Copy the bin counts back into the host buffer h_acum (class member).
    cudaMemcpy(h_acum, d_acum, vr * sizeof(uint), cudaMemcpyDeviceToHost);
    cudaError err = cudaGetLastError();
    if (cudaSuccess != err) {
        printf("Error copying histogram from GPU to CPU: %d\n", err);
    }
    return h_acum;
}
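The kernels called above (histogram64 and histogramNaive) are not reproduced here. As a rough illustration of the fallback path, the sketch below is a minimal per-element atomic histogram kernel; the kernel name, the element type, the block size of 256, and the assumption that the bins are zeroed before the launch are illustrative choices of mine, not taken from the original code. The 240 passed to histogramNaive above is assumed to be the grid size.

// Hypothetical sketch of a naive histogram kernel (not the original
// histogramNaive): each thread walks the input with a grid-size stride
// and atomically increments the bin for its value. d_bins is assumed to
// be zeroed beforehand and to hold at least valuesRange entries.
__global__ void naiveHistogramKernel(const uint *d_vector, uint *d_bins, uint dataSize)
{
    for (uint i = blockIdx.x * blockDim.x + threadIdx.x;
         i < dataSize;
         i += gridDim.x * blockDim.x)
    {
        atomicAdd(&d_bins[d_vector[i]], 1u);
    }
}

// Host-side wrapper matching the call shape used above, assuming the last
// argument of histogramNaive is the number of thread blocks.
void naiveHistogram(const uint *d_vector, uint *d_bins, uint dataSize, uint numBlocks)
{
    naiveHistogramKernel<<<numBlocks, 256>>>(d_vector, d_bins, dataSize);
}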
int main(int argc, char **argv)
{
    uchar *h_Data;
    uint  *h_HistogramCPU, *h_HistogramGPU;
    uchar *d_Data;
    uint  *d_Histogram;
    uint   hTimer;
    int    PassFailFlag = 1;
    uint   byteCount = 64 * 1048576;
    uint   uiSizeMult = 1;

    cudaDeviceProp deviceProp;
    deviceProp.major = 0;
    deviceProp.minor = 0;
    int dev;

    shrQAStart(argc, argv);

    // set logfile name and start logs
    shrSetLogFileName("histogram.txt");

    // Use command-line specified CUDA device, otherwise use device with highest Gflops/s
    if (shrCheckCmdLineFlag(argc, (const char **)argv, "device")) {
        dev = cutilDeviceInit(argc, argv);
        if (dev < 0) {
            printf("No CUDA Capable Devices found, exiting...\n");
            shrQAFinishExit(argc, (const char **)argv, QA_WAIVED);
        }
    } else {
        cudaSetDevice(dev = cutGetMaxGflopsDeviceId());
        cutilSafeCall(cudaChooseDevice(&dev, &deviceProp));
    }
    cutilSafeCall(cudaGetDeviceProperties(&deviceProp, dev));

    printf("CUDA device [%s] has %d Multi-Processors, Compute %d.%d\n",
           deviceProp.name, deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor);

    int version = deviceProp.major * 0x10 + deviceProp.minor;
    if (version < 0x11) {
        printf("There is no device supporting a minimum of CUDA compute capability 1.1 for this SDK sample\n");
        cutilDeviceReset();
        shrQAFinishExit(argc, (const char **)argv, QA_WAIVED);
    }

    cutilCheckError(cutCreateTimer(&hTimer));

    // Optional command-line multiplier to increase the size of the array to histogram
    if (shrGetCmdLineArgumentu(argc, (const char **)argv, "sizemult", &uiSizeMult)) {
        uiSizeMult = CLAMP(uiSizeMult, 1, 10);
        byteCount *= uiSizeMult;
    }

    shrLog("Initializing data...\n");
    shrLog("...allocating CPU memory.\n");
    h_Data         = (uchar *)malloc(byteCount);
    h_HistogramCPU = (uint *)malloc(HISTOGRAM256_BIN_COUNT * sizeof(uint));
    h_HistogramGPU = (uint *)malloc(HISTOGRAM256_BIN_COUNT * sizeof(uint));

    shrLog("...generating input data\n");
    srand(2009);
    for (uint i = 0; i < byteCount; i++)
        h_Data[i] = rand() % 256;

    shrLog("...allocating GPU memory and copying input data\n\n");
    cutilSafeCall(cudaMalloc((void **)&d_Data, byteCount));
    cutilSafeCall(cudaMalloc((void **)&d_Histogram, HISTOGRAM256_BIN_COUNT * sizeof(uint)));
    cutilSafeCall(cudaMemcpy(d_Data, h_Data, byteCount, cudaMemcpyHostToDevice));

    {
        shrLog("Starting up 64-bin histogram...\n\n");
        initHistogram64();

        shrLog("Running 64-bin GPU histogram for %u bytes (%u runs)...\n\n", byteCount, numRuns);
        for (int iter = -1; iter < numRuns; iter++) {
            // iter == -1 -- warmup iteration
            if (iter == 0) {
                cutilSafeCall(cutilDeviceSynchronize());
                cutilCheckError(cutResetTimer(hTimer));
                cutilCheckError(cutStartTimer(hTimer));
            }
            histogram64(d_Histogram, d_Data, byteCount);
        }
        cutilSafeCall(cutilDeviceSynchronize());
        cutilCheckError(cutStopTimer(hTimer));

        double dAvgSecs = 1.0e-3 * (double)cutGetTimerValue(hTimer) / (double)numRuns;
        shrLog("histogram64() time (average) : %.5f sec, %.4f MB/sec\n\n",
               dAvgSecs, ((double)byteCount * 1.0e-6) / dAvgSecs);
        shrLogEx(LOGBOTH | MASTER, 0,
                 "histogram64, Throughput = %.4f MB/s, Time = %.5f s, Size = %u Bytes, NumDevsUsed = %u, Workgroup = %u\n",
                 (1.0e-6 * (double)byteCount / dAvgSecs), dAvgSecs, byteCount, 1, HISTOGRAM64_THREADBLOCK_SIZE);

        shrLog("\nValidating GPU results...\n");
        shrLog(" ...reading back GPU results\n");
        cutilSafeCall(cudaMemcpy(h_HistogramGPU, d_Histogram,
                                 HISTOGRAM64_BIN_COUNT * sizeof(uint), cudaMemcpyDeviceToHost));

        shrLog(" ...histogram64CPU()\n");
        histogram64CPU(h_HistogramCPU, h_Data, byteCount);

        shrLog(" ...comparing the results...\n");
        for (uint i = 0; i < HISTOGRAM64_BIN_COUNT; i++)
            if (h_HistogramGPU[i] != h_HistogramCPU[i]) PassFailFlag = 0;
        shrLog(PassFailFlag ? " ...64-bin histograms match\n\n"
                            : " ***64-bin histograms do not match!!!***\n\n");

        shrLog("Shutting down 64-bin histogram...\n\n\n");
        closeHistogram64();
    }

    {
        shrLog("Initializing 256-bin histogram...\n");
        initHistogram256();

        shrLog("Running 256-bin GPU histogram for %u bytes (%u runs)...\n\n", byteCount, numRuns);
        for (int iter = -1; iter < numRuns; iter++) {
            // iter == -1 -- warmup iteration
            if (iter == 0) {
                cutilSafeCall(cutilDeviceSynchronize());
                cutilCheckError(cutResetTimer(hTimer));
                cutilCheckError(cutStartTimer(hTimer));
            }
            histogram256(d_Histogram, d_Data, byteCount);
        }
        cutilSafeCall(cutilDeviceSynchronize());
        cutilCheckError(cutStopTimer(hTimer));

        double dAvgSecs = 1.0e-3 * (double)cutGetTimerValue(hTimer) / (double)numRuns;
        shrLog("histogram256() time (average) : %.5f sec, %.4f MB/sec\n\n",
               dAvgSecs, ((double)byteCount * 1.0e-6) / dAvgSecs);
        shrLogEx(LOGBOTH | MASTER, 0,
                 "histogram256, Throughput = %.4f MB/s, Time = %.5f s, Size = %u Bytes, NumDevsUsed = %u, Workgroup = %u\n",
                 (1.0e-6 * (double)byteCount / dAvgSecs), dAvgSecs, byteCount, 1, HISTOGRAM256_THREADBLOCK_SIZE);

        shrLog("\nValidating GPU results...\n");
        shrLog(" ...reading back GPU results\n");
        cutilSafeCall(cudaMemcpy(h_HistogramGPU, d_Histogram,
                                 HISTOGRAM256_BIN_COUNT * sizeof(uint), cudaMemcpyDeviceToHost));

        shrLog(" ...histogram256CPU()\n");
        histogram256CPU(h_HistogramCPU, h_Data, byteCount);

        shrLog(" ...comparing the results\n");
        for (uint i = 0; i < HISTOGRAM256_BIN_COUNT; i++)
            if (h_HistogramGPU[i] != h_HistogramCPU[i]) PassFailFlag = 0;
        shrLog(PassFailFlag ? " ...256-bin histograms match\n\n"
                            : " ***256-bin histograms do not match!!!***\n\n");

        shrLog("Shutting down 256-bin histogram...\n\n\n");
        closeHistogram256();
    }

    shrLog("Shutting down...\n");
    cutilCheckError(cutDeleteTimer(hTimer));
    cutilSafeCall(cudaFree(d_Histogram));
    cutilSafeCall(cudaFree(d_Data));
    free(h_HistogramGPU);
    free(h_HistogramCPU);
    free(h_Data);

    cutilDeviceReset();

    shrLog("%s - Test Summary\n", sSDKsample);

    // pass or fail (for both the 64-bin and 256-bin histograms)
    shrQAFinishExit(argc, (const char **)argv, (PassFailFlag ? QA_PASSED : QA_FAILED));
}
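The validation path above relies on CPU reference functions (histogram64CPU, histogram256CPU) whose source is not reproduced here. The sketch below shows what such a reference amounts to; the function name, the explicit bin-count parameter, and the byte-at-a-time loop are illustrative assumptions rather than the SDK's actual gold implementation.

// Hypothetical CPU reference: count how often each bin value occurs.
// The SDK's 64-bin variant folds each byte into 64 bins (roughly data >> 2);
// the 256-bin variant uses the byte value directly.
static void referenceHistogramCPU(uint *h_Histogram, const uchar *h_Data,
                                  uint byteCount, uint binCount)
{
    for (uint i = 0; i < binCount; i++)
        h_Histogram[i] = 0;

    for (uint i = 0; i < byteCount; i++) {
        uint bin = (binCount == 64) ? (h_Data[i] >> 2) : h_Data[i];
        h_Histogram[bin]++;
    }
}

The GPU result is then accepted only if every bin matches this reference, which is exactly the element-wise comparison loop used in the main() above.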
int main(int argc, char **argv)
{
    uchar *h_Data;
    uint  *h_HistogramCPU, *h_HistogramGPU;
    uchar *d_Data;
    uint  *d_Histogram;
    // hTimer, uiSizeMult, deviceProp and dev are retained from the original
    // SDK sample but are unused in this symbolic variant.
    uint   hTimer;
    int    PassFailFlag = 1;
    const uint byteCount = 64 * 16;   // much smaller than the original 64 * 1048576 bytes
    uint   uiSizeMult = 1;

    cudaDeviceProp deviceProp;
    deviceProp.major = 0;
    deviceProp.minor = 0;
    int dev;

    printf("Initializing data...\n");
    printf("...allocating CPU memory.\n");
    h_Data         = (uchar *)malloc(byteCount);
    h_HistogramCPU = (uint *)malloc(HISTOGRAM256_BIN_COUNT * sizeof(uint));
    h_HistogramGPU = (uint *)malloc(HISTOGRAM256_BIN_COUNT * sizeof(uint));

    // The input bytes are made symbolic instead of pseudo-random.
    klee_make_symbolic(h_Data, byteCount, "h_Data_input");

    printf("...allocating GPU memory and copying input data\n\n");
    cudaMalloc((void **)&d_Data, byteCount);
    cudaMalloc((void **)&d_Histogram, HISTOGRAM256_BIN_COUNT * sizeof(uint));
    cudaMemcpy(d_Data, h_Data, byteCount, cudaMemcpyHostToDevice);

    {
        printf("Starting up 64-bin histogram...\n\n");
        initHistogram64();

        printf("Running 64-bin GPU histogram for %u bytes (%u runs)...\n\n", byteCount, numRuns);
        histogram64(d_Histogram, d_Data, byteCount);

        printf("\nValidating GPU results...\n");
        printf(" ...reading back GPU results\n");
        cudaMemcpy(h_HistogramGPU, d_Histogram, HISTOGRAM64_BIN_COUNT * sizeof(uint), cudaMemcpyDeviceToHost);

        printf(" ...histogram64CPU()\n");
        histogram64CPU(h_HistogramCPU, h_Data, byteCount);

#ifndef _SYM
        // printf(" ...comparing the results...\n");
        // for(uint i = 0; i < HISTOGRAM64_BIN_COUNT; i++)
        //     if(h_HistogramGPU[i] != h_HistogramCPU[i]) PassFailFlag = 0;
        // printf(PassFailFlag ? " ...64-bin histograms match\n\n" : " ***64-bin histograms do not match!!!***\n\n");
#endif

        printf("Shutting down 64-bin histogram...\n\n\n");
        closeHistogram64();
    }

    {
        printf("Initializing 256-bin histogram...\n");
        initHistogram256();

        printf("Running 256-bin GPU histogram for %u bytes (%u runs)...\n\n", byteCount, numRuns);
        histogram256(d_Histogram, d_Data, byteCount);

        printf("\nValidating GPU results...\n");
        // printf(" ...reading back GPU results\n");
        cudaMemcpy(h_HistogramGPU, d_Histogram, HISTOGRAM256_BIN_COUNT * sizeof(uint), cudaMemcpyDeviceToHost);

        printf(" ...histogram256CPU()\n");
        histogram256CPU(h_HistogramCPU, h_Data, byteCount);

#ifndef _SYM
        // printf(" ...comparing the results\n");
        // for(uint i = 0; i < HISTOGRAM256_BIN_COUNT; i++)
        //     if(h_HistogramGPU[i] != h_HistogramCPU[i]) PassFailFlag = 0;
        // printf(PassFailFlag ? " ...256-bin histograms match\n\n" : " ***256-bin histograms do not match!!!***\n\n");
#endif

        printf("Shutting down 256-bin histogram...\n\n\n");
        closeHistogram256();
    }

    // printf("%s - Test Summary\n", sSDKsample);
    // pass or fail (for both the 64-bin and 256-bin histograms)
    // printf("%s\n\n", PassFailFlag ? "PASSED" : "FAILED");

    printf("Shutting down...\n");
    cudaFree(d_Histogram);
    cudaFree(d_Data);
    free(h_HistogramGPU);
    free(h_HistogramCPU);
    free(h_Data);
}
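In this symbolic variant the GPU/CPU result comparison is compiled out (the #ifndef _SYM blocks remain commented). If one wanted the symbolic executor to check GPU-versus-CPU equivalence rather than only explore the kernels, the commented loops could be turned into assertions, roughly as in the sketch below. The helper name and the use of a plain assert are assumptions about the test setup, not part of the original code.

#include <assert.h>

// Hypothetical equivalence check: with symbolic input, a violated assert
// causes the symbolic executor to report a concrete counterexample input.
static void checkHistogramsMatch(const uint *gpu, const uint *cpu, uint binCount)
{
    for (uint i = 0; i < binCount; i++)
        assert(gpu[i] == cpu[i]);
}

// Example use after histogram64CPU(...):
//   checkHistogramsMatch(h_HistogramGPU, h_HistogramCPU, HISTOGRAM64_BIN_COUNT);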