// keplereq_wrapper_C: // C wrapper function to solve's Kepler's equation num times. // inputs: // ph_ma: pointer to beginning element of array of doubles containing mean anomaly in radians // ph_ecc: pointer to beginning element of array of doubles containing eccentricity // num: integer size of input arrays // ph_eccanom: pointer to beginning element of array of doubles eccentric anomaly in radians // outputs: // ph_eccanom: values overwritten with eccentric anomaly // assumptions: // input mean anomalies between 0 and 2pi // input eccentricities between 0 and 1 // all three arrays have at least num elements // void keplereq_wrapper_c(double *ph_ma, double *ph_ecc, int num, double *ph_eccanom) { int gpuid = init_cuda(); // put vectors in thrust format from raw points thrust::host_vector<double> h_ecc(ph_ecc,ph_ecc+num); thrust::host_vector<double> h_ma(ph_ma,ph_ma+num); cutCreateTimer(&memoryTime); cutCreateTimer(&kernelTime); cutResetTimer(memoryTime); cutResetTimer(kernelTime); if(gpuid>=0) { cutStartTimer(memoryTime); // transfer input params to GPU thrust::device_vector<double> d_ecc = h_ecc; thrust::device_vector<double> d_ma = h_ma; // allocate mem on GPU thrust::device_vector<double> d_eccanom(num); cudaThreadSynchronize(); cutStopTimer(memoryTime); // distribute the computation to the GPU cutStartTimer(kernelTime); thrust::for_each( thrust::make_zip_iterator(thrust::make_tuple(d_ma.begin(),d_ecc.begin(),d_eccanom.begin())), thrust::make_zip_iterator(thrust::make_tuple(d_ma.end(), d_ecc.end(), d_eccanom.end())), keplereq_functor() ); cudaThreadSynchronize(); cutStopTimer(kernelTime); // transfer results back to host cutStartTimer(memoryTime); thrust::copy(d_eccanom.begin(),d_eccanom.end(),ph_eccanom); cudaThreadSynchronize(); cutStopTimer(memoryTime); } else { // distribute the computation to the CPU cutStartTimer(kernelTime); thrust::for_each( thrust::make_zip_iterator(thrust::make_tuple(h_ma.begin(),h_ecc.begin(),ph_eccanom)), 
thrust::make_zip_iterator(thrust::make_tuple(h_ma.end(), h_ecc.end(), ph_eccanom+num)), keplereq_functor() ); cutStopTimer(kernelTime); } }
// Makes demo number `index` the active one: copies its parameter set into
// activeParams, snaps the camera translation (and its lagged copy) to the
// demo's m_x/m_y/m_z, resets the bodies with the shell configuration and
// resets demoTimer.
//
// index must satisfy 0 <= index < numDemos; both bounds are asserted because
// a negative value would index before the start of demoParams.
void _selectDemo(int index)
{
    assert(index >= 0 && index < numDemos);

    activeParams = demoParams[index];

    camera_trans[0] = camera_trans_lag[0] = activeParams.m_x;
    camera_trans[1] = camera_trans_lag[1] = activeParams.m_y;
    camera_trans[2] = camera_trans_lag[2] = activeParams.m_z;

    reset(numBodies, NBODY_CONFIG_SHELL);

    cutilCheckError(cutResetTimer(demoTimer));
}
// Counts rendered frames and, every fpsLimit frames, writes the measured
// frame rate and the current filter parameters into the window title, then
// resets the timer. fpsLimit is re-derived from the measured rate (minimum 1).
//
// A non-positive average timer value is reported as 0 fps instead of dividing
// by zero: the resulting infinity would otherwise be converted to int for
// fpsLimit, which is undefined behaviour.
void computeFPS()
{
    fpsCount++;
    if (fpsCount == fpsLimit) {
        char fps[256];
        float avgMs = cutGetAverageTimerValue(timer);
        float ifps = (avgMs > 0.0f) ? 1.0f / (avgMs / 1000.0f) : 0.0f;
        sprintf(fps, "CUDA Bilateral Filter: %3.f fps (euclidean_delta=%.2f, gaussian_delta=%.2f, iterations=%.2f)",
                ifps, (double)euclidean_delta, (double)gaussian_delta, (double)iterations);

        glutSetWindowTitle(fps);
        fpsCount = 0;
        fpsLimit = (int)MAX(ifps, 1.0f);

        cutilCheckError(cutResetTimer(timer));
    }
}
// Simple method to display the Frames Per Second in the window title void computeFPS() { static int fpsCount=0; static int fpsLimit=100; fpsCount++; if (fpsCount == fpsLimit) { char fps[256]; float ifps = 1.f / (cutGetAverageTimerValue(timer) / 1000.f); sprintf(fps, "Cuda GL Interop Wrapper: %3.1f fps ", ifps); glutSetWindowTitle(fps); fpsCount = 0; cutilCheckError(cutResetTimer(timer)); } }
void computeFPS() { frameCount++; fpsCount++; if (fpsCount == fpsLimit-1) { g_Verify = true; } if (fpsCount == fpsLimit) { char fps[256]; float ifps = 1.f / (cutGetAverageTimerValue(timer) / 1000.f); sprintf(fps, "%s %s (sigma=%4.2f): %3.1f fps", sSDKsample, ((g_CheckRender && g_CheckRender->IsQAReadback()) ? "AutoTest: " : ""), sigma, ifps); glutSetWindowTitle(fps); fpsCount = 0; cutilCheckError(cutResetTimer(timer)); AutoQATest(); } }
void computeFPS() { frameCount++; if (fpsCount++ == fpsLimit-1) { g_Verify = true; } if (fpsCount == fpsLimit) { char fps[256]; float ifps = 1.0f / (cutGetAverageTimerValue(timer) / 1000.0f); sprintf(fps, "%sCUDA Box Filter (radius=%d): %3.1f fps", ((g_CheckRender && g_CheckRender->IsQAReadback()) ? "[AutoTest]: " : ""), filter_radius, ifps); glutSetWindowTitle(fps); fpsCount = 0; if (g_CheckRender && !g_CheckRender->IsQAReadback()) fpsLimit = (int)MAX(ifps, 1.0f); cutilCheckError(cutResetTimer(timer)); AutoQATest(); } }
void computeFPS() { frameCount++; fpsCount++; if (fpsCount == fpsLimit-1) { g_Verify = true; } if (fpsCount == fpsLimit) { char fps[256]; float ifps = 1.f / (cutGetAverageTimerValue(timer) / 1000.f); sprintf(fps, "%s Cuda Edge Detection (%s): %3.1f fps", ((g_CheckRender && g_CheckRender->IsQAReadback()) ? "AutoTest:" : ""), filterMode[g_SobelDisplayMode], ifps); glutSetWindowTitle(fps); fpsCount = 0; if (g_CheckRender && !g_CheckRender->IsQAReadback()) fpsLimit = (int)MAX(ifps, 1.f); cutilCheckError(cutResetTimer(timer)); AutoQATest(); } }
int main(int argc, char** argv) { printHeader("Initializare"); initCUDA(); init(); printHeader("Calcul CPU"); cutilCheckError(cutStartTimer(timer)); // Calculeaza sampleul de control - CPU printf("Asteptati: Se calculeaza controlul pe CPU ... "); computeControl(); printf("DONE\n"); float time = cutGetTimerValue(timer); printf("Timp de calcul pe CPU = %f milisecunde\n",time); cutilCheckError(cutResetTimer(timer)); printHeader("Calcul CUDA"); // Se calculeaza pe CUDA printf("Asteptati: Se calculeaza pe CUDA ... "); runCUDA(); printf("DONE\n"); time = cutGetTimerValue(timer); printf("Timp de calcul pe GPU = %f milisecunde\n",time); printHeader("Verificare calcule"); // Se verifica daca s-a calculat corect pe CUDA printf("Se verifica daca rezultatul pe CUDA corespunde cu rezultatul pe CPU : "); verificaCalcule(); printHeader(""); cleanup(); printf("Apasa ENTER pentru a termina programul\n"); getchar(); return 0; }
int main(int argc, char **argv) { GpuProfiling::initProf(); // Start logs shrSetLogFileName ("scan.txt"); shrLog("%s Starting...\n\n", argv[0]); //Use command-line specified CUDA device, otherwise use device with highest Gflops/s if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) cutilDeviceInit(argc, argv); else cudaSetDevice( cutGetMaxGflopsDeviceId() ); uint *d_Input, *d_Output; uint *h_Input, *h_OutputCPU, *h_OutputGPU; uint hTimer; const uint N = 13 * 1048576 / 2; shrLog("Allocating and initializing host arrays...\n"); cutCreateTimer(&hTimer); h_Input = (uint *)malloc(N * sizeof(uint)); h_OutputCPU = (uint *)malloc(N * sizeof(uint)); h_OutputGPU = (uint *)malloc(N * sizeof(uint)); srand(2009); for(uint i = 0; i < N; i++) h_Input[i] = rand(); shrLog("Allocating and initializing CUDA arrays...\n"); cutilSafeCall( cudaMalloc((void **)&d_Input, N * sizeof(uint)) ); cutilSafeCall( cudaMalloc((void **)&d_Output, N * sizeof(uint)) ); cutilSafeCall( cudaMemcpy(d_Input, h_Input, N * sizeof(uint), cudaMemcpyHostToDevice) ); shrLog("Initializing CUDA-C scan...\n\n"); initScan(); int globalFlag = 1; size_t szWorkgroup; const int iCycles = 100; shrLog("*** Running GPU scan for short arrays (%d identical iterations)...\n\n", iCycles); for(uint arrayLength = MIN_SHORT_ARRAY_SIZE; arrayLength <= MAX_SHORT_ARRAY_SIZE; arrayLength <<= 1){ shrLog("Running scan for %u elements (%u arrays)...\n", arrayLength, N / arrayLength); cutilSafeCall( cudaThreadSynchronize() ); cutResetTimer(hTimer); cutStartTimer(hTimer); for(int i = 0; i < iCycles; i++) { szWorkgroup = scanExclusiveShort(d_Output, d_Input, N / arrayLength, arrayLength); } cutilSafeCall( cudaThreadSynchronize()); cutStopTimer(hTimer); double timerValue = 1.0e-3 * cutGetTimerValue(hTimer) / iCycles; shrLog("Validating the results...\n"); shrLog("...reading back GPU results\n"); cutilSafeCall( cudaMemcpy(h_OutputGPU, d_Output, N * sizeof(uint), cudaMemcpyDeviceToHost) ); shrLog(" ...scanExclusiveHost()\n"); 
scanExclusiveHost(h_OutputCPU, h_Input, N / arrayLength, arrayLength); // Compare GPU results with CPU results and accumulate error for this test shrLog(" ...comparing the results\n"); int localFlag = 1; for(uint i = 0; i < N; i++) { if(h_OutputCPU[i] != h_OutputGPU[i]) { localFlag = 0; break; } } // Log message on individual test result, then accumulate to global flag shrLog(" ...Results %s\n\n", (localFlag == 1) ? "Match" : "DON'T Match !!!"); globalFlag = globalFlag && localFlag; // Data log if (arrayLength == MAX_SHORT_ARRAY_SIZE) { shrLog("\n"); shrLogEx(LOGBOTH | MASTER, 0, "scan-Short, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * (double)arrayLength/timerValue), timerValue, arrayLength, 1, szWorkgroup); shrLog("\n"); } } shrLog("***Running GPU scan for large arrays (%u identical iterations)...\n\n", iCycles); for(uint arrayLength = MIN_LARGE_ARRAY_SIZE; arrayLength <= MAX_LARGE_ARRAY_SIZE; arrayLength <<= 1){ shrLog("Running scan for %u elements (%u arrays)...\n", arrayLength, N / arrayLength); cutilSafeCall( cudaThreadSynchronize() ); cutResetTimer(hTimer); cutStartTimer(hTimer); for(int i = 0; i < iCycles; i++) { szWorkgroup = scanExclusiveLarge(d_Output, d_Input, N / arrayLength, arrayLength); } cutilSafeCall( cudaThreadSynchronize() ); cutStopTimer(hTimer); double timerValue = 1.0e-3 * cutGetTimerValue(hTimer) / iCycles; shrLog("Validating the results...\n"); shrLog("...reading back GPU results\n"); cutilSafeCall( cudaMemcpy(h_OutputGPU, d_Output, N * sizeof(uint), cudaMemcpyDeviceToHost) ); shrLog("...scanExclusiveHost()\n"); scanExclusiveHost(h_OutputCPU, h_Input, N / arrayLength, arrayLength); // Compare GPU results with CPU results and accumulate error for this test shrLog(" ...comparing the results\n"); int localFlag = 1; for(uint i = 0; i < N; i++) { if(h_OutputCPU[i] != h_OutputGPU[i]) { localFlag = 0; break; } } // Log message on individual test result, then accumulate to global 
flag shrLog(" ...Results %s\n\n", (localFlag == 1) ? "Match" : "DON'T Match !!!"); globalFlag = globalFlag && localFlag; // Data log if (arrayLength == MAX_LARGE_ARRAY_SIZE) { shrLog("\n"); shrLogEx(LOGBOTH | MASTER, 0, "scan-Large, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * (double)arrayLength/timerValue), timerValue, arrayLength, 1, szWorkgroup); shrLog("\n"); } } // pass or fail (cumulative... all tests in the loop) shrLog(globalFlag ? "PASSED\n\n" : "FAILED\n\n"); GpuProfiling::printResults(); shrLog("Shutting down...\n"); closeScan(); cutilSafeCall( cudaFree(d_Output)); cutilSafeCall( cudaFree(d_Input)); cutilCheckError( cutDeleteTimer(hTimer) ); cudaThreadExit(); exit(0); shrEXIT(argc, (const char**)argv); }
//////////////////////////////////////////////////////////////////////////////// // Test driver //////////////////////////////////////////////////////////////////////////////// int main(int argc, char** argv){ uint *h_SrcKey, *h_SrcVal, *h_DstKey, *h_DstVal; uint *d_SrcKey, *d_SrcVal, *d_BufKey, *d_BufVal, *d_DstKey, *d_DstVal; uint hTimer; const uint N = 4 * 1048576; const uint DIR = 1; const uint numValues = 65536; printf("Allocating and initializing host arrays...\n\n"); cutCreateTimer(&hTimer); h_SrcKey = (uint *)malloc(N * sizeof(uint)); h_SrcVal = (uint *)malloc(N * sizeof(uint)); h_DstKey = (uint *)malloc(N * sizeof(uint)); h_DstVal = (uint *)malloc(N * sizeof(uint)); srand(2009); for(uint i = 0; i < N; i++) h_SrcKey[i] = rand() % numValues; fillValues(h_SrcVal, N); printf("Allocating and initializing CUDA arrays...\n\n"); cutilSafeCall( cudaMalloc((void **)&d_DstKey, N * sizeof(uint)) ); cutilSafeCall( cudaMalloc((void **)&d_DstVal, N * sizeof(uint)) ); cutilSafeCall( cudaMalloc((void **)&d_BufKey, N * sizeof(uint)) ); cutilSafeCall( cudaMalloc((void **)&d_BufVal, N * sizeof(uint)) ); cutilSafeCall( cudaMalloc((void **)&d_SrcKey, N * sizeof(uint)) ); cutilSafeCall( cudaMalloc((void **)&d_SrcVal, N * sizeof(uint)) ); cutilSafeCall( cudaMemcpy(d_SrcKey, h_SrcKey, N * sizeof(uint), cudaMemcpyHostToDevice) ); cutilSafeCall( cudaMemcpy(d_SrcVal, h_SrcVal, N * sizeof(uint), cudaMemcpyHostToDevice) ); printf("Initializing GPU merge sort...\n"); initMergeSort(); printf("Running GPU merge sort...\n"); cutilSafeCall( cudaThreadSynchronize() ); cutResetTimer(hTimer); cutStartTimer(hTimer); mergeSort( d_DstKey, d_DstVal, d_BufKey, d_BufVal, d_SrcKey, d_SrcVal, N, DIR ); cutilSafeCall( cudaThreadSynchronize() ); cutStopTimer(hTimer); printf("Time: %f ms\n", cutGetTimerValue(hTimer)); printf("Reading back GPU merge sort results...\n"); cutilSafeCall( cudaMemcpy(h_DstKey, d_DstKey, N * sizeof(uint), cudaMemcpyDeviceToHost) ); cutilSafeCall( cudaMemcpy(h_DstVal, d_DstVal, 
N * sizeof(uint), cudaMemcpyDeviceToHost) ); printf("Inspecting the results...\n"); uint keysFlag = validateSortedKeys( h_DstKey, h_SrcKey, 1, N, numValues, DIR ); uint valuesFlag = validateSortedValues( h_DstKey, h_DstVal, h_SrcKey, 1, N ); printf( (keysFlag && valuesFlag) ? "TEST PASSED\n" : "TEST FAILED\n"); printf("Shutting down...\n"); closeMergeSort(); cutilCheckError( cutDeleteTimer(hTimer) ); cutilSafeCall( cudaFree(d_SrcVal) ); cutilSafeCall( cudaFree(d_SrcKey) ); cutilSafeCall( cudaFree(d_BufVal) ); cutilSafeCall( cudaFree(d_BufKey) ); cutilSafeCall( cudaFree(d_DstVal) ); cutilSafeCall( cudaFree(d_DstKey) ); free(h_DstVal); free(h_DstKey); free(h_SrcVal); free(h_SrcKey); cudaThreadExit(); cutilExit(argc, argv); }
// main rendering loop void display() { cutilCheckError(cutStartTimer(timer)); if( !gestures.m_bPause ) { //Read next available data gestures.m_Context.WaitAndUpdateAll(); } //Process the data gestures.m_DepthGenerator.GetMetaData( depthMD ); gestures.m_UserGenerator.GetUserPixels( 0, sceneMD ); // move camera if (cameraPos[1] > 0.0f) cameraPos[1] = 0.0f; cameraPosLag += (cameraPos - cameraPosLag) * inertia; cameraRotLag += (cameraRot - cameraRotLag) * inertia; cursorPosLag += (cursorPos - cursorPosLag) * inertia; // view transform glMatrixMode(GL_MODELVIEW); glLoadIdentity(); glRotatef(cameraRotLag[0], 1.0, 0.0, 0.0); glRotatef(cameraRotLag[1], 0.0, 1.0, 0.0); glTranslatef(cameraPosLag[0], cameraPosLag[1], cameraPosLag[2]); glGetFloatv(GL_MODELVIEW_MATRIX, modelView); // update the simulation if (!paused) { if (emitterOn) { runEmitter(); } SimParams &p = psystem->getParams(); p.cursorPos = make_float3(cursorPosLag.x, cursorPosLag.y, cursorPosLag.z); psystem->step(timestep); currentTime += timestep; } renderer->calcVectors(); vec3f sortVector = renderer->getSortVector(); psystem->setSortVector(make_float3(sortVector.x, sortVector.y, sortVector.z)); psystem->setModelView(modelView); psystem->setSorting(sort); psystem->depthSort(); // render glClearColor(0.0, 0.0, 0.0, 1.0); glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); renderScene(); // draw particles if (displayEnabled) { // render scene to offscreen buffers to get correct occlusion renderer->beginSceneRender(SmokeRenderer::LIGHT_BUFFER); renderScene(); renderer->endSceneRender(SmokeRenderer::LIGHT_BUFFER); renderer->beginSceneRender(SmokeRenderer::SCENE_BUFFER); renderScene(); renderer->endSceneRender(SmokeRenderer::SCENE_BUFFER); renderer->setPositionBuffer(psystem->getPosBuffer()); renderer->setVelocityBuffer(psystem->getVelBuffer()); renderer->setIndexBuffer(psystem->getSortedIndexBuffer()); renderer->setNumParticles(psystem->getNumParticles()); renderer->setParticleRadius(spriteSize); 
renderer->setDisplayLightBuffer(displayLightBuffer); renderer->setAlpha(alpha); renderer->setShadowAlpha(shadowAlpha); renderer->setLightPosition(lightPos); renderer->setColorAttenuation(colorAttenuation); renderer->setLightColor(lightColor); renderer->setNumSlices(numSlices); renderer->setNumDisplayedSlices(numDisplayedSlices); renderer->setBlurRadius(blurRadius); renderer->render(); if (drawVectors) { renderer->debugVectors(); } } // display sliders if (displaySliders) { glDisable(GL_DEPTH_TEST); glBlendFunc(GL_ONE_MINUS_DST_COLOR, GL_ZERO); // invert color glEnable(GL_BLEND); params->Render(0, 0); glDisable(GL_BLEND); glEnable(GL_DEPTH_TEST); } glutSwapBuffers(); glutReportErrors(); cutilCheckError(cutStopTimer(timer)); // readback for verification//sw/devrel/SDK10/Compute/projects/recursiveGaussian/recursiveGaussian.cpp if (g_CheckRender && g_CheckRender->IsQAReadback() && (++frameNumber >= frameCheckNumber)) { printf("> (Frame %d) Readback BackBuffer\n", frameNumber); g_CheckRender->readback( winWidth, winHeight ); g_CheckRender->savePPM(sOriginal, true, NULL); bool passed = g_CheckRender->PPMvsPPM(sOriginal, sReference, MAX_EPSILON_ERROR, THRESHOLD); printf("Summary: %d errors!\n", passed ? 0 : 1); printf("%s\n", passed ? "PASSED" : "FAILED"); cleanup(); exit(0); } fpsCount++; // this displays the frame rate updated every second (independent of frame rate) if (fpsCount >= fpsLimit) { char fps[256]; float ifps = 1.f / (cutGetAverageTimerValue(timer) / 1000.f); sprintf(fps, "CUDA Smoke Particles (%d particles): %3.1f fps", numParticles, ifps); glutSetWindowTitle(fps); fpsCount = 0; fpsLimit = (ifps > 1.f) ? (int)ifps : 1; if (paused) fpsLimit = 0; cutilCheckError(cutResetTimer(timer)); } }
/* quickshift: host reference implementation.
 *
 * inputs:
 *   im     image descriptor; im.I is the pixel data, im.N1 x im.N2 the spatial
 *          size and im.K the number of channels. Pixels are addressed as
 *          I[i1 + N1*i2 + (N1*N2)*k].
 *   sigma  bandwidth of the density estimate; the accumulation window has
 *          half-width R = ceil(3*sigma)
 *   tau    maximum distance for linking a pixel to a neighbour; the search
 *          window has half-width tR = ceil(tau)
 * outputs (each must hold N1*N2 floats):
 *   map    linear index (j1 + N1*j2) of the neighbour each pixel is linked to;
 *          a pixel with no admissible neighbour maps to itself
 *   gaps   for linked pixels, sqrt of the distance() value to that neighbour;
 *          INF for pixels that map to themselves
 *   E      normalized density estimate per pixel
 *
 * The two phases (density, neighbour search) are timed and the times printed.
 * Both timers are local to this call and are deleted before returning, so
 * repeated calls do not leak timer objects.
 *
 * NOTE(review): the density is normalized by (j1max-j1min)*(j2max-j2min),
 * which is zero when the window is a single row or column (R == 0, N1 == 1 or
 * N2 == 1) - confirm whether such inputs can occur.
 */
void quickshift(image_t im, float sigma, float tau, float * map, float * gaps, float * E)
{
  int verb = 1 ;

  float *M = 0, *n = 0;
  float tau2;

  int K, d;
  int N1,N2, i1,i2, j1,j2, R, tR;

  /* hard-wired to 0: the medoid-shift branches below are currently inactive */
  int medoid = 0 ;

  float const * I = im.I;
  N1 = im.N1;
  N2 = im.N2;
  K = im.K;

  d = 2 + K ; /* Total dimensions include spatial component (x,y) */

  tau2  = tau*tau;

  if (medoid) { /* n and M are only used in medoid shift */
    M = (float *) calloc(N1*N2*d, sizeof(float)) ;
    n = (float *) calloc(N1*N2,   sizeof(float)) ;
  }

  R = (int) ceil (3 * sigma) ;
  tR = (int) ceil (tau) ;

  if (verb) {
    printf("quickshift: [N1,N2,K]: [%d,%d,%d]\n", N1,N2,K) ;
    printf("quickshift: type: %s\n", medoid ? "medoid" : "quick");
    printf("quickshift: sigma: %g\n", sigma) ;
    /* R is ceil(3 * sigma) and determines the window size to accumulate
     * similarity */
    printf("quickshift: R: %d\n", R) ;
    printf("quickshift: tau: %g\n", tau) ;
    printf("quickshift: tR: %d\n", tR) ;
  }

  /* -----------------------------------------------------------------
   *                                                                 n
   * -------------------------------------------------------------- */

  /* If we are doing medoid shift, initialize n to the inner product of the
   * image with itself
   */
  if (n) {
    for (i2 = 0 ; i2 < N2 ; ++ i2) {
      for (i1 = 0 ; i1 < N1 ; ++ i1) {
        n [i1 + N1 * i2] = inner(I,N1,N2,K,
                                 i1,i2,
                                 i1,i2) ;
      }
    }
  }

  unsigned int Etimer;
  cutilCheckError( cutCreateTimer(&Etimer) );
  cutilCheckError( cutResetTimer(Etimer) );
  cutilCheckError( cutStartTimer(Etimer) );

  /* -----------------------------------------------------------------
   *                                                 E = - [oN'*F]', M
   * -------------------------------------------------------------- */

  /*
     D_ij = d(x_i,x_j)
     E_ij = exp(- .5 * D_ij / sigma^2) ;
     F_ij = - E_ij
     E_i  = sum_j E_ij
     M_di = sum_j X_j F_ij

     E is the parzen window estimate of the density
     0 = dissimilar to everything, windowsize = identical
  */

  for (i2 = 0 ; i2 < N2 ; ++ i2) {
    for (i1 = 0 ; i1 < N1 ; ++ i1) {

      float Ei = 0;
      int j1min = VL_MAX(i1 - R, 0   ) ;
      int j1max = VL_MIN(i1 + R, N1-1) ;
      int j2min = VL_MAX(i2 - R, 0   ) ;
      int j2max = VL_MIN(i2 + R, N2-1) ;

      /* For each pixel in the window compute the distance between it and the
       * source pixel */
      for (j2 = j2min ; j2 <= j2max ; ++ j2) {
        for (j1 = j1min ; j1 <= j1max ; ++ j1) {
          float Dij = distance(I,N1,N2,K, i1,i2, j1,j2) ;
          /* Make distance a similarity */
          float Fij = exp(- Dij / (2*sigma*sigma)) ;

          /* E is E_i above */
          Ei += Fij;

          if (M) {
            /* Accumulate votes for the median */
            int k ;
            M [i1 + N1*i2 + (N1*N2) * 0] += j1 * Fij ;
            M [i1 + N1*i2 + (N1*N2) * 1] += j2 * Fij ;
            for (k = 0 ; k < K ; ++k) {
              M [i1 + N1*i2 + (N1*N2) * (k+2)] +=
                I [j1 + N1*j2 + (N1*N2) * k] * Fij ;
            }
          }

        } /* j1 */
      } /* j2 */
      /* Normalize */
      E [i1 + N1 * i2] = Ei / ((j1max-j1min)*(j2max-j2min));

      /*E [i1 + N1 * i2] = Ei ; */

    }  /* i1 */
  } /* i2 */

  cutilCheckError( cutStopTimer(Etimer) );
  float ETime = cutGetTimerValue(Etimer);
  printf("ComputeE: %fms\n", ETime);
  /* the timer is local to this call; release it so it is not leaked */
  cutilCheckError( cutDeleteTimer(Etimer) );

  unsigned int Ntimer;
  cutilCheckError( cutCreateTimer(&Ntimer) );
  cutilCheckError( cutResetTimer(Ntimer) );
  cutilCheckError( cutStartTimer(Ntimer) );

  /* -----------------------------------------------------------------
   *                                               Find best neighbors
   * -------------------------------------------------------------- */

  if (medoid) {

    /*
       Qij = - nj Ei - 2 sum_k Gjk Mik
       n is I.^2
    */

    /* medoid shift */
    for (i2 = 0 ; i2 < N2 ; ++i2) {
      for (i1 = 0 ; i1 < N1 ; ++i1) {

        float sc_best = 0  ;
        /* j1/j2 best are the best indices for each i */
        float j1_best = i1 ;
        float j2_best = i2 ;

        int j1min = VL_MAX(i1 - R, 0   ) ;
        int j1max = VL_MIN(i1 + R, N1-1) ;
        int j2min = VL_MAX(i2 - R, 0   ) ;
        int j2max = VL_MIN(i2 + R, N2-1) ;

        for (j2 = j2min ; j2 <= j2max ; ++ j2) {
          for (j1 = j1min ; j1 <= j1max ; ++ j1) {

            float Qij = - n [j1 + j2 * N1] * E [i1 + i2 * N1] ;
            int k ;

            Qij -= 2 * j1 * M [i1 + i2 * N1 + (N1*N2) * 0] ;
            Qij -= 2 * j2 * M [i1 + i2 * N1 + (N1*N2) * 1] ;
            for (k = 0 ; k < K ; ++k) {
              Qij -= 2 *
                I [j1 + j2 * N1 + (N1*N2) * k] *
                M [i1 + i2 * N1 + (N1*N2) * (k + 2)] ;
            }

            if (Qij > sc_best) {
              sc_best = Qij ;
              j1_best = j1 ;
              j2_best = j2 ;
            }
          }
        }

        /* map_i is the linear index of j which is the best pair (in matlab
         * notation
         * gaps_i is the score of the best match
         */
        map [i1 + N1 * i2] = j1_best + N1 * j2_best ; /*+ 1 ; */
        gaps[i1 + N1 * i2] = sc_best ;
      }
    }

  } else {

    /* Quickshift assigns each i to the closest j which has an increase in the
     * density (E). If there is no j s.t. Ej > Ei, then gaps_i == inf (a root
     * node in one of the trees of merges).
     */
    for (i2 = 0 ; i2 < N2 ; ++i2) {
      for (i1 = 0 ; i1 < N1 ; ++i1) {

        float E0 = E [i1 + N1 * i2] ;
        float d_best = INF ;
        float j1_best = i1   ;
        float j2_best = i2   ;

        int j1min = VL_MAX(i1 - tR, 0   ) ;
        int j1max = VL_MIN(i1 + tR, N1-1) ;
        int j2min = VL_MAX(i2 - tR, 0   ) ;
        int j2max = VL_MIN(i2 + tR, N2-1) ;

        for (j2 = j2min ; j2 <= j2max ; ++ j2) {
          for (j1 = j1min ; j1 <= j1max ; ++ j1) {
            if (E [j1 + N1 * j2] > E0) {
              float Dij = distance(I,N1,N2,K, i1,i2, j1,j2) ;
              if (Dij <= tau2 && Dij < d_best) {
                d_best = Dij ;
                j1_best = j1 ;
                j2_best = j2 ;
              }
            }
          }
        }

        /* map is the index of the best pair */
        /* gaps_i is the minimal distance, inf implies no Ej > Ei within
         * distance tau from the point */
        map [i1 + N1 * i2] = j1_best + N1 * j2_best ; /* + 1 ; */
        if (map[i1 + N1 * i2] != i1 + N1 * i2)
          gaps[i1 + N1 * i2] = sqrt(d_best) ;
        else
          gaps[i1 + N1 * i2] = d_best; /* inf */
      }
    }
  }

  if (M) free(M) ;
  if (n) free(n) ;

  cutilCheckError( cutStopTimer(Ntimer) );
  float NTime = cutGetTimerValue(Ntimer);
  printf("ComputeN: %fms\n", NTime);
  /* the timer is local to this call; release it so it is not leaked */
  cutilCheckError( cutDeleteTimer(Ntimer) );
}
int main(int argc, char **argv) { char *precisionChoice; cutGetCmdLineArgumentstr(argc, (const char **)argv, "type", &precisionChoice); if(precisionChoice == NULL) useDoublePrecision = 0; else { if(!strcasecmp(precisionChoice, "double")) useDoublePrecision = 1; else useDoublePrecision = 0; } const int MAX_GPU_COUNT = 8; const int OPT_N = 256; const int PATH_N = 1 << 18; const unsigned int SEED = 777; //Input data array TOptionData optionData[OPT_N]; //Final GPU MC results TOptionValue callValueGPU[OPT_N]; //"Theoretical" call values by Black-Scholes formula float callValueBS[OPT_N]; //Solver config TOptionPlan optionSolver[MAX_GPU_COUNT]; //OS thread ID CUTThread threadID[MAX_GPU_COUNT]; //GPU number present in the system int GPU_N; int gpuBase, gpuIndex; int i; //Timer unsigned int hTimer; float time; double delta, ref, sumDelta, sumRef, sumReserve; cutilSafeCall( cudaGetDeviceCount(&GPU_N) ); cutilCheckError( cutCreateTimer(&hTimer) ); #ifdef _EMU GPU_N = 1; #endif printf("main(): generating input data...\n"); srand(123); for(i = 0; i < OPT_N; i++) { optionData[i].S = randFloat(5.0f, 50.0f); optionData[i].X = randFloat(10.0f, 25.0f); optionData[i].T = randFloat(1.0f, 5.0f); optionData[i].R = 0.06f; optionData[i].V = 0.10f; callValueGPU[i].Expected = -1.0f; callValueGPU[i].Confidence = -1.0f; } printf("main(): starting %i host threads...\n", GPU_N); //Get option count for each GPU for(i = 0; i < GPU_N; i++) optionSolver[i].optionCount = OPT_N / GPU_N; //Take into account cases with "odd" option counts for(i = 0; i < (OPT_N % GPU_N); i++) optionSolver[i].optionCount++; //Assign GPU option ranges gpuBase = 0; for(i = 0; i < GPU_N; i++) { optionSolver[i].device = i; optionSolver[i].optionData = optionData + gpuBase; optionSolver[i].callValue = callValueGPU + gpuBase; optionSolver[i].seed = SEED; optionSolver[i].pathN = PATH_N; gpuBase += optionSolver[i].optionCount; } //Start the timer cutilCheckError( cutResetTimer(hTimer) ); cutilCheckError( cutStartTimer(hTimer) 
); //Start CPU thread for each GPU for(gpuIndex = 0; gpuIndex < GPU_N; gpuIndex++) threadID[gpuIndex] = cutStartThread((CUT_THREADROUTINE)solverThread, &optionSolver[gpuIndex]); //Stop the timer cutilCheckError( cutStopTimer(hTimer) ); time = cutGetTimerValue(hTimer); printf("main(): waiting for GPU results...\n"); cutWaitForThreads(threadID, GPU_N); printf("main(): GPU statistics\n"); for(i = 0; i < GPU_N; i++) { printf("GPU #%i\n", optionSolver[i].device); printf("Options : %i\n", optionSolver[i].optionCount); printf("Simulation paths: %i\n", optionSolver[i].pathN); } printf("\nTotal time (ms.): %f\n", time); printf("Options per sec.: %f\n", OPT_N / (time * 0.001)); #ifdef DO_CPU printf("main(): running CPU MonteCarlo...\n"); TOptionValue callValueCPU; sumDelta = 0; sumRef = 0; for(i = 0; i < OPT_N; i++) { MonteCarloCPU( callValueCPU, optionData[i], NULL, PATH_N ); delta = fabs(callValueCPU.Expected - callValueGPU[i].Expected); ref = callValueCPU.Expected; sumDelta += delta; sumRef += fabs(ref); printf("Exp : %f | %f\t", callValueCPU.Expected, callValueGPU[i].Expected); printf("Conf: %f | %f\n", callValueCPU.Confidence, callValueGPU[i].Confidence); } printf("L1 norm: %E\n", sumDelta / sumRef); #endif printf("main(): comparing Monte Carlo and Black-Scholes results...\n"); sumDelta = 0; sumRef = 0; sumReserve = 0; for(i = 0; i < OPT_N; i++) { BlackScholesCall( callValueBS[i], optionData[i] ); delta = fabs(callValueBS[i] - callValueGPU[i].Expected); ref = callValueBS[i]; sumDelta += delta; sumRef += fabs(ref); if(delta > 1e-6) sumReserve += callValueGPU[i].Confidence / delta; #ifdef PRINT_RESULTS printf("BS: %f; delta: %E\n", callValueBS[i], delta); #endif } sumReserve /= OPT_N; printf("L1 norm : %E\n", sumDelta / sumRef); printf("Average reserve: %f\n", sumReserve); printf((sumReserve > 1.0f) ? "PASSED\n" : "FAILED.\n"); printf("Shutting down...\n"); cutilCheckError( cutDeleteTimer(hTimer) ); cutilExit(argc, argv); }
void displayFunc(void){ cutStartTimer(hTimer); TColor *d_dst = NULL; size_t num_bytes; if(frameCounter++ == 0) cutResetTimer(hTimer); // DEPRECATED: cutilSafeCall(cudaGLMapBufferObject((void**)&d_dst, gl_PBO)); cutilSafeCall(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0)); cutilCheckMsg("cudaGraphicsMapResources failed"); cutilSafeCall(cudaGraphicsResourceGetMappedPointer((void**)&d_dst, &num_bytes, cuda_pbo_resource)); cutilCheckMsg("cudaGraphicsResourceGetMappedPointer failed"); cutilSafeCall( CUDA_Bind2TextureArray() ); runImageFilters(d_dst); cutilSafeCall( CUDA_UnbindTexture() ); // DEPRECATED: cutilSafeCall(cudaGLUnmapBufferObject(gl_PBO)); cutilSafeCall(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0)); if (g_bFBODisplay) { g_FrameBufferObject->bindRenderPath(); } // Common display code path { glClear(GL_COLOR_BUFFER_BIT); glTexSubImage2D( GL_TEXTURE_2D, 0, 0, 0, imageW, imageH, GL_RGBA, GL_UNSIGNED_BYTE, BUFFER_DATA(0) ); glBegin(GL_TRIANGLES); glTexCoord2f(0, 0); glVertex2f(-1, -1); glTexCoord2f(2, 0); glVertex2f(+3, -1); glTexCoord2f(0, 2); glVertex2f(-1, +3); glEnd(); glFinish(); } if (g_bFBODisplay) { g_FrameBufferObject->unbindRenderPath(); glBindTexture(GL_TEXTURE_2D, 0); } if (g_CheckRender && g_CheckRender->IsQAReadback() && g_Verify) { printf("> (Frame %d) readback BackBuffer\n", frameCount); if (g_bFBODisplay) { g_CheckRender->readback( imageW, imageH, g_FrameBufferObject->getFbo() ); } else { g_CheckRender->readback( imageW, imageH ); } g_CheckRender->savePPM ( sOriginal[g_Kernel], true, NULL ); if (!g_CheckRender->PPMvsPPM(sOriginal[g_Kernel], sReference[g_Kernel], MAX_EPSILON_ERROR, 0.15f)) { g_TotalErrors++; } g_Verify = false; } if(frameCounter == frameN){ frameCounter = 0; if(g_FPS){ printf("FPS: %3.1f\n", frameN / (cutGetTimerValue(hTimer) * 0.001) ); g_FPS = false; } } glutSwapBuffers(); cutStopTimer(hTimer); computeFPS(); }
void display() { cutilCheckError(cutStartTimer(timer)); // update the simulation if (!bPause) { psystem->setIterations(iterations); psystem->setDamping(damping); psystem->setGravity(-gravity); psystem->setCollideSpring(collideSpring); psystem->setCollideDamping(collideDamping); psystem->setCollideShear(collideShear); psystem->setCollideAttraction(collideAttraction); psystem->update(timestep); renderer->setVertexBuffer(psystem->getCurrentReadBuffer(), psystem->getNumParticles()); } else { usleep(32666); } // render glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); // view transform glMatrixMode(GL_MODELVIEW); glLoadIdentity(); for (int c = 0; c < 3; ++c) { camera_trans_lag[c] += (camera_trans[c] - camera_trans_lag[c]) * inertia; camera_rot_lag[c] += (camera_rot[c] - camera_rot_lag[c]) * inertia; } glTranslatef(camera_trans_lag[0], camera_trans_lag[1], camera_trans_lag[2]); glRotatef(camera_rot_lag[0], 1.0, 0.0, 0.0); glRotatef(camera_rot_lag[1], 0.0, 1.0, 0.0); glGetFloatv(GL_MODELVIEW_MATRIX, modelView); // cube glColor3f(1.0, 1.0, 1.0); glutWireCube(2.0); // collider glPushMatrix(); float4 p = psystem->getColliderPos(); glTranslatef(p.x, p.y, p.z); glColor3f(1.0, 0.0, 0.0); glutSolidSphere(psystem->getColliderRadius(), 20, 10); glPopMatrix(); if (displayEnabled) { renderer->display(displayMode); } if (displaySliders) { glDisable(GL_DEPTH_TEST); glBlendFunc(GL_ONE_MINUS_DST_COLOR, GL_ZERO); // invert color glEnable(GL_BLEND); params->Render(0, 0); glDisable(GL_BLEND); glEnable(GL_DEPTH_TEST); } cutilCheckError(cutStopTimer(timer)); glutSwapBuffers(); fpsCount++; // this displays the frame rate updated every second (independent of frame rate) if (fpsCount >= fpsLimit) { char fps[256]; float ifps = 1.f / (cutGetAverageTimerValue(timer) / 1000.f); sprintf(fps, "CUDA particles (%d particles): %3.1f fps", numParticles, ifps); glutSetWindowTitle(fps); fpsCount = 0; fpsLimit = (ifps > 1.f) ? 
(int)ifps : 1; if (bPause) fpsLimit = 0; cutilCheckError(cutResetTimer(timer)); } glutReportErrors(); }
int main(int argc, char **argv) { // Start logs shrSetLogFileName ("quasirandomGenerator.txt"); shrLog("%s Starting...\n\n", argv[0]); unsigned int useDoublePrecision; char *precisionChoice; cutGetCmdLineArgumentstr(argc, (const char **)argv, "type", &precisionChoice); if(precisionChoice == NULL) useDoublePrecision = 0; else{ if(!strcasecmp(precisionChoice, "double")) useDoublePrecision = 1; else useDoublePrecision = 0; } unsigned int tableCPU[QRNG_DIMENSIONS][QRNG_RESOLUTION]; float *h_OutputGPU; float *d_Output; int dim, pos; double delta, ref, sumDelta, sumRef, L1norm, gpuTime; unsigned int hTimer; if(sizeof(INT64) != 8){ shrLog("sizeof(INT64) != 8\n"); return 0; } // use command-line specified CUDA device, otherwise use device with highest Gflops/s if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) cutilDeviceInit(argc, argv); else cudaSetDevice( cutGetMaxGflopsDeviceId() ); cutilCheckError(cutCreateTimer(&hTimer)); int deviceIndex; cutilSafeCall(cudaGetDevice(&deviceIndex)); cudaDeviceProp deviceProp; cutilSafeCall(cudaGetDeviceProperties(&deviceProp, deviceIndex)); int version = deviceProp.major * 10 + deviceProp.minor; if(useDoublePrecision && version < 13){ shrLog("Double precision not supported.\n"); cudaThreadExit(); return 0; } shrLog("Allocating GPU memory...\n"); cutilSafeCall( cudaMalloc((void **)&d_Output, QRNG_DIMENSIONS * N * sizeof(float)) ); shrLog("Allocating CPU memory...\n"); h_OutputGPU = (float *)malloc(QRNG_DIMENSIONS * N * sizeof(float)); shrLog("Initializing QRNG tables...\n\n"); initQuasirandomGenerator(tableCPU); if(useDoublePrecision) initTable_SM13(tableCPU); else initTable_SM10(tableCPU); shrLog("Testing QRNG...\n\n"); cutilSafeCall( cudaMemset(d_Output, 0, QRNG_DIMENSIONS * N * sizeof(float)) ); int numIterations = 20; for (int i = -1; i < numIterations; i++) { if (i == 0) { cutilSafeCall( cudaThreadSynchronize() ); cutilCheckError( cutResetTimer(hTimer) ); cutilCheckError( cutStartTimer(hTimer) ); } 
if(useDoublePrecision) quasirandomGenerator_SM13(d_Output, 0, N); else quasirandomGenerator_SM10(d_Output, 0, N); } cutilSafeCall( cudaThreadSynchronize() ); cutilCheckError(cutStopTimer(hTimer)); gpuTime = cutGetTimerValue(hTimer)/(double)numIterations*1e-3; shrLogEx(LOGBOTH | MASTER, 0, "quasirandomGenerator, Throughput = %.4f GNumbers/s, Time = %.5f s, Size = %u Numbers, NumDevsUsed = %u, Workgroup = %u\n", (double)QRNG_DIMENSIONS * (double)N * 1.0E-9 / gpuTime, gpuTime, QRNG_DIMENSIONS*N, 1, 128*QRNG_DIMENSIONS); shrLog("\nReading GPU results...\n"); cutilSafeCall( cudaMemcpy(h_OutputGPU, d_Output, QRNG_DIMENSIONS * N * sizeof(float), cudaMemcpyDeviceToHost) ); shrLog("Comparing to the CPU results...\n\n"); sumDelta = 0; sumRef = 0; for(dim = 0; dim < QRNG_DIMENSIONS; dim++) for(pos = 0; pos < N; pos++){ ref = getQuasirandomValue63(pos, dim); delta = (double)h_OutputGPU[dim * N + pos] - ref; sumDelta += fabs(delta); sumRef += fabs(ref); } shrLog("L1 norm: %E\n", sumDelta / sumRef); shrLog("\nTesting inverseCNDgpu()...\n\n"); cutilSafeCall( cudaMemset(d_Output, 0, QRNG_DIMENSIONS * N * sizeof(float)) ); for (int i = -1; i < numIterations; i++) { if (i == 0) { cutilSafeCall( cudaThreadSynchronize() ); cutilCheckError( cutResetTimer(hTimer) ); cutilCheckError( cutStartTimer(hTimer) ); } if(useDoublePrecision) inverseCND_SM13(d_Output, NULL, QRNG_DIMENSIONS * N); else inverseCND_SM10(d_Output, NULL, QRNG_DIMENSIONS * N); } cutilSafeCall( cudaThreadSynchronize() ); cutilCheckError(cutStopTimer(hTimer)); gpuTime = cutGetTimerValue(hTimer)/(double)numIterations*1e-3; shrLogEx(LOGBOTH | MASTER, 0, "quasirandomGenerator-inverse, Throughput = %.4f GNumbers/s, Time = %.5f s, Size = %u Numbers, NumDevsUsed = %u, Workgroup = %u\n", (double)QRNG_DIMENSIONS * (double)N * 1E-9 / gpuTime, gpuTime, QRNG_DIMENSIONS*N, 1, 128); shrLog("Reading GPU results...\n"); cutilSafeCall( cudaMemcpy(h_OutputGPU, d_Output, QRNG_DIMENSIONS * N * sizeof(float), cudaMemcpyDeviceToHost) ); 
shrLog("\nComparing to the CPU results...\n"); sumDelta = 0; sumRef = 0; for(pos = 0; pos < QRNG_DIMENSIONS * N; pos++){ double p = (double)(pos + 1) / (double)(QRNG_DIMENSIONS * N + 1); ref = MoroInvCNDcpu(p); delta = (double)h_OutputGPU[pos] - ref; sumDelta += fabs(delta); sumRef += fabs(ref); } shrLog("L1 norm: %E\n\n", L1norm = sumDelta / sumRef); shrLog((L1norm < 1E-6) ? "PASSED\n\n" : "FAILED\n\n"); shrLog("Shutting down...\n"); cutilCheckError(cutDeleteTimer(hTimer)); free(h_OutputGPU); cutilSafeCall( cudaFree(d_Output) ); cudaThreadExit(); shrEXIT(argc, (const char**)argv); }
void shmoo(int minN, int maxN, int maxThreads, int maxBlocks, ReduceType datatype) { fprintf(stderr, "Shmoo wasn't implemented in this modified kernel!\n"); exit(1); // create random input data on CPU unsigned int bytes = maxN * sizeof(T); T *h_idata = (T*) malloc(bytes); for(int i = 0; i < maxN; i++) { // Keep the numbers small so we don't get truncation error in the sum if (datatype == REDUCE_INT) h_idata[i] = (T)(rand() & 0xFF); else h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX; } int maxNumBlocks = MIN( maxN / maxThreads, MAX_BLOCK_DIM_SIZE); // allocate mem for the result on host side T* h_odata = (T*) malloc(maxNumBlocks*sizeof(T)); // allocate device memory and data T* d_idata = NULL; T* d_odata = NULL; cutilSafeCallNoSync( cudaMalloc((void**) &d_idata, bytes) ); cutilSafeCallNoSync( cudaMalloc((void**) &d_odata, maxNumBlocks*sizeof(T)) ); // copy data directly to device memory cutilSafeCallNoSync( cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice) ); cutilSafeCallNoSync( cudaMemcpy(d_odata, h_idata, maxNumBlocks*sizeof(T), cudaMemcpyHostToDevice) ); // warm-up for (int kernel = 0; kernel < 7; kernel++) { sumreduce<T>(maxN, maxThreads, maxNumBlocks, kernel, d_idata, d_odata); } int testIterations = 100; unsigned int timer = 0; cutilCheckError( cutCreateTimer( &timer)); // print headers shrLog("Time in milliseconds for various numbers of elements for each kernel\n\n\n"); shrLog("Kernel"); for (int i = minN; i <= maxN; i *= 2) { shrLog(", %d", i); } for (int kernel = 0; kernel < 7; kernel++) { shrLog("\n%d", kernel); for (int i = minN; i <= maxN; i *= 2) { cutResetTimer(timer); int numBlocks = 0; int numThreads = 0; getNumBlocksAndThreads(kernel, i, maxBlocks, maxThreads, numBlocks, numThreads); float reduceTime; if( numBlocks <= MAX_BLOCK_DIM_SIZE ) { benchmarkReduceSum(i, numThreads, numBlocks, maxThreads, maxBlocks, kernel, testIterations, false, 1, timer, h_odata, d_idata, d_odata); reduceTime = cutGetAverageTimerValue(timer); } else { reduceTime = 
-1.0; } shrLog(", %.5f", reduceTime); } } // cleanup cutilCheckError(cutDeleteTimer(timer)); free(h_idata); free(h_odata); cutilSafeCallNoSync(cudaFree(d_idata)); cutilSafeCallNoSync(cudaFree(d_odata)); }
int main(int argc, char** argv) { printf("[%s]\n", sSDKsample); if (argc > 1) { if (cutCheckCmdLineFlag(argc, (const char **)argv, "help")) { printHelp(); } if (cutCheckCmdLineFlag(argc, (const char **)argv, "qatest") || cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt")) { g_bQAReadback = true; fpsLimit = frameCheckNumber; } if (cutCheckCmdLineFlag(argc, (const char **)argv, "glverify")) { g_bOpenGLQA = true; fpsLimit = frameCheckNumber; } if (cutCheckCmdLineFlag(argc, (const char **)argv, "fbo")) { g_bFBODisplay = true; fpsLimit = frameCheckNumber; } } if (g_bQAReadback) { runAutoTest(argc, argv); } else { // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. initGL( &argc, argv ); // use command-line specified CUDA device if possible, otherwise search for capable device if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) { cutilGLDeviceInit(argc, argv); int device; cudaGetDevice( &device ); if( checkCUDAProfile( device ) == false ) { cudaThreadExit(); cutilExit(argc, argv); } } else { //cudaGLSetGLDevice (cutGetMaxGflopsDeviceId() ); int dev = findCapableDevice(argc, argv); if( dev != -1 ) cudaGLSetGLDevice( dev ); else { cudaThreadExit(); cutilExit(argc, argv); } } cutilCheckError(cutCreateTimer(&timer)); cutilCheckError(cutResetTimer(timer)); glutDisplayFunc(display); glutKeyboardFunc(keyboard); glutReshapeFunc(reshape); glutIdleFunc(idle); if (g_bOpenGLQA) { loadDefaultImage( argc, argv ); } if (argc > 1) { char *filename; if (cutGetCmdLineArgumentstr(argc, (const char **)argv, "file", &filename)) { initializeData(filename, argc, argv); } } else { loadDefaultImage( argc, argv ); } // If code is not printing the USage, then we execute this path. 
if (!bQuit) { if (g_bOpenGLQA) { g_CheckRender = new CheckBackBuffer(wWidth, wHeight, 4); g_CheckRender->setPixelFormat(GL_BGRA); g_CheckRender->setExecPath(argv[0]); g_CheckRender->EnableQAReadback(true); } printf("I: display image\n"); printf("T: display Sobel edge detection (computed with tex)\n"); printf("S: display Sobel edge detection (computed with tex+shared memory)\n"); printf("Use the '-' and '=' keys to change the brightness.\n"); printf("b: switch block filter operation (mean/Sobel)\n"); printf("p: swtich point filter operation (threshold on/off)\n"); fflush(stdout); atexit(cleanup); glutMainLoop(); } } cudaThreadExit(); cutilExit(argc, argv); }
int main(int argc, char** argv) { pArgc = &argc; pArgv = argv; shrQAStart(argc, argv); if (argc > 1) { if (cutCheckCmdLineFlag(argc, (const char **)argv, "help")) { printHelp(); } if (cutCheckCmdLineFlag(argc, (const char **)argv, "qatest") || cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt")) { g_bQAReadback = true; fpsLimit = frameCheckNumber; } if (cutCheckCmdLineFlag(argc, (const char **)argv, "glverify")) { g_bOpenGLQA = true; fpsLimit = frameCheckNumber; } if (cutCheckCmdLineFlag(argc, (const char **)argv, "fbo")) { g_bFBODisplay = true; fpsLimit = frameCheckNumber; } } if (g_bQAReadback) { runAutoTest(argc, argv); } else { if ( cutCheckCmdLineFlag(argc, (const char **)argv, "device")) { printf(" This SDK does not explicitly support -device=n when running with OpenGL.\n"); printf(" When specifying -device=n (n=0,1,2,....) the sample must not use OpenGL.\n"); printf(" See details below to run without OpenGL:\n\n"); printf(" > %s -device=n -qatest\n\n", argv[0]); printf("exiting...\n"); shrQAFinishExit(argc, (const char **)argv, QA_PASSED); } // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. initGL( &argc, argv ); //cudaGLSetGLDevice (cutGetMaxGflopsDeviceId() ); int dev = findCapableDevice(argc, argv); if( dev != -1 ) { cudaGLSetGLDevice( dev ); } else { shrQAFinishExit2(g_bQAReadback, *pArgc, (const char **)pArgv, QA_PASSED); } cutilCheckError(cutCreateTimer(&timer)); cutilCheckError(cutResetTimer(timer)); glutDisplayFunc(display); glutKeyboardFunc(keyboard); glutReshapeFunc(reshape); if (g_bOpenGLQA) { loadDefaultImage( argc, argv ); } if (argc > 1) { char *filename; if (cutGetCmdLineArgumentstr(argc, (const char **)argv, "file", &filename)) { initializeData(filename, argc, argv); } } else { loadDefaultImage( argc, argv ); } // If code is not printing the USage, then we execute this path. 
if (!bQuit) { if (g_bOpenGLQA) { g_CheckRender = new CheckBackBuffer(wWidth, wHeight, 4); g_CheckRender->setPixelFormat(GL_BGRA); g_CheckRender->setExecPath(argv[0]); g_CheckRender->EnableQAReadback(true); } printf("I: display Image (no filtering)\n"); printf("T: display Sobel Edge Detection (Using Texture)\n"); printf("S: display Sobel Edge Detection (Using SMEM+Texture)\n"); printf("Use the '-' and '=' keys to change the brightness.\n"); printf("b: switch block filter operation (mean/Sobel)\n"); printf("p: switch point filter operation (threshold on/off)\n"); fflush(stdout); atexit(cleanup); glutTimerFunc(REFRESH_DELAY, timerEvent,0); glutMainLoop(); } } cutilDeviceReset(); shrQAFinishExit(argc, (const char **)argv, QA_PASSED); }
int main(int argc, char **argv) { uchar *h_Data; uint *h_HistogramCPU, *h_HistogramGPU; uchar *d_Data; uint *d_Histogram; uint hTimer; int PassFailFlag = 1; uint byteCount = 64 * 1048576; uint uiSizeMult = 1; cudaDeviceProp deviceProp; deviceProp.major = 0; deviceProp.minor = 0; int dev; shrQAStart(argc, argv); // set logfile name and start logs shrSetLogFileName ("histogram.txt"); //Use command-line specified CUDA device, otherwise use device with highest Gflops/s if( shrCheckCmdLineFlag(argc, (const char**)argv, "device") ) { dev = cutilDeviceInit(argc, argv); if (dev < 0) { printf("No CUDA Capable Devices found, exiting...\n"); shrQAFinishExit(argc, (const char **)argv, QA_WAIVED); } } else { cudaSetDevice( dev = cutGetMaxGflopsDeviceId() ); cutilSafeCall( cudaChooseDevice(&dev, &deviceProp) ); } cutilSafeCall( cudaGetDeviceProperties(&deviceProp, dev) ); printf("CUDA device [%s] has %d Multi-Processors, Compute %d.%d\n", deviceProp.name, deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor); int version = deviceProp.major * 0x10 + deviceProp.minor; if(version < 0x11) { printf("There is no device supporting a minimum of CUDA compute capability 1.1 for this SDK sample\n"); cutilDeviceReset(); shrQAFinishExit(argc, (const char **)argv, QA_WAIVED); } cutilCheckError(cutCreateTimer(&hTimer)); // Optional Command-line multiplier to increase size of array to histogram if (shrGetCmdLineArgumentu(argc, (const char**)argv, "sizemult", &uiSizeMult)) { uiSizeMult = CLAMP(uiSizeMult, 1, 10); byteCount *= uiSizeMult; } shrLog("Initializing data...\n"); shrLog("...allocating CPU memory.\n"); h_Data = (uchar *)malloc(byteCount); h_HistogramCPU = (uint *)malloc(HISTOGRAM256_BIN_COUNT * sizeof(uint)); h_HistogramGPU = (uint *)malloc(HISTOGRAM256_BIN_COUNT * sizeof(uint)); shrLog("...generating input data\n"); srand(2009); for(uint i = 0; i < byteCount; i++) h_Data[i] = rand() % 256; shrLog("...allocating GPU memory and copying input data\n\n"); cutilSafeCall( 
cudaMalloc((void **)&d_Data, byteCount ) ); cutilSafeCall( cudaMalloc((void **)&d_Histogram, HISTOGRAM256_BIN_COUNT * sizeof(uint) ) ); cutilSafeCall( cudaMemcpy(d_Data, h_Data, byteCount, cudaMemcpyHostToDevice) ); { shrLog("Starting up 64-bin histogram...\n\n"); initHistogram64(); shrLog("Running 64-bin GPU histogram for %u bytes (%u runs)...\n\n", byteCount, numRuns); for(int iter = -1; iter < numRuns; iter++){ //iter == -1 -- warmup iteration if(iter == 0){ cutilSafeCall( cutilDeviceSynchronize() ); cutilCheckError( cutResetTimer(hTimer) ); cutilCheckError( cutStartTimer(hTimer) ); } histogram64(d_Histogram, d_Data, byteCount); } cutilSafeCall( cutilDeviceSynchronize() ); cutilCheckError( cutStopTimer(hTimer)); double dAvgSecs = 1.0e-3 * (double)cutGetTimerValue(hTimer) / (double)numRuns; shrLog("histogram64() time (average) : %.5f sec, %.4f MB/sec\n\n", dAvgSecs, ((double)byteCount * 1.0e-6) / dAvgSecs); shrLogEx(LOGBOTH | MASTER, 0, "histogram64, Throughput = %.4f MB/s, Time = %.5f s, Size = %u Bytes, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * (double)byteCount / dAvgSecs), dAvgSecs, byteCount, 1, HISTOGRAM64_THREADBLOCK_SIZE); shrLog("\nValidating GPU results...\n"); shrLog(" ...reading back GPU results\n"); cutilSafeCall( cudaMemcpy(h_HistogramGPU, d_Histogram, HISTOGRAM64_BIN_COUNT * sizeof(uint), cudaMemcpyDeviceToHost) ); shrLog(" ...histogram64CPU()\n"); histogram64CPU( h_HistogramCPU, h_Data, byteCount ); shrLog(" ...comparing the results...\n"); for(uint i = 0; i < HISTOGRAM64_BIN_COUNT; i++) if(h_HistogramGPU[i] != h_HistogramCPU[i]) PassFailFlag = 0; shrLog(PassFailFlag ? 
" ...64-bin histograms match\n\n" : " ***64-bin histograms do not match!!!***\n\n" ); shrLog("Shutting down 64-bin histogram...\n\n\n"); closeHistogram64(); } { shrLog("Initializing 256-bin histogram...\n"); initHistogram256(); shrLog("Running 256-bin GPU histogram for %u bytes (%u runs)...\n\n", byteCount, numRuns); for(int iter = -1; iter < numRuns; iter++){ //iter == -1 -- warmup iteration if(iter == 0){ cutilSafeCall( cutilDeviceSynchronize() ); cutilCheckError( cutResetTimer(hTimer) ); cutilCheckError( cutStartTimer(hTimer) ); } histogram256(d_Histogram, d_Data, byteCount); } cutilSafeCall( cutilDeviceSynchronize() ); cutilCheckError( cutStopTimer(hTimer)); double dAvgSecs = 1.0e-3 * (double)cutGetTimerValue(hTimer) / (double)numRuns; shrLog("histogram256() time (average) : %.5f sec, %.4f MB/sec\n\n", dAvgSecs, ((double)byteCount * 1.0e-6) / dAvgSecs); shrLogEx(LOGBOTH | MASTER, 0, "histogram256, Throughput = %.4f MB/s, Time = %.5f s, Size = %u Bytes, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * (double)byteCount / dAvgSecs), dAvgSecs, byteCount, 1, HISTOGRAM256_THREADBLOCK_SIZE); shrLog("\nValidating GPU results...\n"); shrLog(" ...reading back GPU results\n"); cutilSafeCall( cudaMemcpy(h_HistogramGPU, d_Histogram, HISTOGRAM256_BIN_COUNT * sizeof(uint), cudaMemcpyDeviceToHost) ); shrLog(" ...histogram256CPU()\n"); histogram256CPU( h_HistogramCPU, h_Data, byteCount ); shrLog(" ...comparing the results\n"); for(uint i = 0; i < HISTOGRAM256_BIN_COUNT; i++) if(h_HistogramGPU[i] != h_HistogramCPU[i]) PassFailFlag = 0; shrLog(PassFailFlag ? 
" ...256-bin histograms match\n\n" : " ***256-bin histograms do not match!!!***\n\n" ); shrLog("Shutting down 256-bin histogram...\n\n\n"); closeHistogram256(); } shrLog("Shutting down...\n"); cutilCheckError(cutDeleteTimer(hTimer)); cutilSafeCall( cudaFree(d_Histogram) ); cutilSafeCall( cudaFree(d_Data) ); free(h_HistogramGPU); free(h_HistogramCPU); free(h_Data); cutilDeviceReset(); shrLog("%s - Test Summary\n", sSDKsample); // pass or fail (for both 64 bit and 256 bit histograms) shrQAFinishExit(argc, (const char **)argv, (PassFailFlag ? QA_PASSED : QA_FAILED)); }
//////////////////////////////////////////////////////////////////////////////// // Main program //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv){ const unsigned int OPT_N_MAX = 512; unsigned int useDoublePrecision; printf("[binomialOptions]\n"); int devID = cutilDeviceInit(argc, argv); if (devID < 0) { printf("exiting...\n"); cutilExit(argc, argv); exit(0); } cutilSafeCall(cudaGetDevice(&devID)); cudaDeviceProp deviceProp; cutilSafeCall(cudaGetDeviceProperties(&deviceProp, devID)); char *precisionChoice; cutGetCmdLineArgumentstr(argc, (const char **)argv, "type", &precisionChoice); if(precisionChoice == NULL) { useDoublePrecision = 0; } else { if(!strcasecmp(precisionChoice, "double")) useDoublePrecision = 1; else useDoublePrecision = 0; } printf(useDoublePrecision ? "Using double precision...\n" : "Using single precision...\n"); const int OPT_N = deviceEmulation() ? 1 : OPT_N_MAX; TOptionData optionData[OPT_N_MAX]; float callValueBS[OPT_N_MAX], callValueGPU[OPT_N_MAX], callValueCPU[OPT_N_MAX]; double sumDelta, sumRef, gpuTime, errorVal; unsigned int hTimer; int i; cutilCheckError( cutCreateTimer(&hTimer) ); int version = deviceProp.major * 10 + deviceProp.minor; if(useDoublePrecision && version < 13){ printf("Double precision is not supported.\n"); return 0; } printf("Generating input data...\n"); //Generate options set srand(123); for(i = 0; i < OPT_N; i++){ optionData[i].S = randData(5.0f, 30.0f); optionData[i].X = randData(1.0f, 100.0f); optionData[i].T = randData(0.25f, 10.0f); optionData[i].R = 0.06f; optionData[i].V = 0.10f; BlackScholesCall(callValueBS[i], optionData[i]); } printf("Running GPU binomial tree...\n"); cutilSafeCall( cudaThreadSynchronize() ); cutilCheckError( cutResetTimer(hTimer) ); cutilCheckError( cutStartTimer(hTimer) ); if(useDoublePrecision) binomialOptions_SM13(callValueGPU, optionData, OPT_N); else binomialOptions_SM10(callValueGPU, optionData, OPT_N); cutilSafeCall( 
cudaThreadSynchronize() ); cutilCheckError( cutStopTimer(hTimer) ); gpuTime = cutGetTimerValue(hTimer); printf("Options count : %i \n", OPT_N); printf("Time steps : %i \n", NUM_STEPS); printf("binomialOptionsGPU() time: %f msec\n", gpuTime); printf("Options per second : %f \n", OPT_N / (gpuTime * 0.001)); printf("Running CPU binomial tree...\n"); for(i = 0; i < OPT_N; i++) binomialOptionsCPU(callValueCPU[i], optionData[i]); printf("Comparing the results...\n"); sumDelta = 0; sumRef = 0; printf("GPU binomial vs. Black-Scholes\n"); for(i = 0; i < OPT_N; i++){ sumDelta += fabs(callValueBS[i] - callValueGPU[i]); sumRef += fabs(callValueBS[i]); } if(sumRef >1E-5) printf("L1 norm: %E\n", sumDelta / sumRef); else printf("Avg. diff: %E\n", sumDelta / (double)OPT_N); printf("CPU binomial vs. Black-Scholes\n"); sumDelta = 0; sumRef = 0; for(i = 0; i < OPT_N; i++){ sumDelta += fabs(callValueBS[i]- callValueCPU[i]); sumRef += fabs(callValueBS[i]); } if(sumRef >1E-5) printf("L1 norm: %E\n", sumDelta / sumRef); else printf("Avg. diff: %E\n", sumDelta / (double)OPT_N); printf("CPU binomial vs. GPU binomial\n"); sumDelta = 0; sumRef = 0; for(i = 0; i < OPT_N; i++){ sumDelta += fabs(callValueGPU[i] - callValueCPU[i]); sumRef += callValueCPU[i]; } if(sumRef > 1E-5) printf("L1 norm: %E\n", errorVal = sumDelta / sumRef); else printf("Avg. diff: %E\n", errorVal = sumDelta / (double)OPT_N); printf("Shutting down...\n"); printf("\n[binomialOptions] - Test Summary:\n"); printf((errorVal < 5e-4) ? "PASSED\n" : "FAILED\n"); cutilCheckError( cutDeleteTimer(hTimer) ); cudaThreadExit(); cutilExit(argc, argv); }
void display() { static double gflops = 0; static double ifps = 0; static double interactionsPerSecond = 0; // update the simulation if (!bPause) { if (cycleDemo && (cutGetTimerValue(demoTimer) > demoTime)) { activeDemo = (activeDemo + 1) % numDemos; selectDemo(activeDemo); } updateSimulation(); if (!useCpu) cudaEventRecord(hostMemSyncEvent, 0); // insert an event to wait on before rendering } glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); if (displayEnabled) { // view transform { glMatrixMode(GL_MODELVIEW); glLoadIdentity(); for (int c = 0; c < 3; ++c) { camera_trans_lag[c] += (camera_trans[c] - camera_trans_lag[c]) * inertia; camera_rot_lag[c] += (camera_rot[c] - camera_rot_lag[c]) * inertia; } glTranslatef(camera_trans_lag[0], camera_trans_lag[1], camera_trans_lag[2]); glRotatef(camera_rot_lag[0], 1.0, 0.0, 0.0); glRotatef(camera_rot_lag[1], 0.0, 1.0, 0.0); } displayNBodySystem(); // display user interface if (bShowSliders) { glBlendFunc(GL_ONE_MINUS_DST_COLOR, GL_ZERO); // invert color glEnable(GL_BLEND); paramlist->Render(0, 0); glDisable(GL_BLEND); } if (bFullscreen) { beginWinCoords(); char msg0[256], msg1[256], msg2[256]; if (bDispInteractions) { sprintf(msg1, "%0.2f billion interactions per second", interactionsPerSecond); } else { sprintf(msg1, "%0.2f GFLOP/s", gflops); } sprintf(msg0, "%s", deviceName); sprintf(msg2, "%0.2f FPS [%s | %d bodies]", ifps, fp64 ? 
"double precision" : "single precision", numBodies); glBlendFunc(GL_ONE_MINUS_DST_COLOR, GL_ZERO); // invert color glEnable(GL_BLEND); glColor3f(0.46f, 0.73f, 0.0f); glPrint(80, glutGet(GLUT_WINDOW_HEIGHT) - 122, msg0, GLUT_BITMAP_TIMES_ROMAN_24); glColor3f(1.0f, 1.0f, 1.0f); glPrint(80, glutGet(GLUT_WINDOW_HEIGHT) - 96, msg2, GLUT_BITMAP_TIMES_ROMAN_24); glColor3f(1.0f, 1.0f, 1.0f); glPrint(80, glutGet(GLUT_WINDOW_HEIGHT) - 70, msg1, GLUT_BITMAP_TIMES_ROMAN_24); glDisable(GL_BLEND); endWinCoords(); } glutSwapBuffers(); } fpsCount++; // this displays the frame rate updated every second (independent of frame rate) if (fpsCount >= fpsLimit) { char fps[256]; float milliseconds = 1; // stop timer if (useCpu) { milliseconds = cutGetTimerValue(timer); cutilCheckError(cutResetTimer(timer)); } else { cutilSafeCall(cudaEventRecord(stopEvent, 0)); cutilSafeCall(cudaEventSynchronize(stopEvent)); cutilSafeCall( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent)); } milliseconds /= (float)fpsCount; computePerfStats(interactionsPerSecond, gflops, milliseconds, 1); ifps = 1.f / (milliseconds / 1000.f); sprintf(fps, "CUDA N-Body (%d bodies): " "%0.1f fps | %0.1f BIPS | %0.1f GFLOP/s | %s", numBodies, ifps, interactionsPerSecond, gflops, fp64 ? "double precision" : "single precision"); glutSetWindowTitle(fps); fpsCount = 0; fpsLimit = (ifps > 1.f) ? (int)ifps : 1; if (bPause) fpsLimit = 0; // restart timer if (!useCpu) { cutilSafeCall(cudaEventRecord(startEvent, 0)); } } glutReportErrors(); }
int main(int argc, char** argv) { // EDISON ////////////////////////////////////////////////////////////////// sigmaS = 7.0f; sigmaR = 6.5f; edison.minRegion = 20.0f; cutLoadPPMub("image.ppm", &edison.inputImage_, &width, &height); edison.meanShift(); cutSavePPMub("segmimage.ppm", edison.segmImage_, width, height); cutSavePPMub("filtimage.ppm", edison.filtImage_, width, height); unsigned char data[height * width]; memset(data, 0, height * width * sizeof(unsigned char)); for(int i = 0; i < edison.numBoundaries_; i++) { data[edison.boundaries_[i]] = 255; } cutSavePGMub("bndyimage.pgm", data, width, height); //return 0; // EDISON ////////////////////////////////////////////////////////////////// if (argc > 1) { if (cutCheckCmdLineFlag(argc, (const char **)argv, "help")) { printHelp(); } if (cutCheckCmdLineFlag(argc, (const char **)argv, "qatest") || cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt")) { g_bQAReadback = true; fpsLimit = frameCheckNumber; } if (cutCheckCmdLineFlag(argc, (const char **)argv, "glverify")) { g_bOpenGLQA = true; fpsLimit = frameCheckNumber; } if (cutCheckCmdLineFlag(argc, (const char **)argv, "fbo")) { g_bFBODisplay = true; fpsLimit = frameCheckNumber; } } if (g_bQAReadback) { runAutoTest(argc, argv); } else { // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. 
initGL( argc, argv ); // use command-line specified CUDA device, otherwise use device with highest Gflops/s if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) { cutilGLDeviceInit(argc, argv); } else { cudaGLSetGLDevice (cutGetMaxGflopsDeviceId() ); } int device; struct cudaDeviceProp prop; cudaGetDevice( &device ); cudaGetDeviceProperties( &prop, device ); if(!strncmp( "Tesla", prop.name, 5 )) { printf("This sample needs a card capable of OpenGL and display.\n"); printf("Please choose a different device with the -device=x argument.\n"); cudaThreadExit(); cutilExit(argc, argv); } cutilCheckError(cutCreateTimer(&timer)); cutilCheckError(cutResetTimer(timer)); glutDisplayFunc(display); glutKeyboardFunc(keyboard); glutReshapeFunc(reshape); glutIdleFunc(idle); if (g_bOpenGLQA) { loadDefaultImage( argv[0] ); } if (argc > 1) { char *filename; if (cutGetCmdLineArgumentstr(argc, (const char **)argv, "file", &filename)) { initializeData(filename); } } else { loadDefaultImage( argv[0]); } // If code is not printing the USage, then we execute this path. if (!bQuit) { if (g_bOpenGLQA) { g_CheckRender = new CheckBackBuffer(wWidth, wHeight, 4); g_CheckRender->setPixelFormat(GL_BGRA); g_CheckRender->setExecPath(argv[0]); g_CheckRender->EnableQAReadback(true); } printf("I: display image\n"); printf("T: display Sobel edge detection (computed with tex)\n"); printf("S: display Sobel edge detection (computed with tex+shared memory)\n"); printf("Use the '-' and '=' keys to change the brightness.\n"); fflush(stdout); atexit(cleanup); glutMainLoop(); } } cudaThreadExit(); cutilExit(argc, argv); }