Example #1
0
    void initGlWindow(int argc, char ** argv){ 
#ifdef USE_OPENGL
      cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() );

      if( !glfwInit() )
	{
	  cerr << "Failed to initalize GLFW" << endl;
	  exit( 1 );
	}

      if ( !glfwOpenWindow( width, height, 
			    8, 8, 8, 8, 8, 8,
			    GLFW_WINDOW ) )
	{
	  cerr << "Failed to open window" << endl;
	  exit( 1 );
	}

      glewInit();
      if (! glewIsSupported("GL_VERSION_2_0 ")) 
	{
	  cerr << "ERROR: Support for necessary OpenGL extensions missing." << endl;
	  exit( 1 );
	}
    
      //glEnable(GL_POINT_SMOOTH);

      glViewport(0, 0, height, width);
      glLoadIdentity();
      glOrtho(-1.0, 1.0, -1.0, 1.0, 0.0, 1.0);
#endif
    };
Example #2
0
    void GLManager::initGlWindow(){ 
      cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() );

      if( !glfwInit() )
	{
	  std::cerr << "Failed to initalize GLFW" << std::endl;
	  exit( 1 );
	}

      glfwOpenWindowHint( GLFW_WINDOW_NO_RESIZE, GL_TRUE );

      if ( !glfwOpenWindow( width, height, 
			    8, 8, 8, 8, 8, 8,
			    GLFW_WINDOW ) )
	{
	  std::cerr << "Failed to open window" << std::endl;
	  exit( 1 );
	}

      glewInit();
      if (! glewIsSupported("GL_VERSION_2_0 ")) 
	{
	  std::cerr << "ERROR: Support for necessary OpenGL extensions missing." << std::endl;
	  exit( 1 );
	}
    


      glViewport(0, 0, height, width);
      glLoadIdentity();
      glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0);

    };
Example #3
0
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
CUTBoolean runTest(int argc, char** argv)
{
    if (!cutCheckCmdLineFlag(argc, (const char **)argv, "noqatest") ||
		cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt")) 
	{
        g_bQAReadback = true;
        fpsLimit = frameCheckNumber;
    }

    // First initialize OpenGL context, so we can properly set the GL for CUDA.
    // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop.
    if (CUTFalse == initGL(argc, argv)) {
        return CUTFalse;
    }

	// use command-line specified CUDA device, otherwise use device with highest Gflops/s
    if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )
        cutilGLDeviceInit(argc, argv);
    else {
        cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() );
    }

    // Create the CUTIL timer
    cutilCheckError( cutCreateTimer( &timer));

    // register callbacks
    glutDisplayFunc(display);
    glutKeyboardFunc(keyboard);
    glutMouseFunc(mouse);
    glutMotionFunc(motion);

    if (g_bQAReadback) {
        g_CheckRender = new CheckBackBuffer(window_width, window_height, 4);
        g_CheckRender->setPixelFormat(GL_RGBA);
        g_CheckRender->setExecPath(argv[0]);
        g_CheckRender->EnableQAReadback(true);
    }

    // create VBO
    createVBO(&vbo);

    // run the cuda part
    runCuda(vbo);

    // check result of Cuda step
    checkResultCuda(argc, argv, vbo);

    atexit(cleanup);

    // start rendering mainloop
    glutMainLoop();

    cudaThreadExit();

	return CUTTrue;
}
Example #4
0
void runAutoTest(int argc, char **argv)
{
    printf("[%s] (automated testing w/ readback)\n", sSDKsample);

    if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) {
        cutilDeviceInit(argc, argv);
    } else {
        cudaSetDevice( cutGetMaxGflopsDeviceId() );
    }

    loadDefaultImage( argv[0] );

    if (argc > 1) {
        char *filename;
        if (cutGetCmdLineArgumentstr(argc, (const char **)argv, "file", &filename)) {
            initializeData(filename);
        }
    } else {
        loadDefaultImage( argv[0]);
    }

    g_CheckRender       = new CheckBackBuffer(imWidth, imHeight, sizeof(Pixel), false);
    g_CheckRender->setExecPath(argv[0]);

    Pixel *d_result;
    cutilSafeCall( cudaMalloc( (void **)&d_result, imWidth*imHeight*sizeof(Pixel)) );

    while (g_SobelDisplayMode <= 2) 
    {
        printf("AutoTest: %s <%s>\n", sSDKsample, filterMode[g_SobelDisplayMode]);

        sobelFilter(d_result, imWidth, imHeight, g_SobelDisplayMode, imageScale );

        cutilSafeCall( cudaThreadSynchronize() );

        cudaMemcpy(g_CheckRender->imageData(), d_result, imWidth*imHeight*sizeof(Pixel), cudaMemcpyDeviceToHost);

        g_CheckRender->savePGM(sOriginal[g_Index], false, NULL);

        if (!g_CheckRender->PGMvsPGM(sOriginal[g_Index], sReference[g_Index], MAX_EPSILON_ERROR, 0.15f)) {
            g_TotalErrors++;
        }
        g_Index++;
        g_SobelDisplayMode = (SobelDisplayMode)g_Index;
    }

    cutilSafeCall( cudaFree( d_result ) );
    delete g_CheckRender;

    if (!g_TotalErrors) 
        printf("TEST PASSED!\n");
    else 
        printf("TEST FAILED!\n");
}
Example #5
0
/**
 * Selects a GPU for the CUDA execution.
 * @param id id of the selected GPU.
 */
void selectGPU(int id) {
    int deviceId;
    if (id == -1) {
        deviceId = cutGetMaxGflopsDeviceId();
    } else {
        deviceId = id;
    }

    cutilSafeCall(cudaSetDevice( deviceId ));
    cutilCheckMsg("cudaSetDevice failed");
}
Example #6
0
void Application::_init()
{
    // Pick the best CUDA device
    const int deviceIdx = cutGetMaxGflopsDeviceId();
    CudaSafeCall( cudaSetDevice( deviceIdx ) );

    // CUDA configuration
    CudaSafeCall( cudaDeviceSetCacheConfig( cudaFuncCachePreferShared ) );

    return;
}
void initCuda(){
  // Use device with highest Gflops/s
  cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() );

  initPBO(&pbo);

  // Clean up on program exit
  atexit(cleanupCuda);

  runCuda();
}
Example #8
0
bool initCUDA( int argc, char **argv)
{
	return true;
	if ( cutCheckCmdLineFlag(argc, (const char **)argv, "device"))
	{
		cutilGLDeviceInit(argc, argv);
	}
	else 
	{
		cudaGLSetGLDevice (cutGetMaxGflopsDeviceId());
	}
	return true;
}
void
runGraphicsTest(int argc, char** argv)
{
	printf("MarchingCubes ");
    if (g_bFBODisplay) printf("[w/ FBO] ");
    if (g_bOpenGLQA) printf("[Readback Comparisons] ");
    printf("\n");

    if ( cutCheckCmdLineFlag(argc, (const char **)argv, "device")) {
		printf("[%s]\n", argv[0]);
		printf("   Does not explicitly support -device=n in OpenGL mode\n");
		printf("   To use -device=n, the sample must be running w/o OpenGL\n\n");
		printf(" > %s -device=n -qatest\n", argv[0]);
		printf("exiting...\n");
        exit(0);
	}

    // First initialize OpenGL context, so we can properly set the GL for CUDA.
    // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop.
    if(CUTFalse == initGL(&argc, argv)) {
        return;
    }

    cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() );

    // register callbacks
    glutDisplayFunc(display);
    glutKeyboardFunc(keyboard);
    glutMouseFunc(mouse);
    glutMotionFunc(motion);
    glutIdleFunc(idle);
    glutReshapeFunc(reshape);
    initMenus();

    // Initialize CUDA buffers for Marching Cubes 
    initMC(argc, argv);

    cutilCheckError( cutCreateTimer( &timer));

    if (g_bOpenGLQA) {
        g_CheckRender = new CheckBackBuffer(window_width, window_height, 4);
        g_CheckRender->setPixelFormat(GL_RGBA);
        g_CheckRender->setExecPath(argv[0]);
        g_CheckRender->EnableQAReadback(true);
    }

    // start rendering mainloop
    glutMainLoop();

    cudaThreadExit();
}
Example #10
0
File: main.cpp Project: elf11/GPGPU
bool initCUDA(void)
{
#if __DEVICE_EMULATION__
	return true;
#else
	int count = 0;
	int i = 0;

	cudaGetDeviceCount(&count);
	if(count == 0) {
		fprintf(stderr, "Nu exista nici un device.\n");
		return false;
	}

	printf("Exista %d device-uri.\n",count);
	
	for(i = 0; i < count; i++) {
		cudaDeviceProp prop;
		if(cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
			if(prop.major >= 1) {
				break;
			}
		}
		
		if(!prop.canMapHostMemory) {
		
			fprintf(stderr, "Device %d cannot map host memory!\n",0);
			exit(EXIT_FAILURE);
		
		}

	}

	if(i == count) {
		fprintf(stderr, "Nu exista nici un device care suporta CUDA.\n");
		return false;
	}
	cudaSetDevice(cutGetMaxGflopsDeviceId());

	cudaSetDeviceFlags(cudaDeviceMapHost);
	checkCUDAError("cudaSetDeviceFlags");
	
	
	printf("CUDA initializat cu succes\n");

	// Create the CUTIL timer
    cutilCheckError( cutCreateTimer( &timer));

	return true;
#endif
}
Example #11
0
void initCuda(){
  // Use device with highest Gflops/s
#if CUDA_VERSION >= 5000
	cudaGLSetGLDevice( gpuGetMaxGflopsDeviceId() );
#else
	cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() );
#endif

  initPBO(&pbo);

  // Clean up on program exit
  atexit(cleanupCuda);

  runCuda();
}
Example #12
0
void initCuda(){
  // Use device with highest Gflops/s
  cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() );

  initPBO(&pbo);
  dptr=NULL;
  cudaGLMapBufferObject((void**)&dptr, pbo);
  clearPBOpos(dptr,width,height);
  cudaGLUnmapBufferObject(pbo);
  // Clean up on program exit
  atexit(cleanupCuda);
  SetScissorWindow(glm::vec4(300,300,500,500));
  texture.mapptr = stbi_load("cow.jpeg",&texture.width, &texture.height,&texture.depth,0);
  runCuda();
}
Example #13
0
void initCuda(){
  // Use device with highest Gflops/s
  cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() );


  initPBO(&pbo);

    paraMap = new float[(int)renderCam->resolution.x *(int)renderCam->resolution.y];
   effectiveRayMap =new int[(int)renderCam->resolution.x *(int)renderCam->resolution.y];
   initialRayMap = new ray[(int)renderCam->resolution.x * (int)renderCam->resolution.y];
   generateRayMap(renderCam, targetFrame);
  // Clean up on program exit
  atexit(cleanupCuda);

  runCuda();
}
RemoteCUDARunner::RemoteCUDARunner():GPURunner<unsigned long,int>(TYPE_CUDA),m_metahashsize(0)
{
	m_in=0;
	m_devin=0;
	m_out=0;
	m_devout=0;
	m_metahash=0;
	m_devmetahash=0;

	cutilSafeCall(cudaGetDeviceCount(&m_devicecount));

	if(m_devicecount>0)
	{
		if(m_deviceindex<0 || m_deviceindex>=m_devicecount)
		{
			m_deviceindex=cutGetMaxGflopsDeviceId();
			std::cout << "Setting CUDA device to Max GFlops device at index " << m_deviceindex << std::endl;
		}
		else
		{
			std::cout << "Setting CUDA device to device at index " << m_deviceindex << std::endl;
		}
		
		cudaDeviceProp props;
		cudaGetDeviceProperties(&props,m_deviceindex);

		std::cout << "Device info for " << props.name << " :" << std::endl;
		std::cout << "Compute Capability : " << props.major << "." << props.minor << std::endl;
		std::cout << "Clock Rate (hz) : " << props.clockRate << std::endl;

		if(props.major>999)
		{
			std::cout << "CUDA seems to be running in CPU emulation mode" << std::endl;
		}

		cutilSafeCall(cudaSetDevice(m_deviceindex));

	}
	else
	{
		m_deviceindex=-1;
		std::cout << "No CUDA capable device detected" << std::endl;
	}
}
Example #15
0
CUDARunner::CUDARunner():GPURunner<unsigned long,int>(TYPE_CUDA)
{
	m_in=0;
	m_devin=0;
	m_out=0;
	m_devout=0;

	cutilSafeCall(cudaGetDeviceCount(&m_devicecount));

	if(m_devicecount>0)
	{
		if(m_deviceindex<0 || m_deviceindex>=m_devicecount)
		{
			m_deviceindex=cutGetMaxGflopsDeviceId();
			printf("Setting CUDA device to Max GFlops device at index %u\n",m_deviceindex);
		}
		else
		{
			printf("Setting CUDA device to device at index %u\n",m_deviceindex);
		}
		
		cudaDeviceProp props;
		cudaGetDeviceProperties(&props,m_deviceindex);

		printf("Device info for %s :\nCompute Capability : %d.%d\nClock Rate (hz) : %d\n",props.name,props.major,props.minor,props.clockRate);

		if(props.major>999)
		{
			printf("CUDA seems to be running in CPU emulation mode\n");
		}

		cutilSafeCall(cudaSetDevice(m_deviceindex));

	}
	else
	{
		m_deviceindex=-1;
		printf("No CUDA capable device detected\n");
	}
}
Example #16
0
int DrawScene::InitGL( )
{

	GLenum err = glewInit();
	if (GLEW_OK != err)
	{
		/* Problem: glewInit failed, something is seriously wrong. */
		::MessageBox(NULL,"glewInit failed, something is seriously wrong.",
			"glew error occured.",MB_OK );
		return false;
	}
#if CUDA_ENABLE
	// cuda初始化
	cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() );
#endif

	for (int i=0;i<COUNT_MODEL;i++)
	{
		m_model[i].loadModelData(FILENAME_MS3D);
		m_model[i].reloadTextures();										// Loads Model Textures
	}

	glEnable(GL_TEXTURE_2D);										// Enable Texture Mapping ( NEW )
	glShadeModel(GL_SMOOTH);										// Enable Smooth Shading
	glClearColor(0.0f, 0.0f, 0.0f, 0.5f);							// Black Background
	glClearDepth(1.0f);												// Depth Buffer Setup
	glEnable(GL_DEPTH_TEST);										// Enables Depth Testing
	glDepthFunc(GL_LEQUAL);											// The Type Of Depth Testing To Do
	glHint(GL_PERSPECTIVE_CORRECTION_HINT, GL_NICEST);				// Really Nice Perspective Calculations
#if ENABLE_CONSOLE_WINDOW
	AllocConsole(); 
	freopen( "CONOUT$","w",stdout);
#endif


	m_particleMngr.addParticleNode("", PARTICLE_TYPE_FOUNTAIN );

	return TRUE;
}
Example #17
0
//------------------------------------------------------------------------------
static void
#if GLFW_VERSION_MAJOR>=3
keyboard(GLFWwindow *, int key, int scancode, int event, int mods) {
#else
#define GLFW_KEY_ESCAPE GLFW_KEY_ESC
keyboard(int key, int event) {
#endif

    if (event == GLFW_RELEASE) return;
    if (g_hud.KeyDown(tolower(key))) return;

    switch (key) {
        case 'Q': g_running = 0; break;
        case 'F': fitFrame(); break;
        case '+':  
        case '=': g_tessLevel++; break;
        case '-': g_tessLevel = std::max(g_tessLevelMin, g_tessLevel-1); break;
        case '.': g_moveModels = std::max(g_moveModels*2, 1); break;
        case ',': g_moveModels = std::max(g_moveModels/2, 0); break;
        case 'I': g_modelCount = std::max(g_modelCount/2, 1); rebuild(); break;
        case 'O': g_modelCount = std::min(g_modelCount*2, MAX_MODELS); rebuild(); break;
        case GLFW_KEY_ESCAPE: g_hud.SetVisible(!g_hud.IsVisible()); break;
    }
}

//------------------------------------------------------------------------------
static void
callbackDisplayStyle(int b)
{
    if (g_displayStyle == kVaryingColor or b == kVaryingColor or
        g_displayStyle == kFaceVaryingColor or b == kFaceVaryingColor) {
        // need to rebuild for varying reconstruct
        g_displayStyle = b;
        rebuild();
        return;
    }
    g_displayStyle = b;
}

static void
callbackKernel(int k)
{
    g_kernel = k;

#ifdef OPENSUBDIV_HAS_OPENCL
    if (g_kernel == kCL and g_clContext == NULL) {
        if (initCL(&g_clContext, &g_clQueue) == false) {
            printf("Error in initializing OpenCL\n");
            exit(1);
        }
    }
#endif
#ifdef OPENSUBDIV_HAS_CUDA
    if (g_kernel == kCUDA and g_cudaInitialized == false) {
        g_cudaInitialized = true;
        cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() );
    }
#endif

    rebuild();
}
Example #18
0
void WaterPlaneCUDA::configure(Vector upperLeft, Vector lowerRight, float dampFactor, float resolution)
{

	cudaSetDevice(cutGetMaxGflopsDeviceId());
	cudaGLSetGLDevice(cutGetMaxGflopsDeviceId());
	
	timeSinceLast = timePassed = 0;

	unsigned int free, total;
	int gpuCount, i;
	CUresult res;
	CUdevice dev;
	CUcontext ctx;

	cuInit(0);

	cuDeviceGetCount(&gpuCount);
	printf("Detected %d GPU\n",gpuCount);

	for (i=0; i<gpuCount; i++)
	{
		cuDeviceGet(&dev,i);
		cuCtxCreate(&ctx, 0, dev);
		res = cuMemGetInfo(&free, &total);
		if(res != CUDA_SUCCESS)
			printf("!!!! cuMemGetInfo failed! (status = %x)", res);
		printf("^^^^ Device: %d\n",i);
		printf("^^^^ Free : %lu bytes (%lu KB) (%lu MB)\n", free, inKB(free), inMB(free));
		printf("^^^^ Total: %lu bytes (%lu KB) (%lu MB)\n", total, inKB(total), inMB(total));
		printf("^^^^ %f%% free, %f%% used\n", 100.0*free/(double)total, 100.0*(total - free)/(double)total);
		cuCtxDetach(ctx);
	}


	this->stepSize = 1.0f/resolution;

	this->resolutionFactor = resolution;

	//reale Z - Achse ist x - Achse der WaterPlaneCUDA
	this->sizeX = (unsigned int) abs(upperLeft.z - lowerRight.z);

	//reale X -Achse ist y- Achse der WaterPlaneCUDA
	this->sizeY = (unsigned int) abs(upperLeft.x - lowerRight.x);

	//Anzahl der Netzpunkte in X -Richtung
	this->pointsX = (unsigned int)(sizeX * resolution);

	//Anzahl der Netzpunkte in Y -Richtung
	pointsY = (unsigned int)(sizeY * resolution);

	uLeft = upperLeft;

	lRight = lowerRight;

	//Der "Meeresspiegel"
	baseHeight = lRight.y;

	//Das Höhenfeld der WaterPlaneCUDA
	waveMap = NULL;

	initBuffer();

	gpu_newVertices = new float3[pointsX*pointsY];
	gpu_oldVertices = new float3[pointsX*pointsY];
	gpu_normals = new float3[pointsX*pointsY];

	for (int i=0;i<pointsX*pointsY;i++) {
		gpu_newVertices[i]=make_float3(0,0,0);
		gpu_oldVertices[i]=make_float3(0,0,0);
		gpu_normals[i]=make_float3(0,1.0,0);
	}
	cutilSafeCall(cudaMalloc((void**)&gpu_newVertices,pointsX*pointsY*sizeof(float3)));
	cutilSafeCall(cudaMalloc((void**)&gpu_oldVertices,pointsX*pointsY*sizeof(float3)));
	cutilSafeCall(cudaMalloc((void**)&gpu_normals,pointsX*pointsY*sizeof(float3)));

	drawMesh();
}
Example #19
0
////////////////////////////////////////////////////////////////////////////////
//! Run test
////////////////////////////////////////////////////////////////////////////////
void runGraphicsTest(int argc, char** argv)
{
    printf("[%s] ", sSDKsample);
    if (g_bOpenGLQA)   printf("[OpenGL Readback Comparisons] ");
    printf("\n");

    if ( cutCheckCmdLineFlag(argc, (const char **)argv, "device") ) { 
		printf("[%s]\n", argv[0]);
		printf("   Does not explicitly support -device=n in OpenGL mode\n");
		printf("   To use -device=n, the sample must be running w/o OpenGL\n\n");
		printf(" > %s -device=n -qatest\n", argv[0]);
		printf("exiting...\n");
		exit(0);
    }

    // First initialize OpenGL context, so we can properly set the GL for CUDA.
    // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop.
    if(CUTFalse == initGL( &argc, argv )) {
        cudaThreadExit();
        return;
    }

    cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() );

    // create FFT plan
    CUFFT_SAFE_CALL(cufftPlan2d(&fftPlan, meshW, meshH, CUFFT_C2R) );

    // allocate memory
    fftInputW = (meshW / 2)+1;
    fftInputH = meshH;
    fftInputSize = (fftInputW*fftInputH)*sizeof(float2);

    cutilSafeCall(cudaMalloc((void **)&d_h0, fftInputSize) );
    cutilSafeCall(cudaMalloc((void **)&d_ht, fftInputSize) );
    h_h0 = (float2 *) malloc(fftInputSize);
    generate_h0();
    cutilSafeCall(cudaMemcpy(d_h0, h_h0, fftInputSize, cudaMemcpyHostToDevice) );

    cutilSafeCall(cudaMalloc((void **)&d_slope, meshW*meshH*sizeof(float2)) );

    cutCreateTimer(&timer);
    cutStartTimer(timer);
    prevTime = cutGetTimerValue(timer);

    // create vertex buffers and register with CUDA
    createVBO(&heightVertexBuffer, meshW*meshH*sizeof(float));
    // DEPRECATED: cutilSafeCall(cudaGLRegisterBufferObject(heightVertexBuffer));
	cutilSafeCall(cudaGraphicsGLRegisterBuffer(&cuda_heightVB_resource, heightVertexBuffer, cudaGraphicsMapFlagsWriteDiscard));

    createVBO(&slopeVertexBuffer, meshW*meshH*sizeof(float2));
    // DEPRECATED: cutilSafeCall(cudaGLRegisterBufferObject(slopeVertexBuffer));
	cutilSafeCall(cudaGraphicsGLRegisterBuffer(&cuda_slopeVB_resource, slopeVertexBuffer, cudaGraphicsMapFlagsWriteDiscard));

    // create vertex and index buffer for mesh
    createMeshPositionVBO(&posVertexBuffer, meshW, meshH);
    createMeshIndexBuffer(&indexBuffer, meshW, meshH);

    // Creating the Auto-Validation Code
    if (g_bOpenGLQA) {
        g_CheckRender = new CheckBackBuffer(windowH, windowH, 4);
        g_CheckRender->setPixelFormat(GL_RGBA);
        g_CheckRender->setExecPath(argv[0]);
        g_CheckRender->EnableQAReadback(true);
    }

    runCuda();

    // register callbacks
    glutDisplayFunc(display);
    glutKeyboardFunc(keyboard);
    glutMouseFunc(mouse);
    glutMotionFunc(motion);
    glutReshapeFunc(reshape);
    glutIdleFunc(idle);

    // start rendering mainloop
    glutMainLoop();
    cudaThreadExit();
}
Example #20
0
//////////////////////////////////////////////////////////////////////////////
// Program main
//////////////////////////////////////////////////////////////////////////////
int
main( int argc, char** argv) 
{
    printf("Run \"nbody -benchmark [-n=<numBodies>]\" to measure perfomance.\n\n");
    
    bool benchmark = 
        (cutCheckCmdLineFlag(argc, (const char**) argv, "benchmark") != 0);

    bool compareToCPU = 
        ((cutCheckCmdLineFlag(argc, (const char**) argv, "compare") != 0) ||
        !(cutCheckCmdLineFlag(argc, (const char**) argv, "noqatest")  != 0));

    bool regression = 
        (cutCheckCmdLineFlag(argc, (const char**) argv, "regression") != 0);

    int devID;
    cudaDeviceProp props;

    // nBody has a mode that allows it to be run without using GL interop
    if (benchmark || compareToCPU || regression) {
				/*
        if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) {
            cutilDeviceInit(argc, argv);
        } else {
            devID = cutGetMaxGflopsDeviceId();
            cudaSetDevice( devID );
        } */
    } 
    else 
    { 
        // This mode shows the OpenGL results rendered
        // First initialize OpenGL context, so we can properly set the GL for CUDA.
        // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop.
        glutInit(&argc, argv);
        glutInitDisplayMode(GLUT_RGB | GLUT_DEPTH | GLUT_DOUBLE);
        glutInitWindowSize(720, 480);
        glutCreateWindow("CUDA n-body system");

        GLenum err = glewInit();
        if (GLEW_OK != err)
        {
            printf("GLEW Error: %s\n", glewGetErrorString(err));
        }
        else
        {
#if   defined(WIN32)
            wglSwapIntervalEXT(0);
#elif defined(LINUX)
            glxSwapIntervalSGI(0);
#endif      
        }
    	
        initGL();
        initParameters();
    	
        if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) {
            cutilGLDeviceInit(argc, argv);
        } else {
            devID = cutGetMaxGflopsDeviceId();
            cudaGLSetGLDevice( devID );
        }
    }


    // get number of SMs on this GPU
    cutilSafeCall(cudaGetDevice(&devID));
    cutilSafeCall(cudaGetDeviceProperties(&props, devID));

    numIterations = 0;
    int p = 256;
    int q = 1;

    cutGetCmdLineArgumenti(argc, (const char**) argv, "i", &numIterations);
    cutGetCmdLineArgumenti(argc, (const char**) argv, "p", &p);
    cutGetCmdLineArgumenti(argc, (const char**) argv, "q", &q);


    // default number of bodies is #SMs * 4 * CTA size
    numBodies = compareToCPU ? 4096 : p*q*4*props.multiProcessorCount;

    cutGetCmdLineArgumenti(argc, (const char**) argv, "n", &numBodies);

    switch (numBodies)
    {
    case 1024:
        activeParams.m_clusterScale = 1.52f;
        activeParams.m_velocityScale = 2.f;
        break;
    case 2048:
        activeParams.m_clusterScale = 1.56f;
        activeParams.m_velocityScale = 2.64f;
        break;
    case 4096:
        activeParams.m_clusterScale = 1.68f;
        activeParams.m_velocityScale = 2.98f;
        break;
    case 8192:
        activeParams.m_clusterScale = 1.98f;
        activeParams.m_velocityScale = 2.9f;
        break;
    default:
    case 16384:
        activeParams.m_clusterScale = 1.54f;
        activeParams.m_velocityScale = 8.f;
        break;
    case 32768:
        activeParams.m_clusterScale = 1.44f;
        activeParams.m_velocityScale = 11.f;
        break;
    }

    if (q * p > 256)
    {
        p = 256 / q;
        printf("Setting p=%d, q=%d to maintain %d threads per block\n", p, q, 256);
    }

    if (q == 1 && numBodies < p)
    {
        p = numBodies;
    }

    init(numBodies, p, q, !(benchmark || compareToCPU));
    
    reset(nbody, numBodies, NBODY_CONFIG_SHELL, !(benchmark || compareToCPU));


    if (benchmark)
    {
        if (numIterations <= 0) 
            numIterations = 100;
        runBenchmark(numIterations);
    }
    else if (compareToCPU || regression)
    {
        compareResults(regression, numBodies);
    }
    else
    {
        glutDisplayFunc(display);
        glutReshapeFunc(reshape);
        glutMouseFunc(mouse);
        glutMotionFunc(motion);
        glutKeyboardFunc(key);
        glutSpecialFunc(special);
        glutIdleFunc(idle);

        cutilSafeCall(cudaEventRecord(startEvent, 0));
        glutMainLoop();
    }

    if (nbodyCPU)
        delete nbodyCPU;
    if (nbodyCUDA)
        delete nbodyCUDA;

    if (hPos)
        delete [] hPos;
    if (hVel)
        delete [] hVel;
    if (hColor)
        delete [] hColor;

    cutilSafeCall(cudaEventDestroy(startEvent));
    cutilSafeCall(cudaEventDestroy(stopEvent));
    cutilCheckError(cutDeleteTimer(demoTimer));

    return 0;
}
Example #21
0
int main(int argc, char **argv)
{
    uchar *h_Data;
    uint  *h_HistogramCPU, *h_HistogramGPU;
    uchar *d_Data;
    uint  *d_Histogram;
    uint hTimer;
    int PassFailFlag = 1;
    uint byteCount = 64 * 1048576;
    uint uiSizeMult = 1;

    cudaDeviceProp deviceProp;
    deviceProp.major = 0;
    deviceProp.minor = 0;
    int dev;

	shrQAStart(argc, argv);

	// set logfile name and start logs
    shrSetLogFileName ("histogram.txt");

    //Use command-line specified CUDA device, otherwise use device with highest Gflops/s
    if( shrCheckCmdLineFlag(argc, (const char**)argv, "device") ) {
        dev = cutilDeviceInit(argc, argv);
        if (dev < 0) {
           printf("No CUDA Capable Devices found, exiting...\n");
           shrQAFinishExit(argc, (const char **)argv, QA_WAIVED);
        }
    } else {
        cudaSetDevice( dev = cutGetMaxGflopsDeviceId() );
        cutilSafeCall( cudaChooseDevice(&dev, &deviceProp) );
    }
    cutilSafeCall( cudaGetDeviceProperties(&deviceProp, dev) );

	printf("CUDA device [%s] has %d Multi-Processors, Compute %d.%d\n", 
		deviceProp.name, deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor);

	int version = deviceProp.major * 0x10 + deviceProp.minor;

	if(version < 0x11) 
    {
        printf("There is no device supporting a minimum of CUDA compute capability 1.1 for this SDK sample\n");
        cutilDeviceReset();
		shrQAFinishExit(argc, (const char **)argv, QA_WAIVED);
    }

    cutilCheckError(cutCreateTimer(&hTimer));

    // Optional Command-line multiplier to increase size of array to histogram
    if (shrGetCmdLineArgumentu(argc, (const char**)argv, "sizemult", &uiSizeMult))
    {
        uiSizeMult = CLAMP(uiSizeMult, 1, 10);
        byteCount *= uiSizeMult;
    }

    shrLog("Initializing data...\n");
        shrLog("...allocating CPU memory.\n");
            h_Data         = (uchar *)malloc(byteCount);
            h_HistogramCPU = (uint  *)malloc(HISTOGRAM256_BIN_COUNT * sizeof(uint));
            h_HistogramGPU = (uint  *)malloc(HISTOGRAM256_BIN_COUNT * sizeof(uint));

        shrLog("...generating input data\n");
            srand(2009);
            for(uint i = 0; i < byteCount; i++) 
                h_Data[i] = rand() % 256;

        shrLog("...allocating GPU memory and copying input data\n\n");
            cutilSafeCall( cudaMalloc((void **)&d_Data, byteCount  ) );
            cutilSafeCall( cudaMalloc((void **)&d_Histogram, HISTOGRAM256_BIN_COUNT * sizeof(uint)  ) );
            cutilSafeCall( cudaMemcpy(d_Data, h_Data, byteCount, cudaMemcpyHostToDevice) );

    {
        shrLog("Starting up 64-bin histogram...\n\n");
            initHistogram64();

        shrLog("Running 64-bin GPU histogram for %u bytes (%u runs)...\n\n", byteCount, numRuns);
            for(int iter = -1; iter < numRuns; iter++){
                //iter == -1 -- warmup iteration
                if(iter == 0){
                    cutilSafeCall( cutilDeviceSynchronize() );
                    cutilCheckError( cutResetTimer(hTimer) );
                    cutilCheckError( cutStartTimer(hTimer) );
                }

                histogram64(d_Histogram, d_Data, byteCount);
            }

            cutilSafeCall( cutilDeviceSynchronize() );
            cutilCheckError(  cutStopTimer(hTimer));
            double dAvgSecs = 1.0e-3 * (double)cutGetTimerValue(hTimer) / (double)numRuns;
        shrLog("histogram64() time (average) : %.5f sec, %.4f MB/sec\n\n", dAvgSecs, ((double)byteCount * 1.0e-6) / dAvgSecs);
        shrLogEx(LOGBOTH | MASTER, 0, "histogram64, Throughput = %.4f MB/s, Time = %.5f s, Size = %u Bytes, NumDevsUsed = %u, Workgroup = %u\n", 
                (1.0e-6 * (double)byteCount / dAvgSecs), dAvgSecs, byteCount, 1, HISTOGRAM64_THREADBLOCK_SIZE); 

        shrLog("\nValidating GPU results...\n");
            shrLog(" ...reading back GPU results\n");
                cutilSafeCall( cudaMemcpy(h_HistogramGPU, d_Histogram, HISTOGRAM64_BIN_COUNT * sizeof(uint), cudaMemcpyDeviceToHost) );

            shrLog(" ...histogram64CPU()\n");
               histogram64CPU(
                    h_HistogramCPU,
                    h_Data,
                    byteCount
                );

            shrLog(" ...comparing the results...\n");
                for(uint i = 0; i < HISTOGRAM64_BIN_COUNT; i++)
                    if(h_HistogramGPU[i] != h_HistogramCPU[i]) PassFailFlag = 0;
            shrLog(PassFailFlag ? " ...64-bin histograms match\n\n" : " ***64-bin histograms do not match!!!***\n\n" );

        shrLog("Shutting down 64-bin histogram...\n\n\n");
            closeHistogram64();
    }

    {
        shrLog("Initializing 256-bin histogram...\n");
            initHistogram256();

        shrLog("Running 256-bin GPU histogram for %u bytes (%u runs)...\n\n", byteCount, numRuns);
            for(int iter = -1; iter < numRuns; iter++){
                //iter == -1 -- warmup iteration
                if(iter == 0){
                    cutilSafeCall( cutilDeviceSynchronize() );
                    cutilCheckError( cutResetTimer(hTimer) );
                    cutilCheckError( cutStartTimer(hTimer) );
                }

                histogram256(d_Histogram, d_Data, byteCount);
            }

            cutilSafeCall( cutilDeviceSynchronize() );
            cutilCheckError(  cutStopTimer(hTimer));
            double dAvgSecs = 1.0e-3 * (double)cutGetTimerValue(hTimer) / (double)numRuns;
        shrLog("histogram256() time (average) : %.5f sec, %.4f MB/sec\n\n", dAvgSecs, ((double)byteCount * 1.0e-6) / dAvgSecs);
        shrLogEx(LOGBOTH | MASTER, 0, "histogram256, Throughput = %.4f MB/s, Time = %.5f s, Size = %u Bytes, NumDevsUsed = %u, Workgroup = %u\n", 
                (1.0e-6 * (double)byteCount / dAvgSecs), dAvgSecs, byteCount, 1, HISTOGRAM256_THREADBLOCK_SIZE); 
                
        shrLog("\nValidating GPU results...\n");
            shrLog(" ...reading back GPU results\n");
                cutilSafeCall( cudaMemcpy(h_HistogramGPU, d_Histogram, HISTOGRAM256_BIN_COUNT * sizeof(uint), cudaMemcpyDeviceToHost) );

            shrLog(" ...histogram256CPU()\n");
                histogram256CPU(
                    h_HistogramCPU,
                    h_Data,
                    byteCount
                );

            shrLog(" ...comparing the results\n");
                for(uint i = 0; i < HISTOGRAM256_BIN_COUNT; i++)
                    if(h_HistogramGPU[i] != h_HistogramCPU[i]) PassFailFlag = 0;
            shrLog(PassFailFlag ? " ...256-bin histograms match\n\n" : " ***256-bin histograms do not match!!!***\n\n" );

        shrLog("Shutting down 256-bin histogram...\n\n\n");
            closeHistogram256();
    }

    shrLog("Shutting down...\n");
        cutilCheckError(cutDeleteTimer(hTimer));
        cutilSafeCall( cudaFree(d_Histogram) );
        cutilSafeCall( cudaFree(d_Data) );
        free(h_HistogramGPU);
        free(h_HistogramCPU);
        free(h_Data);

    cutilDeviceReset();
	shrLog("%s - Test Summary\n", sSDKsample);
    // pass or fail (for both 64 bit and 256 bit histograms)
    shrQAFinishExit(argc, (const char **)argv, (PassFailFlag ? QA_PASSED : QA_FAILED));
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main(int argc, char** argv) 
{
	int retVal = 0;

	retVal = xnInit( argc, argv );

    printf("[ %s ]\n", sSDKsample); 

    if (argc > 1) {
        cutGetCmdLineArgumenti( argc, (const char**) argv, "n", (int *) &numParticles);
        if (cutCheckCmdLineFlag(argc, (const char **)argv, "qatest") ||
			cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt")
			) 
		{
            g_bQAReadback = true;
        }
        if (cutCheckCmdLineFlag(argc, (const char **)argv, "glverify")) 
		{
            g_bQAGLVerify = true;
        }
    }

    if (g_bQAReadback) {
        // For Automated testing, we do not use OpenGL/CUDA interop
        if ( cutCheckCmdLineFlag(argc, (const char **)argv, "device")) {
	        cutilDeviceInit (argc, argv);
        } else {
            cudaSetDevice (cutGetMaxGflopsDeviceId() );
        }

        g_CheckRender = new CheckBackBuffer(winWidth, winHeight, 4, false);
        g_CheckRender->setPixelFormat(GL_RGBA);
        g_CheckRender->setExecPath(argv[0]);
        g_CheckRender->EnableQAReadback(true);

        // This code path is used for Automated Testing
        initParticles(numParticles, false, false);
        initParams();

        if (emitterOn) {
            runEmitter();
        }
        SimParams &params = psystem->getParams();
        params.cursorPos = make_float3(cursorPosLag.x, cursorPosLag.y, cursorPosLag.z);

        psystem->step(timestep);

        float4 *pos = NULL, *vel = NULL;
        int g_TotalErrors = 0;

        psystem->dumpBin(&pos, &vel);

        g_CheckRender->dumpBin(pos, numParticles*sizeof(float4), "smokeParticles_pos.bin");
        g_CheckRender->dumpBin(vel, numParticles*sizeof(float4), "smokeParticles_vel.bin");

        if (!g_CheckRender->compareBin2BinFloat("smokeParticles_pos.bin", sRefBin[0], numParticles*sizeof(float4), MAX_EPSILON_ERROR, THRESHOLD))
           g_TotalErrors++;

        if (!g_CheckRender->compareBin2BinFloat("smokeParticles_vel.bin", sRefBin[1], numParticles*sizeof(float4), MAX_EPSILON_ERROR, THRESHOLD))
           g_TotalErrors++;


        delete psystem;
        delete g_CheckRender;

        printf("%s\n", (g_TotalErrors > 0) ? "FAILED" : "PASSED");

        cudaThreadExit();
    } else {
        // Normal smokeParticles rendering path
        // 1st initialize OpenGL context, so we can properly set the GL for CUDA.
        // This is needed to achieve optimal performance with OpenGL/CUDA interop.
        initGL( &argc, argv );

        if ( cutCheckCmdLineFlag(argc, (const char **)argv, "device")) {
	        cutilGLDeviceInit (argc, argv);
        } else {
            cudaGLSetGLDevice (cutGetMaxGflopsDeviceId() );
        }

        if (g_bQAGLVerify) {
            g_CheckRender = new CheckBackBuffer(winWidth, winHeight, 4);
            g_CheckRender->setPixelFormat(GL_RGBA);
            g_CheckRender->setExecPath(argv[0]);
            g_CheckRender->EnableQAReadback(true);
        }

        // This is the normal code path for SmokeParticles
        initParticles(numParticles, true, true);
        initParams();
        initMenus();

        glutDisplayFunc(display);
        glutReshapeFunc(reshape);
        glutMouseFunc(mouse);
        glutMotionFunc(motion);
        glutKeyboardFunc(key);
        glutKeyboardUpFunc(keyUp);
        glutSpecialFunc(special);
        glutIdleFunc(idle);

        glutMainLoop();
    }

    cutilExit(argc, argv);
	return retVal;
}
Example #23
0
//////////////////////////////////////////////////////////////////////////////
// Program main
//////////////////////////////////////////////////////////////////////////////
int
main( int argc, char** argv) 
{
    bool bTestResults = true;

    shrQAStart(argc, argv);

    if( cutCheckCmdLineFlag(argc, (const char**)argv, "help") ) {
        showHelp();
        return 0;
    }

    shrLog("Run \"nbody -benchmark [-n=<numBodies>]\" to measure perfomance.\n");
	shrLog("\t-fullscreen (run n-body simulation in fullscreen mode)\n");
	shrLog("\t-fp64       (use double precision floating point values for simulation)\n");
    shrLog("\t-numdevices=N (use first N CUDA devices for simulation)\n");
//    shrLog("\t-hostmem  (stores simulation data in host memory)\n");
//    shrLog("\t-cpu      (performs simulation on the host)\n");
    shrLog("\n");

	bFullscreen  = (cutCheckCmdLineFlag(argc, (const char**) argv, "fullscreen") != 0);
    if (bFullscreen)
        bShowSliders = false;

    benchmark    = (cutCheckCmdLineFlag(argc, (const char**) argv, "benchmark") != 0);

    compareToCPU = ((cutCheckCmdLineFlag(argc, (const char**) argv, "compare") != 0) ||
                   (cutCheckCmdLineFlag(argc, (const char**) argv, "qatest")  != 0));

    QATest       = (cutCheckCmdLineFlag(argc, (const char**) argv, "qatest")  != 0);

    useHostMem   = (cutCheckCmdLineFlag(argc, (const char**) argv, "hostmem") != 0);

    fp64         = (cutCheckCmdLineFlag(argc, (const char**) argv, "fp64") != 0);

    flopsPerInteraction = fp64 ? 30 : 20;

    useCpu       = (cutCheckCmdLineFlag(argc, (const char**) argv, "cpu") != 0);

    cutGetCmdLineArgumenti(argc, (const char**) argv, "numdevices", &numDevsRequested);

    // for multi-device we currently require using host memory -- the devices share
    // data via the host 
    if (numDevsRequested > 1)
        useHostMem = true;

    int numDevsAvailable = 0;
    bool customGPU = false;
    cudaGetDeviceCount(&numDevsAvailable);

    if (numDevsAvailable < numDevsRequested) {
        shrLog("Error: only %d Devices available, %d requested.  Exiting.\n", numDevsAvailable, numDevsRequested);
        shrQAFinishExit(argc, (const char **)argv, QA_PASSED);
    }

	shrLog("> %s mode\n", bFullscreen ? "Fullscreen" : "Windowed");
	shrLog("> Simulation data stored in %s memory\n", useHostMem ? "system" : "video" );
	shrLog("> %s precision floating point simulation\n", fp64 ? "Double" : "Single");
    shrLog("> %d Devices used for simulation\n", numDevsRequested);

    int devID;
    cudaDeviceProp props;

    // Initialize GL and GLUT if necessary
    if (!benchmark && !compareToCPU) {
        initGL(&argc, argv);
        initParameters();
    }
    
    if (useCpu) {
        useHostMem = true;
        compareToCPU = false;
        bSupportDouble = true;

#ifdef OPENMP
        shrLog("> Simulation with CPU using OpenMP\n");
#else
        shrLog("> Simulation with CPU\n");
#endif
    }
    else
    {
        // Now choose the CUDA Device
        // Either without GL interop:
        if (benchmark || compareToCPU || useHostMem) 
        {
            // Note if we are using host memory for the body system, we
            // don't use CUDA-GL interop.

            if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) 
            {
                devID = cutilDeviceInit(argc, argv);
                if (devID < 0) {
                   printf("exiting...\n");
                   shrQAFinishExit(argc, (const char **)argv, QA_PASSED);
                }
                customGPU = true;
            } 
            else 
            {
                devID = cutGetMaxGflopsDeviceId();
                cudaSetDevice( devID );
            }
        } 
        else // or with GL interop:
        {    	
            if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) {
                cutilGLDeviceInit(argc, argv);
                customGPU = true;
            } else {
                devID = cutGetMaxGflopsDeviceId();
                cudaGLSetGLDevice( devID );
            }
        }

        cutilSafeCall(cudaGetDevice(&devID));
        cutilSafeCall(cudaGetDeviceProperties(&props, devID));

        bSupportDouble = true;

#if CUDART_VERSION < 4000
        if (numDevsRequested > 1)
        {
            shrLog("MultiGPU n-body requires CUDA 4.0 or later\n");
            cutilDeviceReset();
            shrQAFinishExit(argc, (const char**)argv, QA_PASSED);
        }
#endif

        // Initialize devices
        if(numDevsRequested > 1 && customGPU)
        {
            printf("You can't use --numdevices and --device at the same time.\n");
            shrQAFinishExit(argc, (const char**)argv, QA_PASSED);
        }

        if(customGPU) { 
            cudaDeviceProp props;
            cutilSafeCall(cudaGetDeviceProperties(&props, devID));
            shrLog("> Compute %d.%d CUDA device: [%s]\n", props.major, props.minor, props.name);
        }
        else
        {
            for (int i = 0; i < numDevsRequested; i++)
            {
                cudaDeviceProp props;
                cutilSafeCall(cudaGetDeviceProperties(&props, i));
            
                shrLog("> Compute %d.%d CUDA device: [%s]\n", props.major, props.minor, props.name);

                if (useHostMem)
                {
#if CUDART_VERSION >= 2020
                    if(!props.canMapHostMemory)
                    {
                        fprintf(stderr, "Device %d cannot map host memory!\n", devID);
                        cutilDeviceReset();
                        shrQAFinishExit(argc, (const char **)argv, QA_PASSED);
                    }
                    if (numDevsRequested > 1)
                        cutilSafeCall(cudaSetDevice(i));
                    cutilSafeCall(cudaSetDeviceFlags(cudaDeviceMapHost));
#else
                    fprintf(stderr, "This CUDART version does not support <cudaDeviceProp.canMapHostMemory> field\n");
                    cutilDeviceReset();
                    shrQAFinishExit(argc, (const char **)argv, QA_PASSED);
#endif
                }
            }

            // CC 1.2 and earlier do not support double precision
            if (props.major*10 + props.minor <= 12)
                bSupportDouble = false;
        }

        //if(numDevsRequested > 1)
        //    cutilSafeCall(cudaSetDevice(devID));

        if (fp64 && !bSupportDouble) {
            fprintf(stderr, "One or more of the requested devices does not support double precision floating-point\n");
            cutilDeviceReset();
            shrQAFinishExit(argc, (const char **)argv, QA_PASSED);
        }
    }
	
    numIterations = 0;
    p = 0;
    q = 1;

    cutGetCmdLineArgumenti(argc, (const char**) argv, "i", &numIterations);
    cutGetCmdLineArgumenti(argc, (const char**) argv, "p", &p);
    cutGetCmdLineArgumenti(argc, (const char**) argv, "q", &q);

    if (p == 0) // p not set on command line
    {
        p = 256;
        if (q * p > 256)
        {
            p = 256 / q;
            shrLog("Setting p=%d, q=%d to maintain %d threads per block\n", p, q, 256);
        }
    }

    // default number of bodies is #SMs * 4 * CTA size
    if (useCpu)
#ifdef OPENMP
        numBodies = 8192;
#else
        numBodies = 4096;
#endif
    else if (numDevsRequested == 1)
Example #24
0
//------------------------------------------------------------------------------
int main(int argc, char ** argv) {

    glutInit(&argc, argv);

    glutInitDisplayMode(GLUT_RGBA |GLUT_DOUBLE | GLUT_DEPTH);
    glutInitWindowSize(1024, 1024);
    glutCreateWindow("OpenSubdiv test");

    std::string str;
    if (argc > 1) {
        std::ifstream ifs(argv[1]);
        if (ifs) {
            std::stringstream ss;
            ss << ifs.rdbuf();
            ifs.close();
            str = ss.str();

            g_defaultShapes.push_back(SimpleShape(str.c_str(), argv[1], kCatmark));
        }
    }



    initializeShapes();

    int smenu = glutCreateMenu(modelMenu);
    for(int i = 0; i < (int)g_defaultShapes.size(); ++i){
        glutAddMenuEntry( g_defaultShapes[i].name.c_str(), i);
    }

    int lmenu = glutCreateMenu(levelMenu);
    for(int i = 1; i < 8; ++i){
        char level[16];
        sprintf(level, "Level %d\n", i);
        glutAddMenuEntry(level, i);
    }

    // Register Osd compute kernels
    OpenSubdiv::OsdCpuKernelDispatcher::Register();
    OpenSubdiv::OsdGlslKernelDispatcher::Register();

#if OPENSUBDIV_HAS_OPENCL
    OpenSubdiv::OsdClKernelDispatcher::Register();
#endif

#if OPENSUBDIV_HAS_CUDA
    OpenSubdiv::OsdCudaKernelDispatcher::Register();

    // Note: This function randomly crashes with linux 5.0-dev driver.
    // cudaGetDeviceProperties overrun stack..?
    cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() );
#endif

    int kmenu = glutCreateMenu(kernelMenu);
    int nKernels = OpenSubdiv::OsdKernelDispatcher::kMAX;

    for(int i = 0; i < nKernels; ++i)
        if(OpenSubdiv::OsdKernelDispatcher::HasKernelType(
               OpenSubdiv::OsdKernelDispatcher::KernelType(i)))
            glutAddMenuEntry(getKernelName(i), i);

    glutCreateMenu(menu);
    glutAddSubMenu("Level", lmenu);
    glutAddSubMenu("Model", smenu);
    glutAddSubMenu("Kernel", kmenu);
    glutAttachMenu(GLUT_RIGHT_BUTTON);

    glutDisplayFunc(display);
    glutReshapeFunc(reshape);
    glutMouseFunc(mouse);
    glutKeyboardFunc(keyboard);
    glutMotionFunc(motion);
    glewInit();
    initGL();

    const char *filename = NULL;

    for (int i = 1; i < argc; ++i) {
        if (!strcmp(argv[i], "-d"))
            g_level = atoi(argv[++i]);
        else if (!strcmp(argv[i], "-c"))
            g_repeatCount = atoi(argv[++i]);
        else
            filename = argv[i];
    }

    modelMenu(0);

    glutIdleFunc(idle);
    glutMainLoop();

    quit();
}
Example #25
0
//------------------------------------------------------------------------------
int main(int argc, char ** argv) {

    glutInit(&argc, argv);

    glutInitDisplayMode(GLUT_RGBA |GLUT_DOUBLE | GLUT_DEPTH);
    glutInitWindowSize(1024, 1024);
    glutCreateWindow("OpenSubdiv ptexViewer");

    int lmenu = glutCreateMenu(levelMenu);
    for(int i = 1; i < 8; ++i){
        char level[16];
        sprintf(level, "Level %d\n", i);
        glutAddMenuEntry(level, i);
    }
    int smenu = glutCreateMenu(schemeMenu);
    glutAddMenuEntry("Catmark", 0);
    glutAddMenuEntry("Bilinear", 1);

    // Register Osd compute kernels
    OpenSubdiv::OsdCpuKernelDispatcher::Register();

#if OPENSUBDIV_HAS_GLSL
    OpenSubdiv::OsdGlslKernelDispatcher::Register();
#endif

#if OPENSUBDIV_HAS_OPENCL
    OpenSubdiv::OsdClKernelDispatcher::Register();
#endif

#if OPENSUBDIV_HAS_CUDA
    OpenSubdiv::OsdCudaKernelDispatcher::Register();

    // Note: This function randomly crashes with linux 5.0-dev driver.
    // cudaGetDeviceProperties overrun stack..?
    cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() );
#endif

    int kmenu = glutCreateMenu(kernelMenu);
    int nKernels = OpenSubdiv::OsdKernelDispatcher::kMAX;

    for(int i = 0; i < nKernels; ++i)
        if(OpenSubdiv::OsdKernelDispatcher::HasKernelType(
               OpenSubdiv::OsdKernelDispatcher::KernelType(i)))
            glutAddMenuEntry(getKernelName(i), i);

    glutCreateMenu(menu);
    glutAddSubMenu("Level", lmenu);
    glutAddSubMenu("Scheme", smenu);
    glutAddSubMenu("Kernel", kmenu);
    glutAttachMenu(GLUT_RIGHT_BUTTON);

    glutDisplayFunc(display);
    glutReshapeFunc(reshape);
    glutMouseFunc(mouse);
    glutKeyboardFunc(keyboard);
    glutMotionFunc(motion);
    glewInit();
    initGL();

    for (int i = 1; i < argc; ++i) {
        if (!strcmp(argv[i], "-d"))
            g_level = atoi(argv[++i]);
        else if (!strcmp(argv[i], "-c"))
            g_repeatCount = atoi(argv[++i]);
        else if (g_ptexColorFile == NULL)
            g_ptexColorFile = argv[i];
        else if (g_ptexDisplacementFile == NULL)
            g_ptexDisplacementFile = argv[i];
        else if (g_ptexOcclusionFile == NULL)
            g_ptexOcclusionFile = argv[i];

    }

    if (g_ptexColorFile == NULL) {
        printf("Usage: %s <color.ptx> [<displacement.ptx>] [<occlusion.ptx>] \n", argv[0]);
        return 1;
    }

    createOsdMesh(g_level, g_kernel);

    fitFrame();

    glutIdleFunc(idle);
    glutMainLoop();

    quit();
}
Example #26
0
int main(int argc, char** argv)
{
	ModelParameters model_params;

	fillCalculationParameters(model_params);
	fillDerivedParameters(model_params, params);

	if (CUTFalse == initGL(argc, argv, params))
		return CUTFalse;

	// use command-line specified CUDA device, otherwise use device with highest Gflops/s
	if(cutCheckCmdLineFlag(argc, (const char**)argv, "device"))
		cutilDeviceInit(argc, argv);
	else
		cudaSetDevice(cutGetMaxGflopsDeviceId());

	// initialize calculations
	initConstants(params);

	timeval init_start, init_stop;

	// calculate steady state
	value_pair *steady_state = new value_pair[params.cells];

	initSpectre();
	initWaveVectors(params);

	gettimeofday(&init_start, NULL);
	calculateSteadyState(steady_state, params);
	gettimeofday(&init_stop, NULL);
	printf("Steady state calculation: %.3f s\n", time_diff(init_start, init_stop));

/*
	FILE *f = fopen("plot_gs_mu.txt", "w");
	int shift = (params.nvz / 2) * params.nvx * params.nvy + (params.nvy / 2) * params.nvx;
	for(int i = 0; i < params.nvx; i++)
	{
		value_pair val = steady_state[shift + i];
		fprintf(f, "%f %f\n", (-params.xmax + params.dx * i) * 1000000, (val.x * val.x + val.y * val.y));
	}
	fclose(f);
 */

	gettimeofday(&init_start, NULL);
	state.init(params);
	initEvolution(steady_state, params, state);
	gettimeofday(&init_stop, NULL);
	printf("Evolution init: %.3f s\n", time_diff(init_start, init_stop));

	delete[] steady_state;

	// measure propagation time, for testing purposes
	calculateEvolution(params, state, 0.0); // warm-up
	gettimeofday(&init_start, NULL);
	calculateEvolution(params, state, 0.0); // zero time step - because we are just measuring speed here
	gettimeofday(&init_stop, NULL);
	printf("Propagation time: %.3f ms\n", time_diff(init_start, init_stop) * 1000.0f);

	// prepare textures
	a_xy.init(params.nvx, params.nvy);
	b_xy.init(params.nvx, params.nvy);
	a_zy.init(params.nvz, params.nvy);
	b_zy.init(params.nvz, params.nvy);

	// remember starting time
	gettimeofday(&time_start, NULL);

	// start main application cycle
        atexit(cleanup);
        glutMainLoop();
	return 0;
}
int main()
{
	int width = 512;
	int height = 512;

	// Creation du device
	cutilSafeCall( cudaSetDevice( cutGetMaxGflopsDeviceId() ) );

	// Creation des buffers sur CPU
	int * a = new int[width*height];
	int * b = new int[width*height];
	int * res = new int[width*height];
	for(int i = 0; i < width*height; i++)
	{
		a[i] = (int)ceil(((double)rand()/ (double)RAND_MAX)*100);
		b[i] = (int)ceil(((double)rand()/ (double)RAND_MAX)*100);
	}

	// Allocation des objects on the device
	// *** data
	unsigned int size = width * height * sizeof(int);
	std::cout << "Allocation d'un buffer de taille " << width*height << "\n";
	int* d_a = NULL;
	int* d_b = NULL;
	int* d_res = NULL;
    cutilSafeCall( cudaMalloc( (void**) &d_a, size));
	cutilSafeCall( cudaMalloc( (void**) &d_b, size));
	cutilSafeCall( cudaMalloc( (void**) &d_res, size));

	// Copy des donnees
	cutilSafeCall( cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice));
	cutilSafeCall( cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice));

    // Lancer le calcul
	std::cout << "Lancer le kernel ... \n";
	runKernel(d_a, d_b, d_res, width*height);
	cutilSafeCall( cutilDeviceSynchronize() );

	// Copie DeviceHost
	cutilSafeCall( cudaMemcpy(res, d_res, size, cudaMemcpyDeviceToHost));

	// Verification du test
	int i = 0;
	for(;i < width*height;i++)
		if(res[i] != a[i] + b[i])
			std::cout << "Error : [" << i << "] " << res[i] << " != " << a[i] << " + " << b[i] << std::endl;


	// Liberation des ressources
	// *** Device
	cutilSafeCall(cudaFree(d_a));
	cutilSafeCall(cudaFree(d_b));
	cutilSafeCall(cudaFree(d_res));
	// *** CPU
	delete[] res;
	delete[] a;
	delete[] b;

	// Close device
	cutilDeviceReset();
	std::cout << "Test result : " << ((i == width*height) ? "Succes" : "Error" ) <<  std::endl;
	std::cout.flush();

	return 0;
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main( int argc, char** argv) 
{
    shrQAStart(argc, argv);

    // start logs
    shrSetLogFileName ("bilateralFilter.txt");
    shrLog("%s Starting...\n\n", argv[0]); 

    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
    cutGetCmdLineArgumenti( argc, (const char**) argv, "threads", &nthreads );
    cutGetCmdLineArgumenti( argc, (const char**) argv, "radius", &filter_radius);

	// load image to process
    loadImageData(argc, argv);

    if (cutCheckCmdLineFlag(argc, (const char **)argv, "qatest") ||
		cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt")) 
    {
		// Running CUDA kernel (bilateralFilter) without visualization (QA Testing/Verification)
        runAutoTest(argc, argv);
	    shrQAFinishExit(argc, (const char **)argv, (g_TotalErrors == 0 ? QA_PASSED : QA_FAILED));
	} 
	else if (cutCheckCmdLineFlag(argc, (const char **)argv, "benchmark")) 
	{
		// Running CUDA kernel (bilateralFilter) in Benchmarking Mode
        runBenchmark(argc, argv);
	    shrQAFinishExit(argc, (const char **)argv, (g_TotalErrors == 0 ? QA_PASSED : QA_FAILED));
	} 
	else 
	{	
		// Running CUDA kernel (bilateralFilter) in CUDA + OpenGL Visualization Mode
        if ( cutCheckCmdLineFlag(argc, (const char **)argv, "device")) {
             printf("[%s]\n", argv[0]);
             printf("   Does not explicitly support -device=n in OpenGL mode\n");
             printf("   To use -device=n, the sample must be running w/o OpenGL\n\n");
             printf(" > %s -device=n -qatest\n", argv[0]);
             printf("exiting...\n");
             exit(0);
        }

        // First initialize OpenGL context, so we can properly set the GL for CUDA.
        // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop.
        initGL( argc, argv );

        if ( cutCheckCmdLineFlag(argc, (const char **)argv, "device")) {
            cutilGLDeviceInit(argc, argv);
        } else {
            cudaGLSetGLDevice (cutGetMaxGflopsDeviceId() );
        }
        initCuda();
        initOpenGL();
    }

    atexit(cleanup);

    printf("Running Standard Demonstration with GLUT loop...\n\n");
    printf("Press '+' and '-' to change number of iterations\n"
        "Press LEFT and RIGHT change euclidean delta\n"
        "Press UP and DOWN to change gaussian delta\n"
        "Press '1' to show original image\n"
        "Press '2' to show result\n\n");

    glutMainLoop();

    cutilDeviceReset();
	shrQAFinishExit(argc, (const char **)argv, (g_TotalErrors == 0 ? QA_PASSED : QA_FAILED));
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main( int argc, char** argv) 
{
    //start logs
    shrSetLogFileName ("volumeRender.txt");
    shrLog("%s Starting...\n\n", argv[0]); 

    if (cutCheckCmdLineFlag(argc, (const char **)argv, "qatest") ||
		cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt")) 
	{
        g_bQAReadback = true;
        fpsLimit = frameCheckNumber;
    }

    if (cutCheckCmdLineFlag(argc, (const char **)argv, "glverify")) 
	{
        g_bQAGLVerify = true;
        fpsLimit = frameCheckNumber;
    }

    if (g_bQAReadback) {
	    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
        if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) {
            cutilDeviceInit(argc, argv);
        } else {
            cudaSetDevice( cutGetMaxGflopsDeviceId() );
        }

    } else {
        // First initialize OpenGL context, so we can properly set the GL for CUDA.
        // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop.
        initGL( &argc, argv );

	    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
        if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) {
            cutilGLDeviceInit(argc, argv);
        } else {
            cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() );
        }
/*
        int device;
        struct cudaDeviceProp prop;
        cudaGetDevice( &device );
        cudaGetDeviceProperties( &prop, device );
        if( !strncmp( "Tesla", prop.name, 5 ) ) {
            shrLog("This sample needs a card capable of OpenGL and display.\n");
            shrLog("Please choose a different device with the -device=x argument.\n");
            cutilExit(argc, argv);
        }
*/
	}

    // parse arguments
    char *filename;
    if (cutGetCmdLineArgumentstr( argc, (const char**) argv, "file", &filename)) {
        volumeFilename = filename;
    }
    int n;
    if (cutGetCmdLineArgumenti( argc, (const char**) argv, "size", &n)) {
        volumeSize.width = volumeSize.height = volumeSize.depth = n;
    }
    if (cutGetCmdLineArgumenti( argc, (const char**) argv, "xsize", &n)) {
        volumeSize.width = n;
    }
    if (cutGetCmdLineArgumenti( argc, (const char**) argv, "ysize", &n)) {
        volumeSize.height = n;
    }
    if (cutGetCmdLineArgumenti( argc, (const char**) argv, "zsize", &n)) {
         volumeSize.depth = n;
    }

    // load volume data
    char* path = shrFindFilePath(volumeFilename, argv[0]);
    if (path == 0) {
        shrLog("Error finding file '%s'\n", volumeFilename);
        exit(EXIT_FAILURE);
    }

    size_t size = volumeSize.width*volumeSize.height*volumeSize.depth*sizeof(VolumeType);
    void *h_volume = loadRawFile(path, size);
    
    initCuda(h_volume, volumeSize);
    free(h_volume);

    cutilCheckError( cutCreateTimer( &timer));

    shrLog("Press '=' and '-' to change density\n"
           "      ']' and '[' to change brightness\n"
           "      ';' and ''' to modify transfer function offset\n"
           "      '.' and ',' to modify transfer function scale\n\n");

    // calculate new grid size
    gridSize = dim3(iDivUp(width, blockSize.x), iDivUp(height, blockSize.y));

    if (g_bQAReadback) {
        g_CheckRender = new CheckBackBuffer(width, height, 4, false);
        g_CheckRender->setPixelFormat(GL_RGBA);
        g_CheckRender->setExecPath(argv[0]);
        g_CheckRender->EnableQAReadback(true);

        uint *d_output;
        cutilSafeCall(cudaMalloc((void**)&d_output, width*height*sizeof(uint)));
        cutilSafeCall(cudaMemset(d_output, 0, width*height*sizeof(uint)));

        float modelView[16] = 
        {
            1.0f, 0.0f, 0.0f, 0.0f,
            0.0f, 1.0f, 0.0f, 0.0f,
            0.0f, 0.0f, 1.0f, 0.0f,
            0.0f, 0.0f, 4.0f, 1.0f
        };

        invViewMatrix[0] = modelView[0]; invViewMatrix[1] = modelView[4]; invViewMatrix[2] = modelView[8]; invViewMatrix[3] = modelView[12];
        invViewMatrix[4] = modelView[1]; invViewMatrix[5] = modelView[5]; invViewMatrix[6] = modelView[9]; invViewMatrix[7] = modelView[13];
        invViewMatrix[8] = modelView[2]; invViewMatrix[9] = modelView[6]; invViewMatrix[10] = modelView[10]; invViewMatrix[11] = modelView[14];

        // call CUDA kernel, writing results to PBO
	    copyInvViewMatrix(invViewMatrix, sizeof(float4)*3);
        
        // Start timer 0 and process n loops on the GPU 
        int nIter = 10;
        for (int i = -1; i < nIter; i++)
        {
            if( i == 0 ) {
                cudaThreadSynchronize();
                cutStartTimer(timer); 
            }
            
            render_kernel(gridSize, blockSize, d_output, width, height, density, brightness, transferOffset, transferScale);
        }
        cudaThreadSynchronize();
        cutStopTimer(timer);
        // Get elapsed time and throughput, then log to sample and master logs
        double dAvgTime = cutGetTimerValue(timer)/(nIter * 1000.0);
        shrLogEx(LOGBOTH | MASTER, 0, "volumeRender, Throughput = %.4f MTexels/s, Time = %.5f s, Size = %u Texels, NumDevsUsed = %u, Workgroup = %u\n", 
               (1.0e-6 * width * height)/dAvgTime, dAvgTime, (width * height), 1, blockSize.x * blockSize.y); 
        

        cutilCheckMsg("Error: render_kernel() execution FAILED");
        cutilSafeCall( cudaThreadSynchronize() );

        cutilSafeCall( cudaMemcpy(g_CheckRender->imageData(), d_output, width*height*4, cudaMemcpyDeviceToHost) );
        g_CheckRender->savePPM(sOriginal[g_Index], true, NULL);

        if (!g_CheckRender->PPMvsPPM(sOriginal[g_Index], sReference[g_Index], MAX_EPSILON_ERROR, THRESHOLD)) {
            shrLog("\nFAILED\n\n");
        } else {
            shrLog("\nPASSED\n\n");
        }

        cudaFree(d_output);
    	freeCudaBuffers();

        if (g_CheckRender) {
            delete g_CheckRender; g_CheckRender = NULL;
        }

    } else {
        // This is the normal rendering path for VolumeRender
        glutDisplayFunc(display);
        glutKeyboardFunc(keyboard);
        glutMouseFunc(mouse);
        glutMotionFunc(motion);
        glutReshapeFunc(reshape);
        glutIdleFunc(idle);

        initPixelBuffer();

        if (g_bQAGLVerify) {
            g_CheckRender = new CheckBackBuffer(width, height, 4);
            g_CheckRender->setPixelFormat(GL_RGBA);
            g_CheckRender->setExecPath(argv[0]);
            g_CheckRender->EnableQAReadback(true);
        }
        atexit(cleanup);

        glutMainLoop();
    }

    cudaThreadExit();
    shrEXIT(argc, (const char**)argv);
}
Example #30
0
int main(int argc, char **argv)
{
	GpuProfiling::initProf();
    // Start logs
    shrSetLogFileName ("scan.txt");
    shrLog("%s Starting...\n\n", argv[0]);

    //Use command-line specified CUDA device, otherwise use device with highest Gflops/s
    if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )
        cutilDeviceInit(argc, argv);
    else
        cudaSetDevice( cutGetMaxGflopsDeviceId() );

    uint *d_Input, *d_Output;
    uint *h_Input, *h_OutputCPU, *h_OutputGPU;
    uint hTimer;
    const uint N = 13 * 1048576 / 2;

    shrLog("Allocating and initializing host arrays...\n");
        cutCreateTimer(&hTimer);
        h_Input     = (uint *)malloc(N * sizeof(uint));
        h_OutputCPU = (uint *)malloc(N * sizeof(uint));
        h_OutputGPU = (uint *)malloc(N * sizeof(uint));
        srand(2009);
        for(uint i = 0; i < N; i++)
            h_Input[i] = rand();

    shrLog("Allocating and initializing CUDA arrays...\n");
        cutilSafeCall( cudaMalloc((void **)&d_Input, N * sizeof(uint)) );
        cutilSafeCall( cudaMalloc((void **)&d_Output, N * sizeof(uint)) );
        cutilSafeCall( cudaMemcpy(d_Input, h_Input, N * sizeof(uint), cudaMemcpyHostToDevice) );

    shrLog("Initializing CUDA-C scan...\n\n");
        initScan();

    int globalFlag = 1;
    size_t szWorkgroup;
    const int iCycles = 100;
    shrLog("*** Running GPU scan for short arrays (%d identical iterations)...\n\n", iCycles);
        for(uint arrayLength = MIN_SHORT_ARRAY_SIZE; arrayLength <= MAX_SHORT_ARRAY_SIZE; arrayLength <<= 1){
            shrLog("Running scan for %u elements (%u arrays)...\n", arrayLength, N / arrayLength);
                cutilSafeCall( cudaThreadSynchronize() );
                cutResetTimer(hTimer);
                cutStartTimer(hTimer);
                for(int i = 0; i < iCycles; i++)
                {
                    szWorkgroup = scanExclusiveShort(d_Output, d_Input, N / arrayLength, arrayLength);
                }
                cutilSafeCall( cudaThreadSynchronize());
                cutStopTimer(hTimer);
                double timerValue = 1.0e-3 * cutGetTimerValue(hTimer) / iCycles;

            shrLog("Validating the results...\n");
                shrLog("...reading back GPU results\n");
                    cutilSafeCall( cudaMemcpy(h_OutputGPU, d_Output, N * sizeof(uint), cudaMemcpyDeviceToHost) );

                shrLog(" ...scanExclusiveHost()\n");
                    scanExclusiveHost(h_OutputCPU, h_Input, N / arrayLength, arrayLength);

                // Compare GPU results with CPU results and accumulate error for this test
                shrLog(" ...comparing the results\n");
                    int localFlag = 1;
                    for(uint i = 0; i < N; i++)
                    {
                        if(h_OutputCPU[i] != h_OutputGPU[i])
                        {
                            localFlag = 0;
                            break;
                        }
                    }

                // Log message on individual test result, then accumulate to global flag
                shrLog(" ...Results %s\n\n", (localFlag == 1) ? "Match" : "DON'T Match !!!");
                globalFlag = globalFlag && localFlag;

                // Data log
                if (arrayLength == MAX_SHORT_ARRAY_SIZE)
                {
                    shrLog("\n");
                    shrLogEx(LOGBOTH | MASTER, 0, "scan-Short, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %u, Workgroup = %u\n",
                           (1.0e-6 * (double)arrayLength/timerValue), timerValue, arrayLength, 1, szWorkgroup);
                    shrLog("\n");
                }
        }

    shrLog("***Running GPU scan for large arrays (%u identical iterations)...\n\n", iCycles);
        for(uint arrayLength = MIN_LARGE_ARRAY_SIZE; arrayLength <= MAX_LARGE_ARRAY_SIZE; arrayLength <<= 1){
            shrLog("Running scan for %u elements (%u arrays)...\n", arrayLength, N / arrayLength);
                cutilSafeCall( cudaThreadSynchronize() );
                cutResetTimer(hTimer);
                cutStartTimer(hTimer);
                for(int i = 0; i < iCycles; i++)
                {
                    szWorkgroup = scanExclusiveLarge(d_Output, d_Input, N / arrayLength, arrayLength);
                }
                cutilSafeCall( cudaThreadSynchronize() );
                cutStopTimer(hTimer);
                double timerValue = 1.0e-3 * cutGetTimerValue(hTimer) / iCycles;

            shrLog("Validating the results...\n");
                shrLog("...reading back GPU results\n");
                    cutilSafeCall( cudaMemcpy(h_OutputGPU, d_Output, N * sizeof(uint), cudaMemcpyDeviceToHost) );

                shrLog("...scanExclusiveHost()\n");
                    scanExclusiveHost(h_OutputCPU, h_Input, N / arrayLength, arrayLength);

                // Compare GPU results with CPU results and accumulate error for this test
                shrLog(" ...comparing the results\n");
                    int localFlag = 1;
                    for(uint i = 0; i < N; i++)
                    {
                        if(h_OutputCPU[i] != h_OutputGPU[i])
                        {
                            localFlag = 0;
                            break;
                        }
                    }

                // Log message on individual test result, then accumulate to global flag
                shrLog(" ...Results %s\n\n", (localFlag == 1) ? "Match" : "DON'T Match !!!");
                globalFlag = globalFlag && localFlag;

                // Data log
                if (arrayLength == MAX_LARGE_ARRAY_SIZE)
                {
                    shrLog("\n");
                    shrLogEx(LOGBOTH | MASTER, 0, "scan-Large, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %u, Workgroup = %u\n",
                           (1.0e-6 * (double)arrayLength/timerValue), timerValue, arrayLength, 1, szWorkgroup);
                    shrLog("\n");
                }
        }

    // pass or fail (cumulative... all tests in the loop)
    shrLog(globalFlag ? "PASSED\n\n" : "FAILED\n\n");
	GpuProfiling::printResults();

    shrLog("Shutting down...\n");
        closeScan();
        cutilSafeCall( cudaFree(d_Output));
        cutilSafeCall( cudaFree(d_Input));

        cutilCheckError( cutDeleteTimer(hTimer) );

        cudaThreadExit();
		exit(0);
        shrEXIT(argc, (const char**)argv);
}