예제 #1
0
파일: oclNbody.cpp 프로젝트: ebads67/ipmacc
//*****************************************************************************
bool CompareResults(int numBodies)
{
    // Run computation on the device/GPU
    shrLog("  Computing on the Device / GPU...\n");
    nbodyGPU->update(0.001f);
    nbodyGPU->synchronizeThreads();

    // Write out device/GPU data file for regression analysis
    shrLog("  Writing out Device/GPU data file for analysis...\n");
    float* fGPUData = nbodyGPU->getArray(BodySystem::BODYSYSTEM_POSITION);
    shrWriteFilef( "oclNbody_Regression.dat", fGPUData, numBodies, 0.0, false);

    // Run computation on the host CPU
    shrLog("  Computing on the Host / CPU...\n\n");
    BodySystemCPU* nbodyCPU = new BodySystemCPU(numBodies);
    nbodyCPU->setArray(BodySystem::BODYSYSTEM_POSITION, hPos);
    nbodyCPU->setArray(BodySystem::BODYSYSTEM_VELOCITY, hVel);
    nbodyCPU->update(0.001f);

    // Check if result matches 
    shrBOOL bMatch = shrComparefe(fGPUData, 
                        nbodyGPU->getArray(BodySystem::BODYSYSTEM_POSITION), 
						numBodies, .001f);
    shrLog("Results %s\n\n", (shrTRUE == bMatch) ? "Match" : "do not match!");

    // Cleanup local allocation
    if(nbodyCPU)delete nbodyCPU; 

    return (shrTRUE == bMatch);
}
extern "C" void initHistogram64(cl_context cxGPUContext, cl_command_queue cqParamCommandQue, const char **argv){
    cl_int ciErrNum;
    size_t kernelLength;

    shrLog("...loading Histogram64.cl from file\n");
        char *cHistogram64 = oclLoadProgSource(shrFindFilePath("Histogram64.cl", argv[0]), "// My comment\n", &kernelLength);
        shrCheckError(cHistogram64 != NULL, shrTRUE);

    shrLog("...creating histogram64 program\n");
         cpHistogram64 = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cHistogram64, &kernelLength, &ciErrNum);
        shrCheckError(ciErrNum, CL_SUCCESS);

    shrLog("...building histogram64 program\n");
        ciErrNum = clBuildProgram(cpHistogram64, 0, NULL, compileOptions, NULL, NULL);
        shrCheckError(ciErrNum, CL_SUCCESS);

    shrLog("...creating histogram64 kernels\n");
        ckHistogram64 = clCreateKernel(cpHistogram64, "histogram64", &ciErrNum);
        shrCheckError(ciErrNum, CL_SUCCESS);
        ckMergeHistogram64 = clCreateKernel(cpHistogram64, "mergeHistogram64", &ciErrNum);
        shrCheckError(ciErrNum, CL_SUCCESS);

    shrLog("...allocating internal histogram64 buffer\n");
        d_PartialHistograms = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, MAX_PARTIAL_HISTOGRAM64_COUNT * HISTOGRAM64_BIN_COUNT * sizeof(uint), NULL, &ciErrNum);
        shrCheckError(ciErrNum, CL_SUCCESS);

    //Save default command queue
    cqDefaultCommandQue = cqParamCommandQue;

    //Discard temp storage
    free(cHistogram64);

    //Save ptx code to separate file
    oclLogPtx(cpHistogram64, oclGetFirstDev(cxGPUContext), "Histogram64.ptx");
}
예제 #3
0
파일: oclMatrixMul.cpp 프로젝트: sk1go/mm
void printDiff(float *data1, float *data2, int width, int height, int iListLength, float fListTol)
{
    shrLog("Listing first %d Differences > %.6f...\n", iListLength, fListTol);
    int i,j,k;
    int error_count=0;
    for (j = 0; j < height; j++) 
    {
        if (error_count < iListLength)
        {
            shrLog("\n  Row %d:\n", j);
        }
        for (i = 0; i < width; i++) 
        {
            k = j * width + i;
            float fDiff = fabs(data1[k] - data2[k]);
            if (fDiff > fListTol) 
            {                
                if (error_count < iListLength)
                {
                    shrLog("    Loc(%d,%d)\tCPU=%.5f\tGPU=%.5f\tDiff=%.6f\n", i, j, data1[k], data2[k], fDiff);
                }
                error_count++;
            }
        }
    }
    shrLog(" \n  Total Errors = %d\n\n", error_count);
}
예제 #4
0
// assumes the values were initially indices into the array, for simplicity of
// checking correct order of values
bool verifySortUint(unsigned int *keysSorted,
        unsigned int *valuesSorted,
        unsigned int *keysUnsorted,
        unsigned int len)
{
    bool passed = true;
    for(unsigned int i=0; i<len-1; ++i)
    {
        if( (keysSorted[i])>(keysSorted[i+1]) )
        {
            shrLog("Unordered key[%d]: %d > key[%d]: %d\n", i, keysSorted[i], i+1, keysSorted[i+1]);
            passed = false;
            break;
        }
    }

    if (valuesSorted)
    {
        for(unsigned int i=0; i<len; ++i)
        {
            if( keysUnsorted[valuesSorted[i]] != keysSorted[i] )
            {
                shrLog("Incorrectly sorted value[%u] (%u): %u != %u\n",
                        i, valuesSorted[i], keysUnsorted[valuesSorted[i]], keysSorted[i]);
                passed = false;
                break;
            }
        }
    }

    return passed;
}
예제 #5
0
void initGL( int *argc, char **argv )
{
    // initialize GLUT
    glutInit(argc, argv);
    glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE);
    glutInitWindowSize(768, 768);
    glutCreateWindow("CUDA Box Filter");
    glutDisplayFunc(display);

    glutKeyboardFunc(keyboard);
    glutReshapeFunc(reshape);
    glutTimerFunc(REFRESH_DELAY, timerEvent, 0);

    glewInit();
    if (g_bFBODisplay) {
        if (!glewIsSupported( "GL_VERSION_2_0 GL_ARB_fragment_program GL_EXT_framebuffer_object" )) {
            shrLog("Error: failed to get minimal extensions for demo\n");
            shrLog("This sample requires:\n");
            shrLog("  OpenGL version 2.0\n");
            shrLog("  GL_ARB_fragment_program\n");
            shrLog("  GL_EXT_framebuffer_object\n");
            exit(-1);
        }
    } else {
        if (!glewIsSupported( "GL_VERSION_1_5 GL_ARB_vertex_buffer_object GL_ARB_pixel_buffer_object" )) {
            shrLog("Error: failed to get minimal extensions for demo\n");
            shrLog("This sample requires:\n");
            shrLog("  OpenGL version 1.5\n");
            shrLog("  GL_ARB_vertex_buffer_object\n");
            shrLog("  GL_ARB_pixel_buffer_object\n");
            exit(-1);
        }
    }
}
예제 #6
0
//////////////////////////////////////////////////////////////////////////////
//! Gets the platform ID for NVIDIA if available, otherwise default
//!
//! @return the id 
//! @param clSelectedPlatformID         OpenCL platoform ID
//////////////////////////////////////////////////////////////////////////////
cl_int oclGetPlatformID(cl_platform_id* clSelectedPlatformID)
{
    char chBuffer[1024];
    cl_uint num_platforms; 
    cl_platform_id* clPlatformIDs;
    cl_int ciErrNum;
    *clSelectedPlatformID = NULL;

    // Get OpenCL platform count
    ciErrNum = clGetPlatformIDs (0, NULL, &num_platforms);
    if (ciErrNum != CL_SUCCESS)
    {
        shrLog(" Error %i in clGetPlatformIDs Call !!!\n\n", ciErrNum);
        return -1000;
    }
    else 
    {
        if(num_platforms == 0)
        {
            shrLog("No OpenCL platform found!\n\n");
            return -2000;
        }
        else 
        {
            // if there's a platform or more, make space for ID's
            if ((clPlatformIDs = (cl_platform_id*)malloc(num_platforms * sizeof(cl_platform_id))) == NULL)
            {
                shrLog("Failed to allocate memory for cl_platform ID's!\n\n");
                return -3000;
            }

            // get platform info for each platform and trap the NVIDIA platform if found
            ciErrNum = clGetPlatformIDs (num_platforms, clPlatformIDs, NULL);
            for(cl_uint i = 0; i < num_platforms; ++i)
            {
                ciErrNum = clGetPlatformInfo (clPlatformIDs[i], CL_PLATFORM_NAME, 1024, &chBuffer, NULL);
                if(ciErrNum == CL_SUCCESS)
                {
                    if(strstr(chBuffer, "NVIDIA") != NULL)
                    {
                        *clSelectedPlatformID = clPlatformIDs[i];
                        break;
                    }
                }
            }

            // default to zeroeth platform if NVIDIA not found
            if(*clSelectedPlatformID == NULL)
            {
                shrLog("WARNING: NVIDIA OpenCL platform not found - defaulting to first platform!\n\n");
                *clSelectedPlatformID = clPlatformIDs[0];
            }

            free(clPlatformIDs);
        }
    }

    return CL_SUCCESS;
}
extern "C" void initConvolutionSeparable(cl_context cxGPUContext, cl_command_queue cqParamCommandQueue, const char **argv){
    cl_int ciErrNum;
    size_t kernelLength;

    shrLog("Loading ConvolutionSeparable.cl...\n");
        char *cPathAndName = shrFindFilePath("ConvolutionSeparable.cl", argv[0]);
        oclCheckError(cPathAndName != NULL, shrTRUE);
        char *cConvolutionSeparable = oclLoadProgSource(cPathAndName, "// My comment\n", &kernelLength);
        oclCheckError(cConvolutionSeparable != NULL, shrTRUE);

    shrLog("Creating convolutionSeparable program...\n");
        cpConvolutionSeparable = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cConvolutionSeparable, &kernelLength, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);

    shrLog("Building convolutionSeparable program...\n");
        char compileOptions[2048];
        #ifdef _WIN32
            sprintf_s(compileOptions, 2048, "\
                -cl-fast-relaxed-math                                  \
                -D KERNEL_RADIUS=%u\
                -D ROWS_BLOCKDIM_X=%u -D COLUMNS_BLOCKDIM_X=%u\
                -D ROWS_BLOCKDIM_Y=%u -D COLUMNS_BLOCKDIM_Y=%u\
                -D ROWS_RESULT_STEPS=%u -D COLUMNS_RESULT_STEPS=%u\
                -D ROWS_HALO_STEPS=%u -D COLUMNS_HALO_STEPS=%u\
                ",
                KERNEL_RADIUS,
                ROWS_BLOCKDIM_X,   COLUMNS_BLOCKDIM_X,
                ROWS_BLOCKDIM_Y,   COLUMNS_BLOCKDIM_Y,
                ROWS_RESULT_STEPS, COLUMNS_RESULT_STEPS,
                ROWS_HALO_STEPS,   COLUMNS_HALO_STEPS
            );
        #else
            sprintf(compileOptions, "\
                -cl-fast-relaxed-math                                  \
                -D KERNEL_RADIUS=%u\
                -D ROWS_BLOCKDIM_X=%u -D COLUMNS_BLOCKDIM_X=%u\
                -D ROWS_BLOCKDIM_Y=%u -D COLUMNS_BLOCKDIM_Y=%u\
                -D ROWS_RESULT_STEPS=%u -D COLUMNS_RESULT_STEPS=%u\
                -D ROWS_HALO_STEPS=%u -D COLUMNS_HALO_STEPS=%u\
                ",
                KERNEL_RADIUS,
                ROWS_BLOCKDIM_X,   COLUMNS_BLOCKDIM_X,
                ROWS_BLOCKDIM_Y,   COLUMNS_BLOCKDIM_Y,
                ROWS_RESULT_STEPS, COLUMNS_RESULT_STEPS,
                ROWS_HALO_STEPS,   COLUMNS_HALO_STEPS
            );
        #endif
        ciErrNum = clBuildProgram(cpConvolutionSeparable, 0, NULL, compileOptions, NULL, NULL);
        oclCheckError(ciErrNum, CL_SUCCESS);
        ckConvolutionRows = clCreateKernel(cpConvolutionSeparable, "convolutionRows", &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);
        ckConvolutionColumns = clCreateKernel(cpConvolutionSeparable, "convolutionColumns", &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);

    cqDefaultCommandQueue = cqParamCommandQueue;
    free(cConvolutionSeparable);
}
예제 #8
0
//-----------------------------------------------------------------------------
// Name: InitD3D9()
// Desc: Initializes Direct3D9
//-----------------------------------------------------------------------------
HRESULT InitD3D9(HWND hWnd) 
{
	// Create the D3D object.
    if( NULL == ( g_pD3D = Direct3DCreate9( D3D_SDK_VERSION ) ) )
	{
        shrLog("No Direct3D9 device available\n");
        Cleanup(EXIT_SUCCESS);
	}

    // Find the first CL capable device
    for(g_iAdapter = 0; g_iAdapter < g_pD3D->GetAdapterCount(); g_iAdapter++)
    {
		D3DCAPS9 caps;
		if (FAILED(g_pD3D->GetDeviceCaps(g_iAdapter, D3DDEVTYPE_HAL, &caps)))
			// Adapter doesn't support Direct3D
			continue;

        if(FAILED(g_pD3D->GetAdapterIdentifier(g_iAdapter, 0, &g_adapter_id)))
			return E_FAIL;
		break;
    }
    // we check to make sure we have found a OpenCL-compatible D3D device to work on
    if(g_iAdapter == g_pD3D->GetAdapterCount() ) 
    {
        shrLog("No OpenCL-compatible Direct3D9 device available\n");
		// destroy the D3D device
		g_pD3D->Release();
        Cleanup(EXIT_SUCCESS);
    }

	// Create the D3D Display Device
    RECT                  rc;       GetClientRect(hWnd,&rc);
    D3DDISPLAYMODE        d3ddm;    g_pD3D->GetAdapterDisplayMode(g_iAdapter, &d3ddm);
    D3DPRESENT_PARAMETERS d3dpp;    ZeroMemory( &d3dpp, sizeof(d3dpp) );
    d3dpp.Windowed               = TRUE;
    d3dpp.BackBufferCount        = 1;
    d3dpp.SwapEffect             = D3DSWAPEFFECT_DISCARD;
    d3dpp.hDeviceWindow          = hWnd;
	d3dpp.BackBufferWidth	     = g_WindowWidth;
    d3dpp.BackBufferHeight       = g_WindowHeight;

    d3dpp.BackBufferFormat       = d3ddm.Format;

    
	if (FAILED (g_pD3D->CreateDevice (g_iAdapter, D3DDEVTYPE_HAL, hWnd, 
									  D3DCREATE_HARDWARE_VERTEXPROCESSING, 
									  &d3dpp, &g_pD3DDevice) ))
		return E_FAIL;	

	// We clear the back buffer
	g_pD3DDevice->BeginScene();
	g_pD3DDevice->Clear(0, NULL, D3DCLEAR_TARGET, 0, 1.0f, 0);	
	g_pD3DDevice->EndScene();

	return S_OK;
}
예제 #9
0
void
showHelp()
{
    shrLog("\n> Command line options\n");
    shrLog("\t-fullscreen    (run n-body simulation in fullscreen mode)\n");
    shrLog("\t-fp64          (use double precision floating point values for simulation)\n");
    shrLog("\t-hostmem       (stores simulation data in host memory)\n");
    shrLog("\t-benchmark     (run benchmark to measure perfomance) \n");
    shrLog("\t-n=<numBodies> (set particle number) \n"); 
    shrLog("\t-compare       (compare to CPU results)\n");
    shrLog("\t-device=<d>    (where d=0,1,2.... for the CUDA device to use)\n");
    shrLog("\t-numdevices=<n> (where n is the number of CUDA devices to use for simulation)\n");
    shrLog("\t-cpu (run n-body simulation on the CPU)\n\n");
}
// Keyboard events handler
//*****************************************************************************
void KeyboardGL(unsigned char key, int x, int y)
{
    switch(key) 
    {
        case 'P':   // P toggles Processing between CPU and GPU
        case 'p':   // p toggles Processing between CPU and GPU
            if (iProcFlag == 0)
            {
                iProcFlag = 1;
            }
            else 
            {
                iProcFlag = 0;
            }
            shrLog("\n%s Processing...\n", cProcessor[iProcFlag]);
            break;
        case ' ':   // space bar toggles processing on and off
            bPostprocess = !bPostprocess;
            shrLog("\nPostprocessing (Blur Filter) Toggled %s...\n", bPostprocess ? "ON" : "OFF");
            break;
        case 'A':   // 'A' toggles animation (spinning of teacup) on/off  
        case 'a':   // 'a' toggles animation (spinning of teacup) on/off 
            bAnimate = !bAnimate;
            shrLog("\nGL Animation (Rotation) Toggled %s...\n", bAnimate ? "ON" : "OFF");
            break;
        case '=':
        case '+':
            if (blur_radius < 16) blur_radius++;
            shrLog("\nBlur radius = %d\n", blur_radius);
            break;
        case '-':
        case '_':
            if (blur_radius > 1) blur_radius--;
            shrLog("\nBlur radius = %d\n", blur_radius);
            break;
        case '\033': // escape quits
        case '\015': // Enter quits    
        case 'Q':    // Q quits
        case 'q':    // q (or escape) quits
            // Cleanup then quit (without prompting)
            bNoPrompt = shrTRUE;
            Cleanup(EXIT_SUCCESS);
            break;
    }

    // Trigger fps update and call for refresh
    TriggerFPSUpdate();
    glutPostRedisplay();
}
예제 #11
0
파일: oclMatrixMul.cpp 프로젝트: sk1go/mm
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char** argv)
{
    shrQAStart(argc, argv);

    // start the logs
    shrSetLogFileName ("oclMatrixMul.txt");
    shrLog("%s Starting...\n\n", argv[0]); 

    // run the code
    bool bOK = (runTest(argc, (const char **)argv) == CL_SUCCESS);
    shrLog("%s\n\n", (bOK ? "PASSED" : "FAILED"));

    // finish
    shrQAFinishExit(argc, (const char **)argv, (bOK ? QA_PASSED : QA_FAILED));
}
예제 #12
0
//////////////////////////////////////////////////////////////////////
//  readback
//
//////////////////////////////////////////////////////////////////////
bool CheckBackBuffer::readback( GLuint width, GLuint height )
{
    bool ret = false;

    if (m_bUsePBO) 
    {
        // binds the PBO for readback
        bindReadback();

        // Initiate the readback BLT from BackBuffer->PBO->membuf
	    glReadPixels(0, 0, width, height, getPixelFormat(),      GL_UNSIGNED_BYTE, BUFFER_OFFSET(0));
        ret = checkStatus(__FILE__, __LINE__, true);
        if (!ret) shrLog("CheckBackBuffer::glReadPixels() checkStatus = %d\n", ret);

	    // map - unmap simulates readback without the copy
	    void *ioMem = glMapBufferARB(GL_PIXEL_PACK_BUFFER_ARB, GL_READ_ONLY_ARB);
        memcpy(m_pImageData,    ioMem, width*height*m_Bpp);

		glUnmapBufferARB(GL_PIXEL_PACK_BUFFER_ARB);

        // release the PBO
        unbindReadback();
    } else {
        // reading direct from the backbuffer
        glReadBuffer(GL_FRONT);
        glReadPixels(0, 0, width, height, getPixelFormat(), GL_UNSIGNED_BYTE, m_pImageData);
    }

    return ret; 
}
예제 #13
0
// Cleanup and exit code
// *********************************************************************
void Cleanup(int iExitCode)
{
    // Cleanup allocated objects
    shrLog("Starting Cleanup...\n\n");
    if(cdDevices)free(cdDevices);
    if(cPathAndName)free(cPathAndName);
    if(cSourceCL)free(cSourceCL);
    if(ceEvent)clReleaseEvent(ceEvent);  
    if(ckKernel)clReleaseKernel(ckKernel);  
    if(cpProgram)clReleaseProgram(cpProgram);
    if(cqCommandQueue)clReleaseCommandQueue(cqCommandQueue);
    if(cxGPUContext)clReleaseContext(cxGPUContext);
    if (cmM)clReleaseMemObject(cmM);
    if (cmV)clReleaseMemObject(cmV);
    if (cmW)clReleaseMemObject(cmW);

    // Free host memory
    free(M); 
    free(V);
    free(W);
    free(Golden);

    shrLogEx(LOGBOTH | CLOSELOG, 0, "%s Exiting...\n", cExecutableName);
    exit (iExitCode);
}
예제 #14
0
void 
CheckRender::savePGM(  const char *zfilename, bool bInvert, void **ppReadBuf )
{
    if (zfilename != NULL) {
        if (bInvert) {
            unsigned char *readBuf;
            unsigned char *writeBuf= (unsigned char *)malloc(m_Width * m_Height);

            for (unsigned int y=0; y < m_Height; y++) {
                if (ppReadBuf) {
                    readBuf = *(unsigned char **)ppReadBuf;
                } else {
                    readBuf = (unsigned char *)m_pImageData;
                }
                memcpy(&writeBuf[m_Width*m_Bpp*y], (readBuf+ m_Width*(m_Height-1-y)), m_Width);
            }
            // we copy the results back to original system buffer
            if (ppReadBuf) {
                memcpy(*ppReadBuf, writeBuf, m_Width*m_Height);
            } else {
                memcpy(m_pImageData, writeBuf, m_Width*m_Height);
            }
            free (writeBuf);
        }
        shrLog("\n> Saving PGM: <%s>\n", zfilename);
        if (ppReadBuf) {
		    shrSavePGMub(zfilename, *(unsigned char **)ppReadBuf, m_Width, m_Height);
        } else {
		    shrSavePGMub(zfilename, (unsigned char *)m_pImageData, m_Width, m_Height);
        }
    }
}
예제 #15
0
bool 
CheckRender::PPMvsPPM( const char *src_file, const char *ref_file, const float epsilon, const float threshold )
{
    char *ref_file_path = shrFindFilePath(ref_file, m_ExecPath);
    if (ref_file_path == NULL) {
        shrLog("\nCheckRender::PPMvsPPM unable to find <%s> in <%s> Aborting comparison!\n", ref_file, m_ExecPath);
        return false;
    } 

    if (src_file == NULL || ref_file == NULL) {
        shrLog("\nCheckRender::PPMvsPPM: Aborting comparison\n");
        return false;
    }
    
    return (shrComparePPM(src_file, ref_file_path, epsilon, threshold) == shrTRUE ? true : false);
}
void Cleanup (int iExitCode)
{
    // Cleanup allocated objects
    shrLog("Starting Cleanup...\n\n");
    if(cPathAndName)free(cPathAndName);
    if(cSourceCL)free(cSourceCL);
	if(ckKernel)clReleaseKernel(ckKernel);  
    if(cpProgram)clReleaseProgram(cpProgram);
    if(cqCommandQueue)clReleaseCommandQueue(cqCommandQueue);
    if(cxGPUContext)clReleaseContext(cxGPUContext);
    if(cmDevSrcA)clReleaseMemObject(cmDevSrcA);
    if(cmDevSrcB)clReleaseMemObject(cmDevSrcB);
    if(cmDevDst)clReleaseMemObject(cmDevDst);

    // Free host memory
    free(srcA); 
    free(srcB);
    free (dst);
    free(Golden);

    // finalize logs and leave
    if (bNoPrompt)
    {
        shrLogEx(LOGBOTH | CLOSELOG, 0, "oclVectorAdd.exe Exiting...\n");
    }
    else 
    {
        shrLogEx(LOGBOTH | CLOSELOG, 0, "oclVectorAdd.exe Exiting...\nPress <Enter> to Quit\n");
        getchar();
    }
    exit (iExitCode);
}
예제 #17
0
void keyboard(unsigned char key, int /*x*/, int /*y*/)
{
    switch(key) {
    case 27:
        exit(0);
        break;
    case '=':
    case '+':
        if (filter_radius < (int)width-1 &&
                filter_radius < (int)height-1)
        {
            filter_radius++;
        }
        break;
    case '-':
        if (filter_radius > 1) filter_radius--;
        break;
    case ']':
        iterations++;
        break;
    case '[':
        if (iterations>1) iterations--;
        break;
    default:
        break;
    }
    shrLog("radius = %d, iterations = %d\n", filter_radius, iterations);
}
예제 #18
0
    bool _compareResults(int numBodies)
    {
        assert(m_nbodyCuda);

        bool passed = true;

        m_nbody->update(0.001f);

        {
            m_nbodyCpu = new BodySystemCPU<T>(numBodies);

            m_nbodyCpu->setArray(BODYSYSTEM_POSITION, m_hPos);
            m_nbodyCpu->setArray(BODYSYSTEM_VELOCITY, m_hVel);

            m_nbodyCpu->update(0.001f);

            T* cudaPos = m_nbodyCuda->getArray(BODYSYSTEM_POSITION);
            T* cpuPos  = m_nbodyCpu->getArray(BODYSYSTEM_POSITION);

            T tolerance = 0.0005f;
            for (int i = 0; i < numBodies; i++)
            {
//                if (((i + 1) % 4) && fabs(cpuPos[i] - cudaPos[i]) > tolerance)
                if (((i + 1) % 4) && fabs(cpuPos[i] - cudaPos[i]) > tolerance)
                {
                    passed = false;
                    shrLog("Error: element %d: (host)%f != (device)%f\n", i, cpuPos[i], cudaPos[i]);
                }
            }
        }
        return passed;
    }
void ParticleSystem::_initialize(int numParticles){
    assert(!m_bInitialized);
    m_numParticles = numParticles;

    //Allocate host storage
    m_hPos          = (float *)malloc(m_numParticles * 4 * sizeof(float));
    m_hVel          = (float *)malloc(m_numParticles * 4 * sizeof(float));
    m_hReorderedPos = (float *)malloc(m_numParticles * 4 * sizeof(float));
    m_hReorderedVel = (float *)malloc(m_numParticles * 4 * sizeof(float));
    m_hHash         = (uint  *)malloc(m_numParticles * sizeof(uint));
    m_hIndex        = (uint  *)malloc(m_numParticles * sizeof(uint));
    m_hCellStart    = (uint  *)malloc(m_numGridCells * sizeof(uint));
    m_hCellEnd      = (uint  *)malloc(m_numGridCells * sizeof(uint));

    memset(m_hPos, 0, m_numParticles * 4 * sizeof(float));
    memset(m_hVel, 0, m_numParticles * 4 * sizeof(float));
    memset(m_hCellStart, 0, m_numGridCells * sizeof(uint));
    memset(m_hCellEnd,   0, m_numGridCells * sizeof(uint));

    //Allocate GPU data
    shrLog("Allocating GPU Data buffers...\n\n");
    allocateArray(&m_dPos,          m_numParticles * 4 * sizeof(float));
    allocateArray(&m_dVel,          m_numParticles * 4 * sizeof(float));
    allocateArray(&m_dReorderedPos, m_numParticles * 4 * sizeof(float));
    allocateArray(&m_dReorderedVel, m_numParticles * 4 * sizeof(float));
    allocateArray(&m_dHash,         m_numParticles * sizeof(uint));
    allocateArray(&m_dIndex,        m_numParticles * sizeof(uint));
    allocateArray(&m_dCellStart,    m_numGridCells * sizeof(uint));
    allocateArray(&m_dCellEnd,      m_numGridCells * sizeof(uint));

    if (!m_bQATest)
    {
        //Allocate VBO storage
        m_posVbo   = createVBO(m_numParticles * 4 * sizeof(float));
        m_colorVBO = createVBO(m_numParticles * 4 * sizeof(float));

        //Fill color buffer
        glBindBufferARB(GL_ARRAY_BUFFER, m_colorVBO);
        float *data = (float *)glMapBufferARB(GL_ARRAY_BUFFER, GL_WRITE_ONLY);
            float *ptr = data;
            for(uint i = 0; i < m_numParticles; i++){
                float t = (float)i / (float) m_numParticles;
                #if 0
                    *ptr++ = rand() / (float) RAND_MAX;
                    *ptr++ = rand() / (float) RAND_MAX;
                    *ptr++ = rand() / (float) RAND_MAX;
                #else
                    colorRamp(t, ptr);
                    ptr += 3;
                #endif
                *ptr++ = 1.0f;
            }
        glUnmapBufferARB(GL_ARRAY_BUFFER);
    }

    setParameters(&m_params);
    setParametersHost(&m_params);
    m_bInitialized = true;
}
예제 #20
0
//////////////////////////////////////////////////////////////////////////////
//! Get and log the binary (PTX) from the OpenCL compiler for the requested program & device
//!
//! @param cpProgram    OpenCL program
//! @param cdDevice     device of interest
//////////////////////////////////////////////////////////////////////////////
void oclLogBuildInfo(cl_program cpProgram, cl_device_id cdDevice)
{
    // write out the build log and ptx, then exit
    char cBuildLog[10240];
    clGetProgramBuildInfo(cpProgram, cdDevice, CL_PROGRAM_BUILD_LOG, 
                          sizeof(cBuildLog), cBuildLog, NULL );
    shrLog("\n%s\nBuild Log:\n%s\n%s\n", HDASHLINE, cBuildLog, HDASHLINE);
}
예제 #21
0
void loadImageData(int argc, char **argv)
{
    // load image (needed so we can get the width and height before we create the window
    char* image_path = NULL;
    if (argc >= 1) image_path = shrFindFilePath(image_filename, argv[0]);
    if (image_path == 0) {
        shrLog("Error finding image file '%s'\n", image_filename);
        exit(EXIT_FAILURE);
    }

    cutilCheckError(cutLoadPPM4ub(image_path, (unsigned char **) &h_img, &width, &height));
    if (!h_img) {
        shrLog("Error opening file '%s'\n", image_path);
        exit(-1);
    }
    shrLog("Loaded '%s', %d x %d pixels\n\n", image_path, width, height);
}
예제 #22
0
    void _runBenchmark(int iterations)
    {
        // once without timing to prime the device
        if (!useCpu)
            m_nbody->update(activeParams.m_timestep);

        if (useCpu)
        {
            cutCreateTimer(&timer);
            cutStartTimer(timer);
        }
        else
        {
            cutilSafeCall(cudaEventRecord(startEvent, 0));
        }

        for (int i = 0; i < iterations; ++i)
        {
            m_nbody->update(activeParams.m_timestep);
        }

        float milliseconds = 0;

        if (useCpu)
        {
            cutStopTimer(timer);
            milliseconds = cutGetTimerValue(timer);
            cutDeleteTimer(timer);
        }
        else
        {
            cutilSafeCall(cudaEventRecord(stopEvent, 0));  
            cutilSafeCall(cudaEventSynchronize(stopEvent));
            cutilSafeCall( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent));
        }
        
        double interactionsPerSecond = 0;
        double gflops = 0;
        computePerfStats(interactionsPerSecond, gflops, milliseconds, iterations);

        shrLog("%d bodies, total time for %d iterations: %.3f ms\n", 
            numBodies, iterations, milliseconds);
        shrLog("= %.3f billion interactions per second\n", interactionsPerSecond);
        shrLog("= %.3f %s-precision GFLOP/s at %d flops per interaction\n", gflops, 
               (sizeof(T) > 4) ? "double" : "single", flopsPerInteraction);   
    }
inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device)
{
	CUresult error_result = cuDeviceGetAttribute( attribute, device_attribute, device );
    if (error_result != CUDA_SUCCESS) {
        shrLog( "cuDeviceGetAttribute returned %d\n-> %s\n", (int)error_result, getCudaDrvErrorString(error_result) );
		exit(0);
    }
}
예제 #24
0
void b2CLCommonData::initReadLastJointImpulses()
{
	if (b2clGlobal_OpenCLSupported)
	{
		printf("Initializing b2CLCommonData...\n");
    
		int err;
    
		//load opencl programs from files
		char* commonDataSource = 0;
		size_t commonDataSourceLen = 0;

		shrLog("...loading b2CLCommonData.cl\n");
    #ifdef linux
    	commonDataSource = b2clLoadProgSource(shrFindFilePath("/opt/apps/com.samsung.browser/include/Box2D/Common/OpenCL/b2CLCommonData.cl", NULL), "// My comment\n", &commonDataSourceLen);
	#elif defined (_WIN32)
		commonDataSource = b2clLoadProgSource(shrFindFilePath("../../Box2D/Common/OpenCL/b2CLCommonData.cl", NULL), "// My comment\n", &commonDataSourceLen);
    #elif defined (__EMSCRIPTEN__)
        commonDataSource = b2clLoadProgSource(shrFindFilePath("./Common/OpenCL/b2CLCommonData.cl", NULL), "// My comment\n", &commonDataSourceLen);
	#else
		commonDataSource = b2clLoadProgSource(shrFindFilePath("../../../Box2D/Common/OpenCL/b2CLCommonData.cl", NULL), "// My comment\n", &commonDataSourceLen);
	#endif
		if(commonDataSource == NULL)
		{
			b2Log("Could not load program source, is path 'b2CLCommonData.cl' correct?");
		}

		//create the compute program from source kernel code
		commonDataProgram = clCreateProgramWithSource(b2CLDevice::instance().GetContext(), 1, (const char**) &commonDataSource, NULL, &err);
		if (!commonDataProgram)
		{
			printf("Error: Failed to create compute program!\n");
			exit(1);
		}
    
		//build the program
		err = clBuildProgram(commonDataProgram, 0, NULL, OPENCL_BUILD_PATH, NULL, NULL);
		if (err != CL_SUCCESS)
		{
			size_t len;
			char buffer[20480];
        
			printf("Error: Failed to build program executable!\n");
			clGetProgramBuildInfo(commonDataProgram, b2CLDevice::instance().GetCurrentDevice(), CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
			printf("%s\n", buffer);
			exit(1);
		}
    
		//create the compute kernel
		readLastJointImpulseKernel = clCreateKernel(commonDataProgram, "ReadLastJointImpulses", &err);
		if (!readLastJointImpulseKernel || err != CL_SUCCESS)
		{
			printf("Error: Failed to create compute kernel!\n");
			exit(1);
		}
		b2CLDevice::instance().getMaximumKernelWorkGroupSize(readLastJointImpulseKernel, maxWorkGroupSizeForreadLastJointImpulse);
	}
}
예제 #25
0
////////////////////////////////////////////////////////////////////////////////
//! Run a simple benchmark test for CUDA
////////////////////////////////////////////////////////////////////////////////
void
runBenchmark( int argc, char **argv )
{
    int devID = 0;
    shrLog("[runBenchmark]: [%s]\n", sSDKsample);
    devID = cutilChooseCudaDevice(argc, argv);

    loadImageData(argc, argv);

    initCuda();

    g_CheckRender       = new CheckBackBuffer(width, height, 4, false);
    g_CheckRender->setExecPath(argv[0]);

    unsigned int *d_result;
    cutilSafeCall( cudaMalloc( (void **)&d_result, width*height*sizeof(unsigned int)) );

    // warm-up
    boxFilterRGBA(d_img, d_temp, d_temp, width, height, filter_radius, iterations, nthreads);
    cutilSafeCall( cutilDeviceSynchronize() );

    // Start round-trip timer and process iCycles loops on the GPU
    iterations = 1;     // standard 1-pass filtering
    const int iCycles = 150;
    double dProcessingTime = 0.0;
    shrLog("\nRunning BoxFilterGPU for %d cycles...\n\n", iCycles);
    shrDeltaT(2);
    for (int i = 0; i < iCycles; i++)
    {
        dProcessingTime += boxFilterRGBA(d_img, d_temp, d_img, width, height, filter_radius, iterations, nthreads);
    }

    // check if kernel execution generated an error and sync host
    cutilCheckMsg("Error: boxFilterRGBA Kernel execution FAILED");
    cutilSafeCall(cutilDeviceSynchronize());

    // Get average computation time
    dProcessingTime /= (double)iCycles;

    // log testname, throughput, timing and config info to sample and master logs
    shrLogEx(LOGBOTH | MASTER, 0, "boxFilter-texture, Throughput = %.4f M RGBA Pixels/s, Time = %.5f s, Size = %u RGBA Pixels, NumDevsUsed = %u, Workgroup = %u\n",
             (1.0e-6 * width * height)/dProcessingTime, dProcessingTime,
             (width * height), 1, nthreads);
    shrLog("\n");
}
예제 #26
0
void CheckRender::bindReadback() {
    if (!m_bQAReadback) {
        shrLog("CheckRender::bindReadback() uninitialized!\n");
        return;
    }
    if (m_bUsePBO) {
		glBindBufferARB(GL_PIXEL_PACK_BUFFER_ARB, m_pboReadback);	// Bind the PBO
    }
}
예제 #27
0
void CheckRender::unbindReadback() {
    if (!m_bQAReadback) {
        shrLog("CheckRender::unbindReadback() uninitialized!\n");
        return;
    }
    if (m_bUsePBO) {
		glBindBufferARB(GL_PIXEL_PACK_BUFFER_ARB, 0);	// Release the bind on the PBO
    }
}
예제 #28
0
//////////////////////////////////////////////////////////////////////
//  readback 
//
//   Code to handle reading back of the FBO data
//
//////////////////////////////////////////////////////////////////////
bool CheckFBO::readback( GLuint width, GLuint height )
{
    bool ret = false;

    if (m_bUsePBO) {
        // binds the PBO for readback
        bindReadback();

        // bind FBO buffer (we want to transfer FBO -> PBO)
        glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, m_pFrameBufferObject->getFbo());

        // Now initiate the readback to PBO
	    glReadPixels(0, 0, width, height, getPixelFormat(),      GL_UNSIGNED_BYTE, BUFFER_OFFSET(0));
        ret = checkStatus(__FILE__, __LINE__, true);
        if (!ret) shrLog("CheckFBO::readback() FBO->PBO checkStatus = %d\n", ret);

	    // map - unmap simulates readback without the copy
	    void *ioMem = glMapBufferARB(GL_PIXEL_PACK_BUFFER_ARB, GL_READ_ONLY_ARB);
        memcpy(m_pImageData,    ioMem, width*height*m_Bpp);

		glUnmapBufferARB(GL_PIXEL_PACK_BUFFER_ARB);

        // release the FBO
		glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, 0); 

        // release the PBO
        unbindReadback();
    } else {
        // Reading back from FBO using glReadPixels
        glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, m_pFrameBufferObject->getFbo());
        ret = checkStatus(__FILE__, __LINE__, true);
        if (!ret) shrLog("CheckFBO::readback::glBindFramebufferEXT() checkStatus = %d\n", ret);

        glReadBuffer(static_cast<GLenum>(GL_COLOR_ATTACHMENT0_EXT));
        ret &= checkStatus(__FILE__, __LINE__, true);
        if (!ret) shrLog("CheckFBO::readback::glReadBuffer() checkStatus = %d\n", ret);

	    glReadPixels(0, 0, width, height, getPixelFormat(), GL_UNSIGNED_BYTE, m_pImageData);

	    glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, 0);
    }

    return CHECK_FBO;
}
/////////////////////////////////////////////////////////
//print results in an easily read format
////////////////////////////////////////////////////////
void printResultsReadable(unsigned int *memSizes, double* bandwidths, unsigned int count, memcpyKind kind, accessMode accMode, memoryMode memMode, int iNumDevs)
{
    // log config information 
    if (kind == DEVICE_TO_DEVICE)
    {
        shrLog("Device to Device Bandwidth, %i Device(s)\n", iNumDevs);
    }
    else 
    {
        if (kind == DEVICE_TO_HOST)
        {
            shrLog("Device to Host Bandwidth, %i Device(s), ", iNumDevs);
        }
        else if (kind == HOST_TO_DEVICE)
        {
            shrLog("Host to Device Bandwidth, %i Device(s), ", iNumDevs);
        }
        if(memMode == PAGEABLE)
        {
            shrLog("Paged memory");
        }
        else if (memMode == PINNED)
        {
            shrLog("Pinned memory");
        }
        if(accMode == DIRECT)
        {
            shrLog(", direct access\n");
        }
        else if (accMode == MAPPED)
        {
            shrLog(", mapped access\n");
        }
    }

    shrLog("   Transfer Size (Bytes)\tBandwidth(MB/s)\n");
    unsigned int i; 
    for(i = 0; i < (count - 1); i++)
    {
        shrLog("   %u\t\t\t%s%.1f\n", memSizes[i], (memSizes[i] < 10000)? "\t" : "", bandwidths[i]);
    }
    shrLog("   %u\t\t\t%s%.1f\n\n", memSizes[i], (memSizes[i] < 10000)? "\t" : "", bandwidths[i]);
}
예제 #30
0
bool CheckBackBuffer::checkStatus(const char *zfile, int line, bool silent)
{
    GLenum nErrorCode = glGetError();

    if (nErrorCode != GL_NO_ERROR)
    {
        if (!silent)
           shrLog("Assertion failed(%s,%d): %s\n", zfile, line, gluErrorString(nErrorCode));
    }
    return true;
}