Example #1
void ofxClScheduler::printDeviceInfo(int device_nr) {
	// get and log device info
	if (device_nr != -1)
		device = oclGetDev(context, device_nr);
	else
		device = oclGetMaxFlopsDev(context);
	
	oclPrintDevInfo(0, device);
}
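The helpers used above (oclGetDev, oclGetMaxFlopsDev, oclPrintDevInfo) come from the NVIDIA OpenCL SDK's oclUtils. A minimal stand-alone sketch of the same device-selection-and-logging idea in plain OpenCL might look like the following; the fixed array size and the first-device fallback are assumptions for illustration, not the sample's max-FLOPS lookup.

#include <stdio.h>
#include <CL/cl.h>

/* Hedged sketch: pick a device by index (or fall back to the first one) and
 * print a couple of its properties with plain clGetDeviceInfo calls. */
static void print_device_info(cl_device_id dev)
{
    char name[256];
    cl_uint units;
    clGetDeviceInfo(dev, CL_DEVICE_NAME, sizeof(name), name, NULL);
    clGetDeviceInfo(dev, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(units), &units, NULL);
    printf("Device: %s (%u compute units)\n", name, units);
}

int main(void)
{
    cl_platform_id platform;
    cl_device_id devices[8];
    cl_uint num_devices = 0;
    int device_nr = -1;                 /* -1 means "no explicit choice" */

    clGetPlatformIDs(1, &platform, NULL);
    clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 8, devices, &num_devices);
    if (num_devices == 0) return 1;

    cl_device_id dev = (device_nr >= 0 && (cl_uint)device_nr < num_devices)
                           ? devices[device_nr]
                           : devices[0]; /* simple fallback, not max-FLOPS */
    print_device_info(dev);
    return 0;
}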
Example #2
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, const char** argv) {

  const char *my_name = "[oclAvcDisc]";

  int bPassed = 1;
  char filename[500], cBuffer[1024], sProfileString[2048];
  FILE *log_ofs=NULL;
  time_t g_the_time;

  /* OpenCL variables */
  cl_int ciErrNum;
  cl_platform_id clSelectedPlatformID = NULL; 
  cl_uint ciDeviceCount = 0;
  cl_device_id *devices;

  sprintf(filename, "oclAvcDisc.txt");
  if( (log_ofs=fopen(filename, "a"))== NULL )  {
    fprintf(stderr, "[oclAvcDisc] Error, could not open file %s\n", filename);
    exit(1);
  }

  g_the_time = time(NULL);

  _write_log(log_ofs, "%s oclDeviceQuery.exe Starting...\n", my_name); 

  /* Get OpenCL platform ID for NVIDIA if available, otherwise default */
  _write_log(log_ofs, "%s OpenCL SW Info:\n", my_name);
  ciErrNum = oclGetPlatformID (&clSelectedPlatformID);
  oclCheckError(ciErrNum, CL_SUCCESS);

  /* Get OpenCL platform name and version */
  ciErrNum = clGetPlatformInfo (clSelectedPlatformID, CL_PLATFORM_NAME, sizeof(cBuffer), cBuffer, NULL);
  if (ciErrNum == CL_SUCCESS) {
    _write_log(log_ofs, "%s CL_PLATFORM_NAME: \t%s\n", my_name, cBuffer);
  } else {
    _write_log(log_ofs, "%s Error %i in clGetPlatformInfo Call !!!\n\n", my_name, ciErrNum);
    bPassed = 0;
  }

  ciErrNum = clGetPlatformInfo (clSelectedPlatformID, CL_PLATFORM_VERSION, sizeof(cBuffer), cBuffer, NULL);
  if (ciErrNum == CL_SUCCESS) {
    _write_log(log_ofs, "%s CL_PLATFORM_VERSION: \t%s\n", my_name, cBuffer);
  } else {
    _write_log(" Error %i in clGetPlatformInfo Call !!!\n\n", ciErrNum);
    bPassed = 0;
  }

  // Get and log OpenCL device info 
  _write_log(log_ofs, "%s OpenCL Device Info:\n\n", my_name);
  ciErrNum = clGetDeviceIDs (clSelectedPlatformID, CL_DEVICE_TYPE_ALL, 0, NULL, &ciDeviceCount);

  // check for 0 devices found or errors... 
  if (ciDeviceCount == 0) {
    _write_log(log_ofs, "%s No devices found supporting OpenCL (return code %i)\n\n", my_name, ciErrNum);
    bPassed = false;
  } else if (ciErrNum != CL_SUCCESS) {
    _write_log(log_ofs, "%s Error %i in clGetDeviceIDs call !!!\n\n", my_name, ciErrNum);
    bPassed = false;
  } else {
    // Get and log the OpenCL device ID's
    char cTemp[12];   /* large enough to hold a cl_uint as text */
    _write_log(log_ofs, "%s %u devices found supporting OpenCL:\n\n", my_name, ciDeviceCount);
    sprintf(cTemp, "%u", ciDeviceCount);
    if ((devices = (cl_device_id*)malloc(sizeof(cl_device_id) * ciDeviceCount)) == NULL) {
      _write_log(log_ofs, "%s Failed to allocate memory for devices !!!\n\n", my_name);
      fclose(log_ofs);
      exit(4);
    }
    ciErrNum = clGetDeviceIDs (clSelectedPlatformID, CL_DEVICE_TYPE_ALL, ciDeviceCount, devices, &ciDeviceCount);
    if (ciErrNum == CL_SUCCESS) {
      //Create a context for the devices
      cl_context cxGPUContext = clCreateContext(0, ciDeviceCount, devices, NULL, NULL, &ciErrNum);
      if (ciErrNum != CL_SUCCESS) {
        _write_log(log_ofs, "%s Error %i in clCreateContext call !!!\n\n", my_name, ciErrNum);
        bPassed = false;
      } else {
        // show info for each device in the context
        for(unsigned int i = 0; i < ciDeviceCount; ++i ) {  
          _write_log(log_ofs, "%s ---------------------------------\n", my_name);
          clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(cBuffer), cBuffer, NULL);
          _write_log(log_ofs, "%s Device %s\n", my_name, cBuffer);
          _write_log(log_ofs, "%s ---------------------------------\n", my_name);
          oclPrintDevInfo(LOGBOTH, devices[i]);
        }
        // Determine and show image format support 
        cl_uint uiNumSupportedFormats = 0;

        // 2D
        clGetSupportedImageFormats(cxGPUContext, CL_MEM_READ_ONLY, CL_MEM_OBJECT_IMAGE2D, 0, NULL, &uiNumSupportedFormats);
        cl_image_format *ImageFormats = NULL;
        ImageFormats = (cl_image_format*)malloc(uiNumSupportedFormats*sizeof(cl_image_format));
        if(ImageFormats==NULL) {
          _write_log(log_ofs, "%s Error, could not alloc ImageFormats\n", my_name);
          exit(2);
        }

        clGetSupportedImageFormats(cxGPUContext, CL_MEM_READ_ONLY, CL_MEM_OBJECT_IMAGE2D, uiNumSupportedFormats, ImageFormats, NULL);
        _write_log(log_ofs, "%s  ---------------------------------\n", my_name);
        _write_log(log_ofs, "%s  2D Image Formats Supported (%u)\n", my_name, uiNumSupportedFormats); 
        _write_log(log_ofs, "%s  ---------------------------------\n", my_name);
        _write_log(log_ofs, "%s  %-6s%-16s%-22s\n\n", my_name, "#", "Channel Order", "Channel Type"); 
        for(unsigned int i = 0; i < uiNumSupportedFormats; i++) {  
           _write_log(log_ofs, "%s  %-6u%-16s%-22s\n", my_name, (i + 1), 
               oclImageFormatString(ImageFormats[i].image_channel_order), 
               oclImageFormatString(ImageFormats[i].image_channel_data_type));
        }
        _write_log(log_ofs, "%s\n", my_name); 
        free(ImageFormats); ImageFormats = NULL;

        // 3D
        clGetSupportedImageFormats(cxGPUContext, CL_MEM_READ_ONLY, CL_MEM_OBJECT_IMAGE3D, 0, NULL, &uiNumSupportedFormats);
        ImageFormats = (cl_image_format*)malloc(uiNumSupportedFormats*sizeof(cl_image_format));
        if(ImageFormats==NULL) {
          _write_log(log_ofs, "%s Error, could not alloc ImageFormats\n", my_name);
          exit(3);
        }
        clGetSupportedImageFormats(cxGPUContext, CL_MEM_READ_ONLY, CL_MEM_OBJECT_IMAGE3D, uiNumSupportedFormats, ImageFormats, NULL);
        _write_log(log_ofs, "%s  ---------------------------------\n", my_name);
        _write_log(log_ofs, "%s  3D Image Formats Supported (%u)\n", my_name, uiNumSupportedFormats); 
        _write_log(log_ofs, "%s  ---------------------------------\n", my_name);
        _write_log(log_ofs, "%s  %-6s%-16s%-22s\n\n", my_name, "#", "Channel Order", "Channel Type"); 
        for(unsigned int i = 0; i < uiNumSupportedFormats; i++) {  
          _write_log(log_ofs, "%s  %-6u%-16s%-22s\n", my_name, (i + 1),
              oclImageFormatString(ImageFormats[i].image_channel_order), 
              oclImageFormatString(ImageFormats[i].image_channel_data_type));
        }
        _write_log(log_ofs, "%s\n", my_name);
        free(ImageFormats); ImageFormats=NULL;
      }
    } else {
      _write_log(log_ofs, "%s Error %i in clGetDeviceIDs call !!!\n\n", my_name, ciErrNum);
      bPassed = 0;
    }
  }

  // finish
  _write_log(log_ofs, "%s %s\n\n", my_name, bPassed==1 ? "PASSED" : "FAILED"); 
  fflush(log_ofs);
  fclose(log_ofs);
  return(0);
}
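The image-format listing above follows the standard OpenCL count-then-fill pattern: call clGetSupportedImageFormats once to get the number of formats, allocate, then call it again to fill the array. A hedged stand-alone sketch of that pattern without the _write_log/oclImageFormatString helpers, assuming a valid context is passed in:

#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>

/* Hedged sketch: list the 2D image formats a context supports for
 * read-only images, using only core OpenCL calls. */
static void list_2d_read_formats(cl_context ctx)
{
    cl_uint num_formats = 0;
    clGetSupportedImageFormats(ctx, CL_MEM_READ_ONLY, CL_MEM_OBJECT_IMAGE2D,
                               0, NULL, &num_formats);

    cl_image_format *formats = (cl_image_format *)malloc(num_formats * sizeof(cl_image_format));
    if (formats == NULL || num_formats == 0) { free(formats); return; }

    clGetSupportedImageFormats(ctx, CL_MEM_READ_ONLY, CL_MEM_OBJECT_IMAGE2D,
                               num_formats, formats, NULL);
    for (cl_uint i = 0; i < num_formats; i++)
        printf("format %u: order=0x%x type=0x%x\n", i,
               formats[i].image_channel_order,
               formats[i].image_channel_data_type);
    free(formats);
}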
Example #3
int main(int argc, const char **argv)
{
    cl_platform_id cpPlatform;                      // OpenCL platform
    cl_uint nDevice;                                // OpenCL device count
    cl_device_id* cdDevices;                        // OpenCL device list
    cl_context cxGPUContext;                        // OpenCL context
    cl_command_queue cqCommandQueue[MAX_GPU_COUNT]; // OpenCL command queue(s)
    cl_int ciErrNum;

    shrSetLogFileName ("oclRadixSort.txt");
    shrLog("%s starting...\n\n", argv[0]);

    shrLog("clGetPlatformID...\n");
    ciErrNum = oclGetPlatformID(&cpPlatform);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);

    shrLog("clGetDeviceIDs...\n");
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &nDevice);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
    cdDevices = (cl_device_id *)malloc(nDevice * sizeof(cl_device_id) );
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, nDevice, cdDevices, NULL);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);

    shrLog("clCreateContext...\n");
    cxGPUContext = clCreateContext(0, nDevice, cdDevices, NULL, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);

    shrLog("Create command queue...\n\n");
    int id_device;
    if(shrGetCmdLineArgumenti(argc, argv, "device", &id_device)) // Set up command queue(s) for GPU specified on the command line
    {
        // get & log device index # and name
        cl_device_id cdDevice = cdDevices[id_device];

        // create a command queue
        cqCommandQueue[0] = clCreateCommandQueue(cxGPUContext, cdDevice, 0, &ciErrNum);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
        oclPrintDevInfo(LOGBOTH, cdDevice);
        nDevice = 1;
    }
    else
    {
        // create command queues for all available devices
        for (cl_uint i = 0; i < nDevice; i++)
        {
            cqCommandQueue[i] = clCreateCommandQueue(cxGPUContext, cdDevices[i], 0, &ciErrNum);
            oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
        }
        for (cl_uint i = 0; i < nDevice; i++) oclPrintDevInfo(LOGBOTH, cdDevices[i]);
    }

    int ctaSize;
    if (!shrGetCmdLineArgumenti(argc, argv, "work-group-size", &ctaSize))
    {
        ctaSize = 128;
    }

    shrLog("Running Radix Sort on %d GPU(s) ...\n\n", nDevice);

    unsigned int numElements = 1048576;//128*128*128*2;

    // Alloc and init some data on the host, then alloc and init GPU buffer
    unsigned int **h_keys       = (unsigned int**)malloc(nDevice * sizeof(unsigned int*));
    unsigned int **h_keysSorted = (unsigned int**)malloc(nDevice * sizeof(unsigned int*));
    cl_mem       *d_keys        = (cl_mem*       )malloc(nDevice * sizeof(cl_mem));
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        h_keys[iDevice]       = (unsigned int*)malloc(numElements * sizeof(unsigned int));
        h_keysSorted[iDevice] = (unsigned int*)malloc(numElements * sizeof(unsigned int));
        makeRandomUintVector(h_keys[iDevice], numElements, keybits);

        d_keys[iDevice] = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE,
                sizeof(unsigned int) * numElements, NULL, &ciErrNum);
        ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[iDevice], d_keys[iDevice], CL_TRUE, 0,
                sizeof(unsigned int) * numElements, h_keys[iDevice], 0, NULL, NULL);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
    }

    // instantiate RadixSort objects
    RadixSort **radixSort = (RadixSort**)malloc(nDevice * sizeof(RadixSort*));
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        radixSort[iDevice] = new RadixSort(cxGPUContext, cqCommandQueue[iDevice], numElements, argv[0], ctaSize, true);
    }

#ifdef GPU_PROFILING
    int numIterations = 30;
    for (int i = -1; i < numIterations; i++)
    {
        if (i == 0)
        {
            for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
            {
                clFinish(cqCommandQueue[iDevice]);
            }
            shrDeltaT(1);
        }
#endif
        for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
        {
            radixSort[iDevice]->sort(d_keys[iDevice], 0, numElements, keybits);
        }
#ifdef GPU_PROFILING
    }
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        clFinish(cqCommandQueue[iDevice]);
    }
    double gpuTime = shrDeltaT(1)/(double)numIterations;
    shrLogEx(LOGBOTH | MASTER, 0, "oclRadixSort, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u elements, NumDevsUsed = %d, Workgroup = %d\n",
            (1.0e-6 * (double)(nDevice * numElements)/gpuTime), gpuTime, nDevice * numElements, nDevice, ctaSize);
#endif

    // copy sorted keys to CPU
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        clEnqueueReadBuffer(cqCommandQueue[iDevice], d_keys[iDevice], CL_TRUE, 0, sizeof(unsigned int) * numElements,
                h_keysSorted[iDevice], 0, NULL, NULL);
    }

    // Check results
    bool passed = true;
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        passed &= verifySortUint(h_keysSorted[iDevice], NULL, h_keys[iDevice], numElements);
    }
    shrLog("\n%s\n\n", passed ? "PASSED" : "FAILED");

    // cleanup allocs
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        clReleaseMemObject(d_keys[iDevice]);
        free(h_keys[iDevice]);
        free(h_keysSorted[iDevice]);
        delete radixSort[iDevice];
    }
    free(radixSort);
    free(h_keys);
    free(h_keysSorted);

    // remaining cleanup and exit
    free(cdDevices);
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        clReleaseCommandQueue(cqCommandQueue[iDevice]);
    }
    clReleaseContext(cxGPUContext);
    shrEXIT(argc, argv);
}
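The GPU_PROFILING block above uses a common SDK timing idiom: iterate from -1 so the first pass is an untimed warm-up, start the clock only at i == 0 after a clFinish, and average over the timed iterations. A hedged, shrUtils-free sketch of that idiom; run_fn is a hypothetical callback that enqueues one iteration of work.

#include <time.h>
#include <CL/cl.h>

/* Hedged sketch of the warm-up + averaged-timing idiom used above,
 * with a POSIX monotonic clock instead of shrDeltaT. */
static double time_kernel(cl_command_queue queue, void (*run_fn)(void), int numIterations)
{
    struct timespec t0, t1;
    for (int i = -1; i < numIterations; i++) {
        if (i == 0) {                 /* i == -1 was an untimed warm-up pass */
            clFinish(queue);          /* make sure the warm-up has finished  */
            clock_gettime(CLOCK_MONOTONIC, &t0);
        }
        run_fn();                     /* enqueue one iteration of real work  */
    }
    clFinish(queue);                  /* wait for the timed iterations       */
    clock_gettime(CLOCK_MONOTONIC, &t1);
    double seconds = (t1.tv_sec - t0.tv_sec) + 1e-9 * (t1.tv_nsec - t0.tv_nsec);
    return seconds / numIterations;   /* average seconds per iteration       */
}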
// main function
//*****************************************************************************
int main(int argc, const char **argv)
{
    cl_platform_id cpPlatform;      // OpenCL platform
    cl_uint nDevice;                // OpenCL device count
    cl_device_id* cdDevices;        // OpenCL device list    
    cl_context cxGPUContext;        // OpenCL context
    cl_command_queue cqCommandQue[MAX_GPU_COUNT];             // OpenCL command queue(s)
    cl_int ciErrNum = 1;            // Error code var

    shrQAStart(argc, (char **)argv);

    // Get the NVIDIA platform
    ciErrNum = oclGetPlatformID(&cpPlatform);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
    shrLog("clGetPlatformID...\n"); 

    //Get all the devices
    cl_uint uiNumDevices = 0;           // Number of devices available
    cl_uint uiTargetDevice = 0;	        // Default Device to compute on
    cl_uint uiNumComputeUnits;          // Number of compute units (SM's on NV GPU)
    shrLog("Get the Device info and select Device...\n");
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumDevices);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
    cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);

    // Get command line device options and config accordingly
    shrLog("  # of Devices Available = %u\n", uiNumDevices); 
    if(shrGetCmdLineArgumentu(argc, (const char**)argv, "device", &uiTargetDevice)== shrTRUE) 
    {
        uiTargetDevice = CLAMP(uiTargetDevice, 0, (uiNumDevices - 1));
    }
    shrLog("  Using Device %u: ", uiTargetDevice); 
    oclPrintDevName(LOGBOTH, cdDevices[uiTargetDevice]);
    ciErrNum = clGetDeviceInfo(cdDevices[uiTargetDevice], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(uiNumComputeUnits), &uiNumComputeUnits, NULL);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
    shrLog("\n  # of Compute Units = %u\n", uiNumComputeUnits); 
	
    shrSetLogFileName ("oclHiddenMarkovModel.txt");
    shrLog("%s Starting...\n\n", argv[0]); 

    shrLog("Get platform...\n");
    ciErrNum = oclGetPlatformID(&cpPlatform);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);

    shrLog("Get devices...\n");
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &nDevice);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
    cdDevices = (cl_device_id *)malloc(nDevice * sizeof(cl_device_id) );
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, nDevice, cdDevices, NULL);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);

    shrLog("clCreateContext\n");
    cxGPUContext = clCreateContext(0, nDevice, cdDevices, NULL, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);

    shrLog("clCreateCommandQueue\n"); 
    int id_device;
    if(shrGetCmdLineArgumenti(argc, argv, "device", &id_device)) // Set up command queue(s) for GPU specified on the command line
    {
        // create a command queue
        cqCommandQue[0] = clCreateCommandQueue(cxGPUContext, cdDevices[id_device], 0, &ciErrNum);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
        oclPrintDevInfo(LOGBOTH, cdDevices[id_device]);
        nDevice = 1;   
    } 
    else 
    { // create command queues for all available devices        
        for (cl_uint i = 0; i < nDevice; i++) 
        {
            cqCommandQue[i] = clCreateCommandQueue(cxGPUContext, cdDevices[i], CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
            oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
        }
        for (cl_uint i = 0; i < nDevice; i++) oclPrintDevInfo(LOGBOTH, cdDevices[i]);
    }

    shrLog("\nUsing %d GPU(s)...\n\n", nDevice);
	int wgSize;
	if (!shrGetCmdLineArgumenti(argc, argv, "work-group-size", &wgSize)) 
	{
		wgSize = 256;
	}

    shrLog("Init Hidden Markov Model parameters\n");
    int nState = 256*16; // number of states, must be a multiple of 256
    int nEmit  = 128; // number of possible observations
    
    float *initProb = (float*)malloc(sizeof(float)*nState); // initial probability
    float *mtState  = (float*)malloc(sizeof(float)*nState*nState); // state transition matrix
    float *mtEmit   = (float*)malloc(sizeof(float)*nEmit*nState); // emission matrix
    initHMM(initProb, mtState, mtEmit, nState, nEmit);

    // define observational sequence
    int nObs = 100; // size of observational sequence
    int **obs = (int**)malloc(nDevice*sizeof(int*));
    int **viterbiPathCPU = (int**)malloc(nDevice*sizeof(int*));
    int **viterbiPathGPU = (int**)malloc(nDevice*sizeof(int*));
    float *viterbiProbCPU = (float*)malloc(nDevice*sizeof(float)); 
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        obs[iDevice] = (int*)malloc(sizeof(int)*nObs);
        for (int i = 0; i < nObs; i++)
            obs[iDevice][i] = i % 15;
        viterbiPathCPU[iDevice] = (int*)malloc(sizeof(int)*nObs);
        viterbiPathGPU[iDevice] = (int*)malloc(sizeof(int)*nObs);
    }

    shrLog("# of states = %d\n# of possible observations = %d \nSize of observational sequence = %d\n\n",
        nState, nEmit, nObs);

    shrLog("Compute Viterbi path on GPU\n\n");

    HMM **oHmm = (HMM**)malloc(nDevice*sizeof(HMM*));
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        oHmm[iDevice] = new HMM(cxGPUContext, cqCommandQue[iDevice], initProb, mtState, mtEmit, nState, nEmit, nObs, argv[0], wgSize);
    }

    cl_mem *vProb = (cl_mem*)malloc(sizeof(cl_mem)*nDevice);
    cl_mem *vPath = (cl_mem*)malloc(sizeof(cl_mem)*nDevice);
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        vProb[iDevice] = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, sizeof(float), NULL, &ciErrNum);
        vPath[iDevice] = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, sizeof(int)*nObs, NULL, &ciErrNum);
    }

#ifdef GPU_PROFILING
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        clFinish(cqCommandQue[iDevice]);
    }
	shrDeltaT(1);
#endif

    size_t szWorkGroup;
	for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
	{
		szWorkGroup = oHmm[iDevice]->ViterbiSearch(vProb[iDevice], vPath[iDevice], obs[iDevice]);
	}

	for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
	{
	  clFinish(cqCommandQue[iDevice]);
	}

#ifdef GPU_PROFILING
    double dElapsedTime = shrDeltaT(1);
    shrLogEx(LOGBOTH | MASTER, 0, "oclHiddenMarkovModel, Throughput = %.4f GB/s, Time = %.5f s, Size = %u items, NumDevsUsed = %u, Workgroup = %u\n",
        (1.0e-9 * 2.0 * sizeof(float) * nDevice * nState * nState * (nObs-1))/dElapsedTime, dElapsedTime, (nDevice * nState * nObs), nDevice, szWorkGroup); 

#endif

    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        ciErrNum = clEnqueueReadBuffer(cqCommandQue[iDevice], vPath[iDevice], CL_TRUE, 0, sizeof(int)*nObs, viterbiPathGPU[iDevice], 0, NULL, NULL);
    }

    shrLog("\nCompute Viterbi path on CPU\n");
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        ciErrNum = ViterbiCPU(viterbiProbCPU[iDevice], viterbiPathCPU[iDevice], obs[iDevice], nObs, initProb, mtState, nState, mtEmit);
    }
    
    if (!ciErrNum)
    {
        shrEXIT(argc, argv);
    }

    bool pass = true;
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        for (int i = 0; i < nObs; i++)
        {
            if (viterbiPathCPU[iDevice][i] != viterbiPathGPU[iDevice][i]) 
            {
                pass = false;
                break;
            }
        }
    }
        
    // NOTE:  Most properly this should be done at any of the exit points above, but it is omitted elsewhere for clarity.
    shrLog("Release CPU buffers and OpenCL objects...\n"); 
    free(initProb);
    free(mtState);
    free(mtEmit);
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        free(obs[iDevice]);
        free(viterbiPathCPU[iDevice]);
        free(viterbiPathGPU[iDevice]);
        delete oHmm[iDevice];
        clReleaseCommandQueue(cqCommandQue[iDevice]);
    }
    free(obs);
    free(viterbiPathCPU);
    free(viterbiPathGPU);
    free(viterbiProbCPU);
    free(cdDevices);
    free(oHmm);
    clReleaseContext(cxGPUContext);

    // finish
    shrQAFinishExit(argc, (const char **)argv, pass ? QA_PASSED : QA_FAILED);

    shrEXIT(argc, argv);
}
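ViterbiCPU itself is not shown in this listing. A hedged sketch of the textbook Viterbi recurrence such a CPU reference would implement, under the assumption (not confirmed by the sample) that the matrices are row-major with mtState[i*nState + j] = P(state i | previous state j) and mtEmit[o*nState + i] = P(observation o | state i); backtracking of the full path is omitted for brevity.

#include <stdlib.h>

/* Hedged sketch of a textbook Viterbi search (not the SDK's ViterbiCPU).
 * Returns the probability of the best path and the most likely final state. */
float viterbi_prob(const int *obs, int nObs,
                   const float *initProb, const float *mtState,
                   const float *mtEmit, int nState, int *lastState)
{
    float *prev = (float *)malloc(nState * sizeof(float));
    float *cur  = (float *)malloc(nState * sizeof(float));

    for (int i = 0; i < nState; i++)                      /* t = 0 */
        prev[i] = initProb[i] * mtEmit[obs[0] * nState + i];

    for (int t = 1; t < nObs; t++) {                      /* t = 1 .. nObs-1 */
        for (int i = 0; i < nState; i++) {
            float best = 0.0f;
            for (int j = 0; j < nState; j++) {
                float p = prev[j] * mtState[i * nState + j];
                if (p > best) best = p;
            }
            cur[i] = best * mtEmit[obs[t] * nState + i];
        }
        float *tmp = prev; prev = cur; cur = tmp;         /* swap buffers */
    }

    float best = 0.0f; *lastState = 0;
    for (int i = 0; i < nState; i++)
        if (prev[i] > best) { best = prev[i]; *lastState = i; }

    free(prev); free(cur);
    return best;
}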
Example #5
int InitOpenCLContext() 
{
	// start logs
	shrSetLogFileName ("oclVolumeRender.txt");

	// get command line arg for quick test, if provided
	// process command line arguments

	// First initialize OpenGL context, so we can properly setup the OpenGL / OpenCL interop.

// 	glewInit();
// 	GLboolean bGLEW = glewIsSupported("GL_VERSION_2_0 GL_ARB_pixel_buffer_object"); 
// 	oclCheckErrorEX(bGLEW, shrTRUE, pCleanup);
	g_glInterop = true;


	// Create OpenCL context, get device info, select device, select options for image/texture and CL-GL interop
	createCLContext();

	// Print device info
	clGetDeviceInfo(cdDevices[uiDeviceUsed], CL_DEVICE_IMAGE_SUPPORT, sizeof(g_bImageSupport), &g_bImageSupport, NULL);
	//shrLog("%s...\n\n", g_bImageSupport ? "Using Image (Texture)" : "No Image (Texuture) Support");      
//	shrLog("Detailed Device info:\n\n");
	oclPrintDevInfo(LOGBOTH, cdDevices[uiDeviceUsed]);

	// create a command-queue
	cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevices[uiDeviceUsed], 0, &ciErrNum);
	oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

	// Program Setup
	size_t program_length;
	cPathAndName = shrFindFilePath("Transform.cl", ".");
	oclCheckErrorEX(cPathAndName != NULL, shrTRUE, pCleanup);
	cSourceCL = oclLoadProgSource(cPathAndName, "", &program_length);
	oclCheckErrorEX(cSourceCL != NULL, shrTRUE, pCleanup);

	// create the program
	cpProgram = clCreateProgramWithSource(cxGPUContext, 1,
		(const char **)&cSourceCL, &program_length, &ciErrNum);
	oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

	// build the program
	std::string buildOpts = "-cl-single-precision-constant";
//	buildOpts += g_bImageSupport ? " -DIMAGE_SUPPORT" : "";
//	ciErrNum = clBuildProgram(cpProgram, 1, &cdDevices[uiDeviceUsed],"-cl-fast-relaxed-math", NULL, NULL);
	ciErrNum = clBuildProgram(cpProgram, 1, &cdDevices[uiDeviceUsed],NULL, NULL, NULL);
	if (ciErrNum != CL_SUCCESS)
	{
		// write out standard error, Build Log and PTX, then cleanup and return error
		shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
		oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
		oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclVolumeRender.ptx");
		Cleanup(EXIT_FAILURE); 
	}

	// create the kernel
	ScalseKernel = clCreateKernel(cpProgram, "d_render", &ciErrNum);
	oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

	TransformKernel = clCreateKernel(cpProgram, "angle", &ciErrNum);
	oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

	LongToShortKernel = clCreateKernel(cpProgram, "transfer", &ciErrNum);
	oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
	return TRUE;
}
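oclLogBuildInfo above is the SDK's wrapper for retrieving the compiler output after a failed clBuildProgram. A hedged stand-alone equivalent using only clGetProgramBuildInfo; the function name and arguments here are illustrative, not the SDK's API.

#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>

/* Hedged sketch: fetch and print the build log for one device after a
 * failed clBuildProgram call. */
static void dump_build_log(cl_program prog, cl_device_id dev)
{
    size_t log_size = 0;
    clGetProgramBuildInfo(prog, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);

    char *log = (char *)malloc(log_size + 1);
    if (log == NULL) return;

    clGetProgramBuildInfo(prog, dev, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
    log[log_size] = '\0';
    fprintf(stderr, "Build log:\n%s\n", log);
    free(log);
}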
// main function
//*****************************************************************************
int main(int argc, const char **argv)
{
    cl_context cxGPUContext;                   // OpenCL context
    cl_command_queue cqCommandQue[MAX_GPU_COUNT];             // OpenCL command queue(s)
    cl_device_id* cdDevices;                   // OpenCL device list    
    cl_int err = 1;                     // Error code var

    shrSetLogFileName ("oclHiddenMarkovModel.txt");
    shrLog(LOGBOTH, 0, "%s Starting...\n\n", argv[0]); 

    shrLog(LOGBOTH, 0, "Create context\n");
    cxGPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, &err);
    shrCheckErrorEX(err, CL_SUCCESS, NULL);

    shrLog(LOGBOTH, 0, "Get device info...\n");
    int nDevice = 0;
    size_t nDeviceBytes;
    err |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &nDeviceBytes);
    cdDevices = (cl_device_id*)malloc(nDeviceBytes);
    err |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, nDeviceBytes, cdDevices, NULL);
    nDevice = (int)(nDeviceBytes/sizeof(cl_device_id));
    
    shrLog(LOGBOTH, 0, "clCreateCommandQueue\n"); 
    int id_device;
    if(shrGetCmdLineArgumenti(argc, argv, "device", &id_device)) // Set up command queue(s) for GPU specified on the command line
    {
        // get & log device index # and name
        cl_device_id cdDevice = cdDevices[id_device];

        // create a command queue
        cqCommandQue[0] = clCreateCommandQueue(cxGPUContext, cdDevice, 0, &err);
        shrCheckErrorEX(err, CL_SUCCESS, NULL);
        oclPrintDevInfo(LOGBOTH, cdDevice);
        nDevice = 1;   
    } 
    else 
    { // create command queues for all available devices        
        for (int i = 0; i < nDevice; i++) 
        {
            cqCommandQue[i] = clCreateCommandQueue(cxGPUContext, cdDevices[i], 0, &err);
            shrCheckErrorEX(err, CL_SUCCESS, NULL);
        }
        for (int i = 0; i < nDevice; i++) oclPrintDevInfo(LOGBOTH, cdDevices[i]);
    }

    shrLog(LOGBOTH, 0, "\nUsing %d GPU(s)...\n\n", nDevice);
	int wgSize;
	if (!shrGetCmdLineArgumenti(argc, argv, "work-group-size", &wgSize)) 
	{
		wgSize = 256;
	}


    shrLog(LOGBOTH, 0, "Init Hidden Markov Model parameters\n");
    int nState = 256*16; // number of states
    int nEmit  = 128; // number of possible observations
    
    float *initProb = (float*)malloc(sizeof(float)*nState); // initial probability
    float *mtState  = (float*)malloc(sizeof(float)*nState*nState); // state transition matrix
    float *mtEmit   = (float*)malloc(sizeof(float)*nEmit*nState); // emission matrix
    initHMM(initProb, mtState, mtEmit, nState, nEmit);

    // define observational sequence
    int nObs = 500; // size of observational sequence
    int **obs = (int**)malloc(nDevice*sizeof(int*));
    int **viterbiPathCPU = (int**)malloc(nDevice*sizeof(int*));
    int **viterbiPathGPU = (int**)malloc(nDevice*sizeof(int*));
    float *viterbiProbCPU = (float*)malloc(nDevice*sizeof(float)); 
    for (int iDevice = 0; iDevice < nDevice; iDevice++)
    {
        obs[iDevice] = (int*)malloc(sizeof(int)*nObs);
        for (int i = 0; i < nObs; i++)
            obs[iDevice][i] = i % 15;
        viterbiPathCPU[iDevice] = (int*)malloc(sizeof(int)*nObs);
        viterbiPathGPU[iDevice] = (int*)malloc(sizeof(int)*nObs);
    }

    shrLog(LOGBOTH, 0, "# of states = %d\n# of possible observations = %d \nSize of observational sequence = %d\n\n",
        nState, nEmit, nObs);


    shrLog(LOGBOTH, 0, "Compute Viterbi path on GPU\n\n");

    HMM **oHmm = (HMM**)malloc(nDevice*sizeof(HMM*));
    for (int iDevice = 0; iDevice < nDevice; iDevice++)
    {
        oHmm[iDevice] = new HMM(cxGPUContext, cqCommandQue[iDevice], initProb, mtState, mtEmit, nState, nEmit, nObs, argv[0], wgSize);
    }

    cl_mem *vProb = (cl_mem*)malloc(sizeof(cl_mem)*nDevice);
    cl_mem *vPath = (cl_mem*)malloc(sizeof(cl_mem)*nDevice);
    for (int iDevice = 0; iDevice < nDevice; iDevice++)
    {
        vProb[iDevice] = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, sizeof(float), NULL, &err);
        vPath[iDevice] = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, sizeof(int)*nObs, NULL, &err);
    }

#ifdef GPU_PROFILING
    for (int iDevice = 0; iDevice < nDevice; iDevice++)
    {
        clFinish(cqCommandQue[iDevice]);
    }
	shrDeltaT(1);
#endif

    size_t szWorkGroup;
	for (int iDevice = 0; iDevice < nDevice; iDevice++)
	{
		szWorkGroup = oHmm[iDevice]->ViterbiSearch(vProb[iDevice], vPath[iDevice], obs[iDevice]);
	}

	for (int iDevice = 0; iDevice < nDevice; iDevice++)
	{
	  clFinish(cqCommandQue[iDevice]);
	}

#ifdef GPU_PROFILING
    double dElapsedTime = shrDeltaT(1);
    shrLog(LOGBOTH | MASTER, 0, "oclHiddenMarkovModel, Throughput = %.4f, Time = %.5f, Size = %u, NumDevsUsed = %u, Workgroup = %u\n",
        (1.0e-6 * nDevice * nState * nObs)/dElapsedTime, dElapsedTime, (nDevice * nState * nObs), nDevice, szWorkGroup); 
#endif

    for (int iDevice = 0; iDevice < nDevice; iDevice++)
    {
        err = clEnqueueReadBuffer(cqCommandQue[iDevice], vPath[iDevice], CL_TRUE, 0, sizeof(int)*nObs, viterbiPathGPU[iDevice], 0, NULL, NULL);
    }

    shrLog(LOGBOTH, 0, "\nCompute Viterbi path on CPU\n");
    for (int iDevice = 0; iDevice < nDevice; iDevice++)
    {
        err = ViterbiCPU(viterbiProbCPU[iDevice], viterbiPathCPU[iDevice], obs[iDevice], nObs, initProb, mtState, nState, mtEmit);
    }
    
    if (!err)
    {
        shrEXIT(argc, argv);
    }

    bool pass = true;
    for (int iDevice = 0; iDevice < nDevice; iDevice++)
    {
        for (int i = 0; i < nObs; i++)
        {
            if (viterbiPathCPU[iDevice][i] != viterbiPathGPU[iDevice][i]) 
            {
                pass = false;
                break;
            }
        }
    }
    shrLog(LOGBOTH, 0, "\nTEST %s\n\n", (pass) ? "PASSED" : "FAILED !!!");

        
    // NOTE:  Most properly this should be done at any of the exit points above, but it is omitted elsewhere for clarity.
    shrLog(LOGBOTH, 0, "Release CPU buffers and OpenCL objects...\n"); 
    free(initProb);
    free(mtState);
    free(mtEmit);
    for (int iDevice = 0; iDevice < nDevice; iDevice++)
    {
        free(obs[iDevice]);
        free(viterbiPathCPU[iDevice]);
        free(viterbiPathGPU[iDevice]);
        delete oHmm[iDevice];
        clReleaseCommandQueue(cqCommandQue[iDevice]);
    }
    free(obs);
    free(viterbiPathCPU);
    free(viterbiPathGPU);
    free(viterbiProbCPU);
    free(cdDevices);
    free(oHmm);
    clReleaseContext(cxGPUContext);

    shrEXIT(argc, argv);
}
// Main function
// *********************************************************************
int main(const int argc, const char** argv) 
{
    // start logs
    shrSetLogFileName ("oclDXTCompression.txt");
    shrLog(LOGBOTH, 0, "%s Starting...\n\n", argv[0]); 

    cl_context cxGPUContext;
    cl_command_queue cqCommandQueue;
    cl_program cpProgram;
    cl_kernel ckKernel;
    cl_mem cmMemObjs[3];
    size_t szGlobalWorkSize[1];
    size_t szLocalWorkSize[1];
    cl_int ciErrNum;

    // Get the path of the filename
    char *filename;
    if (shrGetCmdLineArgumentstr(argc, argv, "image", &filename)) {
        image_filename = filename;
    }
    // load image
    const char* image_path = shrFindFilePath(image_filename, argv[0]);
    shrCheckError(image_path != NULL, shrTRUE);
    shrLoadPPM4ub(image_path, (unsigned char **)&h_img, &width, &height);
    shrCheckError(h_img != NULL, shrTRUE);
    shrLog(LOGBOTH, 0, "Loaded '%s', %d x %d pixels\n", image_path, width, height);

    // Allocate a buffer for the block-linear copy of the image.
    uint * block_image = (uint *) malloc(width * height * 4);

    // Convert linear image to block linear.
    for(uint by = 0; by < height/4; by++) {
        for(uint bx = 0; bx < width/4; bx++) {
            for (int i = 0; i < 16; i++) {
                const int x = i & 3;
                const int y = i / 4;
                block_image[(by * width/4 + bx) * 16 + i] = 
                    ((uint *)h_img)[(by * 4 + y) * 4 * (width/4) + bx * 4 + x];
            }
        }
    }

    // create the OpenCL context on a GPU device
    cxGPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, &ciErrNum);
    shrCheckError(ciErrNum, CL_SUCCESS);

    // get and log device
    cl_device_id device;
    if( shrCheckCmdLineFlag(argc, argv, "device") ) {
      int device_nr = 0;
      shrGetCmdLineArgumenti(argc, argv, "device", &device_nr);
      device = oclGetDev(cxGPUContext, device_nr);
    } else {
      device = oclGetMaxFlopsDev(cxGPUContext);
    }
    oclPrintDevInfo(LOGBOTH, device);

    // create a command-queue
    cqCommandQueue = clCreateCommandQueue(cxGPUContext, device, 0, &ciErrNum);
    shrCheckError(ciErrNum, CL_SUCCESS);

    // Memory Setup

    // Compute permutations.
    cl_uint permutations[1024];
    computePermutations(permutations);

    // Upload permutations.
    cmMemObjs[0] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                  sizeof(cl_uint) * 1024, permutations, &ciErrNum);
    shrCheckError(ciErrNum, CL_SUCCESS);

    // Image
    cmMemObjs[1] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY ,
                                  sizeof(cl_uint) * width * height, NULL, &ciErrNum);
    shrCheckError(ciErrNum, CL_SUCCESS);
    
    // Result
    const uint compressedSize = (width / 4) * (height / 4) * 8;

    cmMemObjs[2] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY,
                                  compressedSize, NULL , &ciErrNum);
    shrCheckError(ciErrNum, CL_SUCCESS);
    
    unsigned int * h_result = (uint *)malloc(compressedSize);

    // Program Setup
    size_t program_length;
    const char* source_path = shrFindFilePath("DXTCompression.cl", argv[0]);
    shrCheckError(source_path != NULL, shrTRUE);
    char *source = oclLoadProgSource(source_path, "", &program_length);
    shrCheckError(source != NULL, shrTRUE);

    // create the program
    cpProgram = clCreateProgramWithSource(cxGPUContext, 1,
        (const char **) &source, &program_length, &ciErrNum);
    shrCheckError(ciErrNum, CL_SUCCESS);

    // build the program
    ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-mad-enable", NULL, NULL);
    if (ciErrNum != CL_SUCCESS)
    {
        // write out standard error, Build Log and PTX, then cleanup and exit
        shrLog(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
        oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
        oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclDXTCompression.ptx");
        shrCheckError(ciErrNum, CL_SUCCESS); 
    }

    // create the kernel
    ckKernel = clCreateKernel(cpProgram, "compress", &ciErrNum);
    shrCheckError(ciErrNum, CL_SUCCESS);

    // set the args values
    ciErrNum  = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void *) &cmMemObjs[0]);
    ciErrNum |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void *) &cmMemObjs[1]);
    ciErrNum |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void *) &cmMemObjs[2]);
    ciErrNum |= clSetKernelArg(ckKernel, 3, sizeof(float) * 4 * 16, NULL);
    ciErrNum |= clSetKernelArg(ckKernel, 4, sizeof(float) * 4 * 16, NULL);
    ciErrNum |= clSetKernelArg(ckKernel, 5, sizeof(int) * 64, NULL);
    ciErrNum |= clSetKernelArg(ckKernel, 6, sizeof(float) * 16 * 6, NULL);
    ciErrNum |= clSetKernelArg(ckKernel, 7, sizeof(unsigned int) * 160, NULL);
    ciErrNum |= clSetKernelArg(ckKernel, 8, sizeof(int) * 16, NULL);
    shrCheckError(ciErrNum, CL_SUCCESS);

    shrLog(LOGBOTH, 0, "Running DXT Compression on %u x %u image...\n\n", width, height);

    // Upload the image
    clEnqueueWriteBuffer(cqCommandQueue, cmMemObjs[1], CL_FALSE, 0, sizeof(cl_uint) * width * height, block_image, 0, NULL, NULL);

    // set work-item dimensions
    szGlobalWorkSize[0] = width * height * (NUM_THREADS/16);
    szLocalWorkSize[0]= NUM_THREADS;
    
#ifdef GPU_PROFILING
    int numIterations = 100;
    for (int i = -1; i < numIterations; ++i) {
        if (i == 0) { // start timing only after the first warmup iteration
            clFinish(cqCommandQueue); // flush command queue
            shrDeltaT(0); // start timer
        }
#endif
        // execute kernel
        ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL,
                                          szGlobalWorkSize, szLocalWorkSize, 
                                          0, NULL, NULL);
        shrCheckError(ciErrNum, CL_SUCCESS);
#ifdef GPU_PROFILING
    }
    clFinish(cqCommandQueue);
    double dAvgTime = shrDeltaT(0) / (double)numIterations;
    shrLog(LOGBOTH | MASTER, 0, "oclDXTCompression, Throughput = %.4f, Time = %.5f, Size = %u, NumDevsUsed = %i\n", 
        (1.0e-6 * (double)(width * height)/ dAvgTime), dAvgTime, (width * height), 1); 

#endif

    // blocking read output
    ciErrNum = clEnqueueReadBuffer(cqCommandQueue, cmMemObjs[2], CL_TRUE, 0,
                                   compressedSize, h_result, 0, NULL, NULL);
    shrCheckError(ciErrNum, CL_SUCCESS);

    // Write DDS file.
    FILE* fp = NULL;
    char output_filename[1024];
    #ifdef WIN32
        strcpy_s(output_filename, 1024, image_path);
        strcpy_s(output_filename + strlen(image_path) - 3, 1024 - strlen(image_path) + 3, "dds");
        fopen_s(&fp, output_filename, "wb");
    #else
        strcpy(output_filename, image_path);
        strcpy(output_filename + strlen(image_path) - 3, "dds");
        fp = fopen(output_filename, "wb");
    #endif
    shrCheckError(fp != NULL, shrTRUE);

    DDSHeader header;
    header.fourcc = FOURCC_DDS;
    header.size = 124;
    header.flags  = (DDSD_WIDTH|DDSD_HEIGHT|DDSD_CAPS|DDSD_PIXELFORMAT|DDSD_LINEARSIZE);
    header.height = height;
    header.width = width;
    header.pitch = compressedSize;
    header.depth = 0;
    header.mipmapcount = 0;
    memset(header.reserved, 0, sizeof(header.reserved));
    header.pf.size = 32;
    header.pf.flags = DDPF_FOURCC;
    header.pf.fourcc = FOURCC_DXT1;
    header.pf.bitcount = 0;
    header.pf.rmask = 0;
    header.pf.gmask = 0;
    header.pf.bmask = 0;
    header.pf.amask = 0;
    header.caps.caps1 = DDSCAPS_TEXTURE;
    header.caps.caps2 = 0;
    header.caps.caps3 = 0;
    header.caps.caps4 = 0;
    header.notused = 0;

    fwrite(&header, sizeof(DDSHeader), 1, fp);
    fwrite(h_result, compressedSize, 1, fp);

    fclose(fp);

    // Make sure the generated image matches the reference image (regression check)
    shrLog(LOGBOTH, 0, "\nComparing against Host/C++ computation...\n");     
    const char* reference_image_path = shrFindFilePath(refimage_filename, argv[0]);
    shrCheckError(reference_image_path != NULL, shrTRUE);

    // read in the reference image from file
    #ifdef WIN32
        fopen_s(&fp, reference_image_path, "rb");
    #else
        fp = fopen(reference_image_path, "rb");
    #endif
    shrCheckError(fp != NULL, shrTRUE);
    fseek(fp, sizeof(DDSHeader), SEEK_SET);
    uint referenceSize = (width / 4) * (height / 4) * 8;
    uint * reference = (uint *)malloc(referenceSize);
    fread(reference, referenceSize, 1, fp);
    fclose(fp);

    // compare the reference image data to the sample/generated image
    float rms = 0;
    for (uint y = 0; y < height; y += 4)
    {
        for (uint x = 0; x < width; x += 4)
        {
            // binary comparison of data
            uint referenceBlockIdx = ((y/4) * (width/4) + (x/4));
            uint resultBlockIdx = ((y/4) * (width/4) + (x/4));
            int cmp = compareBlock(((BlockDXT1 *)h_result) + resultBlockIdx, ((BlockDXT1 *)reference) + referenceBlockIdx);

            // log deviations, if any
            if (cmp != 0)
            {
                compareBlock(((BlockDXT1 *)h_result) + resultBlockIdx, ((BlockDXT1 *)reference) + referenceBlockIdx);
                shrLog(LOGBOTH, 0, "Deviation at (%d, %d):\t%f rms\n", x/4, y/4, float(cmp)/16/3);
            }
            rms += cmp;
        }
    }
    rms /= width * height * 3;
    shrLog(LOGBOTH, 0, "RMS(reference, result) = %f\n\n", rms);
    shrLog(LOGBOTH, 0, "TEST %s\n\n", (rms <= ERROR_THRESHOLD) ? "PASSED" : "FAILED !!!");

    // Free OpenCL resources
    oclDeleteMemObjs(cmMemObjs, 3);
    clReleaseKernel(ckKernel);
    clReleaseProgram(cpProgram);
    clReleaseCommandQueue(cqCommandQueue);
    clReleaseContext(cxGPUContext);

    // Free host memory
    free(source);
    free(h_img);
    free(block_image);
    free(h_result);
    free(reference);

    // finish
    shrEXIT(argc, argv);
}