예제 #1
0
void ClSetup::SetupCl(int platform_index, int device_index)
{
    platform = GetPlatformID(platform_index);
    PrintPlatformInfo(platform);
    device = GetDeviceID(device_index);
    PrintDeviceInfo(device);
    context = CreateContext();
    queue = CreateCommandQueue();
    bInit = true;
}
예제 #2
0
int main(int argc, char** argv)
{
	glutInit(&argc, argv);
	glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGBA);
	glutCreateWindow("GL interop");
	glutIconifyWindow();
	glutDisplayFunc(computeVBO);
	glutIdleFunc(computeVBO);

	initVBO();

	CreateContext();
	if(!context)
	{
		std::cerr << "Failed to create OpenCL context." << std::endl;
		return EXIT_FAILURE;
	}

	CreateCommandQueue();
	if(!commandQueue)
	{
		Cleanup();
		return EXIT_FAILURE;
	}

	CreateProgram();
	if(!program)
	{
		Cleanup();
		return EXIT_FAILURE;
	}

	kernel = clCreateKernel(program, "init_vbo_kernel", NULL);
	if(!kernel)
	{
		std::cerr << "Failed to create kernel" << std::endl;
		Cleanup();
		return EXIT_FAILURE;
	}

	if (!CreateMemObjects())
	{
		Cleanup();
		return EXIT_FAILURE;
	}

	glutMainLoop();

	std::cout << std::endl;
	std::cout << "Executed program succesfully." << std::endl;
	Cleanup();
}
예제 #3
0
    inline void ID3D12DeviceEx::createCommandContext(CommandContext<T, N, L>* commandContext,
                                                     const bool isHighPriority,
                                                     const bool disableGpuTimeout) {
        static_assert(N > 0 && L > 0, "Invalid command context parameters.");
        assert(commandContext);
        // Fill out the command queue description.
        const D3D12_COMMAND_QUEUE_DESC queueDesc = {
            /* Type */     static_cast<D3D12_COMMAND_LIST_TYPE>(T),
            /* Priority */ isHighPriority ? D3D12_COMMAND_QUEUE_PRIORITY_HIGH
                                          : D3D12_COMMAND_QUEUE_PRIORITY_NORMAL,
            /* Flags */    disableGpuTimeout ? D3D12_COMMAND_QUEUE_FLAG_DISABLE_GPU_TIMEOUT
                                             : D3D12_COMMAND_QUEUE_FLAG_NONE,
            /* NodeMask */ nodeMask
        };
        // Create a command queue.
        CHECK_CALL(CreateCommandQueue(&queueDesc, IID_PPV_ARGS(&commandContext->m_commandQueue)),
                   "Failed to create a command queue.");
        // Create command allocators.
        for (size_t i = 0; i < N; ++i) {
            for (size_t j = 0; j < L; ++j) {
                CHECK_CALL(CreateCommandAllocator(static_cast<D3D12_COMMAND_LIST_TYPE>(T),
                           IID_PPV_ARGS(&commandContext->m_commandAllocators[i][j])),
                           "Failed to create a command list allocator.");
            }
        }
        // Set the initial frame allocator set index to 0.
        commandContext->m_frameAllocatorSet = 0;
        // Create command lists in the closed, NULL state using the initial allocator.
        for (size_t i = 0; i < L; ++i) {
            CHECK_CALL(CreateCommandList(nodeMask, static_cast<D3D12_COMMAND_LIST_TYPE>(T),
                                         commandContext->m_commandAllocators[0][i].Get(), nullptr,
                                         IID_PPV_ARGS(&commandContext->m_commandLists[i])),
                       "Failed to create a command list.");
            CHECK_CALL(commandContext->m_commandLists[i]->Close(),
                       "Failed to close the command list.");

        }
        // Create a 0-initialized fence object.
        commandContext->m_fenceValue = 0;
        CHECK_CALL(CreateFence(0, D3D12_FENCE_FLAG_NONE, IID_PPV_ARGS(&commandContext->m_fence)),
                   "Failed to create a fence object.");
        // Set the last fence value for each command allocator to 0.
        for (size_t i = 0; i < N; ++i) {
            commandContext->m_lastFenceValues[i] = 0;
        }
        // Create a synchronization event.
        commandContext->m_syncEvent = CreateEvent(nullptr, FALSE, FALSE, nullptr);
        if (!commandContext->m_syncEvent) {
            CHECK_CALL(HRESULT_FROM_WIN32(GetLastError()),
                       "Failed to create a synchronization event.");
        }
    }
예제 #4
0
int SetOpenCLParemeters(OpenCLParameters *opencl_paremeters,
                        const char *file_name,
                        const char *kernel_name)
{
    opencl_paremeters->context = CreateContext();
    if (opencl_paremeters->context == NULL)
    {
        return 0;
    }

    opencl_paremeters->command_queue = CreateCommandQueue(opencl_paremeters->context,
                                                          &(opencl_paremeters->device));
    if (opencl_paremeters->command_queue == NULL)
    {
        CleanOpenCLParemeters(opencl_paremeters);
        return 0;
    }

    opencl_paremeters->program = CreateProgram(opencl_paremeters->context,
                                               opencl_paremeters->device,
                                               file_name);
    if (opencl_paremeters->program == NULL)
    {
        CleanOpenCLParemeters(opencl_paremeters);
        return 0;
    }

    opencl_paremeters->kernel = clCreateKernel(opencl_paremeters->program,
                                               kernel_name,
                                               NULL);
    if (opencl_paremeters->kernel == NULL)
    {
        CleanOpenCLParemeters(opencl_paremeters);
        std::cerr << "Failed to create kernel!" << std::endl;
        return 0;
    }

    return 1;
}
예제 #5
0
///
//	main() for HelloWorld example
//
int main(int argc, char** argv)
{
    cl_context context = 0;
    cl_command_queue commandQueue = 0;
    cl_program program = 0;
    cl_device_id device = 0;
    cl_kernel kernels[2] = { 0, 0 };
    cl_mem memObjects[3] = { 0, 0, 0 };
    cl_int errNum;

    // Create an OpenCL context on first available platform
    context = CreateContext();
    if (context == NULL)
    {
        std::cerr << "Failed to create OpenCL context." << std::endl;
        return 1;
    }

    // Create a command-queue on the first device available
    // on the created context
    commandQueue = CreateCommandQueue(context, &device);
    if (commandQueue == NULL)
    {
        Cleanup(context, commandQueue, program, kernels, memObjects);
        return 1;
    }

    // Create OpenCL program from HelloWorld.cl kernel source
    program = CreateProgram(context, device, "simple.cl");
    if (program == NULL)
    {
        Cleanup(context, commandQueue, program, kernels, memObjects);
        return 1;
    }

    // Create OpenCL kernel
    //clCreateKernel(program, "hello_kernel", NULL);

    cl_uint numberOfKernels = 0;
    errNum = clCreateKernelsInProgram(program,
                                      0,
                                      NULL,
                                      &numberOfKernels
                                     );
    if (errNum != CL_SUCCESS)
    {
        std::cerr << "Failed to get number of kernels" << std::endl;
        Cleanup(context, commandQueue, program, kernels, memObjects);
        return 1;
    }
    else
    {
        std::cout << "numberOfKernels is:" << numberOfKernels << std::endl;
    }

    assert(numberOfKernels == 2 && "number of kernels was not as expected");
    errNum = clCreateKernelsInProgram(program,
                                      2,
                                      kernels,
                                      NULL
                                     );

    if (errNum != CL_SUCCESS)
    {
        std::cerr << "Failed to retrieve kernels" << std::endl;
        Cleanup(context, commandQueue, program, kernels, memObjects);
        return 1;
    }


    // Create memory objects that will be used as arguments to
    // kernels.  First create host memory arrays that will be
    // used to store the arguments to the kernel
    float result[ARRAY_SIZE];
    float a[ARRAY_SIZE];
    float b[ARRAY_SIZE];
    for (int i = 0; i < ARRAY_SIZE; i++)
    {
        a[i] = (float)i;
        b[i] = (float)(i * 2);
    }

    if (!CreateMemObjects(context, memObjects, a, b))
    {
        Cleanup(context, commandQueue, program, kernels, memObjects);
        return 1;
    }

    for (int i = 0; i < numberOfKernels; ++i)
    {
        // Set the kernel arguments (result, a, b)
        errNum = clSetKernelArg(kernels[i], 0, sizeof(cl_mem), &memObjects[0]);
        errNum |= clSetKernelArg(kernels[i], 1, sizeof(cl_mem), &memObjects[1]);
        errNum |= clSetKernelArg(kernels[i], 2, sizeof(cl_mem), &memObjects[2]);
        if (errNum != CL_SUCCESS)
        {
            std::cerr << "Error setting kernels[" << i << "] arguments." << std::endl;
            Cleanup(context, commandQueue, program, kernels, memObjects);
            return 1;
        }
    }

    size_t globalWorkSize[1] = { ARRAY_SIZE };
    size_t localWorkSize[1] = { 1 };


    cl_event waitFor = NULL;
    for (int i = 0; i < numberOfKernels; ++i)
    {
        cl_uint numToWaitFor = 0;
        cl_event waitList[1] = { 0 };
        cl_event* waitListP = 0;

        if (waitFor != NULL)
        {
            numToWaitFor = 1;
            waitList[0] = waitFor;
            waitListP = waitList;
        }

        // Queue the kernel up for execution across the array
        errNum = clEnqueueNDRangeKernel(commandQueue, kernels[i], 1, NULL,
                                        globalWorkSize, localWorkSize,
                                        numToWaitFor, waitListP, &waitFor);
        if (errNum != CL_SUCCESS)
        {
            std::cerr << "Error queuing kernel for execution." << std::endl;
            Cleanup(context, commandQueue, program, kernels, memObjects);
            return 1;
        }
    }

    // Read the output buffer back to the Host
    errNum = clEnqueueReadBuffer(commandQueue, memObjects[2], CL_TRUE,
                                 0, ARRAY_SIZE * sizeof(float), result,
                                 0, NULL, NULL);
    if (errNum != CL_SUCCESS)
    {
        std::cerr << "Error reading result buffer." << std::endl;
        Cleanup(context, commandQueue, program, kernels, memObjects);
        return 1;
    }

    // Output the result buffer
    for (int i = 0; i < ARRAY_SIZE; i++)
    {
        std::cout << result[i] << " ";
    }
    std::cout << std::endl;
    std::cout << "Executed program succesfully." << std::endl;
    Cleanup(context, commandQueue, program, kernels, memObjects);

    return 0;
}
예제 #6
0
int main(int argc, char** argv)
{
  cl_context       context      = 0;
  cl_command_queue commandQueue = 0;
  cl_program       program      = 0;
  cl_device_id     device       = 0;
  cl_kernel        kernel       = 0;
  cl_int           status;

  char filename[]   = "../../kernels/VectorUpdate_vec_kernel.cl";
  char filename2[] = "../../common/types_kernel.h";

  int profiling_info = 0;
  cl_event myEvent, myEvent2;

  if( argc != 4 )
    {
      printf("Usage: %s vector_file1 vector_file2 alpha\n", argv[0]);
      return EXIT_FAILURE;
    }

  char xfilename[50];
  char yfilename[50];
  real alpha;
  
  strcpy(xfilename, argv[1]);
  strcpy(yfilename, argv[2]);
  alpha = strtod(argv[3], NULL);

#ifdef PROFILE
  cl_ulong startTime, endTime, startTime2, endTime2;
  cl_ulong kernelExecTimeNs, readFromGpuTime;
  profiling_info = 1;
#endif

  /*  READING DATA FROM FILE  */
  
  real *x;
  real *y;
  real *ref_x;
  int N, M, N4;

  std::ifstream xfile;
  xfile.open (xfilename, std::ios::in);
  if (!xfile.is_open())
    {
      printf("Error: cannot open file\n");
      return EXIT_FAILURE;
    }
  
    xfile >> N;

    // it must be N%4 == 0                                                        
    N4 = ((N & (4-1)) == 0) ? N : N+(4-(N&3));

    HANDLE_ALLOC_ERROR(x = (real*)malloc(N4*sizeof(real)));
  
    for( int i = 0; i < N; i++)
 	xfile >> x[i];

    for(int i = N; i < N4; ++i)
      x[i] = 0;

    xfile.close();
    
    // needed for checking result 
    HANDLE_ALLOC_ERROR(ref_x = (real*)malloc(N*sizeof(real)));
    memcpy(ref_x, x, N*sizeof(real));


    std::ifstream yfile;
    yfile.open (yfilename, std::ios::in);
    if (!yfile.is_open())
      {
	printf("Error: cannot open file\n");
	return EXIT_FAILURE;
      }
    
    yfile >> M;
    assert(N==M);
    
    HANDLE_ALLOC_ERROR(y = (real*)malloc(N4*sizeof(real)));
    
    for( int i = 0; i < N; i++)
      yfile >> y[i];
    for(int i = N; i < N4; ++i)
      y[i] = 0;

    
    yfile.close();

    int Ndev4 = N4/4;
    



  TIME start = tic(); 

  TIME init = tic();
  
  // Create an OpenCL context
  context = CreateContext();
  if(context == NULL)
    {
      std::cerr << "Failed to create OpenCL context." << std::endl;
      Cleanup(context, commandQueue, program, kernel);
      return EXIT_FAILURE;
    }
  
  // Create a command queue
  commandQueue = CreateCommandQueue(context, &device, profiling_info);
  if(commandQueue == NULL)
    {
      std::cerr << "Failed to create OpenCL command queue." << std::endl;
      Cleanup(context, commandQueue, program, kernel);
      return EXIT_FAILURE;
    }
  
  // Create OpenCL program 
  program = CreateProgram(context, device, filename, filename2);
  if (program == NULL)
    {
      Cleanup(context, commandQueue, program, kernel);
      return EXIT_FAILURE;
    }
  
  // Create OpenCL kernel
  kernel = clCreateKernel(program, "VectorUpdate", NULL);
  if(kernel == NULL)
    {
      std::cerr << "Failed to create kernel." << std::endl;
      Cleanup(context, commandQueue, program, kernel);
      return EXIT_FAILURE;
    }
  

  printf("%lf\n",toc(init));

    /*     QUERYING DEVICE INFO     */
    
    size_t kernelWorkGroupSize; // maximum work-group size that can be used to execute a kernel
    size_t sizeOfWarp;          // the preferred multiple of workgroup size for launch
    cl_ulong localMemSize;      // the amount of local memory in bytes being used by a kernel
    cl_ulong privateMemSize;    // the minimum amount of private memory, in bytes, used by each workitem in the kernel. 

    HANDLE_OPENCL_ERROR(clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernelWorkGroupSize, NULL));
    HANDLE_OPENCL_ERROR(clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &sizeOfWarp, NULL));
  
#ifdef PRINT_INFO  
    HANDLE_OPENCL_ERROR(clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_LOCAL_MEM_SIZE, sizeof(cl_ulong), &localMemSize, NULL));
    HANDLE_OPENCL_ERROR(clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_PRIVATE_MEM_SIZE, sizeof(cl_ulong), &privateMemSize, NULL));
#endif

#ifdef PRINT_INFO 
    printf("------------  Some info: --------------\n");
    printf("kernelWorkGroupSize = %lu \n", kernelWorkGroupSize);
    printf("sizeOfWarp          = %lu \n", sizeOfWarp);
    printf("localMemSize        = %lu \n", localMemSize);
    printf("privateMemSize      = %lu \n", privateMemSize);
    printf("------------------------ --------------\n");
#endif

    if( WORK_GROUP_SIZE > kernelWorkGroupSize )
      {
	printf("Error: wrong work group size\n");
	return EXIT_FAILURE;
      }

    size_t localWorkSize[1] = {WORK_GROUP_SIZE};
    int numWorkGroups = (Ndev4-1)/WORK_GROUP_SIZE+1;
    size_t globalWorkSize[1] = {numWorkGroups*WORK_GROUP_SIZE};


    TIME t = tic();

    cl_mem DEV_x = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
				  sizeof(real)*N4, x, &status);
    HANDLE_OPENCL_ERROR(status);
    
    cl_mem DEV_y = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
				  sizeof(real)*N4, y, &status);
    HANDLE_OPENCL_ERROR(status);
    
    int n = 0;
    status  = clSetKernelArg(kernel, n++, sizeof(cl_mem), (void*)&DEV_x);
    status |= clSetKernelArg(kernel, n++, sizeof(cl_mem), (void*)&DEV_y);
    status |= clSetKernelArg(kernel, n++, sizeof(real), (void*)&alpha);
    status |= clSetKernelArg(kernel, n++, sizeof(int), (void*)&Ndev4);
    HANDLE_OPENCL_ERROR(status);

    
    printf("%lf\n",toc(t));

    // Queue the kernel 
    HANDLE_OPENCL_ERROR(clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL,
					       globalWorkSize, localWorkSize,
					       0, NULL, &myEvent));

    // Read the output buffer back to the Host
    HANDLE_OPENCL_ERROR(clEnqueueReadBuffer(commandQueue, DEV_x, CL_TRUE,
					    0, N4*sizeof(real), x,
					    0, NULL, &myEvent2));

    clFinish(commandQueue); // wait for all events to finish


    double elapsed_time = toc(start);

    /*  CHECK RESULT */

    TIME start_seq = tic();
    for (int i = 0; i < N; i++)
      ref_x[i] += alpha*y[i];
    double elapsed_time_seq = toc(start_seq);

    assert(ref_x[10] < 1000000);

    //std::cout << ref_x[0] << " " << x[0] << std::endl;

    //  for (int i = 0; i < N; i++)
    //  assert( abs(x[i] - ref_x[i]) < TOL );

    //std::cout << "Verified..." << std::endl;


#ifdef PROFILE
    clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_START, 
			    sizeof(cl_ulong), &startTime, NULL);
    clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END,
			    sizeof(cl_ulong), &endTime, NULL);
    clGetEventProfilingInfo(myEvent2, CL_PROFILING_COMMAND_START,
                            sizeof(cl_ulong), &startTime2, NULL);
    clGetEventProfilingInfo(myEvent2, CL_PROFILING_COMMAND_END,
                            sizeof(cl_ulong), &endTime2, NULL);

    kernelExecTimeNs = endTime-startTime;
    readFromGpuTime = endTime2-startTime2;
    printf(/*"Kernel execution time: %lf\n"*/"%lf\n", (double)readFromGpuTime/1000000000.0);
    printf(/*"Kernel execution time: %lf\n"*/"%lf\n", (double)kernelExecTimeNs/1000000000.0);
#endif
    printf(/*"Total execution time: %lf\n"*/"%lf\n", elapsed_time);
    printf(/*"Total execution time (seq.):*/"%lf\n", elapsed_time_seq);

    Cleanup(context, commandQueue, program, kernel);
    free(x);
    free(y);
    clReleaseMemObject(DEV_x);
    clReleaseMemObject(DEV_y);

    return EXIT_SUCCESS;
}
예제 #7
0
int main(int argc, char **argv) 
{
    int optindex = 0;
    char ch;
    struct option longopts[] = 
	{
        {"dimension",   required_argument, 0, 'd'}, //dimensions of src img
        {"components",  required_argument, 0, 'c'}, //numger of components of src img
        {"depth",       required_argument, 0, 'b'}, //bit depth of src img
        {"level",       required_argument, 0, 'l'}, //level of dwt
        {"device",      required_argument, 0, 'D'}, //cuda device
        {"forward",     no_argument,       0, 'f'}, //forward transform
        {"reverse",     no_argument,       0, 'r'}, //forward transform
        {"97",          no_argument,       0, '9'}, //9/7 transform
        {"53",          no_argument,       0, '5' }, //5/3transform
        {"write-visual",no_argument,       0, 'w' }, //write output (subbands) in visual (tiled) order instead of linear
        {"help",        no_argument,       0, 'h'}  
    };
    
    int pixWidth    = 0; //<real pixWidth
    int pixHeight   = 0; //<real pixHeight
    int compCount   = 3; //number of components; 3 for RGB or YUV, 4 for RGBA
    int bitDepth    = 8; 
    int dwtLvls     = 3; //default numuber of DWT levels
    int device      = 0;
    int forward     = 1; //forward transform
    int dwt97       = 0; //1=dwt9/7, 0=dwt5/3 transform
    int writeVisual = 0; //write output (subbands) in visual (tiled) order instead of linear
    char * pos;
 
    while ((ch = getopt_long(argc, argv, "d:c:b:l:D:fr95wh", longopts, &optindex)) != -1) 
	{
        switch (ch) {
        case 'd':
            pixWidth = atoi(optarg);
            pos = strstr(optarg, "x");
            if (pos == NULL || pixWidth == 0 || (strlen(pos) >= strlen(optarg))) 
			{
                usage();
                return -1;
            }
            pixHeight = atoi(pos+1);
            break;
        case 'c':
            compCount = atoi(optarg);
            break;
        case 'b':
            bitDepth = atoi(optarg);
            break;
        case 'l':
            dwtLvls = atoi(optarg);
            break;
        case 'D':
            device = atoi(optarg);
            break;
        case 'f':
            forward = 1;
            break;
        case 'r':
            forward = 0;
            break;
        case '9':
            dwt97 = 1;
            break;
        case '5':
            dwt97 = 0;
            break;
        case 'w':
            writeVisual = 1;
            break;
        case 'h':
            usage();
            return 0;
        case '?':
            return -1;
        default :
            usage();
            return -1;
        }
    }
	argc -= optind;
	argv += optind;

    if (argc == 0) 
	{ // at least one filename is expected
        printf("Please supply src file name\n");
        usage();
        return -1;
    }

    if (pixWidth <= 0 || pixHeight <=0) 
	{
        printf("Wrong or missing dimensions\n");
        usage();
        return -1;
    }

    if (forward == 0) 
	{
        writeVisual = 0; //do not write visual when RDWT
    }
	
	
	
	//
	// device init
	// Create an OpenCL context on first available platform
    context = CreateContext();
    if (context == NULL)
    {
        std::cerr << "Failed to create OpenCL context." << std::endl;
        return 1;
    }

    // Create a command-queue on the first device available
    // on the created context
    commandQueue = CreateCommandQueue(context, &cldevice);
    if (commandQueue == NULL)
    {
        Cleanup(context, commandQueue, program, kernel);
        return 1;
    }
	
	// Create OpenCL program from com_dwt.cl kernel source
	program = CreateProgram(context, cldevice, "com_dwt.cl");
    if (program == NULL)
    {
        printf("fail to create program!!\n");
    }

	// Create OpenCL kernel
	c_CopySrcToComponents = clCreateKernel(program, "c_CopySrcToComponents", NULL); 
	if (c_CopySrcToComponents == NULL)
    {
        std::cerr << "Failed to create kernel" << std::endl;
    }

	c_CopySrcToComponent = clCreateKernel(program, "c_CopySrcToComponent", NULL); 
	if (c_CopySrcToComponent == NULL)
    {
        std::cerr << "Failed to create kernel" << std::endl;
    }
	
	kl_fdwt53Kernel = clCreateKernel(program, "cl_fdwt53Kernel", NULL); 
    if (kl_fdwt53Kernel == NULL)
	{
		std::cerr<<"Failed to create kernel\n";
	}
	

	
	//initialize struct dwt
	struct dwt *d;
    d = (struct dwt *)malloc(sizeof(struct dwt));
    d->srcImg = NULL;
    d->pixWidth = pixWidth;
    d->pixHeight = pixHeight;
    d->components = compCount;
    d->dwtLvls  = dwtLvls;
	
	// file names
    d->srcFilename = (char *)malloc(strlen(argv[0]));
    strcpy(d->srcFilename, argv[0]);
    if (argc == 1) 
	{ // only one filename supplyed
        d->outFilename = (char *)malloc(strlen(d->srcFilename)+4);
        strcpy(d->outFilename, d->srcFilename);
        strcpy(d->outFilename+strlen(d->srcFilename), ".dwt");
    } else {
        d->outFilename = strdup(argv[1]);
    }

    //Input review
    printf("\nSource file:\t\t%s\n", d->srcFilename);
    printf(" Dimensions:\t\t%dx%d\n", pixWidth, pixHeight);
    printf(" Components count:\t%d\n", compCount);
    printf(" Bit depth:\t\t%d\n", bitDepth);
    printf(" DWT levels:\t\t%d\n", dwtLvls);
    printf(" Forward transform:\t%d\n", forward);
    printf(" 9/7 transform:\t\t%d\n", dwt97);
    
    //data sizes
    int inputSize = pixWidth*pixHeight*compCount; //<amount of data (in bytes) to proccess

    //load img source image
	d->srcImg = (unsigned char *) malloc (inputSize);
	if (getImg(d->srcFilename, d->srcImg, inputSize) == -1) 
        return -1;
		
	 // DWT
	// Create memory objects, Set arguments for kernel functions, Queue the kernel up for execution across the array, Read the output buffer back to the Host, Output the result buffer
	
    if (forward == 1) 
	{
        if(dwt97 == 1 )
            processDWT<float>(d, forward, writeVisual);
        else // 5/3
            processDWT<int>(d, forward, writeVisual);
    }
    else 
	{ // reverse
        if(dwt97 == 1 )
            processDWT<float>(d, forward, writeVisual);
        else // 5/3
            processDWT<int>(d, forward, writeVisual);
    }
	

	Cleanup(context, commandQueue, program, kernel);
	clReleaseKernel(c_CopySrcToComponents);
	clReleaseKernel(c_CopySrcToComponent);
	
    return 0;
	
}
예제 #8
0
///
//	main() for GLinterop example
//
int main(int argc, char** argv)
{
	cl_device_id device = 0;

	imWidth = 256;
	imHeight = 256;
	vbolen = imHeight;

	initGlut(argc, argv, imWidth, imHeight);
	vbo = initVBO(vbolen);
	initTexture(imWidth, imHeight);

	// Create an OpenCL context on first available platform
	context = CreateContext();
	if (context == NULL)
	{
		std::cerr << "Failed to create OpenCL context." << std::endl;
		return 1;
	}

	// Create a command-queue on the first device available
	// on the created context
	commandQueue = CreateCommandQueue(context, &device);
	if (commandQueue == NULL)
	{
		Cleanup();
		return 1;
	}

	// Create OpenCL program from GLinterop.cl kernel source
	program = CreateProgram(context, device, "kernel/GLinterop.cl");
	if (program == NULL)
	{
		Cleanup();
		return 1;
	}

	// Create OpenCL kernel
	kernel = clCreateKernel(program, "init_vbo_kernel", NULL);
	if (kernel == NULL)
	{
		std::cerr << "Failed to create kernel" << std::endl;
		Cleanup();
		return 1;
	}

	tex_kernel = clCreateKernel(program, "init_texture_kernel", NULL);
	if (tex_kernel == NULL)
	{
		std::cerr << "Failed to create kernel" << std::endl;
		Cleanup();
		return 1;
	}

	// Create memory objects that will be used as arguments to
	// kernel
	if (!CreateMemObjects(context, tex, vbo, &cl_vbo_mem, &cl_tex_mem))
	{
		Cleanup();
		return 1;
	}

	// Perform some queries to get information about the OpenGL objects
	performQueries();

	glutMainLoop();

	std::cout << std::endl;
	std::cout << "Executed program succesfully." << std::endl;
	Cleanup();

	return 0;
}
예제 #9
0
///
//	main() for HelloWorld example
//
int main(int argc, char** argv)
{
    cl_context context = 0;
    cl_command_queue commandQueue = 0;
    cl_program program = 0;
    cl_device_id device = 0;
    cl_kernel kernel = 0;
    cl_mem memObjects[2] = {0, 0};
    cl_int errNum;

    assert( (ARRAY_SIZE & (ARRAY_SIZE -1)) == 0 && "Array size must be a power of 2");
    int numberOfIterations = (int) ceil(log2(ARRAY_SIZE));
    assert( (1 << numberOfIterations) == ARRAY_SIZE && "numberOfIterations calculation is incorrect");


    // Create an OpenCL context on first available platform
    context = CreateContext();
    if (context == NULL)
    {
        std::cerr << "Failed to create OpenCL context." << std::endl;
        return 1;
    }

    // Create a command-queue on the first device available
    // on the created context
    commandQueue = CreateCommandQueue(context, &device);
    if (commandQueue == NULL)
    {
        Cleanup(context, commandQueue, program, kernel, memObjects);
        return 1;
    }

    // Create OpenCL program from HelloWorld.cl kernel source
    program = CreateProgram(context, device, "SimplePrefixSum.cl");
    if (program == NULL)
    {
        Cleanup(context, commandQueue, program, kernel, memObjects);
        return 1;
    }

    // Create OpenCL kernel
    kernel = clCreateKernel(program, "prefix_sum", NULL);
    if (kernel == NULL)
    {
        std::cerr << "Failed to create kernel" << std::endl;
        Cleanup(context, commandQueue, program, kernel, memObjects);
        return 1;
    }


    // Create memory objects that will be used as arguments to
    // kernel.  First create host memory arrays that will be
    // used to store the arguments to the kernel
    cl_int a[ARRAY_SIZE];
    cl_int b[ARRAY_SIZE];
    for (int i = 0; i < ARRAY_SIZE; i++)
    {
        a[i] = i+1;
        b[i] = 0;
    }

    if (!CreateMemObjects(context, memObjects, a, b))
    {
        Cleanup(context, commandQueue, program, kernel, memObjects);
        return 1;
    }

    // Set the kernel arguments (result, a, b)
    errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memObjects[0]);
    errNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memObjects[1]);
    errNum |= clSetKernelArg(kernel, 2, sizeof(int), &numberOfIterations);
    if (errNum != CL_SUCCESS)
    {
        std::cerr << "Error setting kernel arguments." << std::endl;
        Cleanup(context, commandQueue, program, kernel, memObjects);
        return 1;
    }

    size_t globalWorkSize[1] = { ARRAY_SIZE };
    size_t localWorkSize[1] = { ARRAY_SIZE };

    // Queue the kernel up for execution across the array
    errNum = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL,
                                    globalWorkSize, localWorkSize,
                                    0, NULL, NULL);
    if (errNum != CL_SUCCESS)
    {
        std::cerr << "Error queuing kernel for execution." << std::endl;
        Cleanup(context, commandQueue, program, kernel, memObjects);
        return 1;
    }

    // The location of the result depends on how many loop iterations were executed.
    cl_mem returnObject = (numberOfIterations % 2 == 0)? memObjects[0] : memObjects[1];

    // Read the output buffer back to the Host
    errNum = clEnqueueReadBuffer(commandQueue, returnObject, CL_TRUE,
                                 0, ARRAY_SIZE * sizeof(cl_int), a,
                                 0, NULL, NULL);
    if (errNum != CL_SUCCESS)
    {
        std::cerr << "Error reading result buffer." << std::endl;
        Cleanup(context, commandQueue, program, kernel, memObjects);
        return 1;
    }

    // Output the result buffer
    std::cout << "Result:" << std::endl;
    for (int i = 0; i < ARRAY_SIZE; i++)
    {
        std::cout << a[i] << " ";
    }
    std::cout << std::endl;
    std::cout << "Executed program succesfully." << std::endl;
    Cleanup(context, commandQueue, program, kernel, memObjects);

    return 0;
}
예제 #10
0
int main(int argc, char** argv)
{
    cl_context       context      = 0;
    cl_command_queue commandQueue = 0;
    cl_program       program      = 0;
    cl_device_id     device       = 0;
    cl_kernel        kernel       = 0;
    cl_int           status;
   
    int profiling_info = 0;
    cl_event myEvent, myEvent2;

    int ITEMS_PER_ROW;

    if( argc < 3 || argc > 4 )
      {
	printf("Usage: %s matrix_file vector_file [ref_file]\n", argv[0]);
	return EXIT_FAILURE;
      }

    int verify = (argc == 4 ? 1 : 0);

    char Mfilename[50];
    char xfilename[50];
    char reffilename[50];



    strcpy(Mfilename, argv[1]);
    strcpy(xfilename, argv[2]);
    if(verify)
      strcpy(reffilename, argv[3]);


#ifdef PROFILE
    cl_ulong startTime, endTime, startTime2, endTime2;
    cl_ulong kernelExecTimeNs, readFromGpuTime;
    profiling_info = 1;
#endif

    char filename[]   = "../../kernels/MatVecMul_kernel.cl";
    char filename2[] = "../../common/types_kernel.h";

    /*  READING DATA FROM FILE  */

    struct MatrixParams params;
    real *Values;
    int *RowPtr;
    int *ColInd;
    real *x;
    real * res;
    int N;


    std::ifstream Mfile;
    Mfile.open (Mfilename, std::ios::in);
    if (!Mfile.is_open())
      {
	printf("Error: cannot open file\n");
	return EXIT_FAILURE;
      }

    Mfile >> params.NRows;
    Mfile >> params.NCols;
    Mfile >> params.NNZ;
    
    HANDLE_ALLOC_ERROR(Values = (real*)malloc(params.NNZ*sizeof(real)));
    HANDLE_ALLOC_ERROR(ColInd = (int*)malloc(params.NNZ*sizeof(int)));
    HANDLE_ALLOC_ERROR(RowPtr = (int*)malloc((params.NRows+1)*sizeof(int)));
    HANDLE_ALLOC_ERROR(x = (real*)malloc(params.NCols*sizeof(real)));
    HANDLE_ALLOC_ERROR(res = (real*)malloc(params.NRows*sizeof(real)));

    for( int i = 0; i < params.NRows+1; i++)
      Mfile >> RowPtr[i];
    for( int i = 0; i < params.NNZ; i++)
      Mfile >> ColInd[i];
    for( int i = 0; i < params.NNZ; i++)
      Mfile >> Values[i];
 
    Mfile.close();

    std::ifstream xfile;
    xfile.open (xfilename, std::ios::in);
    if (!xfile.is_open())
      {
	printf("Error: cannot open file\n");
	return EXIT_FAILURE;
      }

    xfile >> N;
    assert(params.NRows == N);

    for( int i = 0; i < params.NCols; i++)
      xfile >> x[i];
 
    xfile.close();




    TIME start = tic();

    TIME init = tic();

    // Create an OpenCL context
    context = CreateContext();
    if(context == NULL)
      {
	std::cerr << "Failed to create OpenCL context." << std::endl;
	Cleanup(context, commandQueue, program, kernel);
	return EXIT_FAILURE;
      }

    // Create a command queue
    commandQueue = CreateCommandQueue(context, &device, profiling_info);
    if(commandQueue == NULL)
      {
	std::cerr << "Failed to create OpenCL command queue." << std::endl;
	Cleanup(context, commandQueue, program, kernel);
	return EXIT_FAILURE;
      }

    program = CreateProgram(context, device, filename, filename2);
    if (program == NULL)
      {
	Cleanup(context, commandQueue, program, kernel);
	return 1;
      }
    
    // Create OpenCL kernel
    kernel = clCreateKernel(program, "MatVecMul", NULL);
    if(kernel == NULL)
      {
	std::cerr << "Failed to create kernel." << std::endl;
	Cleanup(context, commandQueue, program, kernel);
	return EXIT_FAILURE;
      }
    
    printf("%lf\n", toc(init));

    /*     QUERYING DEVICE INFO     */

    size_t kernelWorkGroupSize; // maximum work-group size that can be used to execute a kernel
    size_t sizeOfWarp;          // the preferred multiple of workgroup size for launch
    cl_ulong localMemSize;      // the amount of local memory in bytes being used by a kernel
    cl_ulong privateMemSize;    // the minimum amount of private memory, in bytes, used by each workitem in the kernel. 

    HANDLE_OPENCL_ERROR(clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernelWorkGroupSize, NULL));
    HANDLE_OPENCL_ERROR(clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &sizeOfWarp, NULL));
#ifdef PRINT_INFO    
    HANDLE_OPENCL_ERROR(clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_LOCAL_MEM_SIZE, sizeof(cl_ulong), &localMemSize, NULL));
    HANDLE_OPENCL_ERROR(clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_PRIVATE_MEM_SIZE, sizeof(cl_ulong), &privateMemSize, NULL));
#endif
#ifdef PRINT_INFO 
    printf("------------  Some info: --------------\n");
    printf("kernelWorkGroupSize = %lu \n", kernelWorkGroupSize);
    printf("sizeOfWarp          = %lu \n", sizeOfWarp);
    printf("localMemSize        = %lu \n", localMemSize);
    printf("privateMemSize      = %lu \n", privateMemSize);
    printf("------------------------ --------------\n");
#endif

    if( WORK_GROUP_SIZE > kernelWorkGroupSize )
      {
	printf("Error: wrong work group size\n");
	return EXIT_FAILURE;
      }
    ITEMS_PER_ROW = sizeOfWarp;

    TIME t = tic();

    int numWorkRows = params.NRows;    
    size_t localWorkSize[1]  = {WORK_GROUP_SIZE};
    int numWarpsInGroup = WORK_GROUP_SIZE/ITEMS_PER_ROW;
    size_t globalWorkSize[1] = {((numWorkRows-1) / numWarpsInGroup + 1)*WORK_GROUP_SIZE};

    cl_mem DEV_Values = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                   sizeof(real)*params.NNZ, Values, &status);
    HANDLE_OPENCL_ERROR(status);

    cl_mem DEV_ColInd = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                   sizeof(int)*params.NNZ, ColInd, &status);
    HANDLE_OPENCL_ERROR(status);

    cl_mem DEV_RowPtr = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
				       sizeof(int)*(params.NRows+1), RowPtr, &status);
    HANDLE_OPENCL_ERROR(status);

    cl_mem DEV_x = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                   sizeof(real)*params.NCols, x, &status);
    HANDLE_OPENCL_ERROR(status);


    cl_mem DEV_res = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
                                   sizeof(real)*params.NRows, NULL, &status);
    HANDLE_OPENCL_ERROR(status);

    int p_dim = 2;
    int p[] = {ITEMS_PER_ROW, params.NRows};

    cl_mem DEV_p = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
				  sizeof(int)*p_dim, p, &status);
    HANDLE_OPENCL_ERROR(status);

    int n = 0;
    status |= clSetKernelArg(kernel, n++, sizeof(cl_mem), (void*)&DEV_Values);
    status |= clSetKernelArg(kernel, n++, sizeof(cl_mem), (void*)&DEV_ColInd);
    status |= clSetKernelArg(kernel, n++, sizeof(cl_mem), (void*)&DEV_RowPtr);
    status |= clSetKernelArg(kernel, n++, sizeof(cl_mem), (void*)&DEV_x);
    status |= clSetKernelArg(kernel, n++, sizeof(cl_mem), (void*)&DEV_res);
    status |= clSetKernelArg(kernel, n++, sizeof(cl_mem), (void*)&DEV_p);
    status |= clSetKernelArg(kernel, n++, sizeof(real)*WORK_GROUP_SIZE, NULL);
    HANDLE_OPENCL_ERROR(status);


    printf("%lf\n", toc(t));

    // Queue the kernel 
    HANDLE_OPENCL_ERROR(clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL,
					       globalWorkSize, localWorkSize,
					       0, NULL, &myEvent));

    // Read the output buffer back to the Host
    HANDLE_OPENCL_ERROR(clEnqueueReadBuffer(commandQueue, DEV_res, CL_TRUE,
					    0, params.NRows*sizeof(real), res,
					    0, NULL, &myEvent2));
    clFinish(commandQueue); // wait for all events to finish


    double elapsed_time = toc(start);

    /*  CHECK RESULTS   */
    
    if( verify )
      {
	std::ifstream reffile;
	reffile.open (reffilename, std::ios::in);
	if (!reffile.is_open())
	  {
	    printf("Error: cannot open file\n");
	    return EXIT_FAILURE;
	  }

	int N_ref;
	reffile >> N_ref;
	assert(N == N_ref);

	real *ref;
	HANDLE_ALLOC_ERROR(ref = (real*)malloc(N*sizeof(real)));

	for( int i = 0; i < N; i++)
	  reffile >> ref[i];

	reffile.close();


	for( int i = 0; i < N; i++)
	  {
	    //std::cout << ref[i] << "=="  << res[i] << ",   ";
	    assert(abs(ref[i] - res[i]) < TOL);
	  }
	
	std::cout << "Verified..." << std::endl;

      }




#ifdef PROFILE
    clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_START, 
			    sizeof(cl_ulong), &startTime, NULL);
    clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END,
			    sizeof(cl_ulong), &endTime, NULL);

    clGetEventProfilingInfo(myEvent2, CL_PROFILING_COMMAND_START,
                            sizeof(cl_ulong), &startTime2, NULL);
    clGetEventProfilingInfo(myEvent2, CL_PROFILING_COMMAND_END,
                            sizeof(cl_ulong), &endTime2, NULL);

    kernelExecTimeNs = endTime-startTime;
    readFromGpuTime = endTime2-startTime2;
    printf(/*"Kernel execution time: %lf\n"*/"%lf\n", (double)readFromGpuTime/1000000000.0);
    printf(/*"Kernel execution time: %lf\n"*/"%lf\n", (double)kernelExecTimeNs/1000000000.0);
#endif
printf(/*"Total execution time: %lf\n"*/"%lf\n", elapsed_time);


    Cleanup(context, commandQueue, program, kernel);
    free(Values);
    free(RowPtr);
    free(ColInd);
    free(x);
    free(res);

    clReleaseMemObject(DEV_Values); 
    clReleaseMemObject(DEV_ColInd);
    clReleaseMemObject(DEV_RowPtr);
    clReleaseMemObject(DEV_x);
    clReleaseMemObject(DEV_res);
    clReleaseMemObject(DEV_p);


    return EXIT_SUCCESS;
}
예제 #11
0
///
//	main() for HelloBinaryWorld example
//
int main(int argc, char** argv)
{
    time_t start, end;
    double duration = 0;
    time(&start);
	
    cl_context context = 0;
    cl_command_queue commandQueue = 0;
    cl_program program = 0;
    cl_device_id device = 0;
    cl_kernel kernel = 0;
    cl_mem imageObjects[2] = { 0, 0 };
    cl_sampler sampler = 0;
    cl_int errNum;


    if (argc != 3)
    {
        std::cerr << "USAGE: " << argv[0] << " <inputImageFile> <outputImageFiles>" << std::endl;
        return 1;
    }

    // Create an OpenCL context on first available platform
    context = CreateContext();
    if (context == NULL)
    {
        std::cerr << "Failed to create OpenCL context." << std::endl;
        return 1;
    }

    // Create a command-queue on the first device available
    // on the created context
    commandQueue = CreateCommandQueue(context, &device);
    if (commandQueue == NULL)
    {
        Cleanup(context, commandQueue, program, kernel, imageObjects, sampler);
        return 1;
    }

    // Make sure the device supports images, otherwise exit
    cl_bool imageSupport = CL_FALSE;
    clGetDeviceInfo(device, CL_DEVICE_IMAGE_SUPPORT, sizeof(cl_bool),
                    &imageSupport, NULL);
    if (imageSupport != CL_TRUE)
    {
        std::cerr << "OpenCL device does not support images." << std::endl;
        Cleanup(context, commandQueue, program, kernel, imageObjects, sampler);
        return 1;
    }

    // Load input image from file and load it into
    // an OpenCL image object
    int width, height;
    imageObjects[0] = LoadImage(context, argv[1], width, height);
    if (imageObjects[0] == 0)
    {
        std::cerr << "Error loading: " << std::string(argv[1]) << std::endl;
        Cleanup(context, commandQueue, program, kernel, imageObjects, sampler);
        return 1;
    }

    // Create ouput image object
    cl_image_format clImageFormat;
    clImageFormat.image_channel_order = CL_RGBA;
    clImageFormat.image_channel_data_type = CL_UNORM_INT8;
    imageObjects[1] = clCreateImage2D(context,
                                       CL_MEM_WRITE_ONLY,
                                       &clImageFormat,
                                       width,
                                       height,
                                       0,
                                       NULL,
                                       &errNum);

    if (errNum != CL_SUCCESS)
    {
        std::cerr << "Error creating CL output image object." << std::endl;
        Cleanup(context, commandQueue, program, kernel, imageObjects, sampler);
        return 1;
    }


    // Create sampler for sampling image object
    sampler = clCreateSampler(context,
                              CL_FALSE, // Non-normalized coordinates
                              CL_ADDRESS_CLAMP_TO_EDGE,
                              CL_FILTER_NEAREST,
                              &errNum);

    if (errNum != CL_SUCCESS)
    {
        std::cerr << "Error creating CL sampler object." << std::endl;
        Cleanup(context, commandQueue, program, kernel, imageObjects, sampler);
        return 1;
    }

    // Create OpenCL program
    program = CreateProgram(context, device, "ImageFilter2D.cl");
    if (program == NULL)
    {
        Cleanup(context, commandQueue, program, kernel, imageObjects, sampler);
        return 1;
    }

    // Create OpenCL kernel
    kernel = clCreateKernel(program, "gaussian_filter", NULL);
    if (kernel == NULL)
    {
        std::cerr << "Failed to create kernel" << std::endl;
        Cleanup(context, commandQueue, program, kernel, imageObjects, sampler);
        return 1;
    }

    size_t localWorkSize[1] = { 8 };
    size_t globalWorkSize[1] =  { RoundUp(localWorkSize[0], height) };

    // Set the kernel arguments
    errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), &imageObjects[0]);
    errNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &imageObjects[1]);
    errNum |= clSetKernelArg(kernel, 2, sizeof(cl_sampler), &sampler);
    errNum |= clSetKernelArg(kernel, 3, sizeof(cl_int), &width);
    errNum |= clSetKernelArg(kernel, 4, sizeof(cl_int), &height);
    errNum |= clSetKernelArg(kernel, 5, sizeof(float) * ((width+2)*(localWorkSize[0]+2)*4), NULL);
    if (errNum != CL_SUCCESS)
    {
        std::cerr << "Error setting kernel arguments." << std::endl;
        Cleanup(context, commandQueue, program, kernel, imageObjects, sampler);
        return 1;
    }

    // Queue the kernel up for execution
    errNum = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL,
                                    globalWorkSize, localWorkSize,
                                    0, NULL, NULL);
    if (errNum != CL_SUCCESS)
    {
        std::cerr << "Error queuing kernel for execution." << std::endl;
        Cleanup(context, commandQueue, program, kernel, imageObjects, sampler);
        return 1;
    }

    // Read the output buffer back to the Host
    char *buffer = new char [width * height * 4];
    size_t origin[3] = { 0, 0, 0 };
    size_t region[3] = { width, height, 1};
    errNum = clEnqueueReadImage(commandQueue, imageObjects[1], CL_TRUE,
                                origin, region, 0, 0, buffer,
                                0, NULL, NULL);
    if (errNum != CL_SUCCESS)
    {
        std::cerr << "Error reading result buffer." << std::endl;
        Cleanup(context, commandQueue, program, kernel, imageObjects, sampler);
        return 1;
    }

    std::cout << std::endl;
    std::cout << "Executed program succesfully." << std::endl;

    //memset(buffer, 0xff, width * height * 4);
    // Save the image out to disk
    if (!SaveImage(argv[2], buffer, width, height))
    {
        std::cerr << "Error writing output image: " << argv[2] << std::endl;
        Cleanup(context, commandQueue, program, kernel, imageObjects, sampler);
        delete [] buffer;
        return 1;
    }

    delete [] buffer;
    Cleanup(context, commandQueue, program, kernel, imageObjects, sampler);

    time(&end);
    duration = difftime(end, start);
    std::cout << "Total time taken: " << duration << " milliseconds" << std::endl;
    return 0;
}
예제 #12
0
void QHoneycombWidget::initializeGL()
{
  cl_device_id device = 0;
  connect(context(), &QOpenGLContext::aboutToBeDestroyed, this, &QHoneycombWidget::cleanup);

  initializeOpenGLFunctions();

  initTexture();

  // Create an OpenCL context on first available platform
  clcontext = CreateContext();
  if (clcontext == NULL)
  {
    std::cerr << "Failed to create OpenCL context." << std::endl;
    return ;
  }

  // Create a command-queue on the first device available
  // on the created context
  commandQueue = CreateCommandQueue(clcontext, &device);
  if (commandQueue == NULL)
  {
    cleanup();
    return ;
  }


  // Create OpenCL program from GLinterop.cl kernel source
  program = CreateProgram(device, "Kelvin.cl");
  if (program == NULL)
  {
    cleanup();
    return;
  }

  // Create OpenCL kernel
  m_KernelCopyIntoTexture = clCreateKernel(program, "copy_texture_kernel", NULL);
  if (m_KernelCopyIntoTexture == NULL)
  {
    std::cerr << "Failed to create kernel" << std::endl;
    cleanup();
    return;
  }

  m_KernelCopyIntoTextureBasic = clCreateKernel(program, "copy_texture_kernel_basic", NULL);
  if (m_KernelCopyIntoTextureBasic == NULL)
  {
    std::cerr << "Failed to create kernel" << std::endl;
    cleanup();
    return;
  }

  m_KernelIteration = clCreateKernel(program, "KelvinIteration", NULL);
  if (m_KernelIteration == NULL)
  {
    std::cerr << "Failed to create kernel" << std::endl;
    cleanup();
    return;
  }

  // Create memory objects that will be used as arguments to
  // kernel
  if (!CreateMemObjects())
  {
    cleanup();
    return ;
  }
}
예제 #13
0
int main(int argc, char *argv[]){

	if (MODE == 5){

		printf("---OpenCL Test Code---\n\n");


		cl_int errNum;
		cl_uint numPlatforms;
		cl_platform_id *platforms = NULL;
		cl_uint numDevices;
		cl_device_id *devices = NULL;

		//platform info fields
		char vendor[1024], name[1024], version[1024];

		//device info fields
		size_t MAX_WORK_GROUP_SIZE;
		cl_ulong GLOBAL_MEM_CACHE_SIZE, GLOBAL_MEM_SIZE, LOCAL_MEM_SIZE, GLOBAL_MEM_CACHELINE_SIZE;
		cl_uint MAX_COMPUTE_UNITS, MAX_WORK_ITEM_DIMENSIONS;
		size_t MAX_WORK_ITEM_SIZES[3];
		char DEVICE_NAME[1024], DEVICE_VENDOR[1024], DEVICE_VERSION[1024], DRIVER_VERSION[1024], EXTENSIONS[2048];
		cl_device_mem_cache_type GLOBAL_MEM_CACHE_TYPE;


		//printf("Getting number of OpenCL Platforms...\n");
		errNum = clGetPlatformIDs(0, NULL, &numPlatforms);
		if (errNum != CL_SUCCESS)
		{
			printf("Failed to get number of OpenCL platforms.\n");
			return 0;
		}
		else
		{

			//printf("found %d.\n", numPlatforms);
		}

		//printf("Allocating space for the platform info...\n");
		platforms = (cl_platform_id *)malloc(numPlatforms*sizeof(cl_platform_id));

		printf("---Platform Info---\n");
		errNum = clGetPlatformIDs(numPlatforms, platforms, NULL);
		if (errNum != CL_SUCCESS)
		{
			printf("Failed to get platform info.\n");
			return 0;
		}
		else
		{
			clGetPlatformInfo (platforms[0], CL_PLATFORM_VENDOR, sizeof(vendor), vendor, NULL);
			clGetPlatformInfo (platforms[0], CL_PLATFORM_NAME, sizeof(name), name, NULL);
			clGetPlatformInfo (platforms[0], CL_PLATFORM_VERSION, sizeof(version), version, NULL);

			//printf("Got platform info.\n");
			printf("Vendor: \t%s\n", vendor);
			printf("Name:   \t%s\n", name);
			printf("Version:\t%s\n", version);
		}

		//printf("Getting number of devices...\n");
		errNum = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices);
		if (errNum != CL_SUCCESS)
		{
			printf("Failed to get number of devices.\n");
			return 0;
		}
		else
		{
	    	//printf("Found %d.\n", numDevices);
	    }

		//printf("Allocating space for device info...\n");
		devices = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id));

		printf("\n---Device Info---");
		errNum = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL);
		if (errNum != CL_SUCCESS)
		{
			printf("Failed to get device info.\n");
			return 0;
		}
		else
		{

			int i, j = 0;
			for (i = 0; i < numDevices; i++ )
			{
				printf("\nDevice ID: %d\n", i+1);
				clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(DEVICE_NAME), DEVICE_NAME, NULL);
				clGetDeviceInfo(devices[i], CL_DEVICE_VENDOR, sizeof(DEVICE_VENDOR), DEVICE_VENDOR, NULL);
				clGetDeviceInfo(devices[i], CL_DEVICE_VERSION, sizeof(DEVICE_VERSION), DEVICE_VERSION, NULL);
				clGetDeviceInfo(devices[i], CL_DRIVER_VERSION, sizeof(DRIVER_VERSION), DRIVER_VERSION, NULL);
				clGetDeviceInfo(devices[i], CL_DEVICE_EXTENSIONS, sizeof(EXTENSIONS), EXTENSIONS, NULL);
				clGetDeviceInfo(devices[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(MAX_COMPUTE_UNITS), &MAX_COMPUTE_UNITS, NULL);
				clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(GLOBAL_MEM_SIZE), &GLOBAL_MEM_SIZE, NULL);
				clGetDeviceInfo(devices[i], CL_DEVICE_LOCAL_MEM_SIZE, sizeof(LOCAL_MEM_SIZE), &LOCAL_MEM_SIZE, NULL);
				clGetDeviceInfo(devices[i], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(MAX_WORK_ITEM_DIMENSIONS), &MAX_WORK_ITEM_DIMENSIONS, NULL);
				clGetDeviceInfo(devices[i], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(MAX_WORK_ITEM_SIZES), MAX_WORK_ITEM_SIZES, NULL);
				clGetDeviceInfo(devices[i], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(MAX_WORK_GROUP_SIZE), &MAX_WORK_GROUP_SIZE, NULL);
				clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, sizeof(GLOBAL_MEM_CACHE_SIZE), &GLOBAL_MEM_CACHE_SIZE, NULL);
				clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, sizeof(GLOBAL_MEM_CACHELINE_SIZE), &GLOBAL_MEM_CACHELINE_SIZE, NULL);
				clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, sizeof(GLOBAL_MEM_CACHE_TYPE), &GLOBAL_MEM_CACHE_TYPE, NULL);


				printf("Device Name:\t%s\n", DEVICE_NAME);
				printf("Device Vendor:\t%s\n", DEVICE_VENDOR);
				printf("Device Version:\t%s\n", DEVICE_VERSION);
				printf("Driver Version:\t%s\n", DRIVER_VERSION);
				printf("EXTENSIONS:\t%s\n", EXTENSIONS);
				printf("Number of CUs:\t%d\n", MAX_COMPUTE_UNITS);
				printf("GMem:\t\t%lld (Bytes)\n", (long long) GLOBAL_MEM_SIZE);
				printf("GMem $ Size:\t%lld (Bytes)\n", (long long) GLOBAL_MEM_CACHE_SIZE);
				printf("GMem $ Line:\t%lld (Bytes)\n", (long long) GLOBAL_MEM_CACHELINE_SIZE);
				if(GLOBAL_MEM_CACHE_TYPE == CL_NONE)
				{
					printf("GMem $ Type:\tCL_NONE\n");
				}
				else if(GLOBAL_MEM_CACHE_TYPE == CL_READ_ONLY_CACHE)
				{
					printf("GMem $ Type:\tCL_READ_ONLY_CACHE\n");
				}

				else if(GLOBAL_MEM_CACHE_TYPE == CL_READ_WRITE_CACHE)
				{
					printf("GMem $ Type:\tCL_READ_WRITE_CACHE\n");
				}
				printf("LMem:\t\t%lld (Bytes)\n", (long long) LOCAL_MEM_SIZE);
				printf("Work Group Size:%d (Max)\n", (int) MAX_WORK_GROUP_SIZE);
				printf("Work Item Dim:\t%d (Max)\n", MAX_WORK_ITEM_DIMENSIONS);
				printf("Work Item Size:\t");
				for(j = 0; j < MAX_WORK_ITEM_DIMENSIONS; j ++)
				{
						if (j != (MAX_WORK_ITEM_DIMENSIONS -1))
						printf("%d, ", (int) MAX_WORK_ITEM_SIZES[j]);

						if (j == (MAX_WORK_ITEM_DIMENSIONS -1))
						printf("%d ", (int) MAX_WORK_ITEM_SIZES[j]);
				}
				printf("(Max)\n");

			}

				//printf("Got device info.\n");
		}


	}

	else if (MODE == 4){
		cl_context context = 0;
	    cl_command_queue commandQueue = 0;
	    cl_program program = 0;
	    cl_device_id device = 0;

	    //Create an OpenCL context on first available platform
	    context = CreateContext();
	    if (context == NULL)
	    {
	        printf("Failed to create OpenCL context.\n");
	        return 1;
	    }

	    //Create a command-queue on the first device available on the created context
	    commandQueue = CreateCommandQueue(context, &device);
	    if (commandQueue == NULL)
	    {
	    	printf("Failed to create commandQueue.\n");
	    	Cleanup(context, commandQueue, program, NULL);
	    	return 1;
	    }

	    // Create OpenCL program and store the binary for future use.
	    printf("Attempting to create kernel binary from source.\n");
	    program = CreateProgram(context, device, KERNELPATHIN);
	    if (program == NULL)
	    {
	    	printf("Failed to create Program");
	    	Cleanup(context, commandQueue, program, NULL);
	    	return 1;
	    }

	    printf("Kernel is saved.\n");
	    if (SaveProgramBinary(program, device, KERNELPATHOUT) == false)
	    {
	        printf("Failed to write program binary.\n");
	        Cleanup(context, commandQueue, program, NULL);
	        return 1;
	     }

	    //printf("---Done---");

	    //return 1;

	}
	else if (MODE == 3){

		//todo free remaining objects not passed to cleanup

		//profiling
		int write_bytes = 0;
		int read_bytes = 0;
		/*unsigned long long start_cycles, stop_cycles;
		unsigned long long start_setup, stop_setup;
		unsigned long long start_write, stop_write;
		unsigned long long start_read, stop_read;
		unsigned long long start_finalize, stop_finalize;
		struct timespec start_time_t, stop_time_t;*/


		printf("Stream Mode\n\n");
		//clock_gettime(CLOCK_MONOTONIC, &start_time_t);
		//start_cycles = rdtsc();


		int i;
		time_t t;
		srand((unsigned) time(&t));

	    // Create the two input vectors
	    printf("\nHostside malloc(s)\n");
	    fflush(stdout);
	    int *A = (int*)malloc(sizeof(int)*(SIZE*SIZE));
	    int *B = (int*)malloc(sizeof(int)*(SIZE*SIZE));
	    int *C = (int*)malloc(sizeof(int)*(SIZE*SIZE));

	    //profile
	    //bytes += 3 * sizeof(int)*(SIZE*SIZE);

	    printf("\nHostside mat init\n");
	    fflush(stdout);
	    for(i = 0; i < (SIZE*SIZE); i++) {
	        A[i] = B[i] = rand() % 10 + 1;;
	    }


	    //print matrix
    	printf("Matrix A[%d][%d]:\n", SIZE, SIZE);
	   	for(i = 0; i < (SIZE*SIZE); i++)
	    {
	   		printf("%3d ", A[i]);
	   		if(((i + 1) % SIZE) == 0)
	   			printf("\n");
	    }

	    //print matrix
	   	printf("\nMatrix B[%d][%d]:\n", SIZE, SIZE);
	    for(i = 0; i < (SIZE*SIZE); i++)
	    {
	    	printf("%3d ", B[i]);
	        if(((i + 1) % SIZE) == 0)
	        	printf("\n");
	    }


	    //syscall(STATS_RESET);


	    //Get platform and device information
	    cl_context context = 0;
	    cl_command_queue commandQueue = 0;
	    cl_program program = 0;
	    cl_device_id device = 0;
	    cl_kernel kernel = 0;
	    cl_uint err = 0;
	    //char *filepath = NULL;

	    //Create the context
	    printf("\nCreateContext\n");
	    fflush(stdout);
	    context = CreateContext();
	    if (context == NULL)
	    {
	    	printf("Failed to create OpenCL context.\n");
	    	return 1;
	    }

	   /* printf("\nEnd CreateContext\n");
	    fflush(stdout);*/

	    //Create a command-queue on the first device available on the created context
	    printf("\nCreateCommandQueue\n");
	    fflush(stdout);
	    commandQueue = CreateCommandQueue(context, &device);
	    if (commandQueue == NULL)
	    {
	    	printf("Failed to create command queue.\n");
	    	Cleanup(context, commandQueue, program, NULL);
	    	return 1;
	    }

	    //create the program from the binary
	    //program = CreateProgramFromBinary(context, device, "/home/stardica/Desktop/Kernels/vector.cl.bin.GPU");
	    //strcat(KERNELPATHOUT, ".GPU")
	    printf("\nCreateProgramFromBinary\n");
	    fflush(stdout);
	    program = CreateProgramFromBinary(context, device, KERNEL);
	    if (program == NULL)
	    {
	    	printf("Failed to load kernel binary,\n");
	    	Cleanup(context, commandQueue, program, NULL);
	    	return 1;
	    }



	    // Create OpenCL kernel
	    printf("\nclCreateKernel\n");
	    fflush(stdout);
	    kernel = clCreateKernel(program, "Matrix", NULL);
	    if (kernel == NULL)
	    {
	    	printf("Failed to create kernel.\n");
	    	Cleanup(context, commandQueue, program, NULL);
	    	return 1;
	    }

	    cl_mem a_mem_obj = 0;
	    cl_mem b_mem_obj = 0;
	    cl_mem c_mem_obj = 0;

  	    //Create memory buffers on the device for each vector

	    printf("\nclCreateBuffer(s)\n");
	    fflush(stdout);
	    if(LOCALMEM == 1 && CACHEDMEM == 0)
		{
			//this creates uncached buffers in the GPU's local memory
			#if M2S_CGM_OCL_SIM
			{
				a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE);
				b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE);
				c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE);

			}
			#else
			{
				a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, (sizeof(int)*(SIZE*SIZE)), NULL, NULL);
				b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, (sizeof(int)*(SIZE*SIZE)), NULL, NULL);
				c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY, (sizeof(int)*(SIZE*SIZE)), NULL, NULL);
			}
			#endif
		}

		if(SYSMEM == 1 && CACHEDMEM == 0)
		{
			//this creates uncached buffers in the system memory
			#if M2S_CGM_OCL_SIM
			{
				a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE);
				b_mem_obj = clCreateBuffer(context,CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE);
				c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE);
			}
			#else
			{
				a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL);
				b_mem_obj = clCreateBuffer(context,CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL);
				c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL);
			}
			#endif
		}

		if(SYSMEM == 1 && CACHEDMEM == 1)
		{
			//this creates cached buffers in the system memory.
			#if M2S_CGM_OCL_SIM
			{
				a_mem_obj = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE);
				b_mem_obj = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE);
				c_mem_obj = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE);
			}
			#else
			{
				a_mem_obj = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL);
				b_mem_obj = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL);
				c_mem_obj = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL);
			}
			#endif
		}

	    if (a_mem_obj == NULL || b_mem_obj == NULL  || c_mem_obj == NULL)
	    {
	    	printf("Failed to create memory objects.\n");
	    	Cleanup(context, commandQueue, program, kernel);
	    	return 1;
	    }

	    //Copy the lists A and B to their respective memory buffers
	    printf("\nclEnqueueWriteBuffer(s)\n");
	    fflush(stdout);
	    write_bytes += 2 * sizeof(int)*(SIZE*SIZE);
	   // start_write = rdtsc();
	    clEnqueueWriteBuffer(commandQueue, a_mem_obj, CL_TRUE, 0, (sizeof(int)*(SIZE*SIZE)), A, 0, NULL, NULL);
	    clEnqueueWriteBuffer(commandQueue, b_mem_obj, CL_TRUE, 0, (sizeof(int)*(SIZE*SIZE)), B, 0, NULL, NULL);
	   // stop_write = rdtsc();


	    // Set the arguments of the kernel
	    int *size = (int *)SIZE;
	    printf("\nclSetKernelArg(s)\n");
	    fflush(stdout);
	    err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&c_mem_obj);
	    err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&a_mem_obj);
	    err = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&b_mem_obj);
	    err = clSetKernelArg(kernel, 3, sizeof(int), (void *)&size);
	    if (err != CL_SUCCESS)
	    {
	    	printf("Kernel args not set.\n");
	    	return 1;
	    }

	    // Execute the OpenCL kernel on the list
	    size_t GlobalWorkSize[2], LocalWorkSize[2];

	    //Rember that in OpenCL we need to express the globalWorkSize in
	    //terms of the total number of threads. The underlying OpenCL API
	    //will look at the globalWorkSize and divide by the localWorkSize
	    //to arrive at a 64 by 64 NDRange of 16 by 16 work groups.

	    GlobalWorkSize[0] = GWS_0;//SIZE*SIZE*SIZE; // Process the entire lists
	    GlobalWorkSize[1] = GWS_1;//SIZE*SIZE*SIZE; // Process the entire lists
	    LocalWorkSize[0] = LWS_0; //SIZE Divide work items into groups of 64
	    LocalWorkSize[1] = LWS_1; //SIZE Divide work items into groups of 64


	    //used null for local, lets OpenCL determine the best local size.
	    //err = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, GlobalWorkSize, LocalWorkSize, 0, NULL, NULL);
	    printf("\nclEnqueueNDRangeKernel\n");
	    fflush(stdout);
	    err = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, GlobalWorkSize, LocalWorkSize, 0, NULL, NULL);
	    if (err != CL_SUCCESS)
	    {
	    	printf("ND range not enqueued. Code: %d\n", err);
	    	return 1;
	    }


	    //Read the memory buffer C on the device to the local variable C
	    printf("\nclEnqueueReadBuffer\n");
	    fflush(stdout);
	    read_bytes += sizeof(int)*(SIZE*SIZE);
	    //start_read = rdtsc();
	    err = clEnqueueReadBuffer(commandQueue, c_mem_obj, CL_TRUE, 0, (sizeof(int)*(SIZE*SIZE)), C, 0, NULL, NULL);
	   // stop_read = rdtsc();
	    if (err != CL_SUCCESS)
	    {
	    	printf("Buffer not returned.\n");
	    	return 1;
	    }

	    //syscall(STATS_STOP);

	    //print matrix
	    printf("\nMatrix C[%d][%d] = A[%d][%d]*B[%d][%d]:\n", SIZE, SIZE, SIZE, SIZE, SIZE, SIZE);
	    for(i = 0; i < (SIZE*SIZE); i++)
	    {
	    	printf("%3d ", C[i]);
	        if(((i + 1) % SIZE) == 0)
	        printf("\n");
	    }

	    printf("\nHostside clean up\n");
	    fflush(stdout);
	    err = clFlush(commandQueue);
	    err = clFinish(commandQueue);
	    Cleanup(context, commandQueue, program, kernel);
	    err = clReleaseMemObject(a_mem_obj);
	    err = clReleaseMemObject(b_mem_obj);
	    err = clReleaseMemObject(c_mem_obj);
	    free(A);
	    free(B);
	    free(C);

	    //printf("---Done---");



	    /*stop_cycles = rdtsc();
	    clock_gettime(CLOCK_MONOTONIC, &stop_time_t);
	    printf("Total cycles = %llu\n", (stop_cycles - start_cycles));

	    long int time_s = stop_time_t.tv_nsec - start_time_t.tv_nsec;
	    printf("Approximate runtime (check) = %ld ms\n", (time_s/1000000));

	    printf("Bytes written %d\n", write_bytes);
	    printf("transfer cycles = %llu\n", (stop_write - start_write));
	    printf("start at = %llu\n", (start_write - start_cycles));

	    printf("Bytes read %d\n", read_bytes);
	    printf("transfer cycles = %llu\n", (stop_read - start_read));
	    printf("start at = %llu\n", (start_read - start_cycles));*/

	}
	else if (MODE == 2){

		printf("Multi Thread Mode\n");
		//cal this:
		//assignToThisCore(0);//assign to core 0,1,2,...

		unsigned long long a, b;
	    int i = 0;
	    int j = 0;
	    int k = 0;

		LoadMatrices();

		pthread_t tid[SIZE*SIZE];


		//printf("waiting\n");
		//start our threads
		a = rdtsc();
		syscall(BEGIN_PARALLEL_SECTION);

		for(i=0;i<SIZE;i++){
			for(j=0;j<SIZE;j++){
				struct RowColumnData *RCData = (struct RowColumnData *) malloc(sizeof(struct RowColumnData));
				RCData->RowNum = i;
				RCData->ColumnNum = j;
				//printf("Thread create %d Row %d Col %d\n", k, RCData->RowNum, RCData->ColumnNum);
				pthread_create(&tid[k], NULL, RowColumnMultiply, RCData);
				k++;
			}
		}

		//Join threads////////////////////////////
		for (i=0;i<NUM_THREADS;i++)
		{
			pthread_join(tid[i], NULL);
		}
		syscall(END_PARALLEL_SECTION);
		b = rdtsc();

		PrintMatrices();



		//printf("\nend clock Cycles: %llu\n", b);
		printf("\nDone. Number of clock Cycles: %llu\n", b-a);

	}
	else if (MODE == 1)
	{

		printf("Single Thread Mode\n\n");
		//unsigned long long a, b;
		//a = rdtsc();
		//time_t t;
		int i,j,k;

		//srand((unsigned) time(&t));

		LoadMatrices();

		//multiply mats/////////////////////////
		for (i=0;i<SIZE;i++){
			for(j=0;j<SIZE;j++){
				for(k=0;k<SIZE;k++){
					matC[i][j] = matC[i][j] + (matA[i][k] * matB[k][j]);
					}
			}
		}

		PrintMatrices();

		//b = rdtsc();
		//printf("\nDone. Number of clock Cycles: %llu\n", b-a);
	}
	else if (MODE == 0)
	{
		printf("---Misc Tests---\n\n");

		printf("size of long long is %d\n", (int) sizeof(long long));
		printf("size of long is %d\n", (int) sizeof(long));
		printf("size of int is %d\n", (int) sizeof(int));
		printf("size of short is %d\n", (int) sizeof(short));
		printf("size of char * %d\n", (int) sizeof(char *));
		printf("size of unsigned int (word) %d\n", (int) sizeof(unsigned int));

		char *string = "test string";
		printf("Here is the string 1: \"%s\"\n", string);

		//Using the struct
		//set string variable and point to print_me.
		object.string = strdup(string);
		object.print_me = (void (*)(void *)) print_me;

		//use of print_me
		object.print_me(object.string);

		//pointer fun
		struct Object *ptr = &object;
		printf("this is the value of the pointer to struct object: %p\n", ptr);
		object.next=&object;
		printf("this is the value of the pointer to struct object: %p\n", object.next);
		object_ptr = &object;
		object_ptr->next = &object;
		printf("this is the value of the pointer to struct object: %p\n", object_ptr->next);

		//Macro fun
		PRINT(ptr, ptr);
		PRINT(object.next, object.next);
		PRINT(object_ptr->next, object_ptr->next);

		int mmu_page_size = 1 << 12;

		printf("mmu_papge_size = %d\n", mmu_page_size);


		//setjmp and longjmp fun
		/*jmp_buf environment;
		int i;

		i = setjmp(environment);
		printf("\n\nsetjmp returned = %d\n", i);

		printf("Env 1:\n");

		int x = 0;
		for(x = 0; x < 6; x++)
		{
			printf("  %x\n", environment[x]);
		}


		if (i < 3)
		{
			longjmp(environment, 3);
		}

		printf("longjmp finished with i = %d\n", i);*/


	}
	else
	{

		printf("---Invalid Mode Set---\n\n");

	}

	printf("\n---Done---\n");
	return 1;
}
int main(int argc, char** argv)
{
	cl_context context = 0;
	cl_command_queue commandQueue = 0;
	cl_program program = 0;
	cl_device_id device = 0;
	cl_kernel kernel = 0;
	cl_mem memObjects[3] = { 0, 0, 0 };
	cl_int errNum;

	context = CreateContext(); 
	if (context == NULL)
	{
		std::cerr << "Failed to create OpenCL context." << std::endl;
		return 1;
	}
	commandQueue = CreateCommandQueue(context, &device);
	if (commandQueue == NULL)
	{
		Cleanup(context, commandQueue, program, kernel, memObjects);
		return 1;
	}
	program = CreateProgram(context, device, "sun.cl");
	if (program == NULL)
	{
		Cleanup(context, commandQueue, program, kernel, memObjects);
		return 1;
	}
	kernel = clCreateKernel(program, "hello_kernel", NULL);
	if (kernel == NULL)
	{
		std::cerr << "Failed to create kernel" << std::endl;
		Cleanup(context, commandQueue, program, kernel, memObjects);
		return 1;
	}

	// Create memory objects that will be used as arguments to
	// kernel.  First create host memory arrays that will be
	// used to store the arguments to the kernel
	float result[ARRAY_SIZE];
	float a[ARRAY_SIZE];
	float b[ARRAY_SIZE];
	for (int i = 0; i < ARRAY_SIZE; i++)
	{
		a[i] = (float)i;
		b[i] = (float)(i * 2);
	}
	//在设备上创建buffer对象
	if (!CreateMemObjects(context, memObjects, a, b))
	{
		Cleanup(context, commandQueue, program, kernel, memObjects);
		return 1;
	}

	// Set the kernel arguments (result, a, b)
	errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memObjects[0]);
	errNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memObjects[1]);
	errNum |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &memObjects[2]);
	if (errNum != CL_SUCCESS)
	{
		std::cerr << "Error setting kernel arguments." << std::endl;
		Cleanup(context, commandQueue, program, kernel, memObjects);
		return 1;
	}

	size_t globalWorkSize[1] = { ARRAY_SIZE };
	size_t localWorkSize[1] = { 1 };

	// Queue the kernel up for execution -----opencl runtime
	errNum = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL,
		globalWorkSize, localWorkSize,
		0, NULL, NULL);
	if (errNum != CL_SUCCESS)
	{
		std::cerr << "Error queuing kernel for execution." << std::endl;
		Cleanup(context, commandQueue, program, kernel, memObjects);
		return 1;
	}

	// Read the output buffer back to the Host
	errNum = clEnqueueReadBuffer(commandQueue, memObjects[2], CL_TRUE,
		0, ARRAY_SIZE * sizeof(float), result,
		0, NULL, NULL);
	if (errNum != CL_SUCCESS)
	{
		std::cerr << "Error reading result buffer." << std::endl;
		Cleanup(context, commandQueue, program, kernel, memObjects);
		return 1;
	}

	// Output the result buffer
	for (int i = 0; i < ARRAY_SIZE; i++)
	{
		std::cout << result[i] << " ";
	}
	std::cout << std::endl;
	std::cout << "Executed program succesfully." << std::endl;
	Cleanup(context, commandQueue, program, kernel, memObjects);

	return 0;
}