Beispiel #1
0
void imgdiff(size_t N, size_t width, size_t height, double* diff_matrix, unsigned char* images) 
{

	//// we need to fill in ////
	cl_platform_id *platform;
	cl_device_type dev_type = CL_DEVICE_TYPE_GPU;
	cl_device_id *devs;
	cl_context context;
	cl_command_queue *cmd_queues;
	cl_program program;
	cl_kernel *kernels;
	cl_uint num_platforms;
	cl_uint num_devs;

	cl_mem* m_image1;
	cl_mem* m_image2;
	cl_mem* m_result;

	cl_event* ev_kernels;

	int err = CL_SUCCESS;

	int i, j, k;
	
	// modify version
	err = clGetPlatformIDs(0, NULL, &num_platforms);
	if(err != CL_SUCCESS)
	{
		printf("Error: platform error\n");
		return 0;
	}

	if(num_platforms == 0)
	{
		printf("Error: platform no count\n");
		return 0;
	}

	platform = (cl_platform_id*)malloc(sizeof(cl_platform_id)*num_platforms);
	err = clGetPlatformIDs(num_platforms, platform, NULL);
	if(err != CL_SUCCESS)
	{
		printf("Error: clGetPlatformIDs error\n");
		return 0;
	}

	for(i = 0; i<num_platforms; i++)
	{
		err = clGetDeviceIDs(platform[i], dev_type, 0, NULL, &num_devs);
		if(err != CL_SUCCESS)
		{
			printf("Error: clGetDevice\n");
			return 0;
		}
		if(num_devs >= 1)
		{
			devs = (cl_device_id*)malloc(sizeof(cl_device_id) * num_devs);

			clGetDeviceIDs(platform[i], dev_type, num_devs, devs, NULL);
			break;
		}
	}

	context = clCreateContext(NULL, num_devs, devs, NULL, NULL, &err);
	if(err != CL_SUCCESS)
	{
		printf("Error: clCreateContext error\n");
		return 0;
	}

	char* source = NULL;
	size_t src_size = 0;
	err = ReadSourceFromFile("./imgdiff_cal.cl", &source, &src_size);
	if (CL_SUCCESS != err)
	{
		printf("Error: ReadSourceFromFile returned %s.\n", err);
		free(source);
		return 0;
	}

	program = clCreateProgramWithSource(context, 1, (const char**)&source, &src_size, &err);
	if(err != CL_SUCCESS)
	{
		printf("Error: clCreateProgram error\n");
		return 0;
	}

	free(source);
	printf("Create Program Success\n");

#if DBG
	// Measure clBuildProgram -@henry added
	gettimeofday(&start_m, NULL );
#endif
	err = clBuildProgram(program, num_devs, devs, "", NULL, NULL);
#if DBG
	gettimeofday(&end_m, NULL );

	double time = (end_m.tv_usec - start_m.tv_usec)*1e-6 + (end_m.tv_sec - start_m.tv_sec);
	printf("[Debug] Elapsed Time of clBuildProgram() : %lf s\n",time); 
#endif
	if(err != CL_SUCCESS)
	{
		printf("Error: clBuildProgram\n");
		return 0;
	}

	printf("Build Program Success\n");

	kernels = (cl_kernel*)malloc(sizeof(cl_kernel)*num_devs);
	for(i = 0; i<num_devs; i++)
	{
		kernels[i] = clCreateKernel(program, "imgdiff_cal", NULL);
	}


	printf("Create Kernel Success\n");

	cmd_queues = (cl_command_queue*)malloc(sizeof(cl_command_queue)*num_devs);
	for(i=0; i<num_devs; i++)
	{
		cmd_queues[i] = clCreateCommandQueue(context, devs[i], 0, &err);
		if(err != CL_SUCCESS)
		{
			printf("Error: clCreateCommandQueue error\n");
			return 0;
		}

	}

	printf("Create commandQueue Success\n");
	int LOCAL_WIDTH = 16;
	int LOCAL_HEIGHT = 16;


	int WORK_WIDTH = ceil((double)width / LOCAL_WIDTH)*LOCAL_WIDTH;
	int WORK_HEIGHT = ceil((double)height/LOCAL_HEIGHT) * LOCAL_HEIGHT;
	int WORK_AMOUNT = width * height;
	int WORK_GROUP_COUNT = ceil(((double)WORK_WIDTH * WORK_HEIGHT) / (LOCAL_WIDTH * LOCAL_HEIGHT));
	
	int WORK_GROUP_WIDTH = width;
	int WORK_GROUP_HEIGHT = height;

	int SAMPLE_COUNT = 16;
	int WORK_COUNT[num_devs];
	double tmp_result_data[WORK_GROUP_COUNT*SAMPLE_COUNT];

	printf("WORK_WIDTH %d\tWORK_HEIGHT %d\t WORK_AMOUNT %d\t WORK_GROUP_COUNT %d\n", 
			WORK_WIDTH, WORK_HEIGHT, WORK_AMOUNT, WORK_GROUP_COUNT);

	m_image1 = (cl_mem*)malloc(sizeof(cl_mem)* num_devs);
	m_image2 = (cl_mem*)malloc(sizeof(cl_mem)* num_devs);
	
	m_result = (cl_mem*)malloc(sizeof(cl_mem)* num_devs);


	for(i=0; i<num_devs; i++)
	{
		m_image1[i] = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(unsigned char) * WORK_AMOUNT * 3, NULL, NULL);
		m_image2[i] = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(unsigned char) * WORK_AMOUNT*SAMPLE_COUNT * 3, NULL, NULL);
			
		m_result[i] = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(double) * WORK_GROUP_COUNT * SAMPLE_COUNT, NULL, NULL);
		clSetKernelArg(kernels[i], 0, sizeof(cl_mem), (void*)&m_image1[i]);
		clSetKernelArg(kernels[i], 1, sizeof(cl_mem), (void*)&m_image2[i]);
		clSetKernelArg(kernels[i], 2, sizeof(cl_mem), (void*)&m_result[i]);
		clSetKernelArg(kernels[i], 3, sizeof(cl_int), &WORK_GROUP_WIDTH);
		clSetKernelArg(kernels[i], 4, sizeof(cl_int), &WORK_GROUP_HEIGHT);
	}

	ev_kernels  = (cl_event*)malloc(sizeof(cl_event)*num_devs);

	int row, col;

	row = 0;
	col = 1;

	for(row = 0; row < N; row++)
	{
		if( (N-row-1) < (SAMPLE_COUNT*4) && SAMPLE_COUNT > 1)
			SAMPLE_COUNT = SAMPLE_COUNT / 2;
		
		int remain_count = N - (row + 1);

		for(i=0; i<num_devs; i++)
		{

			clEnqueueWriteBuffer(cmd_queues[i], m_image1[i], CL_FALSE, 0, 
					sizeof(unsigned char) * WORK_AMOUNT * 3, (void*)(images + 
					(row * width*height)*3), 0, NULL, NULL);
		}
			
		diff_matrix[row*N + row] = 0;
		col = row + 1;
		while( col< N)
		{
			size_t lws[2] = { LOCAL_WIDTH, LOCAL_HEIGHT };
			size_t gws[2] = { WORK_WIDTH, WORK_HEIGHT};
			
			for(i=0; i<num_devs; i++)
			{
				if((remain_count - SAMPLE_COUNT) < 0)
				{
					WORK_COUNT[i] = remain_count;
					remain_count = 0;
				}
				else
				{
					WORK_COUNT[i] = SAMPLE_COUNT;
					remain_count = remain_count - SAMPLE_COUNT;
				}
				
				if(WORK_COUNT[i] != 0)
				{
					
					clSetKernelArg(kernels[i], 5, sizeof(cl_int), &WORK_COUNT[i]);
					
					int offset = 0;
					for(j=0; j<i; j++)
						offset += WORK_COUNT[j];


					err = clEnqueueWriteBuffer(cmd_queues[i], m_image2[i], CL_FALSE, 0, 
							sizeof(unsigned char)*WORK_AMOUNT*WORK_COUNT[i]*3, 
							(void*)(images +((col * width*height) + (WORK_AMOUNT * 
										offset))*3), 0, NULL, NULL);

				}


			}

			for( i=0; i < num_devs; i++ )
			{
				if(WORK_COUNT[i] != 0)
				{
					err = clEnqueueNDRangeKernel(cmd_queues[i], kernels[i], 2, NULL, gws, lws, 0, NULL, NULL);
					if(err != CL_SUCCESS)
					{
						printf("Error: clEnqueueNDRangeKernel %d error\n", i);
						printf("%s\n", TranslateOpenCLError(err));
						return 0;
					}
				}
			}
			double tmp_sum = 0;
			i = 0;
			for( i = num_devs -1; i >= 0; i-- )
			{
				
				if(WORK_COUNT[i] != 0)
				{
					err = clEnqueueReadBuffer( cmd_queues[i], m_result[i], CL_TRUE, 0, 
						sizeof(double) * WORK_GROUP_COUNT * WORK_COUNT[i], 
						tmp_result_data, 0, NULL, NULL); 
					if(err != CL_SUCCESS)
					{
						printf("Error: clEnqueueReadBuffer%d error\n", i);
						return 0;
					}
					//printf("receive......");

					for(j = 0; j<WORK_COUNT[i]; j++)
					{	
						tmp_sum = 0;
						for(k = 0; k<WORK_GROUP_COUNT; k++)
						{
							tmp_sum += tmp_result_data[k + j*WORK_GROUP_COUNT];
							//printf("%lf\t", tmp_result_data[k+j*WORK_GROUP_COUNT]);

						}
						//printf("%lf %lf\n", tmp_sum, tmp_result_data[j*WORK_GROUP_COUNT]);
						
						int offset = 0;
						for(k=0; k<i; k++)
							offset += WORK_COUNT[k];
						diff_matrix[row*N+col+j+offset] = diff_matrix[(col+j+offset)*N+row] = tmp_sum;
					}

				}
				
			}
			
			for( i = 0; i < num_devs; i++ )
			{
				col += WORK_COUNT[i];
			} 
		}
	}

}
Beispiel #2
0
// SETUP
int CLContext::setupCL()
{
	cl_int status = CL_SUCCESS;

	cl_device_type dType;
	int gpu = 1;

	if(gpu == 0)
		dType = CL_DEVICE_TYPE_CPU;
	else //deviceType = "gpu" 
		dType = CL_DEVICE_TYPE_GPU;

	/*
	* Have a look at the available platforms and pick either
	* the AMD one if available or a reasonable default.      <----- LOL check out the amd propaganda
	*/

	cl_uint numPlatforms;
	cl_platform_id platform = NULL;
	status = clGetPlatformIDs(0, NULL, &numPlatforms);
	if(!checkVal(status, CL_SUCCESS, "clGetPlatformIDs failed."))
		return CL_FAILURE;
	if (0 < numPlatforms) 
	{
		cl_platform_id* platforms = new cl_platform_id[numPlatforms];
		status = clGetPlatformIDs(numPlatforms, platforms, NULL);
		if(!checkVal(status, CL_SUCCESS, "clGetPlatformIDs failed."))
			return CL_FAILURE;
		for (unsigned i = 0; i < numPlatforms; ++i) 
		{
			char pbuf[100];
			status = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, NULL);

			if(!checkVal(status, CL_SUCCESS, "clGetPlatformInfo failed."))
				return CL_FAILURE;

			platform = platforms[i];
			if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) 
				break;
		}
		delete[] platforms;
	}

	/*
	* If we could find our platform, use it. Otherwise pass a NULL and get whatever the
	* implementation thinks we should be using.
	*/

	cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM,  (cl_context_properties)platform,  0 };
	/* Use NULL for backward compatibility */
	cl_context_properties* cprops = (NULL == platform) ? NULL : cps;

	context = clCreateContextFromType( cprops, dType, NULL, NULL, &status);
	if(!checkVal( status, CL_SUCCESS, "clCreateContextFromType failed."))
		return CL_FAILURE;

	size_t deviceListSize;

	/* First, get the size of device list data */
	status = clGetContextInfo( context,  CL_CONTEXT_DEVICES,  0,  NULL,  &deviceListSize);
	if(!checkVal( status,  CL_SUCCESS, "clGetContextInfo failed."))
		return CL_FAILURE;

	/* Now allocate memory for device list based on the size we got earlier */
	devices = (cl_device_id*)malloc(deviceListSize);
	if(devices==NULL)
	{
		cout << "Failed to allocate memory (devices)." << endl;
		return CL_FAILURE;
	}

	/* Now, get the device list data */
	status = clGetContextInfo( context,  CL_CONTEXT_DEVICES,  deviceListSize,  devices,  NULL);
	if(!checkVal( status, CL_SUCCESS,  "clGetContextInfo failed."))
		return CL_FAILURE;

	/* Create command queue */
	commandQueue = clCreateCommandQueue( context, devices[0], 0, &status);
	if(!checkVal( status, CL_SUCCESS, "clCreateCommandQueue failed."))
		return CL_FAILURE;

	/* Get Device specific Information */
	status = clGetDeviceInfo( devices[0], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), (void*)&maxWorkGroupSize, NULL);

	if(!checkVal( status, CL_SUCCESS,  "clGetDeviceInfo CL_DEVICE_MAX_WORK_GROUP_SIZE failed."))
		return CL_FAILURE;


	status = clGetDeviceInfo( devices[0], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint), (void*)&maxDimensions, NULL);
	if(!checkVal( status, CL_SUCCESS,  "clGetDeviceInfo CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS failed."))
		return CL_FAILURE;


	maxWorkItemSizes = (size_t *)malloc(maxDimensions * sizeof(unsigned int));

	status = clGetDeviceInfo( devices[0], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * maxDimensions, (void*)maxWorkItemSizes, NULL);
	if(!checkVal( status, CL_SUCCESS,  "clGetDeviceInfo CL_DEVICE_MAX_WORK_ITEM_SIZES failed."))
		return CL_FAILURE;

	status = clGetDeviceInfo( devices[0], CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), (void *)&totalLocalMemory, NULL);
	if(!checkVal( status, CL_SUCCESS,  "clGetDeviceInfo CL_DEVICE_LOCAL_MEM_SIZE failed."))
		return CL_FAILURE;

	/*
	* Create and initialize memory objects
	*/

	/* create a CL program using the kernel source */
	string content;
	fileH.open( "critterding.cl", content ); 

	const char * source = content.c_str();
	size_t sourceSize[] = { strlen(source) };
	program = clCreateProgramWithSource( context, 1, &source, sourceSize, &status);
	if(!checkVal( status, CL_SUCCESS, "clCreateProgramWithSource failed."))
		return CL_FAILURE;

	/* create a cl program executable for all the devices specified */
	status = clBuildProgram( program, 1, &devices[0], NULL, NULL, NULL);
	if(status != CL_SUCCESS)
	{
		if(status == CL_BUILD_PROGRAM_FAILURE)
		{
			cl_int logStatus;
			char * buildLog = NULL;
			size_t buildLogSize = 0;
			logStatus = clGetProgramBuildInfo (program, devices[0], CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, &buildLogSize);
			if(!checkVal( logStatus, CL_SUCCESS, "clGetProgramBuildInfo failed."))
				return CL_FAILURE;

			buildLog = (char*)malloc(buildLogSize);
			if(buildLog == NULL)
			{
				cout << "Failed to allocate host memory. (buildLog)" << endl;
				return CL_FAILURE;
			}
			memset(buildLog, 0, buildLogSize);

			logStatus = clGetProgramBuildInfo (program, devices[0], CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, NULL);
			if(!checkVal( logStatus, CL_SUCCESS, "clGetProgramBuildInfo failed."))
			{
				free(buildLog);
				return CL_FAILURE;
			}

			std::cout << " \n\t\t\tBUILD LOG\n";
			std::cout << " ************************************************\n";
			std::cout << buildLog << std::endl;
			std::cout << " ************************************************\n";
			free(buildLog);
		}

		if(!checkVal( status, CL_SUCCESS, "clBuildProgram failed."))
			return CL_FAILURE;
	}

	return CL_SUCCESS;
}
void buildOpenCLKernels_update_halo_kernel2_yvel_plus_2_left(int xdim0, int ydim0, int xdim1, int ydim1) {

  //int ocl_fma = OCL_FMA;
  if(!isbuilt_update_halo_kernel2_yvel_plus_2_left) {
    buildOpenCLKernels();
    //clSafeCall( clUnloadCompiler() );
    cl_int ret;
    char* source_filename[1] = {"./OpenCL/update_halo_kernel2_yvel_plus_2_left.cl"};

    // Load the kernel source code into the array source_str
    FILE *fid;
    char *source_str[1];
    size_t source_size[1];

    for(int i=0; i<1; i++) {
      fid = fopen(source_filename[i], "r");
      if (!fid) {
        fprintf(stderr, "Can't open the kernel source file!\n");
        exit(1);
      }

      source_str[i] = (char*)malloc(4*0x1000000);
      source_size[i] = fread(source_str[i], 1, 4*0x1000000, fid);
      if(source_size[i] != 4*0x1000000) {
        if (ferror(fid)) {
          printf ("Error while reading kernel source file %s\n", source_filename[i]);
          exit(-1);
        }
        if (feof(fid))
          printf ("Kernel source file %s succesfuly read.\n", source_filename[i]);
          //printf("%s\n",source_str[i]);
      }
      fclose(fid);
    }

    printf("Compiling update_halo_kernel2_yvel_plus_2_left %d source -- start \n",OCL_FMA);

      // Create a program from the source
      OPS_opencl_core.program = clCreateProgramWithSource(OPS_opencl_core.context, 1, (const char **) &source_str, (const size_t *) &source_size, &ret);
      clSafeCall( ret );

      // Build the program
      char buildOpts[255*3];
      char* pPath = NULL;
      pPath = getenv ("OPS_INSTALL_PATH");
      if (pPath!=NULL)
        if(OCL_FMA)
          sprintf(buildOpts,"-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d  -Dxdim0_update_halo_kernel2_yvel_plus_2_left=%d  -Dydim0_update_halo_kernel2_yvel_plus_2_left=%d  -Dxdim1_update_halo_kernel2_yvel_plus_2_left=%d  -Dydim1_update_halo_kernel2_yvel_plus_2_left=%d ", pPath, 32,xdim0,ydim0,xdim1,ydim1);
        else
          sprintf(buildOpts,"-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d  -Dxdim0_update_halo_kernel2_yvel_plus_2_left=%d  -Dydim0_update_halo_kernel2_yvel_plus_2_left=%d  -Dxdim1_update_halo_kernel2_yvel_plus_2_left=%d  -Dydim1_update_halo_kernel2_yvel_plus_2_left=%d ", pPath, 32,xdim0,ydim0,xdim1,ydim1);
      else {
        sprintf("Incorrect OPS_INSTALL_PATH %s\n",pPath);
        exit(EXIT_FAILURE);
      }

      ret = clBuildProgram(OPS_opencl_core.program, 1, &OPS_opencl_core.device_id, buildOpts, NULL, NULL);

      if(ret != CL_SUCCESS) {
        char* build_log;
        size_t log_size;
        clSafeCall( clGetProgramBuildInfo(OPS_opencl_core.program, OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size) );
        build_log = (char*) malloc(log_size+1);
        clSafeCall( clGetProgramBuildInfo(OPS_opencl_core.program, OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL) );
        build_log[log_size] = '\0';
        fprintf(stderr, "=============== OpenCL Program Build Info ================\n\n%s", build_log);
        fprintf(stderr, "\n========================================================= \n");
        free(build_log);
        exit(EXIT_FAILURE);
      }
      printf("compiling update_halo_kernel2_yvel_plus_2_left -- done\n");

    // Create the OpenCL kernel
    OPS_opencl_core.kernel[70] = clCreateKernel(OPS_opencl_core.program, "ops_update_halo_kernel2_yvel_plus_2_left", &ret);
    clSafeCall( ret );

    isbuilt_update_halo_kernel2_yvel_plus_2_left = true;
  }

}
int 
MemoryOptimizations::genBinaryImage()
{
    cl_int status = CL_SUCCESS;

    /*
     * Have a look at the available platforms and pick either
     * the AMD one if available or a reasonable default.
     */
    cl_uint numPlatforms;
    cl_platform_id platform = NULL;
    status = clGetPlatformIDs(0, NULL, &numPlatforms);
    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS,
                               "clGetPlatformIDs failed."))
    {
        return SDK_FAILURE;
    }
    if (0 < numPlatforms) 
    {
        cl_platform_id* platforms = new cl_platform_id[numPlatforms];
        status = clGetPlatformIDs(numPlatforms, platforms, NULL);
        if(!sampleCommon->checkVal(status,
                                   CL_SUCCESS,
                                   "clGetPlatformIDs failed."))
        {
            return SDK_FAILURE;
        }

        char platformName[100];
        for (unsigned i = 0; i < numPlatforms; ++i) 
        {
            status = clGetPlatformInfo(platforms[i],
                                       CL_PLATFORM_VENDOR,
                                       sizeof(platformName),
                                       platformName,
                                       NULL);

            if(!sampleCommon->checkVal(status,
                                       CL_SUCCESS,
                                       "clGetPlatformInfo failed."))
            {
                return SDK_FAILURE;
            }

            platform = platforms[i];
            if (!strcmp(platformName, "Advanced Micro Devices, Inc.")) 
            {
                break;
            }
        }
        std::cout << "Platform found : " << platformName << "\n";
        delete[] platforms;
    }

    if(NULL == platform)
    {
        sampleCommon->error("NULL platform found so Exiting Application.");
        return SDK_FAILURE;
    }

    /*
     * If we could find our platform, use it. Otherwise use just available platform.
     */
    cl_context_properties cps[5] = 
    {
        CL_CONTEXT_PLATFORM, 
        (cl_context_properties)platform, 
        CL_CONTEXT_OFFLINE_DEVICES_AMD,
        (cl_context_properties)1,
        0
    };

    context = clCreateContextFromType(cps,
                                      CL_DEVICE_TYPE_ALL,
                                      NULL,
                                      NULL,
                                      &status);

    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS,
                               "clCreateContextFromType failed."))
    {
        return SDK_FAILURE;
    }

    /* create a CL program using the kernel source */
    streamsdk::SDKFile kernelFile;
    std::string kernelPath = sampleCommon->getPath();
    kernelPath.append("MemoryOptimizations_Kernels.cl");
    if(!kernelFile.open(kernelPath.c_str()))
    {
        std::cout << "Failed to load kernel file : " << kernelPath << std::endl;
        return SDK_FAILURE;
    }
    const char * source = kernelFile.source().c_str();
    size_t sourceSize[] = {strlen(source)};
    program = clCreateProgramWithSource(context,
                                        1,
                                        &source,
                                        sourceSize,
                                        &status);
    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS,
                               "clCreateProgramWithSource failed."))
    {
        return SDK_FAILURE;
    }
    
    /* create a cl program executable for all the devices specified */
    status = clBuildProgram(program,
                            0,
                            NULL,
                            NULL,
                            NULL,
                            NULL);

    size_t numDevices;
    status = clGetProgramInfo(program, 
                           CL_PROGRAM_NUM_DEVICES,
                           sizeof(numDevices),
                           &numDevices,
                           NULL );
    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS,
                               "clGetProgramInfo(CL_PROGRAM_NUM_DEVICES) failed."))
    {
        return SDK_FAILURE;
    }

    std::cout << "Number of devices found : " << numDevices << "\n\n";
    devices = (cl_device_id *)malloc( sizeof(cl_device_id) * numDevices );
    if(devices == NULL)
    {
        sampleCommon->error("Failed to allocate host memory.(devices)");
        return SDK_FAILURE;
    }
    /* grab the handles to all of the devices in the program. */
    status = clGetProgramInfo(program, 
                              CL_PROGRAM_DEVICES, 
                              sizeof(cl_device_id) * numDevices,
                              devices,
                              NULL );
    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS,
                               "clGetProgramInfo(CL_PROGRAM_DEVICES) failed."))
    {
        return SDK_FAILURE;
    }


    /* figure out the sizes of each of the binaries. */
    size_t *binarySizes = (size_t*)malloc( sizeof(size_t) * numDevices );
    if(devices == NULL)
    {
        sampleCommon->error("Failed to allocate host memory.(binarySizes)");
        return SDK_FAILURE;
    }
    
    status = clGetProgramInfo(program, 
                              CL_PROGRAM_BINARY_SIZES,
                              sizeof(size_t) * numDevices, 
                              binarySizes, NULL);
    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS,
                               "clGetProgramInfo(CL_PROGRAM_BINARY_SIZES) failed."))
    {
        return SDK_FAILURE;
    }

    size_t i = 0;
    /* copy over all of the generated binaries. */
    char **binaries = (char **)malloc( sizeof(char *) * numDevices );
    if(binaries == NULL)
    {
        sampleCommon->error("Failed to allocate host memory.(binaries)");
        return SDK_FAILURE;
    }

    for(i = 0; i < numDevices; i++)
    {
        if(binarySizes[i] != 0)
        {
            binaries[i] = (char *)malloc( sizeof(char) * binarySizes[i]);
            if(binaries[i] == NULL)
            {
                sampleCommon->error("Failed to allocate host memory.(binaries[i])");
                return SDK_FAILURE;
            }
        }
        else
        {
            binaries[i] = NULL;
        }
    }
    status = clGetProgramInfo(program, 
                              CL_PROGRAM_BINARIES,
                              sizeof(char *) * numDevices, 
                              binaries, 
                              NULL);
    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS,
                               "clGetProgramInfo(CL_PROGRAM_BINARIES) failed."))
    {
        return SDK_FAILURE;
    }

    /* dump out each binary into its own separate file. */
    for(i = 0; i < numDevices; i++)
    {
        char fileName[100];
        sprintf(fileName, "%s.%d", dumpBinary.c_str(), (int)i);
        if(binarySizes[i] != 0)
        {
            char deviceName[1024];
            status = clGetDeviceInfo(devices[i], 
                                     CL_DEVICE_NAME, 
                                     sizeof(deviceName),
                                     deviceName, 
                                     NULL);
            if(!sampleCommon->checkVal(status,
                                       CL_SUCCESS,
                                       "clGetDeviceInfo(CL_DEVICE_NAME) failed."))
            {
                return SDK_FAILURE;
            }

            printf( "%s binary kernel: %s\n", deviceName, fileName);
            streamsdk::SDKFile BinaryFile;
            if(!BinaryFile.writeBinaryToFile(fileName, 
                                             binaries[i], 
                                             binarySizes[i]))
            {
                std::cout << "Failed to load kernel file : " << fileName << std::endl;
                return SDK_FAILURE;
            }
        }
        else
        {
            printf("Skipping %s since there is no binary data to write!\n",
                    fileName);
        }
    }

    // Release all resouces and memory
    for(i = 0; i < numDevices; i++)
    {
        if(binaries[i] != NULL)
        {
            free(binaries[i]);
            binaries[i] = NULL;
        }
    }

    if(binaries != NULL)
    {
        free(binaries);
        binaries = NULL;
    }

    if(binarySizes != NULL)
    {
        free(binarySizes);
        binarySizes = NULL;
    }

    if(devices != NULL)
    {
        free(devices);
        devices = NULL;
    }

    status = clReleaseProgram(program);
    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS,
                               "clReleaseProgram failed."))
    {
        return SDK_FAILURE;
    }

    status = clReleaseContext(context);
    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS,
                               "clReleaseContext failed."))
    {
        return SDK_FAILURE;
    }

    return SDK_SUCCESS;
}
Beispiel #5
0
cl_kernel load_kernel (char * name, char * kernel_name, cl_device_id dev_id, cl_context context, int * error)
{
	cl_kernel kernel;

	FILE * fp = fopen(name, "r");

	if(fp==NULL)
	{
		*error = 1;
		return NULL;
	}

	char * source;
	size_t size;

	fseek(fp, 0, SEEK_END);
	size = ftell(fp);
	fseek(fp, 0, SEEK_SET);
		
	source = malloc((size+1)*sizeof(char));
	
	size = fread(source, 1, size, fp);

	fclose(fp);
	
	source[size] = '\0';

	cl_int err;

	cl_program program = clCreateProgramWithSource (context, 1, (const char **) &source, &size, &err);

	if(err!=CL_SUCCESS)
	{
		*error = 1;
		return NULL;
	}

	clBuildProgram(program, 0, NULL, NULL, NULL, NULL);

	cl_build_status status;

	clGetProgramBuildInfo(program, dev_id, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL);

	if(status!=CL_BUILD_SUCCESS)
	{
		*error = 1;
		return NULL;
	}

	kernel = clCreateKernel(program, kernel_name , &err);

	if(err!=CL_SUCCESS)
	{
		*error = 1;
		return NULL;
	}

	*error = 0;
        free ( source );
	clReleaseProgram(program);
	return kernel;
}
Beispiel #6
0
static int build_program_from_file(const char *filename,
                                   const char *options,
                                   cl_context context,
                                   cl_device_id device,
                                   cl_program *program_out,
                                   cl_int *err)
{
    cl_int _err;
    FILE *file;
    char *program_source = NULL;
    size_t program_source_size;
    cl_program program = NULL;
    char *build_log = NULL;

    assert(filename != NULL);
    assert(program_out != NULL);

    if (!err) err = &_err;

    file = fopen(filename, "r");
    if (!file) {
        ERROR("Couldn't open file \"%s\"", filename);
        goto error;
    }

    if (fseek(file, 0L, SEEK_END)) {
        ERROR("Cannot determine file size of \"%s\"", filename);
        goto error;
    }
    program_source_size = ftell(file);
    if (fseek(file, 0L, SEEK_SET)) {
        ERROR("Cannot determine file size of \"%s\"", filename);
        goto error;
    }

    program_source = malloc(sizeof(*program_source) * (program_source_size + 1));
    CHECK_ALLOCATION(program_source);

    if (fread(program_source, 1, program_source_size, file) != program_source_size) {
        ERROR("Failed to read file \"%s\"", filename);
        goto error;
    }
    program_source[program_source_size] = '\0';

    fclose(file);

    program = clCreateProgramWithSource(context, 1, (const char **)&program_source, NULL, err);
    CHECK_CL_ERROR(*err);

    *err = clBuildProgram(program, 0, NULL, options, NULL, NULL);
    if (*err == CL_BUILD_PROGRAM_FAILURE) {
        size_t build_log_size;

        *err = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &build_log_size);
        CHECK_CL_ERROR(*err);

        build_log = malloc(sizeof(*build_log) * build_log_size);
        CHECK_ALLOCATION(build_log);

        *err = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, build_log_size, build_log, NULL);
        CHECK_CL_ERROR(*err);

        if (options)
            ERROR("Failed to build program in file \"%s\" with options \"%s\"", filename, options);
        else
            ERROR("Failed to build program in file \"%s\"", filename);

        debug_printf("================================== BUILD LOG ===================================\n"
                     "%s", NULL, 0, LOGGING_MSG_ERROR, build_log);
        goto error;
    }
    CHECK_CL_ERROR(*err);

    *program_out = program;
    return 0;
error:
    free(build_log);
    if (program)
        clReleaseProgram(program);
    *program_out = NULL;
    free(program_source);
    return -1;
}
Beispiel #7
0
int main(int argc, char **argv)
{
    int	start,end;
    unsigned long	p[64], c[64], k[56];
    unsigned long	res;

    build_samples (p, c, k, 0);
    set_low_keys(k);

    cl_platform_id cpPlatform;
    clGetPlatformIDs(1, &cpPlatform, NULL);

    cl_device_id cdDevice;
    clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL);

    char cBuffer[1024];
    clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(cBuffer), &cBuffer, NULL);
    printf("CL_DEVICE_NAME:\t\t%s\n", cBuffer);
    clGetDeviceInfo(cdDevice, CL_DRIVER_VERSION, sizeof(cBuffer), &cBuffer, NULL);
    printf("CL_DRIVER_VERSION:\t%s\n\n", cBuffer);
    cl_uint compute_units;
    clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compute_units), &compute_units, NULL);
    printf("CL_DEVICE_MAX_COMPUTE_UNITS:\t%u\n", compute_units);
    size_t workitem_dims;
    clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(workitem_dims), &workitem_dims, NULL);
    printf("CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS:\t%u\n", workitem_dims);
    size_t workitem_size[3];
    clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(workitem_size), &workitem_size, NULL);
    printf("CL_DEVICE_MAX_WORK_ITEM_SIZES:\t%u / %u / %u \n", workitem_size[0], workitem_size[1], workitem_size[2]);
    size_t workgroup_size;
    clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(workgroup_size), &workgroup_size, NULL);
    printf("CL_DEVICE_MAX_WORK_GROUP_SIZE:\t%u\n", workgroup_size);
    cl_uint clock_frequency;
    clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(clock_frequency), &clock_frequency, NULL);
    printf("CL_DEVICE_MAX_CLOCK_FREQUENCY:\t%u MHz\n", clock_frequency);

    cl_context GPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, NULL);

    cl_command_queue cqCommandQueue = clCreateCommandQueue(GPUContext, cdDevice, 0, NULL);

    cl_mem GPUVector1 = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY |
                                       CL_MEM_USE_HOST_PTR, sizeof(unsigned long) * 64, p, NULL);
    cl_mem GPUVector2 = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY |
                                       CL_MEM_USE_HOST_PTR, sizeof(unsigned long) * 64, c, NULL);
    cl_mem GPUVector3 = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY |
                                       CL_MEM_USE_HOST_PTR, sizeof(unsigned long) * 56, k, NULL);

    cl_mem GPUOutputVector = clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR,
                                            sizeof(unsigned long), &res, NULL);

    size_t szKernelLength;
    char* cSourceCL = oclLoadProgSource("ocl_deseval.cl", "", &szKernelLength);
    cl_program OpenCLProgram = clCreateProgramWithSource(GPUContext, 1,
                               (const char **)&cSourceCL, &szKernelLength, NULL);

    if (clBuildProgram(OpenCLProgram, 0, NULL, NULL, NULL, NULL)!=CL_SUCCESS)
    {
        char cBuffer[2048];
        if(clGetProgramBuildInfo(OpenCLProgram,cdDevice,CL_PROGRAM_BUILD_LOG,sizeof(cBuffer),cBuffer,NULL)==CL_SUCCESS);
        printf("Build error:\n%s\n",cBuffer);
        exit(1);
    }
    cl_kernel OpenCLVectorAdd = clCreateKernel(OpenCLProgram, "keysearch", NULL);

    clSetKernelArg(OpenCLVectorAdd, 0, sizeof(cl_mem), (void*)&GPUOutputVector);
    clSetKernelArg(OpenCLVectorAdd, 1, sizeof(cl_mem), (void*)&GPUVector1);
    clSetKernelArg(OpenCLVectorAdd, 2, sizeof(cl_mem), (void*)&GPUVector2);
    clSetKernelArg(OpenCLVectorAdd, 3, sizeof(cl_mem), (void*)&GPUVector3);

    size_t WorkSize[1] = {1024};
    start=clock();
    for (int i=0; i<1024; i++) {
        //clEnqueueWriteBuffer(cqCommandQueue, GPUOutputVector, CL_TRUE, 0,
        //									56 * sizeof(unsigned long), k, 0, NULL, NULL);
        clEnqueueNDRangeKernel(cqCommandQueue, OpenCLVectorAdd, 1, NULL,
                               WorkSize, NULL, 0, NULL, NULL);
        //clEnqueueReadBuffer(cqCommandQueue, GPUOutputVector, CL_TRUE, 0,
        //									sizeof(unsigned long), &res, 0, NULL, NULL);
        if(res!=0) {
            printf("Key found\n");
            //key_found(res,k);
            break;
        }
        increment_key (k);
    }
    end=clock();

    clReleaseKernel(OpenCLVectorAdd);
    clReleaseProgram(OpenCLProgram);
    clReleaseCommandQueue(cqCommandQueue);
    clReleaseContext(GPUContext);
    clReleaseMemObject(GPUVector1);
    clReleaseMemObject(GPUVector2);
    clReleaseMemObject(GPUOutputVector);

    printf ("Searched %i keys in %.3f seconds\n", 1000000, ((double)(end-start))/CLOCKS_PER_SEC);
    return 0;
}
Beispiel #8
0
double gpu_cgm_image(uint32_t* aList, uint32_t* bList, int aLength,
		int bLength, int keyLength, uint32_t** matches, char* clFile, int x,
		int y) {
	int gap = 0, myoffset = 0;
	cl_platform_id *platforms;
	cl_uint num_platforms = 0;
	cl_device_id *devices;
	cl_uint num_devices = 0;
	cl_context context;
	cl_command_queue command_queue;
	cl_image_format imgFormat;
	cl_mem aImg;
	cl_mem bImg;
	cl_mem res_buf;
	cl_program program;
	cl_kernel kernel;
	cl_uint *results;
	FILE *prgm_fptr;
	struct stat prgm_sbuf;
	char *prgm_data;
	size_t prgm_size;
	size_t offset;
	size_t count;
	const size_t global_work_size[] = { x, y };
	const size_t origin[] = { 0, 0, 0 };
	const size_t region[] = { aLength, 1, 1 };

	cl_int ret;
	cl_uint i;

	cl_bool imageSupport;

	struct timeval t1, t2;
	double elapsedTime;

	results = malloc(sizeof(cl_uint) * aLength);

	imgFormat.image_channel_order = CL_RGBA;
	imgFormat.image_channel_data_type = CL_UNSIGNED_INT32;

	/* figure out how many CL platforms are available */
	ret = clGetPlatformIDs(0, NULL, &num_platforms);
	if (CL_SUCCESS != ret) {
		print_error ("Error getting the number of platform IDs: %d", ret);
		exit(EXIT_FAILURE);
	}

	if (0 == num_platforms) {
		print_error ("No CL platforms were found.");
		exit(EXIT_FAILURE);
	}

	/* allocate space for each available platform ID */
	if (NULL == (platforms = malloc((sizeof *platforms) * num_platforms))) {
		print_error ("Out of memory");
		exit(EXIT_FAILURE);
	}

	/* get all of the platform IDs */
	ret = clGetPlatformIDs(num_platforms, platforms, NULL);
	if (CL_SUCCESS != ret) {
		print_error ("Error getting platform IDs: %d", ret);
		exit(EXIT_FAILURE);
	}

	/* find a platform that supports given device type */
	//	print_error ("Number of platforms found: %d", num_platforms);
	for (i = 0; i < num_platforms; i++) {
		ret = clGetDeviceIDs(platforms[i], getDeviceType(), 0, NULL,
				&num_devices);
		if (CL_SUCCESS != ret)
			continue;

		if (0 < num_devices)
			break;
	}

	/* make sure at least one device was found */
	if (num_devices == 0) {
		print_error ("No CL device found that supports device type: %s.", ((getDeviceType() == CL_DEVICE_TYPE_CPU) ? "CPU" : "GPU"));
		exit(EXIT_FAILURE);
	}

	/* only one device is necessary... */
	num_devices = 1;
	if (NULL == (devices = malloc((sizeof *devices) * num_devices))) {
		print_error ("Out of memory");
		exit(EXIT_FAILURE);
	}

	/* get one device id */
	ret = clGetDeviceIDs(platforms[i], getDeviceType(), num_devices,
			devices, NULL);
	if (CL_SUCCESS != ret) {
		print_error ("Error getting device IDs: %d", ret);
		exit(EXIT_FAILURE);
	}

	ret = clGetDeviceInfo(*devices, CL_DEVICE_IMAGE_SUPPORT, sizeof(cl_bool), &imageSupport, NULL);
	if (CL_SUCCESS != ret) {
			print_error ("Failed to get Device Info: %d", ret);
			exit(EXIT_FAILURE);
		}

	if(imageSupport == CL_FALSE)
	{
		print_error ("Failure: Images are not supported!");
				exit(EXIT_FAILURE);
	}

	/* create a context for the CPU device that was found earlier */
	context = clCreateContext(NULL, num_devices, devices, NULL, NULL, &ret);
	if (NULL == context || CL_SUCCESS != ret) {
		print_error ("Failed to create context: %d", ret);
		exit(EXIT_FAILURE);
	}

	/* create a command queue for the CPU device */
	command_queue = clCreateCommandQueue(context, devices[0], 0, &ret);
	if (NULL == command_queue || CL_SUCCESS != ret) {
		print_error ("Failed to create a command queue: %d", ret);
		exit(EXIT_FAILURE);
	}

	/* create buffers on the CL device */
	aImg = clCreateImage2D(context, CL_MEM_READ_ONLY, &imgFormat, aLength, 1, 0, NULL, &ret);
	if (NULL == aImg || CL_SUCCESS != ret) {
		print_error ("Failed to create a image: %d", ret);
		exit(EXIT_FAILURE);
	}

	bImg = clCreateImage2D(context, CL_MEM_READ_ONLY, &imgFormat, aLength, 1, 0, NULL, &ret);
	if (NULL == bImg || CL_SUCCESS != ret) {
		print_error ("Failed to create b image: %d", ret);
		exit(EXIT_FAILURE);
	}

	int res_bufSize = aLength;

	res_buf = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_uint)
			* res_bufSize, NULL, &ret);
	if (NULL == res_buf || CL_SUCCESS != ret) {
		print_error ("Failed to create b buffer: %d", ret);
		exit(EXIT_FAILURE);
	}

	/* read the opencl program code into a string */
	prgm_fptr = fopen(clFile, "r");
	if (NULL == prgm_fptr) {
		print_error ("%s", strerror (errno));
		exit(EXIT_FAILURE);
	}

	if (0 != stat(clFile, &prgm_sbuf)) {
		print_error ("%s", strerror (errno));
		exit(EXIT_FAILURE);
	}
	prgm_size = prgm_sbuf.st_size;

	prgm_data = malloc(prgm_size);
	if (NULL == prgm_data) {
		print_error ("Out of memory");
		exit(EXIT_FAILURE);
	}

	/* make sure all data is read from the file (just in case fread returns
	 * short) */
	offset = 0;
	while (prgm_size - offset != (count = fread(prgm_data + offset, 1,
			prgm_size - offset, prgm_fptr)))
		offset += count;

	if (0 != fclose(prgm_fptr)) {
		print_error ("%s", strerror (errno));
		exit(EXIT_FAILURE);
	}

	/* create a 'program' from the source */
	program = clCreateProgramWithSource(context, 1, (const char **) &prgm_data,
			&prgm_size, &ret);
	if (NULL == program || CL_SUCCESS != ret) {
		print_error ("Failed to create program with source: %d", ret);
		exit(EXIT_FAILURE);
	}

	/* compile the program.. (it uses llvm or something) */
	ret = clBuildProgram(program, num_devices, devices, NULL, NULL, NULL);
	if (CL_SUCCESS != ret) {
		size_t size;
		char *log = calloc(1, 4000);
		if (NULL == log) {
			print_error ("Out of memory");
			exit(EXIT_FAILURE);
		}

		print_error ("Failed to build program: %d", ret);
		ret = clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG,
				4096, log, &size);
		if (CL_SUCCESS != ret) {
			print_error ("Failed to get program build info: %d", ret);
			exit(EXIT_FAILURE);
		}

		fprintf(stderr, "Begin log:\n%s\nEnd log.\n", log);
		exit(EXIT_FAILURE);
	}

	/* pull out a reference to your kernel */
	kernel = clCreateKernel(program, "cgm_kernel", &ret);
	if (NULL == kernel || CL_SUCCESS != ret) {
		print_error ("Failed to create kernel: %d", ret);
		exit(EXIT_FAILURE);
	}

	gettimeofday(&t1, NULL);

	/* write data to these buffers */
	clEnqueueWriteImage(command_queue, aImg, CL_FALSE, origin, region, 0, 0,
			(void*) aImg, 0, NULL, NULL);
	clEnqueueWriteImage(command_queue, bImg, CL_FALSE, origin, region, 0, 0,
			(void*) bImg, 0, NULL, NULL);

	/* set your kernel's arguments */
	ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), &aImg);
	if (CL_SUCCESS != ret) {
		print_error ("Failed to set kernel argument: %d", ret);
		exit(EXIT_FAILURE);
	}
	ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), &bImg);
	if (CL_SUCCESS != ret) {
		print_error ("Failed to set kernel argument: %d", ret);
		exit(EXIT_FAILURE);
	}

	ret = clSetKernelArg(kernel, 4, sizeof(int), &gap);
	if (CL_SUCCESS != ret) {
		print_error ("Failed to set kernel argument: %d", ret);
		exit(EXIT_FAILURE);
	}
	ret = clSetKernelArg(kernel, 5, sizeof(int), &myoffset);
	if (CL_SUCCESS != ret) {
		print_error ("Failed to set kernel argument: %d", ret);
		exit(EXIT_FAILURE);
	}

	ret = clSetKernelArg(kernel, 6, sizeof(int), &keyLength);
	if (CL_SUCCESS != ret) {
		print_error ("Failed to set kernel argument: %d", ret);
		exit(EXIT_FAILURE);
	}
	ret = clSetKernelArg(kernel, 7, sizeof(cl_mem), &res_buf);
	if (CL_SUCCESS != ret) {
		print_error ("Failed to set kernel argument: %d", ret);
		exit(EXIT_FAILURE);
	}

	/* make sure buffers have been written before executing */
	ret = clEnqueueBarrier(command_queue);
	if (CL_SUCCESS != ret) {
		print_error ("Failed to enqueue barrier: %d", ret);
		exit(EXIT_FAILURE);
	}

	/* enque this kernel for execution... */
	ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL,
			global_work_size, NULL, 0, NULL, NULL);
	if (CL_SUCCESS != ret) {
		print_error ("Failed to enqueue kernel: %d", ret);
		exit(EXIT_FAILURE);
	}

	/* wait for the kernel to finish executing */
	ret = clEnqueueBarrier(command_queue);
	if (CL_SUCCESS != ret) {
		print_error ("Failed to enqueue barrier: %d", ret);
		exit(EXIT_FAILURE);
	}

	/* copy the contents of dev_buf from the CL device to the host (CPU) */
	ret = clEnqueueReadBuffer(command_queue, res_buf, true, 0, sizeof(cl_uint)
			* aLength, results, 0, NULL, NULL);

	gettimeofday(&t2, NULL);
	elapsedTime = (t2.tv_sec - t1.tv_sec) * 1000.0; // sec to ms
	elapsedTime += (t2.tv_usec - t1.tv_usec) / 1000.0; // us to ms

	if (CL_SUCCESS != ret) {
		print_error ("Failed to copy data from device to host: %d", ret);
		exit(EXIT_FAILURE);
	}

	ret = clEnqueueBarrier(command_queue);
	if (CL_SUCCESS != ret) {
		print_error ("Failed to enqueue barrier: %d", ret);
		exit(EXIT_FAILURE);
	}

	/* make sure the content of the buffer are what we expect */
	//for (i = 0; i < aLength; i++)
	//	printf("%d\n", results[i]);

	/* free up resources */
	ret = clReleaseKernel(kernel);
	if (CL_SUCCESS != ret) {
		print_error ("Failed to release kernel: %d", ret);
		exit(EXIT_FAILURE);
	}

	ret = clReleaseProgram(program);
	if (CL_SUCCESS != ret) {
		print_error ("Failed to release program: %d", ret);
		exit(EXIT_FAILURE);
	}

	ret = clReleaseMemObject(aImg);
	if (CL_SUCCESS != ret) {
		print_error ("Failed to release memory object: %d", ret);
		exit(EXIT_FAILURE);
	}
	ret = clReleaseMemObject(bImg);
	if (CL_SUCCESS != ret) {
		print_error ("Failed to release memory object: %d", ret);
		exit(EXIT_FAILURE);
	}

	ret = clReleaseMemObject(res_buf);
	if (CL_SUCCESS != ret) {
		print_error ("Failed to release memory object: %d", ret);
		exit(EXIT_FAILURE);
	}

	if (CL_SUCCESS != (ret = clReleaseCommandQueue(command_queue))) {
		print_error ("Failed to release command queue: %d", ret);
		exit(EXIT_FAILURE);
	}

	if (CL_SUCCESS != (ret = clReleaseContext(context))) {
		print_error ("Failed to release context: %d", ret);
		exit(EXIT_FAILURE);
	}

	matches = &results;
	return elapsedTime;
}
Beispiel #9
0
///
//	main() for Convoloution example
//
int main(int argc, char** argv)
{
    cl_int errNum;
    cl_uint numPlatforms;
	cl_uint numDevices;
    cl_platform_id * platformIDs;
	cl_device_id * deviceIDs;
    cl_context context = NULL;
	cl_command_queue queue;
	cl_program program;
	cl_kernel kernel;
	cl_mem inputSignalBuffer;
	cl_mem outputSignalBuffer;
	cl_mem maskBuffer;

    // First, select an OpenCL platform to run on.  
	errNum = clGetPlatformIDs(0, NULL, &numPlatforms);
	checkErr( 
		(errNum != CL_SUCCESS) ? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS), 
		"clGetPlatformIDs"); 
 
	platformIDs = (cl_platform_id *)alloca(
       		sizeof(cl_platform_id) * numPlatforms);

    errNum = clGetPlatformIDs(numPlatforms, platformIDs, NULL);
    checkErr( 
	   (errNum != CL_SUCCESS) ? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS), 
	   "clGetPlatformIDs");

	// Iterate through the list of platforms until we find one that supports
	// a CPU device, otherwise fail with an error.
	deviceIDs = NULL;
	cl_uint i;
	for (i = 0; i < numPlatforms; i++)
	{
		errNum = clGetDeviceIDs(
            platformIDs[i], 
            CL_DEVICE_TYPE_CPU, 
            0,
            NULL,
            &numDevices);
		if (errNum != CL_SUCCESS && errNum != CL_DEVICE_NOT_FOUND)
	    {
			checkErr(errNum, "clGetDeviceIDs");
        }
	    else if (numDevices > 0) 
		{
		   	deviceIDs = (cl_device_id *)alloca(sizeof(cl_device_id) * numDevices);
			errNum = clGetDeviceIDs(
				platformIDs[i],
				CL_DEVICE_TYPE_CPU,
				numDevices, 
				&deviceIDs[0], 
				NULL);
			checkErr(errNum, "clGetDeviceIDs");
			break;
	   }
	}

	// Check to see if we found at least one CPU device, otherwise return
	if (deviceIDs == NULL) {
		std::cout << "No CPU device found" << std::endl;
		exit(-1);
	}

    // Next, create an OpenCL context on the selected platform.  
    cl_context_properties contextProperties[] =
    {
        CL_CONTEXT_PLATFORM,
        (cl_context_properties)platformIDs[i],
        0
    };
    context = clCreateContext(
		contextProperties, 
		numDevices,
        deviceIDs, 
		&contextCallback,
		NULL, 
		&errNum);
	checkErr(errNum, "clCreateContext");

	std::ifstream srcFile("../convolution/Convolution.cl");
    checkErr(srcFile.is_open() ? CL_SUCCESS : -1, "reading Convolution.cl");

	std::string srcProg(
        std::istreambuf_iterator<char>(srcFile),
        (std::istreambuf_iterator<char>()));

	const char * src = srcProg.c_str();
	size_t length = srcProg.length();

	// Create program from source
	program = clCreateProgramWithSource(
		context, 
		1, 
		&src, 
		&length, 
		&errNum);
	checkErr(errNum, "clCreateProgramWithSource");

	// Build program
	errNum = clBuildProgram(
		program,
		numDevices,
		deviceIDs,
		NULL,
		NULL,
		NULL);
    if (errNum != CL_SUCCESS)
    {
        // Determine the reason for the error
        char buildLog[16384];
        clGetProgramBuildInfo(
			program, 
			deviceIDs[0], 
			CL_PROGRAM_BUILD_LOG,
            sizeof(buildLog), 
			buildLog, 
			NULL);

        std::cerr << "Error in kernel: " << std::endl;
        std::cerr << buildLog;
		checkErr(errNum, "clBuildProgram");
    }

	// Create kernel object
	kernel = clCreateKernel(
		program,
		"convolve",
		&errNum);
	checkErr(errNum, "clCreateKernel");

	// Now allocate buffers
	inputSignalBuffer = clCreateBuffer(
		context,
		CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
		sizeof(cl_uint) * inputSignalHeight * inputSignalWidth,
		static_cast<void *>(inputSignal),
		&errNum);
	checkErr(errNum, "clCreateBuffer(inputSignal)");

	maskBuffer = clCreateBuffer(
		context,
		CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
		sizeof(cl_uint) * maskHeight * maskWidth,
		static_cast<void *>(mask),
		&errNum);
	checkErr(errNum, "clCreateBuffer(mask)");

	outputSignalBuffer = clCreateBuffer(
		context,
		CL_MEM_WRITE_ONLY,
		sizeof(cl_uint) * outputSignalHeight * outputSignalWidth,
		NULL,
		&errNum);
	checkErr(errNum, "clCreateBuffer(outputSignal)");

	// Pick the first device and create command queue.
	queue = clCreateCommandQueue(
		context,
		deviceIDs[0],
		0,
		&errNum);
	checkErr(errNum, "clCreateCommandQueue");

    errNum  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputSignalBuffer);
	errNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &maskBuffer);
    errNum |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &outputSignalBuffer);
	errNum |= clSetKernelArg(kernel, 3, sizeof(cl_uint), &inputSignalWidth);
	errNum |= clSetKernelArg(kernel, 4, sizeof(cl_uint), &maskWidth);
	checkErr(errNum, "clSetKernelArg");

	const size_t globalWorkSize[1] = { outputSignalWidth * outputSignalHeight };
    const size_t localWorkSize[1]  = { 1 };

    // Queue the kernel up for execution across the array
    errNum = clEnqueueNDRangeKernel(
		queue, 
		kernel, 
		1, 
		NULL,
        globalWorkSize, 
		localWorkSize,
        0, 
		NULL, 
		NULL);
	checkErr(errNum, "clEnqueueNDRangeKernel");
    
	errNum = clEnqueueReadBuffer(
		queue, 
		outputSignalBuffer, 
		CL_TRUE,
        0, 
		sizeof(cl_uint) * outputSignalHeight * outputSignalHeight, 
		outputSignal,
        0, 
		NULL, 
		NULL);
	checkErr(errNum, "clEnqueueReadBuffer");

    // Output the result buffer
    for (int y = 0; y < outputSignalHeight; y++)
	{
		for (int x = 0; x < outputSignalWidth; x++)
		{
			std::cout << outputSignal[x][y] << " ";
		}
		std::cout << std::endl;
	}

    std::cout << std::endl << "Executed program succesfully." << std::endl;

	return 0;
}
int main(int argc, char** argv) {
    /* OpenCL 1.1 data structures */
    cl_platform_id* platforms;
    cl_program program;
    cl_device_id device;
    cl_context context;
    cl_command_queue queue;
    cl_uint numOfPlatforms;
    cl_int  error;

    cl_mem matrixAMemObj; // input matrix A mem buffer
    cl_mem matrixBMemObj; // input matrix B mem buffer
    cl_mem matrixCMemObj; // input matrix C mem buffer
    cl_int* matrixA;      // input matrix A
    cl_int* matrixB;      // input matrix B
    cl_int* matrixC;      // input matrix C
    cl_uint widthA = WIDTH_G;
    cl_uint heightA = HEIGHT_G;
    cl_uint widthB = WIDTH_G;
    cl_uint heightB = HEIGHT_G;

	{
	    // allocate memory for input and output matrices 
        // based on whatever matrix theory i know.
	    matrixA = (cl_int*)malloc(widthA * heightA * sizeof(cl_int));
	    matrixB = (cl_int*)malloc(widthB * heightB * sizeof(cl_int));
	    matrixC = (cl_int*)malloc(widthB * heightA * sizeof(cl_int));

	    memset(matrixA, 0, widthA * heightA * sizeof(cl_int));
	    memset(matrixB, 0, widthB * heightB * sizeof(cl_int));
	    memset(matrixC, 0, widthB * heightA * sizeof(cl_int));
        
        fillRandom(matrixA, widthA, heightA, 643);
        fillRandom(matrixB, widthB, heightB, 991);
    }

    /*
     Get the number of platforms
     Remember that for each vendor's SDK installed on the computer,
     the number of available platform also increased.
     */
    error = clGetPlatformIDs(0, NULL, &numOfPlatforms);
    if(error != CL_SUCCESS) {
        perror("Unable to find any OpenCL platforms");
        exit(1);
    }
    
    platforms = (cl_platform_id*) alloca(sizeof(cl_platform_id) * numOfPlatforms);
    printf("Number of OpenCL platforms found: %d\n", numOfPlatforms);
    
    error = clGetPlatformIDs(numOfPlatforms, platforms, NULL);
    if(error != CL_SUCCESS) {
        perror("Unable to find any OpenCL platforms");
        exit(1);
    }
    // Search for a GPU device through the installed platforms
    // Build a OpenCL program and do not run it.
    for(cl_int i = 0; i < numOfPlatforms; i++ ) {
        // Get the GPU device
        error = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, 1, &device, NULL);

        if(error != CL_SUCCESS) {
            perror("Can't locate a OpenCL compliant device i.e. GPU");
            exit(1);
        }
        /* Create a context */
        context = clCreateContext(NULL, 1, &device, NULL, NULL, &error);
        if(error != CL_SUCCESS) {
            perror("Can't create a valid OpenCL context");
            exit(1);
        }
        
        /* Load the two source files into temporary datastores */
        const char *file_names[] = {"mmult.cl"};
        const int NUMBER_OF_FILES = 1;
        char* buffer[NUMBER_OF_FILES];
        size_t sizes[NUMBER_OF_FILES];
        loadProgramSource(file_names, NUMBER_OF_FILES, buffer, sizes);
        
        /* Create the OpenCL program object */
        program = clCreateProgramWithSource(context, NUMBER_OF_FILES, (const char**)buffer, sizes, &error);
	    if(error != CL_SUCCESS) {
            perror("Can't create the OpenCL program object");
            exit(1);
	    }
        /* Build OpenCL program object and dump the error message, if any */
        char *program_log;
        const char options[] = "";
        size_t log_size;

        error = clBuildProgram(program, 1, &device, options, NULL, NULL);
	    if(error != CL_SUCCESS) {
            // If there's an error whilst building the program, dump the log
            clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
            program_log = (char*) malloc(log_size+1);
            program_log[log_size] = '\0';
            clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
                                  log_size+1, program_log, NULL);
            printf("\n=== ERROR ===\n\n%s\n=============\n", program_log);
            free(program_log);
            exit(1);
	    }
       
        // Queue is created with profiling enabled 
        cl_command_queue_properties props;
        props |= CL_QUEUE_PROFILING_ENABLE;

        queue = clCreateCommandQueue(context, device, props, &error);

        cl_kernel kernel = clCreateKernel(program, "mmmult", &error);

        matrixAMemObj = clCreateBuffer(context,
                                       CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,
                                       widthA * heightA * sizeof(cl_int),
                                       matrixA,
                                       &error);

        matrixBMemObj = clCreateBuffer(context,
                                       CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,
                                       widthB * heightB * sizeof(cl_int),
                                       matrixB,
                                       &error);

        matrixCMemObj = clCreateBuffer(context,
                                       CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR,
                                       widthB * heightA * sizeof(cl_int),
                                       0,
                                       &error);

        clSetKernelArg(kernel, 0, sizeof(cl_int),(void*)&widthB);
        clSetKernelArg(kernel, 1, sizeof(cl_int),(void*)&heightA);
        clSetKernelArg(kernel, 2, sizeof(cl_mem),(void*)&matrixAMemObj);
        clSetKernelArg(kernel, 3, sizeof(cl_mem),(void*)&matrixBMemObj);
        clSetKernelArg(kernel, 4, sizeof(cl_mem),(void*)&matrixCMemObj);
         
        size_t globalThreads[] = {heightA};
        size_t localThreads[] = {256};
		cl_event exeEvt; 
        cl_ulong executionStart, executionEnd;
		error = clEnqueueNDRangeKernel(queue,
		                               kernel,
		                               1,
                                       NULL,
		                               globalThreads,
                                       localThreads,
                                       0,
                                       NULL,
                                       &exeEvt);
		clWaitForEvents(1, &exeEvt);
		if(error != CL_SUCCESS) {
			printf("Kernel execution failure!\n");
			exit(-22);
		}	

        // let's understand how long it took?
        clGetEventProfilingInfo(exeEvt, CL_PROFILING_COMMAND_START, sizeof(executionStart), &executionStart, NULL);
        clGetEventProfilingInfo(exeEvt, CL_PROFILING_COMMAND_END, sizeof(executionEnd), &executionEnd, NULL);
        clReleaseEvent(exeEvt);

        printf("Execution the matrix-matrix multiplication took %lu.%lu s\n", (executionEnd - executionStart)/1000000000, (executionEnd - executionStart)%1000000000);
        printf("Execution the matrix-matrix multiplication took %lu s\n", (executionEnd - executionStart));
 
        clEnqueueReadBuffer(queue,
                            matrixCMemObj,
                            CL_TRUE,
                            0,
                            widthB * heightA * sizeof(cl_int),
                            matrixC,
                            0,
                            NULL,
                            NULL);
       
        if (compare(matrixC, matrixA, matrixB, heightA, widthA, widthB))
            printf("Passed!\n");
        else 
            printf("Failed!\n");
 
        /* Clean up */
        for(i=0; i< NUMBER_OF_FILES; i++) { free(buffer[i]); }
        clReleaseProgram(program);
        clReleaseContext(context);
        clReleaseMemObject(matrixAMemObj);
        clReleaseMemObject(matrixBMemObj);
        clReleaseMemObject(matrixCMemObj);
    }
    
    free(matrixA);
    free(matrixB);
    free(matrixC);
}
static void
clrpc_client_test2(void)
{
	int err;

	int size = 1024;

	cl_uint nplatforms = 0;
	cl_platform_id* platforms = 0;
	cl_uint nplatforms_ret;

	clGetPlatformIDs(nplatforms,platforms,&nplatforms_ret);	

	printf(  "after call one i get nplatforms_ret = %d",
		nplatforms_ret);

	if (nplatforms_ret == 0) exit(1);

	nplatforms = nplatforms_ret;
	platforms = (cl_platform_id*)calloc(nplatforms,sizeof(cl_platform_id));

	clGetPlatformIDs(nplatforms,platforms,&nplatforms_ret);

	int i;
	for(i=0;i<nplatforms;i++) {
		clrpc_dptr* tmp = ((_xobj_t*)platforms[i])->obj;
		int is_rpc;
		if ( clGetPlatformInfo(platforms[i],999,sizeof(cl_int),&is_rpc,0)==CL_SUCCESS) {
			printf(  "platforms[%d] local=%p remote=%p\n",
				i,(void*)tmp->local,
				(void*)tmp->remote);
		} else {
			printf( "platforms[%d] not RPC\n",i);
		}
	}

	char buffer[1024];
	size_t sz;
	cl_platform_id rpc_platform = 0;
	for(i=0;i<nplatforms;i++) {
		clGetPlatformInfo(platforms[i],CL_PLATFORM_NAME,1023,buffer,&sz);
		printf(  "\n [%d] CL_PLATFORM_NAME|%ld:%s|\n",i,sz,buffer);
	}

int iplat;
for(iplat=0;iplat<nplatforms;iplat++) {

printf("\n******************\nTEST PLATFORM %d\n*************\n\n",iplat);

	cl_uint ndevices = 0;
	cl_device_id* devices = 0;
	cl_uint ndevices_ret;

	clGetDeviceIDs(platforms[iplat],CL_DEVICE_TYPE_ALL,
		ndevices,devices,&ndevices_ret);

	printf(  "after call one i get ndevices_ret = %d\n", ndevices_ret);

	if (ndevices_ret > 10) exit(-1);

	ndevices = ndevices_ret;
	devices = (cl_device_id*)calloc(ndevices,sizeof(cl_device_id));

	clGetDeviceIDs(platforms[iplat],CL_DEVICE_TYPE_ALL,
		ndevices,devices,&ndevices_ret);

	if (!ndevices_ret) {
		//printf("no devices, stopping.\n");
		//exit(1);
		printf("no devices, skipping.\n");
		continue;
	}

	for(i=0;i<ndevices;i++) {
		clrpc_dptr* tmp = ((_xobj_t*)devices[i])->obj;
		clGetDeviceInfo(devices[i],CL_DEVICE_NAME,1023,buffer,&sz);
		printf(  "CL_DEVICE_NAME |%s|\n",buffer);
		cl_platform_id tmpid;
		clGetDeviceInfo(devices[i],CL_DEVICE_PLATFORM,sizeof(tmpid),&tmpid,&sz);
		printf("%p\n",platforms[iplat]); fflush(stdout);
		printf("%p\n",tmpid); fflush(stdout);
		clGetPlatformInfo(tmpid,CL_PLATFORM_NAME,1023,buffer,&sz);
		printf(  "\n [%d] CL_PLATFORM_NAME|%ld:%s|\n",i,sz,buffer);
	}

	cl_context_properties ctxprop[] = { 
		CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[iplat], 0 };

	printf("i am setting this: prop[%d] %p\n",iplat,platforms[iplat]);

	cl_context ctx = clCreateContext(ctxprop,ndevices,devices, 0,0,&err);

	cl_command_queue* cmdq 
		= (cl_command_queue*) calloc(ndevices,sizeof(cl_command_queue));

	for(i=0;i<ndevices;i++) {
		cmdq[i] = clCreateCommandQueue(ctx,devices[i],0,&err);
		printf( 	 "cmdq %d %p",i,cmdq[i]);
	}

	cl_mem a_buf = clCreateBuffer(ctx,CL_MEM_READ_WRITE,size*sizeof(int),
		0,&err);
	cl_mem b_buf = clCreateBuffer(ctx,CL_MEM_READ_WRITE,size*sizeof(int),
		0,&err);
	cl_mem c_buf = clCreateBuffer(ctx,CL_MEM_READ_WRITE,size*sizeof(int),
		0,&err);
	cl_mem d_buf = clCreateBuffer(ctx,CL_MEM_READ_WRITE,size*sizeof(int),
		0,&err);

	int* a = (int*)malloc(1024*sizeof(int));
	int* b = (int*)malloc(1024*sizeof(int));
	int* c = (int*)malloc(1024*sizeof(int));
	int* d = (int*)malloc(1024*sizeof(int));

	char* prgsrc[] = { 
		"__kernel void my_kern( int n, __global int* a, __global int* b )\n"
		" { int i = get_global_id(0); int tmp = 0; int j; for(j=0;j<n;j++) tmp += a[i] * a[j]; b[i] = tmp; }\n" 
	};
	size_t prgsrc_sz = strlen(prgsrc[0]) + 1;

	cl_program prg = clCreateProgramWithSource(ctx,1,
		(const char**)prgsrc,&prgsrc_sz,&err);

	clBuildProgram(prg,ndevices,devices,0,0,0);

	cl_kernel krn = clCreateKernel(prg,"my_kern",&err);

int idev;
for(idev=0;idev<ndevices;idev++) {
printf("\n******************\nTEST DEVICE %d(%d)\n*************\n\n",idev,iplat);

	for(i=0;i<size;i++) a[i] = i*10;
	for(i=0;i<size;i++) b[i] = i*10+1;
	for(i=0;i<size;i++) c[i] = 0;
	for(i=0;i<size;i++) d[i] = 0;

	cl_event ev[8];

	for(i=0;i<32;i++) printf("%d/",a[i]); printf("\n");
	for(i=0;i<32;i++) printf("%d/",b[i]); printf("\n");

	clEnqueueWriteBuffer(cmdq[idev],a_buf,CL_FALSE,0,size*sizeof(int),a,
		0,0,&ev[0]);
	clEnqueueWriteBuffer(cmdq[idev],b_buf,CL_FALSE,0,size*sizeof(int),b,
		1,ev,&ev[1]);
	clEnqueueWriteBuffer(cmdq[idev],c_buf,CL_FALSE,0,size*sizeof(int),c,
		2,ev,&ev[2]);
	clEnqueueWriteBuffer(cmdq[idev],d_buf,CL_FALSE,0,size*sizeof(int),d,
		3,ev,&ev[3]);

	size_t offset = 0; 
	size_t gwsz = 128;
	size_t lwsz = 16;

	clSetKernelArg(krn,0,sizeof(int),&size);
	clSetKernelArg(krn,1,sizeof(cl_mem),&a_buf);
	clSetKernelArg(krn,2,sizeof(cl_mem),&c_buf);
	clEnqueueNDRangeKernel(cmdq[idev],krn,1,&offset,&gwsz,&lwsz,4,ev,&ev[4]);

	clSetKernelArg(krn,1,sizeof(cl_mem),&b_buf);
	clSetKernelArg(krn,2,sizeof(cl_mem),&d_buf);
	clEnqueueNDRangeKernel(cmdq[idev],krn,1,&offset,&gwsz,&lwsz,5,ev,&ev[5]);

	clEnqueueReadBuffer(cmdq[idev],c_buf,CL_FALSE,0,size*sizeof(int),c,
		6,ev,&ev[6]);
	clEnqueueReadBuffer(cmdq[idev],d_buf,CL_FALSE,0,size*sizeof(int),d,
		7,ev,&ev[7]);

	clFlush(cmdq[idev]);

	clWaitForEvents(8,ev);

	for(i=0;i<32;i++) printf("%d/",c[i]); printf("\n");
	for(i=0;i<32;i++) printf("%d/",d[i]); printf("\n");

	for(i=0;i<8;i++) clReleaseEvent(ev[i]);

}

	clReleaseKernel(krn);

	clReleaseProgram(prg);

	clReleaseMemObject(a_buf);
	clReleaseMemObject(b_buf);
	clReleaseMemObject(c_buf);
	clReleaseMemObject(d_buf);

	clReleaseCommandQueue(cmdq[0]);
	clReleaseContext(ctx);

//	printf("sleeping ...\n");
//	sleep(1);

}

//	clrpc_final();

}
Beispiel #12
0
int 
exec_trig_kernel(const char *program_source, 
                 int n, void *srcA, void *dst) 
{ 
  cl_context  context; 
  cl_command_queue cmd_queue; 
  cl_device_id  *devices; 
  cl_program  program; 
  cl_kernel  kernel; 
  cl_mem       memobjs[2]; 
  size_t       global_work_size[1]; 
  size_t       local_work_size[1]; 
  size_t       cb; 
  cl_int       err; 

  float c = 7.3f; // a scalar number to test non-pointer args
 
  // create the OpenCL context on a GPU device 
  context = poclu_create_any_context();
  if (context == (cl_context)0) 
    return -1; 
 
  // get the list of GPU devices associated with context 
  clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &cb); 
  devices = (cl_device_id *) malloc(cb);
  clGetContextInfo(context, CL_CONTEXT_DEVICES, cb, devices, NULL); 
 
  // create a command-queue 
  cmd_queue = clCreateCommandQueue(context, devices[0], 0, NULL); 
  if (cmd_queue == (cl_command_queue)0) 
    { 
      clReleaseContext(context); 
      free(devices); 
      return -1; 
    } 
  free(devices); 
 
  // allocate the buffer memory objects 
  memobjs[0] = clCreateBuffer(context, 
                              CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 
                              sizeof(cl_float4) * n, srcA, NULL); 
  if (memobjs[0] == (cl_mem)0) 
    { 
      clReleaseCommandQueue(cmd_queue); 
      clReleaseContext(context); 
      return -1; 
    } 
 
  memobjs[1] = clCreateBuffer(context, 
			      CL_MEM_READ_WRITE, 
			      sizeof(cl_float4) * n, NULL, NULL); 
  if (memobjs[1] == (cl_mem)0) 
    { 
      delete_memobjs(memobjs, 1); 
      clReleaseCommandQueue(cmd_queue); 
      clReleaseContext(context); 
      return -1; 
    } 
 
  // create the program 
  program = clCreateProgramWithSource(context, 
				      1, (const char**)&program_source, NULL, NULL); 
  if (program == (cl_program)0) 
    { 
      delete_memobjs(memobjs, 2); 
      clReleaseCommandQueue(cmd_queue); 
      clReleaseContext(context); 
      return -1; 
    } 
 
  // build the program 
  err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); 
  if (err != CL_SUCCESS) 
    { 
      delete_memobjs(memobjs, 2); 
      clReleaseProgram(program); 
      clReleaseCommandQueue(cmd_queue); 
      clReleaseContext(context); 
      return -1; 
    } 
 
  // create the kernel 
  kernel = clCreateKernel(program, "trig", NULL); 
  if (kernel == (cl_kernel)0) 
    { 
      delete_memobjs(memobjs, 2); 
      clReleaseProgram(program); 
      clReleaseCommandQueue(cmd_queue); 
      clReleaseContext(context); 
      return -1; 
    } 
 
  // set the args values 
  err = clSetKernelArg(kernel,  0,  
		       sizeof(cl_mem), (void *) &memobjs[0]); 
  err |= clSetKernelArg(kernel, 1,
			sizeof(cl_mem), (void *) &memobjs[1]); 
  err |= clSetKernelArg(kernel, 2,
			sizeof(float), (void *) &c); 
 
  if (err != CL_SUCCESS) 
    { 
      delete_memobjs(memobjs, 2); 
      clReleaseKernel(kernel); 
      clReleaseProgram(program); 
      clReleaseCommandQueue(cmd_queue); 
      clReleaseContext(context); 
      return -1; 
    } 
 
  // set work-item dimensions 
  global_work_size[0] = n; 
  local_work_size[0]= 2; 
 
  // execute kernel 
  err = clEnqueueNDRangeKernel(cmd_queue, kernel, 1, NULL, 
			       global_work_size, local_work_size,  
			       0, NULL, NULL); 
  if (err != CL_SUCCESS) 
    { 
      delete_memobjs(memobjs, 2); 
      clReleaseKernel(kernel); 
      clReleaseProgram(program); 
      clReleaseCommandQueue(cmd_queue); 
      clReleaseContext(context); 
      return -1; 
    } 
 
  // read output image 
  err = clEnqueueReadBuffer(cmd_queue, memobjs[1], CL_TRUE, 
			    0, n * sizeof(cl_float4), dst, 
			    0, NULL, NULL); 
  if (err != CL_SUCCESS) 
    { 
      delete_memobjs(memobjs, 2); 
      clReleaseKernel(kernel); 
      clReleaseProgram(program); 
      clReleaseCommandQueue(cmd_queue); 
      clReleaseContext(context); 
      return -1; 
    } 
 
  // release kernel, program, and memory objects 
  delete_memobjs(memobjs, 2); 
  clReleaseKernel(kernel); 
  clReleaseProgram(program); 
  clReleaseCommandQueue(cmd_queue); 
  clReleaseContext(context); 
  return 0; // success... 
}
Beispiel #13
0
void maxcoinMiner_openCL_generateOrUpdateKernel()
{
	if( maxcoinOpenCLKernelInited )
		return;
	maxcoinOpenCLKernelInited = true;
	printf("Compiling OpenCL kernel...\n");

	char* kernel_src = (char*)malloc(1024*512);
	strcpy(kernel_src, "");

	cl_int clerr = 0;
	// init input buffer
	maxcoinGPU.buffer_blockInputData = malloc(80+8); // endian swapped block data + share target attached at the end
	memset(maxcoinGPU.buffer_blockInputData, 0x00, 80+8);
	maxcoinGPU.clBuffer_blockInputData = clCreateBuffer(openCL_getActiveContext(), CL_MEM_READ_WRITE, 88, maxcoinGPU.buffer_blockInputData, &clerr);
	// init output buffer
	sint32 outputBlocks = 256;
	maxcoinGPU.buffer_nonceOutputData = malloc(outputBlocks*4*sizeof(uint32));
	memset(maxcoinGPU.buffer_nonceOutputData, 0x00, outputBlocks*4*sizeof(uint32));
	maxcoinGPU.clBuffer_nonceOutputData = clCreateBuffer(openCL_getActiveContext(), CL_MEM_READ_WRITE, outputBlocks*4*sizeof(uint32), maxcoinGPU.buffer_nonceOutputData, &clerr);

	maxcoinMiner_openCL_appendKeccakFunction(kernel_src);

	strcat(kernel_src, "__kernel void xptMiner_cl_maxcoin_keccak(__global unsigned long *blockData, __global unsigned int *nonceIndexOut)\r\n");
	strcat(kernel_src, "{\r\n");

	strcat(kernel_src, 
	"unsigned long nonceAndBits = blockData[9] & 0x00000000FFFFFFFF;\r\n"
	"unsigned long shareTarget = blockData[10];\r\n"
	"nonceIndexOut[get_local_id(0)] = 0xFFFFFFFF;\r\n"

	"nonceAndBits += 0x100000000UL*0x1000UL*(unsigned long)get_local_id(0);\r\n"
	
	//"nonceAndBits = 0x01f94bdb00000000UL;\r\n"

	"for(int i=0; i<0x1000; i++) {\r\n"
	//"for(int i=0; i<1; i++) {\r\n"
	//"if( keccak256_maxcoin_opt_v(blockData, nonceAndBits) < shareTarget ) nonceIndexOut[0] = nonceAndBits>>32;\r\n"
	"if( keccak256_maxcoin_opt_v(blockData, nonceAndBits) < 0x0040000000000000UL ) nonceIndexOut[get_local_id(0)] = nonceAndBits>>32;\r\n"
	"nonceAndBits += 0x100000000UL;\r\n"
	"}\r\n");

	strcat(kernel_src, "}\r\n");

	const char* source = kernel_src;
	size_t src_size = strlen(kernel_src);
	cl_program program = clCreateProgramWithSource(openCL_getActiveContext(), 1, &source, &src_size, &clerr);
	if(clerr != CL_SUCCESS)
		printf("Error creating OpenCL program\n");
	// builds the program
	clerr = clBuildProgram(program, 1, openCL_getActiveDeviceID(), NULL, NULL, NULL);
	if(clerr != CL_SUCCESS)
		printf("Error compiling OpenCL program\n");
	// shows the log
	char* build_log;
	size_t log_size;
	// First call to know the proper size
	clGetProgramBuildInfo(program, *openCL_getActiveDeviceID(), CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
	build_log = (char*)malloc(log_size+1);
	memset(build_log, 0x00, log_size+1);
	// Second call to get the log
	clGetProgramBuildInfo(program, *openCL_getActiveDeviceID(), CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL);
	build_log[log_size] = '\0';
	puts(build_log);
	free(build_log);

	maxcoinGPU.kernel_keccak = clCreateKernel(program, "xptMiner_cl_maxcoin_keccak", &clerr);

	clerr = clSetKernelArg(maxcoinGPU.kernel_keccak, 0, sizeof(cl_mem), &maxcoinGPU.clBuffer_blockInputData);
	clerr = clSetKernelArg(maxcoinGPU.kernel_keccak, 1, sizeof(cl_mem), &maxcoinGPU.clBuffer_nonceOutputData);

	free(kernel_src);
}
Beispiel #14
0
/* Initialize OpenCl processing */
void init_cl() {

   char *program_buffer, *program_log;
   size_t program_size, log_size;
   int err;

   /* Identify a platform */
   err = clGetPlatformIDs(1, &platform, NULL);
   if(err < 0) {
      perror("Couldn't identify a platform");
      exit(1);
   }

   /* Access a device */
   err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
   if(err == CL_DEVICE_NOT_FOUND) {
      err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, NULL);
   }
   if(err < 0) {
      perror("Couldn't access any devices");
      exit(1);   
   }

   /* Create OpenCL context properties */
#ifdef MAC
   CGLContextObj mac_context = CGLGetCurrentContext();
   CGLShareGroupObj group = CGLGetShareGroup(mac_context);
   cl_context_properties properties[] = {
      CL_CONTEXT_PROPERTY_USE_CGL_SHAREGROUP_APPLE, 
      (cl_context_properties)group, 0};
#else 
#ifdef UNIX
   cl_context_properties properties[] = {
      CL_GL_CONTEXT_KHR, (cl_context_properties)glXGetCurrentContext(), 
      CL_GLX_DISPLAY_KHR, (cl_context_properties)glXGetCurrentDisplay(), 
      CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0};
#else
   cl_context_properties properties[] = {
      CL_GL_CONTEXT_KHR, (cl_context_properties)wglGetCurrentContext(), 
      CL_WGL_HDC_KHR, (cl_context_properties)wglGetCurrentDC(), 
      CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0};
#endif
#endif

   /* Create context */
   context = clCreateContext(properties, 1, &device, NULL, NULL, &err);
   if(err < 0) {
      perror("Couldn't create a context");
      exit(1);   
   }

   /* Create program from file */
   program_buffer = read_file(PROGRAM_FILE, &program_size);
   program = clCreateProgramWithSource(context, 1, 
      (const char**)&program_buffer, &program_size, &err);
   if(err < 0) {
      perror("Couldn't create the program");
      exit(1);
   }
   free(program_buffer);

   /* Build program */
   err = clBuildProgram(program, 0, NULL, "-DRADIUS=0.75", NULL, NULL);
   if(err < 0) {

      /* Find size of log and print to std output */
      clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 
            0, NULL, &log_size);
      program_log = (char*) malloc(log_size + 1);
      program_log[log_size] = '\0';
      clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 
            log_size + 1, program_log, NULL);
      printf("%s\n", program_log);
      free(program_log);
      exit(1);
   }

   /* Create a command queue */
   queue = clCreateCommandQueue(context, device, 0, &err);
   if(err < 0) {
      perror("Couldn't create a command queue");
      exit(1);   
   };

   /* Create kernel */
   kernel = clCreateKernel(program, KERNEL_FUNC, &err);
   if(err < 0) {
      printf("Couldn't create a kernel: %d", err);
      exit(1);
   };
}
Beispiel #15
0
int main(int argc, char **argv)
{
  /* test name */
  char name[] = "test_sampler_address_clamp";
  size_t global_work_size[1] = { 1 }, local_work_size[1]= { 1 };
  size_t srcdir_length, name_length, filename_size;
  char *filename = NULL;
  char *source = NULL;
  cl_device_id devices[1];
  cl_context context = NULL;
  cl_command_queue queue = NULL;
  cl_program program = NULL;
  cl_kernel kernel = NULL;
  cl_int result;
  int retval = -1;

  /* image parameters */
  cl_uchar4 *imageData;
  cl_image_format image_format;
  cl_image_desc image_desc;

  printf("Running test %s...\n", name);
  memset(&image_desc, 0, sizeof(cl_image_desc));
  image_desc.image_type = CL_MEM_OBJECT_IMAGE2D;
  image_desc.image_width = 4;
  image_desc.image_height = 4;
  image_format.image_channel_order = CL_RGBA;
  image_format.image_channel_data_type = CL_UNSIGNED_INT8;
  imageData = (cl_uchar4*)malloc (4 * 4 * sizeof(cl_uchar4));
  
  if (imageData == NULL)
    {
      puts("out of host memory\n");
      goto error;
    }
  memset (imageData, 1, 4*4*sizeof(cl_uchar4));

  /* determine file name of kernel source to load */
  srcdir_length = strlen(SRCDIR);
  name_length = strlen(name);
  filename_size = srcdir_length + name_length + 16;
  filename = (char *)malloc(filename_size + 1);
  if (!filename) 
    {
      puts("out of memory");
      goto error;
    }
  
  snprintf(filename, filename_size, "%s/%s.cl", SRCDIR, name);
  
  /* read source code */
  source = poclu_read_file (filename);
  TEST_ASSERT (source != NULL && "Kernel .cl not found.");

  /* setup an OpenCL context and command queue using default device */
  context = poclu_create_any_context();
  if (!context) 
    {
      puts("clCreateContextFromType call failed\n");
      goto error;
    }

  result = clGetContextInfo(context, CL_CONTEXT_DEVICES,
                            sizeof(cl_device_id), devices, NULL);
  if (result != CL_SUCCESS) 
    {
      puts("clGetContextInfo call failed\n");
      goto error;
    }

  queue = clCreateCommandQueue(context, devices[0], 0, NULL); 
  if (!queue) 
    {
      puts("clCreateCommandQueue call failed\n");
      goto error;
    }

  /* Create image */

  cl_mem image = clCreateImage (context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
                                &image_format, &image_desc, imageData, &result);
  if (result != CL_SUCCESS)
    {
      puts("image creation failed\n");
      goto error;
    }


  /* create and build program */
  program = clCreateProgramWithSource (context, 1, (const char **)&source,
                                       NULL, NULL);
  if (!program) 
    {
      puts("clCreateProgramWithSource call failed\n");
      goto error;
    }

  result = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); 
  if (result != CL_SUCCESS) 
    {
      puts("clBuildProgram call failed\n");
      goto error;
    }

  /* execute the kernel with give name */
  kernel = clCreateKernel(program, name, NULL); 
  if (!kernel) 
    {
      puts("clCreateKernel call failed\n");
      goto error;
    }

   result = clSetKernelArg( kernel, 0, sizeof(cl_mem), &image);
   if (result)
     {
       puts("clSetKernelArg failed\n");
       goto error;
     }

  result = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, 
                                  local_work_size, 0, NULL, NULL); 
  if (result != CL_SUCCESS) 
    {
      puts("clEnqueueNDRangeKernel call failed\n");
      goto error;
    }

  result = clFinish(queue);
  if (result == CL_SUCCESS)
    retval = 0;

error:

  if (image)
    {
      clReleaseMemObject (image);
    }

  if (kernel) 
    {
      clReleaseKernel(kernel);
    }
  if (program) 
    {
      clReleaseProgram(program);
    }
  if (queue) 
    {
      clReleaseCommandQueue(queue);
    }
  if (context) 
    {
      clUnloadCompiler ();
      clReleaseContext (context);
    }
  if (source) 
    {
      free(source);
    }
  if (filename)
    {
      free(filename);
    }
  if (imageData)
    {
      free(imageData);
    }


  if (retval) 
    {
      printf("FAIL\n");
      return 1;
    }
 
  printf("OK\n");
  return 0;
}
Beispiel #16
0
static ConvolveInfo *GetConvolveInfo(const Image *image,const char *name,
  const char *source,ExceptionInfo *exception)
{
  char
    options[MaxTextExtent];

  cl_int
    status;

  ConvolveInfo
    *convolve_info;

  size_t
    length,
    lengths[] = { strlen(source) };

  /*
    Create OpenCL info.
  */
  convolve_info=(ConvolveInfo *) AcquireAlignedMemory(1,sizeof(*convolve_info));
  if (convolve_info == (ConvolveInfo *) NULL)
    {
      (void) ThrowMagickException(exception,GetMagickModule(),
        ResourceLimitError,"MemoryAllocationFailed","`%s'",image->filename);
      return((ConvolveInfo *) NULL);
    }
  (void) ResetMagickMemory(convolve_info,0,sizeof(*convolve_info));
  /*
    Create OpenCL context.
  */
  convolve_info->context=clCreateContextFromType((cl_context_properties *)
    NULL,CL_DEVICE_TYPE_GPU,ConvolveNotify,exception,&status);
  if ((convolve_info->context == (cl_context) NULL) || (status != CL_SUCCESS))
    convolve_info->context=clCreateContextFromType((cl_context_properties *)
      NULL,CL_DEVICE_TYPE_CPU,ConvolveNotify,exception,&status);
  if ((convolve_info->context == (cl_context) NULL) || (status != CL_SUCCESS))
    convolve_info->context=clCreateContextFromType((cl_context_properties *)
      NULL,CL_DEVICE_TYPE_DEFAULT,ConvolveNotify,exception,&status);
  if ((convolve_info->context == (cl_context) NULL) || (status != CL_SUCCESS))
    {
      (void) ThrowMagickException(exception,GetMagickModule(),DelegateWarning,
        "failed to create OpenCL context","`%s' (%d)",image->filename,status);
      DestroyConvolveInfo(convolve_info);
      return((ConvolveInfo *) NULL);
    }
  /*
    Detect OpenCL devices.
  */
  status=clGetContextInfo(convolve_info->context,CL_CONTEXT_DEVICES,0,NULL,
    &length);
  if ((status != CL_SUCCESS) || (length == 0))
    {
      DestroyConvolveInfo(convolve_info);
      return((ConvolveInfo *) NULL);
    }
  convolve_info->devices=(cl_device_id *) AcquireMagickMemory(length);
  if (convolve_info->devices == (cl_device_id *) NULL)
    {
      (void) ThrowMagickException(exception,GetMagickModule(),
        ResourceLimitError,"MemoryAllocationFailed","`%s'",image->filename);
      DestroyConvolveInfo(convolve_info);
      return((ConvolveInfo *) NULL);
    }
  status=clGetContextInfo(convolve_info->context,CL_CONTEXT_DEVICES,length,
    convolve_info->devices,NULL);
  if (status != CL_SUCCESS)
    {
      DestroyConvolveInfo(convolve_info);
      return((ConvolveInfo *) NULL);
    }
  /*
    Create OpenCL command queue.
  */
  convolve_info->command_queue=clCreateCommandQueue(convolve_info->context,
    convolve_info->devices[0],0,&status);
  if ((convolve_info->command_queue == (cl_command_queue) NULL) ||
      (status != CL_SUCCESS))
    {
      DestroyConvolveInfo(convolve_info);
      return((ConvolveInfo *) NULL);
    }
  /*
    Build OpenCL program.
  */
  convolve_info->program=clCreateProgramWithSource(convolve_info->context,1,
    &source,lengths,&status);
  if ((convolve_info->program == (cl_program) NULL) || (status != CL_SUCCESS))
    {
      DestroyConvolveInfo(convolve_info);
      return((ConvolveInfo *) NULL);
    }
  (void) FormatMagickString(options,MaxTextExtent,CLOptions,(double)
    QuantumRange,MagickEpsilon);
  status=clBuildProgram(convolve_info->program,1,convolve_info->devices,options,
    NULL,NULL);
  if ((convolve_info->program == (cl_program) NULL) || (status != CL_SUCCESS))
    {
      char
        *log;

      status=clGetProgramBuildInfo(convolve_info->program,
        convolve_info->devices[0],CL_PROGRAM_BUILD_LOG,0,NULL,&length);
      log=(char *) AcquireMagickMemory(length);
      if (log == (char *) NULL)
        {
          DestroyConvolveInfo(convolve_info);
          return((ConvolveInfo *) NULL);
        }
      status=clGetProgramBuildInfo(convolve_info->program,
        convolve_info->devices[0],CL_PROGRAM_BUILD_LOG,length,log,&length);
      (void) ThrowMagickException(exception,GetMagickModule(),DelegateWarning,
        "failed to build OpenCL program","`%s' (%s)",image->filename,log);
      log=DestroyString(log);
      DestroyConvolveInfo(convolve_info);
      return((ConvolveInfo *) NULL);
    }
  /*
    Get a kernel object.
  */
  convolve_info->kernel=clCreateKernel(convolve_info->program,name,&status);
  if ((convolve_info->kernel == (cl_kernel) NULL) || (status != CL_SUCCESS))
    {
      DestroyConvolveInfo(convolve_info);
      return((ConvolveInfo *) NULL);
    }
  return(convolve_info);
}
Beispiel #17
0
int main() {
// START:context
  cl_platform_id platform;
  clGetPlatformIDs(1, &platform, NULL);

  cl_device_id device;
  clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);

  cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
// END:context

// START:queue
  cl_command_queue queue = clCreateCommandQueue(context, device, 0, NULL);
// END:queue

// START:kernel
  char* source = read_source("multiply_arrays.cl");
  cl_program program = clCreateProgramWithSource(context, 1,
    (const char**)&source, NULL, NULL);
  free(source);
  clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
  cl_kernel kernel = clCreateKernel(program, "multiply_arrays", NULL);
// END:kernel

// START:buffers
  cl_float a[NUM_ELEMENTS], b[NUM_ELEMENTS];
  random_fill(a, NUM_ELEMENTS);
  random_fill(b, NUM_ELEMENTS);
  cl_mem inputA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
    sizeof(cl_float) * NUM_ELEMENTS, a, NULL);
  cl_mem inputB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
    sizeof(cl_float) * NUM_ELEMENTS, b, NULL);
  cl_mem output = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
    sizeof(cl_float) * NUM_ELEMENTS, NULL, NULL);
// END:buffers

// START:execute
  clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputA);
  clSetKernelArg(kernel, 1, sizeof(cl_mem), &inputB);
  clSetKernelArg(kernel, 2, sizeof(cl_mem), &output);

  size_t work_units = NUM_ELEMENTS;
  clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &work_units, NULL, 0, NULL, NULL);
// END:execute

// START:results
  cl_float results[NUM_ELEMENTS];
  clEnqueueReadBuffer(queue, output, CL_TRUE, 0, sizeof(cl_float) * NUM_ELEMENTS,
    results, 0, NULL, NULL);
// END:results

// START:cleanup
  clReleaseMemObject(inputA);
  clReleaseMemObject(inputB);
  clReleaseMemObject(output);
  clReleaseKernel(kernel);
  clReleaseProgram(program);
  clReleaseCommandQueue(queue);
  clReleaseContext(context);
// END:cleanup

  for (int i = 0; i < NUM_ELEMENTS; ++i) {
    printf("%f * %f = %f\n", a[i], b[i], results[i]);
  }

  return 0;
}
int main(int argc, char *argv[])
{
    //FILE *fp;

    cl_platform_id      platform_id[2];
    cl_uint             ret_num_devices;
    cl_uint             ret_num_platforms;
    cl_int              ret_code;

    cl_mem              image_in_mem = NULL;
    cl_mem              image_out_mem = NULL;
    cl_mem              twiddle_factors_mem = NULL;
    cl_float2           *image_in_host;
    cl_float2           *twiddle_factors_host;

    cl_kernel           kernel_twiddle_factors;
    cl_kernel           kernel_matriz_transpose;
    cl_kernel           kernel_lowpass_filter;

    pgm_t ipgm;
    pgm_t opgm;

    image_file_t        *image_filename;
    char                *output_filename;
    FILE                *fp;
    const char          *kernel_filename = C_NOME_ARQ_KERNEL;
    size_t              source_size;
    char                *source_str;
    cl_int              i, j,n ,m;
    cl_int              raio = 0;
    size_t              global_wg[2];
    size_t              local_wg[2];
    float               *image_amplitudes;
    size_t              log_size;
    char                *log_file;

    cl_event            kernels_events_out_fft[4];

    cl_ulong            kernel_runtime              = (cl_ulong) 0;
    cl_ulong            kernel_start_time           = (cl_ulong) 0;
    cl_ulong            kernel_end_time             = (cl_ulong) 0;

    cl_event            write_host_dev_event;
    cl_ulong            write_host_dev_start_time   = (cl_ulong) 0;
    cl_ulong            write_host_dev_end_time     = (cl_ulong) 0;
    cl_ulong            write_host_dev_run_time     = (cl_ulong) 0;

    cl_event            read_dev_host_event;
    cl_ulong            read_dev_host_start_time    = (cl_ulong) 0;
    cl_ulong            read_dev_host_end_time      = (cl_ulong) 0;
    cl_ulong            read_dev_host_run_time      = (cl_ulong) 0;

    unsigned __int64    image_tam;
    unsigned __int64    MEGA_BYTES   =  1048576; // 1024*1024
    double              image_tam_MB;
    double              tempo_total;

    struct event_in_fft_t *fft_events;


   //=== Timer count start ==============================================================================
    timer_reset();
    timer_start();
    //===================================================================================================

    if (argc < 2) {
        printf("**Erro: O arquivo de entrada eh necessario.\n");
        exit(EXIT_FAILURE);
    }

    image_filename = (image_file_t *) malloc(sizeof(image_file_t));
    split_image_filename(image_filename, argv[1]);
    output_filename = (char *) malloc(40*sizeof(char));
    sprintf(output_filename, "%d.%d.%s.%s.%s", image_filename->res, image_filename->num, ENV_TYPE, APP_TYPE, EXTENSAO);

    fp = fopen(kernel_filename, "r");
    if (!fp) {
        fprintf(stderr, "Failed to load kernel.\n");
        exit(EXIT_FAILURE);
    }

    source_str = (char *)malloc(MAX_SOURCE_SIZE);
    source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
    fclose( fp );

    //===================================================================================================
     /* Abrindo imagem do arquivo para objeto de memoria local*/
    if( ler_pgm(&ipgm, argv[1]) == -1)
        exit(EXIT_FAILURE);

    n = ipgm.width;
    raio = n/8;
    m = (cl_int)(log((double)n)/log(2.0));

    image_in_host = (cl_float2 *)malloc((n*n)*sizeof(cl_float2));
    twiddle_factors_host = (cl_float2 *)malloc(n / 2 * sizeof(cl_float2));

    for (i = 0; i < n; i++) {
        for (j = 0; j < n; j++) {
            image_in_host[n*i + j].s[0] = (float)ipgm.buf[n*i + j];
            image_in_host[n*i + j].s[1] = (float)0;
        }
    }

    fft_events = (struct event_in_fft_t *)malloc(MAX_CALL_FFT*sizeof(struct event_in_fft_t));

    kernel_butter_events = (cl_event *)malloc(MAX_CALL_FFT*m*sizeof(cl_event));

    //===================================================================================================
    CL_CHECK(clGetPlatformIDs(MAX_PLATFORM_ID, platform_id, &ret_num_platforms));

    if (ret_num_platforms == 0 ) {
        fprintf(stderr,"[Erro] Não existem plataformas OpenCL\n");
        exit(2);
    }

    //===================================================================================================

    CL_CHECK(clGetDeviceIDs( platform_id[0], CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices));
    //print_platform_info(&platform_id[1]);

    //===================================================================================================
    context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret_code);
    //===================================================================================================

    cmd_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &ret_code);
    //===================================================================================================

    image_in_mem = clCreateBuffer(context, CL_MEM_READ_WRITE, n*n*sizeof(cl_float2), NULL, &ret_code);
    image_out_mem = clCreateBuffer(context, CL_MEM_READ_WRITE, n*n*sizeof(cl_float2), NULL, &ret_code);
    twiddle_factors_mem = clCreateBuffer(context, CL_MEM_READ_WRITE, (n/2)*sizeof(cl_float2), NULL, &ret_code);
    //===================================================================================================

    /* Transfer data to memory buffer */
    CL_CHECK(clEnqueueWriteBuffer(cmd_queue, image_in_mem, CL_TRUE, 0, n*n*sizeof(cl_float2), image_in_host, 0, NULL, &write_host_dev_event));

    image_tam = n*n*sizeof(cl_float2);

    //===================================================================================================
    program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret_code);
    //===================================================================================================
    ret_code = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
    //===================================================================================================
    if (ret_code != CL_SUCCESS) {
    // Determine the size of the log
    clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
    //===================================================================================================

    // Allocate memory for the log
    log_file = (char *) malloc(log_size);

    // Get the log
    clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, log_size, log_file, NULL);
    printf("%s\n", log_file);
    system("pause");
    exit(0);
}
    kernel_twiddle_factors = clCreateKernel(program, "twiddle_factors", &ret_code);
    kernel_matriz_transpose = clCreateKernel(program, "matrix_trasponse", &ret_code);
    kernel_lowpass_filter  = clCreateKernel(program, "lowpass_filter", &ret_code);

    /* Processa os fatores Wn*/
    //===================================================================================================
    CL_CHECK(clSetKernelArg(kernel_twiddle_factors, 0, sizeof(cl_mem), (void *)&twiddle_factors_mem));
    CL_CHECK(clSetKernelArg(kernel_twiddle_factors, 1, sizeof(cl_int), (void *)&n));
    config_workgroup_size(global_wg, local_wg, n/2, 1);
    CL_CHECK(clEnqueueNDRangeKernel(cmd_queue, kernel_twiddle_factors, 1, NULL, global_wg, local_wg, 0, NULL, &kernels_events_out_fft[0]));

    //===================================================================================================
    /* Executa a FFT em N/2 */
    fft_main(image_out_mem, image_in_mem, twiddle_factors_mem, m, direta, &fft_events[0]);

    //===================================================================================================
    /* Realiza a transposta da Matriz (imagem) */
    CL_CHECK(clSetKernelArg(kernel_matriz_transpose, 0, sizeof(cl_mem), (void *)&image_in_mem));
    CL_CHECK(clSetKernelArg(kernel_matriz_transpose, 1, sizeof(cl_mem), (void *)&image_out_mem));
    CL_CHECK(clSetKernelArg(kernel_matriz_transpose, 2, sizeof(cl_int), (void *)&n));
    config_workgroup_size(global_wg, local_wg, n, n);
    CL_CHECK(clEnqueueNDRangeKernel(cmd_queue, kernel_matriz_transpose, 2, NULL, global_wg, local_wg, 0, NULL, &kernels_events_out_fft[1]));

    //===================================================================================================
    /* Executa a FFT N/2 */
    fft_main(image_out_mem, image_in_mem, twiddle_factors_mem, m, direta, &fft_events[1]);

    //===================================================================================================
    /* Processa o filtro passa baixa */
    CL_CHECK(clSetKernelArg(kernel_lowpass_filter, 0, sizeof(cl_mem), (void *)&image_out_mem));
    CL_CHECK(clSetKernelArg(kernel_lowpass_filter, 1, sizeof(cl_int), (void *)&n));
    CL_CHECK(clSetKernelArg(kernel_lowpass_filter, 2, sizeof(cl_int), (void *)&raio));
    config_workgroup_size(global_wg, local_wg, n, n);
    CL_CHECK(clEnqueueNDRangeKernel(cmd_queue, kernel_lowpass_filter, 2, NULL, global_wg, local_wg, 0, NULL, &kernels_events_out_fft[2]));

    //===================================================================================================
    /* Obtem a FFT inversa*/
    fft_main(image_in_mem, image_out_mem, twiddle_factors_mem, m, inversa, &fft_events[2]);
    //===================================================================================================

    /* Realiza a transposta da Matriz (imagem) */
    CL_CHECK(clSetKernelArg(kernel_matriz_transpose, 0, sizeof(cl_mem), (void *)&image_out_mem));
    CL_CHECK(clSetKernelArg(kernel_matriz_transpose, 1, sizeof(cl_mem), (void *)&image_in_mem));
    CL_CHECK(clSetKernelArg(kernel_matriz_transpose, 2, sizeof(cl_int), (void *)&n));
    config_workgroup_size(global_wg, local_wg, n, n);
    CL_CHECK(clEnqueueNDRangeKernel(cmd_queue, kernel_matriz_transpose, 2, NULL, global_wg, local_wg, 0, NULL, &kernels_events_out_fft[3]));

    //===================================================================================================
    fft_main(image_in_mem, image_out_mem, twiddle_factors_mem, m, inversa, &fft_events[3]);
    //===================================================================================================

    CL_CHECK(clEnqueueReadBuffer(cmd_queue, image_in_mem, CL_TRUE, 0, n*n*sizeof(cl_float2), image_in_host, 0, NULL, &read_dev_host_event));
    //===================================================================================================

    //== Total time elapsed ============================================================================
    timer_stop();
    tempo_total = get_elapsed_time();
    //==================================================================================================

    //====== Get time of Profile Info ==================================================================
    // Write data time
    CL_CHECK(clGetEventProfilingInfo(write_host_dev_event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &write_host_dev_start_time, NULL));
    CL_CHECK(clGetEventProfilingInfo(write_host_dev_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &write_host_dev_end_time, NULL));
    // Read data time
    CL_CHECK(clGetEventProfilingInfo(read_dev_host_event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &read_dev_host_start_time, NULL));
    CL_CHECK(clGetEventProfilingInfo(read_dev_host_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &read_dev_host_end_time, NULL));

    for (i = 0; i < MAX_CALL_FFT; i++) {

        kernel_start_time = (cl_long) 0;
        kernel_end_time = (cl_long) 0;
        CL_CHECK(clGetEventProfilingInfo(kernels_events_out_fft[i], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &kernel_start_time, NULL));
        CL_CHECK(clGetEventProfilingInfo(kernels_events_out_fft[i], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &kernel_end_time, NULL));
        kernel_runtime += (kernel_end_time - kernel_start_time);

        kernel_start_time = (cl_long) 0;
        kernel_end_time = (cl_long) 0;
        CL_CHECK(clGetEventProfilingInfo(fft_events[i].kernel_bitsrev, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &kernel_start_time, NULL));
        CL_CHECK(clGetEventProfilingInfo(fft_events[i].kernel_bitsrev, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &kernel_end_time, NULL));
        kernel_runtime += (kernel_end_time - kernel_start_time);

        kernel_start_time = (cl_long) 0;
        kernel_end_time = (cl_long) 0;

        if (fft_events[i].kernel_normalize != NULL) {
            CL_CHECK(clGetEventProfilingInfo(fft_events[i].kernel_normalize, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &kernel_start_time, NULL));
            CL_CHECK(clGetEventProfilingInfo(fft_events[i].kernel_normalize, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &kernel_end_time, NULL));
            kernel_runtime += (kernel_end_time - kernel_start_time);
        }
   }

    for (j=0; j < MAX_CALL_FFT*m; j++){
        kernel_start_time = (cl_long) 0;
        kernel_end_time = (cl_long) 0;

        CL_CHECK(clGetEventProfilingInfo(kernel_butter_events[j], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &kernel_start_time, NULL));
        CL_CHECK(clGetEventProfilingInfo(kernel_butter_events[j], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &kernel_end_time, NULL));
        kernel_runtime += (kernel_end_time - kernel_start_time);
    }

    write_host_dev_run_time = write_host_dev_end_time - write_host_dev_start_time;
    read_dev_host_run_time =  read_dev_host_end_time -  read_dev_host_start_time;

/* save_log_debug(write_host_dev_run_time,fp);
    save_log_debug(read_dev_host_run_time,fp);
    close_log_debug(fp); */

    image_tam_MB = (double) (((double) image_tam)/(double) MEGA_BYTES);

    //==================================================================================================
    save_log_gpu(image_filename, kernel_runtime, (double) (image_tam_MB/( (double) read_dev_host_run_time/(double) NANOSECONDS)),
    (double) (image_tam_MB/ ((double) write_host_dev_run_time/ (double) NANOSECONDS)), tempo_total, LOG_NAME);

    //===================================================================================================
    image_amplitudes = (float*)malloc(n*n*sizeof(float));
    for (i=0; i < n; i++) {
        for (j=0; j < n; j++) {
            image_amplitudes[n*j + i] = (float) (AMP(((float*)image_in_host)[(2*n*j)+2*i], ((float*)image_in_host)[(2*n*j)+2*i+1]));
        }
    }

    //clFlush(cmd_queue);
    //clFinish(cmd_queue);
    opgm.width = n;
    opgm.height = n;

    normalizar_pgm(&opgm, image_amplitudes);
    escrever_pgm(&opgm, output_filename);

    //===================================================================================================
	clFinish(cmd_queue);
    clReleaseKernel(kernel_twiddle_factors);
    clReleaseKernel(kernel_matriz_transpose);
    clReleaseKernel(kernel_lowpass_filter);
    clReleaseProgram(program);
    clReleaseMemObject(image_in_mem);
    clReleaseMemObject(image_out_mem);
    clReleaseMemObject(twiddle_factors_mem);
    clReleaseCommandQueue(cmd_queue);
    clReleaseContext(context);
	clReleaseEvent(read_dev_host_event);
	clReleaseEvent(write_host_dev_event);
	clReleaseEvent(kernels_events_out_fft[0]);
	clReleaseEvent(kernels_events_out_fft[1]);
	clReleaseEvent(kernels_events_out_fft[2]);
	clReleaseEvent(kernels_events_out_fft[3]);
    destruir_pgm(&ipgm);
    destruir_pgm(&opgm);
    free(image_amplitudes);
    free(source_str);
    free(image_in_host);
    free(image_filename);
    free(twiddle_factors_host);
    free(output_filename);
    free(fft_events);
    free(kernel_butter_events);

    //_CrtDumpMemoryLeaks();

    return 0;
}
// Helper function to create and build program and kernel
// *********************************************************************
cl_kernel getReductionKernel(ReduceType datatype, int whichKernel, int blockSize, int isPowOf2)
{
    // compile cl program
    size_t program_length;
    char *source; 

    std::ostringstream preamble;   

    // create the program
    // with type specification depending on datatype argument
    switch (datatype)
    {
    default:
    case REDUCE_INT:
        preamble << "#define T int" << std::endl;
        break;
    case REDUCE_FLOAT:
        preamble << "#define T float" << std::endl;
        break;
    }
    
    // set blockSize at compile time
    preamble << "#define blockSize " << blockSize << std::endl;
    
    // set isPow2 at compile time
    preamble << "#define nIsPow2 " << isPowOf2 << std::endl;
    
    // Load the source code and prepend the preamble
    source = oclLoadProgSource(source_path, preamble.str().c_str(), &program_length);
    oclCheckError(source != NULL, shrTRUE);
    
    cl_program cpProgram = clCreateProgramWithSource(cxGPUContext, 1,(const char **) &source, 
                                                     &program_length, &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);
    free(source);

    // build the program
    ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);
    if (ciErrNum != CL_SUCCESS)
    {
        // write out standard error, Build Log and PTX, then cleanup and exit
        shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
        oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
        oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclReduction.ptx");
        oclCheckError(ciErrNum, CL_SUCCESS); 
    }
    
    // create Kernel    
    std::ostringstream kernelName;
    kernelName << "reduce" << whichKernel;    
    cl_kernel ckKernel = clCreateKernel(cpProgram, kernelName.str().c_str(), &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);

    size_t wgSize;
    ciErrNum = clGetKernelWorkGroupInfo(ckKernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wgSize, NULL);
    if (wgSize == 64) 
      smallBlock = true;
    else smallBlock = false;

    // NOTE: the program will get deleted when the kernel is also released
    clReleaseProgram(cpProgram);
    
    return ckKernel;
}
Beispiel #20
0
/**
 * @brief Main principal
 * @param argc El número de argumentos del programa
 * @param argv Cadenas de argumentos del programa
 * @return Nada si es correcto o algún número negativo si es incorrecto
 */
int main( int argc, char** argv ) {

	if(argc != 2)
		return -1;

	// Medimos tiempo para el programa
	const double start_time = getCurrentTimestamp();

	FILE *kernels;
	char *source_str;
	size_t source_size, work_items;

	// OpenCL runtime configuration
	unsigned num_devices;
	cl_platform_id platform_ids[3];
	cl_uint ret_num_platforms;
	cl_device_id device_id;
	cl_context context = NULL;
	cl_command_queue command_queue;
	cl_program program = NULL;
	cl_int ret;
	cl_kernel kernelINIT;
	cl_event kernel_event, finish_event;
	cl_mem objPARTICULAS;

	// Abrimos el fichero que contiene el kernel
	fopen_s(&kernels, "initparticulasCPU.cl", "r");
	if (!kernels) {
		fprintf(stderr, "Fallo al cargar el kernel\n");
		exit(-1);
	}	
	source_str = (char *) malloc(0x100000);
	source_size = fread(source_str, 1, 0x100000, kernels);
	fclose(kernels);

	// Obtenemos los IDs de las plataformas disponibles
	if( clGetPlatformIDs(3, platform_ids, &ret_num_platforms) != CL_SUCCESS) {
		printf("No se puede obtener id de la plataforma");
		return -1;
	}

	// Intentamos obtener un dispositivo CPU soportado
	if( clGetDeviceIDs(platform_ids[1], CL_DEVICE_TYPE_CPU, 1, &device_id, &num_devices) != CL_SUCCESS) {
		printf("No se puede obtener id del dispositivo");
		return -1;
	}
	clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &work_items, NULL);
 
	// Creación de un contexto OpenCL
	context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
 
	// Creación de una cola de comandos
	command_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &ret);

	// Creación de un programa kernel desde un fichero de código
	program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);
	ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
	if (ret != CL_SUCCESS) {
		size_t len;
		char buffer[2048];
		printf("Error: ¡Fallo al construir el programa ejecutable!\n");
		clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
		printf("%s", buffer);
		exit(-1);
	}

	// Creación del kernel OpenCL
	kernelINIT = clCreateKernel(program, "calc_particles_init", &ret);

	// Creamos el buffer para las partículas y reservamos espacio ALINEADO para los datos
	size_t N = atoi(argv[1]);
	particle *particulas = (particle*) _aligned_malloc(N * sizeof(particle), 64);
	objPARTICULAS = clCreateBuffer(context, CL_MEM_WRITE_ONLY, N * sizeof(particle), NULL, &ret);
	const size_t global = 4;
	const size_t local_work_size = 1;

	// Transferimos el frame al dispositivo
	cl_event write_event;
	ret = clEnqueueWriteBuffer(command_queue, objPARTICULAS, CL_FALSE, 0, N * sizeof(particle), particulas, 0, NULL, &write_event);

	// Establecemos los argumentos del kernel
	ret = clSetKernelArg(kernelINIT, 0, sizeof(cl_mem), &objPARTICULAS);
	ret = clSetKernelArg(kernelINIT, 1, sizeof(int), &N);

	// Ejecutamos el kernel. Un work-item por cada work-group o unidad de cómputo
	ret = clEnqueueNDRangeKernel(command_queue, kernelINIT, 1, NULL, &global, &local_work_size, 1, &write_event, &kernel_event);

	// Leemos los resultados
	ret = clEnqueueReadBuffer(command_queue, objPARTICULAS, CL_FALSE, 0, N * sizeof(particle), particulas, 1, &kernel_event, &finish_event);
	
	// Esperamos a que termine de leer los resultados
	clWaitForEvents(1, &finish_event);

	// Obtenemos el tiempo del kernel y de las transferencias CPU-RAM
	cl_ulong totalKernel = getStartEndTime(kernel_event);
	cl_ulong totalRam = getStartEndTime(write_event) + getStartEndTime(finish_event);

	const double end_time = getCurrentTimestamp();

	// Obtenemos el tiempo consumido por el programa, el kernel y las transferencias de memoria
	printf("\nTiempo total del programa: %0.3f ms\n", (end_time - start_time) * 1e3);
	printf("Tiempo total consumido por el kernel: %0.3f ms\n", double(totalKernel) * 1e-6);
	printf("Tiempo total consumido en transferencias CPU-RAM: %0.3f ms\n", double(totalRam) * 1e-6);

	// Liberamos todos los recursos usados (kernels y objetos OpenCL)
	clReleaseEvent(kernel_event);
	clReleaseEvent(finish_event);
	clReleaseEvent(write_event);
	clReleaseMemObject(objPARTICULAS);
	clReleaseKernel(kernelINIT);
	clReleaseCommandQueue(command_queue);
	clReleaseProgram(program);
	clReleaseContext(context);
}
Beispiel #21
0
int main()
{
  size_t global_work_size[1] = { 1 }, local_work_size[1]= { 1 };
  cl_int err;
  cl_platform_id platforms[1];
  cl_uint nplatforms;
  cl_device_id devices[1]; // + 1 for duplicate test
  cl_uint num_devices;
  cl_program program = NULL;
  cl_kernel kernelA = NULL;
  cl_kernel kernelB = NULL;
  cl_kernel kernelC= NULL;
  char inputA[] = "A";
  char inputB[] = "B";
  char inputC[] = "C";
  cl_mem inputBufferA = NULL;
  cl_mem inputBufferB = NULL;
  cl_mem inputBufferC = NULL;
  /* command queues */
  cl_command_queue queueA = NULL;
  cl_command_queue queueB = NULL;
  cl_command_queue queueC = NULL;
  /* events */
  cl_event eventA1 = NULL;
  cl_event eventB2 = NULL;
  cl_event eventA3 = NULL;
  cl_event eventB4 = NULL;
  /* event wait lists */
  cl_event B2_wait_list[1];
  cl_event A3_wait_list[1];
  cl_event B4_wait_list[1];
  cl_event C5_wait_list[2];

  err = clGetPlatformIDs(1, platforms, &nplatforms);	
  if (err != CL_SUCCESS && !nplatforms)
    return EXIT_FAILURE;
  
  err = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 1,
                       devices, &num_devices);  
  if (err != CL_SUCCESS)
    return EXIT_FAILURE;

  cl_context context = clCreateContext(NULL, num_devices, devices, NULL, 
                                       NULL, &err);
  if (err != CL_SUCCESS)
    return EXIT_FAILURE;

  err = clGetContextInfo(context, CL_CONTEXT_DEVICES,
                         sizeof(cl_device_id), devices, NULL);
  if (err != CL_SUCCESS) 
    {
      puts("clGetContextInfo call failed\n");
      goto error;
    }

  queueA = clCreateCommandQueue(context, devices[0], 0, NULL); 
  if (!queueA) 
    {
      puts("clCreateCommandQueue call failed\n");
      goto error;
    }

  queueB = clCreateCommandQueue(context, devices[0], 0, NULL); 
  if (!queueB) 
    {
      puts("clCreateCommandQueue call failed\n");
      goto error;
    }

  queueC = clCreateCommandQueue(context, devices[0], 0, NULL); 
  if (!queueB) 
    {
      puts("clCreateCommandQueue call failed\n");
      goto error;
    }

  inputBufferA = clCreateBuffer(context, 
                                CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 
                                strlen (inputB)+1, (void *) inputA, &err);
  if (inputBufferA == NULL)
    {
      printf("clCreateBuffer call failed err = %d\n", err);
      goto error;
    }

  inputBufferB = clCreateBuffer(context, 
                                CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 
                                strlen (inputA)+1, (void *) inputB, &err);
  if (inputBufferB == NULL)
    {
      printf("clCreateBuffer call failed err = %d\n", err);
      goto error;
    }
  
  inputBufferC = clCreateBuffer(context, 
                                CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 
                                strlen (inputA)+1, (void *) inputC, &err);
  if (inputBufferC == NULL)
    {
      printf("clCreateBuffer call failed err = %d\n", err);
      goto error;
    }
  
  
  size_t kernel_size = strlen (kernelASourceCode);
  char* kernel_buffer = kernelASourceCode;
  
  program = clCreateProgramWithSource (context, 1, 
                                       (const char**)&kernel_buffer, 
                                       &kernel_size, &err);
  if (err != CL_SUCCESS)
    return EXIT_FAILURE;

  err = clBuildProgram (program, num_devices, devices, NULL, NULL, NULL);
  if (err != CL_SUCCESS)
    return EXIT_FAILURE;

  kernelA = clCreateKernel (program, "test_kernel", NULL); 
  if (!kernelA) 
    {
      puts("clCreateKernel call failed\n");
      goto error;
    }

  kernelB = clCreateKernel (program, "test_kernel", NULL); 
  if (!kernelB) 
    {
      puts("clCreateKernel call failed\n");
      goto error;
    }
  
  kernelC = clCreateKernel (program, "test_kernel", NULL); 
  if (!kernelC) 
    {
      puts("clCreateKernel call failed\n");
      goto error;
    }
  
  err = clSetKernelArg (kernelA, 0, sizeof (cl_mem), &inputBufferA);
  if (err)
    {
      puts("clSetKernelArg failed\n");
      goto error;
    }
 
  err = clSetKernelArg (kernelB, 0, sizeof (cl_mem), &inputBufferB);
  if (err)
    {
      puts("clSetKernelArg failed\n");
      goto error;
    }
  
  err = clSetKernelArg (kernelC, 0, sizeof (cl_mem), &inputBufferC);
  if (err)
    {
      puts("clSetKernelArg failed\n");
      goto error;
    }

    

  /* first enqueue A1*/
  err = clEnqueueNDRangeKernel (queueA, kernelA, 1, NULL, global_work_size, 
                                local_work_size, 0, NULL, &eventA1); 
  if (err != CL_SUCCESS) 
    {
      puts("clEnqueueNDRangeKernel call failed\n");
      goto error;
    }

  /* enqueue B2 */
  B2_wait_list[0] = eventA1;
  err = clEnqueueNDRangeKernel (queueB, kernelB, 1, NULL, global_work_size, 
                                local_work_size, 1, B2_wait_list, &eventB2); 
  if (err != CL_SUCCESS) 
    {
      puts("clEnqueueNDRangeKernel call failed\n");
      goto error;
    }

  /* enqueue A3 */
  A3_wait_list[0] = eventB2;
  err = clEnqueueNDRangeKernel (queueA, kernelA, 1, NULL, global_work_size, 
                                local_work_size, 1, A3_wait_list, &eventA3); 
  if (err != CL_SUCCESS) 
    {
      puts("clEnqueueNDRangeKernel call failed\n");
      goto error;
    }

  /* enqueue B4 */
  B4_wait_list[0] = eventA3;
  err = clEnqueueNDRangeKernel (queueB, kernelB, 1, NULL, global_work_size, 
                                local_work_size, 1, B4_wait_list, &eventB4); 
  if (err != CL_SUCCESS) 
    {
      puts("clEnqueueNDRangeKernel call failed\n");
      goto error;
    }

  /* enqueue C5 */
  C5_wait_list[0] = eventA3;
  C5_wait_list[1] = eventB4;
  err = clEnqueueNDRangeKernel (queueC, kernelC, 1, NULL, global_work_size, 
                                local_work_size, 2, C5_wait_list, NULL); 
  if (err != CL_SUCCESS) 
    {
      puts("clEnqueueNDRangeKernel call failed\n");
      goto error;
    }


  clFinish(queueC);
  printf("\n");
  return EXIT_SUCCESS;

 error:
  return EXIT_FAILURE;

}
Beispiel #22
0
int main()
{
	int i,j,k;
	// nb of operations:
	const int dsize = 512;
	int nthreads = 1;
	int nbOfAverages = 1e2;
	int opsMAC = 2; // operations per MAC
	cl_short4 *in, *out;
	cl_half *ck;
	double tops; //total ops

#define NQUEUES 1
	cl_int err;
	cl_platform_id platform = 0;
	cl_device_id device = 0;
	cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
	cl_context ctx = 0;
	cl_command_queue queues[NQUEUES];
	cl_mem bufin, bufck, bufout;
	cl_event event = NULL;
	cl_program program;
	cl_kernel kernel;
	size_t global[2], local[2];
	size_t param[5];
	char version[300];
  
	// allocate matrices
	
	in = (cl_short4 *) calloc(dsize*dsize, sizeof(*in));
	out = (cl_short4 *) calloc(dsize*dsize, sizeof(*out));
	ck = (cl_half *) calloc(9*9, sizeof(*ck));
	in[0].x = 0x3c00;
	in[1].x = 0x4000;
	in[dsize].x = 0x4100;
	ck[0] = 0x3c00;
	ck[1] = 0x4000;
	ck[9] = 0x3000;

    /* Setup OpenCL environment. */
    err = clGetPlatformIDs( 1, &platform, NULL );
    err = clGetDeviceIDs( platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL );

    props[1] = (cl_context_properties)platform;
    ctx = clCreateContext( props, 1, &device, NULL, NULL, &err );
    for(i = 0; i < NQUEUES; i++)
    	queues[i] = clCreateCommandQueue( ctx, device, 0, &err );

	// Print some info about the system
	clGetDeviceInfo(device, CL_DEVICE_VERSION, sizeof(version), version, NULL);
	printf("CL_DEVICE_VERSION=%s\n", version);
	clGetDeviceInfo(device, CL_DRIVER_VERSION, sizeof(version), version, NULL);
	printf("CL_DRIVER_VERSION=%s\n", version);
	program = clCreateProgramWithSource(ctx, 1, (const char **)&source, NULL, &err);
	clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(param[0]), param, NULL);
	printf("CL_DEVICE_LOCAL_MEM_SIZE=%d\n", (int)param[0]);
	clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(param[0]), param, NULL);
	printf("CL_DEVICE_MAX_WORK_GROUP_SIZE=%d\n", (int)param[0]);
	clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(param[0]), param, NULL);
	printf("CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS=%d\n", (int)param[0]);
	j = param[0];
	clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(param[0])*j, param, NULL);
	printf("CL_DEVICE_MAX_WORK_ITEM_SIZES=");
	for(i = 0; i < j; i++)
		printf("%d ", (int)param[i]);
	printf("\n");
        clGetDeviceInfo(device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(param[0]), param, NULL);
        printf("CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE=%d\n", (int)param[0]);
		
		
	program = clCreateProgramWithSource(ctx, 1, (const char **)&source, NULL, &err);
	if(!program)
	{
		printf("Error creating program\n");
		return -1;
	}
	err = clBuildProgram(program, 0, 0, 0, 0, 0);
	if(err != CL_SUCCESS)
	{
		char buffer[20000];
		size_t len;
		
		clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
		puts(buffer);
		return -1;
	}
	kernel = clCreateKernel(program, "conv9x9", &err);
	if(!kernel || err != CL_SUCCESS)
	{
		printf("Error creating kernel\n");
		return -1;
	}

    /* Prepare OpenCL memory objects and place matrices inside them. */
	cl_image_format fmt = {CL_RGBA, CL_HALF_FLOAT};
	cl_int rc;
	bufin = clCreateImage2D(ctx, CL_MEM_READ_ONLY, &fmt, dsize, dsize, 0, 0, &rc);
	bufout = clCreateImage2D(ctx, CL_MEM_WRITE_ONLY, &fmt, dsize, dsize, 0, 0, &rc);
    bufck = clCreateBuffer( ctx, CL_MEM_READ_ONLY, 9 * 9 * sizeof(*ck),
                          NULL, &err );

	size_t origin[3] = {0,0,0};
	size_t region[3] = {dsize, dsize, 1};
    err = clEnqueueWriteImage(queues[0], bufin, CL_TRUE, origin, region, dsize * sizeof(*in), 0, in, 0, NULL, NULL );
    err = clEnqueueWriteBuffer( queues[0], bufck, CL_TRUE, 0, 9 * 9 * sizeof( *ck ), ck, 0, NULL, NULL );
	clSetKernelArg(kernel, 0, sizeof(int), &dsize);
	clSetKernelArg(kernel, 1, sizeof(cl_mem), &bufin);
	clSetKernelArg(kernel, 2, sizeof(cl_mem), &bufck);
	clSetKernelArg(kernel, 3, sizeof(cl_mem), &bufout);
	local[0] = 8;
	local[1] = 8;
	global[0] = global[1] = dsize-32;
    usleep(100000);

	struct timeval start,end;
	gettimeofday(&start, NULL);

	for (k=0; k<nthreads; k++) {
		//printf("Hello from thread %d, nthreads %d\n", omp_get_thread_num(), omp_get_num_threads());
		for(i=0;i<nbOfAverages;i++) {
		// do the 2D convolution
			err = clEnqueueNDRangeKernel(queues[0], kernel, 2, NULL, global, local, 0, NULL, NULL);
			if(err != CL_SUCCESS)
			{
				printf("clEnqueueNDRangeKernel error %d\n", err);
				return -1;
			}
		}
	}

	clFinish(queues[0]);
	gettimeofday(&end, NULL);
	double t = ((double) (end.tv_sec - start.tv_sec))
	+ ((double) (end.tv_usec - start.tv_usec)) / 1e6; //reports time in [s] - verified!

    /* Wait for calculations to be finished. */

    /* Fetch results of calculations from GPU memory. */
    err = clEnqueueReadImage(queues[0], bufout, CL_TRUE, origin, region, dsize * sizeof(*out), 0, out, 0, NULL, NULL );
	clFinish(queues[0]);
	
	printf("%x %x %x %x\n", out[0].x, out[1].x, out[dsize].x, out[dsize+1].x);

    /* Release OpenCL memory objects. */
    clReleaseMemObject( bufin );
    clReleaseMemObject( bufck );
    clReleaseMemObject( bufout );

    /* Release OpenCL working objects. */
    for(i = 0; i < NQUEUES; i++)
    	clReleaseCommandQueue( queues[i] );
    clReleaseContext( ctx );
	
	// report performance:
	tops = 4 * nthreads * opsMAC * (dsize-32)*(dsize-32)*9*9; // total ops
	printf("Total M ops = %.0lf, # of threads = %d", nbOfAverages*tops*1e-6, nthreads);
	printf("\nTime in s: %lf:", t);
	printf("\nTest performance [G OP/s] %lf:", tops*nbOfAverages/t*1e-9);
	printf("\n");
	return(0);
}
int
MemoryOptimizations::setupCL(void)
{
    cl_int status = 0;
    size_t deviceListSize;

    cl_device_type dType;
    
    if(deviceType.compare("cpu") == 0)
    {
        dType = CL_DEVICE_TYPE_CPU;
    }
    else //deviceType = "gpu" 
    {
        dType = CL_DEVICE_TYPE_GPU;
    }

    /*
     * Have a look at the available platforms and pick either
     * the AMD one if available or a reasonable default.
     */

    cl_uint numPlatforms;
    cl_platform_id platform = NULL;
    status = clGetPlatformIDs(0, NULL, &numPlatforms);
    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS,
                               "clGetPlatformIDs failed."))
    {
        return SDK_FAILURE;
    }
    if (0 < numPlatforms) 
    {
        cl_platform_id* platforms = new cl_platform_id[numPlatforms];
        status = clGetPlatformIDs(numPlatforms, platforms, NULL);
        if(!sampleCommon->checkVal(status,
                                   CL_SUCCESS,
                                   "clGetPlatformIDs failed."))
        {
            return SDK_FAILURE;
        }
        for (unsigned i = 0; i < numPlatforms; ++i) 
        {
            char pbuf[100];
            status = clGetPlatformInfo(platforms[i],
                                       CL_PLATFORM_VENDOR,
                                       sizeof(pbuf),
                                       pbuf,
                                       NULL);

            if(!sampleCommon->checkVal(status,
                                       CL_SUCCESS,
                                       "clGetPlatformInfo failed."))
            {
                return SDK_FAILURE;
            }

            platform = platforms[i];
            if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) 
            {
                break;
            }
        }
        delete[] platforms;
    }

    if(NULL == platform)
    {
        sampleCommon->error("NULL platform found so Exiting Application.");
        return SDK_FAILURE;
    }

    // Display available devices.
    if(!sampleCommon->displayDevices(platform, dType))
    {
        sampleCommon->error("sampleCommon::displayDevices() failed");
        return SDK_FAILURE;
    }

    /*
     * If we could find our platform, use it. Otherwise use just available platform.
     */

    cl_context_properties cps[3] = 
    {
        CL_CONTEXT_PLATFORM, 
        (cl_context_properties)platform, 
        0
    };

    context = clCreateContextFromType(cps,
                                      dType,
                                      NULL,
                                      NULL,
                                      &status);
    if(!sampleCommon->checkVal(status, 
                               CL_SUCCESS,
                               "clCreateContextFromType failed."))
        return SDK_FAILURE;

    /* First, get the size of device list data */
    status = clGetContextInfo(context, 
                              CL_CONTEXT_DEVICES, 
                              0, 
                              NULL, 
                              &deviceListSize);
    if(!sampleCommon->checkVal(status, 
                               CL_SUCCESS,
                               "clGetContextInfo failed."))
        return SDK_FAILURE;

    int deviceCount = (int)(deviceListSize / sizeof(cl_device_id));
    if(!sampleCommon->validateDeviceId(deviceId, deviceCount))
    {
        sampleCommon->error("sampleCommon::validateDeviceId() failed");
        return SDK_FAILURE;
    }

    /* Now allocate memory for device list based on the size we got earlier */
    devices = (cl_device_id*)malloc(deviceListSize);
    if(devices == NULL) 
    {
        sampleCommon->error("Failed to allocate memory (devices).");
        return SDK_FAILURE;
    }

    /* Now, get the device list data */
    status = clGetContextInfo(context, 
                              CL_CONTEXT_DEVICES, 
                              deviceListSize, 
                              devices, 
                              NULL);
    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS, 
                               "clGetGetContextInfo failed."))
        return SDK_FAILURE;


    /* Get Device specific Information */
    /* Get device extensions */
    char deviceExtensions[2048];
    status = clGetDeviceInfo(devices[deviceId], 
                             CL_DEVICE_EXTENSIONS, 
                             sizeof(deviceExtensions), 
                             deviceExtensions, 
                             0);
    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS, 
                               "clGetDeviceInfo failed.(extensions)"))
        return SDK_FAILURE;

    if(!strstr(deviceExtensions, "cl_khr_global_int32_base_atomics"))
    {
        sampleCommon->error("Device does not support global_int32_base_atomics!");
        return SDK_EXPECTED_FAILURE;
    }

    status = clGetDeviceInfo(devices[deviceId],
                             CL_DEVICE_MAX_WORK_GROUP_SIZE,
                             sizeof(size_t),
                             (void *)&maxWorkGroupSize,
                             NULL);

    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS, 
                               "clGetDeviceInfo CL_DEVICE_MAX_WORK_GROUP_SIZE failed."))
        return SDK_FAILURE;


    status = clGetDeviceInfo(devices[deviceId],
                             CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,
                             sizeof(cl_uint),
                             (void *)&maxDimensions,
                             NULL);

    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS, 
                               "clGetDeviceInfo CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS failed."))
        return SDK_FAILURE;


    maxWorkItemSizes = (size_t*)malloc(maxDimensions*sizeof(size_t));
    status = clGetDeviceInfo(devices[deviceId],
                             CL_DEVICE_MAX_WORK_ITEM_SIZES,
                             sizeof(size_t)*maxDimensions,
                             (void *)maxWorkItemSizes,
                             NULL);

    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS, 
                               "clGetDeviceInfo CL_DEVICE_MAX_WORK_ITEM_SIZES failed."))
        return SDK_FAILURE;




    {
        /* The block is to move the declaration of prop closer to its use */
        cl_command_queue_properties prop = 0;
        prop |= CL_QUEUE_PROFILING_ENABLE;

        commandQueue = clCreateCommandQueue(context, 
                                            devices[deviceId], 
                                            prop, 
                                            &status);
        if(!sampleCommon->checkVal(status,
                                   0,
                                   "clCreateCommandQueue failed."))
            return SDK_FAILURE;
    }

    /* Input buffer */
    inputBuffer = clCreateBuffer(context, 
                                 CL_MEM_READ_ONLY,
                                 sizeof(cl_float4) * length,
                                 0, 
                                 &status);
    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS,
                               "clCreateBuffer failed. (inputBuffer)"))
        return SDK_FAILURE;

    /* Write data to buffer */
    status = clEnqueueWriteBuffer(commandQueue,
                                  inputBuffer,
                                  1,
                                  0,
                                  sizeof(cl_float4) * length,
                                  input,
                                  0,
                                  0,
                                  0);
    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS,
                               "clEnqueueWriteBuffer failed. (inputBuffer)"))
        return SDK_FAILURE;


    outputBuffer = clCreateBuffer(context, 
                                  CL_MEM_WRITE_ONLY,
                                  sizeof(cl_float4) * length,
                                  0, 
                                  &status);
    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS,
                               "clCreateBuffer failed. (outputBuffer)"))
        return SDK_FAILURE;

    /* create a CL program using the kernel source */
    streamsdk::SDKFile kernelFile;
    std::string kernelPath = sampleCommon->getPath();

    if(isLoadBinaryEnabled())
    {
        kernelPath.append(loadBinary.c_str());
        if(!kernelFile.readBinaryFromFile(kernelPath.c_str()))
        {
            std::cout << "Failed to load kernel file : " << kernelPath << std::endl;
            return SDK_FAILURE;
        }

        const char * binary = kernelFile.source().c_str();
        size_t binarySize = kernelFile.source().size();
        program = clCreateProgramWithBinary(context,
                                            1,
                                            &devices[deviceId], 
                                            (const size_t *)&binarySize,
                                            (const unsigned char**)&binary,
                                            NULL,
                                            &status);
        if(!sampleCommon->checkVal(status,
                                   CL_SUCCESS,
                                   "clCreateProgramWithBinary failed."))
        {
            return SDK_FAILURE;
        }

    }
    else
    {
        kernelPath.append("MemoryOptimizations_Kernels.cl");
        if(!kernelFile.open(kernelPath.c_str()))
        {
            std::cout << "Failed to load kernel file: " << kernelPath << std::endl;
            return SDK_FAILURE;
        }
        const char * source = kernelFile.source().c_str();
        size_t sourceSize[] = {strlen(source)};
        program = clCreateProgramWithSource(context,
                                            1,
                                            &source,
                                            sourceSize,
                                            &status);
        if(!sampleCommon->checkVal(status,
                                   CL_SUCCESS,
                                   "clCreateProgramWithSource failed."))
            return SDK_FAILURE;
    }

    /* create a cl program executable for all the devices specified */
    status = clBuildProgram(program, 1, &devices[deviceId], NULL, NULL, NULL);
    if(status != CL_SUCCESS)
    {
        if(status == CL_BUILD_PROGRAM_FAILURE)
        {
            cl_int logStatus;
            char *buildLog = NULL;
            size_t buildLogSize = 0;
            logStatus = clGetProgramBuildInfo (program, 
                devices[deviceId], 
                CL_PROGRAM_BUILD_LOG, 
                buildLogSize, 
                buildLog, 
                &buildLogSize);
            if(!sampleCommon->checkVal(
                logStatus,
                CL_SUCCESS,
                "clGetProgramBuildInfo failed."))
                return SDK_FAILURE;

            buildLog = (char*)malloc(buildLogSize);
            if(buildLog == NULL)
            {
                sampleCommon->error("Failed to allocate host memory. (buildLog)");
                return SDK_FAILURE;
            }
            memset(buildLog, 0, buildLogSize);

            logStatus = clGetProgramBuildInfo (program, 
                devices[deviceId], 
                CL_PROGRAM_BUILD_LOG, 
                buildLogSize, 
                buildLog, 
                NULL);
            if(!sampleCommon->checkVal(
                logStatus,
                CL_SUCCESS,
                "clGetProgramBuildInfo failed."))
            {
                free(buildLog);
                return SDK_FAILURE;
            }

            std::cout << " \n\t\t\tBUILD LOG\n";
            std::cout << " ************************************************\n";
            std::cout << buildLog << std::endl;
            std::cout << " ************************************************\n";
            free(buildLog);
        }

        if(!sampleCommon->checkVal(
            status,
            CL_SUCCESS,
            "clBuildProgram failed."))
            return SDK_FAILURE;
    }

    /* Copy 1D Fast Path */
    kernel[0] = clCreateKernel(program, "copy1DFastPath", &status);
    if(!sampleCommon->checkVal(
            status,
            CL_SUCCESS,
            "clCreateKernel failed.(copy1DFastPath)"))
        return SDK_FAILURE;

    /* Copy 1D Complete Path */
    kernel[1] = clCreateKernel(program, "copy1DCompletePath", &status);
    if(!sampleCommon->checkVal(
            status,
            CL_SUCCESS,
            "clCreateKernel failed. (copy1DCompletePath)"))
        return SDK_FAILURE;

    /* Copy 2D float */
    kernel[2] = clCreateKernel(program, "copy2Dfloat", &status);
    if(!sampleCommon->checkVal(
            status,
            CL_SUCCESS,
            "clCreateKernel failed. (copy2Dfloat)"))
        return SDK_FAILURE;

    /* Copy 2D float4 */
    kernel[3] = clCreateKernel(program, "copy2Dfloat4", &status);
    if(!sampleCommon->checkVal(
            status,
            CL_SUCCESS,
            "clCreateKernel failed. (copy2Dfloat4)"))
        return SDK_FAILURE;

    /* Copy 1D float4 */
    kernel[4] = clCreateKernel(program, "copy1Dfloat4", &status);
    if(!sampleCommon->checkVal(
            status,
            CL_SUCCESS,
            "clCreateKernel failed. (copy1Dfloat4)"))
        return SDK_FAILURE;

    /* Copy No Coalesced */
    kernel[5] = clCreateKernel(program, "NoCoal", &status);
    if(!sampleCommon->checkVal(
            status,
            CL_SUCCESS,
            "clCreateKernel failed. (NoCoal)"))
        return SDK_FAILURE;

    /* Copy Split */
    kernel[6] = clCreateKernel(program, "Split", &status);
    if(!sampleCommon->checkVal(
            status,
            CL_SUCCESS,
            "clCreateKernel failed. (Split)"))
        return SDK_FAILURE;

    return SDK_SUCCESS;
}
int CommandGenerate::execute(const std::vector<std::string>& p_args) {
	if(p_args.size() < 10) {
		help();
		return -1;
	}

	unsigned int platformId = atol(p_args[1].c_str());
	unsigned int deviceId = atol(p_args[2].c_str());
	unsigned int staggerSize = atol(p_args[3].c_str());
	unsigned int threadsNumber = atol(p_args[4].c_str());
	unsigned int hashesNumber = atol(p_args[5].c_str());
	unsigned int nonceSize = PLOT_SIZE * staggerSize;

	std::cerr << "Threads number: " << threadsNumber << std::endl;
	std::cerr << "Hashes number: " << hashesNumber << std::endl;

	unsigned int numjobs = (p_args.size() - 5)/4;
	std::cerr << numjobs << " plot(s) to do." << std::endl;
	unsigned int staggerMbSize = staggerSize / 4;
	std::cerr << "Non-GPU memory usage: " << staggerMbSize*numjobs << "MB" << std::endl;
	
	std::vector<std::string> paths(numjobs);
	std::vector<std::ofstream *> out_files(numjobs);
	std::vector<unsigned long long> addresses(numjobs);
	std::vector<unsigned long long> startNonces(numjobs);
	std::vector<unsigned long long> endNonces(numjobs);
	std::vector<unsigned int> noncesNumbers(numjobs);
	std::vector<unsigned char*> buffersCpu(numjobs);
	std::vector<bool> saving_thread_flags(numjobs);
	std::vector<std::future<void>> save_threads(numjobs);
	unsigned long long maxNonceNumber = 0;
	unsigned long long totalNonces = 0;

	int returnCode = 0;

	try {
		for (unsigned int i = 0; i < numjobs; i++) {
			std::cerr << "----" << std::endl;
			std::cerr << "Job number " << i << std::endl;
			unsigned int argstart = 6 + i*4;
			paths[i] = std::string(p_args[argstart]);
			addresses[i] = strtoull(p_args[argstart+1].c_str(), NULL, 10);
			startNonces[i] = strtoull(p_args[argstart+2].c_str(), NULL, 10);
			noncesNumbers[i] = atol(p_args[argstart+3].c_str());
			maxNonceNumber = std::max(maxNonceNumber, (long long unsigned int)noncesNumbers[i]);
			totalNonces += noncesNumbers[i];

			std::ostringstream outFile;
			outFile << paths[i] << "/" << addresses[i] << "_" << startNonces[i] << "_" << \
				noncesNumbers[i] << "_" << staggerSize;
			std::ios_base::openmode file_mode = std::ios::out | std::ios::binary | std::ios::trunc;
			out_files[i] = new std::ofstream(outFile.str(), file_mode);
			assert(out_files[i]);

			if(noncesNumbers[i] % staggerSize != 0) {
				noncesNumbers[i] -= noncesNumbers[i] % staggerSize;
				noncesNumbers[i] += staggerSize;
			}

			endNonces[i] = startNonces[i] + noncesNumbers[i];
			unsigned int noncesGbSize = noncesNumbers[i] / 4 / 1024;
			std::cerr << "Path: " << outFile.str() << std::endl;
			std::cerr << "Nonces: " << startNonces[i] << " to " << endNonces[i] << " (" << noncesGbSize << " GB)" << std::endl;
			std::cerr << "Creating CPU buffer" << std::endl;
			buffersCpu[i] = new unsigned char[nonceSize];
			if(!buffersCpu[i]) {
				throw std::runtime_error("Unable to create the CPU buffer (probably out of host memory.)");
			}
			saving_thread_flags[i] = false;
			std::cerr << "----" << std::endl;
		}

		cl_platform_id platforms[4];
		cl_uint platformsNumber;
		cl_device_id devices[32];
		cl_uint devicesNumber;
		cl_context context = 0;
		cl_command_queue commandQueue = 0;
		cl_mem bufferGpuGen = 0;
		cl_mem bufferGpuScoops = 0;
		cl_program program = 0;
		cl_kernel kernelStep1 = 0;
		cl_kernel kernelStep2 = 0;
		cl_kernel kernelStep3 = 0;

		int error;

		std::cerr << "Retrieving OpenCL platforms" << std::endl;
		error = clGetPlatformIDs(4, platforms, &platformsNumber);
		if(error != CL_SUCCESS) {
			throw OpenclError(error, "Unable to retrieve the OpenCL platforms");
		}

		if(platformId >= platformsNumber) {
			throw std::runtime_error("No platform found with the provided id");
		}

		std::cerr << "Retrieving OpenCL GPU devices" << std::endl;
		error = clGetDeviceIDs(platforms[platformId], CL_DEVICE_TYPE_CPU | CL_DEVICE_TYPE_GPU, 32, devices, &devicesNumber);
		if(error != CL_SUCCESS) {
			throw OpenclError(error, "Unable to retrieve the OpenCL devices");
		}

		if(deviceId >= devicesNumber) {
			throw std::runtime_error("No device found with the provided id");
		}

		std::cerr << "Creating OpenCL context" << std::endl;
		context = clCreateContext(0, 1, &devices[deviceId], NULL, NULL, &error);
		if(error != CL_SUCCESS) {
			throw OpenclError(error, "Unable to create the OpenCL context");
		}

		std::cerr << "Creating OpenCL command queue" << std::endl;
		commandQueue = clCreateCommandQueue(context, devices[deviceId], 0, &error);
		if(error != CL_SUCCESS) {
			throw OpenclError(error, "Unable to create the OpenCL command queue");
		}

		std::cerr << "Creating OpenCL GPU generation buffer" << std::endl;
		bufferGpuGen = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_uchar) * GEN_SIZE * staggerSize, 0, &error);
		if(error != CL_SUCCESS) {
			throw OpenclError(error, "Unable to create the OpenCL GPU generation buffer");
		}

		std::cerr << "Creating OpenCL GPU scoops buffer" << std::endl;
		bufferGpuScoops = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_uchar) * nonceSize, 0, &error);
		if(error != CL_SUCCESS) {
			throw OpenclError(error, "Unable to create the OpenCL GPU scoops buffer");
		}

		std::cerr << "Creating OpenCL program" << std::endl;
		std::string source = loadSource("kernel/nonce.cl");
		const char* sources[] = {source.c_str()};
		size_t sourcesLength[] = {source.length()};
		program = clCreateProgramWithSource(context, 1, sources, sourcesLength, &error);
		if(error != CL_SUCCESS) {
			throw OpenclError(error, "Unable to create the OpenCL program");
		}

		std::cerr << "Building OpenCL program" << std::endl;
		error = clBuildProgram(program, 1, &devices[deviceId], "-I kernel", 0, 0);
		if(error != CL_SUCCESS) {
			size_t logSize;
			clGetProgramBuildInfo(program, devices[deviceId], CL_PROGRAM_BUILD_LOG, 0, 0, &logSize);

			char* log = new char[logSize];
			clGetProgramBuildInfo(program, devices[deviceId], CL_PROGRAM_BUILD_LOG, logSize, (void*)log, 0);
			std::cerr << log << std::endl;
			delete[] log;

			throw OpenclError(error, "Unable to build the OpenCL program");
		}

		std::cerr << "Creating OpenCL step1 kernel" << std::endl;
		kernelStep1 = clCreateKernel(program, "nonce_step1", &error);
		if(error != CL_SUCCESS) {
			throw OpenclError(error, "Unable to create the OpenCL kernel");
		}

		std::cerr << "Setting OpenCL step1 kernel static arguments" << std::endl;
		error = clSetKernelArg(kernelStep1, 2, sizeof(cl_mem), (void*)&bufferGpuGen);
		if(error != CL_SUCCESS) {
			throw OpenclError(error, "Unable to set the OpenCL kernel arguments");
		}

		std::cerr << "Creating OpenCL step2 kernel" << std::endl;
		kernelStep2 = clCreateKernel(program, "nonce_step2", &error);
		if(error != CL_SUCCESS) {
			throw OpenclError(error, "Unable to create the OpenCL kernel");
		}

		std::cerr << "Setting OpenCL step2 kernel static arguments" << std::endl;
		error = clSetKernelArg(kernelStep2, 1, sizeof(cl_mem), (void*)&bufferGpuGen);
		if(error != CL_SUCCESS) {
			throw OpenclError(error, "Unable to set the OpenCL kernel arguments");
		}

		std::cerr << "Creating OpenCL step3 kernel" << std::endl;
		kernelStep3 = clCreateKernel(program, "nonce_step3", &error);
		if(error != CL_SUCCESS) {
			throw OpenclError(error, "Unable to create the OpenCL kernel");
		}

		std::cerr << "Setting OpenCL step3 kernel static arguments" << std::endl;
		error = clSetKernelArg(kernelStep3, 0, sizeof(cl_uint), (void*)&staggerSize);
		error = clSetKernelArg(kernelStep3, 1, sizeof(cl_mem), (void*)&bufferGpuGen);
		error = clSetKernelArg(kernelStep3, 2, sizeof(cl_mem), (void*)&bufferGpuScoops);
		if(error != CL_SUCCESS) {
			throw OpenclError(error, "Unable to set the OpenCL kernel arguments");
		}

		size_t globalWorkSize = staggerSize;
		size_t localWorkSize = (staggerSize < threadsNumber) ? staggerSize : threadsNumber;
		time_t startTime = time(0);
		unsigned int totalNoncesCompleted = 0;
		for (unsigned long long nonce_ordinal = 0; nonce_ordinal < maxNonceNumber; nonce_ordinal += staggerSize) {
			for (unsigned int jobnum = 0; jobnum < paths.size(); jobnum += 1) {
				unsigned long long nonce = startNonces[jobnum] + nonce_ordinal;
				if (nonce > endNonces[jobnum]) {
				  break;
				}

				std::cout << "Running with start nonce " << nonce << std::endl;
				// Is a cl_ulong always an unsigned long long?
				unsigned int error = 0;
				error = clSetKernelArg(kernelStep1, 0, sizeof(cl_ulong), (void*)&addresses[jobnum]);
				if(error != CL_SUCCESS) {
					throw OpenclError(error, "Unable to set the OpenCL step1 kernel arguments");
				}
				error = clSetKernelArg(kernelStep1, 1, sizeof(cl_ulong), (void*)&nonce);
				if(error != CL_SUCCESS) {
					throw OpenclError(error, "Unable to set the OpenCL step1 kernel arguments");
				}

				error = clEnqueueNDRangeKernel(commandQueue, kernelStep1, 1, 0, &globalWorkSize, &localWorkSize, 0, 0, 0);
				if(error != CL_SUCCESS) {
					throw OpenclError(error, "Error in step1 kernel launch");
				}

				unsigned int hashesSize = hashesNumber * HASH_SIZE;
				for(int hashesOffset = PLOT_SIZE ; hashesOffset > 0 ; hashesOffset -= hashesSize) {
					error = clSetKernelArg(kernelStep2, 0, sizeof(cl_ulong), (void*)&nonce);
					error = clSetKernelArg(kernelStep2, 2, sizeof(cl_uint), (void*)&hashesOffset);
					error = clSetKernelArg(kernelStep2, 3, sizeof(cl_uint), (void*)&hashesNumber);
					if(error != CL_SUCCESS) {
						throw OpenclError(error, "Unable to set the OpenCL step2 kernel arguments");
					}

					error = clEnqueueNDRangeKernel(commandQueue, kernelStep2, 1, 0, &globalWorkSize, &localWorkSize, 0, 0, 0);
					if(error != CL_SUCCESS) {
						throw OpenclError(error, "Error in step2 kernel launch");
					}

					error = clFinish(commandQueue);
					if(error != CL_SUCCESS) {
						throw OpenclError(error, "Error in step2 kernel finish");
					}
				}

				totalNoncesCompleted += staggerSize;
				double percent = 100.0 * (double)totalNoncesCompleted / totalNonces;
				time_t currentTime = time(0);
				double speed = (double)totalNoncesCompleted / difftime(currentTime, startTime) * 60.0;
				double estimatedTime = (double)(totalNonces - totalNoncesCompleted) / speed;
				std::cerr << "\r" << percent << "% (" << totalNoncesCompleted << "/" << totalNonces << " nonces)";
				std::cerr << ", " << speed << " nonces/minutes";
				std::cerr << ", ETA: " << ((int)estimatedTime / 60) << "h" << ((int)estimatedTime % 60) << "m" << ((int)(estimatedTime * 60.0) % 60) << "s";
				std::cerr << "...                    ";

				error = clEnqueueNDRangeKernel(commandQueue, kernelStep3, 1, 0, &globalWorkSize, &localWorkSize, 0, 0, 0);
				if(error != CL_SUCCESS) {
					throw OpenclError(error, "Error in step3 kernel launch");
				}

				if (saving_thread_flags[jobnum]) {
					save_threads[jobnum].wait(); // Wait for last job to finish
					saving_thread_flags[jobnum] = false;
				}

				error = clEnqueueReadBuffer(commandQueue, bufferGpuScoops, CL_TRUE, 0, sizeof(cl_uchar) * nonceSize, buffersCpu[jobnum], 0, 0, 0);
				if(error != CL_SUCCESS) {
					throw OpenclError(error, "Error in synchronous read");
				}
				saving_thread_flags[jobnum] = true;
				save_threads[jobnum] = std::async(std::launch::async, save_nonces, nonceSize, out_files[jobnum], buffersCpu[jobnum]);
			}
		}

		//Clean up
		for (unsigned int i = 0; i < paths.size(); i += 1) {
		  if (saving_thread_flags[i]) {
		    std::cerr << "waiting for final save to " << paths[i] << " to finish" << std::endl;
		    save_threads[i].wait();
		    saving_thread_flags[i] = false;
		    std::cerr << "done waiting for final save" << std::endl;
		    if (buffersCpu[i]) {
		      delete[] buffersCpu[i];
		    }
		  }
		}
		
		if(kernelStep3) { clReleaseKernel(kernelStep3); }
		if(kernelStep2) { clReleaseKernel(kernelStep2); }
		if(kernelStep1) { clReleaseKernel(kernelStep1); }
		if(program) { clReleaseProgram(program); }
		if(bufferGpuGen) { clReleaseMemObject(bufferGpuGen); }
		if(bufferGpuScoops) { clReleaseMemObject(bufferGpuScoops); }
		if(commandQueue) { clReleaseCommandQueue(commandQueue); }
		if(context) { clReleaseContext(context); }


		time_t currentTime = time(0);
		double elapsedTime = difftime(currentTime, startTime) / 60.0;
		double speed = (double)totalNonces / elapsedTime;
		std::cerr << "\r100% (" << totalNonces << "/" << totalNonces << " nonces)";
		std::cerr << ", " << speed << " nonces/minutes";
		std::cerr << ", " << ((int)elapsedTime / 60) << "h" << ((int)elapsedTime % 60) << "m" << ((int)(elapsedTime * 60.0) % 60) << "s";
		std::cerr << "                    " << std::endl;
	} catch(const OpenclError& ex) {
		std::cerr << "[ERROR] [" << ex.getCode() << "] " << ex.what() << std::endl;
		returnCode = -1;
	} catch(const std::exception& ex) {
		std::cerr << "[ERROR] " << ex.what() << std::endl;
		returnCode = -1;
	}
	return returnCode;
}
Beispiel #25
0
int main(int argc, char* argv[]) {
  struct pb_Parameters *parameters;

  parameters = pb_ReadParameters(&argc, argv);
  if (!parameters)
    return -1;

  if(!parameters->inpFiles[0]){
    fputs("Input file expected\n", stderr);
    return -1;
  }

  
  struct pb_TimerSet timers;
  
  char oclOverhead[] = "OCL Overhead";
  char intermediates[] = "IntermediatesKernel";
  char finals[] = "FinalKernel";

  pb_InitializeTimerSet(&timers);
  
  pb_AddSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL);
  pb_AddSubTimer(&timers, intermediates, pb_TimerID_KERNEL);
  pb_AddSubTimer(&timers, finals, pb_TimerID_KERNEL);
    
  pb_SwitchToTimer(&timers, pb_TimerID_IO);
  
  int numIterations;
  if (argc >= 2){
    numIterations = atoi(argv[1]);
  } else {
    fputs("Expected at least one command line argument\n", stderr);
    return -1;
  }

  unsigned int img_width, img_height;
  unsigned int histo_width, histo_height;

  FILE* f = fopen(parameters->inpFiles[0],"rb");
  int result = 0;

  result += fread(&img_width,    sizeof(unsigned int), 1, f);
  result += fread(&img_height,   sizeof(unsigned int), 1, f);
  result += fread(&histo_width,  sizeof(unsigned int), 1, f);
  result += fread(&histo_height, sizeof(unsigned int), 1, f);

  if (result != 4){
    fputs("Error reading input and output dimensions from file\n", stderr);
    return -1;
  }

  unsigned int* img = (unsigned int*) malloc (img_width*img_height*sizeof(unsigned int));
  unsigned char* histo = (unsigned char*) calloc (histo_width*histo_height, sizeof(unsigned char));

  result = fread(img, sizeof(unsigned int), img_width*img_height, f);

  fclose(f);

  if (result != img_width*img_height){
    fputs("Error reading input array from file\n", stderr);
    return -1;
  }

  cl_int ciErrNum;
  pb_Context* pb_context;
  pb_context = pb_InitOpenCLContext(parameters);
  if (pb_context == NULL) {
    fprintf (stderr, "Error: No OpenCL platform/device can be found."); 
    return -1;
  }

  cl_device_id clDevice = (cl_device_id) pb_context->clDeviceId;
  cl_platform_id clPlatform = (cl_platform_id) pb_context->clPlatformId;
  cl_context clContext = (cl_context) pb_context->clContext;
  cl_command_queue clCommandQueue;
  
  cl_program clProgram[2];
  
  cl_kernel histo_intermediates_kernel;
  cl_kernel histo_final_kernel;
  
  cl_mem input;
  cl_mem ranges;
  cl_mem sm_mappings;
  cl_mem global_subhisto;
  cl_mem global_overflow;
  cl_mem final_histo;
  
  clCommandQueue = clCreateCommandQueue(clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
  OCL_ERRCK_VAR(ciErrNum);
  
  pb_SetOpenCL(&clContext, &clCommandQueue);
  pb_SwitchToSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL);

  cl_uint workItemDimensions;
  OCL_ERRCK_RETVAL( clGetDeviceInfo(clDevice, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint), &workItemDimensions, NULL) );
  
  size_t workItemSizes[workItemDimensions];
  OCL_ERRCK_RETVAL( clGetDeviceInfo(clDevice, CL_DEVICE_MAX_WORK_ITEM_SIZES, workItemDimensions*sizeof(size_t), workItemSizes, NULL) );
  
  size_t program_length[2];
  const char *source_path[2] = { 
    "src/opencl_mxpa/histo_intermediates.cl", 
   "src/opencl_mxpa/histo_final.cl"};
  char *source[4];

  for (int i = 0; i < 2; ++i) {
    // Dynamically allocate buffer for source
    source[i] = oclLoadProgSource(source_path[i], "", &program_length[i]);
    if(!source[i]) {
      fprintf(stderr, "Could not load program source\n"); exit(1);
    }
  	
  	clProgram[i] = clCreateProgramWithSource(clContext, 1, (const char **)&source[i], &program_length[i], &ciErrNum);
  	OCL_ERRCK_VAR(ciErrNum);
  	  	
  	free(source[i]);
  }
  	
  	  	  	  	  	  	  	
  for (int i = 0; i < 2; ++i) {
    //fprintf(stderr, "Building Program #%d...\n", i);
    OCL_ERRCK_RETVAL ( clBuildProgram(clProgram[i], 1, &clDevice, NULL, NULL, NULL) );
       
    /*
       char *build_log;
       size_t ret_val_size;
       ciErrNum = clGetProgramBuildInfo(clProgram[i], clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);	OCL_ERRCK_VAR(ciErrNum);
       build_log = (char *)malloc(ret_val_size+1);
       ciErrNum = clGetProgramBuildInfo(clProgram[i], clDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
       	OCL_ERRCK_VAR(ciErrNum);
       	

       // to be carefully, terminate with \0
       // there's no information in the reference whether the string is 0 terminated or not
       build_log[ret_val_size] = '\0';

       fprintf(stderr, "%s\n", build_log );
     */
  }
  	
  histo_intermediates_kernel = clCreateKernel(clProgram[0], "histo_intermediates_kernel", &ciErrNum);
  OCL_ERRCK_VAR(ciErrNum);
  histo_final_kernel = clCreateKernel(clProgram[1], "histo_final_kernel", &ciErrNum);
  OCL_ERRCK_VAR(ciErrNum);
  
  pb_SwitchToTimer(&timers, pb_TimerID_COPY);  

  input =           clCreateBuffer(clContext, CL_MEM_READ_WRITE, 
      img_width*img_height*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum);
  ranges =          clCreateBuffer(clContext, CL_MEM_READ_WRITE, 2*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum);  
  sm_mappings =     clCreateBuffer(clContext, CL_MEM_READ_WRITE, img_width*img_height*4*sizeof(unsigned char), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum);
  global_subhisto = clCreateBuffer(clContext, CL_MEM_READ_WRITE, histo_width*histo_height*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum);
  global_overflow = clCreateBuffer(clContext, CL_MEM_READ_WRITE, histo_width*histo_height*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum);
  final_histo =     clCreateBuffer(clContext, CL_MEM_READ_WRITE, histo_width*histo_height*sizeof(unsigned char), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum);

  // Must dynamically allocate. Too large for stack
  unsigned int *zeroData;
  zeroData = (unsigned int *) calloc(img_width*histo_height, sizeof(unsigned int));
  if (zeroData == NULL) {
    fprintf(stderr, "Failed to allocate %ld bytes of memory on host!\n", sizeof(unsigned int) * img_width * histo_height);
    exit(1);
  }
   
  for (int y=0; y < img_height; y++){
    OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, input, CL_TRUE, 
                          y*img_width*sizeof(unsigned int), // Offset in bytes
                          img_width*sizeof(unsigned int), // Size of data to write
                          &img[y*img_width], // Host Source
                          0, NULL, NULL) );
  }
 
  pb_SwitchToSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL);

  unsigned int img_dim = img_height*img_width;
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 0, sizeof(cl_mem), (void *)&input) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 1, sizeof(unsigned int), &img_width) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 2, sizeof(cl_mem), (void *)&global_subhisto) );
  
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 0, sizeof(unsigned int), &histo_height) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 1, sizeof(unsigned int), &histo_width) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 2, sizeof(cl_mem), (void *)&global_subhisto) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 3, sizeof(cl_mem), (void *)&final_histo) );

  size_t inter_localWS[1] = { workItemSizes[0] };
  size_t inter_globalWS[1] = { img_height * inter_localWS[0] };
  
  size_t final_localWS[1] = { workItemSizes[0] };
  size_t final_globalWS[1] = {(((int)(histo_height*histo_width+(final_localWS[0]-1))) /
                                          (int)final_localWS[0])*(int)final_localWS[0] };
  
  pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);

  for (int iter = 0; iter < numIterations; iter++) {
    unsigned int ranges_h[2] = {UINT32_MAX, 0};
    
    // how about something like
    // __global__ unsigned int ranges[2];
    // ...kernel
    // __shared__ unsigned int s_ranges[2];
    // if (threadIdx.x == 0) {s_ranges[0] = ranges[0]; s_ranges[1] = ranges[1];}
    // __syncthreads();
    
    // Although then removing the blocking cudaMemcpy's might cause something about
    // concurrent kernel execution.
    // If kernel launches are synchronous, then how can 2 kernels run concurrently? different host threads?


  OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, ranges, CL_TRUE, 
                          0, // Offset in bytes
                          2*sizeof(unsigned int), // Size of data to write
                          ranges_h, // Host Source
                          0, NULL, NULL) );
                          
  OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, global_subhisto, CL_TRUE, 
                          0, // Offset in bytes
                          histo_width*histo_height*sizeof(unsigned int), // Size of data to write
                          zeroData, // Host Source
                          0, NULL, NULL) );
                          
  pb_SwitchToSubTimer(&timers, intermediates, pb_TimerID_KERNEL);

  OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, histo_intermediates_kernel /*histo_intermediates_kernel*/, 1, 0,
                            inter_globalWS, inter_localWS, 0, 0, 0) );              
  pb_SwitchToSubTimer(&timers, finals, pb_TimerID_KERNEL);                            
  OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, histo_final_kernel, 1, 0,
                            final_globalWS, final_localWS, 0, 0, 0) );                           
  }

  pb_SwitchToTimer(&timers, pb_TimerID_IO);

  OCL_ERRCK_RETVAL( clEnqueueReadBuffer(clCommandQueue, final_histo, CL_TRUE, 
                          0, // Offset in bytes
                          histo_height*histo_width*sizeof(unsigned char), // Size of data to read
                          histo, // Host Source
                          0, NULL, NULL) );                         

  OCL_ERRCK_RETVAL ( clReleaseKernel(histo_intermediates_kernel) );
  OCL_ERRCK_RETVAL ( clReleaseKernel(histo_final_kernel) );
  OCL_ERRCK_RETVAL ( clReleaseProgram(clProgram[0]) );
  OCL_ERRCK_RETVAL ( clReleaseProgram(clProgram[1]) );
  
  OCL_ERRCK_RETVAL ( clReleaseMemObject(input) );
  OCL_ERRCK_RETVAL ( clReleaseMemObject(ranges) );
  OCL_ERRCK_RETVAL ( clReleaseMemObject(sm_mappings) );
  OCL_ERRCK_RETVAL ( clReleaseMemObject(global_subhisto) );
  OCL_ERRCK_RETVAL ( clReleaseMemObject(global_overflow) );
  OCL_ERRCK_RETVAL ( clReleaseMemObject(final_histo) );

  if (parameters->outFile) {
    dump_histo_img(histo, histo_height, histo_width, parameters->outFile);
  }

  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);

  free(zeroData);
  free(img);
  free(histo);

  pb_SwitchToTimer(&timers, pb_TimerID_NONE);

  printf("\n");
  pb_PrintTimerSet(&timers);
  pb_FreeParameters(parameters);
  
  pb_DestroyTimerSet(&timers);

  OCL_ERRCK_RETVAL ( clReleaseCommandQueue(clCommandQueue) );
  OCL_ERRCK_RETVAL ( clReleaseContext(clContext) );

  return 0;
}
Beispiel #26
0
int32_t init_kernel_platform() {

	cl_uint plat_num;
	cl_platform_id plat_id = NULL;
	cl_uint dev_num = 0;
	cl_device_id *devices;

	ret = clGetPlatformIDs(0, NULL, &plat_num);
	if (ret < 0) {
		LOGD("MU1 Error: Getting plat_ids!\n");
		return -1;
	}

	if(plat_num > 0)
	{
		cl_platform_id* plat_ids = (cl_platform_id* )malloc(plat_num* sizeof(cl_platform_id));
		ret = clGetPlatformIDs(plat_num, plat_ids, NULL);
		plat_id = plat_ids[0];
		free(plat_ids);
	}

	ret = clGetDeviceIDs(plat_id, CL_DEVICE_TYPE_GPU, 0, NULL, &dev_num);	
	if (dev_num == 0) {
		LOGD("MU1: No GPU device available.\n");
		LOGD("MU1: Choose CPU as default device.\n");
		ret = clGetDeviceIDs(plat_id, CL_DEVICE_TYPE_CPU, 0, NULL, &dev_num);	
		devices = (cl_device_id*)malloc(dev_num * sizeof(cl_device_id));
		ret = clGetDeviceIDs(plat_id, CL_DEVICE_TYPE_CPU, dev_num, devices, NULL);
	} else {
		LOGD("MU1: Choose GPU as default device. dev_num %d\n", dev_num);
		devices = (cl_device_id*)malloc(dev_num * sizeof(cl_device_id));
		ret = clGetDeviceIDs(plat_id, CL_DEVICE_TYPE_GPU, dev_num, devices, NULL);
	}
	
	context = clCreateContext(NULL,1, devices,NULL,NULL,NULL);

	commandQueue = clCreateCommandQueue(context, devices[0], 0, NULL);
    
	char filename[] = "/data/mu1_kernel.cl";
	char file_context[10*1024]={0};
	const char *source = &file_context[0];

    ret = read_cl(filename, &file_context[0]);

	size_t sourceSize[10] = {strlen(source)};
	cl_program program = clCreateProgramWithSource(context, 1, &source, &sourceSize[0], NULL);
	
	ret = clBuildProgram(program, 1, devices, NULL, NULL, NULL);
    if(ret < 0) {
        LOGD("MU1 Error: clBuildProgram error\n");
        return 0;
    }

	kernel = clCreateKernel(program, "process_iq", NULL);

	inputBuffer_i = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 
            512*1024*4, (void *)(&table_i[0][0]), NULL);
	inputBuffer_q = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 
            512*1024*4, (void *)(&table_q[0][0]), NULL);
	inputBuffer_o = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, 
            512*1024*4, (void *)(&table_o[0][0]), NULL);


	ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&inputBuffer_i);
	ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&inputBuffer_q);
	ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&inputBuffer_o);



    if(devices != NULL) { free(devices);}

    LOGD("MU1: init cl plat success");
    return 0;
}
int main(int argc, char const *argv[])
{
        /* Get platform */
        cl_platform_id platform;
        cl_uint num_platforms;
        cl_int ret = clGetPlatformIDs(1, &platform, &num_platforms);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clGetPlatformIDs' failed\n");
                exit(1);
        }
        
        printf("Number of platforms: %d\n", num_platforms);
        printf("platform=%p\n", platform);
        
        /* Get platform name */
        char platform_name[100];
        ret = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_name), platform_name, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clGetPlatformInfo' failed\n");
                exit(1);
        }
        
        printf("platform.name='%s'\n\n", platform_name);
        
        /* Get device */
        cl_device_id device;
        cl_uint num_devices;
        ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, &num_devices);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clGetDeviceIDs' failed\n");
                exit(1);
        }
        
        printf("Number of devices: %d\n", num_devices);
        printf("device=%p\n", device);
        
        /* Get device name */
        char device_name[100];
        ret = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name),
        device_name, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clGetDeviceInfo' failed\n");
                exit(1);
        }
        
        printf("device.name='%s'\n", device_name);
        printf("\n");
        
        /* Create a Context Object */
        cl_context context;
        context = clCreateContext(NULL, 1, &device, NULL, NULL, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clCreateContext' failed\n");
                exit(1);
        }
        
        printf("context=%p\n", context);
        
        /* Create a Command Queue Object*/
        cl_command_queue command_queue;
        command_queue = clCreateCommandQueue(context, device, 0, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clCreateCommandQueue' failed\n");
                exit(1);
        }
        
        printf("command_queue=%p\n", command_queue);
        printf("\n");

        /* Program source */
        unsigned char *source_code;
        size_t source_length;

        /* Read program from 'add_sat_ulong8ulong8.cl' */
        source_code = read_buffer("add_sat_ulong8ulong8.cl", &source_length);

        /* Create a program */
        cl_program program;
        program = clCreateProgramWithSource(context, 1, (const char **)&source_code, &source_length, &ret);

        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clCreateProgramWithSource' failed\n");
                exit(1);
        }
        printf("program=%p\n", program);

        /* Build program */
        ret = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
        if (ret != CL_SUCCESS )
        {
                size_t size;
                char *log;

                /* Get log size */
                clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,0, NULL, &size);

                /* Allocate log and print */
                log = malloc(size);
                clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,size, log, NULL);
                printf("error: call to 'clBuildProgram' failed:\n%s\n", log);
                
                /* Free log and exit */
                free(log);
                exit(1);
        }

        printf("program built\n");
        printf("\n");
        
        /* Create a Kernel Object */
        cl_kernel kernel;
        kernel = clCreateKernel(program, "add_sat_ulong8ulong8", &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clCreateKernel' failed\n");
                exit(1);
        }
        
        /* Create and allocate host buffers */
        size_t num_elem = 10;
        
        /* Create and init host side src buffer 0 */
        cl_ulong8 *src_0_host_buffer;
        src_0_host_buffer = malloc(num_elem * sizeof(cl_ulong8));
        for (int i = 0; i < num_elem; i++)
                src_0_host_buffer[i] = (cl_ulong8){{2, 2, 2, 2, 2, 2, 2, 2}};
        
        /* Create and init device side src buffer 0 */
        cl_mem src_0_device_buffer;
        src_0_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_ulong8), NULL, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: could not create source buffer\n");
                exit(1);
        }        
        ret = clEnqueueWriteBuffer(command_queue, src_0_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_ulong8), src_0_host_buffer, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clEnqueueWriteBuffer' failed\n");
                exit(1);
        }

        /* Create and init host side src buffer 1 */
        cl_ulong8 *src_1_host_buffer;
        src_1_host_buffer = malloc(num_elem * sizeof(cl_ulong8));
        for (int i = 0; i < num_elem; i++)
                src_1_host_buffer[i] = (cl_ulong8){{2, 2, 2, 2, 2, 2, 2, 2}};
        
        /* Create and init device side src buffer 1 */
        cl_mem src_1_device_buffer;
        src_1_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_ulong8), NULL, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: could not create source buffer\n");
                exit(1);
        }        
        ret = clEnqueueWriteBuffer(command_queue, src_1_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_ulong8), src_1_host_buffer, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clEnqueueWriteBuffer' failed\n");
                exit(1);
        }

        /* Create host dst buffer */
        cl_ulong8 *dst_host_buffer;
        dst_host_buffer = malloc(num_elem * sizeof(cl_ulong8));
        memset((void *)dst_host_buffer, 1, num_elem * sizeof(cl_ulong8));

        /* Create device dst buffer */
        cl_mem dst_device_buffer;
        dst_device_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, num_elem *sizeof(cl_ulong8), NULL, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: could not create dst buffer\n");
                exit(1);
        }
        
        /* Set kernel arguments */
        ret = CL_SUCCESS;
        ret |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &src_0_device_buffer);
        ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &src_1_device_buffer);
        ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &dst_device_buffer);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clSetKernelArg' failed\n");
                exit(1);
        }

        /* Launch the kernel */
        size_t global_work_size = num_elem;
        size_t local_work_size = num_elem;
        ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clEnqueueNDRangeKernel' failed\n");
                exit(1);
        }

        /* Wait for it to finish */
        clFinish(command_queue);

        /* Read results from GPU */
        ret = clEnqueueReadBuffer(command_queue, dst_device_buffer, CL_TRUE,0, num_elem * sizeof(cl_ulong8), dst_host_buffer, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clEnqueueReadBuffer' failed\n");
                exit(1);
        }

        /* Dump dst buffer to file */
        char dump_file[100];
        sprintf((char *)&dump_file, "%s.result", argv[0]);
        write_buffer(dump_file, (const char *)dst_host_buffer, num_elem * sizeof(cl_ulong8));
        printf("Result dumped to %s\n", dump_file);
        /* Free host dst buffer */
        free(dst_host_buffer);

        /* Free device dst buffer */
        ret = clReleaseMemObject(dst_device_buffer);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseMemObject' failed\n");
                exit(1);
        }
        
        /* Free host side src buffer 0 */
        free(src_0_host_buffer);

        /* Free device side src buffer 0 */
        ret = clReleaseMemObject(src_0_device_buffer);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseMemObject' failed\n");
                exit(1);
        }

        /* Free host side src buffer 1 */
        free(src_1_host_buffer);

        /* Free device side src buffer 1 */
        ret = clReleaseMemObject(src_1_device_buffer);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseMemObject' failed\n");
                exit(1);
        }

        /* Release kernel */
        ret = clReleaseKernel(kernel);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseKernel' failed\n");
                exit(1);
        }

        /* Release program */
        ret = clReleaseProgram(program);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseProgram' failed\n");
                exit(1);
        }
        
        /* Release command queue */
        ret = clReleaseCommandQueue(command_queue);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseCommandQueue' failed\n");
                exit(1);
        }
        
        /* Release context */
        ret = clReleaseContext(context);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseContext' failed\n");
                exit(1);
        }
                
        return 0;
}
Beispiel #28
0
void initOpenCL(void) {
    
    /* get the platform(s) */
    cl_uint num_platforms;
    clError = clGetPlatformIDs( 0, NULL, &num_platforms );

    checkErr (clError, "clGetPlatformIDs( 0, NULL, &num_platforms );");
    if (num_platforms <= 0) {
        std::cerr << "No platform..." << std::endl;
        exit(1);
    }
    
    cl_platform_id* platforms = new cl_platform_id[num_platforms];
    clError = clGetPlatformIDs(num_platforms, platforms, NULL);
    checkErr(clError, "clGetPlatformIDs( num_platforms, &platforms, NULL );");
    platform_id = platforms[0];
    delete platforms;

    /* Connect to a compute device */
    err = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ALL, 1, &device, NULL);
    checkErr(err,"Failed to create a device group");

    /* Create a compute context */
    context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    checkErr(err,"Failed to create a compute context!");

    /* Create a command queue */
    commandQueue = clCreateCommandQueue(context, device, 0, &err);
    checkErr(err,"Failed to create a command commands!");

    /* Open Kernel file */
    std::string filename = "/home/ictp17/bagus/kernels.cl";
    std::ifstream kernelFile(filename.c_str(), std::ios::in);
    
    if (not kernelFile.is_open()) {
        std::cout << "Unable to open " << filename << ". " << std::endl;
        exit(1);
    }
    
    /*
     * Read the kernel file into an output stream.
     * Convert this into a char array for passing to OpenCL.
     */
    std::ostringstream outputStringStream;
    outputStringStream << kernelFile.rdbuf();
    std::string srcStdStr = outputStringStream.str();
    const char* charSource = srcStdStr.c_str();
    
    kernelFile.close();
    /* Create the compute program from the source buffer */
    program = clCreateProgramWithSource(context, 1, 
               (const char **) &charSource, NULL, &err);
    if (not program) {
        std::cerr << "Error: Failed to create compute program!" << std::endl;
        exit(1);
    }
    
    /* Build the program executable */
    const char* flags = "";


    err = clBuildProgram(program, 0, NULL, flags, NULL, NULL);
    if (err != CL_SUCCESS) {
        size_t len;
        char buffer[1024];
    
        std::cerr << "Error: Failed to build program executable!" 
                  << std::endl;
        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 
                sizeof(buffer), buffer, &len);
        std::cerr << buffer << std::endl;
        exit(1);
    }

    /* Create Kernels objects for all the kernels in the OpenCL program */
    cl_uint num_kernels;
    kernel = clCreateKernel(program, "vector_sum", &err);
    if (err != CL_SUCCESS) {
        size_t len;
        char buffer[1024];
    
        std::cout << "Error: Failed to create kernels in program!" 
                  << std::endl;
        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 
                sizeof(buffer), buffer, &len);
        std::cout << buffer << std::endl;
        exit(1);
    }
}
Beispiel #29
0
short opencl_load_kernel(struct openclenv* env, const char *bitcode_path, const char *kernname, size_t index)
{
	char *err_msg = NULL;
	int err = 0, log_error;
	size_t pathlen = strlen(bitcode_path);
	char *build_log;
	size_t build_log_size;
	char build_opts[MAX_BUILD_OPTS_LENGTH];


#ifdef __APPLE__
	env->program = clCreateProgramWithSource(env->context, 1, (const char **)&bitcode_path, &pathlen, &err);
#else
	size_t code_length;
	char *code_contents = _read_file(bitcode_path, &code_length);

	if (!code_contents) {
		err_msg = strerror(errno);
		err = errno;
		goto error;
	}

	env->program = clCreateProgramWithSource(env->context, 1 , (const char **)&code_contents, &code_length, &err);

	free(code_contents);
#endif


	if (err) {
		err_msg = "clCreateProgramWithSource";
		goto error;
	}

	_opencl_generate_build_opts(build_opts, MAX_BUILD_OPTS_LENGTH);

	stablecl_log(log_message, "Building kernel %s from %s...", kernname, bitcode_path);
	stablecl_log(log_message, "Build options: %s", build_opts);

	err = clBuildProgram(env->program, 1, &env->device, build_opts, NULL, NULL);

#ifndef SIMULATOR_BUILD
	log_error = clGetProgramBuildInfo(env->program, env->device, CL_PROGRAM_BUILD_LOG, 0, NULL, &build_log_size);

	if (log_error)
		stablecl_log(log_err, "Error retrieving build log size.");
	else {
		build_log = malloc(build_log_size);

		if (!build_log)
			stablecl_log(log_err, "Couldn't allocate enough memory for build log.");
		else {
			log_error = clGetProgramBuildInfo(env->program, env->device, CL_PROGRAM_BUILD_LOG, build_log_size, build_log, &build_log_size);

			if (log_error)
				stablecl_log(log_err, "Couldn't get build log: %s", opencl_strerr(log_err));
			else if (err)
				stablecl_log(log_err, "Build log (size %zu):\n%s", build_log_size, build_log);

			free(build_log);
		}
	}

#endif

	if (err) {
		err_msg = "clBuildProgram";
		goto error;
	}

	env->kernel[index] = clCreateKernel(env->program, kernname, &err);

	if (err) {
		err_msg = "clCreateKernel";
		goto error;
	}

	_opencl_kernel_info(env, env->kernel[index]);

	env->enabled_kernels[index] = 1;
	env->kernel_count++;

	stablecl_log(log_message, "Kernel %s loaded successfully", kernname);

error:

	if (err && err_msg)
		stablecl_log(log_err, "Kernel load failed with error %d at %s: %s", err, err_msg, opencl_strerr(err));

	return err;
}
Beispiel #30
0
static ConvolveInfo *GetConvolveInfo(const Image *image,const char *name,
  const char *source,ExceptionInfo *exception)
{
  char
    options[MaxTextExtent];

  cl_context_properties
    context_properties[3];

  cl_int
    status;

  cl_platform_id
    platforms[1];

  cl_uint
    number_platforms;

  ConvolveInfo
    *convolve_info;

  size_t
    length,
    lengths[] = { strlen(source) };

  /*
    Create OpenCL info.
  */
  convolve_info=(ConvolveInfo *) AcquireMagickMemory(sizeof(*convolve_info));
  if (convolve_info == (ConvolveInfo *) NULL)
    {
      (void) ThrowMagickException(exception,GetMagickModule(),
        ResourceLimitError,"MemoryAllocationFailed","`%s'",image->filename);
      return((ConvolveInfo *) NULL);
    }
  (void) ResetMagickMemory(convolve_info,0,sizeof(*convolve_info));
  /*
    Create OpenCL context.
  */
  status=clGetPlatformIDs(0,(cl_platform_id *) NULL,&number_platforms);
  if ((status == CL_SUCCESS) && (number_platforms > 0))
    status=clGetPlatformIDs(1,platforms,NULL);
  if (status != CL_SUCCESS)
    {
      (void) ThrowMagickException(exception,GetMagickModule(),DelegateWarning,
        "failed to create OpenCL context","`%s' (%d)",image->filename,status);
      convolve_info=DestroyConvolveInfo(convolve_info);
      return((ConvolveInfo *) NULL);
    }
  context_properties[0]=CL_CONTEXT_PLATFORM;
  context_properties[1]=(cl_context_properties) platforms[0];
  context_properties[2]=0;
  convolve_info->context=clCreateContextFromType(context_properties,
    (cl_device_type) CL_DEVICE_TYPE_GPU,ConvolveNotify,exception,&status);
  if ((convolve_info->context == (cl_context) NULL) || (status != CL_SUCCESS))
    convolve_info->context=clCreateContextFromType(context_properties,
      (cl_device_type) CL_DEVICE_TYPE_CPU,ConvolveNotify,exception,&status);
  if ((convolve_info->context == (cl_context) NULL) || (status != CL_SUCCESS))
    convolve_info->context=clCreateContextFromType(context_properties,
      (cl_device_type) CL_DEVICE_TYPE_DEFAULT,ConvolveNotify,exception,&status);
  if ((convolve_info->context == (cl_context) NULL) || (status != CL_SUCCESS))
    {
      (void) ThrowMagickException(exception,GetMagickModule(),DelegateWarning,
        "failed to create OpenCL context","`%s' (%d)",image->filename,status);
      convolve_info=DestroyConvolveInfo(convolve_info);
      return((ConvolveInfo *) NULL);
    }
  /*
    Detect OpenCL devices.
  */
  status=clGetContextInfo(convolve_info->context,CL_CONTEXT_DEVICES,0,NULL,
    &length);
  if ((status != CL_SUCCESS) || (length == 0))
    {
      convolve_info=DestroyConvolveInfo(convolve_info);
      return((ConvolveInfo *) NULL);
    }
  convolve_info->devices=(cl_device_id *) AcquireMagickMemory(length);
  if (convolve_info->devices == (cl_device_id *) NULL)
    {
      (void) ThrowMagickException(exception,GetMagickModule(),
        ResourceLimitError,"MemoryAllocationFailed","`%s'",image->filename);
      convolve_info=DestroyConvolveInfo(convolve_info);
      return((ConvolveInfo *) NULL);
    }
  status=clGetContextInfo(convolve_info->context,CL_CONTEXT_DEVICES,length,
    convolve_info->devices,NULL);
  if (status != CL_SUCCESS)
    {
      convolve_info=DestroyConvolveInfo(convolve_info);
      return((ConvolveInfo *) NULL);
    }
  if (image->debug != MagickFalse)
    {
      char
        attribute[MaxTextExtent];

      size_t
        length;

      clGetDeviceInfo(convolve_info->devices[0],CL_DEVICE_NAME,
        sizeof(attribute),attribute,&length);
      (void) LogMagickEvent(AccelerateEvent,GetMagickModule(),"Name: %s",
        attribute);
      clGetDeviceInfo(convolve_info->devices[0],CL_DEVICE_VENDOR,
        sizeof(attribute),attribute,&length);
      (void) LogMagickEvent(AccelerateEvent,GetMagickModule(),"Vendor: %s",
        attribute);
      clGetDeviceInfo(convolve_info->devices[0],CL_DEVICE_VERSION,
        sizeof(attribute),attribute,&length);
      (void) LogMagickEvent(AccelerateEvent,GetMagickModule(),
        "Driver Version: %s",attribute);
      clGetDeviceInfo(convolve_info->devices[0],CL_DEVICE_PROFILE,
        sizeof(attribute),attribute,&length);
      (void) LogMagickEvent(AccelerateEvent,GetMagickModule(),"Profile: %s",
        attribute);
      clGetDeviceInfo(convolve_info->devices[0],CL_DRIVER_VERSION,
        sizeof(attribute),attribute,&length);
      (void) LogMagickEvent(AccelerateEvent,GetMagickModule(),"Driver: %s",
        attribute);
      clGetDeviceInfo(convolve_info->devices[0],CL_DEVICE_EXTENSIONS,
        sizeof(attribute),attribute,&length);
      (void) LogMagickEvent(AccelerateEvent,GetMagickModule(),"Extensions: %s",
        attribute);
    }
  /*
    Create OpenCL command queue.
  */
  convolve_info->command_queue=clCreateCommandQueue(convolve_info->context,
    convolve_info->devices[0],0,&status);
  if ((convolve_info->command_queue == (cl_command_queue) NULL) ||
      (status != CL_SUCCESS))
    {
      convolve_info=DestroyConvolveInfo(convolve_info);
      return((ConvolveInfo *) NULL);
    }
  /*
    Build OpenCL program.
  */
  convolve_info->program=clCreateProgramWithSource(convolve_info->context,1,
    &source,lengths,&status);
  if ((convolve_info->program == (cl_program) NULL) || (status != CL_SUCCESS))
    {
      convolve_info=DestroyConvolveInfo(convolve_info);
      return((ConvolveInfo *) NULL);
    }
  (void) FormatLocaleString(options,MaxTextExtent,CLOptions,(float)
    QuantumRange,MagickEpsilon);
  status=clBuildProgram(convolve_info->program,1,convolve_info->devices,options,
    NULL,NULL);
  if ((convolve_info->program == (cl_program) NULL) || (status != CL_SUCCESS))
    {
      char
        *log;

      status=clGetProgramBuildInfo(convolve_info->program,
        convolve_info->devices[0],CL_PROGRAM_BUILD_LOG,0,NULL,&length);
      log=(char *) AcquireMagickMemory(length);
      if (log == (char *) NULL)
        {
          convolve_info=DestroyConvolveInfo(convolve_info);
          return((ConvolveInfo *) NULL);
        }
      status=clGetProgramBuildInfo(convolve_info->program,
        convolve_info->devices[0],CL_PROGRAM_BUILD_LOG,length,log,&length);
      (void) ThrowMagickException(exception,GetMagickModule(),DelegateWarning,
        "failed to build OpenCL program","`%s' (%s)",image->filename,log);
      log=DestroyString(log);
      convolve_info=DestroyConvolveInfo(convolve_info);
      return((ConvolveInfo *) NULL);
    }
  /*
    Get a kernel object.
  */
  convolve_info->kernel=clCreateKernel(convolve_info->program,name,&status);
  if ((convolve_info->kernel == (cl_kernel) NULL) || (status != CL_SUCCESS))
    {
      convolve_info=DestroyConvolveInfo(convolve_info);
      return((ConvolveInfo *) NULL);
    }
  return(convolve_info);
}