int main (){
	cl_platform_id clPlatform;
	cl_device_id clDevice;
	cl_context clContext;
	cl_command_queue clQueue;
	cl_program clProgram;
	int isMic=0;
	cl_uint numDevices;
	cl_platform_id platform;
	clGetPlatformIDs(1, &platform, NULL);
	cl_int err;
	err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
	//Check for MIC if GPU is not found
	if (err != CL_SUCCESS) {
		err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ACCELERATOR, 0, NULL, &numDevices);
		isMic = 1;
	}
	if (err != CL_SUCCESS) {
		fprintf(stderr, "[ERROR in OpenCLDriver::HI_get_num_devices()] Failed to get device IDs  for type \n");
	}
	
	
	cl_device_id devices[numDevices];
	clGetPlatformIDs(1, &clPlatform, NULL);
	if(isMic)
		clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_ACCELERATOR, numDevices, devices, NULL);
	else
		clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, numDevices, devices, NULL);
	
	for(int i=0; i< numDevices; i++) {
		clDevice = devices[i];
		
		FILE *fp;
		char *source_str;
		size_t source_size;
		char filename[] = "openarc_kernel.cl";
		fp = fopen(filename, "r");
		if (!fp) {
			fprintf(stderr, "[INFO: in OpenCL binary creation] Failed to read the kernel file %s, so skipping binary generation for OpenCL devices %d\n", filename, i);
			break;
		}
		source_str = (char*)malloc(MAX_SOURCE_SIZE);
		source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
		fclose( fp );

		cl_int err;
		clContext = clCreateContext( NULL, 1, &clDevice, NULL, NULL, &err);
		if(err != CL_SUCCESS) {
				fprintf(stderr, "[ERROR in OpenCL binary creation] failed to create OPENCL context with error %d (OPENCL GPU)\n", err);
		}

		clQueue = clCreateCommandQueue(clContext, clDevice, 0, &err);
		if(err != CL_SUCCESS) {
				fprintf(stderr, "[ERROR in OpenCL binary creation] failed to create OPENCL queue with error %d (OPENCL GPU)\n", err);
		}
		
		char cBuffer[1024];
		char *cBufferN;
		clGetDeviceInfo(clDevice, CL_DEVICE_NAME, sizeof(cBuffer), &cBuffer, NULL);
		cBufferN = deblank(cBuffer);
		
		std::string binaryName = std::string("openarc_kernel_") + cBufferN + std::string(".ptx");
		
		clProgram = clCreateProgramWithSource(clContext, 1, (const char **)&source_str, (const size_t *)&source_size, &err);
		if(err != CL_SUCCESS) {
				fprintf(stderr, "[ERROR in OpenCL binary creation] failed to create OPENCL program with error %d (OPENCL GPU)\n", err);
		}
		
		char *envVar;
		envVar = getenv("OPENARC_JITOPTION");
		err = clBuildProgram(clProgram, 1, &clDevice, envVar, NULL, NULL);
#if PRINT_LOG == 0
		if(err != CL_SUCCESS)
		{
				printf("[ERROR in OpenCL binary creation] Error in clBuildProgram, Line %u in file %s : %d!!!\n\n", __LINE__, __FILE__, err);
				if (err == CL_BUILD_PROGRAM_FAILURE)
				{
						// Determine the size of the log
						size_t log_size;
						clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);

						// Allocate memory for the log
						char *log = (char *) malloc(log_size);

						// Get the log
						clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);

						// Print the log
						printf("%s\n", log);
				}
				exit(1);
		}
#else
		// Determine the size of the log
		size_t log_size;
		clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);

		// Allocate memory for the log
		char *log = (char *) malloc(log_size);

		// Get the log
		clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);

		// Print the log
		printf("%s\n", log);
#endif
		
		
		size_t size;
		err = clGetProgramInfo( clProgram, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL );
		if(err != CL_SUCCESS) {
				fprintf(stderr, "[ERROR in OpenCL binary creation] failed to get OPENCL program info error %d (OPENCL GPU)\n", err);
		}

		unsigned char * binary = new unsigned char [size];
		
		//#ifdef NVIDIA_GPU
		//err = clGetProgramInfo( clProgram, CL_PROGRAM_BINARIES, size, &binary, NULL );
		//#else
		err = clGetProgramInfo(clProgram, CL_PROGRAM_BINARIES, sizeof(unsigned char *), &binary, NULL);
		//#endif
		
		if(err != CL_SUCCESS) {
				fprintf(stderr, "[ERROR in OpenCL binary creation] failed to dump OPENCL program binary error %d (OPENCL GPU)\n", err);
		}
		
		FILE * fpbin = fopen(binaryName.c_str(), "wb" );
		fwrite(binary, 1 , size, fpbin);
		fclose(fpbin);
		delete[] binary;
	}	
	
	#ifdef NVIDIA_GPU
	//Generate ptx files for .cu, only if nvcc is found on the system
	if (system("which nvcc")==0){
		CUresult err;
		int major, minor;
		CUdevice cuDevice;
		CUcontext cuContext;
		CUmodule cuModule;
		int numDevices;
		cudaGetDeviceCount(&numDevices);
		
		for(int i=0 ; i < numDevices; i++) {
			cuDeviceGet(&cuDevice, i);
			#if CUDA_VERSION >= 5000
			cuDeviceGetAttribute (&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice);
			cuDeviceGetAttribute (&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice);
			#else
				cuDeviceComputeCapability(&major, &minor, cuDevice);
			#endif

			std::stringstream ss;
			ss << major;
			ss << minor;
			std::string version = ss.str();
			std::string ptxName = std::string("openarc_kernel_") + version + std::string(".ptx");
			std::string command = std::string("nvcc $OPENARC_JITOPTION -arch=sm_") + version + std::string(" openarc_kernel.cu -ptx -o ") + ptxName;
			system(command.c_str());
		}
	}
	#endif

}
Exemplo n.º 2
0
int main(){
    cl_platform_id *platforms;
    cl_uint platforms_n;
    clGetPlatformIDs(0, NULL, &platforms_n);
    platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id)*platforms_n);
    clGetPlatformIDs(platforms_n, platforms, &platforms_n);
    printf("There are %d platforms\n", platforms_n);

    int i = 0;
    char re[1024];

    for(i=0; i<platforms_n; i++){
        printf("Platform: %d\n", i);

        clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 1024, re, NULL);
        printf("CL_PLATFORM_VENDOR: %s\n", re);

        clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 1024, re, NULL);
        printf("CL_PLATFORM_NAME: %s\n", re);

        clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, 1024, re, NULL);
        printf("CL_PLATFORM_VERSION: %s\n", re);

        clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, 1024, re, NULL);
        printf("CL_PLATFORM_PROFILE: %s\n", re);

        clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, 1024, re, NULL);
        printf("CL_PLATFORM_EXTENSIONS: %s\n", re);

        cl_device_id *devices;
        cl_uint devices_n;
        clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &devices_n);
        cl_uint uint;
        cl_ulong ulong;
        devices = (cl_device_id*)malloc(sizeof(cl_device_id)*devices_n);
        clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, devices_n, devices, &devices_n);

        for(int j = 0; j < devices_n; j++ ){
            char string[1024];
            printf("\tPlatform: %d, devices: %d\n", i, j);

            clGetDeviceInfo(devices[j], CL_DEVICE_NAME, 1024, string, NULL);
            printf("\tCL_DEVICE_NAME: %s\n", string);

            clGetDeviceInfo(devices[j], CL_DEVICE_VENDOR, 1024, string, NULL);
            printf("\tCL_DEVICE_VENDOR: %s\n", string);

            clGetDeviceInfo(devices[j], CL_DEVICE_EXTENSIONS, 1024, string, NULL);
            printf("\tCL_DEVICE_EXTENSIONS: %s\n", string);

            clGetDeviceInfo(devices[j], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(uint), &uint, NULL);
            printf("\tCL_DEVICE_MAX_COMPUTE_UNITS: %d\n", uint);

            clGetDeviceInfo(devices[j], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(uint), &uint, NULL);
            printf("\tCL_DEVICE_MAX_CLOCK_FREQUENCY: %d\n", uint);

            clGetDeviceInfo(devices[j], CL_DEVICE_LOCAL_MEM_SIZE, sizeof(ulong), &ulong, NULL);
            printf("\tCL_Device_LOCAL_MEM_SIZE: %lu\n", ulong);

            clGetDeviceInfo(devices[j], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(ulong), &ulong, NULL);
            printf("\tCL_DEVICE_GLOBAL_MEM_SIZE: %lu\n", ulong);
        }
        free(devices);

    }


    free(platforms);

    return EXIT_SUCCESS;
}
Exemplo n.º 3
0
	OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, bool background_)
	: OpenCLDeviceBase(info, stats, background_)
	{
		background = background_;

		/* Initialize cl_mem variables. */
		kgbuffer = NULL;
		sd = NULL;
		sd_DL_shadow = NULL;

		rng_coop = NULL;
		throughput_coop = NULL;
		L_transparent_coop = NULL;
		PathRadiance_coop = NULL;
		Ray_coop = NULL;
		PathState_coop = NULL;
		Intersection_coop = NULL;
		ray_state = NULL;

		AOAlpha_coop = NULL;
		AOBSDF_coop = NULL;
		AOLightRay_coop = NULL;
		BSDFEval_coop = NULL;
		ISLamp_coop = NULL;
		LightRay_coop = NULL;
		Intersection_coop_shadow = NULL;

#ifdef WITH_CYCLES_DEBUG
		debugdata_coop = NULL;
#endif

		work_array = NULL;

		/* Queue. */
		Queue_data = NULL;
		Queue_index = NULL;
		use_queues_flag = NULL;

		per_sample_output_buffers = NULL;

		per_thread_output_buffer_size = 0;
		hostRayStateArray = NULL;
		PathIteration_times = PATH_ITER_INC_FACTOR;
#ifdef __WORK_STEALING__
		work_pool_wgs = NULL;
		max_work_groups = 0;
#endif
		current_max_closure = -1;
		first_tile = true;

		/* Get device's maximum memory that can be allocated. */
		ciErr = clGetDeviceInfo(cdDevice,
		                        CL_DEVICE_MAX_MEM_ALLOC_SIZE,
		                        sizeof(size_t),
		                        &total_allocatable_memory,
		                        NULL);
		assert(ciErr == CL_SUCCESS);
		if(platform_name == "AMD Accelerated Parallel Processing") {
			/* This value is tweak-able; AMD platform does not seem to
			 * give maximum performance when all of CL_DEVICE_MAX_MEM_ALLOC_SIZE
			 * is considered for further computation.
			 */
			total_allocatable_memory /= 2;
		}
	}
Exemplo n.º 4
0
void WorkScheduler::initialize(bool use_opencl, int num_cpu_threads)
{
	/* initialize highlighting */
	if (!g_highlightInitialized) {
		if (g_highlightedNodesRead) MEM_freeN(g_highlightedNodesRead);
		if (g_highlightedNodes)     MEM_freeN(g_highlightedNodes);

		g_highlightedNodesRead = NULL;
		g_highlightedNodes = NULL;

		COM_startReadHighlights();

		g_highlightInitialized = true;
	}

#if COM_CURRENT_THREADING_MODEL == COM_TM_QUEUE
	/* deinitialize if number of threads doesn't match */
	if (g_cpudevices.size() != num_cpu_threads) {
		Device *device;

		while (g_cpudevices.size() > 0) {
			device = g_cpudevices.back();
			g_cpudevices.pop_back();
			device->deinitialize();
			delete device;
		}

		g_cpuInitialized = false;
	}

	/* initialize CPU threads */
	if (!g_cpuInitialized) {
		for (int index = 0; index < num_cpu_threads; index++) {
			CPUDevice *device = new CPUDevice();
			device->initialize();
			g_cpudevices.push_back(device);
		}

		g_cpuInitialized = true;
	}

#ifdef COM_OPENCL_ENABLED
	/* deinitialize OpenCL GPU's */
	if (use_opencl && !g_openclInitialized) {
		g_context = NULL;
		g_program = NULL;

		if (!OCL_init()) /* this will check for errors and skip if already initialized */
			return;

		if (clCreateContextFromType) {
			cl_uint numberOfPlatforms = 0;
			cl_int error;
			error = clGetPlatformIDs(0, 0, &numberOfPlatforms);
			if (error == -1001) { }   /* GPU not supported */
			else if (error != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error));  }
			if (G.f & G_DEBUG) printf("%d number of platforms\n", numberOfPlatforms);
			cl_platform_id *platforms = (cl_platform_id *)MEM_mallocN(sizeof(cl_platform_id) * numberOfPlatforms, __func__);
			error = clGetPlatformIDs(numberOfPlatforms, platforms, 0);
			unsigned int indexPlatform;
			for (indexPlatform = 0; indexPlatform < numberOfPlatforms; indexPlatform++) {
				cl_platform_id platform = platforms[indexPlatform];
				cl_uint numberOfDevices = 0;
				clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, 0, &numberOfDevices);
				if (numberOfDevices <= 0)
					continue;

				cl_device_id *cldevices = (cl_device_id *)MEM_mallocN(sizeof(cl_device_id) * numberOfDevices, __func__);
				clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numberOfDevices, cldevices, 0);

				g_context = clCreateContext(NULL, numberOfDevices, cldevices, clContextError, NULL, &error);
				if (error != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error));  }
				const char *cl_str[2] = {datatoc_COM_OpenCLKernels_cl, NULL};
				g_program = clCreateProgramWithSource(g_context, 1, cl_str, 0, &error);
				error = clBuildProgram(g_program, numberOfDevices, cldevices, 0, 0, 0);
				if (error != CL_SUCCESS) {
					cl_int error2;
					size_t ret_val_size = 0;
					printf("CLERROR[%d]: %s\n", error, clewErrorString(error));
					error2 = clGetProgramBuildInfo(g_program, cldevices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
					if (error2 != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error)); }
					char *build_log = (char *)MEM_mallocN(sizeof(char) * ret_val_size + 1, __func__);
					error2 = clGetProgramBuildInfo(g_program, cldevices[0], CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
					if (error2 != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error)); }
					build_log[ret_val_size] = '\0';
					printf("%s", build_log);
					MEM_freeN(build_log);
				}
				else {
					unsigned int indexDevices;
					for (indexDevices = 0; indexDevices < numberOfDevices; indexDevices++) {
						cl_device_id device = cldevices[indexDevices];
						cl_int vendorID = 0;
						cl_int error2 = clGetDeviceInfo(device, CL_DEVICE_VENDOR_ID, sizeof(cl_int), &vendorID, NULL);
						if (error2 != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error2, clewErrorString(error2)); }
						OpenCLDevice *clDevice = new OpenCLDevice(g_context, device, g_program, vendorID);
						clDevice->initialize();
						g_gpudevices.push_back(clDevice);
					}
				}
				MEM_freeN(cldevices);
			}
			MEM_freeN(platforms);
		}

		g_openclInitialized = true;
	}
#endif
#endif
}
Exemplo n.º 5
0
void opencl_info() {
  cl_int           err_code;

  cl_platform_id  *platforms;
  cl_device_type   device_type;
  cl_uint          num_devices;
  cl_device_id    *devices;

  // Get OpenCL platforms
  // - Get the number of available platforms
  cl_uint num_platforms;
  err_code = clGetPlatformIDs(0, NULL, &num_platforms);
  clu_CheckError(err_code, "clGetPlatformIDs() for num_platforms");
  if (num_platforms == 0) {
    fprintf(stderr, "No OpenCL platform!\n");
    exit(EXIT_FAILURE);
  }
  // - Get platform IDs
  platforms = (cl_platform_id *)malloc(num_platforms*sizeof(cl_platform_id));
  err_code = clGetPlatformIDs(num_platforms, platforms, NULL);
  clu_CheckError(err_code, "clGetPlatformIDs()");

  // Get platform informations
  printf("\nNumber of platforms: %u\n\n", num_platforms);
  char tmp_buf[1024];
  for (cl_uint i = 0; i < num_platforms; i++) {
    printf("platform: %u\n", i);

    err_code = clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 1024,
                                 &tmp_buf, NULL);
    clu_CheckError(err_code, "clGetPlatformInfo() for CL_PLATFORM_NAME");
    printf("- CL_PLATFORM_NAME      : %s\n", tmp_buf);

    err_code = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 1024,
                                 &tmp_buf, NULL);
    clu_CheckError(err_code, "clGetPlatformInfo() for CL_PLATFORM_VENDOR");
    printf("- CL_PLATFORM_VENDOR    : %s\n", tmp_buf);

    err_code = clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, 1024,
                                 &tmp_buf, NULL);
    clu_CheckError(err_code, "clGetPlatformInfo() for CL_PLATFORM_PROFILE");
    printf("- CL_PLATFORM_PROFILE   : %s\n", tmp_buf);

    err_code = clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, 1024,
                                 &tmp_buf, NULL);
    clu_CheckError(err_code, "clGetPlatformInfo() for CL_PLATFORM_VERSION");
    printf("- CL_PLATFORM_VERSION   : %s\n", tmp_buf);

    err_code = clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, 1024,
                                 &tmp_buf, NULL);
    clu_CheckError(err_code,"clGetPlatformInfo() for CL_PLATFORM_EXTENSIONS");
    printf("- CL_PLATFORM_EXTENSIONS: %s\n", tmp_buf);
    printf("\n");


    // Get the number of devices
    err_code = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL,
                              &num_devices);
    clu_CheckError(err_code, "clGetDeviceIDs for num_devices");
    if (num_devices == 0) {
      fprintf(stderr, "No OpenCL device in this platform!\n");
      exit(EXIT_FAILURE);
    }
    printf("Number of devices: %u\n", num_devices);

    // Get the default device
    cl_device_id default_device;
    cl_uint num_defaults;
    err_code = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_DEFAULT, 
                              1, &default_device, &num_defaults);
    clu_CheckError(err_code, "clGetDeviceIDs() for CL_DEVICE_TYPE_DEFAULT");
    if (num_defaults != 1) {
      printf("- # of default devices: %u\n", num_defaults);
    }

    // Get device IDs
    devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
    err_code = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, num_devices,
                              devices, NULL);
    clu_CheckError(err_code, "clGetDeviceIDs()");
    for (cl_uint k = 0; k < num_devices; k++) {
      printf("device: %u (", k);
      err_code = clGetDeviceInfo(devices[k], CL_DEVICE_TYPE, 
                                 sizeof(cl_device_type), &device_type, NULL);
      if (device_type & CL_DEVICE_TYPE_CPU)
        printf("CL_DEVICE_TYPE_CPU");
      if (device_type & CL_DEVICE_TYPE_GPU)
        printf("CL_DEVICE_TYPE_GPU");
      if (device_type & CL_DEVICE_TYPE_ACCELERATOR)
        printf("CL_DEVICE_TYPE_ACCELERATOR");
      if (device_type & CL_DEVICE_TYPE_DEFAULT)
        printf("CL_DEVICE_TYPE_DEFAULT");
      printf(")");
      if (default_device == devices[k]) printf(" default");
      printf("\n");

      err_code = clGetDeviceInfo(devices[k], CL_DEVICE_NAME,
                                 1024, tmp_buf, NULL);
      printf(" - CL_DEVICE_NAME                         : %s\n", tmp_buf);

      err_code = clGetDeviceInfo(devices[k], CL_DEVICE_VENDOR,
                                 1024, tmp_buf, NULL);
      printf(" - CL_DEVICE_VENDOR                       : %s\n", tmp_buf);

      err_code = clGetDeviceInfo(devices[k], CL_DRIVER_VERSION,
                                 1024, tmp_buf, NULL);
      printf(" - CL_DRIVER_VERSION                      : %s\n", tmp_buf);

      err_code = clGetDeviceInfo(devices[k], CL_DEVICE_PROFILE,
                                 1024, tmp_buf, NULL);
      printf(" - CL_DEVICE_PROFILE                      : %s\n", tmp_buf);

      err_code = clGetDeviceInfo(devices[k], CL_DEVICE_VERSION,
                                 1024, tmp_buf, NULL);
      printf(" - CL_DEVICE_VERSION                      : %s\n", tmp_buf);

      err_code = clGetDeviceInfo(devices[k], CL_DEVICE_EXTENSIONS,
                                 1024, tmp_buf, NULL);

	  //CL_DEVICE_MAX_COMPUTE_UNITS
	  //CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS
	  //CL_DEVICE_MAX_WORK_GROUP_SIZE
	  //CL_DEVICE_MAX_WORK_ITEM_SIZES
	  //
	  cl_uint usize;
      err_code = clGetDeviceInfo(devices[k], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(usize), &usize, NULL);
      printf(" - CL_DEVICE_MAX_COMPUTE_UNITS                      : %d\n", usize);

      err_code = clGetDeviceInfo(devices[k], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,  sizeof(usize), &usize, NULL);
      printf(" - CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS                      : %d\n", usize);

	  size_t size;
      err_code = clGetDeviceInfo(devices[k], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size), &size, NULL);
      printf(" - CL_DEVICE_MAX_WORK_GROUP_SIZE                      : %d\n",size);

      err_code = clGetDeviceInfo(devices[k], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size), &size, NULL);
      printf(" - CL_DEVICE_MAX_WORK_ITEM_SIZES                      : %d\n", size);
      printf("\n");
    }
    free(devices);

    printf("\n");
  }

  free(platforms);
}
Exemplo n.º 6
0
int OpenCLDevice::getMaxWorkItemDimensions() {
	cl_uint value;
	check_error(clGetDeviceInfo(my_id, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, (sizeof(cl_uint)), &value, NULL));
	return value;
}
Exemplo n.º 7
0
int main() {

   /* Host/device data structures */
   cl_device_id device;
   cl_context context;
   cl_command_queue queue;
   cl_program program;
   cl_kernel init_kernel, stage_kernel, scale_kernel;
   cl_int err, i;
   size_t global_size, local_size;
   cl_ulong local_mem_size;

   /* Data and buffer */
   int direction;
   unsigned int num_points, points_per_group, stage;
   float data[NUM_POINTS*2];
   double error, check_input[NUM_POINTS][2], check_output[NUM_POINTS][2];
   cl_mem data_buffer;

   /* Initialize data */
   srand((unsigned int)time(0));
   for(i=0; i<NUM_POINTS; i++) {
      data[2*i] = (float)rand();
      data[2*i+1] = (float)rand();
      check_input[i][0] = (float)data[2*i];
      check_input[i][1] = (float)data[2*i+1];
   }

   /* Create a device and context */
   device = create_device();
   context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
   if(err < 0) {
      perror("Couldn't create a context");
      exit(1);   
   }

   /* Build the program */
   program = build_program(context, device, PROGRAM_FILE);

   /* Create kernels for the FFT */
   init_kernel = clCreateKernel(program, INIT_FUNC, &err);
   if(err < 0) {
      printf("Couldn't create the initial kernel: %d", err);
      exit(1);
   };
   stage_kernel = clCreateKernel(program, STAGE_FUNC, &err);
   if(err < 0) {
      printf("Couldn't create the stage kernel: %d", err);
      exit(1);
   };
   scale_kernel = clCreateKernel(program, SCALE_FUNC, &err);
   if(err < 0) {
      printf("Couldn't create the scale kernel: %d", err);
      exit(1);
   };

   /* Create buffer */
   data_buffer = clCreateBuffer(context, 
         CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, 
         2*NUM_POINTS*sizeof(float), data, &err);
   if(err < 0) {
      perror("Couldn't create a buffer");
      exit(1);
   };

   /* Determine maximum work-group size */
   err = clGetKernelWorkGroupInfo(init_kernel, device, 
      CL_KERNEL_WORK_GROUP_SIZE, sizeof(local_size), &local_size, NULL);
   if(err < 0) {
      perror("Couldn't find the maximum work-group size");
      exit(1);   
   };

   /* Determine local memory size */
   err = clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE, 
      sizeof(local_mem_size), &local_mem_size, NULL);
   if(err < 0) {
      perror("Couldn't determine the local memory size");
      exit(1);   
   };

   /* Initialize kernel arguments */
   direction = DIRECTION;
   num_points = NUM_POINTS;
   points_per_group = (unsigned int)(local_mem_size/(2*sizeof(float)));
   if(points_per_group > num_points)
      points_per_group = num_points;

   /* Set kernel arguments */
   err = clSetKernelArg(init_kernel, 0, sizeof(cl_mem), &data_buffer);
   err |= clSetKernelArg(init_kernel, 1, (size_t)local_mem_size, NULL);
   err |= clSetKernelArg(init_kernel, 2, sizeof(points_per_group), &points_per_group);
   err |= clSetKernelArg(init_kernel, 3, sizeof(num_points), &num_points);
   err |= clSetKernelArg(init_kernel, 4, sizeof(direction), &direction);
   if(err < 0) {
      printf("Couldn't set a kernel argument");
      exit(1);   
   };

   /* Create a command queue */
   queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);
   if(err < 0) {
      perror("Couldn't create a command queue");
      exit(1);   
   };

   /* Enqueue initial kernel */
   global_size = (num_points/points_per_group)*local_size;
   err = clEnqueueNDRangeKernel(queue, init_kernel, 1, NULL, &global_size, 
                                &local_size, 0, NULL, NULL); 
   if(err < 0) {
      perror("Couldn't enqueue the initial kernel");
      exit(1);
   }

   /* Enqueue further stages of the FFT */
   if(num_points > points_per_group) {

      err = clSetKernelArg(stage_kernel, 0, sizeof(cl_mem), &data_buffer);
      err |= clSetKernelArg(stage_kernel, 2, sizeof(points_per_group), &points_per_group);
      err |= clSetKernelArg(stage_kernel, 3, sizeof(direction), &direction);
      if(err < 0) {
         printf("Couldn't set a kernel argument");
         exit(1);   
      };
      for(stage = 2; stage <= num_points/points_per_group; stage <<= 1) {
         clSetKernelArg(stage_kernel, 1, sizeof(stage), &stage);
         err = clEnqueueNDRangeKernel(queue, stage_kernel, 1, NULL, &global_size, 
                                      &local_size, 0, NULL, NULL); 
         if(err < 0) {
            perror("Couldn't enqueue the stage kernel");
            exit(1);
         }
      }
   }

   /* Scale values if performing the inverse FFT */
   if(direction < 0) {
      err = clSetKernelArg(scale_kernel, 0, sizeof(cl_mem), &data_buffer);
      err |= clSetKernelArg(scale_kernel, 1, sizeof(points_per_group), &points_per_group);
      err |= clSetKernelArg(scale_kernel, 2, sizeof(num_points), &num_points);
      if(err < 0) {
         printf("Couldn't set a kernel argument");
         exit(1);   
      };
      err = clEnqueueNDRangeKernel(queue, scale_kernel, 1, NULL, &global_size, 
                                   &local_size, 0, NULL, NULL); 
      if(err < 0) {
         perror("Couldn't enqueue the initial kernel");
         exit(1);
      }
   }

   /* Read the results */
   err = clEnqueueReadBuffer(queue, data_buffer, CL_TRUE, 0, 
         2*NUM_POINTS*sizeof(float), data, 0, NULL, NULL);
   if(err < 0) {
      perror("Couldn't read the buffer");
      exit(1);   
   }

   /* Compute accurate values */
   if(direction > 0)
      fft(NUM_POINTS, check_input, check_output);
   else
      ifft(NUM_POINTS, check_output, check_input);

   /* Determine error */
   error = 0.0;
   for(i=0; i<NUM_POINTS; i++) {
      error += fabs(check_output[i][0] - data[2*i])/fabs(check_output[i][0]);
      error += fabs(check_output[i][1] - data[2*i+1])/fabs(check_output[i][1]);
   }
   error = error/(NUM_POINTS*2);

   /* Display check results */
   printf("%u-point ", num_points);
   if(direction > 0) 
      printf("FFT ");
   else     
      printf("IFFT ");
   printf("completed with %lf average relative error.\n", error);

   /* Deallocate resources */
   clReleaseMemObject(data_buffer);
   clReleaseKernel(init_kernel);
   clReleaseKernel(stage_kernel);
   clReleaseKernel(scale_kernel);
   clReleaseCommandQueue(queue);
   clReleaseProgram(program);
   clReleaseContext(context);
   return 0;
}
bool
initOpenCL(ComputeEnv *env)
{
        int r = cllib_init();
        if (r < 0) {
                return false;
        }

        cl_uint num_plt;
        cl_platform_id plts[16];
        clGetPlatformIDs(16, plts, &num_plt);
        bool found = false;
        cl_int err;

        cl_platform_id platform;
        cl_context context;
        cl_device_id dev;
        cl_command_queue queue;
        cl_kernel ker_filter, ker_filter_in1_out32, ker_filter_in128_out1;
        cl_kernel ker_filter_in3_out32, ker_filter_in128_out3;
        cl_program program = 0;

        for (unsigned int i=0; i<num_plt; i++) {
                size_t sz;
                cl_uint num_dev;

                clGetPlatformInfo(plts[i], CL_PLATFORM_NAME, 0, nullptr, &sz);
                std::vector<char> name(sz);
                clGetPlatformInfo(plts[i], CL_PLATFORM_NAME, sz, &name[0], &sz);

                bool is_amd = strstr(&name[0], "AMD") != NULL;
                bool is_apple = strstr(&name[0], "Apple") != NULL;
                //bool is_intel = strstr(&name[0], "Intel") != NULL;
                //bool is_nvidia = strstr(&name[0], "NVIDIA") != NULL;

                if (!is_amd && !is_apple) {
                        continue;
                }

                clGetDeviceIDs(plts[i], CL_DEVICE_TYPE_GPU, 0, nullptr, &num_dev);
                if (num_dev == 0) {
                        continue;
                }

                std::vector<cl_device_id> devs(num_dev);
                clGetDeviceIDs(plts[i], CL_DEVICE_TYPE_GPU, num_dev, &devs[0], &num_dev);

                platform = plts[i];
                dev = devs[0];

                cl_context_properties props[] =
                        {CL_CONTEXT_PLATFORM, (cl_context_properties)(plts[i]), 0};
                cl_context ctxt = clCreateContext(props, 1, &devs[0], NULL, NULL, &err);
                if (err != CL_SUCCESS) {
                        continue;
                }

                context = ctxt;

                found = true;
                break;
        }

        if (!found) {
                return false;
        }

        size_t dev_name_len;
        clGetDeviceInfo(dev, CL_DEVICE_NAME, 0, nullptr, &dev_name_len);
        std::vector<char> dev_name(dev_name_len+1);
        clGetDeviceInfo(dev, CL_DEVICE_NAME, dev_name_len, &dev_name[0], &dev_name_len);

        bool bin_avaiable = false;

#if defined __linux || _WIN32

#ifdef __linux
        ssize_t path_len = 4;
        char *self_path = (char*)malloc(path_len+1);
        while (1) {
                ssize_t r = readlink("/proc/self/exe", self_path, path_len);
                if (r < path_len) {
                        self_path[r] = '\0';
                        break;
                }

                path_len *= 2;
                self_path = (char*)realloc(self_path, path_len+1);
        }

        struct stat self_st;
        stat(self_path, &self_st);
        self_path = dirname(self_path);
#else
        size_t path_len = 4;
        char *self_path = (char*)malloc(path_len+1);
	DWORD len;
        while (1) {
		len = GetModuleFileName(NULL, self_path, path_len);
		if (len > 0 && len != path_len) {
			break;
		}

                path_len *= 2;
                self_path = (char*)realloc(self_path, path_len+1);
        }
	WIN32_FIND_DATA self_st;
	HANDLE finder = FindFirstFile(self_path, &self_st);
	FindClose(finder);

	for (int si=len-1; si>=0; si--) {
		if (self_path[si] == '\\') {
			self_path[si] = '\0';
			break;
		}
	}
#endif

        std::string bin_path = std::string(self_path) + "/" + &dev_name[0] + ".bin";

        FILE *binfp = fopen(bin_path.c_str(), "rb");
        if (binfp) {
#ifdef __linux
                struct stat bin_st;
                stat(bin_path.c_str(), &bin_st);

                bool old = false;
                if (bin_st.st_mtim.tv_sec < self_st.st_mtim.tv_sec) {
                        old = true;
                }

                if (bin_st.st_mtim.tv_sec == self_st.st_mtim.tv_sec) {
                        if (bin_st.st_mtim.tv_nsec < self_st.st_mtim.tv_nsec) {
                                old = true;
                        }
                }
		size_t bin_sz = bin_st.st_size;
#else
                WIN32_FIND_DATA bin_st;
		HANDLE finder = FindFirstFile(bin_path.c_str(), &bin_st);
		FindClose(finder);

		bool old = false;
		uint64_t self_time = (((uint64_t)self_st.ftLastWriteTime.dwHighDateTime)<<32) |
			((uint64_t)self_st.ftLastWriteTime.dwLowDateTime);
		uint64_t bin_time = (((uint64_t)bin_st.ftLastWriteTime.dwHighDateTime)<<32) |
			((uint64_t)bin_st.ftLastWriteTime.dwLowDateTime);

		if (bin_time < self_time) {
			old = true;
		}

		size_t bin_sz = bin_st.nFileSizeLow;
#endif

                if (!old) {
                        unsigned char *bin = (unsigned char*)malloc(bin_sz);

                        size_t rem = bin_sz;
                        unsigned char *p = bin;
                        while (rem) {
                                size_t rsz = fread(p, 1, rem, binfp);
                                if (rsz <= 0) {
                                        break;
                                }

                                rem -= rsz;
                                p += rsz;
                        }

                        if (rem == 0) {
                                cl_int err;
                                program = clCreateProgramWithBinary(context, 1, &dev, &bin_sz,
                                                                    (const unsigned char**)&bin, NULL, &err);

                                if (err == CL_SUCCESS) {
                                        bin_avaiable = true;
                                }
                        }

                        free(bin);
                }

                fclose(binfp);
        }
#endif

        if (! bin_avaiable) {
                const char *source[1] = {prog};
                size_t src_len[1] = {sizeof(prog)-1};

                program = clCreateProgramWithSource(context, 1, source, src_len, &err);
                if (err != CL_SUCCESS) {
                        clReleaseContext(context);
                        return false;
                }

        }

#if defined __linux || defined _WIN32
        free(self_path);
#endif

        err = clBuildProgram(program, 1, &dev, "" , nullptr, nullptr);
        if (err != CL_SUCCESS) {
                size_t log_len;
                clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, 0, nullptr, &log_len);

                std::vector<char> log(log_len+1);
                clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, log_len, &log[0], &log_len);
                log[log_len] = '\0';

                puts(&log[0]);

                clReleaseProgram(program);
                clReleaseContext(context);
                return false;
        }



#if defined __linux || _WIN32
        if (!bin_avaiable) {
                size_t binsz;
                size_t ret_len;
                clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(binsz), &binsz, &ret_len);

                char *buffer = new char [binsz];
                char *ptrs[1];
                ptrs[0] = buffer;

                clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(ptrs), ptrs, &ret_len);

                FILE *fp = fopen(bin_path.c_str(), "wb");

                size_t rem = binsz;
                char *p = buffer;

                while (rem) {
                        size_t wsz = fwrite(p, 1, rem, fp);
                        if (wsz <= 0) {
                                fclose(fp);
                                unlink(bin_path.c_str());
                                fp=NULL;
                                break;
                        }
                        rem -= wsz;
                        p += wsz;
                }

                if (fp) {
                        fclose(fp);
                }

                delete [] buffer;
        }
#endif



        ker_filter = clCreateKernel(program, "filter", &err);
        if (err != CL_SUCCESS) {
                clReleaseProgram(program);
                clReleaseContext(context);
                return false;
        }

        ker_filter_in1_out32 = clCreateKernel(program, "filter_in1_out32", &err);
        if (err != CL_SUCCESS) {
                clReleaseProgram(program);
                clReleaseContext(context);
                clReleaseKernel(ker_filter);
                return false;
        }

        ker_filter_in3_out32 = clCreateKernel(program, "filter_in3_out32", &err);
        if (err != CL_SUCCESS) {
                clReleaseProgram(program);
                clReleaseContext(context);
                clReleaseKernel(ker_filter);
                clReleaseKernel(ker_filter_in1_out32);
                return false;
        }

        ker_filter_in128_out1 = clCreateKernel(program, "filter_in128_out1", &err);
        if (err != CL_SUCCESS) {
                clReleaseProgram(program);
                clReleaseContext(context);
                clReleaseKernel(ker_filter);
                clReleaseKernel(ker_filter_in1_out32);
                return false;
        }

        ker_filter_in128_out3 = clCreateKernel(program, "filter_in128_out3", &err);
        if (err != CL_SUCCESS) {
                clReleaseProgram(program);
                clReleaseContext(context);
                clReleaseKernel(ker_filter);
                clReleaseKernel(ker_filter_in1_out32);
                return false;
        }

        queue = clCreateCommandQueue(context, dev, 0, &err);
        if (err != CL_SUCCESS) {
                clReleaseProgram(program);
                clReleaseContext(context);
                clReleaseKernel(ker_filter);
                clReleaseKernel(ker_filter_in1_out32);
                return false;
        }

        env->num_cl_dev = 1;
        env->cl_dev_list = new OpenCLDev[1];

        env->cl_dev_list[0].platform = platform;
        env->cl_dev_list[0].context = context;
        env->cl_dev_list[0].devid = dev;
        env->cl_dev_list[0].queue = queue;
        env->cl_dev_list[0].program = program;
        env->cl_dev_list[0].ker_filter = ker_filter;
        env->cl_dev_list[0].ker_filter_in1_out32 = ker_filter_in1_out32;
        env->cl_dev_list[0].ker_filter_in128_out1 = ker_filter_in128_out1;
        env->cl_dev_list[0].ker_filter_in3_out32 = ker_filter_in3_out32;
        env->cl_dev_list[0].ker_filter_in128_out3 = ker_filter_in128_out3;
        env->cl_dev_list[0].name = &dev_name[0];

        return true;
}
Exemplo n.º 9
0
int main(int argc, const char** argv)
{
	size_t x = 512, y = 250000; //y has to be a multiple of ciDeviceCount!
	struct svm_node* px = (struct svm_node*)malloc((x+1)*sizeof(struct svm_node));
	gen_data(px, x, 1, 3);
	struct svm_node* py = (struct svm_node*)malloc((x+1)*y*sizeof(struct svm_node));
	for(size_t i = 0; i < y; ++i) {
		struct svm_node* tmp = py+i*(x+1);
		gen_data(tmp, x, 3,2);
	}
	dtype* result = (dtype*)malloc(y*sizeof(dtype));
	int* pyLength = (int*)malloc(y*sizeof(int));
	
	for(size_t i = 0; i < y; ++i)
	{
		for(size_t j = 0; py[i*(x+1)+j].index >= 0; ++j)
			pyLength[i] = py[i*(x+1)+j].index;
		++pyLength[i];
	}
	
	cl_int err = CL_SUCCESS;
//	cl_platform_id platform = NULL;
//	cl_uint ciDeviceCount = 0;
//	cl_device_id *device = NULL;

	// retrieve devices
	cl_platform_id platform;
	err = clGetPlatformIDs(1, &platform, NULL);
	cl_device_id device;
	err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, NULL);

	size_t localDim  = 256l;
	size_t globalDim = localDim*y;
/*	
	device = (cl_device_id *)malloc(ciDeviceCount * sizeof(cl_device_id) );
	err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, ciDeviceCount, device, NULL);
	if (err != CL_SUCCESS)
	{
		printf("Failed to get devices:\n%s\n", oclErrorString(err));
		return -1;
	}
	*/
	//Create the context
	cl_context context1 = clCreateContext(0, 1, &device, NULL, NULL, &err);
	if(err != CL_SUCCESS)
	{
		printf("Context creation failed:\n%d\n", err);
		return -1;
	}										 

	// create a command queue for first device the context reported
	cl_command_queue queue = clCreateCommandQueue(context1, device, 0, 0);
	
	// load program from disk
	char *tmp = strdup(argv[0]);
	char* my_dir = dirname(tmp);

//	size_t program_length;
	char path[256];
  	snprintf(path, PATH_MAX - 1, "%s/vecops.cl", my_dir);
 
	cl_program vecops = load_kernel(path, context1);

	if(err != CL_SUCCESS)
	{
		printf("Program creation failed:\n%d\n", (err));
		return -1;
	}
 
	err = clBuildProgram(vecops, 0, NULL, "-I.", NULL, NULL);
	if(err != CL_SUCCESS)
	{
			err = clGetProgramBuildInfo(vecops, device, CL_PROGRAM_BUILD_LOG, 8192, buffer, NULL);
			if(err != CL_SUCCESS)
				printf("Cannot get build info: %d\n", (err));

			printf("Build log:\n%s\n", buffer);
	}
	
	// create kernel
	cl_kernel sparsedot_kernel;
	
#if version == 1
	sparsedot_kernel = clCreateKernel(vecops, "sparsedot1_kernel", &err);
#endif
#if version == 2
	sparsedot_kernel = clCreateKernel(vecops, "sparsedot4_kernel", &err);
#endif
#if version == 3
	sparsedot_kernel = clCreateKernel(vecops, "sparsedot3_kernel", &err);
#endif
	if (err != CL_SUCCESS)
	{
		printf("Kernel creation failed:\n%d\n", (err));
		return -1;
	}
	
	 
	// allocate memory on the devices
	cl_mem px_d, py_d, result_d, pyLength_d;
	
#if version == 1
	px_d = clCreateBuffer(context1,
							 CL_MEM_READ_ONLY,
							 (x+1) * sizeof(struct svm_node),
							 0, &err);
#endif
#if version == 2 || version == 3
	//unpack px
	int size = px[x-1].index+1;

	for(size_t i = 0; i < y; ++i)
		size = size > pyLength[i] ? size : pyLength[i];

	dtype* px_u = (dtype*)calloc(size, sizeof(dtype));

	unpack(px, px_u);
	printf("px size: %d\n", size);
#endif
#if version == 3
	size_t height, width;
	clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(size_t), &height, 0);
	clGetDeviceInfo(Device, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(size_t), &width, 0);

	size_t region[3];
	region[2] = 1;

	region[0] = min(4, size);
	region[1] = (size+2-1) / 4;
		

	cl_image_format px_format;
	px_format.image_channel_order = CL_R;
	px_format.image_channel_data_type = CL_FLOAT;
#endif
#if version == 2
	px_d = clCreateBuffer(context1,
				 CL_MEM_READ_ONLY,
				 size * sizeof(dtype),
				 0, &err);
#endif
#if version == 3
	 px_d = clCreateImage2D(context1, CL_MEM_READ_ONLY, &px_format,
				  region[0], region[1], 0, 0, &err);

#endif
	if(err != CL_SUCCESS)
	{
		printf("Failed to allocate px:\n%d\n", (err));
		return -1;
	}
	py_d = clCreateBuffer(context1,
		 CL_MEM_READ_ONLY,
		 (x+1) * y * sizeof(struct svm_node),
		 0, &err);
	if(err != CL_SUCCESS)
	{
		printf("Failed to allocate px:\n%d\n", (err));
		return -1;
	}
	result_d = clCreateBuffer(context1,
		CL_MEM_WRITE_ONLY,
		y * sizeof(dtype),
		0, 0);
	pyLength_d = clCreateBuffer(context1,
		CL_MEM_READ_ONLY,
		y * sizeof(int),
		0, 0);
	

#if bench
	//start time measurement
	start_timer(0);
#endif

	// copy host vectors to device
	err = CL_SUCCESS;
   
    err |= clEnqueueWriteBuffer(queue, py_d, CL_FALSE, 0, 
								(x+1) * y * sizeof(struct svm_node), py, 0, NULL, NULL);
									

	err |= clEnqueueWriteBuffer(queue, pyLength_d, CL_FALSE, 0, 
								y * sizeof(int), pyLength, 0, NULL, NULL);

#if version == 1
	err |= clEnqueueWriteBuffer(queue, px_d, CL_FALSE, 0, 
								(x+1) * sizeof(struct svm_node), px, 0, NULL, NULL);
#endif
#if version == 2
	err |= clEnqueueWriteBuffer(queue, px_d, CL_FALSE, 0, 
								size * sizeof(dtype), px_u, 0, NULL, NULL);
#endif
#if version == 3
	size_t offset[] = {0,0,0};
	err |= clEnqueueWriteImage(queue, px_d, CL_TRUE, offset, region, sizeof(dtype), 0, 
							   px_u, 0, 0, NULL);
#endif
	clFinish(queue);

	 
	if(err != CL_SUCCESS)
	{
		printf("Data transfer to GPU failed:\n%d\n", (err));
		return -1;
	}

#if bench
	stop_timer(0);
	start_timer(1);
#endif
	// set kernel arguments

	clSetKernelArg(sparsedot_kernel, 0, sizeof(cl_mem), (void *) &px_d);
	clSetKernelArg(sparsedot_kernel, 1, sizeof(cl_mem), (void *) &py_d);
	clSetKernelArg(sparsedot_kernel, 2, sizeof(cl_mem), (void *) &result_d);
	clSetKernelArg(sparsedot_kernel, 3, sizeof(cl_mem), (void *) &pyLength_d);
	clSetKernelArg(sparsedot_kernel, 4, sizeof(cl_ulong), (void *) &x);
	clSetKernelArg(sparsedot_kernel, 5, sizeof(cl_ulong), (void *) &y);
//	clSetKernelArg(sparsedot_kernel, 6, sizeof(cl_float8)*localDim, 0);
#if version == 3
		clSetKernelArg(sparsedot_kernel, 7, sizeof(cl_long), (void *) &region[1]) ;		
		clSetKernelArg(sparsedot_kernel, 8, sizeof(cl_long), (void *) &region[0]) ;		
#endif
	clFlush(queue);

	// start kernel
	err = clEnqueueNDRangeKernel(queue, sparsedot_kernel, 1, 0, &globalDim, &localDim,
					   0, NULL, 0);

	if(err != CL_SUCCESS)
	{
		printf("Kernel launch failed:\n%d\n", (err));
		return -1;
	}

	clFinish(queue);
	
#if bench	
	stop_timer(1);
	start_timer(2);
#endif

	cl_event result_gather;
	 
	// Non-blocking copy of result from device to host
	err = clEnqueueReadBuffer(queue, result_d, CL_FALSE, 0, y * sizeof(dtype), 
						result, 0, NULL, &result_gather);
	
	if(err != CL_SUCCESS)
	{
		printf("Reading result failed:\n%d\n", (err));
		return -1;
	}

	// CPU sync with GPU
	clWaitForEvents(1, &result_gather);

#if bench	
	// stop GPU time measurement
	stop_timer(2);
#endif
	//check result
/*	for(size_t i = 0; i < y; ++i)
	{
		printf("%f ", result[i]);
	}
	printf("\n");
  */  

#if bench
	start_timer(3);
#endif
	bool correct = validate(px, py, result, x, y);
#if bench
	stop_timer(3);
	printf("v%i; x: %lu, y: %lu\n", version, x, y);
	printf("CPU: %f, upcpy: %f DeviceCalc: %f, downcpy: %f\n", 
		   get_secs(3), get_secs(0), get_secs(1), get_secs(2));
#endif
	
	if(correct)
		printf("SUCCESS!\n");
		
	//cleenup

	clReleaseKernel(sparsedot_kernel);
	clReleaseCommandQueue(queue);
	clReleaseEvent(result_gather);
	clReleaseMemObject(px_d);
	clReleaseMemObject(py_d);
	clReleaseMemObject(result_d);
	clReleaseMemObject(pyLength_d);
//	clReleaseDevice(device);

	free(px);
#if version == 2 || version == 3
	free(px_u);
#endif
	free(py);
	free(result);

	return 0;
}
Exemplo n.º 10
0
//Maybe somebody could tell me how to use template when exporting a class from a dll. Probably not possible?
cl_uint OclHost::getDeviceInfoInt(cl_device_info info) {
    cl_uint value = 0;
    clGetDeviceInfo(oclDevice, info, sizeof(value), &value, 0);
    return value;
}
Exemplo n.º 11
0
cl_ulong OclHost::getDeviceInfoLong(cl_device_info info) {
    cl_ulong value = 0;
    clGetDeviceInfo(oclDevice, info, sizeof(value), &value, 0);
    return value;
}
Exemplo n.º 12
0
OclHost::OclHost(int const device_type, int gpu_id, int const cpu_cores) :
    devType(device_type), maxGlobalMem(0), maxLocalMem(0) {
//		if (!isGPU()) {
//				gpu_id = 0;
//		}

    cl_int ciErrNum = CL_SUCCESS;
    Log.Verbose("Using device number %d", gpu_id);
//#pragma omp critical
//	{
    if (contextUserCount == 0) {
        Log.Verbose("Creating ocl context.");
//		cl_uint ciDeviceCount = 0;
        cl_platform_id cpPlatform = NULL;

        cpPlatform = getPlatform();
        //Get the devices

        //Get number of devices
        ciErrNum = clGetDeviceIDs(cpPlatform, devType, 0, NULL, &ciDeviceCount);
        checkClError("Couldn't get number of OpenCl devices. Error: ",
                     ciErrNum);

        if (isGPU()) {
            //Getting device ids
            devices = (cl_device_id *) malloc(
                          ciDeviceCount * sizeof(cl_device_id));
            ciErrNum = clGetDeviceIDs(cpPlatform, devType, ciDeviceCount,
                                      devices, NULL);
            checkClError("Couldn't get OpenCl device ids. Error: ", ciErrNum);

            //Create context
            oclGpuContext = clCreateContext(0, ciDeviceCount, devices, NULL,
                                            NULL, &ciErrNum);
            checkClError("Couldn't create context. Error: ", ciErrNum);
            Log.Message("Context for GPU devices created.");

            Log.Message("%d GPU device(s) found: ", ciDeviceCount);
            for (int i = 0; i < ciDeviceCount; ++i) {
                char device_string[1024];
                char driver_string[1024];
                clGetDeviceInfo(devices[i], CL_DEVICE_NAME,
                                sizeof(device_string), &device_string, NULL);
                clGetDeviceInfo(devices[i], CL_DRIVER_VERSION,
                                sizeof(driver_string), &driver_string, NULL);
                Log.Message("Device %d: %s (Driver: %s)", i, device_string, driver_string);
            }

        } else {
            if (ciDeviceCount > 1) {
                Log.Error("More than one CPU device found.");
                exit(-1);
            }

            cl_device_id device_id;
            ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_CPU, 1,
                                      &device_id, NULL);
            checkClError("Couldn't get CPU device id. Error: ", ciErrNum);

            Log.Message("%d CPU device found.", ciDeviceCount);
            char device_string[1024];
            char driver_string[1024];
            clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(device_string),
                            &device_string, NULL);
            clGetDeviceInfo(device_id, CL_DRIVER_VERSION, sizeof(driver_string),
                            &driver_string, NULL);
            Log.Message("Device %d: %s (Driver: %s)", 0, device_string, driver_string);

            cl_device_partition_property props[3];

            props[0] = CL_DEVICE_PARTITION_EQUALLY; // Equally
            props[1] = 1; // 4 compute units per sub-device
            props[2] = 0;

            devices = (cl_device_id *) malloc(256 * sizeof(cl_device_id));
            ciErrNum = clCreateSubDevices(device_id, props, 256, devices,
                                          &ciDeviceCount);
            if (ciErrNum == -18) {
                ciDeviceCount = 1;
                devices[0] = device_id;
            } else {
                checkClError("Couldn't create sub-devices. Error: ", ciErrNum);
            }

            Log.Message("%d CPU cores available.", ciDeviceCount);

            //Create context
            oclGpuContext = clCreateContext(0, ciDeviceCount, devices, NULL,
                                            NULL, &ciErrNum);
            checkClError("Couldn't create context. Error: ", ciErrNum);

        }
    }
    contextUserCount += 1;
    //}

    if (!isGPU()) {
        gpu_id = gpu_id % ciDeviceCount;
    }
    oclDevice = devices[gpu_id];
    //Create context
    //oclGpuContext = clCreateContext(0, 1, &oclDevice, NULL, NULL, &ciErrNum);
    //checkClError("Couldn't create context. Error: ", ciErrNum);

    // create command queue
    oclCommandQueue = clCreateCommandQueue(oclGpuContext, oclDevice, 0,
                                           &ciErrNum);

    checkClError("Couldn't create command queue for device: ", ciErrNum);

}
Exemplo n.º 13
0
int clDevicesNum(void) {
	cl_int status;
	char pbuff[256];
	cl_uint numDevices;
	cl_uint numPlatforms;
	int most_devices = -1;
	cl_platform_id *platforms;
	cl_platform_id platform = NULL;
	unsigned int i, mdplatform = 0;

	status = clGetPlatformIDs(0, NULL, &numPlatforms);
	/* If this fails, assume no GPUs. */
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: clGetPlatformsIDs failed (no OpenCL SDK installed?)", status);
		return -1;
	}

	if (numPlatforms == 0) {
		applog(LOG_ERR, "clGetPlatformsIDs returned no platforms (no OpenCL SDK installed?)");
		return -1;
	}

	platforms = (cl_platform_id *)alloca(numPlatforms*sizeof(cl_platform_id));
	status = clGetPlatformIDs(numPlatforms, platforms, NULL);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: Getting Platform Ids. (clGetPlatformsIDs)", status);
		return -1;
	}

	for (i = 0; i < numPlatforms; i++) {
		if (opt_platform_id >= 0 && (int)i != opt_platform_id)
			continue;

		status = clGetPlatformInfo( platforms[i], CL_PLATFORM_VENDOR, sizeof(pbuff), pbuff, NULL);
		if (status != CL_SUCCESS) {
			applog(LOG_ERR, "Error %d: Getting Platform Info. (clGetPlatformInfo)", status);
			return -1;
		}
		platform = platforms[i];
		applog(LOG_INFO, "CL Platform %d vendor: %s", i, pbuff);
		status = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(pbuff), pbuff, NULL);
		if (status == CL_SUCCESS)
			applog(LOG_INFO, "CL Platform %d name: %s", i, pbuff);
		status = clGetPlatformInfo(platform, CL_PLATFORM_VERSION, sizeof(pbuff), pbuff, NULL);
		if (status == CL_SUCCESS)
			applog(LOG_INFO, "CL Platform %d version: %s", i, pbuff);
		status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
		if (status != CL_SUCCESS) {
			applog(LOG_INFO, "Error %d: Getting Device IDs (num)", status);
			continue;
		}
		applog(LOG_INFO, "Platform %d devices: %d", i, numDevices);
		if ((int)numDevices > most_devices) {
			most_devices = numDevices;
			mdplatform = i;
		}
		if (numDevices) {
			unsigned int j;
			cl_device_id *devices = (cl_device_id *)malloc(numDevices*sizeof(cl_device_id));

			clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices, devices, NULL);
			for (j = 0; j < numDevices; j++) {
				clGetDeviceInfo(devices[j], CL_DEVICE_NAME, sizeof(pbuff), pbuff, NULL);
				applog(LOG_INFO, "\t%i\t%s", j, pbuff);
			}
			free(devices);
		}
	}

	if (opt_platform_id < 0)
		opt_platform_id = mdplatform;;

	return most_devices;
}
Exemplo n.º 14
0
_clState *initCl(unsigned int gpu, char *name, size_t nameSize)
{
	_clState *clState = calloc(1, sizeof(_clState));
	bool patchbfi = false, prog_built = false;
	struct cgpu_info *cgpu = &gpus[gpu];
	cl_platform_id platform = NULL;
	char pbuff[256], vbuff[255];
	cl_platform_id* platforms;
	cl_uint preferred_vwidth;
	cl_device_id *devices;
	cl_uint numPlatforms;
	cl_uint numDevices;
	cl_int status;

	status = clGetPlatformIDs(0, NULL, &numPlatforms);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: Getting Platforms. (clGetPlatformsIDs)", status);
		return NULL;
	}

	platforms = (cl_platform_id *)alloca(numPlatforms*sizeof(cl_platform_id));
	status = clGetPlatformIDs(numPlatforms, platforms, NULL);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: Getting Platform Ids. (clGetPlatformsIDs)", status);
		return NULL;
	}

	if (opt_platform_id >= (int)numPlatforms) {
		applog(LOG_ERR, "Specified platform that does not exist");
		return NULL;
	}

	status = clGetPlatformInfo(platforms[opt_platform_id], CL_PLATFORM_VENDOR, sizeof(pbuff), pbuff, NULL);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: Getting Platform Info. (clGetPlatformInfo)", status);
		return NULL;
	}
	platform = platforms[opt_platform_id];

	if (platform == NULL) {
		perror("NULL platform found!\n");
		return NULL;
	}

	applog(LOG_INFO, "CL Platform vendor: %s", pbuff);
	status = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(pbuff), pbuff, NULL);
	if (status == CL_SUCCESS)
		applog(LOG_INFO, "CL Platform name: %s", pbuff);
	status = clGetPlatformInfo(platform, CL_PLATFORM_VERSION, sizeof(vbuff), vbuff, NULL);
	if (status == CL_SUCCESS)
		applog(LOG_INFO, "CL Platform version: %s", vbuff);

	status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: Getting Device IDs (num)", status);
		return NULL;
	}

	if (numDevices > 0 ) {
		devices = (cl_device_id *)malloc(numDevices*sizeof(cl_device_id));

		/* Now, get the device list data */

		status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices, devices, NULL);
		if (status != CL_SUCCESS) {
			applog(LOG_ERR, "Error %d: Getting Device IDs (list)", status);
			return NULL;
		}

		applog(LOG_INFO, "List of devices:");

		unsigned int i;
		for (i = 0; i < numDevices; i++) {
			status = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(pbuff), pbuff, NULL);
			if (status != CL_SUCCESS) {
				applog(LOG_ERR, "Error %d: Getting Device Info", status);
				return NULL;
			}

			applog(LOG_INFO, "\t%i\t%s", i, pbuff);
		}

		if (gpu < numDevices) {
			status = clGetDeviceInfo(devices[gpu], CL_DEVICE_NAME, sizeof(pbuff), pbuff, NULL);
			if (status != CL_SUCCESS) {
				applog(LOG_ERR, "Error %d: Getting Device Info", status);
				return NULL;
			}

			applog(LOG_INFO, "Selected %i: %s", gpu, pbuff);
			strncpy(name, pbuff, nameSize);
		} else {
			applog(LOG_ERR, "Invalid GPU %i", gpu);
			return NULL;
		}

	} else return NULL;

	cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 };

	clState->context = clCreateContextFromType(cps, CL_DEVICE_TYPE_GPU, NULL, NULL, &status);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: Creating Context. (clCreateContextFromType)", status);
		return NULL;
	}

	/////////////////////////////////////////////////////////////////
	// Create an OpenCL command queue
	/////////////////////////////////////////////////////////////////
	clState->commandQueue = clCreateCommandQueue(clState->context, devices[gpu],
						     CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &status);
	if (status != CL_SUCCESS) /* Try again without OOE enable */
		clState->commandQueue = clCreateCommandQueue(clState->context, devices[gpu], 0 , &status);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: Creating Command Queue. (clCreateCommandQueue)", status);
		return NULL;
	}

	/* Check for BFI INT support. Hopefully people don't mix devices with
	 * and without it! */
	char * extensions = malloc(1024);
	const char * camo = "cl_amd_media_ops";
	char *find;

	status = clGetDeviceInfo(devices[gpu], CL_DEVICE_EXTENSIONS, 1024, (void *)extensions, NULL);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_EXTENSIONS", status);
		return NULL;
	}
	find = strstr(extensions, camo);
	if (find)
		clState->hasBitAlign = true;
		
	/* Check for OpenCL >= 1.0 support, needed for global offset parameter usage. */
	char * devoclver = malloc(1024);
	const char * ocl10 = "OpenCL 1.0";

	status = clGetDeviceInfo(devices[gpu], CL_DEVICE_VERSION, 1024, (void *)devoclver, NULL);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_VERSION", status);
		return NULL;
	}
	find = strstr(devoclver, ocl10);
	if (!find)
		clState->hasOpenCL11plus = true;

	status = clGetDeviceInfo(devices[gpu], CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, sizeof(cl_uint), (void *)&preferred_vwidth, NULL);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT", status);
		return NULL;
	}
	applog(LOG_DEBUG, "Preferred vector width reported %d", preferred_vwidth);

	status = clGetDeviceInfo(devices[gpu], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), (void *)&clState->max_work_size, NULL);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_MAX_WORK_GROUP_SIZE", status);
		return NULL;
	}
	applog(LOG_DEBUG, "Max work group size reported %d", clState->max_work_size);

	status = clGetDeviceInfo(devices[gpu], CL_DEVICE_MAX_MEM_ALLOC_SIZE , sizeof(cl_ulong), (void *)&cgpu->max_alloc, NULL);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_MAX_MEM_ALLOC_SIZE", status);
		return NULL;
	}
	applog(LOG_DEBUG, "Max mem alloc size is %u", cgpu->max_alloc);

	/* Create binary filename based on parameters passed to opencl
	 * compiler to ensure we only load a binary that matches what would
	 * have otherwise created. The filename is:
	 * name + kernelname +/- g(offset) + v + vectors + w + work_size + l + sizeof(long) + .bin
	 * For scrypt the filename is:
	 * name + kernelname + g + lg + lookup_gap + tc + thread_concurrency + w + work_size + l + sizeof(long) + .bin
	 */
	char binaryfilename[255];
	char filename[255];
	char numbuf[16];

	if (cgpu->kernel == KL_NONE) {
		if (opt_scrypt) {
			applog(LOG_INFO, "Selecting scrypt kernel");
			clState->chosen_kernel = KL_SCRYPT;
		} else if (!strstr(name, "Tahiti") &&
			/* Detect all 2.6 SDKs not with Tahiti and use diablo kernel */
			(strstr(vbuff, "844.4") ||  // Linux 64 bit ATI 2.6 SDK
			 strstr(vbuff, "851.4") ||  // Windows 64 bit ""
			 strstr(vbuff, "831.4") ||
			 strstr(vbuff, "898.1") ||  // 12.2 driver SDK 
			 strstr(vbuff, "923.1") ||  // 12.4
			 strstr(vbuff, "938.2") ||  // SDK 2.7
			 strstr(vbuff, "1113.2"))) {// SDK 2.8
				applog(LOG_INFO, "Selecting diablo kernel");
				clState->chosen_kernel = KL_DIABLO;
		/* Detect all 7970s, older ATI and NVIDIA and use poclbm */
		} else if (strstr(name, "Tahiti") || !clState->hasBitAlign) {
			applog(LOG_INFO, "Selecting poclbm kernel");
			clState->chosen_kernel = KL_POCLBM;
		/* Use phatk for the rest R5xxx R6xxx */
		} else {
			applog(LOG_INFO, "Selecting phatk kernel");
			clState->chosen_kernel = KL_PHATK;
		}
		cgpu->kernel = clState->chosen_kernel;
	} else {
		clState->chosen_kernel = cgpu->kernel;
		if (clState->chosen_kernel == KL_PHATK &&
		    (strstr(vbuff, "844.4") || strstr(vbuff, "851.4") ||
		     strstr(vbuff, "831.4") || strstr(vbuff, "898.1") ||
		     strstr(vbuff, "923.1") || strstr(vbuff, "938.2") ||
		     strstr(vbuff, "1113.2"))) {
			applog(LOG_WARNING, "WARNING: You have selected the phatk kernel.");
			applog(LOG_WARNING, "You are running SDK 2.6+ which performs poorly with this kernel.");
			applog(LOG_WARNING, "Downgrade your SDK and delete any .bin files before starting again.");
			applog(LOG_WARNING, "Or allow cgminer to automatically choose a more suitable kernel.");
		}
	}

	/* For some reason 2 vectors is still better even if the card says
	 * otherwise, and many cards lie about their max so use 256 as max
	 * unless explicitly set on the command line. Tahiti prefers 1 */
	if (strstr(name, "Tahiti"))
		preferred_vwidth = 1;
	else if (preferred_vwidth > 2)
		preferred_vwidth = 2;

	switch (clState->chosen_kernel) {
		case KL_POCLBM:
			strcpy(filename, POCLBM_KERNNAME".cl");
			strcpy(binaryfilename, POCLBM_KERNNAME);
			break;
		case KL_PHATK:
			strcpy(filename, PHATK_KERNNAME".cl");
			strcpy(binaryfilename, PHATK_KERNNAME);
			break;
		case KL_DIAKGCN:
			strcpy(filename, DIAKGCN_KERNNAME".cl");
			strcpy(binaryfilename, DIAKGCN_KERNNAME);
			break;
		case KL_SCRYPT:
			strcpy(filename, SCRYPT_KERNNAME".cl");
			strcpy(binaryfilename, SCRYPT_KERNNAME);
			/* Scrypt only supports vector 1 */
			cgpu->vwidth = 1;
			break;
		case KL_NONE: /* Shouldn't happen */
		case KL_DIABLO:
			strcpy(filename, DIABLO_KERNNAME".cl");
			strcpy(binaryfilename, DIABLO_KERNNAME);
			break;
	}

	if (cgpu->vwidth)
		clState->vwidth = cgpu->vwidth;
	else {
		clState->vwidth = preferred_vwidth;
		cgpu->vwidth = preferred_vwidth;
	}

	if (((clState->chosen_kernel == KL_POCLBM || clState->chosen_kernel == KL_DIABLO || clState->chosen_kernel == KL_DIAKGCN) &&
		clState->vwidth == 1 && clState->hasOpenCL11plus) || opt_scrypt)
			clState->goffset = true;

	if (cgpu->work_size && cgpu->work_size <= clState->max_work_size)
		clState->wsize = cgpu->work_size;
	else if (opt_scrypt)
		clState->wsize = 256;
	else if (strstr(name, "Tahiti"))
		clState->wsize = 64;
	else
		clState->wsize = (clState->max_work_size <= 256 ? clState->max_work_size : 256) / clState->vwidth;
	cgpu->work_size = clState->wsize;

#ifdef USE_SCRYPT
	if (opt_scrypt) {
		if (!cgpu->opt_lg) {
			applog(LOG_DEBUG, "GPU %d: selecting lookup gap of 2", gpu);
			cgpu->lookup_gap = 2;
		} else
			cgpu->lookup_gap = cgpu->opt_lg;

		if (!cgpu->opt_tc) {
			unsigned int sixtyfours;

			sixtyfours =  cgpu->max_alloc / 131072 / 64 - 1;
			cgpu->thread_concurrency = sixtyfours * 64;
			if (cgpu->shaders && cgpu->thread_concurrency > cgpu->shaders) {
				cgpu->thread_concurrency -= cgpu->thread_concurrency % cgpu->shaders;
				if (cgpu->thread_concurrency > cgpu->shaders * 5)
					cgpu->thread_concurrency = cgpu->shaders * 5;
			}
			applog(LOG_DEBUG, "GPU %d: selecting thread concurrency of %u",gpu,  cgpu->thread_concurrency);
		} else
			cgpu->thread_concurrency = cgpu->opt_tc;
	}
#endif

	FILE *binaryfile;
	size_t *binary_sizes;
	char **binaries;
	int pl;
	char *source = file_contents(filename, &pl);
	size_t sourceSize[] = {(size_t)pl};
	cl_uint slot, cpnd;

	slot = cpnd = 0;

	if (!source)
		return NULL;

	binary_sizes = calloc(sizeof(size_t) * MAX_GPUDEVICES * 4, 1);
	if (unlikely(!binary_sizes)) {
		applog(LOG_ERR, "Unable to calloc binary_sizes");
		return NULL;
	}
	binaries = calloc(sizeof(char *) * MAX_GPUDEVICES * 4, 1);
	if (unlikely(!binaries)) {
		applog(LOG_ERR, "Unable to calloc binaries");
		return NULL;
	}

	strcat(binaryfilename, name);
	if (clState->goffset)
		strcat(binaryfilename, "g");
	if (opt_scrypt) {
#ifdef USE_SCRYPT
		sprintf(numbuf, "lg%utc%u", cgpu->lookup_gap, (unsigned int)cgpu->thread_concurrency);
		strcat(binaryfilename, numbuf);
#endif
	} else {
		sprintf(numbuf, "v%d", clState->vwidth);
		strcat(binaryfilename, numbuf);
	}
	sprintf(numbuf, "w%d", (int)clState->wsize);
	strcat(binaryfilename, numbuf);
	sprintf(numbuf, "l%d", (int)sizeof(long));
	strcat(binaryfilename, numbuf);
	strcat(binaryfilename, ".bin");

	binaryfile = fopen(binaryfilename, "rb");
	if (!binaryfile) {
		applog(LOG_DEBUG, "No binary found, generating from source");
	} else {
		struct stat binary_stat;

		if (unlikely(stat(binaryfilename, &binary_stat))) {
			applog(LOG_DEBUG, "Unable to stat binary, generating from source");
			fclose(binaryfile);
			goto build;
		}
		if (!binary_stat.st_size)
			goto build;

		binary_sizes[slot] = binary_stat.st_size;
		binaries[slot] = (char *)calloc(binary_sizes[slot], 1);
		if (unlikely(!binaries[slot])) {
			applog(LOG_ERR, "Unable to calloc binaries");
			fclose(binaryfile);
			return NULL;
		}

		if (fread(binaries[slot], 1, binary_sizes[slot], binaryfile) != binary_sizes[slot]) {
			applog(LOG_ERR, "Unable to fread binaries");
			fclose(binaryfile);
			free(binaries[slot]);
			goto build;
		}

		clState->program = clCreateProgramWithBinary(clState->context, 1, &devices[gpu], &binary_sizes[slot], (const unsigned char **)binaries, &status, NULL);
		if (status != CL_SUCCESS) {
			applog(LOG_ERR, "Error %d: Loading Binary into cl_program (clCreateProgramWithBinary)", status);
			fclose(binaryfile);
			free(binaries[slot]);
			goto build;
		}

		fclose(binaryfile);
		applog(LOG_DEBUG, "Loaded binary image %s", binaryfilename);

		goto built;
	}

	/////////////////////////////////////////////////////////////////
	// Load CL file, build CL program object, create CL kernel object
	/////////////////////////////////////////////////////////////////

build:
	clState->program = clCreateProgramWithSource(clState->context, 1, (const char **)&source, sourceSize, &status);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: Loading Binary into cl_program (clCreateProgramWithSource)", status);
		return NULL;
	}

	/* create a cl program executable for all the devices specified */
	char *CompilerOptions = calloc(1, 256);

#ifdef USE_SCRYPT
	if (opt_scrypt)
		sprintf(CompilerOptions, "-D LOOKUP_GAP=%d -D CONCURRENT_THREADS=%d -D WORKSIZE=%d",
			cgpu->lookup_gap, (unsigned int)cgpu->thread_concurrency, (int)clState->wsize);
	else
#endif
	{
		sprintf(CompilerOptions, "-D WORKSIZE=%d -D VECTORS%d -D WORKVEC=%d",
			(int)clState->wsize, clState->vwidth, (int)clState->wsize * clState->vwidth);
	}
	applog(LOG_DEBUG, "Setting worksize to %d", clState->wsize);
	if (clState->vwidth > 1)
		applog(LOG_DEBUG, "Patched source to suit %d vectors", clState->vwidth);

	if (clState->hasBitAlign) {
		strcat(CompilerOptions, " -D BITALIGN");
		applog(LOG_DEBUG, "cl_amd_media_ops found, setting BITALIGN");
		if (strstr(name, "Cedar") ||
		    strstr(name, "Redwood") ||
		    strstr(name, "Juniper") ||
		    strstr(name, "Cypress" ) ||
		    strstr(name, "Hemlock" ) ||
		    strstr(name, "Caicos" ) ||
		    strstr(name, "Turks" ) ||
		    strstr(name, "Barts" ) ||
		    strstr(name, "Cayman" ) ||
		    strstr(name, "Antilles" ) ||
		    strstr(name, "Wrestler" ) ||
		    strstr(name, "Zacate" ) ||
		    strstr(name, "WinterPark" ))
			patchbfi = true;
	} else
		applog(LOG_DEBUG, "cl_amd_media_ops not found, will not set BITALIGN");

	if (patchbfi) {
		strcat(CompilerOptions, " -D BFI_INT");
		applog(LOG_DEBUG, "BFI_INT patch requiring device found, patched source with BFI_INT");
	} else
		applog(LOG_DEBUG, "BFI_INT patch requiring device not found, will not BFI_INT patch");

	if (clState->goffset)
		strcat(CompilerOptions, " -D GOFFSET");

	if (!clState->hasOpenCL11plus)
		strcat(CompilerOptions, " -D OCL1");

	applog(LOG_DEBUG, "CompilerOptions: %s", CompilerOptions);
	status = clBuildProgram(clState->program, 1, &devices[gpu], CompilerOptions , NULL, NULL);
	free(CompilerOptions);

	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: Building Program (clBuildProgram)", status);
		size_t logSize;
		status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);

		char *log = malloc(logSize);
		status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, logSize, log, NULL);
		applog(LOG_ERR, "%s", log);
		return NULL;
	}

	prog_built = true;

#ifdef __APPLE__
	/* OSX OpenCL breaks reading off binaries with >1 GPU so always build
	 * from source. */
	goto built;
#endif

	status = clGetProgramInfo(clState->program, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &cpnd, NULL);
	if (unlikely(status != CL_SUCCESS)) {
		applog(LOG_ERR, "Error %d: Getting program info CL_PROGRAM_NUM_DEVICES. (clGetProgramInfo)", status);
		return NULL;
	}

	status = clGetProgramInfo(clState->program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t)*cpnd, binary_sizes, NULL);
	if (unlikely(status != CL_SUCCESS)) {
		applog(LOG_ERR, "Error %d: Getting program info CL_PROGRAM_BINARY_SIZES. (clGetProgramInfo)", status);
		return NULL;
	}

	/* The actual compiled binary ends up in a RANDOM slot! Grr, so we have
	 * to iterate over all the binary slots and find where the real program
	 * is. What the heck is this!? */
	for (slot = 0; slot < cpnd; slot++)
		if (binary_sizes[slot])
			break;

	/* copy over all of the generated binaries. */
	applog(LOG_DEBUG, "Binary size for gpu %d found in binary slot %d: %d", gpu, slot, binary_sizes[slot]);
	if (!binary_sizes[slot]) {
		applog(LOG_ERR, "OpenCL compiler generated a zero sized binary, FAIL!");
		return NULL;
	}
	binaries[slot] = calloc(sizeof(char) * binary_sizes[slot], 1);
	status = clGetProgramInfo(clState->program, CL_PROGRAM_BINARIES, sizeof(char *) * cpnd, binaries, NULL );
	if (unlikely(status != CL_SUCCESS)) {
		applog(LOG_ERR, "Error %d: Getting program info. CL_PROGRAM_BINARIES (clGetProgramInfo)", status);
		return NULL;
	}

	/* Patch the kernel if the hardware supports BFI_INT but it needs to
	 * be hacked in */
	if (patchbfi) {
		unsigned remaining = binary_sizes[slot];
		char *w = binaries[slot];
		unsigned int start, length;

		/* Find 2nd incidence of .text, and copy the program's
		* position and length at a fixed offset from that. Then go
		* back and find the 2nd incidence of \x7ELF (rewind by one
		* from ELF) and then patch the opcocdes */
		if (!advance(&w, &remaining, ".text"))
			goto build;
		w++; remaining--;
		if (!advance(&w, &remaining, ".text")) {
			/* 32 bit builds only one ELF */
			w--; remaining++;
		}
		memcpy(&start, w + 285, 4);
		memcpy(&length, w + 289, 4);
		w = binaries[slot]; remaining = binary_sizes[slot];
		if (!advance(&w, &remaining, "ELF"))
			goto build;
		w++; remaining--;
		if (!advance(&w, &remaining, "ELF")) {
			/* 32 bit builds only one ELF */
			w--; remaining++;
		}
		w--; remaining++;
		w += start; remaining -= start;
		applog(LOG_DEBUG, "At %p (%u rem. bytes), to begin patching",
			w, remaining);
		patch_opcodes(w, length);

		status = clReleaseProgram(clState->program);
		if (status != CL_SUCCESS) {
			applog(LOG_ERR, "Error %d: Releasing program. (clReleaseProgram)", status);
			return NULL;
		}

		clState->program = clCreateProgramWithBinary(clState->context, 1, &devices[gpu], &binary_sizes[slot], (const unsigned char **)&binaries[slot], &status, NULL);
		if (status != CL_SUCCESS) {
			applog(LOG_ERR, "Error %d: Loading Binary into cl_program (clCreateProgramWithBinary)", status);
			return NULL;
		}

		/* Program needs to be rebuilt */
		prog_built = false;
	}

	free(source);

	/* Save the binary to be loaded next time */
	binaryfile = fopen(binaryfilename, "wb");
	if (!binaryfile) {
		/* Not a fatal problem, just means we build it again next time */
		applog(LOG_DEBUG, "Unable to create file %s", binaryfilename);
	} else {
		if (unlikely(fwrite(binaries[slot], 1, binary_sizes[slot], binaryfile) != binary_sizes[slot])) {
			applog(LOG_ERR, "Unable to fwrite to binaryfile");
			return NULL;
		}
		fclose(binaryfile);
	}
built:
	if (binaries[slot])
		free(binaries[slot]);
	free(binaries);
	free(binary_sizes);

	applog(LOG_INFO, "Initialising kernel %s with%s bitalign, %d vectors and worksize %d",
	       filename, clState->hasBitAlign ? "" : "out", clState->vwidth, clState->wsize);

	if (!prog_built) {
		/* create a cl program executable for all the devices specified */
		status = clBuildProgram(clState->program, 1, &devices[gpu], NULL, NULL, NULL);
		if (status != CL_SUCCESS) {
			applog(LOG_ERR, "Error %d: Building Program (clBuildProgram)", status);
			size_t logSize;
			status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);

			char *log = malloc(logSize);
			status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, logSize, log, NULL);
			applog(LOG_ERR, "%s", log);
			return NULL;
		}
	}

	/* get a kernel object handle for a kernel with the given name */
	clState->kernel = clCreateKernel(clState->program, "search", &status);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: Creating Kernel from program. (clCreateKernel)", status);
		return NULL;
	}

#ifdef USE_SCRYPT
	if (opt_scrypt) {
		size_t ipt = (1024 / cgpu->lookup_gap + (1024 % cgpu->lookup_gap > 0));
		size_t bufsize = 128 * ipt * cgpu->thread_concurrency;

		/* Use the max alloc value which has been rounded to a power of
		 * 2 greater >= required amount earlier */
		if (bufsize > cgpu->max_alloc) {
			applog(LOG_WARNING, "Maximum buffer memory device %d supports says %u", gpu, cgpu->max_alloc);
			applog(LOG_WARNING, "Your scrypt settings come to %u", bufsize);
		}
		applog(LOG_DEBUG, "Creating scrypt buffer sized %u", bufsize);
		clState->padbufsize = bufsize;

		/* This buffer is weird and might work to some degree even if
		 * the create buffer call has apparently failed, so check if we
		 * get anything back before we call it a failure. */
		clState->padbuffer8 = NULL;
		clState->padbuffer8 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, bufsize, NULL, &status);
		if (status != CL_SUCCESS && !clState->padbuffer8) {
			applog(LOG_ERR, "Error %d: clCreateBuffer (padbuffer8), decrease TC or increase LG", status);
			return NULL;
		}

		clState->CLbuffer0 = clCreateBuffer(clState->context, CL_MEM_READ_ONLY, 128, NULL, &status);
		if (status != CL_SUCCESS) {
			applog(LOG_ERR, "Error %d: clCreateBuffer (CLbuffer0)", status);
			return NULL;
		}
	}
#endif
	clState->outputBuffer = clCreateBuffer(clState->context, CL_MEM_WRITE_ONLY, BUFFERSIZE, NULL, &status);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error %d: clCreateBuffer (outputBuffer)", status);
		return NULL;
	}

	return clState;
}
Exemplo n.º 15
0
long OpenCLDevice::getLocalMemSize() {
	cl_ulong value;
	check_error(clGetDeviceInfo(my_id, CL_DEVICE_LOCAL_MEM_SIZE, (sizeof(cl_ulong)), &value, NULL));
	return value;
}
Exemplo n.º 16
0
int QueryHWinfo(size_t *maxCmptUnits)
{
    cl_ulong globalmemSize, localmemSize, maxConstBufSize;
    size_t maxWGroupSize;
    size_t maxWIdims;
    size_t maxWItemSize3D[3];
    char device_str[100];
    char local_plat_buf[100];
    char local_dev_buf[100];
    int i;
     
  // Get & Set OpenCL Platforms
    // get Platform numbers
    cl_int ret = clGetPlatformIDs(1, NULL, &numPlatforms);
    cl_errChk(ret,"Error 0>> clGetPlatformIDs");
    
    printf(">> Get Platform num = %d\n\n", numPlatforms);
    
    // get memory to store platform IDs
    platforms = (cl_platform_id*)malloc(numPlatforms * sizeof(cl_platform_id));
    // store IDs into memory
    ret = clGetPlatformIDs(numPlatforms, platforms, NULL);
    cl_errChk(ret,"Error 1>> clGetPlatformIDs");
	
// Get OpenCL Platforms & Devices Info.
    for (i = 0; i < numPlatforms; i++)
    {
    // Get Platform Info.
        ret = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR,
              sizeof(local_plat_buf), local_plat_buf, NULL);
        cl_errChk(ret,"Error >> clGetPlatformInfo");
        // Vendor Info.
        printf(">> Platform #%d: Vendor => %s\n", i, local_plat_buf);
                
        // get Devices numbers
        ret = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 
          0, NULL, &numDevices);
        cl_errChk(ret,"Error >> clGetDeviceIDs");
        
        // get memory to store device IDs
        Devices = (cl_device_id*)malloc(sizeof(cl_device_id)* numDevices);
        if (numDevices == 0)
        {
            printf("!! There is no device in platform #%d\n", i);
            exit(0);
        }
        else
        {
            ret = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 
                  numDevices, Devices, NULL);
            printf(">> %d Device(s) in platform #%d\n", numDevices, i);
        }

        // Get Devices info.
        int j = 0;
       // cl_device_svm_capabilities caps;
        
        for (j=0; j< numDevices; j++)
        {
            printf("\n>> [ Device: %d ]\n", j);

         /*   // Get SVM support
            ret = clGetDeviceInfo(Devices[j], CL_DEVICE_SVM_CAPABILITIES,
                     sizeof(caps), &caps, 0);
            cl_errChk(ret,"Error >> clGetDeviceInfo_dev_svm");
              printf("\t>> SVM Capabilities:\n");
            if (ret == CL_SUCCESS){
              if (caps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER)
                 printf("\t\t>> CL_DEVICE_SVM_COARSE_GRAIN_BUFFER\n");
              if (caps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER)
                 printf("\t\t>> CL_DEVICE_SVM_FINE_GRAIN_BUFFER\n");
              if (caps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM)
                 printf("\t\t>> CL_DEVICE_SVM_FINE_GRAIN_SYSTEM\n");
              if (caps & CL_DEVICE_SVM_ATOMICS)
                 printf("\t\t>> CL_DEVICE_SVM_ATOMICS\n");
             }
         */
            // Get Vendor info.
            ret = clGetDeviceInfo(Devices[j], CL_DEVICE_VENDOR, 
                    sizeof(device_str), device_str, NULL);
            cl_errChk(ret,"Error >> clGetDeviceInfo_dev_vendor");
            printf("\t>> Vendor: %s\n", device_str);

            // Get Name info.
            ret = clGetDeviceInfo(Devices[j], CL_DEVICE_NAME, 
                    sizeof(local_dev_buf), local_dev_buf, NULL);
            cl_errChk(ret,"Error >> clGetDeviceInfo_dev_name");
            printf("\t>> Model: %s\n", local_dev_buf);

            // Get Max Work Group Size
            ret = clGetDeviceInfo(Devices[j], 
                    CL_DEVICE_MAX_WORK_GROUP_SIZE, 
                    sizeof(maxWGroupSize), &maxWGroupSize, NULL);
            cl_errChk(ret,"Error >> clGetDeviceInfo_maxWGroupSize");
            printf("\t>> CL_DEVICE_MAX_WORK_GROUP_SIZE (WIs/WG): %d\n", (int)maxWGroupSize);

            // Get Max Compute Units Size
            ret = clGetDeviceInfo(Devices[j], 
                    CL_DEVICE_MAX_COMPUTE_UNITS, 
                    sizeof(*maxCmptUnits), maxCmptUnits, NULL);
            cl_errChk(ret,"Error >> clGetDeviceInfo_maxCmptUnits");
            printf("\t>> CL_DEVICE_MAX_COMPUTE_UNITS : %d\n", (int)*maxCmptUnits);

            // Get Max WORK_ITEM_DIMENSIONS
            ret = clGetDeviceInfo(Devices[j], 
                    CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, 
                    sizeof(maxWIdims), &maxWIdims, NULL);
            cl_errChk(ret,"Error >> clGetDeviceInfo_maxWorkItemD");
            printf("\t>> CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS: %d\n", (int)maxWIdims);

            // Get Max WORK_ITEM_SIZES
            ret = clGetDeviceInfo(Devices[j], 
                    CL_DEVICE_MAX_WORK_ITEM_SIZES, 
                    sizeof(maxWItemSize3D), &maxWItemSize3D, NULL);
            cl_errChk(ret,"Error >> clGetDeviceInfo_maxWItemSize3D");
            printf("\t>> CL_DEVICE_MAX_WORK_ITEM_SIZES: %d, %d, %d\n", 
            (int)maxWItemSize3D[0], (int)maxWItemSize3D[1], (int)maxWItemSize3D[2]);

            // Get GLOBAL_MEM_SIZE
            ret = clGetDeviceInfo(Devices[j], 
                    CL_DEVICE_GLOBAL_MEM_SIZE, 
                    sizeof(globalmemSize), &globalmemSize, NULL);
            cl_errChk(ret,"Error >> clGetDeviceInfo_globalmemSize");
            printf("\t>> CL_DEVICE_GLOBAL_MEM_SIZE(B): %.1f\n", 
                (float)globalmemSize);

            // Get MAX_CONSTANT_BUFFER_SIZE
            ret = clGetDeviceInfo(Devices[j], 
                    CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, 
                    sizeof(maxConstBufSize), &maxConstBufSize, NULL);
            cl_errChk(ret,"Error >> clGetDeviceInfo_maxConstBufSize");
            printf("\t>> CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE(B): %.1f\n", 
                (float)maxConstBufSize);

            // Get LOCAL_MEM_SIZE
            ret = clGetDeviceInfo(Devices[j], 
                    CL_DEVICE_LOCAL_MEM_SIZE, 
                    sizeof(localmemSize), &localmemSize, NULL);
            cl_errChk(ret,"Error >> clGetDeviceInfo_localmemSize");
            printf("\t>> CL_DEVICE_LOCAL_MEM_SIZE(B): %.1f\n", 
                (float)localmemSize);
                
            // Get CL_DEVICE_MAX_CLOCK_FREQUENCY 
            ret = clGetDeviceInfo(Devices[j], 
                    CL_DEVICE_MAX_CLOCK_FREQUENCY , 
                    sizeof(localmemSize), &localmemSize, NULL);
            cl_errChk(ret,"Error >> clGetDeviceInfo_MAX_CLOCK_FREQUENCY");
            printf("\t>> CL_DEVICE_MAX_CLOCK_FREQUENCY (MHz): %lu\n", 
                localmemSize);
        }
    }
    return 0;	
}
Exemplo n.º 17
0
long OpenCLDevice::getMaxParamSize() {
	cl_ulong value;
	check_error(clGetDeviceInfo(my_id, CL_DEVICE_MAX_PARAMETER_SIZE, (sizeof(cl_ulong)), &value, NULL));
	return value;
}
Exemplo n.º 18
0
vx_status vxTargetInit(vx_target_t *target)
{
    vx_status status = VX_ERROR_NO_RESOURCES;
    cl_int err = 0;
    vx_context context = target->base.context;
    cl_uint p, d, k;
    char *vx_incs = getenv("VX_CL_INCLUDE_DIR");
    char *cl_dirs = getenv("VX_CL_SOURCE_DIR");
    char cl_args[1024];

    snprintf(cl_args, sizeof(cl_args), "-D VX_CL_KERNEL -I %s -I %s %s %s", (vx_incs?vx_incs:"C:\\Users\\Eric\\Desktop\\VS_OpenVX2\\example_multinode_graph\\cl_code"), cl_dirs,
//#if !defined(__APPLE__)
//        "-D CL_USE_LUMINANCE",
//#else
        "",
//#endif
#if defined(VX_INCLUDE_DIR)
    "-I "VX_INCLUDE_DIR" "
#else
    " "
#endif
    );

    if (cl_dirs == NULL) {
#ifdef VX_CL_SOURCE_DIR
        const char *sdir = VX_CL_SOURCE_DIR;
        int len = strlen(sdir);
        cl_dirs = malloc(len);
        strncpy(cl_dirs, sdir, len);
#else
        return status;
#endif
    }

    strncpy(target->name, name, VX_MAX_TARGET_NAME);
    target->priority = VX_TARGET_PRIORITY_OPENCL;

    context->num_platforms = CL_MAX_PLATFORMS;
    err = clGetPlatformIDs(CL_MAX_PLATFORMS, context->platforms, NULL);
    if (err != CL_SUCCESS)
        goto exit;

    for (p = 0; p < context->num_platforms; p++) {
        err = clGetDeviceIDs(context->platforms[p], CL_DEVICE_TYPE_ALL,
            0, NULL, &context->num_devices[p]);
        err = clGetDeviceIDs(context->platforms[p], CL_DEVICE_TYPE_ALL,
            context->num_devices[p] > CL_MAX_DEVICES ? CL_MAX_DEVICES : context->num_devices[p],
            context->devices[p], NULL);
        if (err == CL_SUCCESS) {
            cl_context_properties props[] = {
                (cl_context_properties)CL_CONTEXT_PLATFORM,
                (cl_context_properties)context->platforms[p],
                (cl_context_properties)0,
            };
            for (d = 0; d < context->num_devices[p]; d++) {
                char deviceName[64];
                cl_bool compiler = CL_FALSE;
                cl_bool available = CL_FALSE;
                cl_bool image_support = CL_FALSE;
                err = clGetDeviceInfo(context->devices[p][d], CL_DEVICE_NAME, sizeof(deviceName), deviceName, NULL);
                CL_ERROR_MSG(err, "clGetDeviceInfo");
                err = clGetDeviceInfo(context->devices[p][d], CL_DEVICE_COMPILER_AVAILABLE, sizeof(cl_bool), &compiler, NULL);
                CL_ERROR_MSG(err, "clGetDeviceInfo");
                err = clGetDeviceInfo(context->devices[p][d], CL_DEVICE_AVAILABLE, sizeof(cl_bool), &available, NULL);
                CL_ERROR_MSG(err, "clGetDeviceInfo");
                err = clGetDeviceInfo(context->devices[p][d], CL_DEVICE_IMAGE_SUPPORT, sizeof(cl_bool), &image_support, NULL);
                CL_ERROR_MSG(err, "clGetDeviceInfo");
                VX_PRINT(VX_ZONE_INFO, "Device %s (compiler=%s) (available=%s) (images=%s)\n", deviceName, (compiler?"TRUE":"FALSE"), (available?"TRUE":"FALSE"), (image_support?"TRUE":"FALSE"));
            }
            context->global[p] = clCreateContext(props,
                                                 context->num_devices[p],
                                                 context->devices[p],
                                                 vxcl_platform_notifier,
                                                 target,
                                                 &err);
            if (err != CL_SUCCESS)
                break;

            /* check for supported formats */
            if (err == CL_SUCCESS) {
                cl_uint f,num_entries = 0u;
                cl_image_format *formats = NULL;
                cl_mem_flags flags = CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR;
                cl_mem_object_type type = CL_MEM_OBJECT_IMAGE2D;

                err = clGetSupportedImageFormats(context->global[p], flags, type, 0, NULL, &num_entries);
                formats = (cl_image_format *)malloc(num_entries * sizeof(cl_image_format));
                err = clGetSupportedImageFormats(context->global[p], flags, type, num_entries, formats, NULL);
                for (f = 0; f < num_entries; f++) {
                    char order[256];
                    char datat[256];
    #define CASE_STRINGERIZE2(value, string) case value: strcpy(string, #value); break
                    switch(formats[f].image_channel_order) {
                        CASE_STRINGERIZE2(CL_R, order);
                        CASE_STRINGERIZE2(CL_A, order);
                        CASE_STRINGERIZE2(CL_RG, order);
                        CASE_STRINGERIZE2(CL_RA, order);
                        CASE_STRINGERIZE2(CL_RGB, order);
                        CASE_STRINGERIZE2(CL_RGBA, order);
                        CASE_STRINGERIZE2(CL_BGRA, order);
                        CASE_STRINGERIZE2(CL_ARGB, order);
                        CASE_STRINGERIZE2(CL_INTENSITY, order);
                        CASE_STRINGERIZE2(CL_LUMINANCE, order);
                        CASE_STRINGERIZE2(CL_Rx, order);
                        CASE_STRINGERIZE2(CL_RGx, order);
                        CASE_STRINGERIZE2(CL_RGBx, order);
    #if defined(CL_VERSION_1_2) && defined(cl_khr_gl_depth_images)
                        CASE_STRINGERIZE2(CL_DEPTH, order);
                        CASE_STRINGERIZE2(CL_DEPTH_STENCIL, order);
    #if defined(__APPLE__)
                        CASE_STRINGERIZE2(CL_1RGB_APPLE, order);
                        CASE_STRINGERIZE2(CL_BGR1_APPLE, order);
                        CASE_STRINGERIZE2(CL_SFIXED14_APPLE, order);
                        CASE_STRINGERIZE2(CL_BIASED_HALF_APPLE, order);
                        CASE_STRINGERIZE2(CL_YCbYCr_APPLE, order);
                        CASE_STRINGERIZE2(CL_CbYCrY_APPLE, order);
                        CASE_STRINGERIZE2(CL_ABGR_APPLE, order);
    #endif
    #endif
                        default:
                            sprintf(order, "%x", formats[f].image_channel_order);
                            break;
                    }
                    switch(formats[f].image_channel_data_type) {
                        CASE_STRINGERIZE2(CL_SNORM_INT8, datat);
                        CASE_STRINGERIZE2(CL_SNORM_INT16, datat);
                        CASE_STRINGERIZE2(CL_UNORM_INT8, datat);
                        CASE_STRINGERIZE2(CL_UNORM_INT16, datat);
                        CASE_STRINGERIZE2(CL_UNORM_SHORT_565, datat);
                        CASE_STRINGERIZE2(CL_UNORM_SHORT_555, datat);
                        CASE_STRINGERIZE2(CL_UNORM_INT_101010, datat);
                        CASE_STRINGERIZE2(CL_SIGNED_INT8, datat);
                        CASE_STRINGERIZE2(CL_SIGNED_INT16, datat);
                        CASE_STRINGERIZE2(CL_SIGNED_INT32, datat);
                        CASE_STRINGERIZE2(CL_UNSIGNED_INT8, datat);
                        CASE_STRINGERIZE2(CL_UNSIGNED_INT16, datat);
                        CASE_STRINGERIZE2(CL_UNSIGNED_INT32, datat);
                        CASE_STRINGERIZE2(CL_HALF_FLOAT, datat);
                        CASE_STRINGERIZE2(CL_FLOAT, datat);
    #if defined(CL_VERSION_2_0)
                        CASE_STRINGERIZE2(CL_UNORM_INT24, datat);
    #endif
                        default:
                            sprintf(order, "%x", formats[f].image_channel_data_type);
                            break;
                    }
                    VX_PRINT(VX_ZONE_INFO, "%s : %s\n", order, datat);
                }
            }

            /* create a queue for each device */
            for (d = 0; d < context->num_devices[p]; d++)
            {
                context->queues[p][d] = clCreateCommandQueue(context->global[p],
                                                          context->devices[p][d],
                                                          CL_QUEUE_PROFILING_ENABLE,
                                                          &err);
                if (err == CL_SUCCESS) {
                }
            }

			char abs_source_path[VX_CL_MAX_PATH];
            /* for each kernel */
            for (k = 0; k < num_cl_kernels; k++)
            {
                char *sources = NULL;
                size_t programSze = 0;

                /* load the source file */
                VX_PRINT(VX_ZONE_INFO, "Joiner: %s\n", FILE_JOINER);
                VX_PRINT(VX_ZONE_INFO, "Path: %s\n", cl_dirs);
                VX_PRINT(VX_ZONE_INFO, "Kernel[%u] File: %s\n", k, cl_kernels[k]->sourcepath);
                VX_PRINT(VX_ZONE_INFO, "Kernel[%u] Name: %s\n", k, cl_kernels[k]->kernelname);
                VX_PRINT(VX_ZONE_INFO, "Kernel[%u] ID: %s\n", k, cl_kernels[k]->description.name);
				
				int cl_dirs_len = strlen(cl_dirs);
				int sourcepath_len = strlen(cl_kernels[k]->sourcepath);
				strncpy(abs_source_path, cl_dirs, cl_dirs_len);
				strncpy(&abs_source_path[cl_dirs_len], cl_kernels[k]->sourcepath, sourcepath_len);
				abs_source_path[cl_dirs_len+sourcepath_len] = '\0';
                sources = clLoadSources(abs_source_path, &programSze);
				VX_PRINT(VX_ZONE_INFO, "clLoadSources programSze:%d\n", programSze);
				
                /* create a program with this source */
                cl_kernels[k]->program[p] = clCreateProgramWithSource(context->global[p],
                    1,
                    (const char **)&sources,
                    &programSze,
                    &err);
                if (err == CL_SUCCESS)
                {
                    err = clBuildProgram((cl_program)cl_kernels[k]->program[p],
                        1,
                        (const cl_device_id *)context->devices,
                        (const char *)cl_args,
                        NULL,
                        NULL);
                    if (err != CL_SUCCESS)
                    {
                        CL_BUILD_MSG(err, "Build Error");
                        if (err == CL_BUILD_PROGRAM_FAILURE)
                        {
                            char log[10][1024];
                            size_t logSize = 0;
                            clGetProgramBuildInfo((cl_program)cl_kernels[k]->program[p],
                                (cl_device_id)context->devices[p][0],
                                CL_PROGRAM_BUILD_LOG,
                                sizeof(log),
                                log,
                                &logSize);
                            VX_PRINT(VX_ZONE_ERROR, "%s", log);
                        }
                    }
                    else
                    {
                        cl_int k2 = 0;
                        cl_build_status bstatus = 0;
                        size_t bs = 0;
                        err = clGetProgramBuildInfo(cl_kernels[k]->program[p],
                            context->devices[p][0],
                            CL_PROGRAM_BUILD_STATUS,
                            sizeof(cl_build_status),
                            &bstatus,
                            &bs);
                        VX_PRINT(VX_ZONE_INFO, "Status = %d (%d)\n", bstatus, err);
                        /* get the cl_kernels from the program */
                        cl_kernels[k]->num_kernels[p] = 1;
                        err = clCreateKernelsInProgram(cl_kernels[k]->program[p],
                            1,
                            &cl_kernels[k]->kernels[p],
                            NULL);
                        VX_PRINT(VX_ZONE_INFO, "Found %u cl_kernels in %s (%d)\n", cl_kernels[k]->num_kernels[p], cl_kernels[k]->sourcepath, err);
                        for (k2 = 0; (err == CL_SUCCESS) && (k2 < (cl_int)cl_kernels[k]->num_kernels[p]); k2++)
                        {
                            char kName[VX_MAX_KERNEL_NAME];
                            size_t size = 0;
                            err = clGetKernelInfo(cl_kernels[k]->kernels[p],
                                CL_KERNEL_FUNCTION_NAME,
                                0,
                                NULL,
                                &size);
                            err = clGetKernelInfo(cl_kernels[k]->kernels[p],
                                CL_KERNEL_FUNCTION_NAME,
                                size,
                                kName,
                                NULL);
                            VX_PRINT(VX_ZONE_INFO, "Kernel %s\n", kName);
                            if (strncmp(kName, cl_kernels[k]->kernelname, VX_MAX_KERNEL_NAME) == 0)
                            {
                                vx_kernel_f kfunc = cl_kernels[k]->description.function;
                                VX_PRINT(VX_ZONE_INFO, "Linked Kernel %s on target %s\n", cl_kernels[k]->kernelname, target->name);
                                target->num_kernels++;
                                target->base.context->num_kernels++;
                                status = vxInitializeKernel(target->base.context,
                                    &target->kernels[k],
                                    cl_kernels[k]->description.enumeration,
                                    (kfunc == NULL ? vxclCallOpenCLKernel : kfunc),
                                    cl_kernels[k]->description.name,
                                    cl_kernels[k]->description.parameters,
                                    cl_kernels[k]->description.numParams,
                                    cl_kernels[k]->description.input_validate,
                                    cl_kernels[k]->description.output_validate,
                                    cl_kernels[k]->description.initialize,
                                    cl_kernels[k]->description.deinitialize);
                                if (vxIsKernelUnique(&target->kernels[k]) == vx_true_e) {
                                    target->base.context->num_unique_kernels++;
                                } else {
                                    VX_PRINT(VX_ZONE_KERNEL, "Kernel %s is NOT unqiue\n", target->kernels[k].name);
                                }
                            }
                        }
                    }
                }
                else
                {
                    CL_ERROR_MSG(err, "Program");
                }
                free(sources);
            }
        }
    }
exit:
    if (err == CL_SUCCESS) {
        status = VX_SUCCESS;
    } else {
        status = VX_ERROR_NO_RESOURCES;
    }
    return status;
}
Exemplo n.º 19
0
// OpenCL functions
int InitialiseCLEnvironment(cl_platform_id **platform, cl_device_id ***device_id, cl_program *program, renderStruct *render)
{
	// error flag
	cl_int err;
	char infostring[1024];
	char deviceInfo[1024];

	// need to ensure platform supports OpenGL OpenCL interop before querying devices
	// to avoid segfault when calling clGetGLContextInfoKHR
	int *platformSupportsInterop;

	//get kernel from file
	FILE* kernelFile = fopen(kernelFileName, "rb");
	fseek(kernelFile, 0, SEEK_END);
	long fileLength = ftell(kernelFile);
	rewind(kernelFile);
	char *kernelSource = malloc(fileLength*sizeof(char));
	long read = fread(kernelSource, sizeof(char), fileLength, kernelFile);
	if (fileLength != read) printf("Error reading kernel file, line %d\n", __LINE__);
	fclose(kernelFile);

	//get platform and device information
	cl_uint numPlatforms;
	err = clGetPlatformIDs(0, NULL, &numPlatforms);
	*platform = malloc(numPlatforms * sizeof(cl_platform_id));
	*device_id = malloc(numPlatforms * sizeof(cl_device_id*));
	platformSupportsInterop = malloc(numPlatforms * sizeof(*platformSupportsInterop));
	err |= clGetPlatformIDs(numPlatforms, *platform, NULL);
	CheckOpenCLError(err, __LINE__);
	cl_uint *numDevices;
	numDevices = malloc(numPlatforms * sizeof(cl_uint));

	for (cl_uint i = 0; i < numPlatforms; i++) {
		clGetPlatformInfo((*platform)[i], CL_PLATFORM_VENDOR, sizeof(infostring), infostring, NULL);
		printf("\n---OpenCL: Platform Vendor %d: %s\n", i, infostring);

		err = clGetDeviceIDs((*platform)[i], CL_DEVICE_TYPE_ALL, 0, NULL, &(numDevices[i]));
		CheckOpenCLError(err, __LINE__);
		(*device_id)[i] = malloc(numDevices[i] * sizeof(cl_device_id));
		platformSupportsInterop[i] = 0;
		err = clGetDeviceIDs((*platform)[i], CL_DEVICE_TYPE_ALL, numDevices[i], (*device_id)[i], NULL);
		CheckOpenCLError(err, __LINE__);
		for (cl_uint j = 0; j < numDevices[i]; j++) {
			char deviceName[200];
			clGetDeviceInfo((*device_id)[i][j], CL_DEVICE_NAME, sizeof(deviceName), deviceName, NULL);
			printf("---OpenCL:    Device found %d. %s\n", j, deviceName);
			clGetDeviceInfo((*device_id)[i][j], CL_DEVICE_EXTENSIONS, sizeof(deviceInfo), deviceInfo, NULL);
			if (strstr(deviceInfo, "cl_khr_gl_sharing") != NULL) {
				printf("---OpenCL:        cl_khr_gl_sharing supported!\n");
				platformSupportsInterop[i] = 1;
			}
			else {
				printf("---OpenCL:        cl_khr_gl_sharing NOT supported!\n");
				platformSupportsInterop[i] |= 0;
			}
			if (strstr(deviceInfo, "cl_khr_fp64") != NULL) {
				printf("---OpenCL:        cl_khr_fp64 supported!\n");
			}
			else {
				printf("---OpenCL:        cl_khr_fp64 NOT supported!\n");
			}
		}
	}
	printf("\n");


	////////////////////////////////
	// This part is different to how we usually do things. Need to get context and device from existing
	// OpenGL context. Loop through all platforms looking for the device:
	cl_device_id device = NULL;
	int deviceFound = 0;
	cl_uint checkPlatform = 0;

#ifdef TRYINTEROP
	while (!deviceFound) {
		if (platformSupportsInterop[checkPlatform]) {
			printf("---OpenCL: Looking for OpenGL Context device on platform %d ... ", checkPlatform);
			clGetGLContextInfoKHR_fn pclGetGLContextInfoKHR;
			PTR_FUNC_PTR pclGetGLContextInfoKHR = clGetExtensionFunctionAddressForPlatform((*platform)[checkPlatform], "clGetGLContextInfoKHR");
			cl_context_properties properties[] = {
				CL_GL_CONTEXT_KHR, (cl_context_properties) glfwGetGLXContext(render->window),
				CL_GLX_DISPLAY_KHR, (cl_context_properties) glfwGetX11Display(),
				CL_CONTEXT_PLATFORM, (cl_context_properties) (*platform)[checkPlatform],
				0};
			err = pclGetGLContextInfoKHR(properties, CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR, sizeof(cl_device_id), &device, NULL);
			if (err != CL_SUCCESS) {
				printf("Not Found.\n");
				checkPlatform++;
				if (checkPlatform > numPlatforms-1) {
					printf("---OpenCL: Error! Could not find OpenGL sharing device.\n");
					deviceFound = 1;
					render->glclInterop = 0;
				}
			}
			else {
				printf("Found!\n");
				deviceFound = 1;
				render->glclInterop = 1;
			}
		}
		else {
			checkPlatform++;
		}
	}

	if (render->glclInterop) {
		// Check the device we've found supports double precision
		clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, sizeof(deviceInfo), deviceInfo, NULL);
		if (strstr(deviceInfo, "cl_khr_fp64") == NULL) {
			printf("---OpenCL: Interop device doesn't support double precision! We cannot use it.\n");
		}
		else {
			cl_context_properties properties[] = {
				CL_GL_CONTEXT_KHR, (cl_context_properties) glfwGetGLXContext(render->window),
				CL_GLX_DISPLAY_KHR, (cl_context_properties) glfwGetX11Display(),
				CL_CONTEXT_PLATFORM, (cl_context_properties) (*platform)[checkPlatform],
				0};
			render->contextCL = clCreateContext(properties, 1, &device, NULL, 0, &err);
			CheckOpenCLError(err, __LINE__);
		}
	}
#endif

	// if render->glclInterop is 0, either we are not trying to use it, we couldn't find an interop
	// device, or we found an interop device but it doesn't support double precision.
	// In these cases, have the user choose a platform and device manually.
	if (!(render->glclInterop)) {
		printf("Choose a platform and device.\n");
		checkPlatform = numPlatforms;
		while (checkPlatform >= numPlatforms) {
			printf("Platform: ");
			scanf("%u", &checkPlatform);
			if (checkPlatform >= numPlatforms) {
				printf("Invalid Platform choice.\n");
			}
		}

		cl_uint chooseDevice = numDevices[checkPlatform];
		while (chooseDevice >= numDevices[checkPlatform]) {
			printf("Device: ");
			scanf("%u", &chooseDevice);
			if (chooseDevice >= numDevices[checkPlatform]) {
				printf("Invalid Device choice.\n");
			} else {
				// Check the device we've chosen supports double precision
				clGetDeviceInfo((*device_id)[checkPlatform][chooseDevice], CL_DEVICE_EXTENSIONS, sizeof(deviceInfo), deviceInfo, NULL);
				if (strstr(deviceInfo, "cl_khr_fp64") == NULL) {
					printf("---OpenCL: Interop device doesn't support double precision! We cannot use it.\n");
					chooseDevice = numDevices[checkPlatform];
				}
			}
		}

		// Create non-interop context
		render->contextCL = clCreateContext(NULL, 1, &((*device_id)[checkPlatform][chooseDevice]), NULL, NULL, &err);
		device = (*device_id)[checkPlatform][chooseDevice];
	}
	////////////////////////////////

	// device is now fixed. Query its max global memory allocation size and store it, used in
	// HighResolutionRender routine, to determine into how many tiles we need to split the
	// computation.
	clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(render->deviceMaxAlloc), &(render->deviceMaxAlloc), NULL);
	printf("---OpenCL: Selected device has CL_DEVICE_MAX_MEM_ALLOC_SIZE: %lfMB\n",
	       render->deviceMaxAlloc/1024.0/1024.0);

	// create a command queue
	render->queue = clCreateCommandQueue(render->contextCL, device, 0, &err);
	CheckOpenCLError(err, __LINE__);


	//create the program with the source above
//	printf("Creating CL Program...\n");
	*program = clCreateProgramWithSource(render->contextCL, 1, (const char**)&kernelSource, NULL, &err);
	if (err != CL_SUCCESS) {
		printf("Error in clCreateProgramWithSource: %d, line %d.\n", err, __LINE__);
		return EXIT_FAILURE;
	}

	//build program executable
	err = clBuildProgram(*program, 0, NULL, "-I. -I src/", NULL, NULL);
	if (err != CL_SUCCESS) {
		printf("Error in clBuildProgram: %d, line %d.\n", err, __LINE__);
		char buffer[5000];
		clGetProgramBuildInfo(*program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, NULL);
		printf("%s\n", buffer);
		return EXIT_FAILURE;
	}

	// dump ptx
	size_t binSize;
	clGetProgramInfo(*program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binSize, NULL);
	unsigned char *bin = malloc(binSize);
	clGetProgramInfo(*program, CL_PROGRAM_BINARIES, sizeof(unsigned char *), &bin, NULL);
	FILE *fp = fopen("openclPTX.ptx", "wb");
	fwrite(bin, sizeof(char), binSize, fp);
	fclose(fp);
	free(bin);

	free(numDevices);
	free(kernelSource);
	printf("\n");
	return EXIT_SUCCESS;
}
Exemplo n.º 20
0
int PGR_radiosity::prepareCL()
{
    cl_int ciErr = CL_SUCCESS;

    // Get Platform
    cl_platform_id *cpPlatforms;
    cl_uint cuiPlatformsCount;
    ciErr = clGetPlatformIDs(0, NULL, &cuiPlatformsCount);
    this->CheckOpenCLError(ciErr, "clGetPlatformIDs: cuiPlatformsNum=%i", cuiPlatformsCount);
    cpPlatforms = (cl_platform_id*) malloc(cuiPlatformsCount * sizeof (cl_platform_id));
    ciErr = clGetPlatformIDs(cuiPlatformsCount, cpPlatforms, NULL);
    this->CheckOpenCLError(ciErr, "clGetPlatformIDs");

    cl_platform_id platform = 0;

    const unsigned int TMP_BUFFER_SIZE = 1024;
    char sTmp[TMP_BUFFER_SIZE];

    for (unsigned int f0 = 0; f0 < cuiPlatformsCount; f0++)
    {
        //bool shouldBrake = false;
        ciErr = clGetPlatformInfo(cpPlatforms[f0], CL_PLATFORM_PROFILE, TMP_BUFFER_SIZE, sTmp, NULL);
        this->CheckOpenCLError(ciErr, "clGetPlatformInfo: Id=%i: CL_PLATFORM_PROFILE=%s", f0, sTmp);
        ciErr = clGetPlatformInfo(cpPlatforms[f0], CL_PLATFORM_VERSION, TMP_BUFFER_SIZE, sTmp, NULL);
        this->CheckOpenCLError(ciErr, "clGetPlatformInfo: Id=%i: CL_PLATFORM_VERSION=%s", f0, sTmp);
        ciErr = clGetPlatformInfo(cpPlatforms[f0], CL_PLATFORM_NAME, TMP_BUFFER_SIZE, sTmp, NULL);
        this->CheckOpenCLError(ciErr, "clGetPlatformInfo: Id=%i: CL_PLATFORM_NAME=%s", f0, sTmp);
        ciErr = clGetPlatformInfo(cpPlatforms[f0], CL_PLATFORM_VENDOR, TMP_BUFFER_SIZE, sTmp, NULL);
        this->CheckOpenCLError(ciErr, "clGetPlatformInfo: Id=%i: CL_PLATFORM_VENDOR=%s", f0, sTmp);

        //prioritize AMD and CUDA platforms

        if ((strcmp(sTmp, "NVIDIA Corporation") == 0))
        {
            platform = cpPlatforms[f0];
        }

        //        if ((strcmp(sTmp, "Advanced Micro Devices, Inc.") == 0))
        //        {
        //            platform = cpPlatforms[f0];
        //        }

        //prioritize Intel
        /*if ((strcmp(sTmp, "Intel(R) Corporation") == 0)) {
            platform = cpPlatforms[f0];
        }*/

        ciErr = clGetPlatformInfo(cpPlatforms[f0], CL_PLATFORM_EXTENSIONS, TMP_BUFFER_SIZE, sTmp, NULL);
        this->CheckOpenCLError(ciErr, "clGetPlatformInfo: Id=%i: CL_PLATFORM_EXTENSIONS=%s", f0, sTmp);
    }

    if (platform == 0)
    { //no prioritized found
        if (cuiPlatformsCount > 0)
        {
            platform = cpPlatforms[0];
        }
        else
        {
            cerr << "No device was found" << endl;
            return -1;
        }
    }
    // Get Devices
    cl_uint cuiDevicesCount;
    ciErr = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &cuiDevicesCount);
    CheckOpenCLError(ciErr, "clGetDeviceIDs: cuiDevicesCount=%i", cuiDevicesCount);
    cl_device_id *cdDevices = (cl_device_id*) malloc(cuiDevicesCount * sizeof (cl_device_id));
    ciErr = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, cuiDevicesCount, cdDevices, NULL);
    CheckOpenCLError(ciErr, "clGetDeviceIDs");

    unsigned int deviceIndex = 0;

    for (unsigned int f0 = 0; f0 < cuiDevicesCount; f0++)
    {
        cl_device_type cdtTmp;
        size_t iDim[3];

        ciErr = clGetDeviceInfo(cdDevices[f0], CL_DEVICE_TYPE, sizeof (cdtTmp), &cdtTmp, NULL);
        CheckOpenCLError(ciErr, "clGetDeviceInfo: Id=%i: CL_DEVICE_TYPE=%s%s%s%s", f0, cdtTmp & CL_DEVICE_TYPE_CPU ? "CPU," : "",
                         cdtTmp & CL_DEVICE_TYPE_GPU ? "GPU," : "",
                         cdtTmp & CL_DEVICE_TYPE_ACCELERATOR ? "ACCELERATOR," : "",
                         cdtTmp & CL_DEVICE_TYPE_DEFAULT ? "DEFAULT," : "");

        if (cdtTmp & CL_DEVICE_TYPE_GPU)
        { //prioritize gpu if both cpu and gpu are available
            deviceIndex = f0;
        }

        cl_bool bTmp;
        ciErr = clGetDeviceInfo(cdDevices[f0], CL_DEVICE_AVAILABLE, sizeof (bTmp), &bTmp, NULL);
        CheckOpenCLError(ciErr, "clGetDeviceInfo: Id=%i: CL_DEVICE_AVAILABLE=%s", f0, bTmp ? "YES" : "NO");
        ciErr = clGetDeviceInfo(cdDevices[f0], CL_DEVICE_NAME, TMP_BUFFER_SIZE, sTmp, NULL);
        CheckOpenCLError(ciErr, "clGetDeviceInfo: Id=%i: CL_DEVICE_NAME=%s", f0, sTmp);
        ciErr = clGetDeviceInfo(cdDevices[f0], CL_DEVICE_VENDOR, TMP_BUFFER_SIZE, sTmp, NULL);
        CheckOpenCLError(ciErr, "clGetDeviceInfo: Id=%i: CL_DEVICE_VENDOR=%s", f0, sTmp);
        ciErr = clGetDeviceInfo(cdDevices[f0], CL_DRIVER_VERSION, TMP_BUFFER_SIZE, sTmp, NULL);
        CheckOpenCLError(ciErr, "clGetDeviceInfo: Id=%i: CL_DRIVER_VERSION=%s", f0, sTmp);
        ciErr = clGetDeviceInfo(cdDevices[f0], CL_DEVICE_PROFILE, TMP_BUFFER_SIZE, sTmp, NULL);
        CheckOpenCLError(ciErr, "clGetDeviceInfo: Id=%i: CL_DEVICE_PROFILE=%s", f0, sTmp);
        ciErr = clGetDeviceInfo(cdDevices[f0], CL_DEVICE_VERSION, TMP_BUFFER_SIZE, sTmp, NULL);
        CheckOpenCLError(ciErr, "clGetDeviceInfo: Id=%i: CL_DEVICE_VERSION=%s", f0, sTmp);
        ciErr = clGetDeviceInfo(cdDevices[f0], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof (iDim), iDim, NULL);
        CheckOpenCLError(ciErr, "clGetDeviceInfo: Id=%i: CL_DEVICE_MAX_WORK_ITEM_SIZES=%ix%ix%i", f0, iDim[0], iDim[1], iDim[2]);
        ciErr = clGetDeviceInfo(cdDevices[f0], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof (size_t), iDim, NULL);
        CheckOpenCLError(ciErr, "clGetDeviceInfo: Id=%i: CL_DEVICE_MAX_WORK_GROUP_SIZE=%i", f0, iDim[0]);
        ciErr = clGetDeviceInfo(cdDevices[f0], CL_DEVICE_EXTENSIONS, TMP_BUFFER_SIZE, sTmp, NULL);
        CheckOpenCLError(ciErr, "clGetDeviceInfo: Id=%i: CL_DEVICE_EXTENSIONS=%s", f0, sTmp);
    }

    cl_context_properties cps[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties) platform, 0};



    /* Create context */
    this->context = clCreateContext(cps, 1, &cdDevices[deviceIndex], NULL, NULL, &ciErr);
    CheckOpenCLError(ciErr, "clCreateContext");

    /* Create a command queue */
    this->queue = clCreateCommandQueue(this->context, cdDevices[deviceIndex], CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &ciErr);
    CheckOpenCLError(ciErr, "clCreateCommandQueue");

    /* Create and compile and openCL program */
    char *cSourceCL = loadProgSource("kernels.cl");

    this->program = clCreateProgramWithSource(this->context, 1, (const char **) &cSourceCL, NULL, &ciErr);
    CheckOpenCLError(ciErr, "clCreateProgramWithSource");
    free(cSourceCL);

    ciErr = clBuildProgram(this->program, 0, NULL, NULL, NULL, NULL);
    CheckOpenCLError(ciErr, "clBuildProgram");

    cl_int logStatus;
    char *buildLog = NULL;
    size_t buildLogSize = 0;
    logStatus = clGetProgramBuildInfo(this->program,
                                      cdDevices[deviceIndex],
                                      CL_PROGRAM_BUILD_LOG,
                                      buildLogSize,
                                      buildLog,
                                      &buildLogSize);

    CheckOpenCLError(logStatus, "clGetProgramBuildInfo.");

    buildLog = (char*) malloc(buildLogSize);
    if (buildLog == NULL)
    {
        printf("Failed to allocate host memory. (buildLog)");
        return -1;
    }
    memset(buildLog, 0, buildLogSize);

    logStatus = clGetProgramBuildInfo(this->program,
                                      cdDevices[deviceIndex],
                                      CL_PROGRAM_BUILD_LOG,
                                      buildLogSize,
                                      buildLog,
                                      NULL);
    CheckOpenCLError(logStatus, "clGetProgramBuildInfo.");
    free(buildLog);

    size_t tempKernelWorkGroupSize;

    /* Create kernels */
    this->radiosityKernel = clCreateKernel(program, "radiosity", &ciErr);
    CheckOpenCLError(ciErr, "clCreateKernel radiosity");
    this->sortKernel = clCreateKernel(program, "sort", &ciErr);
    CheckOpenCLError(ciErr, "clCreateKernel sort");

    this->maxWorkGroupSize = 64;
    this->workGroupSize = 64;

    ciErr = clGetKernelWorkGroupInfo(this->radiosityKernel,
                                     cdDevices[deviceIndex],
                                     CL_KERNEL_WORK_GROUP_SIZE,
                                     sizeof (size_t),
                                     &tempKernelWorkGroupSize,
                                     0);
    CheckOpenCLError(ciErr, "clGetKernelInfo");
    this->maxWorkGroupSize = MIN(tempKernelWorkGroupSize, this->maxWorkGroupSize);

    if (this->workGroupSize > this->maxWorkGroupSize)
    {
        cout << "Out of Resources!" << endl;
        cout << "Group Size specified: " << this->workGroupSize << endl;
        cout << "Max Group Size supported on the kernel: " << this->maxWorkGroupSize << endl;
        cout << "Falling back to " << this->maxWorkGroupSize << endl;

        this->workGroupSize = this->maxWorkGroupSize;
    }


    /* Allocate buffer of colors */
    this->patchesColorsCL = clCreateBuffer(this->context, CL_MEM_READ_WRITE, this->model->getPatchesCount() * sizeof (cl_uchar3), 0, &ciErr);
    CheckOpenCLError(ciErr, "CreateBuffer patchesCL");

    this->raw_patchesColors = new cl_uchar3[this->model->getPatchesCount()];
    this->raw_patchesEnergies = new cl_float[this->model->getPatchesCount()];
    this->raw_diffColors = new cl_uchar3[this->model->getPatchesCount()];
    this->raw_intensities = new cl_float[this->model->getPatchesCount()];
    this->model->getPatchesCL(this->raw_patchesColors, this->raw_patchesEnergies);

    ciErr = clEnqueueWriteBuffer(this->queue,
                                 this->patchesColorsCL,
                                 CL_TRUE, //blocking write
                                 0,
                                 this->model->getPatchesCount() * sizeof (cl_uchar3),
                                 this->raw_patchesColors,
                                 0,
                                 0,
                                 0);
    CheckOpenCLError(ciErr, "Copy patches colors");

    /* Alocate buffer of energies */
    this->patchesEnergiesCL = clCreateBuffer(this->context, CL_MEM_READ_WRITE, this->model->getPatchesCount() * sizeof (cl_float), 0, &ciErr);
    CheckOpenCLError(ciErr, "CreateBuffer patchesCL");

    ciErr = clEnqueueWriteBuffer(this->queue,
                                 this->patchesEnergiesCL,
                                 CL_TRUE, //blocking write
                                 0,
                                 this->model->getPatchesCount() * sizeof (cl_float),
                                 this->raw_patchesEnergies,
                                 0,
                                 0,
                                 0);
    CheckOpenCLError(ciErr, "Copy patches");

    /* Allocate buffer of patches geometry */
    this->patchesGeoCL = clCreateBuffer(this->context, CL_MEM_READ_ONLY, this->model->getPatchesCount() * sizeof (cl_float8), 0, &ciErr);
    CheckOpenCLError(ciErr, "CreateBuffer patchesGeometryCL");

    this->raw_patchesGeo = new cl_float8[this->model->getPatchesCount()];
    this->model->getPatchesGeometryCL(raw_patchesGeo);
    ciErr = clEnqueueWriteBuffer(this->queue,
                                 this->patchesGeoCL,
                                 CL_TRUE, //blocking write
                                 0,
                                 this->model->getPatchesCount() * sizeof (cl_float8),
                                 this->raw_patchesGeo,
                                 0,
                                 0,
                                 0);
    CheckOpenCLError(ciErr, "Copy patches geometry");


    this->indicesCL = clCreateBuffer(this->context, CL_MEM_READ_WRITE, this->maxWorkGroupSize * sizeof (cl_uint), 0, &ciErr);
    CheckOpenCLError(ciErr, "CreateBuffer indicesCL");

    this->indicesCountCL = clCreateBuffer(this->context, CL_MEM_READ_WRITE, sizeof (cl_uint), 0, &ciErr);
    CheckOpenCLError(ciErr, "CreateBuffer indicesCountCL");

    this->maximalEnergyCL = clCreateBuffer(this->context, CL_MEM_READ_WRITE, sizeof (cl_float), 0, &ciErr);
    CheckOpenCLError(ciErr, "CreateBuffer maximalEnergyCL");

    this->diffColorsCL = clCreateBuffer(this->context, CL_MEM_READ_WRITE, this->model->getPatchesCount() * sizeof (cl_uchar3), 0, &ciErr);
    CheckOpenCLError(ciErr, "CreateBuffer diffColorsCL");

    cl_uchar3* zeros = new cl_uchar3[this->model->getPatchesCount()];
    memset(zeros, 0, this->model->getPatchesCount() * sizeof (cl_uchar3));
    ciErr = clEnqueueWriteBuffer(this->queue,
                                 this->diffColorsCL,
                                 CL_TRUE, //blocking write
                                 0,
                                 this->model->getPatchesCount() * sizeof (cl_uchar3),
                                 zeros,
                                 0,
                                 0,
                                 0);
    CheckOpenCLError(ciErr, "Clear diff colors");
    delete [] zeros;

    this->intensitiesCL = clCreateBuffer(this->context, CL_MEM_READ_WRITE, this->model->getPatchesCount() * sizeof (cl_float), 0, &ciErr);
    CheckOpenCLError(ciErr, "CreateBuffer intensitiesCL");

    cl_float* zeroIntensity = new cl_float[this->model->getPatchesCount()];
    memset(zeroIntensity, 0, this->model->getPatchesCount() * sizeof (cl_float));
    ciErr = clEnqueueWriteBuffer(this->queue,
                                 this->intensitiesCL,
                                 CL_TRUE, //blocking write
                                 0,
                                 this->model->getPatchesCount() * sizeof (cl_float),
                                 zeroIntensity,
                                 0,
                                 0,
                                 0);
    CheckOpenCLError(ciErr, "Clear intensities");
    delete [] zeroIntensity;

    this->texturesCL = clCreateBuffer(this->context, CL_MEM_READ_ONLY, this->maxWorkGroupSize * 768 * 256 * sizeof (cl_uchar3), 0, &ciErr);
    CheckOpenCLError(ciErr, "CreateBuffer texturesCL");

    this->visitedCL = clCreateBuffer(this->context, CL_MEM_READ_WRITE, this->maxWorkGroupSize * this->model->getPatchesCount() * sizeof (cl_bool), 0, &ciErr);
    CheckOpenCLError(ciErr, "CreateBuffer visitedCL");

    cl_bool* zeroVisited = new cl_bool[this->maxWorkGroupSize * this->model->getPatchesCount()];
    memset(zeroVisited, 0, this->maxWorkGroupSize * this->model->getPatchesCount() * sizeof (cl_bool));
    ciErr = clEnqueueWriteBuffer(this->queue,
                                 this->visitedCL,
                                 CL_TRUE, //blocking write
                                 0,
                                 this->model->getPatchesCount() * sizeof (cl_bool),
                                 zeroVisited,
                                 0,
                                 0,
                                 0);
    CheckOpenCLError(ciErr, "Clear visited flags");
    delete [] zeroVisited;

    free(cdDevices);

    return 0;
}
Exemplo n.º 21
0
int
main(int argc, char **argv)
{
    cl_uint num;
    cl_int err;
    int platform_idx = -1;
    cl_platform_id *plat_ids;
    int i;
    size_t sz;
    cl_device_id *gpu_devs;
    cl_context_properties cps[3];
    cl_context context;
    int opt;
    char *input;
    int run_size = 1024;
    struct AIISA_Program prog;
    cl_command_queue queue;
    int ei;
    int nloop = 16;
    struct AIISA_CodeBuffer buf;

    aiisa_code_buffer_init(&buf);

    clGetPlatformIDs(0, NULL, &num);

    plat_ids = (cl_platform_id*)malloc(sizeof(*plat_ids) * num);
    clGetPlatformIDs(num, plat_ids, NULL);

    while ((opt = getopt(argc, argv, "n:")) != -1) {
        switch (opt) {
        case 'n':
            run_size = atoi(optarg);
            break;

        default:
            puts("usage : run in.cl");
            return 1;
        }
    }

    if (optind >= argc) {
        puts("usage : run in.cl");
        return 1;
    }

    input = argv[optind];

    for (i=0; i<(int)num; i++) {
        char name[1024];
        size_t len;
        clGetPlatformInfo(plat_ids[i], CL_PLATFORM_VENDOR, sizeof(name), name, &len);

        //puts(name);
        if (strcmp(name, "Advanced Micro Devices, Inc.") == 0) {
            platform_idx = i;
            break;
        }
    }

    if (platform_idx == -1) {
        puts("no amd");
        return -1;
    }

    clGetDeviceIDs(plat_ids[platform_idx], CL_DEVICE_TYPE_GPU, 0, NULL, &num);
    if (num == 0) {
        puts("no gpu");
        return -1;
    }

    gpu_devs = (cl_device_id*)malloc(sizeof(gpu_devs[0]) * 1);
    //clGetDeviceIDs(plat_ids[platform_idx], CL_DEVICE_TYPE_GPU, num, gpu_devs, NULL);

    cps[0] = CL_CONTEXT_PLATFORM;
    cps[1] = (cl_context_properties)plat_ids[platform_idx];
    cps[2] = 0;

    context = clCreateContextFromType(cps, CL_DEVICE_TYPE_GPU, NULL, NULL, &err);
    clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(gpu_devs), gpu_devs, &sz);

    queue = clCreateCommandQueue(context, gpu_devs[0], 0, NULL);

    {
        char name[1024];
        size_t sz;
        clGetDeviceInfo(gpu_devs[0], CL_DEVICE_NAME, sizeof(name), name, &sz);

        puts(name);
    }

    //puts(input);

    aiisa_build_binary_from_cl(&prog, context, gpu_devs[0], input);

    for (ei=0; ei<nloop; ei++) {
        cl_program cl_prog;
        const unsigned char *bin[1];
        size_t bin_size[1];
        cl_kernel ker;
        cl_mem in, out;
        size_t global_size[3];
        double tb, te;

        tb = sec();
        gen_code(&prog, &buf);
        bin[0] = prog.cl_binary;
        bin_size[0] = prog.size;
        cl_prog = clCreateProgramWithBinary(context, 1, gpu_devs, bin_size, bin, NULL, NULL);
        clBuildProgram(cl_prog, 1, gpu_devs, NULL, NULL, NULL);
        ker = clCreateKernel(cl_prog, "f", &err);
        te = sec();
        printf("build : %f[usec]\n", (te-tb)*1000000);

        in = clCreateBuffer(context, CL_MEM_READ_WRITE, run_size * sizeof(int), NULL, &err);
        out = clCreateBuffer(context, CL_MEM_READ_WRITE, run_size * sizeof(int), NULL, &err);

        clSetKernelArg(ker, 0, sizeof(cl_mem), &in);
        clSetKernelArg(ker, 1, sizeof(cl_mem), &out);


        {
            int *ptr = (int*)clEnqueueMapBuffer(queue, in, CL_TRUE, CL_MAP_WRITE, 0, run_size*sizeof(int), 0, NULL, NULL, NULL);
            int i;
            for (i=0; i<run_size; i++) {
                ptr[i] = i;
            }
            clEnqueueUnmapMemObject(queue, in, ptr, 0, NULL, NULL);
        }

        {
            int *ptr = (int*)clEnqueueMapBuffer(queue, out, CL_TRUE, CL_MAP_WRITE, 0, run_size*sizeof(int), 0, NULL, NULL, NULL);
            int i;
            for (i=0; i<run_size; i++) {
                ptr[i] = 0xdeadbeef;
            }
            clEnqueueUnmapMemObject(queue, out, ptr, 0, NULL, NULL);
        }

        err = clFinish(queue);

        global_size[0] = run_size;
        err = clEnqueueNDRangeKernel(queue, ker, 1, NULL, global_size, NULL, 0, NULL, NULL);
        if (err != CL_SUCCESS) {
            puts("enqueue nd");
        }
        err = clFinish(queue);
        if (err != CL_SUCCESS) {
            puts("fini");
        }

        if (ei == 0) {
            int *ptr = (int*)clEnqueueMapBuffer(queue, out, CL_TRUE, CL_MAP_READ, 0, run_size*sizeof(int), 0, NULL, NULL, NULL);
            int i;
            for (i=0; i<run_size; i++) {
                printf("%d : %x\n", i, ptr[i]);
            }
            clEnqueueUnmapMemObject(queue, in, ptr, 0, NULL, NULL);
        }

        err = clFinish(queue);

        clReleaseMemObject(in);
        clReleaseMemObject(out);
        clReleaseKernel(ker);
        clReleaseProgram(cl_prog);
    }

    return 0;
}
Exemplo n.º 22
0
/**
 * \brief Creates and initializes the working data for the plan
 * \param plan The Plan struct that holds the plan's data values.
 * \return Error flag value
 */
int initOPENCL_MEMPlan(void *plan){   // <- Replace YOUR_NAME with the name of your module.
    if(!plan){
        return make_error(ALLOC, generic_err);           // <- This is the error code for one of the malloc fails.
    }
    Plan *p;
    OPENCL_MEM_DATA *d;
    p = (Plan *)plan;

    #ifdef HAVE_PAPI
    int temp_event, i;
    int PAPI_Events [NUM_PAPI_EVENTS] = PAPI_COUNTERS;
    char *PAPI_units [NUM_PAPI_EVENTS] = PAPI_UNITS;
    #endif //HAVE_PAPI

    if(p){
        d = (OPENCL_MEM_DATA *)p->vptr;
        p->exec_count = 0;           // Initialize the plan execution count to zero.
        perftimer_init(&p->timers, NUM_TIMERS);         // Initialize all performance timers to zero.

        #ifdef HAVE_PAPI
        /* Initialize plan's PAPI data */
        p->PAPI_EventSet = PAPI_NULL;
        p->PAPI_Num_Events = 0;

        TEST_PAPI(PAPI_create_eventset(&p->PAPI_EventSet), PAPI_OK, MyRank, 9999, PRINT_SOME);

        //Add the desired events to the Event Set; ensure the dsired counters
        //  are on the system then add, ignore otherwise
        for(i = 0; i < TOTAL_PAPI_EVENTS && i < NUM_PAPI_EVENTS; i++){
            temp_event = PAPI_Events[i];
            if(PAPI_query_event(temp_event) == PAPI_OK){
                p->PAPI_Num_Events++;
                TEST_PAPI(PAPI_add_event(p->PAPI_EventSet, temp_event), PAPI_OK, MyRank, 9999, PRINT_SOME);
            }
        }

        PAPIRes_init(p->PAPI_Results, p->PAPI_Times);
        PAPI_set_units(p->name, PAPI_units, NUM_PAPI_EVENTS);

        TEST_PAPI(PAPI_start(p->PAPI_EventSet), PAPI_OK, MyRank, 9999, PRINT_SOME);
        #endif     //HAVE_PAPI
    }
    if(d){
        cl_int error;

        pthread_mutex_lock(&opencl_platform_mutex);
        error = clGetPlatformIDs(0, NULL,&(d->num_platforms));
        pthread_mutex_unlock(&opencl_platform_mutex);

        assert(error == CL_SUCCESS);
        d->platforms = (cl_platform_id *)malloc(sizeof(cl_platform_id) * d->num_platforms);
        pthread_mutex_lock(&opencl_platform_mutex);
        error = clGetPlatformIDs(d->num_platforms, d->platforms, NULL);
        pthread_mutex_unlock(&opencl_platform_mutex);

        assert(error == CL_SUCCESS);
        error = clGetDeviceIDs(d->platforms[0],CL_DEVICE_TYPE_ALL, 0, NULL, &(d->num_devices));
        assert(error == CL_SUCCESS);
        d->devices = (cl_device_id *)malloc(sizeof(cl_device_id) * d->num_devices);
        error = clGetDeviceIDs(d->platforms[0],CL_DEVICE_TYPE_ALL, d->num_devices, d->devices, NULL);
        assert(error == CL_SUCCESS);

        d->context = clCreateContext(NULL, 1, &(d->devices[d->device_id]), NULL, NULL, &error);
        assert(error == CL_SUCCESS);

        d->opencl_queue = clCreateCommandQueue(d->context, d->devices[d->device_id], 0, &error);
        assert(error == CL_SUCCESS);

        error = clGetDeviceInfo(d->devices[d->device_id], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &(d->device_memory), NULL);
        assert(error == CL_SUCCESS);

        d->device_memory -= SUB_FACTOR;

        d->buffer = clCreateBuffer(d->context, CL_MEM_WRITE_ONLY, d->device_memory, NULL, &error);
        assert(error == CL_SUCCESS);

        size_t page_size = sysconf(_SC_PAGESIZE);
        error = posix_memalign((void **)&(d->return_buffer), page_size, d->device_memory);
        assert(error == 0);

        d->program = clCreateProgramWithSource(d->context, 1, (const char **)&opencl_program,NULL,&error);
        assert(error == CL_SUCCESS);

        error = clBuildProgram(d->program,1,&(d->devices[d->device_id]),NULL,NULL,NULL);
        assert(error == CL_SUCCESS);

        d->kernel = clCreateKernel(d->program, "write_pattern", &error);
        assert(error == CL_SUCCESS);
    }
    return ERR_CLEAN;     // <- This indicates a clean run with no errors. Does not need to be changed.
} /* initOPENCL_MEMPlan */
int main(int argc, char** argv) {

  printf("WG size of kernel = %d X %d\n", BLOCK_SIZE, BLOCK_SIZE);

	cl_int error;
	cl_uint num_platforms;
	
	// Get the number of platforms
	error = clGetPlatformIDs(0, NULL, &num_platforms);
    if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
	
	// Get the list of platforms
	cl_platform_id* platforms = (cl_platform_id *) malloc(sizeof(cl_platform_id) * num_platforms);
	error = clGetPlatformIDs(num_platforms, platforms, NULL);
    if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
	
	// Print the chosen platform (if there are multiple platforms, choose the first one)
	cl_platform_id platform = platforms[0];
	char pbuf[100];
	error = clGetPlatformInfo(platform, CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, NULL);
	if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
	printf("Platform: %s\n", pbuf);
	
	// Create a GPU context
	cl_context_properties context_properties[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties) platform, 0};
    context = clCreateContextFromType(context_properties, CL_DEVICE_TYPE_GPU, NULL, NULL, &error);
    if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
	
	// Get and print the chosen device (if there are multiple devices, choose the first one)
	size_t devices_size;
	error = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &devices_size);
	if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
	cl_device_id *devices = (cl_device_id *) malloc(devices_size);
	error = clGetContextInfo(context, CL_CONTEXT_DEVICES, devices_size, devices, NULL);
	if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
	device = devices[0];
	error = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(pbuf), pbuf, NULL);
	if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
	printf("Device: %s\n", pbuf);
	
	// Create a command queue
	command_queue = clCreateCommandQueue(context, device, 0, &error);
    if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
	
	

    int size;
    int grid_rows,grid_cols = 0;
    float *FilesavingTemp,*FilesavingPower; //,*MatrixOut; 
    char *tfile, *pfile, *ofile;
    
    int total_iterations = 60;
    int pyramid_height = 1; // number of iterations
	
	if (argc < 7)
		usage(argc, argv);
	if((grid_rows = atoi(argv[1]))<=0||
	   (grid_cols = atoi(argv[1]))<=0||
       (pyramid_height = atoi(argv[2]))<=0||
       (total_iterations = atoi(argv[3]))<=0)
		usage(argc, argv);
		
	tfile=argv[4];
    pfile=argv[5];
    ofile=argv[6];
	
    size=grid_rows*grid_cols;

    // --------------- pyramid parameters --------------- 
    int borderCols = (pyramid_height)*EXPAND_RATE/2;
    int borderRows = (pyramid_height)*EXPAND_RATE/2;
    int smallBlockCol = BLOCK_SIZE-(pyramid_height)*EXPAND_RATE;
    int smallBlockRow = BLOCK_SIZE-(pyramid_height)*EXPAND_RATE;
    int blockCols = grid_cols/smallBlockCol+((grid_cols%smallBlockCol==0)?0:1);
    int blockRows = grid_rows/smallBlockRow+((grid_rows%smallBlockRow==0)?0:1);

    FilesavingTemp = (float *) malloc(size*sizeof(float));
    FilesavingPower = (float *) malloc(size*sizeof(float));
    // MatrixOut = (float *) calloc (size, sizeof(float));

    if( !FilesavingPower || !FilesavingTemp) // || !MatrixOut)
        fatal("unable to allocate memory");
	
	// Read input data from disk
    readinput(FilesavingTemp, grid_rows, grid_cols, tfile);
    readinput(FilesavingPower, grid_rows, grid_cols, pfile);
	
	// Load kernel source from file
	const char *source = load_kernel_source("hotspot_kernel.cl");
	size_t sourceSize = strlen(source);
	
	// Compile the kernel
    cl_program program = clCreateProgramWithSource(context, 1, &source, &sourceSize, &error);
    if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
	
	char clOptions[110];
	//  sprintf(clOptions,"-I../../src"); 
	sprintf(clOptions," ");
#ifdef BLOCK_SIZE
	sprintf(clOptions + strlen(clOptions), " -DBLOCK_SIZE=%d", BLOCK_SIZE);
#endif

    // Create an executable from the kernel
	error = clBuildProgram(program, 1, &device, clOptions, NULL, NULL);
	// Show compiler warnings/errors
	static char log[65536]; memset(log, 0, sizeof(log));
	clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(log)-1, log, NULL);
	if (strstr(log,"warning:") || strstr(log, "error:")) printf("<<<<\n%s\n>>>>\n", log);
    if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
    kernel = clCreateKernel(program, "hotspot", &error);
    if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
	
		
	long long start_time = get_time();
	
	// Create two temperature matrices and copy the temperature input data
	cl_mem MatrixTemp[2];
	// Create input memory buffers on device
	MatrixTemp[0] = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(float) * size, FilesavingTemp, &error);
	if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
    
    // Lingjie Zhang modifited at Nov 1, 2015
    //MatrixTemp[1] = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(float) * size, NULL, &error);
    MatrixTemp[1] = clCreateBuffer(context, CL_MEM_READ_WRITE , sizeof(float) * size, NULL, &error);
    // end Lingjie Zhang modification
    
    if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
	
	// Copy the power input data
	cl_mem MatrixPower = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(float) * size, FilesavingPower, &error);
	if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
	
	// Perform the computation
	int ret = compute_tran_temp(MatrixPower, MatrixTemp, grid_cols, grid_rows, total_iterations, pyramid_height,
								blockCols, blockRows, borderCols, borderRows, FilesavingTemp, FilesavingPower);
	
	// Copy final temperature data back
	cl_float *MatrixOut = (cl_float *) clEnqueueMapBuffer(command_queue, MatrixTemp[ret], CL_TRUE, CL_MAP_READ, 0, sizeof(float) * size, 0, NULL, NULL, &error);
	if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
	
	long long end_time = get_time();	
	printf("Total time: %.3f seconds\n", ((float) (end_time - start_time)) / (1000*1000));
	
	// Write final output to output file
    writeoutput(MatrixOut, grid_rows, grid_cols, ofile);
    
	error = clEnqueueUnmapMemObject(command_queue, MatrixTemp[ret], (void *) MatrixOut, 0, NULL, NULL);
	if (error != CL_SUCCESS) fatal_CL(error, __LINE__);
	
	clReleaseMemObject(MatrixTemp[0]);
	clReleaseMemObject(MatrixTemp[1]);
	clReleaseMemObject(MatrixPower);
	
        clReleaseContext(context);

	return 0;
}
Exemplo n.º 24
0
int main(int argc, char** argv) {
    // beginning of the verbose OpenCL allocation
    cl_platform_id platform_id = NULL;
    cl_uint ret_num_platforms = 0;
    cl_uint ret_num_devices = 0;
    cl_int ret = 0;

    // the output from opencl kernel
    float *c_inputs = malloc(ARRAY_SIZE*sizeof(float));
    float *c_outputs = malloc(ARRAY_SIZE*sizeof(float));
    cl_float *cl_inputs = malloc(ARRAY_SIZE*sizeof(cl_float));
    cl_float *cl_outputs = malloc(ARRAY_SIZE*sizeof(cl_float));

    // get random numbers via Rmath
    set_seed(atoi(argv[1]), 197414);
    float tmp_in = 0.0;

    #pragma omp parallel for
    for (long i = 0; i < ARRAY_SIZE; i++) {
        tmp_in = rnorm(0, 1);
        c_inputs[i] = tmp_in;
        cl_inputs[i] = (cl_float) tmp_in;
    }

    // measure time elapse
    clock_t start = clock();
    #pragma omp parallel for
    for (long i = 0; i < ARRAY_SIZE; i++) {
        c_outputs[i] = expf(c_inputs[i]);
    }
    printf("CPU time for %d exp operation: %d\n", ARRAY_SIZE, (int) (clock() - start));

    // read kernel source
    FILE *fp;
    char filename[] = "./hello_log.cl";
    char *source_str;
    size_t source_size;
    fp = fopen(filename, "r");
    source_str = (char*) malloc(MAX_SOURCE_SIZE);
    source_size = fread(source_str,
                        1,
                        MAX_SOURCE_SIZE,
                        fp);
    fclose(fp);

    // get platform and device info
    ret = clGetPlatformIDs(1,
                           &platform_id,
                           &ret_num_platforms);
    cl_device_id device_ids[2];
    ret = clGetDeviceIDs(platform_id,
                         CL_DEVICE_TYPE_GPU,
                         2,
                         device_ids,
                         &ret_num_devices);
    printf("Number of devices: %5d\n", ret_num_devices);

    // print device name
    char bdname[100];
    clGetDeviceInfo(device_ids[1], CL_DEVICE_NAME, 100, bdname, NULL);
    printf("Used device: %s\n", bdname);

    // use second GPU
    cl_device_id device_id = device_ids[1];

    // create opencl context
    cl_context context = clCreateContext(NULL,
                                         1,
                                         &device_id,
                                         NULL,
                                         NULL,
                                         &ret);


    // create command queue
    cl_command_queue command_queue = clCreateCommandQueueWithProperties(context,
                                     device_id,
                                     0,
                                     &ret);

    // create memory buffer for input
    cl_mem memobj_in = clCreateBuffer(context,
                                      CL_MEM_READ_WRITE,
                                      ARRAY_SIZE*sizeof(cl_float),
                                      NULL,
                                      &ret);

    // create memory buffer for output
    cl_mem memobj_out = clCreateBuffer(context,
                                       CL_MEM_READ_WRITE,
                                       ARRAY_SIZE*sizeof(cl_float),
                                       NULL,
                                       &ret);

    // create kernel program
    cl_program program = clCreateProgramWithSource(context,
                         1,
                         (const char **)&source_str,
                         (const size_t *)&source_size,
                         &ret);

    // build program
    ret = clBuildProgram(program,
                         1,
                         &device_id,
                         NULL,
                         NULL,
                         NULL);
    printf("build program successfully\n");

    // create opencl kernel
    cl_kernel kernel = clCreateKernel(program,
                                      "hello_exp",
                                      &ret);

    // set opencl parameters for inputs
    ret = clSetKernelArg(kernel,
                         0,
                         sizeof(cl_mem),
                         (void *)&memobj_in);

    // set opencl parameters for inputs
    ret = clSetKernelArg(kernel,
                         1,
                         sizeof(cl_mem),
                         (void *)&memobj_out);

    // execute opencl kernel
    size_t global_item_size = ARRAY_SIZE/32;
    size_t local_item_size = 32;

    // measure time
    start = clock();
    ret = clEnqueueWriteBuffer(command_queue,
                               memobj_in,
                               CL_TRUE,
                               0,
                               ARRAY_SIZE*sizeof(cl_float),
                               cl_inputs,
                               0,
                               NULL,
                               NULL);
    // run it
    ret = clEnqueueNDRangeKernel(command_queue,
                                 kernel,
                                 1,
                                 NULL,
                                 &global_item_size,
                                 &local_item_size,
                                 0,
                                 NULL,
                                 NULL);

    // copy results from the memory buffer
    ret = clEnqueueReadBuffer(command_queue,
                              memobj_out,
                              CL_TRUE,
                              0,
                              ARRAY_SIZE*sizeof(cl_float),
                              cl_outputs,
                              0,
                              NULL,
                              NULL);
    printf("GPU time (with PCI-E overhead): %d\n", (int) (clock() - start));
    printf("inputs: %3.7f  %3.7f\n", c_inputs[150000], cl_inputs[150000]);
    printf("outputs: %3.7f  %3.7f\n", c_outputs[150000], (float) cl_outputs[150000]);

    // finalization
    ret = clFlush(command_queue);
    ret = clFinish(command_queue);
    ret = clReleaseKernel(kernel);
    ret = clReleaseProgram(program);
    ret = clReleaseMemObject(memobj_in);
    ret = clReleaseMemObject(memobj_out);
    ret = clReleaseCommandQueue(command_queue);
    ret = clReleaseContext(context);
    free(source_str);

    return 0;
}
int main(int argc, char const *argv[])
{
        /* Get platform */
        cl_platform_id platform;
        cl_uint num_platforms;
        cl_int ret = clGetPlatformIDs(1, &platform, &num_platforms);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clGetPlatformIDs' failed\n");
                exit(1);
        }
        
        printf("Number of platforms: %d\n", num_platforms);
        printf("platform=%p\n", platform);
        
        /* Get platform name */
        char platform_name[100];
        ret = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_name), platform_name, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clGetPlatformInfo' failed\n");
                exit(1);
        }
        
        printf("platform.name='%s'\n\n", platform_name);
        
        /* Get device */
        cl_device_id device;
        cl_uint num_devices;
        ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, &num_devices);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clGetDeviceIDs' failed\n");
                exit(1);
        }
        
        printf("Number of devices: %d\n", num_devices);
        printf("device=%p\n", device);
        
        /* Get device name */
        char device_name[100];
        ret = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name),
        device_name, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clGetDeviceInfo' failed\n");
                exit(1);
        }
        
        printf("device.name='%s'\n", device_name);
        printf("\n");
        
        /* Create a Context Object */
        cl_context context;
        context = clCreateContext(NULL, 1, &device, NULL, NULL, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clCreateContext' failed\n");
                exit(1);
        }
        
        printf("context=%p\n", context);
        
        /* Create a Command Queue Object*/
        cl_command_queue command_queue;
        command_queue = clCreateCommandQueue(context, device, 0, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clCreateCommandQueue' failed\n");
                exit(1);
        }
        
        printf("command_queue=%p\n", command_queue);
        printf("\n");

        /* Program source */
        unsigned char *source_code;
        size_t source_length;

        /* Read program from 'tgamma_float4.cl' */
        source_code = read_buffer("tgamma_float4.cl", &source_length);

        /* Create a program */
        cl_program program;
        program = clCreateProgramWithSource(context, 1, (const char **)&source_code, &source_length, &ret);

        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clCreateProgramWithSource' failed\n");
                exit(1);
        }
        printf("program=%p\n", program);

        /* Build program */
        ret = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
        if (ret != CL_SUCCESS )
        {
                size_t size;
                char *log;

                /* Get log size */
                clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,0, NULL, &size);

                /* Allocate log and print */
                log = malloc(size);
                clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,size, log, NULL);
                printf("error: call to 'clBuildProgram' failed:\n%s\n", log);
                
                /* Free log and exit */
                free(log);
                exit(1);
        }

        printf("program built\n");
        printf("\n");
        
        /* Create a Kernel Object */
        cl_kernel kernel;
        kernel = clCreateKernel(program, "tgamma_float4", &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clCreateKernel' failed\n");
                exit(1);
        }
        
        /* Create and allocate host buffers */
        size_t num_elem = 10;
        
        /* Create and init host side src buffer 0 */
        cl_float4 *src_0_host_buffer;
        src_0_host_buffer = malloc(num_elem * sizeof(cl_float4));
        for (int i = 0; i < num_elem; i++)
                src_0_host_buffer[i] = (cl_float4){{2.0, 2.0, 2.0, 2.0}};
        
        /* Create and init device side src buffer 0 */
        cl_mem src_0_device_buffer;
        src_0_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_float4), NULL, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: could not create source buffer\n");
                exit(1);
        }        
        ret = clEnqueueWriteBuffer(command_queue, src_0_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_float4), src_0_host_buffer, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clEnqueueWriteBuffer' failed\n");
                exit(1);
        }

        /* Create host dst buffer */
        cl_float4 *dst_host_buffer;
        dst_host_buffer = malloc(num_elem * sizeof(cl_float4));
        memset((void *)dst_host_buffer, 1, num_elem * sizeof(cl_float4));

        /* Create device dst buffer */
        cl_mem dst_device_buffer;
        dst_device_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, num_elem *sizeof(cl_float4), NULL, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: could not create dst buffer\n");
                exit(1);
        }
        
        /* Set kernel arguments */
        ret = CL_SUCCESS;
        ret |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &src_0_device_buffer);
        ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &dst_device_buffer);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clSetKernelArg' failed\n");
                exit(1);
        }

        /* Launch the kernel */
        size_t global_work_size = num_elem;
        size_t local_work_size = num_elem;
        ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clEnqueueNDRangeKernel' failed\n");
                exit(1);
        }

        /* Wait for it to finish */
        clFinish(command_queue);

        /* Read results from GPU */
        ret = clEnqueueReadBuffer(command_queue, dst_device_buffer, CL_TRUE,0, num_elem * sizeof(cl_float4), dst_host_buffer, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clEnqueueReadBuffer' failed\n");
                exit(1);
        }

        /* Dump dst buffer to file */
        char dump_file[100];
        sprintf((char *)&dump_file, "%s.result", argv[0]);
        write_buffer(dump_file, (const char *)dst_host_buffer, num_elem * sizeof(cl_float4));
        printf("Result dumped to %s\n", dump_file);
        /* Free host dst buffer */
        free(dst_host_buffer);

        /* Free device dst buffer */
        ret = clReleaseMemObject(dst_device_buffer);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseMemObject' failed\n");
                exit(1);
        }
        
        /* Free host side src buffer 0 */
        free(src_0_host_buffer);

        /* Free device side src buffer 0 */
        ret = clReleaseMemObject(src_0_device_buffer);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseMemObject' failed\n");
                exit(1);
        }

        /* Release kernel */
        ret = clReleaseKernel(kernel);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseKernel' failed\n");
                exit(1);
        }

        /* Release program */
        ret = clReleaseProgram(program);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseProgram' failed\n");
                exit(1);
        }
        
        /* Release command queue */
        ret = clReleaseCommandQueue(command_queue);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseCommandQueue' failed\n");
                exit(1);
        }
        
        /* Release context */
        ret = clReleaseContext(context);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseContext' failed\n");
                exit(1);
        }
                
        return 0;
}
Exemplo n.º 26
0
long OpenCLDevice::getMaxWorkGroupSize() {
	cl_ulong value;
	check_error(clGetDeviceInfo(my_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, (sizeof(cl_ulong)), &value, NULL));
	return value;
}
Exemplo n.º 27
0
int main(int argc, char **argv)
{

	cl_int ret;


	/*
	 * Command line
	 */
	char *source_path;
	if (argc != 2)
	{
		printf("syntax: %s <kernel-source>\n", argv[0]);
		exit(1);
	}
	source_path = argv[1];


	/*
	 * Platform
	 */

	/* Get platform */
	cl_platform_id platform;
	cl_uint num_platforms;
	ret = clGetPlatformIDs(1, &platform, &num_platforms);
	if (ret != CL_SUCCESS)
	{
		printf("error: second call to 'clGetPlatformIDs' failed\n");
		exit(1);
	}
	printf("Number of platforms: %d\n", num_platforms);

	/* Get platform name */
	char platform_name[100];
	ret = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_name), platform_name, NULL);
	if (ret != CL_SUCCESS)
	{
		printf("error: call to 'clGetPlatformInfo' failed\n");
		exit(1);
	}
	printf("platform.name='%s'\n", platform_name);
	printf("\n");



	/*
	 * Device
	 */

	/* Get device */
	cl_device_id device;
	cl_uint num_devices;
	ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, &num_devices);
	if (ret != CL_SUCCESS)
	{
		printf("error: call to 'clGetDeviceIDs' failed\n");
		exit(1);
	}
	printf("Number of devices: %d\n", num_devices);

	/* Get device name */
	char device_name[100];
	ret = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name), device_name, NULL);
	if (ret != CL_SUCCESS)
	{
		printf("error: call to 'clGetDeviceInfo' failed\n");
		exit(1);
	}
	printf("device.name='%s'\n", device_name);
	printf("\n");



	/*
	 * Context
	 */
	
	/* Create context */
	cl_context context;
	context = clCreateContext(NULL, 1, &device, NULL, NULL, &ret);
	if (ret != CL_SUCCESS)
	{
		printf("error: call to 'clCreateContext' failed\n");
		exit(1);
	}

	

	/*
	 * Command Queue
	 */
	
	/* Create command queue */
	cl_command_queue command_queue;
	command_queue = clCreateCommandQueue(context, device, 0, &ret);
	if (ret != CL_SUCCESS)
	{
		printf("error: call to 'clCreateCommandQueue' failed\n");
		exit(1);
	}
	printf("\n");



	/*
	 * Program
	 */
	
	/* Program source */
	const char *source;
	size_t source_length;

	/* Read binary */
	source = read_buffer(source_path, &source_length);
	if (!source)
	{
		printf("error: %s: cannot open kernel source\n", source_path);
		exit(1);
	}
	
	/* Create a program */
	cl_program program;
	program = clCreateProgramWithSource(context, 1, &source,
			&source_length, &ret);
	if (ret != CL_SUCCESS)
	{
		printf("error: call to 'clCreateProgramWithSource' failed\n");
		exit(1);
	}

	/* Build program */
	ret = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
	if (ret != CL_SUCCESS )
	{
		size_t size;
		char *log;

		/* Get log size */
		clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &size);

		/* Allocate log and print */
		log = malloc(size);
		clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, size, log, NULL);
		printf("error: call to 'clBuildProgram' failed:\n%s\n", log);

		/* Free log and exit */
		free(log);
		exit(1);
	}
	printf("program built\n");
	printf("\n");



	/*
	 * Kernel
	 */
	
	/* Create kernel */
	cl_kernel kernel;
	kernel = clCreateKernel(program, "vector_add", &ret);
	if (ret != CL_SUCCESS)
	{
		printf("error: call to 'clCreateKernel' failed\n");
		exit(1);
	}
	printf("\n");


	/*
	 * Buffers
	 */
	
	/* Create and allocate host buffers */
	size_t num_elem = 10;

	cl_int *src1_host_buffer;
	cl_int *src2_host_buffer;
	cl_int *dst_host_buffer;
	src1_host_buffer = malloc(num_elem * sizeof(cl_int));
	src2_host_buffer = malloc(num_elem * sizeof(cl_int));
	dst_host_buffer = malloc(num_elem * sizeof(cl_int));

	/* Initialize host source buffer */
	int i;
	for (i = 0; i < num_elem; i++)
	{
		src1_host_buffer[i] = i;
		src2_host_buffer[i] = 100;
	}
	
	/* Create device source buffers */
	cl_mem src1_device_buffer;
	cl_mem src2_device_buffer;
	src1_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_int), NULL, NULL);
	src2_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_int), NULL, NULL);
	if (!src1_device_buffer || !src2_device_buffer)
	{
		printf("error: could not create destination buffer\n");
		exit(1);
	}

	/* Create device destination buffer */
	cl_mem dst_device_buffer;
	dst_device_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, num_elem * sizeof(cl_int), NULL, &ret);
	if (ret != CL_SUCCESS)
	{
		printf("error: could not create destination buffer\n");
		exit(1);
	}

	/* Copy buffer */
	ret = clEnqueueWriteBuffer(command_queue, src1_device_buffer, CL_TRUE,
		0, num_elem * sizeof(cl_int), src1_host_buffer, 0, NULL, NULL);
	ret |= clEnqueueWriteBuffer(command_queue, src2_device_buffer, CL_TRUE,
		0, num_elem * sizeof(cl_int), src2_host_buffer, 0, NULL, NULL);
	if (ret != CL_SUCCESS)
	{
		printf("error: call to 'clEnqueueWriteBuffer' failed\n");
		exit(1);
	}


	/*
	 * Kernel arguments
	 */
	
	ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), &src1_device_buffer);
	ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &src2_device_buffer);
	ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &dst_device_buffer);
	if (ret != CL_SUCCESS)
	{
		printf("error: call to 'clSetKernelArg' failed\n");
		exit(1);
	}
	
	
	/*
	 * Launch Kernel
	 */
	
	size_t global_work_size = num_elem;
	size_t local_work_size = num_elem;

	/* Launch the kernel */
	ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
		&global_work_size, &local_work_size, 0, NULL, NULL);
	if (ret != CL_SUCCESS)
	{
		printf("error: call to 'clEnqueueNDRangeKernel' failed\n");
		exit(1);
	}

	/* Wait for it to finish */
	clFinish(command_queue);


	/*
	 * Result
	 */
	
	/* Receive buffer */
	ret = clEnqueueReadBuffer(command_queue, dst_device_buffer, CL_TRUE,
		0, num_elem * sizeof(cl_int), dst_host_buffer, 0, NULL, NULL);
	if (ret != CL_SUCCESS)
	{
		printf("error: call to 'clEnqueueReadBuffer' failed\n");
		exit(1);
	}

	/* Print result */
	for (i = 0; i < num_elem; i++)
		printf("dst_host_buffer[%d] = %d\n", i, dst_host_buffer[i]);
	printf("\n");

	return 0;
}
Exemplo n.º 28
0
long OpenCLDevice::getMaxMemAllocSize() {
	cl_ulong value;
	check_error(clGetDeviceInfo(my_id, CL_DEVICE_MAX_MEM_ALLOC_SIZE, (sizeof(cl_ulong)), &value, NULL));
	return value;
}
Exemplo n.º 29
0
/* Find a GPU or CPU associated with the first available platform */
void CLHash_Utilities_CreateContext_p(cl_context *context, cl_command_queue *command_queue, const char *file , int line) {

   uint num_platforms;
   cl_platform_id *platforms;
   cl_device_id device;
   int err;

   /* Get all available platforms */
   err = clGetPlatformIDs(0, NULL, &num_platforms);
   CLHash_Utilities_HandleError(err, "CLHash_Utilities_CreateContext", "clGetPlatformIDs");
   platforms = (cl_platform_id *)malloc(num_platforms*sizeof(cl_platform_id));
   /* Identify a platform */
   err = clGetPlatformIDs(num_platforms, platforms, NULL);
   CLHash_Utilities_HandleError(err, "CLHash_Utilities_CreateContext", "clGetPlatformIDs");

   if (DEBUG == 1) {
     char info[1024];
     for (int iplatform=0; iplatform<num_platforms; iplatform++){
       printf("  Platform %d:\n",iplatform+1);

       //clGetPlatformInfo(platforms[iplatform],CL_PLATFORM_PROFILE,   1024L,info,0);
       //printf("    CL_PLATFORM_PROFILE    : %s\n",info);

       clGetPlatformInfo(platforms[iplatform],CL_PLATFORM_VERSION,   1024L,info,0);
       printf("    CL_PLATFORM_VERSION    : %s\n",info);

       clGetPlatformInfo(platforms[iplatform],CL_PLATFORM_NAME,      1024L,info,0);
       printf("    CL_PLATFORM_NAME       : %s\n",info);

       clGetPlatformInfo(platforms[iplatform],CL_PLATFORM_VENDOR,    1024L,info,0);
       printf("    CL_PLATFORM_VENDOR     : %s\n",info);

       //clGetPlatformInfo(platforms[iplatform],CL_PLATFORM_EXTENSIONS,1024L,info,0);
       // printf("    CL_PLATFORM_EXTENSIONS : %s\n",info);
     }
   }

   /* Access a device */
   for (int iplatform=0; iplatform<num_platforms; iplatform++){
     err = clGetDeviceIDs(platforms[iplatform], CL_DEVICE_TYPE_GPU, 1, &device, NULL);
     if(err != CL_DEVICE_NOT_FOUND){
       break;
     }
   }
   if(err == CL_DEVICE_NOT_FOUND){
     for (int iplatform=0; iplatform<num_platforms; iplatform++){
       err = clGetDeviceIDs(platforms[iplatform], CL_DEVICE_TYPE_ACCELERATOR, 1, &device, NULL);
       if(err != CL_DEVICE_NOT_FOUND){
         break;
       }
     }
     if(err == CL_DEVICE_NOT_FOUND){
       for (int iplatform=0; iplatform<num_platforms; iplatform++){
         err = clGetDeviceIDs(platforms[iplatform], CL_DEVICE_TYPE_CPU, 1, &device, NULL);
           if(err != CL_DEVICE_NOT_FOUND){
           break;
         }
       }
     }
   }
   CLHash_Utilities_HandleError(err, "CLHash_Utilities_CreateContext", "clGetDeviceIDs");

   if (DEBUG == 1) {
     char info[1024];
   
     printf("\n\n");
     printf("  Device:\n");
     clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(info), info, NULL);
     printf("    CL_DEVICE_NAME         : %s\n",info);

     clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(info), info, NULL);
     printf("    CL_DEVICE_VENDOR       : %s\n",info);
   }

   *context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
   if(err != CL_SUCCESS) CLHash_Utilities_PrintError_p(err, "CLHash_Utilities_CreateContext", "clCreateContext", file, line);

   *command_queue = clCreateCommandQueue(*context, device, CL_QUEUE_PROFILING_ENABLE, &err);
   if(err != CL_SUCCESS) CLHash_Utilities_PrintError_p(err, "CLHash_Utilities_CreateContext", "clCreateCommandQueue", file, line);

   free(platforms);
}
Exemplo n.º 30
0
int
main ()
{
  int err, i;
  cl_platform_id platform;
  cl_device_id device;
  cl_context context;
  cl_context_properties context_props[3];
  cl_command_queue queue;
  cl_program program;
  cl_kernel kernel;
  cl_mem buffer;

  size_t len;
  const char *program_source = NULL;
  char *device_extensions = NULL;
  char kernel_build_opts[256];
  size_t size = sizeof (cl_int) * SIZE;
  const size_t global_work_size[] = {SIZE, 0, 0}; /* size of each dimension */
  cl_int *data;

  /* In order to see which devices the OpenCL implementation on your platform
     provides you may issue a call to the print_clinfo () fuction.  */

  /* Initialize the data the OpenCl program operates on.  */
  data = (cl_int*) calloc (1, size);
  if (data == NULL)
    {
      fprintf (stderr, "calloc failed\n");
      exit (EXIT_FAILURE);
    }

  /* Pick the first platform.  */
  CHK (clGetPlatformIDs (1, &platform, NULL));
  /* Get the default device and create context.  */
  CHK (clGetDeviceIDs (platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, NULL));
  context_props[0] = CL_CONTEXT_PLATFORM;
  context_props[1] = (cl_context_properties) platform;
  context_props[2] = 0;
  context = clCreateContext (context_props, 1, &device, NULL, NULL, &err);
  CHK_ERR ("clCreateContext", err);
  queue = clCreateCommandQueue (context, device, 0, &err);
  CHK_ERR ("clCreateCommandQueue", err);

  /* Query OpenCL extensions of that device.  */
  CHK (clGetDeviceInfo (device, CL_DEVICE_EXTENSIONS, 0, NULL, &len));
  device_extensions = (char *) malloc (len);
  CHK (clGetDeviceInfo (device, CL_DEVICE_EXTENSIONS, len, device_extensions,
			NULL));
  strcpy (kernel_build_opts, "-Werror -cl-opt-disable");
  if (strstr (device_extensions, "cl_khr_fp64") != NULL)
    strcpy (kernel_build_opts + strlen (kernel_build_opts),
	    " -D HAVE_cl_khr_fp64");
  if (strstr (device_extensions, "cl_khr_fp16") != NULL)
    strcpy (kernel_build_opts + strlen (kernel_build_opts),
	    " -D HAVE_cl_khr_fp16");

  /* Read the OpenCL kernel source into the main memory.  */
  program_source = read_file (STRINGIFY (CL_SOURCE), &len);
  if (program_source == NULL)
    {
      fprintf (stderr, "file does not exist: %s\n", STRINGIFY (CL_SOURCE));
      exit (EXIT_FAILURE);
    }

  /* Build the OpenCL kernel.  */
  program = clCreateProgramWithSource (context, 1, &program_source,
				       &len, &err);
  free ((void*) program_source);
  CHK_ERR ("clCreateProgramWithSource", err);
  err = clBuildProgram (program, 0, NULL, kernel_build_opts, NULL,
			NULL);
  if (err != CL_SUCCESS)
    {
      size_t len;
      char *clbuild_log = NULL;
      CHK (clGetProgramBuildInfo (program, device, CL_PROGRAM_BUILD_LOG, 0,
				  NULL, &len));
      clbuild_log = malloc (len);
      if (clbuild_log)
	{
	  CHK (clGetProgramBuildInfo (program, device, CL_PROGRAM_BUILD_LOG,
				      len, clbuild_log, NULL));
	  fprintf (stderr, "clBuildProgram failed with:\n%s\n", clbuild_log);
 	  free (clbuild_log);
        }
      exit (EXIT_FAILURE);
  }

  /* In some cases it might be handy to save the OpenCL program binaries to do
     further analysis on them.  In order to do so you may call the following
     function: save_program_binaries (program);.  */

  kernel = clCreateKernel (program, "testkernel", &err);
  CHK_ERR ("clCreateKernel", err);

  /* Setup the input data for the kernel.  */
  buffer = clCreateBuffer (context, CL_MEM_USE_HOST_PTR, size, data, &err);
  CHK_ERR ("clCreateBuffer", err);

  /* Execute the kernel (data parallel).  */
  CHK (clSetKernelArg (kernel, 0, sizeof (buffer), &buffer));
  CHK (clEnqueueNDRangeKernel (queue, kernel, 1, NULL, global_work_size, NULL,
			       0, NULL, NULL));

  /* Fetch the results (blocking).  */
  CHK (clEnqueueReadBuffer (queue, buffer, CL_TRUE, 0, size, data, 0, NULL,
			    NULL));

  /* Compare the results.  */
  for (i = 0; i < SIZE; i++)
    {
      if (data[i] != 0x1)
	{
	  fprintf (stderr, "error: data[%d]: %d != 0x1\n", i, data[i]);
	  exit (EXIT_FAILURE);
	}
    }

  /* Cleanup.  */
  CHK (clReleaseMemObject (buffer));
  CHK (clReleaseKernel (kernel));
  CHK (clReleaseProgram (program));
  CHK (clReleaseCommandQueue (queue));
  CHK (clReleaseContext (context));
  free (data);

  return 0;
}