Example #1
0
void QueryOpenCLEnvironmentInfo( OpenCLEnvironmentInfo *opencl_env )
{
    cl_platform_id *platform_ids = NULL;

    opencl_env->platforms = NULL;

    // Detects the number of available OpenCL platforms
    CL_CHECK( clGetPlatformIDs( 1,
                                NULL,
                               &opencl_env->num_platforms ) );

    if ( opencl_env->num_platforms > 0 )
    {
        platform_ids = ( cl_platform_id* ) malloc( sizeof( cl_platform_id ) * opencl_env->num_platforms );

        CL_CHECK( clGetPlatformIDs( opencl_env->num_platforms,
                                    platform_ids,
                                    NULL ) );

        opencl_env->platforms = ( Platform* ) malloc( sizeof( Platform ) * opencl_env->num_platforms );

        // fill in the OpenCL environment data structures
        for ( cl_uint i = 0; i < opencl_env->num_platforms; i++ )
            QueryPlatformInfo( &platform_ids[i], &opencl_env->platforms[i] );

        if ( platform_ids )
        {
            free( platform_ids );
            platform_ids = NULL;
        }
    }
    else
    {
        fprintf( stderr, "No OpenCL platforms found. Exiting...\n" );
        exit( EXIT_FAILURE );
    }

    int opencl_devices_found = 0; // false

    for ( cl_uint i = 0; i < opencl_env->num_platforms; i++ )
        opencl_devices_found = ( opencl_env->platforms[i].num_devices ) ? 1 : 0;

    if ( !opencl_devices_found )
    {
        fprintf( stderr, "No OpenCL devices found. Exiting...\n" );
        exit( EXIT_FAILURE );
    }
}
void OpenCLMemory::free() {
  OpenCLDevice& current_device =
      OpenCLManager::CurrentPlatform()->CurrentDevice();
  cl_int err;

  if (this->ptr_device_mem_ != NULL) {
    if (this->hasEvent()) {
      CL_CHECK(
          clWaitForEvents(
              1,
              &this->memoryEvent));
      this->resetEvent();
    }
    err = clReleaseMemObject(
        this->ptr_device_mem_);
    if (err != CL_SUCCESS) {
      std::ostringstream oss;
      oss << current_device.name() << "> failed to call clReleaseMemObject("
          << this->ptr_device_mem << ").";
      LOG(ERROR)<< oss.str().c_str();
      throw OpenCLMemoryException(
          oss.str());
    }
    this->ptr_device_mem_ = NULL;
    DLOG(INFO)<< current_device.name() << "> clReleaseMemObject("
    << this->ptr_device_mem << ") succeeded.";
    numCallsFree++;
    logStatistics();
  }
}
clblasStatus SGEMM_mod1024(
	clblasTranspose transA,
	clblasTranspose transB,
	cl_uint M, cl_uint N, cl_uint K,
	float alpha,
	cl_mem A, cl_uint offA, cl_uint lda,
	cl_mem B, cl_uint offB, cl_uint ldb,
	float beta,
	cl_mem C, cl_uint offC, cl_uint ldc,
	cl_uint numCommandQueues,
	cl_command_queue *commandQueues,
	cl_uint numEventsInWaitList,
	const cl_event *eventWaitList,
	cl_event *events,
	bool &specialCaseHandled)
{
	const char *tileKernelSource = NULL;
	cl_kernel  *tileClKernel = NULL;
	size_t tileKernelBinarySize = 0;
	cl_int err;


	const unsigned char *tileKernelBinary = NULL;

	clblasStatus status;


	//split the kernel calls to handle sgemm NT perf drop at big multiples of 1024
	if ((lda % 1024 == 0) && (ldb % 1024 == 0) && (K > lda / 4))
	{
		if ((lda == ldb) && (lda >= 4096) && (lda <= 8192)) // between 4096 and 8192 for now
		{
			if (lda != 6144)// 6144 is handled by 96 x 96 kernel
			{
				// we are going to call 16 GEMMs with M=M/2, N=N/2, K=K/4
				// each GEMM requires M%128 == 0, N%128 == 0, K%16 == 0
				if (M % 256 == 0 && N % 256 == 0 && K % 64 == 0)
				{
					if (!((transA == clblasNoTrans) && (transB == clblasTrans)))
						return clblasNotImplemented;

					specialCaseHandled = true;
					unsigned int M_split_factor;
					unsigned int N_split_factor;
					unsigned int K_split_factor;

					if (lda < 7168)
					{
						M_split_factor = 1;
						N_split_factor = 1;
						K_split_factor = 1;
					}
					else
					{
						//7168, 8192
						M_split_factor = 2;
						N_split_factor = 2;
						K_split_factor = 4;
					}

					tileKernelSource = sgemm_Col_NT_B1_MX128_NX128_KX16_src;
					tileClKernel = &sgemm_Col_NT_B1_MX128_NX128_KX16_clKernel;
					tileKernelBinary = sgemm_Col_NT_B1_MX128_NX128_KX16_bin;
					tileKernelBinarySize = sgemm_Col_NT_B1_MX128_NX128_KX16_binSize;

					makeGemmKernel(tileClKernel, commandQueues[0], tileKernelSource, User_srcBuildOptions, &tileKernelBinary, &tileKernelBinarySize, User_binBuildOptions);

					err = clSetKernelArg(*tileClKernel, 0, sizeof(cl_mem), &A);
					CL_CHECK(err);
					err = clSetKernelArg(*tileClKernel, 1, sizeof(cl_mem), &B);
					CL_CHECK(err);
					err = clSetKernelArg(*tileClKernel, 2, sizeof(cl_mem), &C);
					CL_CHECK(err);
					err = clSetKernelArg(*tileClKernel, 3, sizeof(cl_float), &alpha);
					CL_CHECK(err);
					err = clSetKernelArg(*tileClKernel, 4, sizeof(cl_float), &beta);
					CL_CHECK(err);
					err = clSetKernelArg(*tileClKernel, 5, sizeof(cl_uint), &M);
					CL_CHECK(err);
					err = clSetKernelArg(*tileClKernel, 6, sizeof(cl_uint), &N);
					CL_CHECK(err);
					err = clSetKernelArg(*tileClKernel, 7, sizeof(cl_uint), &K);
					CL_CHECK(err);
					err = clSetKernelArg(*tileClKernel, 8, sizeof(cl_uint), &lda);
					CL_CHECK(err);
					err = clSetKernelArg(*tileClKernel, 9, sizeof(cl_uint), &ldb);
					CL_CHECK(err);
					err = clSetKernelArg(*tileClKernel, 10, sizeof(cl_uint), &ldc);
					CL_CHECK(err);
					err = clSetKernelArg(*tileClKernel, 11, sizeof(cl_uint), &offA);
					CL_CHECK(err);
					err = clSetKernelArg(*tileClKernel, 12, sizeof(cl_uint), &offB);
					CL_CHECK(err);
					err = clSetKernelArg(*tileClKernel, 13, sizeof(cl_uint), &offC);
					CL_CHECK(err);

					status = GEMM_SPLIT_CALLS(
						tileClKernel, clblasColumnMajor,
						128, 16,
						M_split_factor,
						N_split_factor, K_split_factor,
						transA,
						transB,
						M, N, K,
						alpha,
						A, offA, lda,
						B, offB, ldb,
						beta,
						C, offC, ldc,
						numCommandQueues,
						commandQueues,
						numEventsInWaitList,
						eventWaitList,
						events);


					return status;
				}
			}
			else
			{
				// lda == ldb == 6144
				// we are going to call 4 GEMMs each with K = K/4
				if (M % 96 == 0 && N % 96 == 0 && K % 64 == 0)
				{
					if (!((transA == clblasNoTrans) && (transB == clblasTrans)))
						return clblasNotImplemented;

					specialCaseHandled = true;
					unsigned int M_split_factor = 1;
					unsigned int N_split_factor = 1;
					unsigned int K_split_factor = 4;



					tileKernelSource = sgemm_Col_NT_B1_MX096_NX096_KX16_src;
					tileClKernel = &sgemm_Col_NT_B1_MX096_NX096_KX16_clKernel;
					tileKernelBinary = sgemm_Col_NT_B1_MX096_NX096_KX16_bin;
					tileKernelBinarySize = sgemm_Col_NT_B1_MX096_NX096_KX16_binSize;

					makeGemmKernel(tileClKernel, commandQueues[0], tileKernelSource, User_srcBuildOptions, &tileKernelBinary, &tileKernelBinarySize, User_binBuildOptions);

					err = clSetKernelArg(*tileClKernel, 0, sizeof(cl_mem), &A);
					CL_CHECK(err);
					err = clSetKernelArg(*tileClKernel, 1, sizeof(cl_mem), &B);
					CL_CHECK(err);
					err = clSetKernelArg(*tileClKernel, 2, sizeof(cl_mem), &C);
					CL_CHECK(err);
					err = clSetKernelArg(*tileClKernel, 3, sizeof(cl_float), &alpha);
					CL_CHECK(err);
					err = clSetKernelArg(*tileClKernel, 4, sizeof(cl_float), &beta);
					CL_CHECK(err);
					err = clSetKernelArg(*tileClKernel, 5, sizeof(cl_uint), &M);
					CL_CHECK(err);
					err = clSetKernelArg(*tileClKernel, 6, sizeof(cl_uint), &N);
					CL_CHECK(err);
					err = clSetKernelArg(*tileClKernel, 7, sizeof(cl_uint), &K);
					CL_CHECK(err);
					err = clSetKernelArg(*tileClKernel, 8, sizeof(cl_uint), &lda);
					CL_CHECK(err);
					err = clSetKernelArg(*tileClKernel, 9, sizeof(cl_uint), &ldb);
					CL_CHECK(err);
					err = clSetKernelArg(*tileClKernel, 10, sizeof(cl_uint), &ldc);
					CL_CHECK(err);
					err = clSetKernelArg(*tileClKernel, 11, sizeof(cl_uint), &offA);
					CL_CHECK(err);
					err = clSetKernelArg(*tileClKernel, 12, sizeof(cl_uint), &offB);
					CL_CHECK(err);
					err = clSetKernelArg(*tileClKernel, 13, sizeof(cl_uint), &offC);
					CL_CHECK(err);


					status = GEMM_SPLIT_CALLS(
						tileClKernel, clblasColumnMajor,
						96, 16,
						M_split_factor,
						N_split_factor, K_split_factor,
						transA,
						transB,
						M, N, K,
						alpha,
						A, offA, lda,
						B, offB, ldb,
						beta,
						C, offC, ldc,
						numCommandQueues,
						commandQueues,
						numEventsInWaitList,
						eventWaitList,
						events);


					return status;
				}
			}
		}
	}

	return clblasNotImplemented;
}
Example #4
0
int main(int argc, char **argv)
{
  printf("enter demo main\n");
  fflush(stdout);
  putenv("POCL_VERBOSE=1");
  putenv("POCL_DEVICES=basic");
  putenv("POCL_LEAVE_TEMP_DIRS=1");
  putenv("POCL_LEAVE_KERNEL_COMPILER_TEMP_FILES=1");
  putenv("POCL_TEMP_DIR=pocl");
  putenv("POCL_CACHE_DIR=pocl");
  putenv("POCL_WORK_GROUP_METHOD=spmd");
  if(argc >= 2){
    printf("argv[1]:%s:\n",argv[1]);
    if(!strcmp(argv[1], "h"))
      putenv("POCL_WORK_GROUP_METHOD=spmd");
    if(!strcmp(argv[1], "c"))
      putenv("POCL_CROSS_COMPILE=1");
  }
  if(argc >= 3){
    printf("argv[2]:%s:\n",argv[2]);
    if(!strcmp(argv[2], "h"))
      putenv("POCL_WORK_GROUP_METHOD=spmd");
    if(!strcmp(argv[2], "c"))
      putenv("POCL_CROSS_COMPILE=1");
  }

  //putenv("LD_LIBRARY_PATH=/scratch/colins/build/linux/fs/lib");
  //putenv("LTDL_LIBRARY_PATH=/scratch/colins/build/linux/fs/lib");
  //lt_dlsetsearchpath("/scratch/colins/build/linux/fs/lib");
  //printf("SEARCH_PATH:%s\n",lt_dlgetsearchpath());
	cl_platform_id platforms[100];
	cl_uint platforms_n = 0;
	CL_CHECK(clGetPlatformIDs(100, platforms, &platforms_n));

	printf("=== %d OpenCL platform(s) found: ===\n", platforms_n);
	for (int i=0; i<platforms_n; i++)
	{
		char buffer[10240];
		printf("  -- %d --\n", i);
		CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, 10240, buffer, NULL));
		printf("  PROFILE = %s\n", buffer);
		CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, 10240, buffer, NULL));
		printf("  VERSION = %s\n", buffer);
		CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 10240, buffer, NULL));
		printf("  NAME = %s\n", buffer);
		CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 10240, buffer, NULL));
		printf("  VENDOR = %s\n", buffer);
		CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, 10240, buffer, NULL));
		printf("  EXTENSIONS = %s\n", buffer);
	}

	if (platforms_n == 0)
		return 1;

	cl_device_id devices[100];
	cl_uint devices_n = 0;
	// CL_CHECK(clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, 100, devices, &devices_n));
	CL_CHECK(clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 100, devices, &devices_n));

	printf("=== %d OpenCL device(s) found on platform:\n", platforms_n);
	for (int i=0; i<devices_n; i++)
	{
		char buffer[10240];
		cl_uint buf_uint;
		cl_ulong buf_ulong;
		printf("  -- %d --\n", i);
		CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(buffer), buffer, NULL));
		printf("  DEVICE_NAME = %s\n", buffer);
		CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_VENDOR, sizeof(buffer), buffer, NULL));
		printf("  DEVICE_VENDOR = %s\n", buffer);
		CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_VERSION, sizeof(buffer), buffer, NULL));
		printf("  DEVICE_VERSION = %s\n", buffer);
		CL_CHECK(clGetDeviceInfo(devices[i], CL_DRIVER_VERSION, sizeof(buffer), buffer, NULL));
		printf("  DRIVER_VERSION = %s\n", buffer);
		CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(buf_uint), &buf_uint, NULL));
		printf("  DEVICE_MAX_COMPUTE_UNITS = %u\n", (unsigned int)buf_uint);
		CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(buf_uint), &buf_uint, NULL));
		printf("  DEVICE_MAX_CLOCK_FREQUENCY = %u\n", (unsigned int)buf_uint);
		CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(buf_ulong), &buf_ulong, NULL));
		printf("  DEVICE_GLOBAL_MEM_SIZE = %llu\n", (unsigned long long)buf_ulong);
	}

	if (devices_n == 0)
		return 1;

	cl_context context;
	context = CL_CHECK_ERR(clCreateContext(NULL, 1, devices+1, &pfn_notify, NULL, &_err));

	cl_command_queue queue;
  queue = CL_CHECK_ERR(clCreateCommandQueue(context, devices[1], CL_QUEUE_PROFILING_ENABLE, &_err));

	cl_kernel kernel = 0;
  cl_mem memObjects[3] = {0,0,0};


  // Create OpenCL program - first attempt to load cached binary.
  //  If that is not available, then create the program from source
  //  and store the binary for future use.
  std::cout << "Attempting to create program from binary..." << std::endl;
  cl_program program = CreateProgramFromBinary(context, devices[1], "kernel.cl.bin");
  if (program == NULL)
  {
      std::cout << "Binary not loaded, create from source..." << std::endl;
      program = CreateProgram(context, devices[1], "kernel.cl");
      if (program == NULL)
      {
          Cleanup(context, queue, program, kernel, memObjects);
          return 1;
      }

      std::cout << "Save program binary for future run..." << std::endl;
      if (SaveProgramBinary(program, devices[1], "kernel.cl.bin") == false)
      {
          std::cerr << "Failed to write program binary" << std::endl;
          Cleanup(context, queue, program, kernel, memObjects);
          return 1;
      }
  }
  else
  {
      std::cout << "Read program from binary." << std::endl;
  }

  printf("attempting to create input buffer\n");
  fflush(stdout);
	cl_mem input_bufferA;
	input_bufferA = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float)*NUM_DATA*NUM_DATA, NULL, &_err));
	cl_mem input_bufferB;
	input_bufferB = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float)*NUM_DATA*NUM_DATA, NULL, &_err));

  printf("attempting to create output buffer\n");
  fflush(stdout);
	cl_mem output_buffer;
	output_buffer = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float)*NUM_DATA*NUM_DATA, NULL, &_err));

  memObjects[0] = input_bufferA;
  memObjects[1] = input_bufferB;
  memObjects[2] = output_buffer;

	size_t width = NUM_DATA;

  printf("attempting to create kernel\n");
  fflush(stdout);
	kernel = CL_CHECK_ERR(clCreateKernel(program, "sgemm_single", &_err));
	CL_CHECK(clSetKernelArg(kernel, 0, sizeof(input_bufferA), &input_bufferA));
	CL_CHECK(clSetKernelArg(kernel, 1, sizeof(input_bufferB), &input_bufferB));
	CL_CHECK(clSetKernelArg(kernel, 2, sizeof(output_buffer), &output_buffer));
	CL_CHECK(clSetKernelArg(kernel, 3, sizeof(width), &width));

  printf("attempting to enqueue write buffer\n");
  fflush(stdout);
	for (int i=0; i<NUM_DATA*NUM_DATA; i++) {
    float in = ((float)rand()/(float)(RAND_MAX)) * 100.0;
		CL_CHECK(clEnqueueWriteBuffer(queue, input_bufferA, CL_TRUE, i*sizeof(float), 4, &in, 0, NULL, NULL));
    in = ((float)rand()/(float)(RAND_MAX)) * 100.0;
		CL_CHECK(clEnqueueWriteBuffer(queue, input_bufferB, CL_TRUE, i*sizeof(float), 4, &in, 0, NULL, NULL));
	}

	cl_event kernel_completion;
  const size_t local_work_size[3] = { 64, 1, 1};
  //                             a_offset  
	size_t global_work_size[3] = { NUM_DATA, NUM_DATA, NUM_DATA };
  printf("attempting to enqueue kernel\n");
  fflush(stdout);
	CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &kernel_completion));
  printf("Enqueue'd kerenel\n");
  fflush(stdout);
  cl_ulong time_start, time_end;
  CL_CHECK(clWaitForEvents(1, &kernel_completion));
  CL_CHECK(clGetEventProfilingInfo(kernel_completion, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL));
  CL_CHECK(clGetEventProfilingInfo(kernel_completion, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL));
  double elapsed = time_end - time_start;
  printf("time(ns):%lg\n",elapsed);
	CL_CHECK(clReleaseEvent(kernel_completion));

	printf("Result:");
	for (int i=0; i<NUM_DATA*NUM_DATA; i++) {
		float data;
		CL_CHECK(clEnqueueReadBuffer(queue, output_buffer, CL_TRUE, i*sizeof(float), 4, &data, 0, NULL, NULL));
		//printf(" %f", data);
	}
	printf("\n");

	CL_CHECK(clReleaseMemObject(memObjects[0]));
	CL_CHECK(clReleaseMemObject(memObjects[1]));
	CL_CHECK(clReleaseMemObject(memObjects[2]));

	CL_CHECK(clReleaseKernel(kernel));
	CL_CHECK(clReleaseProgram(program));
	CL_CHECK(clReleaseContext(context));

	return 0;
}
Example #5
0
int fftCore(cl_mem dst, cl_mem src, cl_mem spin, cl_int m, enum Mode direction)
{
	cl_int ret;

	cl_int iter;
	cl_uint flag;

	cl_int n = 1 << m;

	cl_event kernelDone;

	cl_kernel brev = NULL;
	cl_kernel bfly = NULL;
	cl_kernel norm = NULL;

	brev = clCreateKernel(program, "bitReverse", &ret);
	CL_CHECK(ret);
	bfly = clCreateKernel(program, "butterfly", &ret);
	CL_CHECK(ret);
	norm = clCreateKernel(program, "norm", &ret);
	CL_CHECK(ret);

	size_t gws[2];
	size_t lws[2];

	switch (direction)
	{
		case forward: flag = 0x00000000; break;
		case inverse: flag = 0x80000000; break;
	}

	CL_CHECK(ret = clSetKernelArg(brev, 0, sizeof(cl_mem), (void *)&dst));
	CL_CHECK(ret = clSetKernelArg(brev, 1, sizeof(cl_mem), (void *)&src));
	CL_CHECK(ret = clSetKernelArg(brev, 2, sizeof(cl_int), (void *)&m));
	CL_CHECK(ret = clSetKernelArg(brev, 3, sizeof(cl_int), (void *)&n));

	CL_CHECK(ret = clSetKernelArg(bfly, 0, sizeof(cl_mem), (void *)&dst));
	CL_CHECK(ret = clSetKernelArg(bfly, 1, sizeof(cl_mem), (void *)&spin));
	CL_CHECK(ret = clSetKernelArg(bfly, 2, sizeof(cl_int), (void *)&m));
	CL_CHECK(ret = clSetKernelArg(bfly, 3, sizeof(cl_int), (void *)&n));
	CL_CHECK(ret = clSetKernelArg(bfly, 5, sizeof(cl_uint), (void *)&flag));

	CL_CHECK(ret = clSetKernelArg(norm, 0, sizeof(cl_mem), (void *)&dst));
	CL_CHECK(ret = clSetKernelArg(norm, 1, sizeof(cl_int), (void *)&n));

	setWorkSize(gws, lws, n, n);
	CL_CHECK(ret = clEnqueueNDRangeKernel(queue, brev, 2, NULL, gws, lws, 0, NULL, NULL));

	setWorkSize(gws, lws, n/2, n);
	for(iter = 1; iter <= m; iter++)
	{
		CL_CHECK(ret = clSetKernelArg(bfly, 4, sizeof(cl_int), (void *)&iter));
		CL_CHECK(ret = clEnqueueNDRangeKernel(queue, bfly, 2, NULL, gws, lws, 0, NULL, &kernelDone));
		CL_CHECK(ret = clWaitForEvents(1, &kernelDone));
	}

	if(direction == inverse)
	{
		setWorkSize(gws, lws, n, n);
		CL_CHECK(ret = clEnqueueNDRangeKernel(queue, norm, 2, NULL, gws, lws, 0, NULL, &kernelDone));
		CL_CHECK(ret = clWaitForEvents(1, &kernelDone));
	}

	CL_CHECK(ret = clReleaseKernel(bfly));
	CL_CHECK(ret = clReleaseKernel(brev));
	CL_CHECK(ret = clReleaseKernel(norm));

	return 0;
}
Example #6
0
int main(int argc, char *argv[])
{
	//fprintf(stderr, "[%s:%d:%s()] FFT!\n", __FILE__, __LINE__, __func__);
	LOG("FFT Start\n");
	cl_mem xmobj = NULL;
	cl_mem rmobj = NULL;
	cl_mem wmobj = NULL;
	cl_kernel sfac = NULL;
	cl_kernel trns = NULL;
	cl_kernel hpfl = NULL;

	cl_uint ret_num_platforms;
	cl_uint ret_num_devices;

	cl_int ret;

	cl_float2 *xm;
	cl_float2 *rm;
	cl_float2 *wm;

	pgm_t ipgm;
	pgm_t opgm;

	FILE *fp;
	const char fileName[] = "./fft.cl";
	size_t source_size;
	char *source_str;
	cl_int i, j;
	cl_int n;
	cl_int m;

	size_t gws[2];
	size_t lws[2];

	fp = fopen(fileName, "r");
	if(!fp)
	{
		fprintf(stderr, "[%s:%d:%s()] ERROR, Failed to load kernel source.\n", __FILE__, __LINE__, __func__);
		return 1;
	}

	source_str = (char *)malloc(MAX_SOURCE_SIZE);
	source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
	fclose(fp);

	readPGM(&ipgm, "./lena.pgm");

	n = ipgm.width;
	m = (cl_int)(log((double)n)/log(2.0));

	LOG("n = %d, m = %d.\n", m, n);

	xm = (cl_float2*)malloc(n*n*sizeof(cl_float2));
	rm = (cl_float2*)malloc(n*n*sizeof(cl_float2));
	wm = (cl_float2*)malloc(n/2 *sizeof(cl_float2));

	for( i = 0; i < n; i++)
	{
		for(j = 0; j < n; j++)
		{
			((float*)xm)[2*(n*j + i) + 0] = (float)ipgm.buf[n*j + i];
			((float*)xm)[2*(n*j + i) + 1] = (float)0;
		}
	}

	CL_CHECK(ret = clGetPlatformIDs(MAX_PLATFORM_IDS, platform_ids, &ret_num_platforms));
	platform_id = platform_ids[0];
	CL_CHECK(ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices));

	LOG("platform_id = %p, device_id = %p\n", platform_id, device_id);


	context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
	CL_CHECK(ret);

	queue = clCreateCommandQueue(context, device_id, 0, &ret);

	xmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, n*n*sizeof(cl_float2), NULL, &ret);
	CL_CHECK(ret);
	rmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, n*n*sizeof(cl_float2), NULL, &ret);
	CL_CHECK(ret);
	wmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, n*n*sizeof(cl_float2), NULL, &ret);
	CL_CHECK(ret);

	CL_CHECK(ret = clEnqueueWriteBuffer(queue, xmobj, CL_TRUE, 0, n*n*sizeof(cl_float2), xm, 0, NULL, NULL));

	program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);
	CL_CHECK(ret);

	CL_CHECK(ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));

	sfac = clCreateKernel(program, "spinFact", &ret);
	CL_CHECK(ret);
	trns = clCreateKernel(program, "transpose", &ret);
	CL_CHECK(ret);
	hpfl = clCreateKernel(program, "highPassFilter", &ret);
	CL_CHECK(ret);

	CL_CHECK(ret = clSetKernelArg(sfac, 0, sizeof(cl_mem), (void *)&wmobj));
	CL_CHECK(ret = clSetKernelArg(sfac, 1, sizeof(cl_int), (void *)&n));
	setWorkSize(gws, lws, n/2, 1);
	CL_CHECK(ret = clEnqueueNDRangeKernel(queue, sfac, 1, NULL, gws, lws, 0, NULL, NULL));

	fftCore(rmobj, xmobj, wmobj, m, forward);

	CL_CHECK(ret = clSetKernelArg(trns, 0, sizeof(cl_mem), (void *)&xmobj));
	CL_CHECK(ret = clSetKernelArg(trns, 1, sizeof(cl_mem), (void *)&rmobj));
	CL_CHECK(ret = clSetKernelArg(trns, 2, sizeof(cl_int), (void *)&n));
	setWorkSize(gws, lws, n, n);
	CL_CHECK(ret = clEnqueueNDRangeKernel(queue, trns, 2, NULL, gws, lws, 0, NULL, NULL));

	fftCore(rmobj, xmobj, wmobj, m, forward);

#if 1 //FILTER
	cl_int radius = n>>4;
	CL_CHECK(ret = clSetKernelArg(hpfl, 0, sizeof(cl_mem), (void *)&rmobj));
	CL_CHECK(ret = clSetKernelArg(hpfl, 1, sizeof(cl_int), (void *)&n));
	CL_CHECK(ret = clSetKernelArg(hpfl, 2, sizeof(cl_int), (void *)&radius));
	setWorkSize(gws, lws, n, n);
	CL_CHECK(ret = clEnqueueNDRangeKernel(queue, hpfl, 2, NULL, gws, lws, 0, NULL, NULL));
#endif

#if 1 /* Inverse FFT */
	fftCore(xmobj, rmobj, wmobj, m, inverse);

	CL_CHECK(ret = clSetKernelArg(trns, 0, sizeof(cl_mem), (void *)&rmobj));
	CL_CHECK(ret = clSetKernelArg(trns, 1, sizeof(cl_mem), (void *)&xmobj));
	CL_CHECK(ret = clSetKernelArg(trns, 2, sizeof(cl_int), (void *)&n));
	setWorkSize(gws, lws, n, n);
	CL_CHECK(ret = clEnqueueNDRangeKernel(queue, trns, 2, NULL, gws, lws, 0, NULL, NULL));

	fftCore(xmobj, rmobj, wmobj, m, inverse);
#endif

	CL_CHECK(ret = clEnqueueReadBuffer(queue, xmobj, CL_TRUE, 0, n*n*sizeof(cl_float2), xm, 0, NULL, NULL));

	float *ampd;
	ampd = (float*)malloc(n*n*sizeof(float));
	for(i = 0; i < n; i++)
	{
		for(j = 0; j < n; j++)
		{
			ampd[n*i + j] = AMP( ((float*)xm)[2*(n*i + j)], ((float*)xm)[2*(n*i + j) + 1] );
//			fprintf(stderr, "%d ", (int)ampd[n*i + j]);
		}
//		fprintf(stderr, "\n");
	}

	opgm.width = n;
	opgm.height = n;
	normalizeF2PGM(&opgm, ampd);
	free(ampd);

	writePGM(&opgm, "output.pgm");

	/* Termination */
	CL_CHECK(ret = clFlush(queue));
	CL_CHECK(ret = clFinish(queue));
	CL_CHECK(ret = clReleaseKernel(hpfl));
	CL_CHECK(ret = clReleaseKernel(trns));
	CL_CHECK(ret = clReleaseKernel(sfac));
	CL_CHECK(ret = clReleaseProgram(program));
	CL_CHECK(ret = clReleaseMemObject(xmobj));
	CL_CHECK(ret = clReleaseMemObject(rmobj));
	CL_CHECK(ret = clReleaseMemObject(wmobj));
	CL_CHECK(ret = clReleaseCommandQueue(queue));
	CL_CHECK(ret = clReleaseContext(context));

	destroyPGM(&ipgm);
	destroyPGM(&opgm);

	free(source_str);
	free(wm);
	free(rm);
	free(xm);

	return 0;
}
int fft_main(cl_mem dst, cl_mem src, cl_mem twiddles, cl_int m, enum Tipo direcao, struct event_in_fft_t *fft_event)
{
    cl_int ret_code;

    cl_int iter;
    cl_uint flag;

    size_t global_wg[2];
    size_t local_wg[2];

    cl_int n = 1 << m;

    cl_kernel kernel_bits_rev = NULL;
    cl_kernel kernel_butterfly_op = NULL;
    cl_kernel kernel_normalize = NULL;

    kernel_bits_rev = clCreateKernel(program, "bits_reverse", &ret_code);
    kernel_butterfly_op = clCreateKernel(program, "butterfly_operation", &ret_code);
    kernel_normalize = clCreateKernel(program, "normalizar", &ret_code);

    switch (direcao) {
        case direta:flag = 0x00000000; break;
        case inversa:flag = 0x80000000; break;
    }

    CL_CHECK(clSetKernelArg(kernel_bits_rev, 0, sizeof(cl_mem), (void *)&dst));
    CL_CHECK(clSetKernelArg(kernel_bits_rev, 1, sizeof(cl_mem), (void *)&src));
    CL_CHECK(clSetKernelArg(kernel_bits_rev, 2, sizeof(cl_int), (void *)&m));
    CL_CHECK(clSetKernelArg(kernel_bits_rev, 3, sizeof(cl_int), (void *)&n));

    CL_CHECK(clSetKernelArg(kernel_butterfly_op, 0, sizeof(cl_mem), (void *)&dst));
    CL_CHECK(clSetKernelArg(kernel_butterfly_op, 1, sizeof(cl_mem), (void *)&twiddles));
    CL_CHECK(clSetKernelArg(kernel_butterfly_op, 2, sizeof(cl_int), (void *)&m));
    CL_CHECK(clSetKernelArg(kernel_butterfly_op, 3, sizeof(cl_int), (void *)&n));
    CL_CHECK(clSetKernelArg(kernel_butterfly_op, 5, sizeof(cl_uint), (void *)&flag));

    CL_CHECK(clSetKernelArg(kernel_normalize, 0, sizeof(cl_mem), (void *)&dst));
    CL_CHECK(clSetKernelArg(kernel_normalize, 1, sizeof(cl_int), (void *)&n));
    config_workgroup_size(global_wg, local_wg, n, n);

    CL_CHECK(clEnqueueNDRangeKernel(cmd_queue, kernel_bits_rev, 2, NULL, global_wg, local_wg, 0, NULL, &fft_event->kernel_bitsrev));

    config_workgroup_size(global_wg, local_wg, n/2, n);

    for (iter = 1; iter <= m; iter++) {
         CL_CHECK(clSetKernelArg(kernel_butterfly_op, 4, sizeof(cl_int), (void *)&iter));
         CL_CHECK(clEnqueueNDRangeKernel(cmd_queue, kernel_butterfly_op, 2, NULL, global_wg, local_wg, 0, NULL, &kernel_butter_events[butter_event_it]));
         butter_event_it++;
    }

    fft_event->kernel_normalize = NULL;

    if (direcao == inversa) {
        config_workgroup_size(global_wg, local_wg, n, n);
         CL_CHECK(clEnqueueNDRangeKernel(cmd_queue, kernel_normalize, 2, NULL, global_wg, local_wg, 0, NULL, &fft_event->kernel_normalize));
    }

    clReleaseKernel(kernel_bits_rev);
    clReleaseKernel(kernel_butterfly_op);
    clReleaseKernel(kernel_normalize);

    return 0;
}
clblasStatus DGEMM_BIG_MOD48(
	clblasTranspose transA,
	clblasTranspose transB,
	cl_uint M, cl_uint N, cl_uint K,
	double alpha,
	cl_mem A, cl_uint offA, cl_uint lda,
	cl_mem B, cl_uint offB, cl_uint ldb,
	double beta,
	cl_mem C, cl_uint offC, cl_uint ldc,
	cl_uint numCommandQueues,
	cl_command_queue *commandQueues,
	cl_uint numEventsInWaitList,
	const cl_event *eventWaitList,
	cl_event *events,
	bool &specialCaseHandled)
{
	const char *tileKernelSource = NULL;
	cl_kernel  *tileClKernel = NULL;
	size_t tileKernelBinarySize = 0;
	cl_int err;


	const unsigned char *tileKernelBinary = NULL;

	clblasStatus status;
	//split the kernel calls to handle dgemm NT perf drop when matrix sizes are big
	if ((lda == ldb) && (lda >= 18000) && (lda <= 36000)) // between 18000 and 36000 for now
	{
		if (!((transA == clblasNoTrans) && (transB == clblasTrans)))
			return clblasNotImplemented;

		unsigned int M_split_factor;
		unsigned int N_split_factor;
		unsigned int K_split_factor;
		if ((M % 192 == 0) && (N % 192 == 0) && (K % 192 == 0) && (K > lda / 4))
		{
			M_split_factor = 4;
			N_split_factor = 4;
			K_split_factor = 4;
		}
		else if ((M % 96 == 0) && (N % 96 == 0) && (K % 96 == 0) && (K > lda / 4))
		{
			M_split_factor = 2;
			N_split_factor = 2;
			K_split_factor = 2;
		}
		else
		{
			return clblasNotImplemented;
		}

		tileKernelSource = dgemm_Col_NT_B1_MX048_NX048_KX08_src;
		tileClKernel = &dgemm_Col_NT_B1_MX048_NX048_KX08_clKernel;
		tileKernelBinary = dgemm_Col_NT_B1_MX048_NX048_KX08_bin;
		tileKernelBinarySize = dgemm_Col_NT_B1_MX048_NX048_KX08_binSize;

		makeGemmKernel(tileClKernel, commandQueues[0], tileKernelSource, User_srcBuildOptions, &tileKernelBinary, &tileKernelBinarySize, User_binBuildOptions);

		err = clSetKernelArg(*tileClKernel, 0, sizeof(cl_mem), &A);
		CL_CHECK(err);
		err = clSetKernelArg(*tileClKernel, 1, sizeof(cl_mem), &B);
		CL_CHECK(err);
		err = clSetKernelArg(*tileClKernel, 2, sizeof(cl_mem), &C);
		CL_CHECK(err);
		err = clSetKernelArg(*tileClKernel, 3, sizeof(cl_double), &alpha);
		CL_CHECK(err);
		err = clSetKernelArg(*tileClKernel, 4, sizeof(cl_double), &beta);
		CL_CHECK(err);
		err = clSetKernelArg(*tileClKernel, 5, sizeof(cl_uint), &M);
		CL_CHECK(err);
		err = clSetKernelArg(*tileClKernel, 6, sizeof(cl_uint), &N);
		CL_CHECK(err);
		err = clSetKernelArg(*tileClKernel, 7, sizeof(cl_uint), &K);
		CL_CHECK(err);
		err = clSetKernelArg(*tileClKernel, 8, sizeof(cl_uint), &lda);
		CL_CHECK(err);
		err = clSetKernelArg(*tileClKernel, 9, sizeof(cl_uint), &ldb);
		CL_CHECK(err);
		err = clSetKernelArg(*tileClKernel, 10, sizeof(cl_uint), &ldc);
		CL_CHECK(err);
		err = clSetKernelArg(*tileClKernel, 11, sizeof(cl_uint), &offA);
		CL_CHECK(err);
		err = clSetKernelArg(*tileClKernel, 12, sizeof(cl_uint), &offB);
		CL_CHECK(err);
		err = clSetKernelArg(*tileClKernel, 13, sizeof(cl_uint), &offC);
		CL_CHECK(err);

		status = GEMM_SPLIT_CALLS(
			tileClKernel, clblasColumnMajor,
			48, 8,
			M_split_factor,
			N_split_factor, K_split_factor,
			transA,
			transB,
			M, N, K,
			alpha,
			A, offA, lda,
			B, offB, ldb,
			beta,
			C, offC, ldc,
			numCommandQueues,
			commandQueues,
			numEventsInWaitList,
			eventWaitList,
			events);
		if (status == clblasSuccess)
			specialCaseHandled = true;

		return status;
	}


	return clblasNotImplemented;
}
Example #9
0
int main(int argc, char *argv[])
{
    /* Variaveis obrigatorias do openCL pdccpk*/
    cl_platform_id          platform_ids[2];
    cl_device_id            device_id;
    cl_context              context;
    cl_command_queue        commands;
    cl_program              program;
    cl_kernel               kernel_sobel;

    cl_int                  ret_code;
    cl_uint                 ret_num_devices;
    cl_uint                 ret_num_platforms;
    //
    cl_event                kernel_event;
    cl_ulong                kernel_start_time   = (cl_ulong) 0;
    cl_ulong                kernel_end_time     = (cl_ulong) 0;
    cl_ulong                kernel_run_time     = (cl_ulong) 0;

    cl_event                write_host_dev_event;
    cl_ulong                write_host_dev_start_time   = (cl_ulong) 0;
    cl_ulong                write_host_dev_end_time     = (cl_ulong) 0;
    cl_ulong                write_host_dev_run_time     = (cl_ulong) 0;

    cl_event                read_dev_host_event;
    cl_ulong                read_dev_host_start_time    = (cl_ulong) 0;
    cl_ulong                read_dev_host_end_time      = (cl_ulong) 0;
    cl_ulong                read_dev_host_run_time      = (cl_ulong) 0;

    unsigned __int64        image_tam;
    const unsigned __int64  MEGA_BYTES   =  1048576; // 1024*1024
    double                  image_tam_MB;
    double                  tempo_total;

    /* objetos que serao armazenados na memoria da GPU */
    cl_mem                  image_in_mem, image_out_mem;
    /* objetos que serao armazenados na memoria local (host) */
    unsigned char           *image_in_host, *image_out_host;
    unsigned  int           image_width, image_height;
    size_t                  image_size;
    /*IMPORTANTE: dimensionamento dos compute units para exec do kernel*/
    size_t                  work_global[C_NUM_DIMENSOES];
    size_t                  work_local[C_NUM_DIMENSOES];
    /*Setup dos nomes de arquivos*/
    const char              *kernel_filename = C_NOME_ARQ_KERNEL;
    pgm_t                   ipgm, opgm;
    /* Codigo fonte do kernel dever ser aberto como uma cadeia de caracteres*/
    image_file_t* image_filename;
    char* output_filename;
    FILE                    *fp;
    size_t                  source_size;
    char                    *source_str;

    /* Timer count start */
    timer_reset();
    timer_start();

    if (argc < 2) {
        printf("**Erro: A imagem de entrada é necessaria.\n");
        exit(EXIT_FAILURE);
    }

    //===================================================================================================
    image_filename = (image_file_t *) malloc(sizeof(image_file_t));
    split_image_filename(image_filename, argv[1]);
    output_filename = (char *) malloc(40*sizeof(char));
    sprintf(output_filename, "%d.%d.%s.%s.%s", image_filename->res, image_filename->num, ENV_TYPE, APP_TYPE, EXTENSAO);
    //===================================================================================================

    fp = fopen(kernel_filename, "r");
    if (!fp) {
        fprintf(stderr, "Failed to load kernel.\n");
        exit(1);
    }

    source_str  = (char *) malloc(MAX_SOURCE_SIZE);
    source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
    fclose(fp);

    //===================================================================================================
    // Abrindo imagem do arquivo para objeto de memoria local
    if( ler_pgm(&ipgm, argv[1]) == -1)
        exit(EXIT_FAILURE);

    image_in_host = ipgm.buf;
    image_width  = ipgm.width;
    image_height = ipgm.height;

    image_size =  (int) (image_width * image_height) * sizeof(unsigned char);

    image_tam = image_size;

    /* Alocando memoria para a imagem de saida apos o processamento*/
    image_out_host = (unsigned char *) malloc(image_size);
    //===================================================================================================

    /* Recebe um vetor de platform_id e retorna sucesso
     * se encontrar plataformas OpenCL no sistema, inseridos
     * essas plataformas no vetor com no maximo MAX_PLATFORM_ID
     * entradas, caso contrario retorna codigo de erro.
     * CL_CHECK é um macro para retornar o titulo do erro
     * a partir de uma funcao que retorne um codigo de erro
     ***************************************************/

    CL_CHECK(clGetPlatformIDs(MAX_PLATFORM_ID, platform_ids, &ret_num_platforms));

    if (ret_num_platforms == 0) {
        fprintf(stderr, "[Erro] Não existem plataformas OpenCL\n");
        exit(2);
    }
    //===================================================================================================

    /* Recebe uma platform_id e retorna sucesso
     * se obter um device do tipo GPU dessa plataforma OpenCL
     * caso contrario retorna codigo de erro.
     ***************************************************/

    CL_CHECK(clGetDeviceIDs(platform_ids[1], CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices));
    //print_platform_info(&platform_ids[0]);
    //system("pause");
    //exit(0);
    //===================================================================================================

    /* Retorna sucesso se consegui criar um contexto para
     * o device id escolhido, caso contrario retorna codigo de erro.
     ***************************************************/
    context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret_code);
    //===================================================================================================

    commands = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &ret_code);
    //===================================================================================================

    program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, NULL);
    //===================================================================================================

    ret_code = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);

    if (ret_code != CL_SUCCESS) {
        char build_str[4096];
        fprintf(stderr, "[ERRO] clBuildProgram '%s' (code: %d)\n",
                error_cl_str(ret_code), ret_code );
        clGetProgramBuildInfo( program, device_id,
                               CL_PROGRAM_BUILD_LOG, sizeof(build_str), build_str, NULL);
        fprintf(stderr, "[ERRO] log: '%s'\n", build_str);
        system("pause");
        exit(4);
    }
    //===================================================================================================

    kernel_sobel = clCreateKernel(program, "sobel_kernel", NULL);
    image_in_mem  = clCreateBuffer(context, CL_MEM_READ_ONLY, image_size, NULL, NULL);
    image_out_mem = clCreateBuffer(context, CL_MEM_WRITE_ONLY, image_size , NULL, NULL);
    //===================================================================================================

    CL_CHECK(clEnqueueWriteBuffer(commands, image_in_mem, CL_TRUE, 0, image_size, image_in_host, 0, NULL, &write_host_dev_event));
    CL_CHECK(clSetKernelArg(kernel_sobel, 0, sizeof(cl_mem), &image_in_mem));
    CL_CHECK(clSetKernelArg(kernel_sobel, 1, sizeof(cl_mem), &image_out_mem));
    //===================================================================================================

    work_global[0] = image_width;
    work_global[1] = image_height;
    work_local[0] = MAX_WORK_GROUP_ITEM_SIZE_DIM_1;
    work_local[1] = MAX_WORK_GROUP_ITEM_SIZE_DIM_2;
    //===================================================================================================

    CL_CHECK(clEnqueueNDRangeKernel(commands, kernel_sobel, 2, NULL, work_global, work_local, 0, NULL,  &kernel_event) );
    // CL_CHECK(clFinish(commands));
    // CL_CHECK( clWaitForEvents(1 , &kernel_event) );
    //===================================================================================================

    CL_CHECK(clEnqueueReadBuffer(commands, image_out_mem, CL_TRUE, 0, image_size, image_out_host, 0, NULL, &read_dev_host_event));

    //== Total time elapsed =============================================================================
    timer_stop();
    tempo_total = get_elapsed_time();
    //===================================================================================================
    //====== Get time of Profile Info ===================================================================
    // kernel sobel time
    CL_CHECK(clGetEventProfilingInfo(kernel_event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &kernel_start_time, NULL));
    CL_CHECK(clGetEventProfilingInfo(kernel_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &kernel_end_time, NULL));
    // Write data time
    CL_CHECK(clGetEventProfilingInfo(write_host_dev_event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &write_host_dev_start_time, NULL));
    CL_CHECK(clGetEventProfilingInfo(write_host_dev_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &write_host_dev_end_time, NULL));
    // Read data time
    CL_CHECK(clGetEventProfilingInfo(read_dev_host_event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &read_dev_host_start_time, NULL));
    CL_CHECK(clGetEventProfilingInfo(read_dev_host_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &read_dev_host_end_time, NULL));
    //===================================================================================================

    write_host_dev_run_time = write_host_dev_end_time - write_host_dev_start_time;
    read_dev_host_run_time =  read_dev_host_end_time -  read_dev_host_start_time;
    kernel_run_time = kernel_end_time - kernel_start_time;

    image_tam_MB = (double) (((double) image_tam)/(double) MEGA_BYTES);

    //===================================================================================================
    save_log_gpu(image_filename, kernel_run_time, (double) (image_tam_MB/( (double) read_dev_host_run_time/(double) NANOSECONDS)),
        (double) (image_tam_MB/ ((double) write_host_dev_run_time/ (double) NANOSECONDS)), tempo_total, LOG_NAME);
    //===================================================================================================

    opgm.width  = image_width;
    opgm.height = image_height;
    opgm.buf    = image_out_host;

    escrever_pgm(&opgm, output_filename);

    //===================================================================================================
    CL_CHECK(clReleaseMemObject(image_in_mem));
	CL_CHECK(clReleaseEvent(kernel_event));
    CL_CHECK(clReleaseEvent(read_dev_host_event));
    CL_CHECK(clReleaseEvent(write_host_dev_event));
    CL_CHECK(clReleaseMemObject(image_out_mem));
    CL_CHECK(clReleaseProgram(program));
    CL_CHECK(clReleaseKernel(kernel_sobel));
    CL_CHECK(clReleaseCommandQueue(commands));
    CL_CHECK(clReleaseContext(context));
    destruir_pgm(&ipgm);
    destruir_pgm(&opgm);
    free(source_str);
    free(image_filename);
    free(output_filename);

    //_CrtDumpMemoryLeaks();

    return 0;
}
bool OpenCLPlatform::Query() {
  Check();

  // get the name
  cl_int err = 0;
  name_ = platform_.getInfo<CL_PLATFORM_NAME>(
      &err);
  if (!CL_CHECK(
      err)) {
    name_ = "Failed to get platform name.";
  }

  // get the vendor
  vendor_ = platform_.getInfo<CL_PLATFORM_VENDOR>(
      &err);
  if (!CL_CHECK(
      err)) {
    vendor_ = "Failed to get platform vendor.";
  }

  version_ = platform_.getInfo<CL_PLATFORM_VERSION>(
      &err);
  if (!CL_CHECK(
      err)) {
    version_ = "Failed to get platform version.";
  }

  extensions_ = platform_.getInfo<CL_PLATFORM_EXTENSIONS>(
      &err);
  if (!CL_CHECK(
      err)) {
    extensions_ = "failed to get platform extensions.";
  }

  profile_ = platform_.getInfo<CL_PLATFORM_PROFILE>(
      &err);
  if (!CL_CHECK(
      err)) {
    profile_ = "Failed to get platform profile.";
  }

  // CPU & GPU devices
  if (!CL_CHECK(
      clGetDeviceIDs(platform_(), CL_DEVICE_TYPE_CPU | CL_DEVICE_TYPE_GPU,
                     0, NULL, &(numDevices)))) {
    return false;
  }
  devicePtr = reinterpret_cast<cl_device_id*>(malloc(
      numDevices * sizeof(cl_device_id)));
  if (!CL_CHECK(
      clGetDeviceIDs(platform_(), CL_DEVICE_TYPE_CPU | CL_DEVICE_TYPE_GPU,
                     numDevices, devicePtr, NULL))) {
    return false;
  }

  // CPU devices
  if (clGetDeviceIDs(
      platform_(),
      CL_DEVICE_TYPE_CPU,
      0,
      NULL,
      &(numCPUDevices)) == CL_SUCCESS) {
    cpuDevicePtr = reinterpret_cast<cl_device_id*>(malloc(
        numCPUDevices * sizeof(cl_device_id)));
    if (!CL_CHECK(clGetDeviceIDs(platform_(), CL_DEVICE_TYPE_CPU,
                                 numCPUDevices, cpuDevicePtr, NULL))) {
      return false;
    }

    for (int i = 0; i < numCPUDevices; i++) {
      OpenCLDevice d(
          platform_(),
          cpuDevicePtr[i]);
      d.query();
      devices.push_back(
          d);
    }
  }

  // GPU DEVICES
  if (!CL_CHECK(clGetDeviceIDs(platform_(), CL_DEVICE_TYPE_GPU,
                               0, NULL, &(numGPUDevices)))) {
    return false;
  }
  gpuDevicePtr = reinterpret_cast<cl_device_id*>(malloc(
      numGPUDevices * sizeof(cl_device_id)));
  if (!CL_CHECK(clGetDeviceIDs(platform_(), CL_DEVICE_TYPE_GPU,
                               numGPUDevices, gpuDevicePtr, NULL))) {
    return false;
  }

  for (int i = 0; i < numGPUDevices; i++) {
    OpenCLDevice d(
        platform_(),
        gpuDevicePtr[i]);
    d.query();
    devices.push_back(
        d);
  }
  return true;
}
Example #11
0
static void
init_helmholtzbem3d_opencl(pcbem3d bem)
{
  uint      num_kernels;
  const char *kernel_names[] = {
    "assemble_slp_cc_list_0", "assemble_slp_cc_list_1",
    "assemble_slp_cc_list_2", "assemble_slp_cc_list_3",
    "assemble_dlp_cc_list_0", "assemble_dlp_cc_list_1",
    "assemble_dlp_cc_list_2", "assemble_dlp_cc_list_3"
  };
  cl_uint   num_devices = ocl_system.num_devices;
  cl_uint   num_queues = ocl_system.queues_per_device;
  cl_uint   nthreads = num_devices * num_queues;
  cl_int    res;
  cl_mem_flags mem_rflags, mem_wflags;
  uint      i, j;
  real     *gr_x;
  uint     *gr_t;
  real     *gr_n;
  real     *q_xw, *x, *w;
  uint      q, v, t;

  num_kernels = sizeof(kernel_names) / sizeof(kernel_names[0]);
  mem_rflags = CL_MEM_READ_ONLY;
  mem_wflags = CL_MEM_WRITE_ONLY;

  if (ocl_bem3d.num_kernels == 0) {

    /****************************************************
     * Setup all necessary kernels
     ****************************************************/

    setup_kernels(helmholtzbem3d_ocl_src, num_kernels, kernel_names,
		  &ocl_bem3d.kernels);
    ocl_bem3d.num_kernels = num_kernels;

    /****************************************************
     * Create buffers for matrix chunks
     ****************************************************/

    ocl_bem3d.mem_N = (cl_mem *) allocmem(nthreads * sizeof(cl_mem));
    ocl_bem3d.mem_ridx = (cl_mem *) allocmem(nthreads * sizeof(cl_mem));
    ocl_bem3d.mem_cidx = (cl_mem *) allocmem(nthreads * sizeof(cl_mem));

    for (i = 0; i < num_devices; ++i) {
      for (j = 0; j < num_queues; ++j) {
	ocl_bem3d.mem_N[j + i * num_queues] =
	  clCreateBuffer(ocl_system.contexts[i], mem_wflags,
			 ocl_system.max_package_size, NULL, &res);
	CL_CHECK(res)

	  ocl_bem3d.mem_ridx[j + i * num_queues] =
	  clCreateBuffer(ocl_system.contexts[i], mem_rflags,
			 ocl_system.max_package_size / sizeof(field) *
			 sizeof(uint), NULL, &res);
	CL_CHECK(res)

	  ocl_bem3d.mem_cidx[j + i * num_queues] =
	  clCreateBuffer(ocl_system.contexts[i], mem_rflags,
			 ocl_system.max_package_size / sizeof(field) *
			 sizeof(uint), NULL, &res);
	CL_CHECK(res)
      }
    }

    /****************************************************
     * Create buffer for non singular quadrature rules
     * and copy the contents
     ****************************************************/

    q = bem->sq->q;

    x = allocreal(q);
    w = allocreal(q);
    q_xw = allocreal(2 * q);

    assemble_gauss(q, x, w);

    for (i = 0; i < q; ++i) {
      q_xw[2 * i] = 0.5 + 0.5 * x[i];
      q_xw[2 * i + 1] = 0.5 * w[i];
    }

    ocl_bem3d.mem_q_xw = (cl_mem *) allocmem(num_devices * sizeof(cl_mem));
    for (i = 0; i < num_devices; ++i) {
      ocl_bem3d.mem_q_xw[i] =
	clCreateBuffer(ocl_system.contexts[i], mem_rflags,
		       2 * q * sizeof(real), NULL, &res);
      CL_CHECK(res)
	res = clEnqueueWriteBuffer(ocl_system.queues[i * num_queues],
				   ocl_bem3d.mem_q_xw[i], CL_TRUE, 0,
				   2 * q * sizeof(real), q_xw, 0, NULL, NULL);
      CL_CHECK(res);
    }

    ocl_bem3d.nq = 2 * q;

    freemem(x);
    freemem(w);
    freemem(q_xw);

    /****************************************************
     * Create buffer for singular quadrature rules
     * and copy the contents
     ****************************************************/

    q = bem->sq->q2;

    x = allocreal(q);
    w = allocreal(q);
    q_xw = allocreal(2 * q);

    assemble_gauss(q, x, w);

    for (i = 0; i < q; ++i) {
      q_xw[2 * i] = 0.5 + 0.5 * x[i];
      q_xw[2 * i + 1] = 0.5 * w[i];
    }

    ocl_bem3d.mem_q2_xw = (cl_mem *) allocmem(num_devices * sizeof(cl_mem));
    for (i = 0; i < num_devices; ++i) {
      ocl_bem3d.mem_q2_xw[i] = clCreateBuffer(ocl_system.contexts[i],
					      mem_rflags,
					      2 * q * sizeof(real), NULL,
					      &res);
      CL_CHECK(res)
	res = clEnqueueWriteBuffer(ocl_system.queues[i * num_queues],
				   ocl_bem3d.mem_q2_xw[i], CL_TRUE, 0,
				   2 * q * sizeof(real), q_xw, 0, NULL, NULL);
      CL_CHECK(res);
    }

    ocl_bem3d.nq2 = 2 * q;

    freemem(x);
    freemem(w);
    freemem(q_xw);

    /****************************************************
     * Create buffers for geometry data and copy the contents
     ****************************************************/

    v = bem->gr->vertices;
    t = bem->gr->triangles;

    gr_x = allocreal(3 * v);
    gr_t = allocuint(3 * t);
    gr_n = allocreal(3 * t);

    for (i = 0; i < v; ++i) {
      gr_x[3 * i + 0] = bem->gr->x[i][0];
      gr_x[3 * i + 1] = bem->gr->x[i][1];
      gr_x[3 * i + 2] = bem->gr->x[i][2];
    }

    for (i = 0; i < t; ++i) {
      gr_t[i + 0 * t] = bem->gr->t[i][0];
      gr_t[i + 1 * t] = bem->gr->t[i][1];
      gr_t[i + 2 * t] = bem->gr->t[i][2];
    }

    for (i = 0; i < t; ++i) {
      gr_n[3 * i + 0] = bem->gr->n[i][0];
      gr_n[3 * i + 1] = bem->gr->n[i][1];
      gr_n[3 * i + 2] = bem->gr->n[i][2];
    }

    ocl_bem3d.mem_gr_t = (cl_mem *) allocmem(num_devices * sizeof(cl_mem));
    ocl_bem3d.mem_gr_x = (cl_mem *) allocmem(num_devices * sizeof(cl_mem));

    for (i = 0; i < num_devices; ++i) {
      ocl_bem3d.mem_gr_x[i] =
	clCreateBuffer(ocl_system.contexts[i], mem_rflags,
		       3 * v * sizeof(real), NULL, &res);
      CL_CHECK(res)

	ocl_bem3d.mem_gr_t[i] =
	clCreateBuffer(ocl_system.contexts[i], mem_rflags,
		       3 * t * sizeof(uint), NULL, &res);
      CL_CHECK(res)

	res = clEnqueueWriteBuffer(ocl_system.queues[i * num_queues],
				   ocl_bem3d.mem_gr_x[i], CL_TRUE, 0,
				   3 * v * sizeof(real), gr_x, 0, NULL, NULL);
      CL_CHECK(res);

      res = clEnqueueWriteBuffer(ocl_system.queues[i * num_queues],
				 ocl_bem3d.mem_gr_t[i], CL_TRUE, 0,
				 3 * t * sizeof(uint), gr_t, 0, NULL, NULL);
      CL_CHECK(res);
    }

    ocl_bem3d.triangles = t;

    freemem(gr_x);
    freemem(gr_t);
    freemem(gr_n);

    omp_init_lock(&nf_lock);
    omp_init_lock(&nf_dist_lock);
    omp_init_lock(&nf_vert_lock);
    omp_init_lock(&nf_edge_lock);
    omp_init_lock(&nf_iden_lock);
  }
Example #12
0
static void
fill_cc_ocl_gpu_wrapper_helmholtzbem3d(void *data, uint kernel)
{
  merge_data_nf *mdata = (merge_data_nf *) data;

  cl_int    res;
  cl_uint   nq2;
  cl_uint   triangles;
  uint      current_device;
  uint      current_queue;
  uint      current_kernel_id;
  uint      current_thread;
  cl_uint   num_devices = ocl_system.num_devices;
  cl_uint   num_queues = ocl_system.queues_per_device;
  cl_event  h2d[2], calc;
  cl_kernel oclkernel;
  cl_command_queue queue;
  size_t    global_off[] = {
    0
  };
  size_t    local_size[] = {
    128
  };
  size_t    global_size[] = {
    ((mdata->pos + local_size[0] - 1) / local_size[0]) * local_size[0]
  };
  field     k = mdata->bem->k;
  field     alpha = mdata->bem->alpha;

  /****************************************************
   * Determine device, queue and kernel.
   ****************************************************/

  current_device = omp_get_thread_num() / num_queues;
  current_queue = omp_get_thread_num() % num_queues;
  current_thread = current_queue + current_device * num_queues;
  current_kernel_id = current_queue + current_device * num_queues
    + kernel * num_devices * num_queues;
  oclkernel = ocl_bem3d.kernels[current_kernel_id];
  queue = ocl_system.queues[current_thread];

  /****************************************************
   * Transfer input data.
   ****************************************************/

  res = clEnqueueWriteBuffer(queue, ocl_bem3d.mem_ridx[current_thread],
			     CL_FALSE, 0, mdata->pos * sizeof(uint),
			     mdata->ridx, 0, NULL, h2d + 0);
  CL_CHECK(res)

    res = clEnqueueWriteBuffer(queue, ocl_bem3d.mem_cidx[current_thread],
			       CL_FALSE, 0, mdata->pos * sizeof(uint),
			       mdata->cidx, 0, NULL, h2d + 1);
  CL_CHECK(res)

  /****************************************************
   * Setup kernel arguments for 'assemble_xxx_cc_list_z'
   ****************************************************/
    triangles = ocl_bem3d.triangles;

  if (kernel == 0 || kernel == 4) {
    nq2 = ocl_bem3d.nq;
    res = clSetKernelArg(oclkernel, 0, sizeof(cl_mem),
			 &ocl_bem3d.mem_q_xw[current_device]);
    CL_CHECK(res)
      res = clSetKernelArg(oclkernel, 1, sizeof(cl_uint), &nq2);
  }
  else {
    nq2 = ocl_bem3d.nq2;
    res = clSetKernelArg(oclkernel, 0, sizeof(cl_mem),
			 &ocl_bem3d.mem_q2_xw[current_device]);
    CL_CHECK(res)
      res = clSetKernelArg(oclkernel, 1, sizeof(cl_uint), &nq2);
  }
  CL_CHECK(res)
    res = clSetKernelArg(oclkernel, 2, sizeof(cl_mem),
			 &ocl_bem3d.mem_gr_t[current_device]);
  CL_CHECK(res)
    res = clSetKernelArg(oclkernel, 3, sizeof(cl_mem),
			 &ocl_bem3d.mem_gr_x[current_device]);
  CL_CHECK(res)
    res = clSetKernelArg(oclkernel, 4, sizeof(cl_uint), &triangles);
  CL_CHECK(res)
    res = clSetKernelArg(oclkernel, 5, sizeof(cl_mem),
			 &ocl_bem3d.mem_ridx[current_thread]);
  CL_CHECK(res)
    res = clSetKernelArg(oclkernel, 6, sizeof(cl_mem),
			 &ocl_bem3d.mem_cidx[current_thread]);
  CL_CHECK(res)
    res = clSetKernelArg(oclkernel, 7, sizeof(cl_mem),
			 &ocl_bem3d.mem_N[current_thread]);
  CL_CHECK(res)
    res = clSetKernelArg(oclkernel, 8, sizeof(cl_uint), &mdata->pos);
  CL_CHECK(res)
    res = clSetKernelArg(oclkernel, 9, sizeof(field), &k);
  CL_CHECK(res)
    res = clSetKernelArg(oclkernel, 10, sizeof(field), &alpha);
  CL_CHECK(res)

  /****************************************************
   * Invoke the kernel 'assemble_xxx_cc_list_z' on the GPU
   ****************************************************/
    res = clEnqueueNDRangeKernel(queue, oclkernel, 1, global_off, global_size,
				 local_size, 2, h2d, &calc);
  CL_CHECK(res)

  /****************************************************
   * Copy results back.
   ****************************************************/
    res =
    clEnqueueReadBuffer(queue, ocl_bem3d.mem_N[current_thread], CL_TRUE, 0,
			mdata->pos * sizeof(field), mdata->N, 1, &calc, NULL);
  CL_CHECK(res)

    clReleaseEvent(h2d[0]);
  clReleaseEvent(h2d[1]);
  clReleaseEvent(calc);
}
Example #13
0
void clean_all(void) {

		printf("Cleaning Variables ... \n\n");
		
		// Opencl environment variables
		clReleaseCommandQueue(command_queue);
		clReleaseContext(context);
		
		
		// Release all memory allocated

		
		if (Data_MeshType == UNSTRUCTURED) {
	
			// Mesh Variables
			free(MeshElementArray.Node1);
			free(MeshElementArray.Node2);
			free(MeshElementArray.Node3);
			free(MeshElementArray.Node4);
			
			free(MeshNodeArray_double.x);
			free(MeshNodeArray_double.y);
			free(MeshNodeArray_double.z);


			free(MeshElementArray.Neighborindex1);
			free(MeshElementArray.Neighborindex2);
			free(MeshElementArray.Neighborindex3);
			free(MeshElementArray.Neighborindex4);
		
			clReleaseMemObject(Mesh_Node_x);
			clReleaseMemObject(Mesh_Node_y);
			clReleaseMemObject(Mesh_Node_z);
			
			clReleaseMemObject(Mesh_Element_Node1);
			clReleaseMemObject(Mesh_Element_Node2);
			clReleaseMemObject(Mesh_Element_Node3);
			clReleaseMemObject(Mesh_Element_Node4);
			
			clReleaseMemObject(Mesh_Element_Neighborindex1);
			clReleaseMemObject(Mesh_Element_Neighborindex2);
			clReleaseMemObject(Mesh_Element_Neighborindex3);
			clReleaseMemObject(Mesh_Element_Neighborindex4);
			
			clReleaseMemObject(r);
			clReleaseMemObject(s);
			clReleaseMemObject(t);
			clReleaseMemObject(eid);
			
		}

		// Cleaning Velocity variables
		
			free(velocity.u0);
			free(velocity.v0);
			free(velocity.w0);
			free(velocity.u1);
			free(velocity.v1);
			free(velocity.w1);
			free(velocity.time0);
			free(velocity.time1);
			
			free(Tracer.x);
			Tracer.x = NULL;
			free(Tracer.y);
			Tracer.y = NULL;
			
		
			free(Tracer.z);
			Tracer.z = NULL;
		
			free(Tracer.ElementIndex);
			Tracer.ElementIndex = NULL;
			free(Tracer.Start_time);
			Tracer.Start_time = NULL;
			free(Tracer.Stop_time);
			Tracer.Stop_time = NULL;
			free(Tracer.LeftDomain);
			Tracer.LeftDomain = NULL;
			
			if (Trace_ReleaseStrategy == 1) {
				free(Tracer1.x);
				Tracer1.x = NULL;
				free(Tracer1.y);
				Tracer1.y = NULL;
			
		
				free(Tracer1.z);
				Tracer1.z = NULL;
		
				free(Tracer1.ElementIndex);
				Tracer1.ElementIndex = NULL;
				free(Tracer1.Start_time);
				Tracer1.Start_time = NULL;
				free(Tracer1.Stop_time);
				Tracer1.Stop_time = NULL;
				free(Tracer1.LeftDomain);
				Tracer1.LeftDomain = NULL;
			
				free(index1);
				index1 = NULL;
				
				free(Tracer.Status);
				Tracer.Status = NULL;
			}
			free(DataTime1);
			free(Output_time);
			free(Launch_time);
			
			
		clReleaseMemObject(Vel_U0);
		clReleaseMemObject(Vel_U1);
		clReleaseMemObject(Vel_V0);
		clReleaseMemObject(Vel_V1);
		clReleaseMemObject(Vel_W0);
		clReleaseMemObject(Vel_W1);
		
		clReleaseMemObject(x_dev);
		clReleaseMemObject(y_dev);
		
		clReleaseMemObject(posx);
		clReleaseMemObject(posy);
		clReleaseMemObject(xn0);
		clReleaseMemObject(xn1);
		clReleaseMemObject(integrate);
		
		if (Dimensions == 3) {
	
			clReleaseMemObject(z_dev);
			clReleaseMemObject(posz);
			clReleaseMemObject(xn2);
		}
	
		clReleaseMemObject(Start_time_dev);
		clReleaseMemObject(Stop_time_dev);
		
		clReleaseMemObject(ElementIndex_dev);
		clReleaseMemObject(LeftDomain_dev);
		
		// Remove Temp file containing tracer release information
		if (!Keep_Tempfile) {
			char BinFile[LONGSTRING];
			sprintf(BinFile, "%s%s.bin", Path_Output, Temp_OutFilePrefix);
			if(remove(BinFile))
					fprintf(stderr, "Warning: Could not delete file %s\n", BinFile);
		}
		
		CL_CHECK(clReleaseKernel(kernel1));
		CL_CHECK(clReleaseKernel(kernel2));
		CL_CHECK(clReleaseKernel(kernel3));
		CL_CHECK(clReleaseKernel(kernel4));
		CL_CHECK(clReleaseKernel(kernel5));
	
   		CL_CHECK(clReleaseProgram(program));
		printf("Cleaning Successfull \n\n");

}
Example #14
0
void initopencl(void) {
	
	int i;


	// Get Platform and Device Info
	CL_CHECK(clGetPlatformIDs(1, &platform_id, &num_platforms));
	
	// Currently this program only runs on a SINGLE GPU.
	CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &num_devices));
	printf("=== %d OpenCL platform(s) found: ===\n", num_platforms);
	printf("=== %d OpenCL device(s) found on platform:\n", num_devices);
	
	
	char buffer[10240];
	cl_uint buf_uint;
	cl_ulong buf_ulong;
	printf("  -- %d --\n", i);
	CL_CHECK(clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(buffer), buffer, NULL));
	printf("  DEVICE_NAME = %s\n", buffer);
	CL_CHECK(clGetDeviceInfo(device_id, CL_DEVICE_VENDOR, sizeof(buffer), buffer, NULL));
	printf("  DEVICE_VENDOR = %s\n", buffer);
	CL_CHECK(clGetDeviceInfo(device_id, CL_DEVICE_VERSION, sizeof(buffer), buffer, NULL));
	printf("  DEVICE_VERSION = %s\n", buffer);
	CL_CHECK(clGetDeviceInfo(device_id, CL_DRIVER_VERSION, sizeof(buffer), buffer, NULL));
	printf("  DRIVER_VERSION = %s\n", buffer);
	CL_CHECK(clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(buf_uint), &buf_uint, NULL));
	printf("  DEVICE_MAX_COMPUTE_UNITS = %u\n", (unsigned int)buf_uint);
	CL_CHECK(clGetDeviceInfo(device_id, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(buf_uint), &buf_uint, NULL));
	printf("  DEVICE_MAX_CLOCK_FREQUENCY = %u\n", (unsigned int)buf_uint);
	CL_CHECK(clGetDeviceInfo(device_id, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(buf_ulong), &buf_ulong, NULL));
	printf("  DEVICE_GLOBAL_MEM_SIZE = %llu\n", (unsigned long long)buf_ulong);

	if (num_devices == 0)
	{	
		fprintf(stderr, "No Devices found that can run OpenCL.");
		exit(0);	
	}
	// Create OpenCL context
	context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
	if (ret != CL_SUCCESS) {
		
		fprintf(stderr, "Error creating context: Function returned %d \n\n", ret);
		exit(1);
	
	}
	// Create Command Queue
	command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
	if (ret != CL_SUCCESS) {
		
		fprintf(stderr, "Error creating command Queue: Function returned %d \n\n", ret);
		exit(1);
	
	}
	
	// Load the kernel source code into the array source_str
	FILE *fp;
	char *source_str;
	size_t source_size;
	
	fp = fopen("integrate.cl", "r");
	if (!fp) {
	    fprintf(stderr, "Failed to load kernel.\n");
	    exit(1);
	}
	
	source_str = (char*)malloc(MAX_SOURCE_SIZE);
	source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
	fclose( fp );	
	
	
	// Create a program from the kernel source
    program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);	
    if (ret != CL_SUCCESS) {	
			fprintf(stderr, "Error creating a program for integration3D. %d \n\n", (int)ret);
			exit(1);
	}
    // Build the program
    
    ret = clBuildProgram(program, 1, &device_id, "-DUSE_DOUBLE=1", NULL, NULL); 
    if (ret != CL_SUCCESS)
    {
    
    	size_t length;
    	char buffer[10240];
    	clGetProgramBuildInfo(program, 1, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &length);
    	fprintf(stderr, "Error returned %d. \n\n", (int)ret);
    	printf("Error Log: \n\n %s \n\n", buffer);
    	exit(0);
    }
	
/*    // Create the OpenCL kernel (compute_points_Unstructure3D_1)
    kernel1 = clCreateKernel(program, "compute_points_Unstructure3D_1", &ret);
	if (ret != CL_SUCCESS) {	
			fprintf(stderr, "Error creating a kernel for compute_points_Unstructure3D_1. \n\n");
			exit(1);
	}
*/	
	// Create the OpenCL kernel (check_int)
    kernel2 = clCreateKernel(program, "check_int", &ret);
	if (ret != CL_SUCCESS) {	
			fprintf(stderr, "Error creating a kernel for check_int. %d \n\n", (int)ret);
			exit(1);
	}
	
    
    // Create the OpenCL kernel (compute_points_Unstructure3D_1)
    kernel1 = clCreateKernel(program, "compute_points_Unstructure3D_1", &ret);
	if (ret != CL_SUCCESS) {	
			fprintf(stderr, "Error creating a kernel for compute_points_Unstructure3D_1. \n\n");
			exit(1);
	}
	
	// Create the OpenCL kernel (initialize_timestep3D)
	kernel3 = clCreateKernel(program, "initialize_timestep3D", &ret);
	if (ret != CL_SUCCESS) {	
			fprintf(stderr, "Error creating a kernel for initialize_timestep3D. \n\n");
			exit(1);
	}
	
	// Create the OpenCL kernel (initialize_timestep3D)
    kernel4 = clCreateKernel(program, "LocalSearch3D", &ret);
	if (ret != CL_SUCCESS) {	
			fprintf(stderr, "Error creating a kernel for LocalSearch3D. \n\n");
			exit(1);
	}
	
	// Create the OpenCL kernel (initialize_timestep3D)
    kernel5 = clCreateKernel(program, "compute_points_Unstructure3D_2", &ret);
	if (ret != CL_SUCCESS) {	
			fprintf(stderr, "Error creating a kernel for LocalSearch3D. \n\n");
			exit(1);
	}
	
	
	
	printf("\n\n");
}
Example #15
0
clblasStatus SGEMM_SPLIT64_32(
	clblasTranspose transA,
	clblasTranspose transB,
	cl_uint M, cl_uint N, cl_uint K,
	float alpha,
	cl_mem A, cl_uint offA, cl_uint lda,
	cl_mem B, cl_uint offB, cl_uint ldb,
	float beta,
	cl_mem C, cl_uint offC, cl_uint ldc,
	cl_uint numCommandQueues,
	cl_command_queue *commandQueues,
	cl_uint numEventsInWaitList,
	const cl_event *eventWaitList,
	cl_event *events,
	bool &specialCaseHandled)
{
	//all the mod32 sizes that is not mod64 or mod96 ranging from 1184 to 3872 
	//non mod32 cases are not implemented in this approach and are of less interest
	const char *tileKernelSource = NULL;
	const char *rowKernelSource = NULL;
	const char *columnKernelSource = NULL;
	const char *singleKernelSource = NULL;

	cl_kernel  *tileClKernel = NULL;
	cl_kernel  *rowClKernel = NULL;
	cl_kernel  *columnClKernel = NULL;
	cl_kernel  *singleClKernel = NULL;

	const unsigned char *tileKernelBinary = NULL;
	const unsigned char *rowKernelBinary = NULL;
	const unsigned char *columnKernelBinary = NULL;
	const unsigned char *singleKernelBinary = NULL;

	size_t tileKernelBinarySize = 0;
	size_t rowKernelBinarySize = 0;
	size_t columnKernelBinarySize = 0;
	size_t singleKernelBinarySize = 0;

	cl_int err;
	
	if ((M >= 1184 && N >= 1184) && (M <= 3872 && N <= 3872) && (M % 64 != 0 && N % 64 != 0) && (M % 96 != 0 && N % 96 != 0) && (K % 16 == 0))
	{
		if ((M % 32 == 0 && N % 32 == 0) && (transA == clblasNoTrans && transB == clblasTrans))
		{
			specialCaseHandled = true;
			//execute the kernels

			//GlobalX = ((Mvalue - 1) / 64) * 16
			//GlobalY = ((Nvalue - 1) / 64) * 16
			size_t GlobalX = ((M - 1) / 64) * 16;
			size_t GlobalY = ((N - 1) / 64) * 16;
			size_t gs[2] = { GlobalX, GlobalY };
			size_t wgsize[2] = { 16, 16 };

			tileKernelSource = sgemm_Col_NT_B1_MX064_NX064_KX16_src;
			tileClKernel = &sgemm_Col_NT_B1_MX064_NX064_KX16_clKernel;
			tileKernelBinary = sgemm_Col_NT_B1_MX064_NX064_KX16_bin;
			tileKernelBinarySize = sgemm_Col_NT_B1_MX064_NX064_KX16_binSize;

			rowKernelSource = sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_src;
			rowClKernel = &sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_clKernel;
			rowKernelBinary = sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_bin;
			rowKernelBinarySize = sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_binSize;

			columnKernelSource = sgemm_Col_NT_B1_MX064_NX032_KX16_COLUMN_src;
			columnClKernel = &sgemm_Col_NT_B1_MX064_NX032_KX16_COLUMN_clKernel;
			columnKernelBinary = sgemm_Col_NT_B1_MX064_NX032_KX16_COLUMN_bin;
			columnKernelBinarySize = sgemm_Col_NT_B1_MX064_NX032_KX16_COLUMN_binSize;

			singleKernelSource = sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_src;
			singleClKernel = &sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_clKernel;
			singleKernelBinary = sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_bin;
			singleKernelBinarySize = sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_binSize;

			cl_kernel * Kernels[4] = { tileClKernel, rowClKernel, columnClKernel, singleClKernel };


			makeGemmKernel(tileClKernel, commandQueues[0], tileKernelSource, User_srcBuildOptions, &tileKernelBinary, &tileKernelBinarySize, User_binBuildOptions);
			makeGemmKernel(rowClKernel, commandQueues[0], rowKernelSource, User_srcBuildOptions, &rowKernelBinary, &rowKernelBinarySize, User_binBuildOptions);
			makeGemmKernel(columnClKernel, commandQueues[0], columnKernelSource, User_srcBuildOptions, &columnKernelBinary, &columnKernelBinarySize, User_binBuildOptions);
			makeGemmKernel(singleClKernel, commandQueues[0], singleKernelSource, User_srcBuildOptions, &singleKernelBinary, &singleKernelBinarySize, User_binBuildOptions);

			for (int i = 0; i < 4; i++)
			{
				err = clSetKernelArg(*Kernels[i], 0, sizeof(cl_mem), &A);
				CL_CHECK(err);
				err = clSetKernelArg(*Kernels[i], 1, sizeof(cl_mem), &B);
				CL_CHECK(err);
				err = clSetKernelArg(*Kernels[i], 2, sizeof(cl_mem), &C);
				CL_CHECK(err);
				err = clSetKernelArg(*Kernels[i], 3, sizeof(cl_float), &alpha);
				CL_CHECK(err);
				err = clSetKernelArg(*Kernels[i], 4, sizeof(cl_float), &beta);
				CL_CHECK(err);
				err = clSetKernelArg(*Kernels[i], 5, sizeof(cl_uint), &M);
				CL_CHECK(err);
				err = clSetKernelArg(*Kernels[i], 6, sizeof(cl_uint), &N);
				CL_CHECK(err);
				err = clSetKernelArg(*Kernels[i], 7, sizeof(cl_uint), &K);
				CL_CHECK(err);
				err = clSetKernelArg(*Kernels[i], 8, sizeof(cl_uint), &lda);
				CL_CHECK(err);
				err = clSetKernelArg(*Kernels[i], 9, sizeof(cl_uint), &ldb);
				CL_CHECK(err);
				err = clSetKernelArg(*Kernels[i], 10, sizeof(cl_uint), &ldc);
				CL_CHECK(err);
				err = clSetKernelArg(*Kernels[i], 11, sizeof(cl_uint), &offA);
				CL_CHECK(err);
				err = clSetKernelArg(*Kernels[i], 12, sizeof(cl_uint), &offB);
				CL_CHECK(err);
				err = clSetKernelArg(*Kernels[i], 13, sizeof(cl_uint), &offC);
				CL_CHECK(err);
			}

			err = clEnqueueNDRangeKernel(commandQueues[0], *Kernels[0], 2, NULL, gs, wgsize, numEventsInWaitList, eventWaitList, NULL);

			gs[0] = 16;
			err |= clEnqueueNDRangeKernel(commandQueues[0], *Kernels[1], 2, NULL, gs, wgsize, 0, NULL, NULL);

			gs[1] = 16;
			gs[0] = GlobalX;
			err |= clEnqueueNDRangeKernel(commandQueues[0], *Kernels[2], 2, NULL, gs, wgsize, 0, NULL, NULL);

			gs[0] = 16; gs[1] = 16;
			err |= clEnqueueNDRangeKernel(commandQueues[0], *Kernels[3], 2, NULL, gs, wgsize, 0, NULL, events);

			if (err == 0)
				return clblasSuccess;

		}
	}
	
	return clblasNotImplemented;
}
Example #16
0
    void clPrintDevices()
    {
        cl_int err;

        // Enumerate platforms.
        cl_platform_id platforms[8];
        cl_uint numPlatforms;
        CL_CHECK(clGetPlatformIDs(8, platforms, &numPlatforms));

        for (cl_uint ii = 0; ii < numPlatforms; ++ii)
        {
            // Get platform vendor.
            char vendor[256];
            CL_CHECK(clGetPlatformInfo(platforms[ii], CL_PLATFORM_VENDOR, 256, vendor, NULL));

            // Check for known vendors and save vendor str for later printing.
            char platformOutputStr[32];
            if (NULL != bx::stristr(vendor, "advanced micro devices", 256))
            {
                dm::strscpya(platformOutputStr, "amd");
            }
            else if (NULL != bx::stristr(vendor, "intel", 256))
            {
                dm::strscpya(platformOutputStr, "intel");
            }
            else if (NULL != bx::stristr(vendor, "nvidia", 256))
            {
                dm::strscpya(platformOutputStr, "nvidia");
            }
            else
            {
                dm::strscpya(platformOutputStr, dm::trim(vendor));
            }

            // Enumerate current platform devices.
            cl_device_id devices[8];
            cl_uint numDevices;
            err = clGetDeviceIDs(platforms[ii], CL_DEVICE_TYPE_ALL, 8, devices, &numDevices);
            if (CL_SUCCESS == err)
            {
                for (cl_uint jj = 0; jj < numDevices; ++jj)
                {
                    // Get device name.
                    char deviceName[128];
                    CL_CHECK(clGetDeviceInfo(devices[jj], CL_DEVICE_NAME, sizeof(deviceName), deviceName, NULL));

                    // Get device type.
                    cl_device_type deviceType;
                    CL_CHECK(clGetDeviceInfo(devices[jj], CL_DEVICE_TYPE, sizeof(deviceType), &deviceType, NULL));

                    // Get device type str.
                    char deviceTypeOutputStr[16];
                    if (CMFT_CL_DEVICE_TYPE_GPU == deviceType)
                    {
                        dm::strscpya(deviceTypeOutputStr, "gpu");
                    }
                    else if (CMFT_CL_DEVICE_TYPE_CPU == deviceType)
                    {
                        dm::strscpya(deviceTypeOutputStr, "cpu");
                    }
                    else if (CMFT_CL_DEVICE_TYPE_ACCELERATOR == deviceType)
                    {
                        dm::strscpya(deviceTypeOutputStr, "accelerator");
                    }
                    else //if (CMFT_CL_DEVICE_TYPE_DEFAULT == deviceType)
                    {
                        dm::strscpya(deviceTypeOutputStr, "default");
                    }

                    // Print device info.
                    INFO("%-40s --clVendor %-6s --deviceIndex %u --deviceType %s"
                        , dm::trim(deviceName)
                        , platformOutputStr
                        , uint32_t(jj)
                        , deviceTypeOutputStr
                        );
                }
            }
        }
    }
Example #17
0
clblasStatus SGEMM_BRANCH_32(
	clblasTranspose transA,
	clblasTranspose transB,
	cl_uint M, cl_uint N, cl_uint K,
	float alpha,
	cl_mem A, cl_uint offA, cl_uint lda,
	cl_mem B, cl_uint offB, cl_uint ldb,
	float beta,
	cl_mem C, cl_uint offC, cl_uint ldc,
	cl_uint numCommandQueues,
	cl_command_queue *commandQueues,
	cl_uint numEventsInWaitList,
	const cl_event *eventWaitList,
	cl_event *events,
	bool &specialCaseHandled)
{
	const char *tileKernelSource = NULL;
	cl_kernel  *tileClKernel = NULL;
	size_t tileKernelBinarySize = 0;
	cl_int err;


	const unsigned char *tileKernelBinary = NULL;

	clblasStatus status;

	if ((M * N < 1080 * 1080) && (M % 32 != 0 || N % 32 != 0) && (K%16==0))
	{
		// ((Mvalue - 1) / 32 + 1) * 16
		size_t GlobalX = ((M - 1) / 32 + 1) * 16;
		size_t GlobalY = ((N - 1) / 32 + 1) * 16;
		size_t gs[2] = { GlobalX, GlobalY };
		size_t wgsize[2] = { 16, 16 };

		if (transA == clblasNoTrans && transB == clblasNoTrans)
		{
			specialCaseHandled = true;
			tileKernelSource = sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_src;
			tileClKernel = &sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_clKernel;
			tileKernelBinary = sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_bin;
			tileKernelBinarySize = sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_binSize;

			makeGemmKernel(tileClKernel, commandQueues[0], tileKernelSource, User_srcBuildOptions, &tileKernelBinary, &tileKernelBinarySize, User_binBuildOptions);

			err = clSetKernelArg(*tileClKernel, 0, sizeof(cl_mem), &A);
			CL_CHECK(err);
			err = clSetKernelArg(*tileClKernel, 1, sizeof(cl_mem), &B);
			CL_CHECK(err);
			err = clSetKernelArg(*tileClKernel, 2, sizeof(cl_mem), &C);
			CL_CHECK(err);
			err = clSetKernelArg(*tileClKernel, 3, sizeof(cl_float), &alpha);
			CL_CHECK(err);
			err = clSetKernelArg(*tileClKernel, 4, sizeof(cl_float), &beta);
			CL_CHECK(err);
			err = clSetKernelArg(*tileClKernel, 5, sizeof(cl_uint), &M);
			CL_CHECK(err);
			err = clSetKernelArg(*tileClKernel, 6, sizeof(cl_uint), &N);
			CL_CHECK(err);
			err = clSetKernelArg(*tileClKernel, 7, sizeof(cl_uint), &K);
			CL_CHECK(err);
			err = clSetKernelArg(*tileClKernel, 8, sizeof(cl_uint), &lda);
			CL_CHECK(err);
			err = clSetKernelArg(*tileClKernel, 9, sizeof(cl_uint), &ldb);
			CL_CHECK(err);
			err = clSetKernelArg(*tileClKernel, 10, sizeof(cl_uint), &ldc);
			CL_CHECK(err);
			err = clSetKernelArg(*tileClKernel, 11, sizeof(cl_uint), &offA);
			CL_CHECK(err);
			err = clSetKernelArg(*tileClKernel, 12, sizeof(cl_uint), &offB);
			CL_CHECK(err);
			err = clSetKernelArg(*tileClKernel, 13, sizeof(cl_uint), &offC);
			CL_CHECK(err);

			err = clEnqueueNDRangeKernel(commandQueues[0], *tileClKernel, 2, NULL,
				gs, wgsize, numEventsInWaitList, eventWaitList, &events[0]);

			if (err == 0)
				return clblasSuccess;
		}
		if (transA == clblasNoTrans && transB == clblasTrans)
		{
			specialCaseHandled = true;
			tileKernelSource = sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_src;
			tileClKernel = &sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_clKernel;
			tileKernelBinary = sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_bin;
			tileKernelBinarySize = sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_binSize;

			makeGemmKernel(tileClKernel, commandQueues[0], tileKernelSource, User_srcBuildOptions, &tileKernelBinary, &tileKernelBinarySize, User_binBuildOptions);

			err = clSetKernelArg(*tileClKernel, 0, sizeof(cl_mem), &A);
			CL_CHECK(err);
			err = clSetKernelArg(*tileClKernel, 1, sizeof(cl_mem), &B);
			CL_CHECK(err);
			err = clSetKernelArg(*tileClKernel, 2, sizeof(cl_mem), &C);
			CL_CHECK(err);
			err = clSetKernelArg(*tileClKernel, 3, sizeof(cl_float), &alpha);
			CL_CHECK(err);
			err = clSetKernelArg(*tileClKernel, 4, sizeof(cl_float), &beta);
			CL_CHECK(err);
			err = clSetKernelArg(*tileClKernel, 5, sizeof(cl_uint), &M);
			CL_CHECK(err);
			err = clSetKernelArg(*tileClKernel, 6, sizeof(cl_uint), &N);
			CL_CHECK(err);
			err = clSetKernelArg(*tileClKernel, 7, sizeof(cl_uint), &K);
			CL_CHECK(err);
			err = clSetKernelArg(*tileClKernel, 8, sizeof(cl_uint), &lda);
			CL_CHECK(err);
			err = clSetKernelArg(*tileClKernel, 9, sizeof(cl_uint), &ldb);
			CL_CHECK(err);
			err = clSetKernelArg(*tileClKernel, 10, sizeof(cl_uint), &ldc);
			CL_CHECK(err);
			err = clSetKernelArg(*tileClKernel, 11, sizeof(cl_uint), &offA);
			CL_CHECK(err);
			err = clSetKernelArg(*tileClKernel, 12, sizeof(cl_uint), &offB);
			CL_CHECK(err);
			err = clSetKernelArg(*tileClKernel, 13, sizeof(cl_uint), &offC);
			CL_CHECK(err);

			err = clEnqueueNDRangeKernel(commandQueues[0], *tileClKernel, 2, NULL,
				gs, wgsize, numEventsInWaitList, eventWaitList, &events[0]);

			if (err == 0)
				return clblasSuccess;
		}
		if (transA == clblasTrans && transB == clblasNoTrans)
		{
			specialCaseHandled = true;
			tileKernelSource = sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_src;
			tileClKernel = &sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_clKernel;
			tileKernelBinary = sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_bin;
			tileKernelBinarySize = sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_binSize;

			makeGemmKernel(tileClKernel, commandQueues[0], tileKernelSource, User_srcBuildOptions, &tileKernelBinary, &tileKernelBinarySize, User_binBuildOptions);

			err = clSetKernelArg(*tileClKernel, 0, sizeof(cl_mem), &A);
			CL_CHECK(err);
			err = clSetKernelArg(*tileClKernel, 1, sizeof(cl_mem), &B);
			CL_CHECK(err);
			err = clSetKernelArg(*tileClKernel, 2, sizeof(cl_mem), &C);
			CL_CHECK(err);
			err = clSetKernelArg(*tileClKernel, 3, sizeof(cl_float), &alpha);
			CL_CHECK(err);
			err = clSetKernelArg(*tileClKernel, 4, sizeof(cl_float), &beta);
			CL_CHECK(err);
			err = clSetKernelArg(*tileClKernel, 5, sizeof(cl_uint), &M);
			CL_CHECK(err);
			err = clSetKernelArg(*tileClKernel, 6, sizeof(cl_uint), &N);
			CL_CHECK(err);
			err = clSetKernelArg(*tileClKernel, 7, sizeof(cl_uint), &K);
			CL_CHECK(err);
			err = clSetKernelArg(*tileClKernel, 8, sizeof(cl_uint), &lda);
			CL_CHECK(err);
			err = clSetKernelArg(*tileClKernel, 9, sizeof(cl_uint), &ldb);
			CL_CHECK(err);
			err = clSetKernelArg(*tileClKernel, 10, sizeof(cl_uint), &ldc);
			CL_CHECK(err);
			err = clSetKernelArg(*tileClKernel, 11, sizeof(cl_uint), &offA);
			CL_CHECK(err);
			err = clSetKernelArg(*tileClKernel, 12, sizeof(cl_uint), &offB);
			CL_CHECK(err);
			err = clSetKernelArg(*tileClKernel, 13, sizeof(cl_uint), &offC);
			CL_CHECK(err);

			err = clEnqueueNDRangeKernel(commandQueues[0], *tileClKernel, 2, NULL,
				gs, wgsize, numEventsInWaitList, eventWaitList, &events[0]);

			if (err == 0)
				return clblasSuccess;
		}
	}

	return clblasNotImplemented;
}
Example #18
0
    bool ClContext::init(uint8_t _vendor
                       , cl_device_type _preferredDeviceType
                       , cl_uint _preferredDeviceIdx
                       , char* _vendorStrPart
                       )
    {
        cl_int err = CL_SUCCESS;

        // Enumerate platforms.
        cl_platform_id platforms[8];
        cl_uint numPlatforms;
        CL_CHECK(clGetPlatformIDs(8, platforms, &numPlatforms));

        // Choose preferred platform.
        cl_platform_id choosenPlatform = platforms[0];
        if (NULL != _vendorStrPart)
        {
            char buffer[256];
            for (cl_uint ii = 0; ii < numPlatforms; ++ii)
            {
                // Get platform vendor.
                CL_CHECK(clGetPlatformInfo(platforms[ii], CL_PLATFORM_VENDOR, 256, buffer, NULL));

                if (_vendor&CMFT_CL_VENDOR_OTHER)
                {
                    // If specific vendor is requested, check for it.
                    const char* searchSpecific = bx::stristr(buffer, _vendorStrPart, 256);
                    if (NULL != searchSpecific)
                    {
                        choosenPlatform = platforms[ii];
                        break;
                    }
                }
                else
                {
                    bool found = false;

                    // Check for predefined vendors.
                    if (_vendor&CMFT_CL_VENDOR_AMD)
                    {
                        const char* searchAmd = bx::stristr(buffer, "advanced micro devices", 256);
                        found |= (NULL != searchAmd);
                    }

                    if (_vendor&CMFT_CL_VENDOR_INTEL)
                    {
                        const char* searchIntel  = bx::stristr(buffer, "intel", 256);
                        found |= (NULL != searchIntel);
                    }

                    if (_vendor&CMFT_CL_VENDOR_NVIDIA)
                    {
                        const char* searchNvidia = bx::stristr(buffer, "nvidia", 256);
                        found |= (NULL != searchNvidia);
                    }

                    if (found)
                    {
                        choosenPlatform = platforms[ii];
                        break;
                    }
                }
            }
        }

        // Enumerate devices.
        cl_device_id devices[8];
        cl_uint numDevices = 0;

        // First try to get preferred device type.
        for (cl_uint ii = 0; ii < numPlatforms; ++ii)
        {
            err = clGetDeviceIDs(platforms[ii], _preferredDeviceType, 8, devices, &numDevices);
            if (CL_SUCCESS == err)
            {
                choosenPlatform = platforms[ii];
                break;
            }
        }

        // If failed, just get anything there is.
        if (CL_SUCCESS != err)
        {
            for (cl_uint ii = 0; ii < numPlatforms; ++ii)
            {
                err = clGetDeviceIDs(platforms[ii], CL_DEVICE_TYPE_ALL, 8, devices, &numDevices);
                if (CL_SUCCESS == err)
                {
                    choosenPlatform = platforms[ii];
                    break;
                }
            }
        }

        if (CL_SUCCESS != err)
        {
            WARN("OpenCL context initialization failed!");
            return false;
        }

        // Choose preferred device and create context.
        cl_uint preferredDeviceIdx = (_preferredDeviceIdx < numDevices) ? _preferredDeviceIdx : 0;
        cl_device_id chosenDevice = devices[preferredDeviceIdx];
        cl_context context = clCreateContext(NULL, 1, &chosenDevice, NULL, NULL, &err);
        if (CL_SUCCESS != err)
        {
            chosenDevice = devices[0];
            context = clCreateContext(NULL, 1, &chosenDevice, NULL, NULL, &err);
            if (CL_SUCCESS != err)
            {
                WARN("OpenCL context initialization failed!");
                return false;
            }
        }

        // Get device name, vendor and type.
        char deviceVendor[128];
        CL_CHECK(clGetPlatformInfo(choosenPlatform, CL_PLATFORM_VENDOR, sizeof(deviceVendor), deviceVendor, NULL));
        char deviceName[128];
        CL_CHECK(clGetDeviceInfo(chosenDevice, CL_DEVICE_NAME, sizeof(deviceName), deviceName, NULL));
        CL_CHECK(clGetDeviceInfo(chosenDevice, CL_DEVICE_TYPE, sizeof(m_deviceType), &m_deviceType, NULL));

        // Create command queue
        cl_command_queue commandQueue;
        commandQueue = clCreateCommandQueue(context, chosenDevice, 0, &err);
        if (CL_SUCCESS != err)
        {
            WARN("OpenCL context initialization failed!");
            return false;
        }

        // Fill structure.
        dm::strscpya(m_deviceVendor, dm::trim(deviceVendor));
        dm::strscpya(m_deviceName, dm::trim(deviceName));
        m_device = chosenDevice;
        m_context = context;
        m_commandQueue = commandQueue;

        return true;
    }
Example #19
0
int main(int argc, char **argv)
{
	cl_platform_id platforms[100];
	cl_uint platforms_n = 0;
	CL_CHECK(clGetPlatformIDs(100, platforms, &platforms_n));

	printf("=== %d OpenCL platform(s) found: ===\n", platforms_n);
	for (int i=0; i<platforms_n; i++)
	{
		char buffer[10240];
		printf("  -- %d --\n", i);
		CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, 10240, buffer, NULL));
		printf("  PROFILE = %s\n", buffer);
		CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, 10240, buffer, NULL));
		printf("  VERSION = %s\n", buffer);
		CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 10240, buffer, NULL));
		printf("  NAME = %s\n", buffer);
		CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 10240, buffer, NULL));
		printf("  VENDOR = %s\n", buffer);
		CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, 10240, buffer, NULL));
		printf("  EXTENSIONS = %s\n", buffer);
	}

	if (platforms_n == 0)
		return 1;

	cl_device_id devices[100];
	cl_uint devices_n = 0;
	// CL_CHECK(clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, 100, devices, &devices_n));
	CL_CHECK(clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 100, devices, &devices_n));

	printf("=== %d OpenCL device(s) found on platform:\n", platforms_n);
	for (int i=0; i<devices_n; i++)
	{
		char buffer[10240];
		cl_uint buf_uint;
		cl_ulong buf_ulong;
		printf("  -- %d --\n", i);
		CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(buffer), buffer, NULL));
		printf("  DEVICE_NAME = %s\n", buffer);
		CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_VENDOR, sizeof(buffer), buffer, NULL));
		printf("  DEVICE_VENDOR = %s\n", buffer);
		CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_VERSION, sizeof(buffer), buffer, NULL));
		printf("  DEVICE_VERSION = %s\n", buffer);
		CL_CHECK(clGetDeviceInfo(devices[i], CL_DRIVER_VERSION, sizeof(buffer), buffer, NULL));
		printf("  DRIVER_VERSION = %s\n", buffer);
		CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(buf_uint), &buf_uint, NULL));
		printf("  DEVICE_MAX_COMPUTE_UNITS = %u\n", (unsigned int)buf_uint);
		CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(buf_uint), &buf_uint, NULL));
		printf("  DEVICE_MAX_CLOCK_FREQUENCY = %u\n", (unsigned int)buf_uint);
		CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(buf_ulong), &buf_ulong, NULL));
		printf("  DEVICE_GLOBAL_MEM_SIZE = %llu\n", (unsigned long long)buf_ulong);
	}

	if (devices_n == 0)
		return 1;

	cl_context context;
	context = CL_CHECK_ERR(clCreateContext(NULL, 1, devices, &pfn_notify, NULL, &_err));

	const char *program_source[] = {
		"__kernel void simple_demo(__global int *src, __global int *dst, int factor)\n",
		"{\n",
		"	int i = get_global_id(0);\n",
		"	dst[i] = src[i] * factor;\n",
		"}\n"
	};

	cl_program program;
	program = CL_CHECK_ERR(clCreateProgramWithSource(context, sizeof(program_source)/sizeof(*program_source), program_source, NULL, &_err));
	if (clBuildProgram(program, 1, devices, "", NULL, NULL) != CL_SUCCESS) {
		char buffer[10240];
		clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, NULL);
		fprintf(stderr, "CL Compilation failed:\n%s", buffer);
		abort();
	}
	CL_CHECK(clUnloadCompiler());

	cl_mem input_buffer;
	input_buffer = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int)*NUM_DATA, NULL, &_err));

	cl_mem output_buffer;
	output_buffer = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(int)*NUM_DATA, NULL, &_err));

	int factor = 2;

	cl_kernel kernel;
	kernel = CL_CHECK_ERR(clCreateKernel(program, "simple_demo", &_err));
	CL_CHECK(clSetKernelArg(kernel, 0, sizeof(input_buffer), &input_buffer));
	CL_CHECK(clSetKernelArg(kernel, 1, sizeof(output_buffer), &output_buffer));
	CL_CHECK(clSetKernelArg(kernel, 2, sizeof(factor), &factor));

	cl_command_queue queue;
	queue = CL_CHECK_ERR(clCreateCommandQueue(context, devices[0], 0, &_err));

	for (int i=0; i<NUM_DATA; i++) {
		CL_CHECK(clEnqueueWriteBuffer(queue, input_buffer, CL_TRUE, i*sizeof(int), sizeof(int), &i, 0, NULL, NULL));
	}

	cl_event kernel_completion;
	size_t global_work_size[1] = { NUM_DATA };
	CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, NULL, 0, NULL, &kernel_completion));
	CL_CHECK(clWaitForEvents(1, &kernel_completion));
	CL_CHECK(clReleaseEvent(kernel_completion));

	printf("Result:");
	for (int i=0; i<NUM_DATA; i++) {
		int data;
		CL_CHECK(clEnqueueReadBuffer(queue, output_buffer, CL_TRUE, i*sizeof(int), sizeof(int), &data, 0, NULL, NULL));
		printf(" %d", data);
	}
	printf("\n");

	CL_CHECK(clReleaseMemObject(input_buffer));
	CL_CHECK(clReleaseMemObject(output_buffer));

	CL_CHECK(clReleaseKernel(kernel));
	CL_CHECK(clReleaseProgram(program));
	CL_CHECK(clReleaseContext(context));

	return 0;
}
Example #20
0
int main(int argc, char *argv[])
{
    //FILE *fp;

    cl_platform_id      platform_id[2];
    cl_uint             ret_num_devices;
    cl_uint             ret_num_platforms;
    cl_int              ret_code;

    cl_mem              image_in_mem = NULL;
    cl_mem              image_out_mem = NULL;
    cl_mem              twiddle_factors_mem = NULL;
    cl_float2           *image_in_host;
    cl_float2           *twiddle_factors_host;

    cl_kernel           kernel_twiddle_factors;
    cl_kernel           kernel_matriz_transpose;
    cl_kernel           kernel_lowpass_filter;

    pgm_t ipgm;
    pgm_t opgm;

    image_file_t        *image_filename;
    char                *output_filename;
    FILE                *fp;
    const char          *kernel_filename = C_NOME_ARQ_KERNEL;
    size_t              source_size;
    char                *source_str;
    cl_int              i, j,n ,m;
    cl_int              raio = 0;
    size_t              global_wg[2];
    size_t              local_wg[2];
    float               *image_amplitudes;
    size_t              log_size;
    char                *log_file;

    cl_event            kernels_events_out_fft[4];

    cl_ulong            kernel_runtime              = (cl_ulong) 0;
    cl_ulong            kernel_start_time           = (cl_ulong) 0;
    cl_ulong            kernel_end_time             = (cl_ulong) 0;

    cl_event            write_host_dev_event;
    cl_ulong            write_host_dev_start_time   = (cl_ulong) 0;
    cl_ulong            write_host_dev_end_time     = (cl_ulong) 0;
    cl_ulong            write_host_dev_run_time     = (cl_ulong) 0;

    cl_event            read_dev_host_event;
    cl_ulong            read_dev_host_start_time    = (cl_ulong) 0;
    cl_ulong            read_dev_host_end_time      = (cl_ulong) 0;
    cl_ulong            read_dev_host_run_time      = (cl_ulong) 0;

    unsigned __int64    image_tam;
    unsigned __int64    MEGA_BYTES   =  1048576; // 1024*1024
    double              image_tam_MB;
    double              tempo_total;

    struct event_in_fft_t *fft_events;


   //=== Timer count start ==============================================================================
    timer_reset();
    timer_start();
    //===================================================================================================

    if (argc < 2) {
        printf("**Erro: O arquivo de entrada eh necessario.\n");
        exit(EXIT_FAILURE);
    }

    image_filename = (image_file_t *) malloc(sizeof(image_file_t));
    split_image_filename(image_filename, argv[1]);
    output_filename = (char *) malloc(40*sizeof(char));
    sprintf(output_filename, "%d.%d.%s.%s.%s", image_filename->res, image_filename->num, ENV_TYPE, APP_TYPE, EXTENSAO);

    fp = fopen(kernel_filename, "r");
    if (!fp) {
        fprintf(stderr, "Failed to load kernel.\n");
        exit(EXIT_FAILURE);
    }

    source_str = (char *)malloc(MAX_SOURCE_SIZE);
    source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
    fclose( fp );

    //===================================================================================================
     /* Abrindo imagem do arquivo para objeto de memoria local*/
    if( ler_pgm(&ipgm, argv[1]) == -1)
        exit(EXIT_FAILURE);

    n = ipgm.width;
    raio = n/8;
    m = (cl_int)(log((double)n)/log(2.0));

    image_in_host = (cl_float2 *)malloc((n*n)*sizeof(cl_float2));
    twiddle_factors_host = (cl_float2 *)malloc(n / 2 * sizeof(cl_float2));

    for (i = 0; i < n; i++) {
        for (j = 0; j < n; j++) {
            image_in_host[n*i + j].s[0] = (float)ipgm.buf[n*i + j];
            image_in_host[n*i + j].s[1] = (float)0;
        }
    }

    fft_events = (struct event_in_fft_t *)malloc(MAX_CALL_FFT*sizeof(struct event_in_fft_t));

    kernel_butter_events = (cl_event *)malloc(MAX_CALL_FFT*m*sizeof(cl_event));

    //===================================================================================================
    CL_CHECK(clGetPlatformIDs(MAX_PLATFORM_ID, platform_id, &ret_num_platforms));

    if (ret_num_platforms == 0 ) {
        fprintf(stderr,"[Erro] Não existem plataformas OpenCL\n");
        exit(2);
    }

    //===================================================================================================

    CL_CHECK(clGetDeviceIDs( platform_id[0], CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices));
    //print_platform_info(&platform_id[1]);

    //===================================================================================================
    context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret_code);
    //===================================================================================================

    cmd_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &ret_code);
    //===================================================================================================

    image_in_mem = clCreateBuffer(context, CL_MEM_READ_WRITE, n*n*sizeof(cl_float2), NULL, &ret_code);
    image_out_mem = clCreateBuffer(context, CL_MEM_READ_WRITE, n*n*sizeof(cl_float2), NULL, &ret_code);
    twiddle_factors_mem = clCreateBuffer(context, CL_MEM_READ_WRITE, (n/2)*sizeof(cl_float2), NULL, &ret_code);
    //===================================================================================================

    /* Transfer data to memory buffer */
    CL_CHECK(clEnqueueWriteBuffer(cmd_queue, image_in_mem, CL_TRUE, 0, n*n*sizeof(cl_float2), image_in_host, 0, NULL, &write_host_dev_event));

    image_tam = n*n*sizeof(cl_float2);

    //===================================================================================================
    program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret_code);
    //===================================================================================================
    ret_code = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
    //===================================================================================================
    if (ret_code != CL_SUCCESS) {
    // Determine the size of the log
    clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
    //===================================================================================================

    // Allocate memory for the log
    log_file = (char *) malloc(log_size);

    // Get the log
    clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, log_size, log_file, NULL);
    printf("%s\n", log_file);
    system("pause");
    exit(0);
}
    kernel_twiddle_factors = clCreateKernel(program, "twiddle_factors", &ret_code);
    kernel_matriz_transpose = clCreateKernel(program, "matrix_trasponse", &ret_code);
    kernel_lowpass_filter  = clCreateKernel(program, "lowpass_filter", &ret_code);

    /* Processa os fatores Wn*/
    //===================================================================================================
    CL_CHECK(clSetKernelArg(kernel_twiddle_factors, 0, sizeof(cl_mem), (void *)&twiddle_factors_mem));
    CL_CHECK(clSetKernelArg(kernel_twiddle_factors, 1, sizeof(cl_int), (void *)&n));
    config_workgroup_size(global_wg, local_wg, n/2, 1);
    CL_CHECK(clEnqueueNDRangeKernel(cmd_queue, kernel_twiddle_factors, 1, NULL, global_wg, local_wg, 0, NULL, &kernels_events_out_fft[0]));

    //===================================================================================================
    /* Executa a FFT em N/2 */
    fft_main(image_out_mem, image_in_mem, twiddle_factors_mem, m, direta, &fft_events[0]);

    //===================================================================================================
    /* Realiza a transposta da Matriz (imagem) */
    CL_CHECK(clSetKernelArg(kernel_matriz_transpose, 0, sizeof(cl_mem), (void *)&image_in_mem));
    CL_CHECK(clSetKernelArg(kernel_matriz_transpose, 1, sizeof(cl_mem), (void *)&image_out_mem));
    CL_CHECK(clSetKernelArg(kernel_matriz_transpose, 2, sizeof(cl_int), (void *)&n));
    config_workgroup_size(global_wg, local_wg, n, n);
    CL_CHECK(clEnqueueNDRangeKernel(cmd_queue, kernel_matriz_transpose, 2, NULL, global_wg, local_wg, 0, NULL, &kernels_events_out_fft[1]));

    //===================================================================================================
    /* Executa a FFT N/2 */
    fft_main(image_out_mem, image_in_mem, twiddle_factors_mem, m, direta, &fft_events[1]);

    //===================================================================================================
    /* Processa o filtro passa baixa */
    CL_CHECK(clSetKernelArg(kernel_lowpass_filter, 0, sizeof(cl_mem), (void *)&image_out_mem));
    CL_CHECK(clSetKernelArg(kernel_lowpass_filter, 1, sizeof(cl_int), (void *)&n));
    CL_CHECK(clSetKernelArg(kernel_lowpass_filter, 2, sizeof(cl_int), (void *)&raio));
    config_workgroup_size(global_wg, local_wg, n, n);
    CL_CHECK(clEnqueueNDRangeKernel(cmd_queue, kernel_lowpass_filter, 2, NULL, global_wg, local_wg, 0, NULL, &kernels_events_out_fft[2]));

    //===================================================================================================
    /* Obtem a FFT inversa*/
    fft_main(image_in_mem, image_out_mem, twiddle_factors_mem, m, inversa, &fft_events[2]);
    //===================================================================================================

    /* Realiza a transposta da Matriz (imagem) */
    CL_CHECK(clSetKernelArg(kernel_matriz_transpose, 0, sizeof(cl_mem), (void *)&image_out_mem));
    CL_CHECK(clSetKernelArg(kernel_matriz_transpose, 1, sizeof(cl_mem), (void *)&image_in_mem));
    CL_CHECK(clSetKernelArg(kernel_matriz_transpose, 2, sizeof(cl_int), (void *)&n));
    config_workgroup_size(global_wg, local_wg, n, n);
    CL_CHECK(clEnqueueNDRangeKernel(cmd_queue, kernel_matriz_transpose, 2, NULL, global_wg, local_wg, 0, NULL, &kernels_events_out_fft[3]));

    //===================================================================================================
    fft_main(image_in_mem, image_out_mem, twiddle_factors_mem, m, inversa, &fft_events[3]);
    //===================================================================================================

    CL_CHECK(clEnqueueReadBuffer(cmd_queue, image_in_mem, CL_TRUE, 0, n*n*sizeof(cl_float2), image_in_host, 0, NULL, &read_dev_host_event));
    //===================================================================================================

    //== Total time elapsed ============================================================================
    timer_stop();
    tempo_total = get_elapsed_time();
    //==================================================================================================

    //====== Get time of Profile Info ==================================================================
    // Write data time
    CL_CHECK(clGetEventProfilingInfo(write_host_dev_event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &write_host_dev_start_time, NULL));
    CL_CHECK(clGetEventProfilingInfo(write_host_dev_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &write_host_dev_end_time, NULL));
    // Read data time
    CL_CHECK(clGetEventProfilingInfo(read_dev_host_event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &read_dev_host_start_time, NULL));
    CL_CHECK(clGetEventProfilingInfo(read_dev_host_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &read_dev_host_end_time, NULL));

    for (i = 0; i < MAX_CALL_FFT; i++) {

        kernel_start_time = (cl_long) 0;
        kernel_end_time = (cl_long) 0;
        CL_CHECK(clGetEventProfilingInfo(kernels_events_out_fft[i], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &kernel_start_time, NULL));
        CL_CHECK(clGetEventProfilingInfo(kernels_events_out_fft[i], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &kernel_end_time, NULL));
        kernel_runtime += (kernel_end_time - kernel_start_time);

        kernel_start_time = (cl_long) 0;
        kernel_end_time = (cl_long) 0;
        CL_CHECK(clGetEventProfilingInfo(fft_events[i].kernel_bitsrev, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &kernel_start_time, NULL));
        CL_CHECK(clGetEventProfilingInfo(fft_events[i].kernel_bitsrev, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &kernel_end_time, NULL));
        kernel_runtime += (kernel_end_time - kernel_start_time);

        kernel_start_time = (cl_long) 0;
        kernel_end_time = (cl_long) 0;

        if (fft_events[i].kernel_normalize != NULL) {
            CL_CHECK(clGetEventProfilingInfo(fft_events[i].kernel_normalize, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &kernel_start_time, NULL));
            CL_CHECK(clGetEventProfilingInfo(fft_events[i].kernel_normalize, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &kernel_end_time, NULL));
            kernel_runtime += (kernel_end_time - kernel_start_time);
        }
   }

    for (j=0; j < MAX_CALL_FFT*m; j++){
        kernel_start_time = (cl_long) 0;
        kernel_end_time = (cl_long) 0;

        CL_CHECK(clGetEventProfilingInfo(kernel_butter_events[j], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &kernel_start_time, NULL));
        CL_CHECK(clGetEventProfilingInfo(kernel_butter_events[j], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &kernel_end_time, NULL));
        kernel_runtime += (kernel_end_time - kernel_start_time);
    }

    write_host_dev_run_time = write_host_dev_end_time - write_host_dev_start_time;
    read_dev_host_run_time =  read_dev_host_end_time -  read_dev_host_start_time;

/* save_log_debug(write_host_dev_run_time,fp);
    save_log_debug(read_dev_host_run_time,fp);
    close_log_debug(fp); */

    image_tam_MB = (double) (((double) image_tam)/(double) MEGA_BYTES);

    //==================================================================================================
    save_log_gpu(image_filename, kernel_runtime, (double) (image_tam_MB/( (double) read_dev_host_run_time/(double) NANOSECONDS)),
    (double) (image_tam_MB/ ((double) write_host_dev_run_time/ (double) NANOSECONDS)), tempo_total, LOG_NAME);

    //===================================================================================================
    image_amplitudes = (float*)malloc(n*n*sizeof(float));
    for (i=0; i < n; i++) {
        for (j=0; j < n; j++) {
            image_amplitudes[n*j + i] = (float) (AMP(((float*)image_in_host)[(2*n*j)+2*i], ((float*)image_in_host)[(2*n*j)+2*i+1]));
        }
    }

    //clFlush(cmd_queue);
    //clFinish(cmd_queue);
    opgm.width = n;
    opgm.height = n;

    normalizar_pgm(&opgm, image_amplitudes);
    escrever_pgm(&opgm, output_filename);

    //===================================================================================================
	clFinish(cmd_queue);
    clReleaseKernel(kernel_twiddle_factors);
    clReleaseKernel(kernel_matriz_transpose);
    clReleaseKernel(kernel_lowpass_filter);
    clReleaseProgram(program);
    clReleaseMemObject(image_in_mem);
    clReleaseMemObject(image_out_mem);
    clReleaseMemObject(twiddle_factors_mem);
    clReleaseCommandQueue(cmd_queue);
    clReleaseContext(context);
	clReleaseEvent(read_dev_host_event);
	clReleaseEvent(write_host_dev_event);
	clReleaseEvent(kernels_events_out_fft[0]);
	clReleaseEvent(kernels_events_out_fft[1]);
	clReleaseEvent(kernels_events_out_fft[2]);
	clReleaseEvent(kernels_events_out_fft[3]);
    destruir_pgm(&ipgm);
    destruir_pgm(&opgm);
    free(image_amplitudes);
    free(source_str);
    free(image_in_host);
    free(image_filename);
    free(twiddle_factors_host);
    free(output_filename);
    free(fft_events);
    free(kernel_butter_events);

    //_CrtDumpMemoryLeaks();

    return 0;
}