Beispiel #1
0
int main(int argc, char *argv[]) {
	cl_int errNum;
	cl_uint nPlataformas;
	cl_uint nDispositivos;
	cl_platform_id *listaPlataformaID;
	cl_device_id *listaDispositivoID;
	cl_context contexto = NULL;
	cl_command_queue fila;
	cl_program programa;
	cl_kernel kernel;
	cl_mem Abuffer;
	cl_mem Bbuffer;
	cl_mem Cbuffer;
	cl_event evento;

// Constantes
// Matrizes A, B e C
// Tamanhos:
// A: l x m
// B: m x n
// C: l x n --- C = A x B
int nn = atoi(argv[1]);
const unsigned int l = nn;
const unsigned int m = nn;
const unsigned int n = nn;

/*
const unsigned int l = atoi(argv[1]);
const unsigned int m = atoi(argv[1]);
const unsigned int n = atoi(argv[1]);

cl_uint A[l][m];
cl_uint B[m][n];
cl_uint C[l][n];
*/

cl_uint **A;
A = (cl_uint **)malloc(sizeof(cl_uint *)*l);
cl_uint **C;
C = (cl_uint **)malloc(sizeof(cl_uint *)*l);
cl_uint **B;
B = (cl_uint **)malloc(sizeof(cl_uint *)*l);
	// Preenchendo as matrizes

	for ( int x = 0; x < l ; x ++ ) {
		A[x] = (cl_uint *)malloc(sizeof(cl_uint)*m);
		B[x] = (cl_uint *)malloc(sizeof(cl_uint)*m);
		C[x] = (cl_uint *)malloc(sizeof(cl_uint)*m);
		for (int y = 0; y < m ; y ++ ) {
			A[x][y] = x + y*2;
		}
	}

	for ( int x = 0; x < m ; x ++ ) {
		for (int y = 0; y < n ; y ++ ) {
			B[x][y] = 3*x + y;
			B[x][y] |= 1;
		}
	}
	///// Selecionando uma plataforma OpenCL para rodar

	// Atribuindo a nPlataformas o número de plataformas disponíveis
	errNum = clGetPlatformIDs(0, NULL, &nPlataformas);
	checkErr( (errNum != CL_SUCCESS) ? errNum : 
			(nPlataformas <= 0 ? -1 : CL_SUCCESS),
			"clGetPlataformsIDs");
	// Se não houve erro, alocar memória para cl_platform_id
	listaPlataformaID = (cl_platform_id *)alloca(sizeof(cl_platform_id)*nPlataformas);
	// Atribuindo uma plataforma ao listaPlataformaID
	errNum = clGetPlatformIDs(nPlataformas, listaPlataformaID, NULL);
	std::cout << "#Plataformas: " << nPlataformas << std::endl;
	checkErr(
			(errNum != CL_SUCCESS) ? errNum :
			(nPlataformas <= 0 ? -1 : CL_SUCCESS),
			"clGetPlatformIDs");


	// Iterando na lista de plataformas até achar uma que suporta um dispositivo de CPU. Se isso não ocorrer, acusar erro.
	cl_uint i;
	for (i=0; i < nPlataformas; i++) {
		// Atribuindo o número de dispositivos de GPU a nDispositivos
		errNum = clGetDeviceIDs (	listaPlataformaID[i],
	//				CL_DEVICE_TYPE_ALL,
			//		CL_DEVICE_TYPE_CPU,
					CL_DEVICE_TYPE_GPU,
				0,
				NULL,
				&nDispositivos		);
		if (errNum != CL_SUCCESS && errNum != CL_DEVICE_NOT_FOUND) {
			infoPlataforma(listaPlataformaID, i);
			checkErr (errNum, "clGetDeviceIDs");
		}
		// Conferindo se há dispositivos de CPU
		else if (nDispositivos > 0) {
			// Atribuindo um dispositivo a uma listaDispositivoID
			listaDispositivoID = (cl_device_id *)alloca(sizeof(cl_device_id)*nDispositivos);

			errNum = clGetDeviceIDs (	
					listaPlataformaID[i],
	//				CL_DEVICE_TYPE_ALL,
				//	CL_DEVICE_TYPE_CPU,
					CL_DEVICE_TYPE_GPU,
					nDispositivos,
					&listaDispositivoID[0],
					NULL);
			checkErr(errNum, "clGetPlatformIDs");
			break;
		}
	}

	// Crindo um contexto no dispositivo/plataforma selecionada
	std::cout << "Adicionando dispositivos OpenCL de numero " << i << std::endl;
	infoPlataforma(listaPlataformaID, i);
	cl_context_properties propContexto[] = {
		CL_CONTEXT_PLATFORM,
		(cl_context_properties)listaPlataformaID[i],
		0
	};

	contexto = clCreateContext(
			propContexto,
			nDispositivos,
			listaDispositivoID,
			&contextCallback,
			NULL,
			&errNum			);

	checkErr(errNum, "clCreateContext");

	// Carregando o arquivo-fonte cl para póstuma compilação feita em runtime
	std::ifstream srcFile("mMatrix.cl");

	// Conferindo se ele foi aberto
	checkErr(srcFile.is_open() ? CL_SUCCESS : -1, "lendo mMatrix.cl");

	std::string srcProg (
			std::istreambuf_iterator<char>(srcFile),
			(std::istreambuf_iterator<char>()));

	const char *fonte = srcProg.c_str();
	size_t tamanho = srcProg.length();

	// Criando programa da fonte
	programa = clCreateProgramWithSource (
			contexto,
			1,
			&fonte,
			&tamanho,
			&errNum);
	checkErr(errNum, "clCreateProgramWithSource");

	// Compilando programa
	errNum = clBuildProgram (
			programa,
			nDispositivos,
			listaDispositivoID,
			NULL,
			NULL,
			NULL);

	if (errNum != CL_SUCCESS) { 		// Verificando se houve erro
		// Determinando o motivo do erro
		char logCompilacao[16384];
		clGetProgramBuildInfo (
				programa,
				listaDispositivoID[0],
				CL_PROGRAM_BUILD_LOG,
				sizeof(logCompilacao),
				logCompilacao,
				NULL);

		std::cerr << "Erro no kernel: " << std::endl;

		std::cerr << logCompilacao;
		checkErr(errNum, "clBuildProgram");
	}

	// Criando o objeto do Kernel
	kernel = clCreateKernel (
			programa,
			"multiplyMatrix",
			&errNum);
	checkErr(errNum, "clCreateKernel");

	// Alocando Buffers
	Abuffer = clCreateBuffer (
			contexto,
			CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
			sizeof(cl_uint)*l*m,
			(A),
			&errNum);
	checkErr(errNum, "clCreateBuffer(A)");

	Bbuffer = clCreateBuffer (
			contexto,
			CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
			sizeof(cl_uint)*m*n,
			(B),
			&errNum);
	checkErr(errNum, "clCreateBuffer(B)");

	Cbuffer = clCreateBuffer (
			contexto,
			CL_MEM_WRITE_ONLY,
			sizeof(cl_uint)*l*n,
			NULL,
			&errNum);
	checkErr(errNum, "clCreateBuffer(C)");

	// Escolhendo o primeiro dispositivo e criando a fila de comando
	fila = clCreateCommandQueue (
			contexto, 
			listaDispositivoID[0],
			CL_QUEUE_PROFILING_ENABLE,
			&errNum);
	checkErr(errNum, "clCreateCommandQueue");

	// Setando os argumentos da função do Kernel
	errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), &Abuffer);
	errNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &Bbuffer);
	errNum |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &Cbuffer);
	errNum |= clSetKernelArg(kernel, 3, sizeof(cl_uint), &l);
	errNum |= clSetKernelArg(kernel, 4, sizeof(cl_uint), &m);
	errNum |= clSetKernelArg(kernel, 5, sizeof(cl_uint), &n);
	checkErr(errNum, "clSetKernelArg");

	// Definindo o número de work-items globais e locais
	const size_t globalWorkSize[2] = { l, n };
	const size_t localWorkSize[2] = { 8, 8 };

	// Enfileirando o Kernel para execução através da matriz
	errNum = clEnqueueNDRangeKernel (
			fila,
			kernel,
			2,
			NULL,
			globalWorkSize,
			localWorkSize,
			0,
			NULL,
			&evento);
	checkErr(errNum, "clEnqueueNDRangeKernel");

	cl_ulong ev_start_time=(cl_ulong)0;     
	cl_ulong ev_end_time=(cl_ulong)0;

	clFinish(fila);
	errNum = clWaitForEvents(1, &evento);
	errNum |= clGetEventProfilingInfo(evento, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &ev_start_time, NULL);
	errNum |= clGetEventProfilingInfo(evento, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &ev_end_time, NULL);

	double run_time_gpu = (double)(ev_end_time - ev_start_time)/1000; // in usec

	errNum = clEnqueueReadBuffer (
			fila,
			Cbuffer,
			CL_TRUE,
			0,
			sizeof(cl_uint)*l*n,
			C,
			0,
			NULL,
			NULL);
	checkErr(errNum, "clEnqueueReadBuffer");

/*
	// Imprimindo saída do resultado
	for(int x = l-1; x < l; x++) {
		for( int y=0; y<m; y++) {
			std::cout << A[x][y] << " ";
		}
		std::cout << std::endl;
	}
	std::cout << std::endl;
	for(int x = m-1; x < m; x++) {
		for( int y=0; y<n; y++) {
			std::cout << B[x][y] << " ";
		}
		std::cout << std::endl;
	}
	std::cout << std::endl;
	for(int x = l-1; x < l; x++) {
		for( int y=0; y<n; y++) {
			std::cout << C[x][y] << " ";
		}
		std::cout << std::endl;
	}
*/
	std::cout << std::endl << std::fixed;
	std::cout << "Tempo de execução: " << std::setprecision(6) << run_time_gpu/1000000 << "ms";
	std::cout << std::endl;
	std::cout << run_time_gpu*1.0e-6;
	std::cout << std::endl;

	return 0;
}
Beispiel #2
0
// main() for simple buffer and sub-buffer example
//
int main(int argc, char** argv)
{
    
    std::cout << "Simple Image Processing Example" << std::endl;
    
    
    // First, select an OpenCL platform to run on.
    errNum = clGetPlatformIDs(0, NULL, &numPlatforms);
    checkErr(
             (errNum != CL_SUCCESS) ?
             errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS),
             "clGetPlatformIDs");
    platformIDs = (cl_platform_id *)alloca(sizeof(cl_platform_id) * numPlatforms);
    std::cout << "Number of platforms: \t" << numPlatforms << std::endl;
    errNum = clGetPlatformIDs(numPlatforms, platformIDs, NULL);
    checkErr(
             (errNum != CL_SUCCESS) ?
             errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS),
             "clGetPlatformIDs");
    std::ifstream srcFile("gaussian_filter.cl");
    
    checkErr(srcFile.is_open() ? CL_SUCCESS : -1, "reading simple.cl");
    
    std::string srcProg(
                        std::istreambuf_iterator<char>(srcFile),
                        (std::istreambuf_iterator<char>()));
    const char * src = srcProg.c_str();
    size_t length = srcProg.length();
    deviceIDs = NULL;
    DisplayPlatformInfo(
                        platformIDs[PLATFORM_INDEX],
                        CL_PLATFORM_VENDOR,
                        "CL_PLATFORM_VENDOR");
    errNum = clGetDeviceIDs(
                            platformIDs[PLATFORM_INDEX],
                            CL_DEVICE_TYPE_ALL,
                            0,
                            NULL,
                            &numDevices);
    if (errNum != CL_SUCCESS && errNum != CL_DEVICE_NOT_FOUND){
        checkErr(errNum, "clGetDeviceIDs");
    }
    
    deviceIDs = (cl_device_id *)alloca(sizeof(cl_device_id) * numDevices);
    errNum = clGetDeviceIDs(
                            platformIDs[PLATFORM_INDEX],
                            CL_DEVICE_TYPE_ALL,
                            numDevices,
                            &deviceIDs[0],
                            NULL);
    checkErr(errNum, "clGetDeviceIDs");
    
    cl_context_properties contextProperties[] =
    {
        CL_CONTEXT_PLATFORM,
        (cl_context_properties)platformIDs[PLATFORM_INDEX],
        0
    };
    
    context = clCreateContext(
                              contextProperties,
                              numDevices,
                              deviceIDs,
                              NULL,
                              NULL,
                              &errNum);
    
    checkErr(errNum, "clCreateContext");
    // Create program from source
    program = clCreateProgramWithSource(
                                        context,
                                        1,
                                        &src,
                                        &length,
                                        &errNum);
    checkErr(errNum, "clCreateProgramWithSource");
    
    // Build program
    errNum = clBuildProgram(
                            program,
                            numDevices,
                            deviceIDs,
                            "-I.",
                            NULL,
                            NULL);

    if (errNum != CL_SUCCESS){
        // Determine the reason for the error
        char buildLog[16384];
        clGetProgramBuildInfo(
                              program,
                              deviceIDs[0],
                              CL_PROGRAM_BUILD_LOG,
                              sizeof(buildLog),
                              buildLog,
                              NULL);
        std::cerr << "Error in OpenCL C source: " << std::endl;
        std::cerr << buildLog;
        checkErr(errNum, "clBuildProgram");
    }
    
    // Create a command commands
	//
	if(!(commands = clCreateCommandQueue(context, deviceIDs[0], 0, &errNum))) {
        std::cout << "Failed to create a command commands!" << std::endl;
        cleanKill(EXIT_FAILURE);
    }
    
    cl_kernel kernel = clCreateKernel(program, "gaussian_filter", &errNum);
    checkErr(errNum, "clCreateKernel(gaussian_filter)");

    if(!doesGPUSupportImageObjects){
        cleanKill(EXIT_FAILURE);
    }
    
    inputImage = LoadImage(context, (char*)"rgba.png", width, height);
        
    cl_image_format format; 
    format.image_channel_order = CL_RGBA; 
    format.image_channel_data_type = CL_UNORM_INT8;
    
    outputImage = clCreateImage2D(context, 
                             CL_MEM_WRITE_ONLY, 
                             &format, 
                             width, 
                             height,
                             0, 
                             NULL, 
                             &errNum);
    
    if(there_was_an_error(errNum)){
        std::cout << "Output Image Buffer creation error!" << std::endl;
        cleanKill(EXIT_FAILURE);
    }    
    
	if (!inputImage || !outputImage ){
        std::cout << "Failed to allocate device memory!" << std::endl;
        cleanKill(EXIT_FAILURE);
	}
    
    char *buffer = new char [width * height * 4];
    size_t origin[3] = { 0, 0, 0 };
    size_t region[3] = { width, height, 1};

    sampler = clCreateSampler(context,
                              CL_FALSE, // Non-normalized coordinates 
                              CL_ADDRESS_CLAMP_TO_EDGE, 
                              CL_FILTER_NEAREST, 
                              &errNum);
    
    if(there_was_an_error(errNum)){
        std::cout << "Error creating CL sampler object." << std::endl;
        cleanKill(EXIT_FAILURE);
    }
    
    // Set the kernel arguments
    errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage);
    errNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage);
    errNum |= clSetKernelArg(kernel, 2, sizeof(cl_sampler), &sampler);
    errNum |= clSetKernelArg(kernel, 3, sizeof(cl_int), &width);
    errNum |= clSetKernelArg(kernel, 4, sizeof(cl_int), &height);
    if (errNum != CL_SUCCESS)
    {
        std::cerr << "Error setting kernel arguments." << std::endl;
        std::cerr << print_cl_errstring(errNum) << std::endl;
        cleanKill(EXIT_FAILURE);
    }
    
    //errNum = clGetKernelWorkGroupInfo(kernel, deviceIDs, CL_KERNEL_WORK_GROUP_SIZE, sizeof(unsigned short)* height*width*4, &local, NULL);
    
//	if (errNum != CL_SUCCESS)
//	{
//        cout << print_cl_errstring(err) << endl;
//        if(err == CL_INVALID_VALUE){
//            cout << "if param_name is not valid, or if size in bytes specified by param_value_size "
//            << "is less than the size of return type as described in the table above and "
//            << "param_value is not NULL." << endl;
//        }
//		cout << "Error: Failed to retrieve kernel work group info!" << err << endl;
//		cleanKill(EXIT_FAILURE);
//	}
    
    std::cout << "Max work group size is " << CL_DEVICE_MAX_WORK_GROUP_SIZE << std::endl;
    std::cout << "Max work item size is " << CL_DEVICE_MAX_WORK_ITEM_SIZES << std::endl;
    
    size_t localWorkSize[2];
    size_t globalWorkSize[2];
    
    localWorkSize[0] = 1;
    localWorkSize[1] = localWorkSize[0];
    globalWorkSize[0] = width*height;
    globalWorkSize[1] = globalWorkSize[0];
    
    //CL_INVALID_WORK_GROUP_SIZE if local_work_size is specified and number of work-items specified by global_work_size is not evenly divisable by size of work-group given by local_work_size
    
    
    
        //size_t globalWorkSize[2] =  { RoundUp(localWorkSize[0], width), RoundUp(localWorkSize[1], height)};

//    size_t globalWorkSize[1] = {sizeof(unsigned short)* height * width};
//	size_t localWorkSize[1] = {64};
    
    // Queue the kernel up for execution
    errNum = clEnqueueNDRangeKernel(commands, kernel, 2, NULL,
                                    globalWorkSize, localWorkSize,
                                    0, NULL, NULL);
    
    if (errNum != CL_SUCCESS){
        std::cerr << "Error queuing kernel for execution." << std::endl;
        std::cerr << print_cl_errstring(errNum) << std::endl;
        cleanKill(EXIT_FAILURE);
    }
    
    // Wait for the command commands to get serviced before reading back results
	//
	clFinish(commands);
    
    // Read back computed data
    errNum = clEnqueueReadImage(commands, outputImage,
                                     CL_TRUE, origin, region, 0, 0, buffer, 0, NULL, NULL);
    
    SaveImage((char*)"outRGBA.png", (char*)buffer, width, height);

    std::cout << "Program completed successfully" << std::endl;        
    return 0;     
}
int main (int argc, char ** argv)
{
  cl_int errNum;
  cl_uint numPlatforms;
  cl_uint numDevices;
  cl_paltform_id * platformIDs;
  cl_device_id * deviceIDs;
  cl_context context = NULL;
  cl_command_queue queue;
  cl_program program;
  cl_kernel kernel;
  cl_mem inputSignalBuffer;
  cl_mem outputSignalBuffer;
  cl_mem maskBuffer;

  errNum = clGetPlatformIDs (0, NULL, &numPlatforms);
  checkErr ((errNum != CL_SUCCESS) ? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS), "clGetPlatformIDs");
 
  platformIDs = (cl_platform_id *)alloca(sizeof(cl_platform_id) * numPlatforms);
  errNum = clGetPlatformIDs (numPlatforms, platformIDs, NULL);
  checkErr ((errNum != CL_SUCCESS) ? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS), "clGetPlatformIDs"); 

  deviceIDs = NULL;
  cl_uint i;
  for (i = 0; i < numPlatforms; i++)
  {
    errNum = clGetDeviceIDs (platformIDs[i], CL_DEVICE_TYPE_CPU, 0, NULL, &numDevices);
    if (errNum != CL_SUCCESS && errNum != CL_DEVICE_NOT_FOUND)
    {
      checkErr (errNum, "clGetDeviceIDs");
    }
    else if (numDevices > 0)
    {
      deviceIDs = (cl_device_id *)alloca(sizeof(cl_device_id) * numDevices);
      errNum = clGetDeviceIDs (platformIDs[i], CL_DEVICE_TYPE_CPU, numDevices, &deviceIDs[0], NULL);
      checkErr (errNum, "clGetDeviceIDs");
      break;
    }
  }

  if (deviceIDs == NULL)
  {
    std::cout << "No CPU device found" << std::endl;
    exit (-1);
  }

  cl_context_properties contextProperties[] = 
  {
    CL_CONTEXT_PLATFORM, 
    (cl_context_properties)platformIDs[i], 
    0
  };

  context = clCreateContext (contextProperties, numDevices, deviceIDs, &contextCallback, NULL, &errNum);
  checkErr (errNum, "clCreateContext");

  std::ifstream srcFile ("Convolution.cl");
  checkErr (srcFile.is_open() ? CL_SUCCESS : -1, "reading Convolution.cl");
  
  std::string srcProg(std::istreambuf_iterator<char>(srcFile), (std::istreambuf_iterator<char>()));

  const char * src = srcProg.c_str();
  size_t length = srcProg.length();

  program = clCreateProgramWithSource (context, 1, &src, &length, &errNum);
  checkErr (errNum, "clCreateProgramWithSource");

  inputSignalBuffer = clCreateBuffer (context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 
                                      sizeof(cl_uint) * inputSignalHeight * inputSignalWidth,
                                      static_cast<void *>(inputSignal), &errNum);
  checkErr (errNum, "clCreateBuffer (inputSignal)");

  maskBuffer = clCreateBuffer (context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                               sizeof(cl_uint) * maskHeight * maskWidth,
                               static_cast<void *>(mask), &errNum);
  checkErr (errNum, "clCreateBuffer(mask)");

  outputSignalBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
                                      sizeof(cl_uint) * outputSignalHeight * outputSignalWidth,
                                      NULL, &errNum);
  checkErr (errNum, "clCreateBuffer(outputSignal)");

  queue = clCreateCommandQueue (context, deviceIDs[0], 0, &errNum);
  checkErr (errNum, "clCreateCommandQueue");

  errNum = clSetKernelArg (kernel, 0, sizeof(cl_mem), &inputSignalBuffer);
  errNum |= clSetKernelArg (kernel, 1, sizeof(cl_mem), &maskBuffer);
  errNum |= clSetKernelArg (kernel, 2, sizeof(cl_mem), &outputSignalBuffer);
  errNum |= clSetKernelArg (kernel, 3, sizeof(cl_uint), &inputSignalWidth);
  errNum |= clSetKernelArg (kernel, 4, sizeof(cl_uint), &maskWidth);
  checkErr (errNum, "clSetKernelArg");

  const size_t globalWorkSize[1] = { outputSignalWidth * outputSingalHeight };
  const size_t localWorkSize[1] = { 1 };

  errNum = clEnqueueNDRangeKernel (
    queue,
    kernel,
    1,
    NULL,
    globalWorkSize,
    localWorkSize,
    0,
    NULL,
    NULL
  );
  checkErr (errNum, "clEnqueueNDRangeKernel");

  errNum = clEnqueueReadBuffer (queue, outputSignalBuffer, CL_TRUE, 0,
                                sizeof(cl_uint) * outputSignalHeight * outputSignalHeight,
                                outputSignal, 0, NULL, NULL);
  checkErr (errNum, "clEnqueueReadBuffer");

  for (int y = 0; y < outputSignalHeight; y++)
  {
    for (int x = 0; x < outputSignalWidth; x++)
    {
      std::cout << outputSignal[x][y] << " ";
    }
    std::cout << std::endl;
  }

  return 0;
}
Beispiel #4
0
// main() for simple buffer and sub-buffer example
//
int main(int argc, char** argv)
{
    cl_int errNum;
    cl_uint numPlatforms;
    cl_uint numDevices;
    cl_platform_id * platformIDs;
    cl_device_id * deviceIDs;
    cl_context context;
    cl_program program;
    std::vector<cl_kernel> kernels;
    std::vector<cl_command_queue> queues;
    std::vector<cl_mem> buffers;
    int * inputOutput;
    std::cout << "Simple buffer and sub-buffer Example" << std::endl;
    // First, select an OpenCL platform to run on.
    errNum = clGetPlatformIDs(0, NULL, &numPlatforms);
    checkErr(
             (errNum != CL_SUCCESS) ?
             errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS),
             "clGetPlatformIDs");
    platformIDs = (cl_platform_id *)alloca(sizeof(cl_platform_id) * numPlatforms);
    std::cout << "Number of platforms: \t" << numPlatforms << std::endl;
    errNum = clGetPlatformIDs(numPlatforms, platformIDs, NULL);
    checkErr(
             (errNum != CL_SUCCESS) ?
             errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS),
             "clGetPlatformIDs");
    std::ifstream srcFile("simple.cl");
    
    checkErr(srcFile.is_open() ? CL_SUCCESS : -1, "reading simple.cl");
    
    std::string srcProg(
                        std::istreambuf_iterator<char>(srcFile),
                        (std::istreambuf_iterator<char>()));
    const char * src = srcProg.c_str();
    size_t length = srcProg.length();
    deviceIDs = NULL;
    DisplayPlatformInfo(
                        platformIDs[PLATFORM_INDEX],
                        CL_PLATFORM_VENDOR,
                        "CL_PLATFORM_VENDOR");
    errNum = clGetDeviceIDs(
                            platformIDs[PLATFORM_INDEX],
                            CL_DEVICE_TYPE_ALL,
                            0,
                            NULL,
                            &numDevices);
    if (errNum != CL_SUCCESS && errNum != CL_DEVICE_NOT_FOUND){
        checkErr(errNum, "clGetDeviceIDs");
    }
    
    deviceIDs = (cl_device_id *)alloca(
                                       sizeof(cl_device_id) * numDevices);
    errNum = clGetDeviceIDs(
                            platformIDs[PLATFORM_INDEX],
                            CL_DEVICE_TYPE_ALL,
                            numDevices,
                            &deviceIDs[0],
                            NULL);
    checkErr(errNum, "clGetDeviceIDs");
    
    cl_context_properties contextProperties[] =
    {
        CL_CONTEXT_PLATFORM,
        (cl_context_properties)platformIDs[PLATFORM_INDEX],
        0
    };
    
    context = clCreateContext(
                              contextProperties,
                              numDevices,
                              deviceIDs,
                              NULL,
                              NULL,
                              &errNum);
    
    checkErr(errNum, "clCreateContext");
    // Create program from source
    program = clCreateProgramWithSource(
                                        context,
                                        1,
                                        &src,
                                        &length,
                                        &errNum);
    checkErr(errNum, "clCreateProgramWithSource");
    
    // Build program
    errNum = clBuildProgram(
                            program,
                            numDevices,
                            deviceIDs,
                            "-I.",
                            NULL,
                            NULL);

    if (errNum != CL_SUCCESS){
        // Determine the reason for the error
        char buildLog[16384];
        clGetProgramBuildInfo(
                              program,
                              deviceIDs[0],
                              CL_PROGRAM_BUILD_LOG,
                              sizeof(buildLog),
                              buildLog,
                              NULL);
        std::cerr << "Error in OpenCL C source: " << std::endl;
        std::cerr << buildLog;
        checkErr(errNum, "clBuildProgram");
    }
        // create buffers and sub-buffers
        inputOutput = new int[NUM_BUFFER_ELEMENTS * numDevices];
        for (unsigned int i = 0; i < NUM_BUFFER_ELEMENTS * numDevices; i++)
        {
            inputOutput[i] = i;
        }
        
        // create a single buffer to cover all the input data
        cl_mem buffer = clCreateBuffer(
                                       context,
                                       CL_MEM_READ_WRITE,
                                       sizeof(int) * NUM_BUFFER_ELEMENTS * numDevices,
                                       NULL,
                                       &errNum);
        checkErr(errNum, "clCreateBuffer");
        buffers.push_back(buffer);
        // now for all devices other than the first create a sub-buffer
        for (unsigned int i = 1; i < numDevices; i++)
        {
            cl_buffer_region region =
            {
                NUM_BUFFER_ELEMENTS * i * sizeof(int),
                NUM_BUFFER_ELEMENTS * sizeof(int)
            };
            buffer = clCreateSubBuffer(
                                       buffers[0],
                                       CL_MEM_READ_WRITE,
                                       CL_BUFFER_CREATE_TYPE_REGION,
                                       &region,
                                       &errNum);
            checkErr(errNum, "clCreateSubBuffer");
            buffers.push_back(buffer);
        }
        // Create command queues
        for (int i = 0; i < numDevices; i++)
        {
            InfoDevice<cl_device_type>::display(deviceIDs[i], CL_DEVICE_TYPE, "CL_DEVICE_TYPE");
            cl_command_queue queue =
            clCreateCommandQueue(
                                 context,
                                 deviceIDs[i],
                                 0,
                                 &errNum);
            checkErr(errNum, "clCreateCommandQueue");
            queues.push_back(queue);
            cl_kernel kernel = clCreateKernel(
                                              program,
                                              "square",
                                              &errNum);
            checkErr(errNum, "clCreateKernel(square)");
            errNum = clSetKernelArg(
                                    kernel,
                                    0,
                                    sizeof(cl_mem), (void *)&buffers[i]);
            checkErr(errNum, "clSetKernelArg(square)");
            kernels.push_back(kernel);
            // Write input data
            clEnqueueWriteBuffer(
                                 queues[0],
                                 buffers[0],
                                 CL_TRUE,
                                 0,
                                 sizeof(int) * NUM_BUFFER_ELEMENTS * numDevices,
                                 (void*)inputOutput,
                                 0,
                                 NULL,
                                 NULL);
            std::vector<cl_event> events;
            // call kernel for each device
            for (int i = 0; i < queues.size(); i++)
            {
                cl_event event;
                size_t gWI = NUM_BUFFER_ELEMENTS;
                errNum = clEnqueueNDRangeKernel(
                                                queues[i],
                                                kernels[i],
                                                1,
                                                NULL,
                                                (const size_t*)&gWI,
                                                (const size_t*)NULL,
                                                0,
                                                0,
                                                &event);
                events.push_back(event);
            }
            // Technically don't need this as we are doing a blocking read
            // with in-order queue.
            clWaitForEvents(events.size(), events.data());
            // Read back computed data
            clEnqueueReadBuffer(
                                queues[0],
                                buffers[0],
                                CL_TRUE,
                                0,
                                sizeof(int) * NUM_BUFFER_ELEMENTS * numDevices,
                                (void*)inputOutput,
                                0,
                                NULL,
                                NULL);
            // Display output in rows
            for (unsigned i = 0; i < numDevices; i++)
            {
                for (unsigned elems = i * NUM_BUFFER_ELEMENTS;
                     elems < ((i+1) * NUM_BUFFER_ELEMENTS);
                     elems++)
                {
                    std::cout << " " << inputOutput[elems];
                }
                std::cout << std::endl;
            }
            std::cout << "Program completed successfully" << std::endl;
            return 0; 
        }
}
Beispiel #5
0
///
//	main() for Convoloution example
//
int main(int argc, char** argv)
{
    cl_int errNum;
    cl_uint numPlatforms;
	cl_uint numDevices;
    cl_platform_id * platformIDs;
	cl_device_id * deviceIDs;
    cl_context context = NULL;
	cl_command_queue queue;
	cl_program program;
	cl_kernel kernel;
	cl_mem inputSignalBuffer;
	cl_mem outputSignalBuffer;
	cl_mem maskBuffer;

    // First, select an OpenCL platform to run on.  
	errNum = clGetPlatformIDs(0, NULL, &numPlatforms);
	checkErr( 
		(errNum != CL_SUCCESS) ? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS), 
		"clGetPlatformIDs"); 
 
	platformIDs = (cl_platform_id *)alloca(
       		sizeof(cl_platform_id) * numPlatforms);

    errNum = clGetPlatformIDs(numPlatforms, platformIDs, NULL);
    checkErr( 
	   (errNum != CL_SUCCESS) ? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS), 
	   "clGetPlatformIDs");

	// Iterate through the list of platforms until we find one that supports
	// a CPU device, otherwise fail with an error.
	deviceIDs = NULL;
	cl_uint i;
	for (i = 0; i < numPlatforms; i++)
	{
		errNum = clGetDeviceIDs(
            platformIDs[i], 
            CL_DEVICE_TYPE_CPU, 
            0,
            NULL,
            &numDevices);
		if (errNum != CL_SUCCESS && errNum != CL_DEVICE_NOT_FOUND)
	    {
			checkErr(errNum, "clGetDeviceIDs");
        }
	    else if (numDevices > 0) 
		{
		   	deviceIDs = (cl_device_id *)alloca(sizeof(cl_device_id) * numDevices);
			errNum = clGetDeviceIDs(
				platformIDs[i],
				CL_DEVICE_TYPE_CPU,
				numDevices, 
				&deviceIDs[0], 
				NULL);
			checkErr(errNum, "clGetDeviceIDs");
			break;
	   }
	}

	// Check to see if we found at least one CPU device, otherwise return
	if (deviceIDs == NULL) {
		std::cout << "No CPU device found" << std::endl;
		exit(-1);
	}

    // Next, create an OpenCL context on the selected platform.  
    cl_context_properties contextProperties[] =
    {
        CL_CONTEXT_PLATFORM,
        (cl_context_properties)platformIDs[i],
        0
    };
    context = clCreateContext(
		contextProperties, 
		numDevices,
        deviceIDs, 
		&contextCallback,
		NULL, 
		&errNum);
	checkErr(errNum, "clCreateContext");

	std::ifstream srcFile("../convolution/Convolution.cl");
    checkErr(srcFile.is_open() ? CL_SUCCESS : -1, "reading Convolution.cl");

	std::string srcProg(
        std::istreambuf_iterator<char>(srcFile),
        (std::istreambuf_iterator<char>()));

	const char * src = srcProg.c_str();
	size_t length = srcProg.length();

	// Create program from source
	program = clCreateProgramWithSource(
		context, 
		1, 
		&src, 
		&length, 
		&errNum);
	checkErr(errNum, "clCreateProgramWithSource");

	// Build program
	errNum = clBuildProgram(
		program,
		numDevices,
		deviceIDs,
		NULL,
		NULL,
		NULL);
    if (errNum != CL_SUCCESS)
    {
        // Determine the reason for the error
        char buildLog[16384];
        clGetProgramBuildInfo(
			program, 
			deviceIDs[0], 
			CL_PROGRAM_BUILD_LOG,
            sizeof(buildLog), 
			buildLog, 
			NULL);

        std::cerr << "Error in kernel: " << std::endl;
        std::cerr << buildLog;
		checkErr(errNum, "clBuildProgram");
    }

	// Create kernel object
	kernel = clCreateKernel(
		program,
		"convolve",
		&errNum);
	checkErr(errNum, "clCreateKernel");

	// Now allocate buffers
	inputSignalBuffer = clCreateBuffer(
		context,
		CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
		sizeof(cl_uint) * inputSignalHeight * inputSignalWidth,
		static_cast<void *>(inputSignal),
		&errNum);
	checkErr(errNum, "clCreateBuffer(inputSignal)");

	maskBuffer = clCreateBuffer(
		context,
		CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
		sizeof(cl_uint) * maskHeight * maskWidth,
		static_cast<void *>(mask),
		&errNum);
	checkErr(errNum, "clCreateBuffer(mask)");

	outputSignalBuffer = clCreateBuffer(
		context,
		CL_MEM_WRITE_ONLY,
		sizeof(cl_uint) * outputSignalHeight * outputSignalWidth,
		NULL,
		&errNum);
	checkErr(errNum, "clCreateBuffer(outputSignal)");

	// Pick the first device and create command queue.
	queue = clCreateCommandQueue(
		context,
		deviceIDs[0],
		0,
		&errNum);
	checkErr(errNum, "clCreateCommandQueue");

    errNum  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputSignalBuffer);
	errNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &maskBuffer);
    errNum |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &outputSignalBuffer);
	errNum |= clSetKernelArg(kernel, 3, sizeof(cl_uint), &inputSignalWidth);
	errNum |= clSetKernelArg(kernel, 4, sizeof(cl_uint), &maskWidth);
	checkErr(errNum, "clSetKernelArg");

	const size_t globalWorkSize[1] = { outputSignalWidth * outputSignalHeight };
    const size_t localWorkSize[1]  = { 1 };

    // Queue the kernel up for execution across the array
    errNum = clEnqueueNDRangeKernel(
		queue, 
		kernel, 
		1, 
		NULL,
        globalWorkSize, 
		localWorkSize,
        0, 
		NULL, 
		NULL);
	checkErr(errNum, "clEnqueueNDRangeKernel");
    
	errNum = clEnqueueReadBuffer(
		queue, 
		outputSignalBuffer, 
		CL_TRUE,
        0, 
		sizeof(cl_uint) * outputSignalHeight * outputSignalHeight, 
		outputSignal,
        0, 
		NULL, 
		NULL);
	checkErr(errNum, "clEnqueueReadBuffer");

    // Output the result buffer
    for (int y = 0; y < outputSignalHeight; y++)
	{
		for (int x = 0; x < outputSignalWidth; x++)
		{
			std::cout << outputSignal[x][y] << " ";
		}
		std::cout << std::endl;
	}

    std::cout << std::endl << "Executed program succesfully." << std::endl;

	return 0;
}