int main(int argc, char *argv[]) { cl_int errNum; cl_uint nPlataformas; cl_uint nDispositivos; cl_platform_id *listaPlataformaID; cl_device_id *listaDispositivoID; cl_context contexto = NULL; cl_command_queue fila; cl_program programa; cl_kernel kernel; cl_mem Abuffer; cl_mem Bbuffer; cl_mem Cbuffer; cl_event evento; // Constantes // Matrizes A, B e C // Tamanhos: // A: l x m // B: m x n // C: l x n --- C = A x B int nn = atoi(argv[1]); const unsigned int l = nn; const unsigned int m = nn; const unsigned int n = nn; /* const unsigned int l = atoi(argv[1]); const unsigned int m = atoi(argv[1]); const unsigned int n = atoi(argv[1]); cl_uint A[l][m]; cl_uint B[m][n]; cl_uint C[l][n]; */ cl_uint **A; A = (cl_uint **)malloc(sizeof(cl_uint *)*l); cl_uint **C; C = (cl_uint **)malloc(sizeof(cl_uint *)*l); cl_uint **B; B = (cl_uint **)malloc(sizeof(cl_uint *)*l); // Preenchendo as matrizes for ( int x = 0; x < l ; x ++ ) { A[x] = (cl_uint *)malloc(sizeof(cl_uint)*m); B[x] = (cl_uint *)malloc(sizeof(cl_uint)*m); C[x] = (cl_uint *)malloc(sizeof(cl_uint)*m); for (int y = 0; y < m ; y ++ ) { A[x][y] = x + y*2; } } for ( int x = 0; x < m ; x ++ ) { for (int y = 0; y < n ; y ++ ) { B[x][y] = 3*x + y; B[x][y] |= 1; } } ///// Selecionando uma plataforma OpenCL para rodar // Atribuindo a nPlataformas o número de plataformas disponíveis errNum = clGetPlatformIDs(0, NULL, &nPlataformas); checkErr( (errNum != CL_SUCCESS) ? errNum : (nPlataformas <= 0 ? -1 : CL_SUCCESS), "clGetPlataformsIDs"); // Se não houve erro, alocar memória para cl_platform_id listaPlataformaID = (cl_platform_id *)alloca(sizeof(cl_platform_id)*nPlataformas); // Atribuindo uma plataforma ao listaPlataformaID errNum = clGetPlatformIDs(nPlataformas, listaPlataformaID, NULL); std::cout << "#Plataformas: " << nPlataformas << std::endl; checkErr( (errNum != CL_SUCCESS) ? errNum : (nPlataformas <= 0 ? -1 : CL_SUCCESS), "clGetPlatformIDs"); // Iterando na lista de plataformas até achar uma que suporta um dispositivo de CPU. Se isso não ocorrer, acusar erro. cl_uint i; for (i=0; i < nPlataformas; i++) { // Atribuindo o número de dispositivos de GPU a nDispositivos errNum = clGetDeviceIDs ( listaPlataformaID[i], // CL_DEVICE_TYPE_ALL, // CL_DEVICE_TYPE_CPU, CL_DEVICE_TYPE_GPU, 0, NULL, &nDispositivos ); if (errNum != CL_SUCCESS && errNum != CL_DEVICE_NOT_FOUND) { infoPlataforma(listaPlataformaID, i); checkErr (errNum, "clGetDeviceIDs"); } // Conferindo se há dispositivos de CPU else if (nDispositivos > 0) { // Atribuindo um dispositivo a uma listaDispositivoID listaDispositivoID = (cl_device_id *)alloca(sizeof(cl_device_id)*nDispositivos); errNum = clGetDeviceIDs ( listaPlataformaID[i], // CL_DEVICE_TYPE_ALL, // CL_DEVICE_TYPE_CPU, CL_DEVICE_TYPE_GPU, nDispositivos, &listaDispositivoID[0], NULL); checkErr(errNum, "clGetPlatformIDs"); break; } } // Crindo um contexto no dispositivo/plataforma selecionada std::cout << "Adicionando dispositivos OpenCL de numero " << i << std::endl; infoPlataforma(listaPlataformaID, i); cl_context_properties propContexto[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)listaPlataformaID[i], 0 }; contexto = clCreateContext( propContexto, nDispositivos, listaDispositivoID, &contextCallback, NULL, &errNum ); checkErr(errNum, "clCreateContext"); // Carregando o arquivo-fonte cl para póstuma compilação feita em runtime std::ifstream srcFile("mMatrix.cl"); // Conferindo se ele foi aberto checkErr(srcFile.is_open() ? CL_SUCCESS : -1, "lendo mMatrix.cl"); std::string srcProg ( std::istreambuf_iterator<char>(srcFile), (std::istreambuf_iterator<char>())); const char *fonte = srcProg.c_str(); size_t tamanho = srcProg.length(); // Criando programa da fonte programa = clCreateProgramWithSource ( contexto, 1, &fonte, &tamanho, &errNum); checkErr(errNum, "clCreateProgramWithSource"); // Compilando programa errNum = clBuildProgram ( programa, nDispositivos, listaDispositivoID, NULL, NULL, NULL); if (errNum != CL_SUCCESS) { // Verificando se houve erro // Determinando o motivo do erro char logCompilacao[16384]; clGetProgramBuildInfo ( programa, listaDispositivoID[0], CL_PROGRAM_BUILD_LOG, sizeof(logCompilacao), logCompilacao, NULL); std::cerr << "Erro no kernel: " << std::endl; std::cerr << logCompilacao; checkErr(errNum, "clBuildProgram"); } // Criando o objeto do Kernel kernel = clCreateKernel ( programa, "multiplyMatrix", &errNum); checkErr(errNum, "clCreateKernel"); // Alocando Buffers Abuffer = clCreateBuffer ( contexto, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_uint)*l*m, (A), &errNum); checkErr(errNum, "clCreateBuffer(A)"); Bbuffer = clCreateBuffer ( contexto, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_uint)*m*n, (B), &errNum); checkErr(errNum, "clCreateBuffer(B)"); Cbuffer = clCreateBuffer ( contexto, CL_MEM_WRITE_ONLY, sizeof(cl_uint)*l*n, NULL, &errNum); checkErr(errNum, "clCreateBuffer(C)"); // Escolhendo o primeiro dispositivo e criando a fila de comando fila = clCreateCommandQueue ( contexto, listaDispositivoID[0], CL_QUEUE_PROFILING_ENABLE, &errNum); checkErr(errNum, "clCreateCommandQueue"); // Setando os argumentos da função do Kernel errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), &Abuffer); errNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &Bbuffer); errNum |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &Cbuffer); errNum |= clSetKernelArg(kernel, 3, sizeof(cl_uint), &l); errNum |= clSetKernelArg(kernel, 4, sizeof(cl_uint), &m); errNum |= clSetKernelArg(kernel, 5, sizeof(cl_uint), &n); checkErr(errNum, "clSetKernelArg"); // Definindo o número de work-items globais e locais const size_t globalWorkSize[2] = { l, n }; const size_t localWorkSize[2] = { 8, 8 }; // Enfileirando o Kernel para execução através da matriz errNum = clEnqueueNDRangeKernel ( fila, kernel, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, &evento); checkErr(errNum, "clEnqueueNDRangeKernel"); cl_ulong ev_start_time=(cl_ulong)0; cl_ulong ev_end_time=(cl_ulong)0; clFinish(fila); errNum = clWaitForEvents(1, &evento); errNum |= clGetEventProfilingInfo(evento, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &ev_start_time, NULL); errNum |= clGetEventProfilingInfo(evento, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &ev_end_time, NULL); double run_time_gpu = (double)(ev_end_time - ev_start_time)/1000; // in usec errNum = clEnqueueReadBuffer ( fila, Cbuffer, CL_TRUE, 0, sizeof(cl_uint)*l*n, C, 0, NULL, NULL); checkErr(errNum, "clEnqueueReadBuffer"); /* // Imprimindo saída do resultado for(int x = l-1; x < l; x++) { for( int y=0; y<m; y++) { std::cout << A[x][y] << " "; } std::cout << std::endl; } std::cout << std::endl; for(int x = m-1; x < m; x++) { for( int y=0; y<n; y++) { std::cout << B[x][y] << " "; } std::cout << std::endl; } std::cout << std::endl; for(int x = l-1; x < l; x++) { for( int y=0; y<n; y++) { std::cout << C[x][y] << " "; } std::cout << std::endl; } */ std::cout << std::endl << std::fixed; std::cout << "Tempo de execução: " << std::setprecision(6) << run_time_gpu/1000000 << "ms"; std::cout << std::endl; std::cout << run_time_gpu*1.0e-6; std::cout << std::endl; return 0; }
// main() for simple buffer and sub-buffer example // int main(int argc, char** argv) { std::cout << "Simple Image Processing Example" << std::endl; // First, select an OpenCL platform to run on. errNum = clGetPlatformIDs(0, NULL, &numPlatforms); checkErr( (errNum != CL_SUCCESS) ? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS), "clGetPlatformIDs"); platformIDs = (cl_platform_id *)alloca(sizeof(cl_platform_id) * numPlatforms); std::cout << "Number of platforms: \t" << numPlatforms << std::endl; errNum = clGetPlatformIDs(numPlatforms, platformIDs, NULL); checkErr( (errNum != CL_SUCCESS) ? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS), "clGetPlatformIDs"); std::ifstream srcFile("gaussian_filter.cl"); checkErr(srcFile.is_open() ? CL_SUCCESS : -1, "reading simple.cl"); std::string srcProg( std::istreambuf_iterator<char>(srcFile), (std::istreambuf_iterator<char>())); const char * src = srcProg.c_str(); size_t length = srcProg.length(); deviceIDs = NULL; DisplayPlatformInfo( platformIDs[PLATFORM_INDEX], CL_PLATFORM_VENDOR, "CL_PLATFORM_VENDOR"); errNum = clGetDeviceIDs( platformIDs[PLATFORM_INDEX], CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices); if (errNum != CL_SUCCESS && errNum != CL_DEVICE_NOT_FOUND){ checkErr(errNum, "clGetDeviceIDs"); } deviceIDs = (cl_device_id *)alloca(sizeof(cl_device_id) * numDevices); errNum = clGetDeviceIDs( platformIDs[PLATFORM_INDEX], CL_DEVICE_TYPE_ALL, numDevices, &deviceIDs[0], NULL); checkErr(errNum, "clGetDeviceIDs"); cl_context_properties contextProperties[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platformIDs[PLATFORM_INDEX], 0 }; context = clCreateContext( contextProperties, numDevices, deviceIDs, NULL, NULL, &errNum); checkErr(errNum, "clCreateContext"); // Create program from source program = clCreateProgramWithSource( context, 1, &src, &length, &errNum); checkErr(errNum, "clCreateProgramWithSource"); // Build program errNum = clBuildProgram( program, numDevices, deviceIDs, "-I.", NULL, NULL); if (errNum != CL_SUCCESS){ // Determine the reason for the error char buildLog[16384]; clGetProgramBuildInfo( program, deviceIDs[0], CL_PROGRAM_BUILD_LOG, sizeof(buildLog), buildLog, NULL); std::cerr << "Error in OpenCL C source: " << std::endl; std::cerr << buildLog; checkErr(errNum, "clBuildProgram"); } // Create a command commands // if(!(commands = clCreateCommandQueue(context, deviceIDs[0], 0, &errNum))) { std::cout << "Failed to create a command commands!" << std::endl; cleanKill(EXIT_FAILURE); } cl_kernel kernel = clCreateKernel(program, "gaussian_filter", &errNum); checkErr(errNum, "clCreateKernel(gaussian_filter)"); if(!doesGPUSupportImageObjects){ cleanKill(EXIT_FAILURE); } inputImage = LoadImage(context, (char*)"rgba.png", width, height); cl_image_format format; format.image_channel_order = CL_RGBA; format.image_channel_data_type = CL_UNORM_INT8; outputImage = clCreateImage2D(context, CL_MEM_WRITE_ONLY, &format, width, height, 0, NULL, &errNum); if(there_was_an_error(errNum)){ std::cout << "Output Image Buffer creation error!" << std::endl; cleanKill(EXIT_FAILURE); } if (!inputImage || !outputImage ){ std::cout << "Failed to allocate device memory!" << std::endl; cleanKill(EXIT_FAILURE); } char *buffer = new char [width * height * 4]; size_t origin[3] = { 0, 0, 0 }; size_t region[3] = { width, height, 1}; sampler = clCreateSampler(context, CL_FALSE, // Non-normalized coordinates CL_ADDRESS_CLAMP_TO_EDGE, CL_FILTER_NEAREST, &errNum); if(there_was_an_error(errNum)){ std::cout << "Error creating CL sampler object." << std::endl; cleanKill(EXIT_FAILURE); } // Set the kernel arguments errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage); errNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage); errNum |= clSetKernelArg(kernel, 2, sizeof(cl_sampler), &sampler); errNum |= clSetKernelArg(kernel, 3, sizeof(cl_int), &width); errNum |= clSetKernelArg(kernel, 4, sizeof(cl_int), &height); if (errNum != CL_SUCCESS) { std::cerr << "Error setting kernel arguments." << std::endl; std::cerr << print_cl_errstring(errNum) << std::endl; cleanKill(EXIT_FAILURE); } //errNum = clGetKernelWorkGroupInfo(kernel, deviceIDs, CL_KERNEL_WORK_GROUP_SIZE, sizeof(unsigned short)* height*width*4, &local, NULL); // if (errNum != CL_SUCCESS) // { // cout << print_cl_errstring(err) << endl; // if(err == CL_INVALID_VALUE){ // cout << "if param_name is not valid, or if size in bytes specified by param_value_size " // << "is less than the size of return type as described in the table above and " // << "param_value is not NULL." << endl; // } // cout << "Error: Failed to retrieve kernel work group info!" << err << endl; // cleanKill(EXIT_FAILURE); // } std::cout << "Max work group size is " << CL_DEVICE_MAX_WORK_GROUP_SIZE << std::endl; std::cout << "Max work item size is " << CL_DEVICE_MAX_WORK_ITEM_SIZES << std::endl; size_t localWorkSize[2]; size_t globalWorkSize[2]; localWorkSize[0] = 1; localWorkSize[1] = localWorkSize[0]; globalWorkSize[0] = width*height; globalWorkSize[1] = globalWorkSize[0]; //CL_INVALID_WORK_GROUP_SIZE if local_work_size is specified and number of work-items specified by global_work_size is not evenly divisable by size of work-group given by local_work_size //size_t globalWorkSize[2] = { RoundUp(localWorkSize[0], width), RoundUp(localWorkSize[1], height)}; // size_t globalWorkSize[1] = {sizeof(unsigned short)* height * width}; // size_t localWorkSize[1] = {64}; // Queue the kernel up for execution errNum = clEnqueueNDRangeKernel(commands, kernel, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL); if (errNum != CL_SUCCESS){ std::cerr << "Error queuing kernel for execution." << std::endl; std::cerr << print_cl_errstring(errNum) << std::endl; cleanKill(EXIT_FAILURE); } // Wait for the command commands to get serviced before reading back results // clFinish(commands); // Read back computed data errNum = clEnqueueReadImage(commands, outputImage, CL_TRUE, origin, region, 0, 0, buffer, 0, NULL, NULL); SaveImage((char*)"outRGBA.png", (char*)buffer, width, height); std::cout << "Program completed successfully" << std::endl; return 0; }
int main (int argc, char ** argv) { cl_int errNum; cl_uint numPlatforms; cl_uint numDevices; cl_paltform_id * platformIDs; cl_device_id * deviceIDs; cl_context context = NULL; cl_command_queue queue; cl_program program; cl_kernel kernel; cl_mem inputSignalBuffer; cl_mem outputSignalBuffer; cl_mem maskBuffer; errNum = clGetPlatformIDs (0, NULL, &numPlatforms); checkErr ((errNum != CL_SUCCESS) ? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS), "clGetPlatformIDs"); platformIDs = (cl_platform_id *)alloca(sizeof(cl_platform_id) * numPlatforms); errNum = clGetPlatformIDs (numPlatforms, platformIDs, NULL); checkErr ((errNum != CL_SUCCESS) ? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS), "clGetPlatformIDs"); deviceIDs = NULL; cl_uint i; for (i = 0; i < numPlatforms; i++) { errNum = clGetDeviceIDs (platformIDs[i], CL_DEVICE_TYPE_CPU, 0, NULL, &numDevices); if (errNum != CL_SUCCESS && errNum != CL_DEVICE_NOT_FOUND) { checkErr (errNum, "clGetDeviceIDs"); } else if (numDevices > 0) { deviceIDs = (cl_device_id *)alloca(sizeof(cl_device_id) * numDevices); errNum = clGetDeviceIDs (platformIDs[i], CL_DEVICE_TYPE_CPU, numDevices, &deviceIDs[0], NULL); checkErr (errNum, "clGetDeviceIDs"); break; } } if (deviceIDs == NULL) { std::cout << "No CPU device found" << std::endl; exit (-1); } cl_context_properties contextProperties[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platformIDs[i], 0 }; context = clCreateContext (contextProperties, numDevices, deviceIDs, &contextCallback, NULL, &errNum); checkErr (errNum, "clCreateContext"); std::ifstream srcFile ("Convolution.cl"); checkErr (srcFile.is_open() ? CL_SUCCESS : -1, "reading Convolution.cl"); std::string srcProg(std::istreambuf_iterator<char>(srcFile), (std::istreambuf_iterator<char>())); const char * src = srcProg.c_str(); size_t length = srcProg.length(); program = clCreateProgramWithSource (context, 1, &src, &length, &errNum); checkErr (errNum, "clCreateProgramWithSource"); inputSignalBuffer = clCreateBuffer (context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_uint) * inputSignalHeight * inputSignalWidth, static_cast<void *>(inputSignal), &errNum); checkErr (errNum, "clCreateBuffer (inputSignal)"); maskBuffer = clCreateBuffer (context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_uint) * maskHeight * maskWidth, static_cast<void *>(mask), &errNum); checkErr (errNum, "clCreateBuffer(mask)"); outputSignalBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_uint) * outputSignalHeight * outputSignalWidth, NULL, &errNum); checkErr (errNum, "clCreateBuffer(outputSignal)"); queue = clCreateCommandQueue (context, deviceIDs[0], 0, &errNum); checkErr (errNum, "clCreateCommandQueue"); errNum = clSetKernelArg (kernel, 0, sizeof(cl_mem), &inputSignalBuffer); errNum |= clSetKernelArg (kernel, 1, sizeof(cl_mem), &maskBuffer); errNum |= clSetKernelArg (kernel, 2, sizeof(cl_mem), &outputSignalBuffer); errNum |= clSetKernelArg (kernel, 3, sizeof(cl_uint), &inputSignalWidth); errNum |= clSetKernelArg (kernel, 4, sizeof(cl_uint), &maskWidth); checkErr (errNum, "clSetKernelArg"); const size_t globalWorkSize[1] = { outputSignalWidth * outputSingalHeight }; const size_t localWorkSize[1] = { 1 }; errNum = clEnqueueNDRangeKernel ( queue, kernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL ); checkErr (errNum, "clEnqueueNDRangeKernel"); errNum = clEnqueueReadBuffer (queue, outputSignalBuffer, CL_TRUE, 0, sizeof(cl_uint) * outputSignalHeight * outputSignalHeight, outputSignal, 0, NULL, NULL); checkErr (errNum, "clEnqueueReadBuffer"); for (int y = 0; y < outputSignalHeight; y++) { for (int x = 0; x < outputSignalWidth; x++) { std::cout << outputSignal[x][y] << " "; } std::cout << std::endl; } return 0; }
// main() for simple buffer and sub-buffer example // int main(int argc, char** argv) { cl_int errNum; cl_uint numPlatforms; cl_uint numDevices; cl_platform_id * platformIDs; cl_device_id * deviceIDs; cl_context context; cl_program program; std::vector<cl_kernel> kernels; std::vector<cl_command_queue> queues; std::vector<cl_mem> buffers; int * inputOutput; std::cout << "Simple buffer and sub-buffer Example" << std::endl; // First, select an OpenCL platform to run on. errNum = clGetPlatformIDs(0, NULL, &numPlatforms); checkErr( (errNum != CL_SUCCESS) ? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS), "clGetPlatformIDs"); platformIDs = (cl_platform_id *)alloca(sizeof(cl_platform_id) * numPlatforms); std::cout << "Number of platforms: \t" << numPlatforms << std::endl; errNum = clGetPlatformIDs(numPlatforms, platformIDs, NULL); checkErr( (errNum != CL_SUCCESS) ? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS), "clGetPlatformIDs"); std::ifstream srcFile("simple.cl"); checkErr(srcFile.is_open() ? CL_SUCCESS : -1, "reading simple.cl"); std::string srcProg( std::istreambuf_iterator<char>(srcFile), (std::istreambuf_iterator<char>())); const char * src = srcProg.c_str(); size_t length = srcProg.length(); deviceIDs = NULL; DisplayPlatformInfo( platformIDs[PLATFORM_INDEX], CL_PLATFORM_VENDOR, "CL_PLATFORM_VENDOR"); errNum = clGetDeviceIDs( platformIDs[PLATFORM_INDEX], CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices); if (errNum != CL_SUCCESS && errNum != CL_DEVICE_NOT_FOUND){ checkErr(errNum, "clGetDeviceIDs"); } deviceIDs = (cl_device_id *)alloca( sizeof(cl_device_id) * numDevices); errNum = clGetDeviceIDs( platformIDs[PLATFORM_INDEX], CL_DEVICE_TYPE_ALL, numDevices, &deviceIDs[0], NULL); checkErr(errNum, "clGetDeviceIDs"); cl_context_properties contextProperties[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platformIDs[PLATFORM_INDEX], 0 }; context = clCreateContext( contextProperties, numDevices, deviceIDs, NULL, NULL, &errNum); checkErr(errNum, "clCreateContext"); // Create program from source program = clCreateProgramWithSource( context, 1, &src, &length, &errNum); checkErr(errNum, "clCreateProgramWithSource"); // Build program errNum = clBuildProgram( program, numDevices, deviceIDs, "-I.", NULL, NULL); if (errNum != CL_SUCCESS){ // Determine the reason for the error char buildLog[16384]; clGetProgramBuildInfo( program, deviceIDs[0], CL_PROGRAM_BUILD_LOG, sizeof(buildLog), buildLog, NULL); std::cerr << "Error in OpenCL C source: " << std::endl; std::cerr << buildLog; checkErr(errNum, "clBuildProgram"); } // create buffers and sub-buffers inputOutput = new int[NUM_BUFFER_ELEMENTS * numDevices]; for (unsigned int i = 0; i < NUM_BUFFER_ELEMENTS * numDevices; i++) { inputOutput[i] = i; } // create a single buffer to cover all the input data cl_mem buffer = clCreateBuffer( context, CL_MEM_READ_WRITE, sizeof(int) * NUM_BUFFER_ELEMENTS * numDevices, NULL, &errNum); checkErr(errNum, "clCreateBuffer"); buffers.push_back(buffer); // now for all devices other than the first create a sub-buffer for (unsigned int i = 1; i < numDevices; i++) { cl_buffer_region region = { NUM_BUFFER_ELEMENTS * i * sizeof(int), NUM_BUFFER_ELEMENTS * sizeof(int) }; buffer = clCreateSubBuffer( buffers[0], CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &errNum); checkErr(errNum, "clCreateSubBuffer"); buffers.push_back(buffer); } // Create command queues for (int i = 0; i < numDevices; i++) { InfoDevice<cl_device_type>::display(deviceIDs[i], CL_DEVICE_TYPE, "CL_DEVICE_TYPE"); cl_command_queue queue = clCreateCommandQueue( context, deviceIDs[i], 0, &errNum); checkErr(errNum, "clCreateCommandQueue"); queues.push_back(queue); cl_kernel kernel = clCreateKernel( program, "square", &errNum); checkErr(errNum, "clCreateKernel(square)"); errNum = clSetKernelArg( kernel, 0, sizeof(cl_mem), (void *)&buffers[i]); checkErr(errNum, "clSetKernelArg(square)"); kernels.push_back(kernel); // Write input data clEnqueueWriteBuffer( queues[0], buffers[0], CL_TRUE, 0, sizeof(int) * NUM_BUFFER_ELEMENTS * numDevices, (void*)inputOutput, 0, NULL, NULL); std::vector<cl_event> events; // call kernel for each device for (int i = 0; i < queues.size(); i++) { cl_event event; size_t gWI = NUM_BUFFER_ELEMENTS; errNum = clEnqueueNDRangeKernel( queues[i], kernels[i], 1, NULL, (const size_t*)&gWI, (const size_t*)NULL, 0, 0, &event); events.push_back(event); } // Technically don't need this as we are doing a blocking read // with in-order queue. clWaitForEvents(events.size(), events.data()); // Read back computed data clEnqueueReadBuffer( queues[0], buffers[0], CL_TRUE, 0, sizeof(int) * NUM_BUFFER_ELEMENTS * numDevices, (void*)inputOutput, 0, NULL, NULL); // Display output in rows for (unsigned i = 0; i < numDevices; i++) { for (unsigned elems = i * NUM_BUFFER_ELEMENTS; elems < ((i+1) * NUM_BUFFER_ELEMENTS); elems++) { std::cout << " " << inputOutput[elems]; } std::cout << std::endl; } std::cout << "Program completed successfully" << std::endl; return 0; } }
/// // main() for Convoloution example // int main(int argc, char** argv) { cl_int errNum; cl_uint numPlatforms; cl_uint numDevices; cl_platform_id * platformIDs; cl_device_id * deviceIDs; cl_context context = NULL; cl_command_queue queue; cl_program program; cl_kernel kernel; cl_mem inputSignalBuffer; cl_mem outputSignalBuffer; cl_mem maskBuffer; // First, select an OpenCL platform to run on. errNum = clGetPlatformIDs(0, NULL, &numPlatforms); checkErr( (errNum != CL_SUCCESS) ? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS), "clGetPlatformIDs"); platformIDs = (cl_platform_id *)alloca( sizeof(cl_platform_id) * numPlatforms); errNum = clGetPlatformIDs(numPlatforms, platformIDs, NULL); checkErr( (errNum != CL_SUCCESS) ? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS), "clGetPlatformIDs"); // Iterate through the list of platforms until we find one that supports // a CPU device, otherwise fail with an error. deviceIDs = NULL; cl_uint i; for (i = 0; i < numPlatforms; i++) { errNum = clGetDeviceIDs( platformIDs[i], CL_DEVICE_TYPE_CPU, 0, NULL, &numDevices); if (errNum != CL_SUCCESS && errNum != CL_DEVICE_NOT_FOUND) { checkErr(errNum, "clGetDeviceIDs"); } else if (numDevices > 0) { deviceIDs = (cl_device_id *)alloca(sizeof(cl_device_id) * numDevices); errNum = clGetDeviceIDs( platformIDs[i], CL_DEVICE_TYPE_CPU, numDevices, &deviceIDs[0], NULL); checkErr(errNum, "clGetDeviceIDs"); break; } } // Check to see if we found at least one CPU device, otherwise return if (deviceIDs == NULL) { std::cout << "No CPU device found" << std::endl; exit(-1); } // Next, create an OpenCL context on the selected platform. cl_context_properties contextProperties[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platformIDs[i], 0 }; context = clCreateContext( contextProperties, numDevices, deviceIDs, &contextCallback, NULL, &errNum); checkErr(errNum, "clCreateContext"); std::ifstream srcFile("../convolution/Convolution.cl"); checkErr(srcFile.is_open() ? CL_SUCCESS : -1, "reading Convolution.cl"); std::string srcProg( std::istreambuf_iterator<char>(srcFile), (std::istreambuf_iterator<char>())); const char * src = srcProg.c_str(); size_t length = srcProg.length(); // Create program from source program = clCreateProgramWithSource( context, 1, &src, &length, &errNum); checkErr(errNum, "clCreateProgramWithSource"); // Build program errNum = clBuildProgram( program, numDevices, deviceIDs, NULL, NULL, NULL); if (errNum != CL_SUCCESS) { // Determine the reason for the error char buildLog[16384]; clGetProgramBuildInfo( program, deviceIDs[0], CL_PROGRAM_BUILD_LOG, sizeof(buildLog), buildLog, NULL); std::cerr << "Error in kernel: " << std::endl; std::cerr << buildLog; checkErr(errNum, "clBuildProgram"); } // Create kernel object kernel = clCreateKernel( program, "convolve", &errNum); checkErr(errNum, "clCreateKernel"); // Now allocate buffers inputSignalBuffer = clCreateBuffer( context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_uint) * inputSignalHeight * inputSignalWidth, static_cast<void *>(inputSignal), &errNum); checkErr(errNum, "clCreateBuffer(inputSignal)"); maskBuffer = clCreateBuffer( context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_uint) * maskHeight * maskWidth, static_cast<void *>(mask), &errNum); checkErr(errNum, "clCreateBuffer(mask)"); outputSignalBuffer = clCreateBuffer( context, CL_MEM_WRITE_ONLY, sizeof(cl_uint) * outputSignalHeight * outputSignalWidth, NULL, &errNum); checkErr(errNum, "clCreateBuffer(outputSignal)"); // Pick the first device and create command queue. queue = clCreateCommandQueue( context, deviceIDs[0], 0, &errNum); checkErr(errNum, "clCreateCommandQueue"); errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputSignalBuffer); errNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &maskBuffer); errNum |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &outputSignalBuffer); errNum |= clSetKernelArg(kernel, 3, sizeof(cl_uint), &inputSignalWidth); errNum |= clSetKernelArg(kernel, 4, sizeof(cl_uint), &maskWidth); checkErr(errNum, "clSetKernelArg"); const size_t globalWorkSize[1] = { outputSignalWidth * outputSignalHeight }; const size_t localWorkSize[1] = { 1 }; // Queue the kernel up for execution across the array errNum = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL); checkErr(errNum, "clEnqueueNDRangeKernel"); errNum = clEnqueueReadBuffer( queue, outputSignalBuffer, CL_TRUE, 0, sizeof(cl_uint) * outputSignalHeight * outputSignalHeight, outputSignal, 0, NULL, NULL); checkErr(errNum, "clEnqueueReadBuffer"); // Output the result buffer for (int y = 0; y < outputSignalHeight; y++) { for (int x = 0; x < outputSignalWidth; x++) { std::cout << outputSignal[x][y] << " "; } std::cout << std::endl; } std::cout << std::endl << "Executed program succesfully." << std::endl; return 0; }