int main(void) { const size_t OUTPUT_SIZE = 5; const char *input = "PING\0"; char output[OUTPUT_SIZE]; float a = 23456.0f; int b = 2000001; try { std::vector<cl::Platform> platformList; // Pick platform cl::Platform::get(&platformList); // Pick first platform cl_context_properties cprops[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(platformList[0])(), 0}; cl::Context context(CL_DEVICE_TYPE_GPU, cprops); // Query the set of devices attched to the context std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>(); assert (devices.size() == 1); cl::Device device = devices.at(0); assert (strncmp(device.getInfo<CL_DEVICE_NAME>().c_str(), "tta", 3) == 0); a = poclu_bswap_cl_float (device(), a); b = poclu_bswap_cl_int (device(), b); // Create and program from source cl::Program::Sources sources({kernelSourceCode}); cl::Program program(context, sources); // Build program program.build(devices); cl::Buffer inputBuffer = cl::Buffer( context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, strlen (input) + 1, (void *) &input[0]); // Create buffer for that uses the host ptr C cl::Buffer outputBuffer = cl::Buffer( context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, OUTPUT_SIZE, (void *) &output[0]); // Create kernel object cl::Kernel kernel(program, "test_kernel"); // Set kernel args kernel.setArg(0, inputBuffer); kernel.setArg(1, outputBuffer); kernel.setArg(2, a); kernel.setArg(3, b); // Create command queue cl::CommandQueue queue(context, devices[0], CL_QUEUE_PROFILING_ENABLE); cl::Event enqEvent; // Do the work queue.enqueueNDRangeKernel( kernel, cl::NullRange, cl::NDRange(1), cl::NullRange, NULL, &enqEvent); cl::Event mapEvent; void *outVal = queue.enqueueMapBuffer( outputBuffer, CL_TRUE, // block CL_MAP_READ, 0, OUTPUT_SIZE, NULL, &mapEvent); char* outStr = (char*)(outVal); if (std::string(outStr) == "PONG") std::cout << "OK\n"; else std::cerr << "FAIL, received: " << outStr << "\n"; cl::Event unmapEvent; // Finally release our hold on accessing the memory queue.enqueueUnmapMemObject( outputBuffer, (void*)(outVal), NULL, &unmapEvent); queue.finish(); assert (enqEvent.getInfo<CL_EVENT_COMMAND_EXECUTION_STATUS>() == CL_COMPLETE); assert (mapEvent.getInfo<CL_EVENT_COMMAND_EXECUTION_STATUS>() == CL_COMPLETE); assert (unmapEvent.getInfo<CL_EVENT_COMMAND_EXECUTION_STATUS>() == CL_COMPLETE); assert ( enqEvent.getProfilingInfo<CL_PROFILING_COMMAND_QUEUED>() <= enqEvent.getProfilingInfo<CL_PROFILING_COMMAND_SUBMIT>()); assert ( enqEvent.getProfilingInfo<CL_PROFILING_COMMAND_SUBMIT>() <= enqEvent.getProfilingInfo<CL_PROFILING_COMMAND_START>()); assert ( enqEvent.getProfilingInfo<CL_PROFILING_COMMAND_START>() < enqEvent.getProfilingInfo<CL_PROFILING_COMMAND_END>()); #if 0 std::cerr << "exec time: " << enqEvent.getProfilingInfo<CL_PROFILING_COMMAND_END>() - enqEvent.getProfilingInfo<CL_PROFILING_COMMAND_START>() << std::endl; #endif assert ( mapEvent.getProfilingInfo<CL_PROFILING_COMMAND_QUEUED>() <= mapEvent.getProfilingInfo<CL_PROFILING_COMMAND_SUBMIT>()); assert ( mapEvent.getProfilingInfo<CL_PROFILING_COMMAND_SUBMIT>() <= mapEvent.getProfilingInfo<CL_PROFILING_COMMAND_START>()); assert ( mapEvent.getProfilingInfo<CL_PROFILING_COMMAND_START>() <= mapEvent.getProfilingInfo<CL_PROFILING_COMMAND_END>()); assert ( unmapEvent.getProfilingInfo<CL_PROFILING_COMMAND_QUEUED>() <= unmapEvent.getProfilingInfo<CL_PROFILING_COMMAND_SUBMIT>()); assert ( unmapEvent.getProfilingInfo<CL_PROFILING_COMMAND_SUBMIT>() <= unmapEvent.getProfilingInfo<CL_PROFILING_COMMAND_START>()); assert ( unmapEvent.getProfilingInfo<CL_PROFILING_COMMAND_START>() <= unmapEvent.getProfilingInfo<CL_PROFILING_COMMAND_END>()); assert (enqEvent.getProfilingInfo<CL_PROFILING_COMMAND_END>() <= mapEvent.getProfilingInfo<CL_PROFILING_COMMAND_END>()); assert (mapEvent.getProfilingInfo<CL_PROFILING_COMMAND_END>() <= unmapEvent.getProfilingInfo<CL_PROFILING_COMMAND_END>()); } catch (cl::Error err) { std::cerr << "ERROR: " << err.what() << "(" << err.err() << ")" << std::endl; return EXIT_FAILURE; } return EXIT_SUCCESS; }
int main(void) { float A[BUFFER_SIZE]; cl_int err; try { std::vector<cl::Platform> platformList; // Pick platform cl::Platform::get(&platformList); // Pick first platform cl_context_properties cprops[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(platformList[0])(), 0}; cl::Context context(CL_DEVICE_TYPE_CPU|CL_DEVICE_TYPE_GPU, cprops); // Query the set of devices attched to the context std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>(); // Create and program from source cl::Program::Sources sources(1, std::make_pair(kernelSourceCode, 0)); cl::Program program(context, sources); cl_device_id dev_id = devices.at(0)(); int scalar = poclu_bswap_cl_int (dev_id, 4); for (int i = 0; i < BUFFER_SIZE; ++i) A[i] = poclu_bswap_cl_float(dev_id, i); // Build program program.build(devices); cl::Buffer aBuffer = cl::Buffer( context, CL_MEM_COPY_HOST_PTR, BUFFER_SIZE * sizeof(float), (void *) &A[0]); cl::Buffer localBuffer = cl::Buffer( context, 0, BUFFER_SIZE * sizeof(int), NULL); // Create kernel object cl::Kernel kernel(program, "test_kernel"); // Set kernel args kernel.setArg(0, aBuffer); kernel.setArg(1, localBuffer); kernel.setArg(2, scalar); // Create command queue cl::CommandQueue queue(context, devices[0], 0); // Do the work queue.enqueueNDRangeKernel( kernel, cl::NullRange, cl::NDRange(WORK_ITEMS), cl::NullRange); // Map aBuffer to host pointer. This enforces a sync with // the host backing space, remember we choose GPU device. float * res = (float *) queue.enqueueMapBuffer( aBuffer, CL_TRUE, // block CL_MAP_READ, 0, BUFFER_SIZE * sizeof(float)); res[0] = poclu_bswap_cl_float (dev_id, res[0]); res[1] = poclu_bswap_cl_float (dev_id, res[1]); bool ok = res[0] == 8 && res[1] == 10; if (ok) { return EXIT_SUCCESS; } else { std::cout << "NOK " << res[0] << " " << res[1] << std::endl; std::cout << "res@" << std::hex << res << std::endl; return EXIT_FAILURE; } // Finally release our hold on accessing the memory err = queue.enqueueUnmapMemObject( aBuffer, (void *) res); // There is no need to perform a finish on the final unmap // or release any objects as this all happens implicitly with // the C++ Wrapper API. } catch (cl::Error err) { std::cerr << "ERROR: " << err.what() << "(" << err.err() << ")" << std::endl; return EXIT_FAILURE; } return EXIT_SUCCESS; }
int main(void) { cl_float A[BUFFER_SIZE]; cl_int R[WORK_ITEMS]; for (int i = 0; i < BUFFER_SIZE; i++) { A[i] = i; } for (int i = 0; i < WORK_ITEMS; i++) { R[i] = i; } try { std::vector<cl::Platform> platformList; // Pick platform cl::Platform::get(&platformList); // Pick first platform cl_context_properties cprops[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(platformList[0])(), 0}; cl::Context context(CL_DEVICE_TYPE_CPU | CL_DEVICE_TYPE_GPU, cprops); // Query the set of devices attched to the context std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>(); // Create and program from source cl::Program::Sources sources(1, std::make_pair(kernelSourceCode, 0)); cl::Program program(context, sources); cl_device_id dev_id = devices.at(0)(); poclu_bswap_cl_float_array(dev_id, A, BUFFER_SIZE); poclu_bswap_cl_int_array(dev_id, R, WORK_ITEMS); // Build program program.build(devices); // Create buffer for A and copy host contents cl::Buffer aBuffer = cl::Buffer( context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, BUFFER_SIZE * sizeof(float), (void *) &A[0]); // Create buffer for that uses the host ptr C cl::Buffer cBuffer = cl::Buffer( context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, WORK_ITEMS * sizeof(int), (void *) &R[0]); // Create kernel object cl::Kernel kernel(program, "test_kernel"); // Set kernel args kernel.setArg(0, aBuffer); kernel.setArg(1, cBuffer); // Create command queue cl::CommandQueue queue(context, devices[0], 0); // Do the work queue.enqueueNDRangeKernel( kernel, cl::NullRange, cl::NDRange(WORK_ITEMS), cl::NullRange); // Map cBuffer to host pointer. This enforces a sync with // the host backing space, remember we choose GPU device. int * output = (int *) queue.enqueueMapBuffer( cBuffer, CL_TRUE, // block CL_MAP_READ, 0, WORK_ITEMS * sizeof(int)); bool ok = true; for (int i = 0; i < WORK_ITEMS; i++) { float global_sum = 0.0f; int j; float result; result = global_sum; for (j=0; j < 32; ++j) { float value = poclu_bswap_cl_float (dev_id, A[i+j]); global_sum += value; } result = result + global_sum; for (j=0; j < 32; ++j) { float value = poclu_bswap_cl_float (dev_id, A[i+j]); global_sum += value; } result = result + global_sum; if ((int)result != poclu_bswap_cl_int (dev_id, R[i])) { std::cout << "F(" << i << ": " << (int)result << " != " << R[i] << ") "; ok = false; } } if (ok) return EXIT_SUCCESS; else return EXIT_FAILURE; // Finally release our hold on accessing the memory queue.enqueueUnmapMemObject( cBuffer, (void *) output); // There is no need to perform a finish on the final unmap // or release any objects as this all happens implicitly with // the C++ Wrapper API. } catch (cl::Error err) { std::cerr << "ERROR: " << err.what() << "(" << err.err() << ")" << std::endl; return EXIT_FAILURE; } return EXIT_SUCCESS; }
int main(void) { //float a = 23456.0f; float a = 3.f; // test the poclu's half conversion functions printf("through conversion: %.0f\n", poclu_cl_half_to_float(poclu_float_to_cl_half(42.0f))); try { std::vector<cl::Platform> platformList; // Pick platform cl::Platform::get(&platformList); // Pick first platform cl_context_properties cprops[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(platformList[0])(), 0}; cl::Context context(CL_DEVICE_TYPE_GPU, cprops); // Query the set of devices attched to the context std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>(); assert (devices.size() == 1); cl::Device device = devices.at(0); assert (strncmp(device.getInfo<CL_DEVICE_NAME>().c_str(), "ttasim", 6)==0 ); a = poclu_bswap_cl_float (device(), a); // Create and program from source cl::Program::Sources sources(1, std::make_pair(kernelSourceCode, 0)); cl::Program program(context, sources); // Build program program.build(devices); // Create kernel object cl::Kernel kernel(program, "test_kernel"); // Set kernel args kernel.setArg(0, a); // Create command queue cl::CommandQueue queue(context, devices[0], CL_QUEUE_PROFILING_ENABLE); cl::Event enqEvent; // Do the work queue.enqueueNDRangeKernel( kernel, cl::NullRange, cl::NDRange(8), cl::NullRange, NULL, &enqEvent); queue.finish(); assert (enqEvent.getInfo<CL_EVENT_COMMAND_EXECUTION_STATUS>() == CL_COMPLETE); assert ( enqEvent.getProfilingInfo<CL_PROFILING_COMMAND_QUEUED>() <= enqEvent.getProfilingInfo<CL_PROFILING_COMMAND_SUBMIT>()); assert ( enqEvent.getProfilingInfo<CL_PROFILING_COMMAND_SUBMIT>() <= enqEvent.getProfilingInfo<CL_PROFILING_COMMAND_START>()); assert ( enqEvent.getProfilingInfo<CL_PROFILING_COMMAND_START>() < enqEvent.getProfilingInfo<CL_PROFILING_COMMAND_END>()); #if 0 std::cerr << "exec time: " << enqEvent.getProfilingInfo<CL_PROFILING_COMMAND_END>() - enqEvent.getProfilingInfo<CL_PROFILING_COMMAND_START>() << std::endl; #endif } catch (cl::Error err) { std::cerr << "ERROR: " << err.what() << "(" << err.err() << ")" << std::endl; return EXIT_FAILURE; } return EXIT_SUCCESS; }