Beispiel #1
0
/// Tells LibOI that the image source is located in OpenGL device memory at the location
/// specified.  You must also indicate which kind of OpenGL object the location names:
///  OPENGL_FRAMEBUFFER | OPENGL_TEXTUREBUFFER | OPENGL_RENDERBUFFER
/// All subsequent CopyImageToBuffer commands will read from this location.
///
/// \param gl_device_memory  Name of the OpenGL object to share with OpenCL.
/// \param type              Kind of OpenGL object; recorded in mImageType.
///
/// The read-only cl_mem created for the object is stored in mImage_gl.  An unrecognized
/// type trips the assert below and leaves mImage_gl untouched.
void CLibOI::SetImageSource(GLuint gl_device_memory, LibOIEnums::ImageTypes type)
{
	mImageType = type;

	// The clCreateFromGL* functions report errors through a cl_int out-parameter, so
	// use exactly that type rather than a plain int.
	cl_int status = CL_SUCCESS;

	switch(type)
	{
	case LibOIEnums::OPENGL_FRAMEBUFFER:
		mImage_gl = clCreateFromGLBuffer(mOCL->GetContext(), CL_MEM_READ_ONLY, gl_device_memory, &status);
		CHECK_OPENCL_ERROR(status, "clCreateFromGLBuffer failed.");

		break;

	case LibOIEnums::OPENGL_TEXTUREBUFFER:
		// TODO: note that clCreateFromGLTexture2D/3D were deprecated in the OpenCL 1.2
		// specification, hence the version-dependent entry point below.
#if defined(DETECTED_OPENCL_1_0) || defined(DETECTED_OPENCL_1_1) || defined(DETECTED_OPENCL_UNKNOWN_VERSION)
		mImage_gl = clCreateFromGLTexture3D(mOCL->GetContext(), CL_MEM_READ_ONLY, GL_TEXTURE_3D, 0, gl_device_memory, &status);
#else
		mImage_gl = clCreateFromGLTexture(mOCL->GetContext(), CL_MEM_READ_ONLY, GL_TEXTURE_2D_ARRAY, 0, gl_device_memory, &status);
#endif // defined(DETECTED_OPENCL_1_0) || defined(DETECTED_OPENCL_1_1) || defined(DETECTED_OPENCL_UNKNOWN_VERSION)
		CHECK_OPENCL_ERROR(status, "clCreateFromGLTexture failed.");

		break;

	case LibOIEnums::OPENGL_RENDERBUFFER:
		mImage_gl = clCreateFromGLRenderbuffer(mOCL->GetContext(), CL_MEM_READ_ONLY, gl_device_memory, &status);
		CHECK_OPENCL_ERROR(status, "clCreateFromGLRenderbuffer failed.");

		break;

	default:
		// We don't know what type of image this is!
		assert(false);
		break;
	}
}
Beispiel #2
0
/// Copies host memory to a cl_mem buffer
void CLibOI::CopyImageToBuffer(float * host_mem, cl_mem cl_buffer, int width, int height, int layer)
{
	int status = CL_SUCCESS;
	int size = width *  height;

	cl_float * tmp = new cl_float[size];
	for(int i = 0; i < size; i++)
		tmp[i] = host_mem[i];

	// Enqueue a blocking write
    status = clEnqueueWriteBuffer(mOCL->GetQueue(), cl_buffer, CL_TRUE, 0, sizeof(cl_float) * size, tmp, 0, NULL, NULL);
	CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer failed.");

	delete[] tmp;
}
Beispiel #3
0
/// Copies the current image in mCLImage to the floating point buffer, image, iff the sizes match exactly.
void CLibOI::ExportImage(float * image, unsigned int width, unsigned int height, unsigned int depth)
{
	if(width != mImageWidth || height != mImageHeight || depth != mImageDepth)
		return;

	int status = CL_SUCCESS;
	size_t num_elements = mImageWidth * mImageHeight * mImageDepth;
	cl_float tmp[num_elements];
	status |= clEnqueueReadBuffer(mOCL->GetQueue(), mImage_cl, CL_TRUE, 0, num_elements * sizeof(cl_float), tmp, 0, NULL, NULL);
	CHECK_OPENCL_ERROR(status, "clEnqueueReadBuffer failed.");

	// Copy to the output buffer, converting as we go.
	for(size_t i = 0; i < num_elements; i++)
		image[i] = tmp[i];

}
Beispiel #4
0
/// Selects an OpenCL platform and device by index and creates a context and a command
/// queue on that device.
///
/// \param iPlatformNum  Zero-based index into the list returned by clGetPlatformIDs.
/// \param iDeviceNum    Zero-based index into the platform's CL_DEVICE_TYPE_ALL device list.
///
/// When either index is out of range a message is printed and construction stops early;
/// the handles not yet obtained are then NULL.
OCL_Device::OCL_Device(int iPlatformNum, int iDeviceNum)
{
	// Start from null handles so the early-return paths below never leave
	// indeterminate values behind.
	m_platform_id = NULL;
	m_device_id = NULL;
	m_context = NULL;
	m_queue = NULL;

	// For error checking
	cl_int err;

	// Get Platform Info
	cl_uint iNumPlatforms = 0;
	err = clGetPlatformIDs(0, NULL, &iNumPlatforms);
	CHECK_OPENCL_ERROR(err);

	cl_platform_id* vPlatformIDs = new cl_platform_id[iNumPlatforms];
	err = clGetPlatformIDs(iNumPlatforms, vPlatformIDs, NULL);
	CHECK_OPENCL_ERROR(err);
	// Reject negative indices explicitly instead of relying on signed-to-unsigned wraparound.
	if (iPlatformNum < 0 || static_cast<cl_uint>(iPlatformNum) >= iNumPlatforms)
	{
		// Convert before subtracting so an empty list prints -1 and matches the %d specifier.
		printf("Platform index must me between 0 and %d.\n", static_cast<int>(iNumPlatforms) - 1);
		delete[] vPlatformIDs;
		return;
	}
	m_platform_id = vPlatformIDs[iPlatformNum];
	delete[] vPlatformIDs;

	// Get Device Info
	cl_uint iNumDevices = 0;
	err = clGetDeviceIDs(m_platform_id, CL_DEVICE_TYPE_ALL, 0, NULL,
		&iNumDevices);
	CHECK_OPENCL_ERROR(err);

	cl_device_id* vDeviceIDs = new cl_device_id[iNumDevices];
	err = clGetDeviceIDs(m_platform_id, CL_DEVICE_TYPE_ALL, iNumDevices,
		vDeviceIDs, &iNumDevices);
	CHECK_OPENCL_ERROR(err);
	if (iDeviceNum < 0 || static_cast<cl_uint>(iDeviceNum) >= iNumDevices)
	{
		printf("Device index must me between 0 and %d.\n", static_cast<int>(iNumDevices) - 1);
		delete[] vDeviceIDs;
		return;
	}
	m_device_id = vDeviceIDs[iDeviceNum];
	delete[] vDeviceIDs;

	// Property list is { key, value, 0-terminator }.
	cl_context_properties vProprieties[3] = {CL_CONTEXT_PLATFORM,
		(cl_context_properties)m_platform_id, 0};
	m_context = clCreateContext(vProprieties, 1, &m_device_id, NULL, NULL,
		&err);
	CHECK_OPENCL_ERROR(err);

	// The properties argument is a bitfield, so "no properties" is 0 rather than NULL.
	m_queue = clCreateCommandQueue(m_context, m_device_id, 0, &err);
	CHECK_OPENCL_ERROR(err);

	// NOTE(review): the previous code ended with a local `char* m_sBuildOptions = "";`,
	// which declared an unused local and initialized nothing. If the class has a member
	// of that name it still needs initializing; confirm against the class declaration.
}
int
BoxFilterSeparable::cleanup()
{
    if(!byteRWSupport)
    {
        return SDK_SUCCESS;
    }

    // Releases OpenCL resources (Context, Memory etc.)
    cl_int status;

    status = clReleaseKernel(verticalKernel);
    CHECK_OPENCL_ERROR(status, "clReleaseKernel failed.(vertical)");

    status = clReleaseKernel(horizontalKernel);
    CHECK_OPENCL_ERROR(status, "clReleaseKernel failed.(Horizontal)");

    status = clReleaseProgram(program);
    CHECK_OPENCL_ERROR(status, "clReleaseProgram failed.");

    status = clReleaseMemObject(inputImageBuffer);
    CHECK_OPENCL_ERROR(status, "clReleaseMemObject failed.");

    status = clReleaseMemObject(outputImageBuffer);
    CHECK_OPENCL_ERROR(status, "clReleaseMemObject failed.");

    status = clReleaseMemObject(tempImageBuffer);
    CHECK_OPENCL_ERROR(status, "clReleaseMemObject failed.");

    status = clReleaseCommandQueue(commandQueue);
    CHECK_OPENCL_ERROR(status, "clReleaseCommandQueue failed.");

    status = clReleaseContext(context);
    CHECK_OPENCL_ERROR(status, "clReleaseContext failed.");

    // release program resources (input memory etc.)
    FREE(inputImageData);
    FREE(outputImageData);
    FREE(verificationOutput);
    FREE(devices);

    return SDK_SUCCESS;
}
Beispiel #6
0
// Maps the first sizeInBytes bytes of deviceBuffer with a blocking clEnqueueMapBuffer
// call on commandQueue and stores the resulting address in hostPointer.
// The map status is handed to CHECK_OPENCL_ERROR; returns SDK_SUCCESS afterwards.
// NOTE(review): T is presumably a template parameter declared outside this excerpt - confirm.
int ComputeBench::mapBuffer(cl_mem deviceBuffer, T* &hostPointer,
        size_t sizeInBytes, cl_map_flags flags)
{
    cl_int err;
    void* mapped = clEnqueueMapBuffer(commandQueue, deviceBuffer, CL_TRUE, flags,
                                      0, sizeInBytes, 0, NULL, NULL, &err);

    // Publish the pointer before checking the status, as callers observe it either way.
    hostPointer = static_cast<T*>(mapped);
    CHECK_OPENCL_ERROR(err, "clEnqueueMapBuffer failed");

    return SDK_SUCCESS;
}
Beispiel #7
0
void CLHelper::printAllPlatformsAndDevices()
{
	cl_int err;

	std::vector<cl::Platform> platforms;
	err = cl::Platform::get(&platforms);
	CHECK_OPENCL_ERROR(err, "cl::Platform::get() failed.");

	std::cout << std::endl;
	std::cout << "Listing platform vendors and devices" << std::endl;
	std::cout << "===========================================" << std::endl;

	std::vector<cl::Platform>::iterator platform;
	for(platform = platforms.begin(); platform != platforms.end(); platform++) {
		CLHelper::printVendor(*platform);
		CLHelper::printDevices(*platform, CL_DEVICE_TYPE_ALL);
		std::cout << "===========================================" << std::endl;
	}
}
int AtomicCounters::cleanup() {
  // Releases OpenCL resources (Context, Memory etc.)
  cl_int status;
  status = clReleaseMemObject(inBuf);
  CHECK_OPENCL_ERROR(status, "clReleaseMemObject(inBuf) failed.");
  status = clReleaseMemObject(counterOutBuf);
  CHECK_OPENCL_ERROR(status, "clReleaseMemObject(counterOutBuf) failed.");
  status = clReleaseMemObject(globalOutBuf);
  CHECK_OPENCL_ERROR(status, "clReleaseMemObject(globalOutBuf) failed.");
  status = clReleaseKernel(counterKernel);
  CHECK_OPENCL_ERROR(status, "clReleaseKernel(counterKernel) failed.");
  status = clReleaseKernel(globalKernel);
  CHECK_OPENCL_ERROR(status, "clReleaseKernel(globalKernel) failed.");
  status = clReleaseProgram(program);
  CHECK_OPENCL_ERROR(status, "clReleaseProgram(program) failed.");
  status = clReleaseCommandQueue(commandQueue);
  CHECK_OPENCL_ERROR(status, "clReleaseCommandQueue(commandQueue) failed.");
  status = clReleaseContext(context);
  CHECK_OPENCL_ERROR(status, "clReleaseContext(context) failed.");
  free(input);
  return SDK_SUCCESS;
}
// Releases the OpenCL objects (kernel, program, buffers, queue, context) and the host
// allocations used by the sample. Each release result is handed to CHECK_OPENCL_ERROR.
// Returns SDK_SUCCESS once every step has run.
int 
MatrixMulImage::cleanup()
{
    // Releases OpenCL resources (Context, Memory etc.)
    cl_int status;

    status = clReleaseKernel(kernel);
    CHECK_OPENCL_ERROR(status, "clReleaseKernel failed.(kernel)");
    
    status = clReleaseProgram(program);
    CHECK_OPENCL_ERROR(status, "clReleaseProgram failed.(program)");
   
    status = clReleaseMemObject(inputBuffer0);
    CHECK_OPENCL_ERROR(status, "clReleaseMemObject failed.(inputBuffer0)");
    
    status = clReleaseMemObject(inputBuffer1);
    CHECK_OPENCL_ERROR(status, "clReleaseMemObject failed.(inputBuffer1)");
    
    // The message must name the call that actually failed: this is a memory-object
    // release, not a command-queue release.
    status = clReleaseMemObject(outputBuffer);
    CHECK_OPENCL_ERROR(status, "clReleaseMemObject failed.(outputBuffer)");
    
    status = clReleaseCommandQueue(commandQueue);
    CHECK_OPENCL_ERROR(status, "clReleaseCommandQueue failed.(commandQueue)");
    
    status = clReleaseContext(context);
    CHECK_OPENCL_ERROR(status, "clReleaseContext failed.(context)");
    
    // release program resources (input memory etc.)
    FREE(input0);

    FREE(input1);

    FREE(output);

    FREE(verificationOutput);

    // release device list
    FREE(devices);


    return SDK_SUCCESS;
}
Beispiel #10
0
/// Builds `program` for the given devices and, on failure, prints each device's build log
/// before handing the error code to CHECK_OPENCL_ERROR.
///
/// \param program     Program object to build.
/// \param devices     Devices to build for; also the devices whose logs are printed on failure.
/// \param options     Build option string passed through to cl::Program::build.
/// \param notifyFptr  Optional build-completion callback, forwarded to cl::Program::build.
/// \param data        User data forwarded with notifyFptr.
///
/// Per the OpenCL specification, supplying a non-NULL callback allows the build call to
/// return before the build has finished; callers that pass one must wait for it before
/// using the program.
void CLHelper::compileProgram(
	cl::Program& program,
	std::vector<cl::Device>& devices,
	const char* options,
	void (CL_CALLBACK * notifyFptr)(cl_program, void *),
	void* data)
{
	// Forward the caller's callback and user data; they were previously accepted by
	// this function but replaced with NULL, so the callback could never fire.
	cl_int err = program.build(devices, options, notifyFptr, data);
	if(err != CL_SUCCESS) {
		std::cout << "Build error! Showing build log:" << std::endl << std::endl;

		std::string errorLog;
		for(std::vector<cl::Device>::iterator device = devices.begin(); device != devices.end(); ++device)
		{
			errorLog = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(*device);
			std::cout << errorLog << std::endl;
		}
		CHECK_OPENCL_ERROR(err, "cl::Program::build() failed.");
	}
}
// Builds the OpenCL program from "<name>_Kernel.cl", creates the kernel called `name`
// from it (stored in kernl) and records its work-group information in kernelInfo.
//
// name: kernel entry-point name; also the prefix of the kernel source file name.
// Returns SDK_SUCCESS when every step succeeds; failures are handled by CHECK_ERROR /
// CHECK_OPENCL_ERROR.
int	MotionDetector::setupKernel(std::string name){

	cl_int status = CL_SUCCESS;

	// create a CL program using the kernel source
	buildProgramData buildData;

	buildData.kernelName = std::string(name+"_Kernel.cl");
	buildData.devices = devices;
	buildData.deviceId = sampleArgs->deviceId;
	buildData.flagsStr = std::string("");
	if (sampleArgs->isLoadBinaryEnabled())
	{
		buildData.binaryName = std::string(sampleArgs->loadBinary.c_str());
	}

	if (sampleArgs->isComplierFlagsSpecified())
	{
		buildData.flagsFileName = std::string(sampleArgs->flags.c_str());
	}

	int retValue = buildOpenCLProgram(program, context, buildData);
	// Compare against the named success code, as the other checks in this function do.
	CHECK_ERROR(retValue, SDK_SUCCESS, "buildOpenCLProgram() failed");

	// get a kernel object handle for a kernel with the given name
	// c_str() is guaranteed to be NUL-terminated (and valid for an empty string),
	// which &name[0] is not before C++11.
	kernl = clCreateKernel(
		program,
		name.c_str(),
		&status);
	CHECK_OPENCL_ERROR(status, "clCreateKernel failed.");

	status = kernelInfo.setKernelWorkGroupInfo(kernl, devices[sampleArgs->deviceId]);
	CHECK_ERROR(status, SDK_SUCCESS, "kernelInfo.setKernelWorkGroupInfo() failed");

	return SDK_SUCCESS;
}
int DwtHaar1D::cleanup()
{
    // Releases OpenCL resources (Context, Memory etc.)
    cl_int status;

    status = clReleaseMemObject(inDataBuf);
    CHECK_OPENCL_ERROR(status, "clReleaseMemObject failed.(inDataBuf)");

    status = clReleaseMemObject(dOutDataBuf);
    CHECK_OPENCL_ERROR(status, "clReleaseMemObject failed.(dOutDataBuf)");

    status = clReleaseMemObject(dPartialOutDataBuf);
    CHECK_OPENCL_ERROR(status, "clReleaseMemObject failed.(dPartialOutDataBuf)");


    status = clReleaseKernel(kernel);
    CHECK_OPENCL_ERROR(status, "clReleaseKernel failed.(kernel)");

    status = clReleaseProgram(program);
    CHECK_OPENCL_ERROR(status, "clReleaseProgram failed.(program)");

    status = clReleaseCommandQueue(commandQueue);
    CHECK_OPENCL_ERROR(status, "clReleaseCommandQueue failed.(commandQueue)");

    status = clReleaseContext(context);
    CHECK_OPENCL_ERROR(status, "clReleaseContext failed.(context)");

    // Release program resources (input memory etc.)
    FREE(inData);
    FREE(dOutData);
    FREE(dPartialOutData);
    FREE(hOutData);
    FREE(devices);

    return SDK_SUCCESS;
}
int 
ConstantBandwidth::cleanup()
{
    // Releases OpenCL resources (Context, Memory etc.)
    cl_int status;

    for(int i = 0; i < NUM_KERNELS; i++)
    {
        status = clReleaseKernel(kernel[i]);
        CHECK_OPENCL_ERROR(status, "clReleaseKernel failed.");
    }

    status = clReleaseProgram(program);
    CHECK_OPENCL_ERROR(status, "clReleaseProgram failed.");
 
    status = clReleaseMemObject(constantBuffer);
    CHECK_OPENCL_ERROR(status, "clReleaseMemObject failed.");

    status = clReleaseMemObject(outputBuffer);
    CHECK_OPENCL_ERROR(status, "clReleaseMemObject failed.");

    status = clReleaseCommandQueue(commandQueue);
     CHECK_OPENCL_ERROR(status, "clReleaseCommandQueue failed.");

    status = clReleaseContext(context);
    CHECK_OPENCL_ERROR(status, "clReleaseContext failed.");

    // release program resources (input memory etc.)
    FREE(input);

    FREE(output);

    FREE(verificationOutput);
	

    // release device list
   FREE(devices);

    return SDK_SUCCESS;
}
int
MersenneTwister::cleanup()
{
    // Releases OpenCL resources
    cl_int status;

    status = clReleaseMemObject(seedsBuf);
    CHECK_OPENCL_ERROR(status, "clReleaseMemObject failed.(seedsBuf)");

    status = clReleaseMemObject(resultBuf);
    CHECK_OPENCL_ERROR(status, "clReleaseMemObject failed.(resultBuf)");

    status = clReleaseKernel(kernel);
    CHECK_OPENCL_ERROR(status, "clReleaseKernel failed.(kernel)");

    status = clReleaseProgram(program);
    CHECK_OPENCL_ERROR(status, "clReleaseProgram failed.(program)");

    status = clReleaseCommandQueue(commandQueue);
    CHECK_OPENCL_ERROR(status, "clReleaseCommandQueue failed.(commandQueue)");

    status = clReleaseContext(context);
    CHECK_OPENCL_ERROR(status, "clReleaseContext failed.(context)");

    // Release program resources
    FREE(deviceResult);

#if defined (_WIN32)
    ALIGNED_FREE(seeds);
#else
    FREE(seeds);
#endif

    FREE(devices);

    return SDK_SUCCESS;
}
// Sets up the OpenCL state for the sample, in order: picks the device type and platform,
// creates the context, validates the device (required extensions and OpenCL C version),
// prepares the host data, creates the command queue and the three device buffers,
// uploads the input array, builds the program, creates both kernels and clamps the
// requested work-group sizes to what each kernel supports.
//
// Returns SDK_SUCCESS at the end, and SDK_FAILURE when setupAtomicCounters() fails.
// Other failures are routed through CHECK_ERROR / CHECK_OPENCL_ERROR /
// OPENCL_EXPECTED_ERROR (presumably early returns - confirm against the macro definitions).
int AtomicCounters::setupCL(void) {
  cl_int status = 0;
  cl_device_type dType;
  // Device type comes from the sample arguments: "cpu" selects the CPU, anything else
  // selects the GPU with a CPU fallback when no GPU is reported.
  if (sampleArgs->deviceType.compare("cpu") == 0) {
    dType = CL_DEVICE_TYPE_CPU;
  } else  // deviceType = "gpu"
  {
    dType = CL_DEVICE_TYPE_GPU;
    if (sampleArgs->isThereGPU() == false) {
      std::cout << "GPU not found. Falling back to CPU" << std::endl;
      dType = CL_DEVICE_TYPE_CPU;
    }
  }
  cl_platform_id platform = NULL;
  int retValue = getPlatform(platform, sampleArgs->platformId,
                             sampleArgs->isPlatformEnabled());
  CHECK_ERROR(retValue, SDK_SUCCESS, "getPlatform() failed.");
  // Display available devices.
  retValue = displayDevices(platform, dType);
  CHECK_ERROR(retValue, SDK_SUCCESS, "displayDevices() failed.");
  // Context property list is { key, value, 0-terminator }.
  cl_context_properties cps[3] = {CL_CONTEXT_PLATFORM,
                                  (cl_context_properties)platform, 0};
  context = clCreateContextFromType(cps, dType, NULL, NULL, &status);
  CHECK_OPENCL_ERROR(status, "clCreateContextFromType failed.");
  // getting device on which to run the sample
  status = getDevices(context, &devices, sampleArgs->deviceId,
                      sampleArgs->isDeviceIdEnabled());
  CHECK_ERROR(status, SDK_SUCCESS, "getDevices() failed ");
  // Set device info of given cl_device_id
  retValue = deviceInfo.setDeviceInfo(devices[sampleArgs->deviceId]);
  CHECK_ERROR(retValue, SDK_SUCCESS, "SDKDeviceInfo::setDeviceInfo() failed");
  // Check device extensions: both must appear in the device's extension string.
  if (!strstr(deviceInfo.extensions, "cl_ext_atomic_counters_32")) {
    OPENCL_EXPECTED_ERROR(
        "Device does not support cl_ext_atomic_counters_32 extension!");
  }
  if (!strstr(deviceInfo.extensions, "cl_khr_local_int32_base_atomics")) {
    OPENCL_EXPECTED_ERROR(
        "Device does not support cl_khr_local_int32_base_atomics extension!");
  }
  // Get OpenCL device version: take the token between the first and second space of
  // the device version string. This value is not read before vStrVal is reassigned below.
  std::string deviceVersionStr = std::string(deviceInfo.deviceVersion);
  size_t vStart = deviceVersionStr.find(" ", 0);
  size_t vEnd = deviceVersionStr.find(" ", vStart + 1);
  std::string vStrVal = deviceVersionStr.substr(vStart + 1, vEnd - vStart - 1);
// Check of OPENCL_C_VERSION if device version is 1.1 or later
#ifdef CL_VERSION_1_1
  if (deviceInfo.openclCVersion) {
    // Exit if OpenCL C device version is 1.0
    // Here the token between the second and third space is taken, and compared
    // lexicographically (as a string) against "1.0".
    deviceVersionStr = std::string(deviceInfo.openclCVersion);
    vStart = deviceVersionStr.find(" ", 0);
    vStart = deviceVersionStr.find(" ", vStart + 1);
    vEnd = deviceVersionStr.find(" ", vStart + 1);
    vStrVal = deviceVersionStr.substr(vStart + 1, vEnd - vStart - 1);
    if (vStrVal.compare("1.0") <= 0) {
      OPENCL_EXPECTED_ERROR(
          "Unsupported device! Required CL_DEVICE_OPENCL_C_VERSION as 1.1");
    }
  } else {
    OPENCL_EXPECTED_ERROR(
        "Unsupported device! Required CL_DEVICE_OPENCL_C_VERSION as 1.1");
  }
#else
  // Headers older than OpenCL 1.1: always reported as unsupported.
  OPENCL_EXPECTED_ERROR(
      "Unsupported device! Required CL_DEVICE_OPENCL_C_VERSION as 1.1");
#endif
  // Setup application data
  if (setupAtomicCounters() != SDK_SUCCESS) {
    return SDK_FAILURE;
  }
  // Profiling is enabled on the queue.
  cl_command_queue_properties props = CL_QUEUE_PROFILING_ENABLE;
  commandQueue = clCreateCommandQueue(context, devices[sampleArgs->deviceId],
                                      props, &status);
  CHECK_OPENCL_ERROR(status, "clCreateCommandQueue failed(commandQueue)");
  // Set Persistent memory only for AMD platform
  cl_mem_flags inMemFlags = CL_MEM_READ_ONLY;
  if (sampleArgs->isAmdPlatform()) {
    inMemFlags |= CL_MEM_USE_PERSISTENT_MEM_AMD;
  }
  // Create buffer for input array
  inBuf = clCreateBuffer(context, inMemFlags, length * sizeof(cl_uint), NULL,
                         &status);
  CHECK_OPENCL_ERROR(status, "clCreateBuffer failed.(inBuf)");
  // Set up data for input array: non-blocking upload (CL_FALSE); writeEvt is waited
  // on at the end of this function.
  cl_event writeEvt;
  status =
      clEnqueueWriteBuffer(commandQueue, inBuf, CL_FALSE, 0,
                           length * sizeof(cl_uint), input, 0, NULL, &writeEvt);
  CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer(inBuf) failed..");
  status = clFlush(commandQueue);
  CHECK_OPENCL_ERROR(status, "clFlush(commandQueue) failed.");
  // Each output buffer holds a single cl_uint.
  counterOutBuf = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_uint),
                                 NULL, &status);
  CHECK_OPENCL_ERROR(status, "clCreateBuffer failed.(counterOutBuf).");
  globalOutBuf = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_uint),
                                NULL, &status);
  CHECK_OPENCL_ERROR(status, "clCreateBuffer failed.(globalOutBuf).");
  // create a CL program using the kernel source
  buildProgramData buildData;
  buildData.kernelName = std::string("AtomicCounters_Kernels.cl");
  buildData.devices = devices;
  buildData.deviceId = sampleArgs->deviceId;
  buildData.flagsStr = std::string("");
  if (sampleArgs->isLoadBinaryEnabled()) {
    buildData.binaryName = std::string(sampleArgs->loadBinary.c_str());
  }
  if (sampleArgs->isComplierFlagsSpecified()) {
    buildData.flagsFileName = std::string(sampleArgs->flags.c_str());
  }
  retValue = buildOpenCLProgram(program, context, buildData);
  CHECK_ERROR(retValue, SDK_SUCCESS, "buildOpenCLProgram() failed");
  // Create the two kernels from the program: "atomicCounters" and "globalAtomics".
  counterKernel = clCreateKernel(program, "atomicCounters", &status);
  CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(counterKernel).");
  globalKernel = clCreateKernel(program, "globalAtomics", &status);
  CHECK_OPENCL_ERROR(status, "clCreateKernel(globalKernel) failed.");
  // Query per-kernel work-group information for the selected device.
  status = kernelInfoC.setKernelWorkGroupInfo(counterKernel,
                                              devices[sampleArgs->deviceId]);
  CHECK_OPENCL_ERROR(status, "kernelInfo.setKernelWorkGroupInfo failed");
  status = kernelInfoG.setKernelWorkGroupInfo(globalKernel,
                                              devices[sampleArgs->deviceId]);
  CHECK_OPENCL_ERROR(status, "kernelInfo.setKernelWorkGroupInfo failed");
  // Clamp the requested work-group size of the counter kernel to the kernel's maximum,
  // reporting the fallback unless running quietly.
  if (counterWorkGroupSize > kernelInfoC.kernelWorkGroupSize) {
    if (!sampleArgs->quiet) {
      std::cout << "Out of Resources!" << std::endl;
      std::cout << "Group Size specified : " << counterWorkGroupSize
                << std::endl;
      std::cout << "Max Group Size supported on the kernel(readKernel) : "
                << kernelInfoC.kernelWorkGroupSize << std::endl;
      std::cout << "Falling back to " << kernelInfoC.kernelWorkGroupSize
                << std::endl;
    }
    counterWorkGroupSize = kernelInfoC.kernelWorkGroupSize;
  }
  // Same clamp for the global-atomics kernel.
  if (globalWorkGroupSize > kernelInfoG.kernelWorkGroupSize) {
    if (!sampleArgs->quiet) {
      std::cout << "Out of Resources!" << std::endl;
      std::cout << "Group Size specified : " << globalWorkGroupSize
                << std::endl;
      std::cout << "Max Group Size supported on the kernel(writeKernel) : "
                << kernelInfoG.kernelWorkGroupSize << std::endl;
      std::cout << "Falling back to " << kernelInfoG.kernelWorkGroupSize
                << std::endl;
    }
    globalWorkGroupSize = kernelInfoG.kernelWorkGroupSize;
  }
  // Wait for the input upload enqueued above, then release its event.
  status = waitForEventAndRelease(&writeEvt);
  CHECK_OPENCL_ERROR(status, "waitForEventAndRelease(writeEvt) failed.");
  return SDK_SUCCESS;
}
int AtomicCounters::runGlobalAtomicKernel() {
  cl_int status = CL_SUCCESS;
  // Set Global and Local work items
  size_t globalWorkItems = length;
  size_t localWorkItems = globalWorkGroupSize;
  // Initialize the counter value
  cl_event writeEvt;
  status =
      clEnqueueWriteBuffer(commandQueue, globalOutBuf, CL_FALSE, 0,
                           sizeof(cl_uint), &initValue, 0, NULL, &writeEvt);
  CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer(globalOutBuf) failed.");
  status = clFlush(commandQueue);
  CHECK_OPENCL_ERROR(status, "clFlush() failed.");
  // Wait for event and release event
  status = waitForEventAndRelease(&writeEvt);
  CHECK_OPENCL_ERROR(status, "waitForEventAndRelease(writeEvt) failed.");
  // Set kernel arguments
  status = clSetKernelArg(globalKernel, 0, sizeof(cl_mem), &inBuf);
  CHECK_OPENCL_ERROR(status, "clSetKernelArg(inBuf) failed.");
  status = clSetKernelArg(globalKernel, 1, sizeof(cl_uint), &value);
  CHECK_OPENCL_ERROR(status, "clSetKernelArg(value) failed.");
  status = clSetKernelArg(globalKernel, 2, sizeof(cl_mem), &globalOutBuf);
  CHECK_OPENCL_ERROR(status, "clSetKernelArg(globalOutBuf) failed.");
  // Run Kernel
  cl_event ndrEvt;
  status = clEnqueueNDRangeKernel(commandQueue, globalKernel, 1, NULL,
                                  &globalWorkItems, &localWorkItems, 0, NULL,
                                  &ndrEvt);
  CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel(globalKernel) failed.");
  status = clFlush(commandQueue);
  CHECK_OPENCL_ERROR(status, "clFlush(commandQueue) failed.");
  cl_int eventStatus = CL_QUEUED;
  while (eventStatus != CL_COMPLETE) {
    status = clGetEventInfo(ndrEvt, CL_EVENT_COMMAND_EXECUTION_STATUS,
                            sizeof(cl_int), &eventStatus, NULL);
    CHECK_OPENCL_ERROR(status, "clGetEventInfo(ndrEvt) failed.");
  }
  cl_ulong startTime;
  cl_ulong endTime;
  // Get profiling information
  status = clGetEventProfilingInfo(ndrEvt, CL_PROFILING_COMMAND_START,
                                   sizeof(cl_ulong), &startTime, NULL);
  CHECK_OPENCL_ERROR(
      status, "clGetEventProfilingInfo(CL_PROFILING_COMMAND_START) failed.");
  status = clGetEventProfilingInfo(ndrEvt, CL_PROFILING_COMMAND_END,
                                   sizeof(cl_ulong), &endTime, NULL);
  CHECK_OPENCL_ERROR(
      status, "clGetEventProfilingInfo(CL_PROFILING_COMMAND_END) failed.");
  double sec = 1e-9 * (endTime - startTime);
  kTimeAtomGlobal += sec;
  status = clReleaseEvent(ndrEvt);
  CHECK_OPENCL_ERROR(status, "clReleaseEvent(ndrEvt) failed.");
  // Get the occurrences of Value from atomicKernel
  cl_event readEvt;
  status = clEnqueueReadBuffer(commandQueue, globalOutBuf, CL_FALSE, 0,
                               sizeof(cl_uint), &globalOut, 0, NULL, &readEvt);
  CHECK_OPENCL_ERROR(status, "clEnqueueReadBuffer(globalOutBuf) failed.");
  status = clFlush(commandQueue);
  CHECK_OPENCL_ERROR(status, "clFlush() failed.");
  // Wait for event and release event
  status = waitForEventAndRelease(&readEvt);
  CHECK_OPENCL_ERROR(status, "waitForEventAndRelease(readEvt) failed.");
  return SDK_SUCCESS;
}
Beispiel #17
0
void CLHelper::DeviceInfo::setDeviceInfo(cl::Device device) {
    cl_int err = CL_SUCCESS;

    //Get device type
    err = clGetDeviceInfo(
                    device(),
                    CL_DEVICE_TYPE,
                    sizeof(cl_device_type),
                    &dType,
                    NULL);
    CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_TYPE) failed");

    //Get vender ID
    err = clGetDeviceInfo(
                    device(),
                    CL_DEVICE_VENDOR_ID,
                    sizeof(cl_uint),
                    &venderId,
                    NULL);
    CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_VENDOR_ID) failed");

    //Get max compute units
    err = clGetDeviceInfo(
                    device(),
                    CL_DEVICE_MAX_COMPUTE_UNITS,
                    sizeof(cl_uint),
                    &maxComputeUnits,
                    NULL);
    CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_MAX_COMPUTE_UNITS) failed");

    //Get max work item dimensions
    err = clGetDeviceInfo(
                    device(),
                    CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,
                    sizeof(cl_uint),
                    &maxWorkItemDims,
                    NULL);
    CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS) failed");

    //Get max work item sizes
    delete maxWorkItemSizes;
    maxWorkItemSizes = new size_t[maxWorkItemDims];

    err = clGetDeviceInfo(
                    device(),
                    CL_DEVICE_MAX_WORK_ITEM_SIZES,
                    maxWorkItemDims * sizeof(size_t),
                    maxWorkItemSizes,
                    NULL);
    CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS) failed");

    // Maximum work group size
    err = clGetDeviceInfo(
                    device(),
                    CL_DEVICE_MAX_WORK_GROUP_SIZE,
                    sizeof(size_t),
                    &maxWorkGroupSize,
                    NULL);
   CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_MAX_WORK_GROUP_SIZE) failed");

    // Preferred vector sizes of all data types
    err = clGetDeviceInfo(
                    device(),
                    CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR,
                    sizeof(cl_uint),
                    &preferredCharVecWidth,
                    NULL);
    CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR) failed");

    err = clGetDeviceInfo(
                    device(),
                    CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT,
                    sizeof(cl_uint),
                    &preferredShortVecWidth,
                    NULL);
    CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT) failed");

    err = clGetDeviceInfo(
                    device(),
                    CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT,
                    sizeof(cl_uint),
                    &preferredIntVecWidth,
                    NULL);
    CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT) failed");

    err = clGetDeviceInfo(
                    device(),
                    CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG,
                    sizeof(cl_uint),
                    &preferredLongVecWidth,
                    NULL);
    CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG) failed");

    err = clGetDeviceInfo(
                    device(),
                    CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT,
                    sizeof(cl_uint),
                    &preferredFloatVecWidth,
                    NULL);
    CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT) failed");

    err = clGetDeviceInfo(
                    device(),
                    CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE,
                    sizeof(cl_uint),
                    &preferredDoubleVecWidth,
                    NULL);
    CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE) failed");

    err = clGetDeviceInfo(
                    device(),
                    CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF,
                    sizeof(cl_uint),
                    &preferredHalfVecWidth,
                    NULL);
    CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF) failed");

    // Clock frequency
    err = clGetDeviceInfo(
                    device(),
                    CL_DEVICE_MAX_CLOCK_FREQUENCY,
                    sizeof(cl_uint),
                    &maxClockFrequency,
                    NULL);
    CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_MAX_CLOCK_FREQUENCY) failed");

    // Address bits
    err = clGetDeviceInfo(
                    device(),
                    CL_DEVICE_ADDRESS_BITS,
                    sizeof(cl_uint),
                    &addressBits,
                    NULL);
    CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_ADDRESS_BITS) failed");

    // Maximum memory alloc size
    err = clGetDeviceInfo(
                    device(),
                    CL_DEVICE_MAX_MEM_ALLOC_SIZE,
                    sizeof(cl_ulong),
                    &maxMemAllocSize,
                    NULL);
    CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_MAX_MEM_ALLOC_SIZE) failed");

    // Image support
    err = clGetDeviceInfo(
                    device(),
                    CL_DEVICE_IMAGE_SUPPORT,
                    sizeof(cl_bool),
                    &imageSupport,
                    NULL);
    CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_IMAGE_SUPPORT) failed");

    // Maximum read image arguments
    err = clGetDeviceInfo(
                    device(),
                    CL_DEVICE_MAX_READ_IMAGE_ARGS,
                    sizeof(cl_uint),
                    &maxReadImageArgs,
                    NULL);
    CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_MAX_READ_IMAGE_ARGS) failed");

    // Maximum write image arguments
    err = clGetDeviceInfo(
                    device(),
                    CL_DEVICE_MAX_WRITE_IMAGE_ARGS,
                    sizeof(cl_uint),
                    &maxWriteImageArgs,
                    NULL);
    CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_MAX_WRITE_IMAGE_ARGS) failed");

    // 2D image and 3D dimensions
    err = clGetDeviceInfo(
                    device(),
                    CL_DEVICE_IMAGE2D_MAX_WIDTH,
                    sizeof(size_t),
                    &image2dMaxWidth,
                    NULL);
    CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_IMAGE2D_MAX_WIDTH) failed");

    err = clGetDeviceInfo(
                    device(),
                    CL_DEVICE_IMAGE2D_MAX_HEIGHT,
                    sizeof(size_t),
                    &image2dMaxHeight,
                    NULL);
    CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_IMAGE2D_MAX_HEIGHT) failed");

    err = clGetDeviceInfo(
                    device(),
                    CL_DEVICE_IMAGE3D_MAX_WIDTH,
                    sizeof(size_t),
                    &image3dMaxWidth,
                    NULL);
    CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_IMAGE3D_MAX_WIDTH) failed");

    err = clGetDeviceInfo(
                    device(),
                    CL_DEVICE_IMAGE3D_MAX_HEIGHT,
                    sizeof(size_t),
                    &image3dMaxHeight,
                    NULL);
    CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_IMAGE3D_MAX_HEIGHT) failed");

    err = clGetDeviceInfo(
                    device(),
                    CL_DEVICE_IMAGE3D_MAX_DEPTH,
                    sizeof(size_t),
                    &image3dMaxDepth,
                    NULL);
    CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_IMAGE3D_MAX_DEPTH) failed");

    // Maximum samplers
    err = clGetDeviceInfo(
                    device(),
                    CL_DEVICE_MAX_SAMPLERS,
                    sizeof(cl_uint),
                    &maxSamplers,
                    NULL);
    CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_MAX_SAMPLERS) failed");

    // Maximum parameter size
    err = clGetDeviceInfo(
                    device(),
                    CL_DEVICE_MAX_PARAMETER_SIZE,
                    sizeof(size_t),
                    &maxParameterSize,
                    NULL);
    CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_MAX_PARAMETER_SIZE) failed");

    // Memory base address align
    err = clGetDeviceInfo(
                    device(),
                    CL_DEVICE_MEM_BASE_ADDR_ALIGN,
                    sizeof(cl_uint),
                    &memBaseAddressAlign,
                    NULL);
    CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_MEM_BASE_ADDR_ALIGN) failed");

    // Minimum data type align size
    err = clGetDeviceInfo(
                    device(),
                    CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE,
                    sizeof(cl_uint),
                    &minDataTypeAlignSize,
                    NULL);
    CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE) failed");

    // Single precision floating point configuration
    err = clGetDeviceInfo(
                    device(),
                    CL_DEVICE_SINGLE_FP_CONFIG,
                    sizeof(cl_device_fp_config),
                    &singleFpConfig,
                    NULL);
    CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_SINGLE_FP_CONFIG) failed");

    // Double precision floating point configuration
    err = clGetDeviceInfo(
                    device(),
                    CL_DEVICE_DOUBLE_FP_CONFIG,
                    sizeof(cl_device_fp_config),
                    &doubleFpConfig,
                    NULL);
    CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_DOUBLE_FP_CONFIG) failed");

    // Global memory cache type
    err = clGetDeviceInfo(
                    device(),
                    CL_DEVICE_GLOBAL_MEM_CACHE_TYPE,
                    sizeof(cl_device_mem_cache_type),
                    &globleMemCacheType,
                    NULL);
    CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_GLOBAL_MEM_CACHE_TYPE) failed");

    // Global memory cache line size
    err = clGetDeviceInfo(
                    device(),
                    CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE,
                    sizeof(cl_uint),
                    &globalMemCachelineSize,
                    NULL);
    CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE) failed");

    // Global memory cache size
    err = clGetDeviceInfo(
                    device(),
                    CL_DEVICE_GLOBAL_MEM_CACHE_SIZE,
                    sizeof(cl_ulong),
                    &globalMemCacheSize,
                    NULL);
    CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE) failed");

    // Global memory size
    err = clGetDeviceInfo(
                    device(),
                    CL_DEVICE_GLOBAL_MEM_SIZE,
                    sizeof(cl_ulong),
                    &globalMemSize,
                    NULL);
    CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_GLOBAL_MEM_SIZE) failed");

    // Maximum constant buffer size
    err = clGetDeviceInfo(
                    device(),
                    CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE,
                    sizeof(cl_ulong),
                    &maxConstBufSize,
                    NULL);
    CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE) failed");

    // Maximum constant arguments
    err = clGetDeviceInfo(
                    device(),
                    CL_DEVICE_MAX_CONSTANT_ARGS,
                    sizeof(cl_uint),
                    &maxConstArgs,
                    NULL);
   CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_MAX_CONSTANT_ARGS) failed");

    // Local memory type
    err = clGetDeviceInfo(
                    device(),
                    CL_DEVICE_LOCAL_MEM_TYPE,
                    sizeof(cl_device_local_mem_type),
                    &localMemType,
                    NULL);
    CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_LOCAL_MEM_TYPE) failed");

    // Local memory size
    err = clGetDeviceInfo(
                    device(),
                    CL_DEVICE_LOCAL_MEM_SIZE,
                    sizeof(cl_ulong),
                    &localMemSize,
                    NULL);
    CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_LOCAL_MEM_SIZE) failed");

    // Error correction support
    err = clGetDeviceInfo(
                    device(),
                    CL_DEVICE_ERROR_CORRECTION_SUPPORT,
                    sizeof(cl_bool),
                    &errCorrectionSupport,
                    NULL);
    CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_ERROR_CORRECTION_SUPPORT) failed");

    // Profiling timer resolution
    err = clGetDeviceInfo(
                    device(),
                    CL_DEVICE_PROFILING_TIMER_RESOLUTION,
                    sizeof(size_t),
                    &timerResolution,
                    NULL);
   CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_PROFILING_TIMER_RESOLUTION) failed");

    // Endian little
    err = clGetDeviceInfo(
                    device(),
                    CL_DEVICE_ENDIAN_LITTLE,
                    sizeof(cl_bool),
                    &endianLittle,
                    NULL);
    CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_ENDIAN_LITTLE) failed");

    // Device available
    err = clGetDeviceInfo(
                    device(),
                    CL_DEVICE_AVAILABLE,
                    sizeof(cl_bool),
                    &available,
                    NULL);
    CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_AVAILABLE) failed");

    // Device compiler available
    err = clGetDeviceInfo(
                    device(),
                    CL_DEVICE_COMPILER_AVAILABLE,
                    sizeof(cl_bool),
                    &compilerAvailable,
                    NULL);
    CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_COMPILER_AVAILABLE) failed");

    // Device execution capabilities
    err = clGetDeviceInfo(
                    device(),
                    CL_DEVICE_EXECUTION_CAPABILITIES,
                    sizeof(cl_device_exec_capabilities),
                    &execCapabilities,
                    NULL);
    CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_EXECUTION_CAPABILITIES) failed");

    // Device queue properities
    err = clGetDeviceInfo(
                    device(),
                    CL_DEVICE_QUEUE_PROPERTIES,
                    sizeof(cl_command_queue_properties),
                    &queueProperties,
                    NULL);
    CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_QUEUE_PROPERTIES) failed");

    // Platform
    err = clGetDeviceInfo(
                    device(),
                    CL_DEVICE_PLATFORM,
                    sizeof(cl_platform_id),
                    &platform,
                    NULL);
    CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_PLATFORM) failed");

    // Device name
    size_t tempSize = 0;
    err = clGetDeviceInfo(
                    device(),
                    CL_DEVICE_NAME,
                    0,
                    NULL,
                    &tempSize);
    CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_NAME) failed");

    delete name;
    name = new char[tempSize];

    err = clGetDeviceInfo(
                    device(),
                    CL_DEVICE_NAME,
                    sizeof(char) * tempSize,
                    name,
                    NULL);
    CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_NAME) failed");

    // Vender name
    err = clGetDeviceInfo(
                    device(),
                    CL_DEVICE_VENDOR,
                    0,
                    NULL,
                    &tempSize);
    CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_VENDOR) failed");

    delete vendorName;
    vendorName = new char[tempSize];

    err = clGetDeviceInfo(
                    device(),
                    CL_DEVICE_VENDOR,
                    sizeof(char) * tempSize,
                    vendorName,
                    NULL);
    CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_VENDOR) failed");

    // Driver name
    err = clGetDeviceInfo(
                    device(),
                    CL_DRIVER_VERSION,
                    0,
                    NULL,
                    &tempSize);
    CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DRIVER_VERSION) failed");

    delete driverVersion;
    driverVersion = new char[tempSize];

    err = clGetDeviceInfo(
                    device(),
                    CL_DRIVER_VERSION,
                    sizeof(char) * tempSize,
                    driverVersion,
                    NULL);
    CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DRIVER_VERSION) failed");

    // Device profile
    err = clGetDeviceInfo(
                    device(),
                    CL_DEVICE_PROFILE,
                    0,
                    NULL,
                    &tempSize);
    CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_PROFILE) failed");

    delete profileType;
    profileType = new char[tempSize];

    err = clGetDeviceInfo(
                    device(),
                    CL_DEVICE_PROFILE,
                    sizeof(char) * tempSize,
                    profileType,
                    NULL);
   CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_PROFILE) failed");

    // Device version
    err = clGetDeviceInfo(
                    device(),
                    CL_DEVICE_VERSION,
                    0,
                    NULL,
                    &tempSize);
    CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_VERSION) failed");

    delete deviceVersion;
    deviceVersion = new char[tempSize];

    err = clGetDeviceInfo(
                    device(),
                    CL_DEVICE_VERSION,
                    sizeof(char) * tempSize,
                    deviceVersion,
                    NULL);
    CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_VERSION) failed");

    // Device extensions
    err = clGetDeviceInfo(
                    device(),
                    CL_DEVICE_EXTENSIONS,
                    0,
                    NULL,
                    &tempSize);
    CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_EXTENSIONS) failed");

    delete extensions;
    extensions = new char[tempSize];

    err = clGetDeviceInfo(
                    device(),
                    CL_DEVICE_EXTENSIONS,
                    sizeof(char) * tempSize,
                    extensions,
                    NULL);
    CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_EXTENSIONS) failed");

    // Device parameters of OpenCL 1.1 Specification
#ifdef CL_VERSION_1_1
    std::string deviceVerStr(deviceVersion);
    size_t vStart = deviceVerStr.find(" ", 0);
    size_t vEnd = deviceVerStr.find(" ", vStart + 1);
    std::string vStrVal = deviceVerStr.substr(vStart + 1, vEnd - vStart - 1);
    if(vStrVal.compare("1.0") > 0)
    {
        // Native vector sizes of all data types
        err = clGetDeviceInfo(
                        device(),
                        CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR,
                        sizeof(cl_uint),
                        &nativeCharVecWidth,
                        NULL);
        CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR) failed");

        err = clGetDeviceInfo(
                        device(),
                        CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT,
                        sizeof(cl_uint),
                        &nativeShortVecWidth,
                        NULL);
        CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT) failed");

        err = clGetDeviceInfo(
                        device(),
                        CL_DEVICE_NATIVE_VECTOR_WIDTH_INT,
                        sizeof(cl_uint),
                        &nativeIntVecWidth,
                        NULL);
        CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_NATIVE_VECTOR_WIDTH_INT) failed");

        err = clGetDeviceInfo(
                        device(),
                        CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG,
                        sizeof(cl_uint),
                        &nativeLongVecWidth,
                        NULL);
        CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG) failed");

        err = clGetDeviceInfo(
                        device(),
                        CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT,
                        sizeof(cl_uint),
                        &nativeFloatVecWidth,
                        NULL);
        CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT) failed");

        err = clGetDeviceInfo(
                        device(),
                        CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE,
                        sizeof(cl_uint),
                        &nativeDoubleVecWidth,
                        NULL);
        CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE) failed");

        err = clGetDeviceInfo(
                        device(),
                        CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF,
                        sizeof(cl_uint),
                        &nativeHalfVecWidth,
                        NULL);
        CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF) failed");

        // Host unified memory
        err = clGetDeviceInfo(
                        device(),
                        CL_DEVICE_HOST_UNIFIED_MEMORY,
                        sizeof(cl_bool),
                        &hostUnifiedMem,
                        NULL);
        CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_HOST_UNIFIED_MEMORY) failed");

        // Device OpenCL C version
        err = clGetDeviceInfo(
                        device(),
                        CL_DEVICE_OPENCL_C_VERSION,
                        0,
                        NULL,
                        &tempSize);
        CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_OPENCL_C_VERSION) failed");

        delete openclCVersion;
        openclCVersion = new char[tempSize];

        err = clGetDeviceInfo(
                        device(),
                        CL_DEVICE_OPENCL_C_VERSION,
                        sizeof(char) * tempSize,
                        openclCVersion,
                        NULL);
        CHECK_OPENCL_ERROR(err, "clGetDeviceIDs(CL_DEVICE_OPENCL_C_VERSION) failed");
    }
#endif
}
int DwtHaar1D::runDwtHaar1DKernel()
{
	
    cl_int status;
	
    status = this->setWorkGroupSize();
    CHECK_ERROR(status, SDK_SUCCESS, "setWorkGroupSize failed");

    // Force write to inData Buf to update its values 
    cl_event writeEvt;
    status = clEnqueueWriteBuffer(
                commandQueue,
                inDataBuf,
                CL_FALSE,
                0,
                curSignalLength * sizeof(cl_float),
                inData,
                0,
                NULL,
                &writeEvt);
    CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer failed. (inDataBuf)");

    status = clFlush(commandQueue);
    CHECK_OPENCL_ERROR(status, "clFlush failed.");

    status = sampleCommon->waitForEventAndRelease(&writeEvt);
    CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(writeEvt1) Failed");

	ParaClass *paraClass = new ParaClass;//new a paraclass

	this->classObj = clCreateBuffer(context,CL_MEM_USE_HOST_PTR,sizeof(ParaClass),paraClass,&status);
	 CHECK_OPENCL_ERROR(status, "clclCreateBuffer failed. (inDataBuf)");

	cl_event mapEvt;

	paraClass=(ParaClass *)clEnqueueMapBuffer(commandQueue,this->classObj,CL_FALSE,CL_MAP_WRITE,0,sizeof(ParaClass),0,NULL,&mapEvt,&status);

	 CHECK_OPENCL_ERROR(status, "clEnqueueMapBuffer failed. (classObj)");

    status = clFlush(commandQueue);
    CHECK_OPENCL_ERROR(status, "clFlush failed.");

    status = sampleCommon->waitForEventAndRelease(&mapEvt);
    CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(mapEvt1) Failed");

	paraClass->setValue(this->totalLevels,this->curSignalLength,this->levelsDone,this->maxLevelsOnDevice);
	
	 cl_event unmapEvt;
	 status=clEnqueueUnmapMemObject(commandQueue,this->classObj,paraClass,0,NULL,&unmapEvt);//class is passed to the Device

	  CHECK_OPENCL_ERROR(status, "clEnqueueunMapBuffer failed. (classObj)");

    status = clFlush(commandQueue);
    CHECK_OPENCL_ERROR(status, "clFlush failed.");

    status = sampleCommon->waitForEventAndRelease(&unmapEvt);
    CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(mapEvt1) Failed");


    // Whether sort is to be in increasing order. CL_TRUE implies increasing 
    status = clSetKernelArg(kernel, 
                            0, 
                            sizeof(cl_mem), 
                            (void*)&inDataBuf); 
    CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (inDataBuf)");

    status = clSetKernelArg(kernel, 
                            1, 
                            sizeof(cl_mem), 
                            (void*)&dOutDataBuf); 
    CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (dOutDataBuf)");

    status = clSetKernelArg(kernel, 
                            2, 
                            sizeof(cl_mem), 
                            (void*)&dPartialOutDataBuf); 
    CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (dPartialOutData)");

    status = clSetKernelArg(kernel, 
                            3, 
                            (localThreads * 2 * sizeof(cl_float)),
                            NULL); 
    CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (local memory)");

	status = clSetKernelArg(kernel, 
                            4, 
                            sizeof(cl_mem),
							(void*)&this->classObj); 
    CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (global memory)");

    /* 
    * Enqueue a kernel run call.
    */
    cl_event ndrEvt;
    status = clEnqueueNDRangeKernel(
                commandQueue,
                kernel,
                1,
                NULL,
                &globalThreads,
                &localThreads,
                0,
                NULL,
                &ndrEvt);
    CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel failed.");

    status = clFlush(commandQueue);
    CHECK_OPENCL_ERROR(status, "clFlush failed.");

    status = sampleCommon->waitForEventAndRelease(&ndrEvt);
    CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(ndrEvt1) Failed");

    // Enqueue the results to application pointer
    cl_event readEvt1;
    status = clEnqueueReadBuffer(
                commandQueue, 
                dOutDataBuf, 
                CL_FALSE,
                0,
                signalLength * sizeof(cl_float),
                dOutData,
                0,
                NULL,
                &readEvt1);
    CHECK_OPENCL_ERROR(status, "clEnqueueReadBuffer failed.");

    // Enqueue the results to application pointer
    cl_event readEvt2;
    status = clEnqueueReadBuffer(
                commandQueue, 
                dPartialOutDataBuf, 
                CL_FALSE,
                0,
                signalLength * sizeof(cl_float),
                dPartialOutData,
                0,
                NULL,
                &readEvt2);
    CHECK_OPENCL_ERROR(status, "clEnqueueReadBuffer failed.");

    status = clFlush(commandQueue);
    CHECK_OPENCL_ERROR(status, "clFlush failed.");

    status = sampleCommon->waitForEventAndRelease(&readEvt1);
    CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(readEvt1) Failed");

    status = sampleCommon->waitForEventAndRelease(&readEvt2);
    CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(readEvt2) Failed");

	delete paraClass;
	 
	clReleaseMemObject(this->classObj);


    return SDK_SUCCESS;
}
Beispiel #19
0
void CLHelper::findSpecifiedDevices(
	const std::string& defaultVendor,
	const cl_device_type defaultDeviceType,
	const cl_int defaultDeviceId,
	std::vector<cl::Device>* deviceList,
	std::vector<CLHelper::DeviceInfo>* deviceInfoList)
{
	cl_int err;

	std::vector<cl::Platform> platforms;
	err = cl::Platform::get(&platforms);
	CHECK_OPENCL_ERROR(err, "cl::Platform::get failed");

	std::vector<cl::Platform>::iterator platform;
	for(platform = platforms.begin(); platform != platforms.end(); platform++)
	{
		std::string platformVendorString;
		platform->getInfo(CL_PLATFORM_VENDOR, &platformVendorString);

		if(defaultVendor.length() > 0 && platformVendorString.find(defaultVendor) == std::string::npos) {
			continue;
		}

		std::vector<cl::Device> devices;
		cl_int anyDevicesFound = platform->getDevices(defaultDeviceType, &devices);
		if(anyDevicesFound != CL_SUCCESS) continue;

		bool foundSpecificDevice = false;
		cl_int deviceId = 0;
		std::vector<cl::Device>::iterator device;
		for(device = devices.begin(); device != devices.end(); device++, deviceId++)
		{
			if(deviceId == defaultDeviceId) {
				CLHelper::DeviceInfo deviceInfo;
				deviceInfo.setDeviceInfo((*device)());

				deviceList->push_back((*device)());
				deviceInfoList->push_back(deviceInfo);
				
				// Fix (possibly) faulty vendor string
				if(defaultVendor.length() > 0) {
					delete deviceInfo.vendorName;
					size_t len = platformVendorString.length();
					deviceInfo.vendorName = new char[len+1];
					deviceInfo.vendorName[len] = 0;
					memcpy(deviceInfo.vendorName, platformVendorString.c_str(), len);
				}
				// End of fix

				foundSpecificDevice = true;
				break;
			}
		}
		if(foundSpecificDevice)
			break;
	}

	if(deviceList->empty()) {
		std::cerr << "No devices found which match the criteria. Exiting..." << std::endl;
		exit(1);
	}
}
/**
 * Creates the OpenCL objects used by the MatrixMulImage sample.
 *
 * In order: selects the device type ("cpu", otherwise GPU with a CPU fallback
 * when isThereGPU() is false), obtains the platform, creates the context and a
 * profiling-enabled command queue, creates the two input images over input0 /
 * input1 and the output image, builds the program from
 * "MatrixMulImage_Kernels.cl" and creates the "mmmKernel3" kernel.
 *
 * @return SDK_SUCCESS when everything was created; SDK_EXPECTED_FAILURE when
 *         the device reports no image support. Other failures are handled by
 *         the CHECK_* macros, which are expected to return an error code from
 *         this function (NOTE(review): confirm against the macro definitions).
 */
int
MatrixMulImage::setupCL(void)
{
    cl_int status = 0;
    cl_device_type dType;

    if(deviceType.compare("cpu") == 0)
    {
        dType = CL_DEVICE_TYPE_CPU;
    }
    else //deviceType = "gpu"
    {
        dType = CL_DEVICE_TYPE_GPU;
        if(isThereGPU() == false)
        {
            std::cout << "GPU not found. Falling back to CPU device" << std::endl;
            dType = CL_DEVICE_TYPE_CPU;
        }
    }

    /*
     * Have a look at the available platforms and pick either
     * the AMD one if available or a reasonable default.
     */

    cl_platform_id platform = NULL;
    int retValue = sampleCommon->getPlatform(platform, platformId, isPlatformEnabled());
    CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::getPlatform() failed");

    // Display available devices.
    retValue = sampleCommon->displayDevices(platform, dType);
    CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::displayDevices() failed");

    /*
     * If we could find our platform, use it. Otherwise use just available platform.
     */

    cl_context_properties cps[3] =
    {
        CL_CONTEXT_PLATFORM,
        (cl_context_properties)platform,
        0
    };

    context = clCreateContextFromType(
                  cps,
                  dType,
                  NULL,
                  NULL,
                  &status);
    CHECK_OPENCL_ERROR(status, "clCreateContextFromType failed.");

    // getting device on which to run the sample
    status = sampleCommon->getDevices(context, &devices, deviceId, isDeviceIdEnabled());
    CHECK_ERROR(status, SDK_SUCCESS, "sampleCommon::getDevices() failed");

    //Set device info of given cl_device_id
    retValue = deviceInfo.setDeviceInfo(devices[deviceId]);
    CHECK_ERROR(retValue, SDK_SUCCESS, "deviceInfo.setDeviceInfo. failed");

    {
        // The block is to move the declaration of prop closer to its use
        cl_command_queue_properties prop = 0;
        prop |= CL_QUEUE_PROFILING_ENABLE;

        commandQueue = clCreateCommandQueue(
                           context,
                           devices[deviceId],
                           prop,
                           &status);
        // The result of the call is reported through 'status'; checking the
        // earlier 'retValue' here would let a failed queue creation pass.
        CHECK_OPENCL_ERROR(status, "clCreateCommandQueue failed.");
    }

    // All three images use four float channels per pixel.
    cl_image_format imageFormat;
    imageFormat.image_channel_data_type = CL_FLOAT;
    imageFormat.image_channel_order = CL_RGBA;

    if(!deviceInfo.imageSupport)
    {
        std::cout << "Expected Error: Image is not supported on the Device" << std::endl;
        return SDK_EXPECTED_FAILURE;
    }

    // Zero the descriptor so that every field not set below is 0.
    cl_image_desc imageDesc;
    memset(&imageDesc, '\0', sizeof(cl_image_desc));
    imageDesc.image_type = CL_MEM_OBJECT_IMAGE2D;

    // Create image for matrix A.
    // Image widths are a quarter of the matrix widths; presumably each RGBA
    // pixel packs four consecutive matrix elements - TODO confirm in kernel.
    imageDesc.image_width = width0 / 4;
    imageDesc.image_height = height0;
    inputBuffer0 = clCreateImage(context,
                                 CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
                                 &imageFormat,
                                 &imageDesc,
                                 input0,
                                 &status);
    CHECK_OPENCL_ERROR(status, "clCreateImage failed. (inputBuffer0)");

    // Create image for matrix B
    imageDesc.image_width = width1 / 4;
    imageDesc.image_height = height1;
    inputBuffer1 = clCreateImage(context,
                                 CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
                                 &imageFormat,
                                 &imageDesc,
                                 input1,
                                 &status);
    CHECK_OPENCL_ERROR(status, "clCreateImage failed. (inputBuffer1)");

    // Create image for matrix C (device-allocated, no host pointer)
    imageDesc.image_width = width1 / 4;
    imageDesc.image_height = height0;
    outputBuffer = clCreateImage(context,
                                 CL_MEM_WRITE_ONLY,
                                 &imageFormat,
                                 &imageDesc,
                                 0,
                                 &status);
    CHECK_OPENCL_ERROR(status, "clCreateImage failed. (outputBuffer)");

    // create a CL program using the kernel source
    streamsdk::buildProgramData buildData;
    buildData.kernelName = std::string("MatrixMulImage_Kernels.cl");
    buildData.devices = devices;
    buildData.deviceId = deviceId;
    buildData.flagsStr = std::string("");
    if(isLoadBinaryEnabled())
        buildData.binaryName = std::string(loadBinary.c_str());

    if(isComplierFlagsSpecified())
        buildData.flagsFileName = std::string(flags.c_str());

    retValue = sampleCommon->buildOpenCLProgram(program, context, buildData);
    CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::buildOpenCLProgram() failed");

    // get a kernel object handle for the kernel named "mmmKernel3"
    kernel = clCreateKernel(program, "mmmKernel3", &status);
    CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(kernel)");

    return SDK_SUCCESS;
}
int
DwtHaar1D::setupCL(void)
{
    cl_int status = 0;
    cl_device_type dType;

    if(deviceType.compare("cpu") == 0)
    {
        dType = CL_DEVICE_TYPE_CPU;
    }
    else //deviceType = "gpu" 
    {
        dType = CL_DEVICE_TYPE_GPU;
        if(isThereGPU() == false)
        {
            std::cout << "GPU not found. Falling back to CPU device" << std::endl;
            dType = CL_DEVICE_TYPE_CPU;
        }
    }

    /*
     * Have a look at the available platforms and pick either
     * the AMD one if available or a reasonable default.
     */
    cl_platform_id platform = NULL;
    int retValue = sampleCommon->getPlatform(platform, platformId, isPlatformEnabled());
    CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::getPlatform() failed");

    // Display available devices.
    retValue = sampleCommon->displayDevices(platform, dType);
    CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::displayDevices() failed");

    // If we could find our platform, use it. Otherwise use just available platform.

    cl_context_properties cps[3] = 
    {
        CL_CONTEXT_PLATFORM, 
        (cl_context_properties)platform, 
        0
    };

    context = clCreateContextFromType(cps,
        dType,
        NULL,
        NULL,
        &status);
    CHECK_OPENCL_ERROR(status, "clCreateContextFromType failed.");

    // getting device on which to run the sample
    status = sampleCommon->getDevices(context, &devices, deviceId, isDeviceIdEnabled());
    CHECK_ERROR(status, SDK_SUCCESS, "sampleCommon::getDevices() failed");


    commandQueue = clCreateCommandQueue(context, 
        devices[deviceId], 
        0, 
        &status);
    CHECK_OPENCL_ERROR(status, "clCreateCommandQueue failed.");

    //Set device info of given cl_device_id
    retValue = deviceInfo.setDeviceInfo(devices[deviceId]);
    CHECK_ERROR(retValue, 0, "SDKDeviceInfo::setDeviceInfo() failed");

    // Set Presistent memory only for AMD platform
    cl_mem_flags inMemFlags = CL_MEM_READ_ONLY;
    if(isAmdPlatform())
        inMemFlags |= CL_MEM_USE_PERSISTENT_MEM_AMD;

    inDataBuf = clCreateBuffer(context,
        inMemFlags,
        sizeof(cl_float) * signalLength,
        NULL,
        &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (inDataBuf)");

    dOutDataBuf = clCreateBuffer(context, 
        CL_MEM_WRITE_ONLY,
        signalLength * sizeof(cl_float),
        NULL, 
        &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (dOutDataBuf)");

    dPartialOutDataBuf = clCreateBuffer(context, 
        CL_MEM_WRITE_ONLY,
        signalLength * sizeof(cl_float),
        NULL, 
        &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (dPartialOutDataBuf)");

    // create a CL program using the kernel source 
    streamsdk::buildProgramData buildData;
    buildData.kernelName = std::string("DwtHaar1DCPPKernel_Kernels.cl");
    buildData.devices = devices;
    buildData.deviceId = deviceId;
    buildData.flagsStr = std::string("-x clc++ ");
    if(isLoadBinaryEnabled())
        buildData.binaryName = std::string(loadBinary.c_str());

    if(isComplierFlagsSpecified())
        buildData.flagsFileName = std::string(flags.c_str());

    retValue = sampleCommon->buildOpenCLProgram(program, context, buildData);
    CHECK_ERROR(retValue, 0, "sampleCommon::buildOpenCLProgram() failed");

    // get a kernel object handle for a kernel with the given name 
    kernel = clCreateKernel(program, "dwtHaar1D", &status);
    CHECK_OPENCL_ERROR(status, "clCreateKernel failed.");

    status = kernelInfo.setKernelWorkGroupInfo(kernel,devices[deviceId]);
    CHECK_ERROR(status, SDK_SUCCESS, " setKernelWorkGroupInfo() failed");

    return SDK_SUCCESS;
}
int 
MatrixMulImage::runCLKernels(void)
{
    cl_int   status;

    /* 
     * Kernel runs over complete output matrix with blocks of blockSize x blockSize 
     * running concurrently
     */
    size_t globalThreads[2]= {width1 / 4, height0 / 8};
    size_t localThreads[2] = {blockSize, blockSize};

    status = kernelInfo.setKernelWorkGroupInfo(kernel, devices[deviceId]);
    CHECK_OPENCL_ERROR(status, "kernelInfo.setKernelWorkGroupInfo failed");
    
    availableLocalMemory = deviceInfo.localMemSize - kernelInfo.localMemoryUsed;
    neededLocalMemory = 2 * blockSize * blockSize * sizeof(cl_float); 
    if(neededLocalMemory > availableLocalMemory)
    {
        std::cout << "Unsupported: Insufficient local memory on device." << std::endl;
        return SDK_SUCCESS;
    }

    if((cl_uint)(localThreads[0]*localThreads[1]) > kernelInfo.kernelWorkGroupSize)
    {
       if(kernelInfo.kernelWorkGroupSize >= 64)
        {
            blockSize = 8; 
            localThreads[0] = blockSize;
            localThreads[1] = blockSize;
        }
        else if(kernelInfo.kernelWorkGroupSize >= 32)
        {
            blockSize = 4; 
            localThreads[0] = blockSize;
            localThreads[1] = blockSize;
        }
        else
        {
            std::cout << "Out of Resources!" << std::endl;
            std::cout << "Group Size specified : " << localThreads[0] * localThreads[1] << std::endl;
            std::cout << "Max Group Size supported on the kernel : " 
                      << kernelInfo.kernelWorkGroupSize<<std::endl;
            return SDK_FAILURE;
        }
    }

    if(localThreads[0] > deviceInfo.maxWorkItemSizes[0] ||
       localThreads[1] > deviceInfo.maxWorkItemSizes[1] ||
       localThreads[0]*localThreads[1] > deviceInfo.maxWorkGroupSize)
    {
        std::cout << "Unsupported: Device does not support requested number of work items." << std::endl;
        return SDK_FAILURE;
    }

    //For small matrix sizes
    while(globalThreads[0] % localThreads[0])
        localThreads[0] /= 2;

    while(globalThreads[1] % localThreads[1])
        localThreads[1] /= 2;

    // Set appropriate arguments to the kernel
    
    status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&inputBuffer0);
    CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (outputBuffer)");
   
    status = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&inputBuffer1);
    CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (inputBuffer0)");
    
    status = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&outputBuffer);
    CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (inputBuffer1)");
   
    status = clSetKernelArg(kernel, 3, sizeof(cl_int),(void*)&width0);
    CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (width0)");
    
    status = clSetKernelArg(kernel, 4, sizeof(cl_int), &width1);
    CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (width1)");
   
    // Enqueue a kernel run call
    cl_event ndrEvt;
    status = clEnqueueNDRangeKernel(
                 commandQueue,
                 kernel,
                 2,
                 NULL,
                 globalThreads,
                 localThreads,
                 0,
                 NULL,
                 &ndrEvt);
    CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel failed.");
   
    status = clFlush(commandQueue);
    CHECK_OPENCL_ERROR(status, "clFlush failed.");
   
    cl_int eventStatus = CL_QUEUED;
    while(eventStatus != CL_COMPLETE)
    {
        status = clGetEventInfo(
                        ndrEvt, 
                        CL_EVENT_COMMAND_EXECUTION_STATUS, 
                        sizeof(cl_int),
                        &eventStatus,
                        NULL);
        CHECK_OPENCL_ERROR(status, "clGetEventInfo failed.");
        
    }

    // Calculate performance
    cl_ulong startTime;
    cl_ulong endTime;
    
    // Get kernel profiling info
    status = clGetEventProfilingInfo(ndrEvt,
                                     CL_PROFILING_COMMAND_START,
                                     sizeof(cl_ulong),
                                     &startTime,
                                     0);
    CHECK_OPENCL_ERROR(status, "clGetEventProfilingInfo failed.(startTime)");
   

    status = clGetEventProfilingInfo(ndrEvt,
                                     CL_PROFILING_COMMAND_END,
                                     sizeof(cl_ulong),
                                     &endTime,
                                     0);
    CHECK_OPENCL_ERROR(status, "clGetEventProfilingInfo failed.(endTime)");

    status = clReleaseEvent(ndrEvt);
    CHECK_OPENCL_ERROR(status, "clReleaseEvent failed.(ndrEvt)");
    
    // Print performance numbers
    double sec = 1e-9 * (endTime - startTime);
    std::cout << "KernelTime (ms) : " << sec * 1000 << std::endl;

    double flops = 2 * width0 * width1;
    double perf = (flops / sec) * height0 * 1e-9;
    
    std::cout << "GFlops achieved : " << perf << std::endl << std::endl;

    size_t origin[] = {0, 0, 0};
    size_t region[] = {width1 / 4, height0, 1};
    cl_event readEvt;
    status = clEnqueueReadImage(commandQueue,
                                outputBuffer,
                                CL_FALSE,
                                origin,
                                region,
                                0,
                                0,
                                output,
                                0,
                                NULL,
                                &readEvt);
    CHECK_OPENCL_ERROR(status, "outputBuffer failed.(clEnqueueReadImage)");
    
    status = clFlush(commandQueue);
    CHECK_OPENCL_ERROR(status, "clFlush failed.(commandQueue)");
    
    status = sampleCommon->waitForEventAndRelease(&readEvt);
    CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(readEvt) Failed");

    return SDK_SUCCESS;
}
/// Computes the sum of the first mInputSize floats stored in input_buffer.
///
/// The input is copied into a zeroed scratch buffer, reduced on the device by
/// running mReductionPasses kernels, and any remaining mFinalS partial sums
/// are added together on the host.
///
/// \param input_buffer OpenCL buffer holding the values to sum.
/// \return The computed sum.
float CRoutine_Sum_NVidia::Sum(cl_mem input_buffer)
{
	// First zero out the temporary sum buffer.
	mrZero->Zero(mTempBuffer1, mBufferSize);

	int status = CL_SUCCESS;
	// Copy the input buffer into mTempBuffer1 so the reduction works on a
	// scratch copy of the data.
	status = clEnqueueCopyBuffer(mQueue, input_buffer, mTempBuffer1, 0, 0, mInputSize * sizeof(cl_float), 0, NULL, NULL);
	CHECK_OPENCL_ERROR(status, "clEnqueueCopyBuffer failed.");

	status = clFinish(mQueue);
	CHECK_OPENCL_ERROR(status, "clFinish failed.");

	// Init locals:
	cl_float gpu_result = 0;
	// Local memory for every pass is sized from the first pass' thread count.
	int numThreads = mThreads[0];

	int threads = 0;
	int blocks = 0;
	cl_mem buff1 = mTempBuffer1;
	cl_mem buff2 = mTempBuffer2;
	size_t globalWorkSize[1];
	size_t localWorkSize[1];
	cl_kernel reductionKernel;

	for(int kernel_id = 0; kernel_id < mReductionPasses; kernel_id++)
	{
		threads = mThreads[kernel_id];
		blocks = mBlocks[kernel_id];

		globalWorkSize[0] = blocks * threads;
		localWorkSize[0] = threads;
		reductionKernel = mKernels[kernel_id];

		// Arguments: input, output, element count, local scratch space.
		status = clSetKernelArg(reductionKernel, 0, sizeof(cl_mem), (void *) &buff1);
		CHECK_OPENCL_ERROR(status, "clSetKernelArg failed.");
		status = clSetKernelArg(reductionKernel, 1, sizeof(cl_mem), (void *) &buff2);
		CHECK_OPENCL_ERROR(status, "clSetKernelArg failed.");
		status = clSetKernelArg(reductionKernel, 2, sizeof(cl_int), &mBufferSize);
		CHECK_OPENCL_ERROR(status, "clSetKernelArg failed.");
		status = clSetKernelArg(reductionKernel, 3, sizeof(cl_float) * numThreads, NULL);
		CHECK_OPENCL_ERROR(status, "clSetKernelArg failed.");

		status = clEnqueueNDRangeKernel(mQueue, reductionKernel, 1, 0, globalWorkSize, localWorkSize, 0, NULL, NULL);
		CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel failed.");

		// Later passes read the output of the previous pass.
		buff1 = buff2;
	}

	status = clFinish(mQueue);
	CHECK_OPENCL_ERROR(status, "clFinish failed.");

	// If a few elements remain, we will need to compute their sum on the CPU:
	if (mFinalS > 1)
	{
		// NOTE(review): a runtime-sized array is a compiler extension in C++;
		// consider a std::vector once <vector> is known to be included.
		cl_float h_odata[mFinalS];
		// copy result from device to host
		status = clEnqueueReadBuffer(mQueue, mTempBuffer2, CL_TRUE, 0, mFinalS * sizeof(cl_float), h_odata, 0, NULL, NULL);
		CHECK_OPENCL_ERROR(status, "clEnqueueReadBuffer failed.");

		for(int i=0; i < mFinalS; i++)
		{
			gpu_result += h_odata[i];
		}

	}
	else
	{
		// The work was all completed on the GPU.  Copy the summed value to the CPU:
		status = clEnqueueReadBuffer(mQueue, mTempBuffer2, CL_TRUE, 0, sizeof(cl_float), &gpu_result, 0, NULL, NULL);
		CHECK_OPENCL_ERROR(status, "clEnqueueReadBuffer failed.");
	}

	return float(gpu_result);
}
Beispiel #24
0
/// Encrypts or decrypts an image file with the AES ECB kernels on an OpenCL
/// device, using a fixed 16-byte key.
///
/// Command-line options read: "p" (platform), "d" (device), "in" (input file)
/// and "out" (output file).
///
/// \param argc     Argument count as passed to main.
/// \param argv     Argument vector as passed to main.
/// \param bEncrypt true selects the "AES_ECB_Encrypt" kernel, false selects
///                 "AES_ECB_Decrypt".
/// \return 0 on completion, -1 when an input/output file name is missing or
///         the input file does not exist (usage is printed in that case).
int ImageOperationECB(int argc, char** argv, bool bEncrypt = true)
{
	// Parse arguments
	// OpenCL arguments: platform and device
	cl_int err;
		
	int iPlatform  = GetArgInt   (argc, argv, "p");
	int iDevice    = GetArgInt   (argc, argv, "d");
	char* sInFile  = GetArgString(argc, argv, "in");
	char* sOutFile = GetArgString(argc, argv, "out");

	if (sInFile == NULL || sOutFile == NULL || !FileExists(sInFile))
	{
		PrintUsage();
		return -1;
	}

	// Initialize ImageMagick 
	Magick::InitializeMagick(*argv);
	
	ImageData img = ReadImageFile(sInFile);

	// Allocate Host Memory
	unsigned char key[16] = {
		0x2B, 0x7E, 0x15, 0x16, 
		0x28, 0xAE, 0xD2, 0xA6, 
		0xAB, 0xF7, 0x15, 0x88, 
		0x09, 0xCF, 0x4F, 0x3C};
	unsigned char* roundKeys = NULL;
	int rounds = 0;

	ComputeRoundKeys(&roundKeys, &rounds, 16, key);
	
	// Set-up OpenCL Platform
	OCL_Device* pOCL_Device = new OCL_Device(iPlatform, iDevice);
	pOCL_Device->SetBuildOptions("");
	pOCL_Device->PrintInfo();

	// Set up OpenCL 
	cl_kernel Kernel = pOCL_Device->GetKernel("aes-kernel.cl",
		bEncrypt ? "AES_ECB_Encrypt" : "AES_ECB_Decrypt");

	// Allocate Device Memory: slot 0 = input, slot 1 = output, slot 2 = round keys
	cl_mem d_A = pOCL_Device->DeviceMalloc(0, img.padded_bytes);
	cl_mem d_B = pOCL_Device->DeviceMalloc(1, img.padded_bytes);
	cl_mem d_C = pOCL_Device->DeviceMalloc(2, rounds * 16);
	
	// Copy Image to Device
	pOCL_Device->CopyBufferToDevice(img.data, 0, img.padded_bytes);	
	
	// Keys
	pOCL_Device->CopyBufferToDevice(roundKeys, 2, rounds * 16);
	

	// Set Kernel Arguments; _num is the number of 16-byte blocks to process
	cl_int _num = img.padded_bytes / 16;
	err = clSetKernelArg(Kernel, 0, sizeof(cl_mem), &d_A);   
	CHECK_OPENCL_ERROR(err);
	err = clSetKernelArg(Kernel, 1, sizeof(cl_mem), &d_B);    
	CHECK_OPENCL_ERROR(err);
	err = clSetKernelArg(Kernel, 2, sizeof(cl_mem), &d_C);    
	CHECK_OPENCL_ERROR(err);
	err = clSetKernelArg(Kernel, 3, sizeof(cl_int), &rounds); 
	CHECK_OPENCL_ERROR(err);
	err = clSetKernelArg(Kernel, 4, sizeof(cl_int), &_num);   
	CHECK_OPENCL_ERROR(err);
	
	
	// Wait for previous action to finish
	err = clFinish(pOCL_Device->GetQueue());
	CHECK_OPENCL_ERROR(err);
	
	// The global size must be a multiple of the work-group size, so round the
	// block count up to the next multiple of `threads` (as the CTR benchmark
	// does). The kernel receives the true block count in _num.
	// NOTE(review): assumes the kernel ignores work-items >= _num - confirm
	// against aes-kernel.cl.
	size_t threads = 256;
	size_t num = (((img.padded_bytes / 16) + threads - 1) / threads) * threads;

	// Run the kernel
	err = clEnqueueNDRangeKernel(pOCL_Device->GetQueue(), 
		Kernel, 1, NULL, &num, &threads, 0, NULL, NULL);
	CHECK_OPENCL_ERROR(err);
	
	
	// Wait for kernel to finish
	err = clFinish(pOCL_Device->GetQueue());
	CHECK_OPENCL_ERROR(err);


	// Copy Data From Device (the result overwrites the image data in place)
	pOCL_Device->CopyBufferToHost  (img.data, 1, img.padded_bytes);
	
	// Free resources
	delete pOCL_Device;
	
	delete[] roundKeys;
	
	// Write Output data
	WriteImageFile(sOutFile, img);

	free(img.data);

	return 0;
}
/**
 * Runs one pass of the dwtHaar1D kernel on the current signal.
 *
 * Uploads curSignalLength floats from inData, binds the kernel arguments,
 * launches the kernel, and reads dOutDataBuf and dPartialOutDataBuf back into
 * dOutData and dPartialOutData.
 *
 * @return SDK_SUCCESS on success; failures are handled by the CHECK_* macros.
 */
int DwtHaar1D::runDwtHaar1DKernel()
{
    cl_int status;

    status = this->setWorkGroupSize();
    CHECK_ERROR(status, SDK_SUCCESS, "setWorkGroupSize failed");

    // Force write to inData Buf to update its values
    cl_event writeEvt;
    status = clEnqueueWriteBuffer(
                 commandQueue,
                 inDataBuf,
                 CL_FALSE,
                 0,
                 curSignalLength * sizeof(cl_float),
                 inData,
                 0,
                 NULL,
                 &writeEvt);
    CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer failed. (inDataBuf)");

    status = clFlush(commandQueue);
    CHECK_OPENCL_ERROR(status, "clFlush failed.");

    status = waitForEventAndRelease(&writeEvt);
    CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(writeEvt1) Failed");

    // Arg 0: input signal buffer
    status = clSetKernelArg(kernel,
                            0,
                            sizeof(cl_mem),
                            (void*)&inDataBuf);
    CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (inDataBuf)");

    // Arg 1: output buffer
    status = clSetKernelArg(kernel,
                            1,
                            sizeof(cl_mem),
                            (void*)&dOutDataBuf);
    CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (dOutDataBuf)");

    // Arg 2: partial output buffer
    status = clSetKernelArg(kernel,
                            2,
                            sizeof(cl_mem),
                            (void*)&dPartialOutDataBuf);
    CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (dPartialOutData)");

    // Arg 3: local memory, two floats per work-item of a work-group
    status = clSetKernelArg(kernel,
                            3,
                            (localThreads * 2 * sizeof(cl_float)),
                            NULL);
    CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (local memory)");

    status = clSetKernelArg(kernel,
                            4,
                            sizeof(cl_uint),
                            (void*)&totalLevels);
    CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (totalLevels)");

    status = clSetKernelArg(kernel,
                            5,
                            sizeof(cl_uint),
                            (void*)&curSignalLength);
    CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (curSignalLength)");

    status = clSetKernelArg(kernel,
                            6,
                            sizeof(cl_uint),
                            (void*)&levelsDone);
    CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (levelsDone)");

    status = clSetKernelArg(kernel,
                            7,
                            sizeof(cl_uint),
                            (void*)&maxLevelsOnDevice);
    CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (maxLevelsOnDevice)");

    /*
    * Enqueue a kernel run call.
    */
    cl_event ndrEvt;
    status = clEnqueueNDRangeKernel(
                 commandQueue,
                 kernel,
                 1,
                 NULL,
                 &globalThreads,
                 &localThreads,
                 0,
                 NULL,
                 &ndrEvt);
    CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel failed.");

    status = clFlush(commandQueue);
    CHECK_OPENCL_ERROR(status, "clFlush failed.");

    status = waitForEventAndRelease(&ndrEvt);
    CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(ndrEvt1) Failed");

    // Read the output buffer back to the application pointer
    cl_event readEvt1;
    status = clEnqueueReadBuffer(
                 commandQueue,
                 dOutDataBuf,
                 CL_FALSE,
                 0,
                 signalLength * sizeof(cl_float),
                 dOutData,
                 0,
                 NULL,
                 &readEvt1);
    CHECK_OPENCL_ERROR(status, "clEnqueueReadBuffer failed.");

    // Read the partial output buffer back to the application pointer
    cl_event readEvt2;
    status = clEnqueueReadBuffer(
                 commandQueue,
                 dPartialOutDataBuf,
                 CL_FALSE,
                 0,
                 signalLength * sizeof(cl_float),
                 dPartialOutData,
                 0,
                 NULL,
                 &readEvt2);
    CHECK_OPENCL_ERROR(status, "clEnqueueReadBuffer failed.");

    status = clFlush(commandQueue);
    CHECK_OPENCL_ERROR(status, "clFlush failed.");

    // Both reads are non-blocking, so wait for each before returning
    status = waitForEventAndRelease(&readEvt1);
    CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(readEvt1) Failed");

    status = waitForEventAndRelease(&readEvt2);
    CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(readEvt2) Failed");

    return SDK_SUCCESS;
}
Beispiel #26
0
/// Smoke test: runs the "AES_ECB_Encypt4" kernel on a single 16-byte block
/// and prints that block before and after the kernel run, together with the
/// elapsed time.
///
/// Command-line options read: "p" (platform), "d" (device) and "n".
///
/// \param argc Argument count as passed to main.
/// \param argv Argument vector as passed to main.
/// \return Always 0.
int test1(int argc, char** argv)
{
	// Parse arguments
	// OpenCL arguments: platform and device
	cl_int err;
		
	int iPlatform = GetArgInt(argc, argv, "p");
	int iDevice   = GetArgInt(argc, argv, "d");
	// Parsed but not used anywhere in this function.
	char* sFileName = GetArgString(argc, argv, "n");

	// Allocate Host Memory
	unsigned char pattern[16] =
	{
		0x32, 0x43, 0xF6, 0xA8, 
		0x88, 0x5A, 0x30, 0x8D,
		0x31, 0x31, 0x98, 0xA2,
		0xE0, 0x37, 0x07, 0x34};
	// Fill the host buffer with 256 copies of the 16-byte pattern.
	unsigned char data[16*256];
	for (int i = 0; i < 256; i++)
		for (int k = 0; k < 16; k++)
			data[i*16 + k] = pattern[k];

	// Print the first block as a 4x4 matrix (transposed from memory order).
	for (int i = 0; i < 4; i++)
	{
		for (int j = 0; j < 4; j++)
			printf("%2X ", data[i + j*4]);
		printf("\n");
	}
	printf("\n");		

	unsigned char key[16] = {
		0x2B, 0x7E, 0x15, 0x16, 
		0x28, 0xAE, 0xD2, 0xA6, 
		0xAB, 0xF7, 0x15, 0x88, 
		0x09, 0xCF, 0x4F, 0x3C};
	unsigned char* roundKeys = NULL;
	int rounds = 0;

	ComputeRoundKeys(&roundKeys, &rounds, 16, key);
	
	// Set-up OpenCL Platform
	OCL_Device* pOCL_Device = new OCL_Device(iPlatform, iDevice);
	pOCL_Device->SetBuildOptions("");
	pOCL_Device->PrintInfo();

	// Set up OpenCL 
	cl_kernel kernel = pOCL_Device->GetKernel("aes-kernel.cl", "AES_ECB_Encypt4");

	// Allocate Device Memory: slot 0 = input, slot 1 = output, slot 2 = round keys
	cl_mem d_A = pOCL_Device->DeviceMalloc(0, 16);
	cl_mem d_B = pOCL_Device->DeviceMalloc(1, 16);
	cl_mem d_C = pOCL_Device->DeviceMalloc(2, rounds * 16);
	
	// Copy the first block to the device
	pOCL_Device->CopyBufferToDevice(data, 0, 16);	
	
	// Keys
	pOCL_Device->CopyBufferToDevice(roundKeys, 2, rounds * 16);
	

	// Set Kernel Arguments
	err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_A); CHECK_OPENCL_ERROR(err);
	err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_B); CHECK_OPENCL_ERROR(err);
	err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_C); CHECK_OPENCL_ERROR(err);
	err = clSetKernelArg(kernel, 3, sizeof(cl_int), &rounds);   CHECK_OPENCL_ERROR(err);
	cl_int _num = 1;
	err = clSetKernelArg(kernel, 4, sizeof(cl_int), &_num);   CHECK_OPENCL_ERROR(err);
	
	// Wait for previous action to finish
	err = clFinish(pOCL_Device->GetQueue());
	CHECK_OPENCL_ERROR(err);

	double seconds = GetTime();
	// Run the kernel with a single work-group of 256 work-items
	size_t num = 256;
	size_t threads = 256;
	err = clEnqueueNDRangeKernel(pOCL_Device->GetQueue(), kernel, 1, NULL, &num, &threads, 0, NULL, NULL);
	CHECK_OPENCL_ERROR(err);

	// Wait for kernel to finish
	err = clFinish(pOCL_Device->GetQueue());
	CHECK_OPENCL_ERROR(err);
	seconds = GetTime() - seconds;
	// NOTE(review): the "* 10.f" factor in the throughput looks unintended - confirm.
	printf("Elapsed Time: %f s (%f MiB/s)\n" , seconds, 16 /seconds * 10.f / 1024.f / 1024.f);

	// Copy Data From Device
	pOCL_Device->CopyBufferToHost  (data, 1, 16);
	for (int i = 0; i < 4; i++)
	{
		for (int j = 0; j < 4; j++)
			printf("%2X ", data[i + j*4]);
		printf("\n");
	}
	printf("\n");		
	
	// Free resources
	delete pOCL_Device;
	
	delete[] roundKeys;


	//write test data
	//WriteImageFile("tux2.jpg", img);

	//free(img.data);

	return 0;
}
Beispiel #27
0
/// Benchmarks the "AES_CTR_Encrypt" kernel for buffer sizes from 16 B to
/// 512 MiB (doubling each step) and verifies that running the kernel twice
/// restores the original data.
///
/// For every size the kernel is launched `count` times for the encryption
/// measurement and `count` times for the decryption measurement; throughput,
/// elapsed time and a passed/failed marker are printed per size.
///
/// Command-line options read: "p" (platform) and "d" (device).
///
/// \param argc Argument count as passed to main.
/// \param argv Argument vector as passed to main.
/// \return Always 0.
int benchmark_ctr(int argc, char** argv)
{
	// Parse arguments
	// OpenCL arguments: platform and device
	cl_int err;
	int count = 100;

		
	int iPlatform = GetArgInt(argc, argv, "p");
	int iDevice   = GetArgInt(argc, argv, "d");
	
	// Set-up  Encryption keys
	unsigned char key[16] = {
		0x2B, 0x7E, 0x15, 0x16, 
		0x28, 0xAE, 0xD2, 0xA6, 
		0xAB, 0xF7, 0x15, 0x88, 
		0x09, 0xCF, 0x4F, 0x3C};
	// Random 12-byte nonce, seeded from the current time
	unsigned char nonce[12];
	srand(time(NULL));
	for (int i = 0; i < 12; i++)
		nonce[i] = rand() % 256;
	unsigned char* roundKeys = NULL;
	int rounds = 0;
	ComputeRoundKeys(&roundKeys, &rounds, 16, key);
	
	// Set-up OpenCL Platform
	OCL_Device* pOCL_Device = new OCL_Device(iPlatform, iDevice);
	pOCL_Device->SetBuildOptions("");
	pOCL_Device->PrintInfo();

	// Set up OpenCL 
	cl_kernel EncryptionKernel = pOCL_Device->GetKernel("aes-kernel.cl", 
		"AES_CTR_Encrypt");

	size_t MinSize = 16;		 // 16 B = 128 bits
	size_t MaxSize = 512 << 20;  // 512 MiB.

	// keys
	cl_mem d_C = pOCL_Device->DeviceMalloc(2, rounds * 16);
	pOCL_Device->CopyBufferToDevice(roundKeys, 2, rounds * 16);
	
	// nonce
	cl_mem d_D = pOCL_Device->DeviceMalloc(3, 12);
	pOCL_Device->CopyBufferToDevice(nonce, 3, 12);

	printf("\n");
	printf("Time is reported for %d passes.\n", count);
	printf("\n");
	
	printf("     MiB    , Encryption Speed (MiB/s), Encryption Time (s), Decryption Speed (MiB/s), Decryption Time (s)\n");

	for (size_t size = MinSize; size <= MaxSize; size *= 2)
	{
		printf("%12.8f, ", ((double)size) / 1024 / 1024);

		// Allocate Device Memory
		cl_mem d_A = pOCL_Device->DeviceMalloc(0, size);
		cl_mem d_B = pOCL_Device->DeviceMalloc(1, size);

		// Allocate Host Memory (released at the end of this iteration)
		char* h_A  = new char[size];
		char* h_B  = new char[size];

		// Fill Host Memory

		for (size_t i = 0; i < size; i++)
		{
			h_A[i] = i % 27;
		}
	
		// Copy Data to Device
		pOCL_Device->CopyBufferToDevice(h_A, 0, size);	
		pOCL_Device->CopyBufferToDevice(h_A, 1, size);	
		// just to ensure that both buffers are on the device

		// Set Kernel Arguments
		// Encrypt kernel; _num is the number of 16-byte blocks
		cl_int _num = size / 16;
		err = clSetKernelArg(EncryptionKernel, 0, sizeof(cl_mem), &d_A);
		CHECK_OPENCL_ERROR(err);
		err = clSetKernelArg(EncryptionKernel, 1, sizeof(cl_mem), &d_B);
		CHECK_OPENCL_ERROR(err);
		err = clSetKernelArg(EncryptionKernel, 2, sizeof(cl_mem), &d_C); 
		CHECK_OPENCL_ERROR(err);
		err = clSetKernelArg(EncryptionKernel, 3, sizeof(cl_int), &rounds);  
		CHECK_OPENCL_ERROR(err);
		err = clSetKernelArg(EncryptionKernel, 4, sizeof(cl_int), &_num);
		CHECK_OPENCL_ERROR(err);
		err = clSetKernelArg(EncryptionKernel, 5, sizeof(cl_mem), &d_D);  
		CHECK_OPENCL_ERROR(err);
	
		// Wait for previous action to finish
		err = clFinish(pOCL_Device->GetQueue());
		CHECK_OPENCL_ERROR(err);
			
		// Global size: block count rounded up to a multiple of the group size
		size_t num = (((size / 16) + 255) / 256) * 256;
		size_t threads = 256;
		
		// Run the encryption kernel
		double seconds = GetTime();
		for (int i = 0; i < count; i++)
			err = clEnqueueNDRangeKernel(pOCL_Device->GetQueue(), 
			EncryptionKernel, 1, NULL, &num, &threads, 0, NULL, NULL);
		CHECK_OPENCL_ERROR(err);
		err = clFinish(pOCL_Device->GetQueue());
		CHECK_OPENCL_ERROR(err);
		seconds = GetTime() - seconds;

		printf("     %12.4f,    ", size / seconds / 1024.f / 1024.f * count);
		printf("     %12.4f,    ", seconds);

		// Override Input Buffer
		err = clEnqueueCopyBuffer(pOCL_Device->GetQueue(), d_B, d_A, 0, 0, 
			size, 0, NULL, NULL);
		CHECK_OPENCL_ERROR(err);
		err = clFinish(pOCL_Device->GetQueue());
		CHECK_OPENCL_ERROR(err);

		
		// Swap the roles of the buffers: d_B is now the input, d_A the output
		err = clSetKernelArg(EncryptionKernel, 1, sizeof(cl_mem), &d_A);
		CHECK_OPENCL_ERROR(err);
		err = clSetKernelArg(EncryptionKernel, 0, sizeof(cl_mem), &d_B);
		CHECK_OPENCL_ERROR(err);

		// Run the decryption kernel
		seconds = GetTime();
		for (int i = 0; i < count; i++)
			err = clEnqueueNDRangeKernel(pOCL_Device->GetQueue(), 
			EncryptionKernel, 1, NULL, &num, &threads, 0, NULL, NULL);
		CHECK_OPENCL_ERROR(err);
		err = clFinish(pOCL_Device->GetQueue());
		CHECK_OPENCL_ERROR(err);
		seconds = GetTime() - seconds;
		printf("     %12.4f,    ", size / seconds / 1024.f / 1024.f * count);
		printf("     %12.4f,    ", seconds);
		

		// Copy Data From Device
		pOCL_Device->CopyBufferToHost  (h_B, 0, size);
		
		// Verify Data: the round trip must reproduce the original input
		bool passed = true;
		for (size_t i = 0; i < size; i++)
		{
			if (h_A[i] != h_B[i])
			{
				passed = false;
				printf("\n");
				// size and i are size_t, so they need %zu rather than %d
				printf("Encountered an error when running the benchmark with size %zu.\n", 
					size);
				printf("At element %zu: %d != %d.\n", i, h_A[i], h_B[i]);
				printf("\n");
				break;
			}
		}
		if (passed)
			printf("        passed  ");
		else
			printf("        failed  ");
		printf("\n");

		// Release this iteration's host buffers; without this every pass
		// leaks 2 * size bytes.
		delete[] h_A;
		delete[] h_B;
	}
	// Free resources
	delete pOCL_Device;
	
	delete[] roundKeys;

	return 0;
}
Beispiel #28
0
void OCL_Device::CopyBufferToHost  (void* h_Buffer, int idx, size_t size)
{
	cl_int err = clEnqueueReadBuffer (m_queue, m_buffers[idx], CL_TRUE, 0, 
		size, h_Buffer, 0, NULL, NULL);
	CHECK_OPENCL_ERROR(err);
}
Beispiel #29
0
/**
 * Runs `kernel` for `iterations` launches and derives timing and throughput
 * from the per-launch event profiling timestamps.
 *
 * localThreads is clamped to the kernel's maximum work-group size first.
 *
 * @param kernel        Kernel to benchmark; outputBuffer is bound as its first argument.
 * @param outputBuffer  Buffer passed to the kernel as argument 0.
 * @param timeTaken     Receives the average kernel time per iteration, in seconds.
 * @param gbps          Receives the computed throughput figure.
 * @return SDK_SUCCESS on success; failing API calls are handled by CHECK_OPENCL_ERROR.
 */
int
ComputeBench::bandwidth(cl_kernel &kernel,
        cl_mem outputBuffer,
        double *timeTaken,
        double *gbps
        )
{
    cl_int status;

    // Check group size against kernelWorkGroupSize
    status = clGetKernelWorkGroupInfo(kernel,
            devices[sampleArgs->deviceId],
            CL_KERNEL_WORK_GROUP_SIZE,
            sizeof (size_t),
            &kernelWorkGroupSize,
            0);
    CHECK_OPENCL_ERROR(status, "clGetKernelWorkGroupInfo failed.");

    if (localThreads > kernelWorkGroupSize) {
        localThreads = kernelWorkGroupSize;
    }

    //Set appropriate arguments to the kernel
    int argIndex = 0;
    {
        status = clSetKernelArg(kernel,
                argIndex++,
                sizeof (cl_mem),
                (void *) &outputBuffer);
        CHECK_OPENCL_ERROR(status, "clSetKernelArg failed.(outputBuffer)");
    }

    double sec = 0;
    int iter = iterations;

    // Run the kernel for a number of iterations
    for (int i = 0; i < iter; i++) {
        // Enqueue a kernel run call
        cl_event ndrEvt;
        status = clEnqueueNDRangeKernel(commandQueue,
                kernel,
                1,
                NULL,
                &globalThreads,
                &localThreads,
                0,
                NULL,
                &ndrEvt);
        CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel failed.");

        // wait for the kernel call to finish execution
        status = clWaitForEvents(1, &ndrEvt);
        CHECK_OPENCL_ERROR(status, "clWaitForEvents failed.");

        // Calculate performance
        cl_ulong startTime;
        cl_ulong endTime;

        // Get kernel profiling info
        status = clGetEventProfilingInfo(ndrEvt,
                CL_PROFILING_COMMAND_START,
                sizeof (cl_ulong),
                &startTime,
                0);
        CHECK_OPENCL_ERROR(status, "clGetEventProfilingInfo failed.(startTime)");

        status = clGetEventProfilingInfo(ndrEvt,
                CL_PROFILING_COMMAND_END,
                sizeof (cl_ulong),
                &endTime,
                0);
        CHECK_OPENCL_ERROR(status, "clGetEventProfilingInfo failed.(endTime)");

        // Cumulate time for each iteration (timestamps are in nanoseconds)
        sec += 1e-9 * (endTime - startTime);

        status = clReleaseEvent(ndrEvt);
        CHECK_OPENCL_ERROR(status, "clReleaseEvent failed.(ndrEvt)");

        status = clFinish(commandQueue);
        CHECK_OPENCL_ERROR(status, "clFinish failed");
    }

    // Copy bytes
    int bytesPerThread = FORLOOP;
    // Multiply in double so a large iteration count cannot overflow int
    double bytes = (double) iter * (double) bytesPerThread;
    double perf = (bytes / sec) * 1e-9;
    perf *= globalThreads * vectorSize;

    *gbps = perf;
    *timeTaken = sec / iter;

    return SDK_SUCCESS;
}
Beispiel #30
0
/**
 * Creates the OpenCL objects used by the benchmark: context, command queue
 * (with profiling enabled), the outputKadd buffer, the program built from
 * ComputeBench.cl, and the "Kadd" kernel stored in kernel[0].
 *
 * The device type comes from sampleArgs->deviceType ("cpu" selects the CPU;
 * anything else selects the GPU, falling back to the CPU when no GPU is
 * reported). vectorSize is normalised here as well.
 *
 * @return SDK_SUCCESS on success, SDK_FAILURE for an unsupported vector size;
 *         other failures are handled by the CHECK_* macros.
 */
int
ComputeBench::setupCL(void)
{
    cl_int status = 0;
    cl_device_type dType;

    if (sampleArgs->deviceType.compare("cpu") == 0) {
        dType = CL_DEVICE_TYPE_CPU;
    } else //deviceType = "gpu"
    {
        dType = CL_DEVICE_TYPE_GPU;
        if (sampleArgs->isThereGPU() == false) {
            std::cout << "GPU not found. Falling back to CPU device" << std::endl;
            dType = CL_DEVICE_TYPE_CPU;
        }
    }

    /*
     * Have a look at the available platforms and pick either
     * the AMD one if available or a reasonable default.
     */
    cl_platform_id platform = NULL;
    int retValue = getPlatform(platform, sampleArgs->platformId, sampleArgs->isPlatformEnabled());
    CHECK_ERROR(retValue, SDK_SUCCESS, "getPlatform() failed");

    // Display available devices.
    retValue = displayDevices(platform, dType);
    CHECK_ERROR(retValue, SDK_SUCCESS, "displayDevices() failed");

    /*
     * If we could find our platform, use it. Otherwise use just available platform.
     */
    cl_context_properties cps[3] = {
        CL_CONTEXT_PLATFORM,
        (cl_context_properties) platform,
        0
    };

    context = clCreateContextFromType(cps,
            dType,
            NULL,
            NULL,
            &status);
    CHECK_OPENCL_ERROR(status, "clCreateContextFromType failed.");

    // getting device on which to run the sample
    status = getDevices(context, &devices, sampleArgs->deviceId,
            sampleArgs->isDeviceIdEnabled());
    CHECK_ERROR(status, SDK_SUCCESS, "getDevices() failed");

    //Set device info of given cl_device_id
    retValue = deviceInfo.setDeviceInfo(devices[sampleArgs->deviceId]);
    CHECK_ERROR(retValue, SDK_SUCCESS, "SDKDeviceInfo::setDeviceInfo() failed");

    // Extract the text between the first and second space of the device
    // version string. NOTE(review): vStrVal is not used again in this function.
    std::string deviceStr(deviceInfo.deviceVersion);
    size_t vStart = deviceStr.find(" ", 0);
    size_t vEnd = deviceStr.find(" ", vStart + 1);
    std::string vStrVal = deviceStr.substr(vStart + 1, vEnd - vStart - 1);


    // OpenCL 1.1 has inbuilt support for vec3 data types
    // NOTE(review): this check runs before vec3 is assigned further below and
    // is not conditioned on the device version computed above - confirm intent.
    if (vec3 == true) {
        OPENCL_EXPECTED_ERROR("Device doesn't support built-in 3 component vectors!");
    }
    // The block is to move the declaration of prop closer to its use
    /* Note: Using deprecated clCreateCommandQueue as CL_QUEUE_PROFILING_ENABLE flag not currently working 
     ***with clCreateCommandQueueWithProperties*/
    cl_command_queue_properties prop = 0;
    prop |= CL_QUEUE_PROFILING_ENABLE;

    commandQueue = clCreateCommandQueue(context,
            devices[sampleArgs->deviceId],
            prop,
            &status);
    CHECK_OPENCL_ERROR(status, "clCreateCommandQueue failed.");

    if (sampleArgs->isLoadBinaryEnabled()) {
        // Always assuming kernel was dumped for vector-width 1
        if (vectorSize != 0) {
            std::cout <<
                    "Ignoring specified vector-width. Assuming kernel was dumped for vector-width 1"
                    << std::endl;
        }
        vectorSize = 1;
    } else {
        // If vector-size is not specified in the command-line, choose the preferred size for the device
        if (vectorSize == 0) {
            vectorSize = deviceInfo.preferredFloatVecWidth;
        } else if (vectorSize == 3) {
            //Make vectorSize as 4 if -v option is 3.
            //This memory alignment is required as per OpenCL for type3 vectors
            vec3 = true;
            vectorSize = 4;
        } else if ((1 != vectorSize) && (2 != vectorSize) && (4 != vectorSize) &&
                (8 != vectorSize) && (16 != vectorSize)) {
            std::cout << "The vectorsize can only be one of 1,2,3(4),4,8,16!" << std::endl;
            return SDK_FAILURE;
        }
    }


    // Write-only output buffer holding vectorSize * length floats
    outputKadd = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof (cl_float) * vectorSize * length, 0, &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (outputKadd)");

    // Compose the -D options that select the kernel's data types; a scalar
    // build uses uint/uint4, otherwise uintN for both (N is 3 when vec3 is set)
    char buildOption[512];
    if (vectorSize == 1) {
        sprintf(buildOption, "-D DATATYPE=uint -D DATATYPE2=uint4 ");
        //sprintf(buildOption, "-D DATATYPE=float -D DATATYPE2=float4 ");
    } else {
        sprintf(buildOption, "-D DATATYPE=uint%d -D DATATYPE2=uint%d ", (vec3 == true) ? 3 : vectorSize, (vec3 == true) ? 3 : vectorSize);
        //sprintf(buildOption, "-D DATATYPE=float%d -D DATATYPE2=float%d ", (vec3 == true) ? 3 : vectorSize, (vec3 == true) ? 3 : vectorSize);
    }

    strcat(buildOption, "-D IDXTYPE=uint ");

    // create a CL program using the kernel source
    buildProgramData buildData;
    buildData.kernelName = std::string("ComputeBench.cl");
    buildData.devices = devices;
    buildData.deviceId = sampleArgs->deviceId;
    buildData.flagsStr = std::string(buildOption);
    if (sampleArgs->isLoadBinaryEnabled()) {
        buildData.binaryName = std::string(sampleArgs->loadBinary.c_str());
    }

    if (sampleArgs->isComplierFlagsSpecified()) {
        buildData.flagsFileName = std::string(sampleArgs->flags.c_str());
    }

    retValue = buildOpenCLProgram(program, context, buildData);
    CHECK_ERROR(retValue, SDK_SUCCESS, "buildOpenCLProgram() failed");

    // Global memory bandwidth from read-single access
    kernel[0] = clCreateKernel(program, "Kadd", &status);
    CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(Kadd)");

    return SDK_SUCCESS;
}