void CMeanFilter_GPU::Apply(const Image& input, Image& output)
{
	const int radius = CFilterParameterInterpreter<int>::Convert(_parameters->GetParameter("Radius"));
	const int width = input.Width();
	const int height = input.Height();
	const int NbPixel = width * height;
	const int depth = input.Depth();
	const unsigned char *image_data = input.Data();
	unsigned char *output_data = output.Data();

	//int *TempPix = new int[ NbPixel * 2 ];
	unsigned char *TempMask = new unsigned char[NbPixel];
	//memset( TempPix, 0, NbPixel * 2 * sizeof( int ) );
	memset(TempMask, 255, NbPixel * sizeof(unsigned char));

	auto current_device = OpenCLUtils::Instance()->GetCurrentDevice();
	const cl::Context Context = current_device.Context();

	cl_int Error;
	cl_mem_flags InOutMemFlags = CL_MEM_READ_WRITE;
	cl_mem_flags InMemFlags = CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR;
	cl_mem_flags OutMemFlags = CL_MEM_WRITE_ONLY;

	// If you ever change the local size, don't forget to change the size in the kernel
	// Global size must be a multiple of local size
	unsigned int GlobalSizeY = height;
	unsigned int GlobalSizeX = width;

	cl::Buffer ImBuffer(Context, InMemFlags, NbPixel * depth * sizeof(unsigned char), (void*)image_data, &Error);
	cl::Buffer MaskBuffer(Context, InMemFlags, NbPixel * sizeof(unsigned char), (void*)TempMask, &Error);
	cl::Buffer OutBuffer(Context, OutMemFlags, NbPixel * depth * sizeof(unsigned char), 0, &Error);

	cl::CommandQueue Queue = current_device.CommandQueue();
	cl::Event Event;

	// Horizontal pass
	int Index = 0;
	m_KernelH.setArg(Index++, ImBuffer);
	m_KernelH.setArg(Index++, MaskBuffer);
	m_KernelH.setArg(Index++, OutBuffer);
	m_KernelH.setArg(Index++, height);
	m_KernelH.setArg(Index++, width);
	m_KernelH.setArg(Index++, radius);

	Queue.enqueueNDRangeKernel(m_KernelH, cl::NullRange, cl::NDRange(GlobalSizeX, GlobalSizeY), cl::NullRange, 0, &Event);
	Event.wait();

	Queue.enqueueReadBuffer(OutBuffer, true, 0, NbPixel * depth * sizeof(unsigned char), (void*)output_data, 0, &Event);
	Event.wait();

	Queue.finish();

	delete[] TempMask;
}
Esempio n. 2
0
void FiltersVector::GaussianBlur(ImageBuffer& Source, ImageBuffer& Dest, float Sigma)
{
   CheckCompatibility(Source, Dest);

   // Prepare mask
   int MaskSize = int(ceil(3 * Sigma));

   if (Sigma <= 0 || MaskSize > 31)
      throw cl::Error(CL_INVALID_ARG_VALUE, "Invalid sigma used with GaussianBlur - allowed : 0.01-10");

   uint NbElements = (MaskSize * 2 + 1 ) * (MaskSize * 2 + 1 );

   std::vector<float> Mask(NbElements);

   GenerateBlurMask(Mask, Sigma, MaskSize);
   // NOTE : Maybe we should generate the mask in the device to prevent having to send that buffer


   // Send mask to device
   ReadBuffer MaskBuffer(*m_CL, Mask.data(), NbElements);

   // Execute kernel
   Kernel(gaussian_blur, In(Source), Out(Dest), Source.Step(), Dest.Step(), Source.Height(), MaskBuffer, MaskSize);
}
void CMeanFilter_GPUPad::Apply(const Image& input, Image& output)
{
	const int radius = CFilterParameterInterpreter<int>::Convert(_parameters->GetParameter("Radius"));
	const int GroupSize = 32;
	const int width = input.Width();
	const int height = input.Height();
	const int depth = input.Depth();
	const int NbPixel = width * height;
	const uchar *image_data = input.Data();
	uchar *output_data = output.Data();
	const int NewHeight = Utils::GetNextMultipleOf(height, GroupSize) + radius * 2 + GroupSize;
	const int NewWidth = Utils::GetNextMultipleOf(width, GroupSize) + radius * 2 + GroupSize;
	const int NewNbPixel = NewHeight * NewWidth;

	Image NewImage(NewWidth, NewHeight, depth);
	int *TempPix = new int[NewNbPixel * 2 * depth];
	unsigned char *TempMask = new unsigned char[NewNbPixel];
	Image NewOut(NewWidth, NewHeight, depth);
	memset(TempPix, 0, NewNbPixel * 2 * depth * sizeof(int));
	memset(TempMask, 0, NewNbPixel * sizeof(unsigned char));

	// Copy image into subrect
	int NewImageIndex = 0;
	int OldImageIndex = 0;
	int temp_val = 0;
	for (int i = 0; i < height; ++i)
	{
		for (int j = 0; j < width; ++j)
		{
			for (int k = 0; k < depth; ++k)
			{
				NewImage(i + radius,j + radius,k) = input(i,j,k);
			}
			TempMask[(i+radius) * NewWidth + j + radius] = 1;
		}
	}

	auto current_device = OpenCLUtils::Instance()->GetCurrentDevice();
	const cl::Context Context = current_device.Context();

	cl_int Error;
	cl_mem_flags InOutMemFlags = CL_MEM_READ_WRITE;
	cl_mem_flags InMemFlags = CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR;
	cl_mem_flags OutMemFlags = CL_MEM_WRITE_ONLY;

	// If you ever change the local size, don't forget to change the size in the kernel
	unsigned int LocalSizeY = 1;
	unsigned int LocalSizeX = GroupSize;
	// Global size must be a multiple of local size
	unsigned int GlobalSizeY = (height + LocalSizeY - 1) / LocalSizeY;
	GlobalSizeY *= LocalSizeY;
	unsigned int GlobalSizeX = (width + LocalSizeX - 1) / LocalSizeX;
	GlobalSizeX *= LocalSizeX;

	cl::Buffer ImBuffer(Context, InMemFlags, NewNbPixel * depth * sizeof(unsigned char), (void*)NewImage.Data(), &Error);
	cl::Buffer MaskBuffer(Context, InMemFlags, NewNbPixel * sizeof(unsigned char), (void*)TempMask, &Error);
	cl::Buffer ImTempBuffer(Context,
		InOutMemFlags, NewNbPixel * 2 * depth * sizeof(int),
		0, &Error);
	cl::Buffer OutBuffer(Context, OutMemFlags, NewNbPixel * depth * sizeof(unsigned char), 0, &Error);

	cl::CommandQueue Queue = current_device.CommandQueue();
	cl::Event Event;

	// Zero out temp buffer
	m_KernelZero.setArg(0, ImTempBuffer);
	Queue.enqueueNDRangeKernel(m_KernelZero, cl::NullRange, cl::NDRange(NewNbPixel), cl::NullRange, 0, &Event);
	Event.wait();

	// Horizontal pass
	int Index = 0;
	m_KernelH.setArg(Index++, ImBuffer);
	m_KernelH.setArg(Index++, MaskBuffer);
	m_KernelH.setArg(Index++, ImTempBuffer);
	m_KernelH.setArg(Index++, NewHeight);
	m_KernelH.setArg(Index++, NewWidth);
	m_KernelH.setArg(Index++, radius);

	Queue.enqueueNDRangeKernel(m_KernelH, cl::NullRange, cl::NDRange(GlobalSizeX, GlobalSizeY), cl::NDRange(LocalSizeX, LocalSizeY), 0, &Event);
	Event.wait();

	// Vertical pass
	LocalSizeY = GroupSize;
	LocalSizeX = 1;
	// Global size must be a multiple of local size
	GlobalSizeY = (height + LocalSizeY - 1) / LocalSizeY;
	GlobalSizeY *= LocalSizeY;
	GlobalSizeX = (width + LocalSizeX - 1) / LocalSizeX;
	GlobalSizeX *= LocalSizeX;

	Index = 0;
	m_KernelV.setArg(Index++, ImTempBuffer);
	m_KernelV.setArg(Index++, OutBuffer);
	m_KernelV.setArg(Index++, NewHeight);
	m_KernelV.setArg(Index++, NewWidth);
	m_KernelV.setArg(Index++, radius);

	Queue.enqueueNDRangeKernel(m_KernelV, cl::NullRange, cl::NDRange(GlobalSizeY, GlobalSizeX), cl::NDRange(LocalSizeY, LocalSizeX), 0, &Event);
	Event.wait();

	Queue.enqueueReadBuffer(OutBuffer, true, 0, NewNbPixel * depth * sizeof(unsigned char), (void*)NewOut.Data(), 0, &Event);
	Event.wait();

	Queue.finish();

	//PrintToFile( NewOut, (size_t)NewWidth, (size_t)NewHeight, "GPUPad.txt" );

	// Read Sub rect
	for (int i = 0; i < height; ++i)
	{
		for (int j = 0; j < width; ++j)
		{
			for (int k = 0; k < depth; ++k)
			{
				output(i,j,k) = NewOut(i+radius, j+radius,k);
			}
		}
	}

	delete[] TempPix;
	delete[] TempMask;
}