bool VNNclAlgorithm::reconstruct(ProcessedUSInputDataPtr input, vtkImageDataPtr outputData, float radius, int nClosePlanes)
{
	mMeasurementNames.clear();

	int numBlocks = 10; // FIXME? needs to be the same as the number of input bscans to the voxel_method kernel

	// Split input US into blocks
	// Splits and copies data from the processed input in the way the kernel will processes it, which is per frameBlock
	frameBlock_t* inputBlocks = new frameBlock_t[numBlocks];
	size_t nPlanes_numberOfInputImages = input->getDimensions()[2];
	this->initializeFrameBlocks(inputBlocks, numBlocks, input);

	// Allocate CL memory for each frame block
	VECTOR_CLASS<cl::Buffer> clBlocks;
	report("Allocating OpenCL input block buffers");
	for (int i = 0; i < numBlocks; i++)
	{
		//TODO why does the context suddenly contain a "dummy" device?
		cl::Buffer buffer = mOulContex->createBuffer(mOulContex->getContext(), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, inputBlocks[i].length, inputBlocks[i].data, "block buffer "+QString::number(i).toStdString());
		clBlocks.push_back(buffer);
	}
	// Allocate output memory
	int *outputDims = outputData->GetDimensions();

	size_t outputVolumeSize = outputDims[0] * outputDims[1] * outputDims[2] * sizeof(unsigned char);

	report(QString("Allocating CL output buffer, size %1").arg(outputVolumeSize));

	cl_ulong globalMemUse = 10 * inputBlocks[0].length + outputVolumeSize + sizeof(float) * 16 * nPlanes_numberOfInputImages + sizeof(cl_uchar) * input->getDimensions()[0] * input->getDimensions()[1];
	if(isUsingTooMuchMemory(outputVolumeSize, inputBlocks[0].length, globalMemUse))
		return false;

	cl::Buffer outputBuffer = mOulContex->createBuffer(mOulContex->getContext(), CL_MEM_WRITE_ONLY, outputVolumeSize, NULL, "output volume buffer");

	// Fill the plane matrices
	float *planeMatrices = new float[16 * nPlanes_numberOfInputImages]; //4x4 (matrix) = 16
	this->fillPlaneMatrices(planeMatrices, input);

	cl::Buffer clPlaneMatrices = mOulContex->createBuffer(mOulContex->getContext(), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, nPlanes_numberOfInputImages * sizeof(float) * 16, planeMatrices, "plane matrices buffer");

	// US Probe mask
	cl::Buffer clMask = mOulContex->createBuffer(mOulContex->getContext(), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
			sizeof(cl_uchar) * input->getMask()->GetDimensions()[0] * input->getMask()->GetDimensions()[1],
			input->getMask()->GetScalarPointer(), "mask buffer");

	double *out_spacing = outputData->GetSpacing();
	float spacings[2];
	float f_out_spacings[3];
	f_out_spacings[0] = out_spacing[0];
	f_out_spacings[1] = out_spacing[1];
	f_out_spacings[2] = out_spacing[2];


	spacings[0] = input->getSpacing()[0];
	spacings[1] = input->getSpacing()[1];

	//TODO why 4? because float4 is used??
	size_t planes_eqs_size =  sizeof(cl_float)*4*nPlanes_numberOfInputImages;

	// Find the optimal local work size
	size_t local_work_size;
	unsigned int deviceNumber = 0;
	cl::Device device = mOulContex->getDevice(deviceNumber);
	mKernel.getWorkGroupInfo(device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, &local_work_size);

	size_t close_planes_size = this->calculateSpaceNeededForClosePlanes(mKernel, device, local_work_size, nPlanes_numberOfInputImages, nClosePlanes);

	this->setKernelArguments(
			mKernel,
			outputDims[0],
			outputDims[1],
			outputDims[2],
			f_out_spacings[0],
			f_out_spacings[1],
			f_out_spacings[2],
			input->getDimensions()[0],
			input->getDimensions()[1],
			spacings[0],
			spacings[1],
			clBlocks,
			outputBuffer,
			clPlaneMatrices,
			clMask,
			planes_eqs_size,
			close_planes_size,
			radius);

	report(QString("Using %1 as local workgroup size").arg(local_work_size));

	// We will divide the work into cubes of CUBE_DIM^3 voxels. The global work size is the total number of voxels divided by that.
	int cube_dim = 4;
	int cube_dim_pow3 = cube_dim * cube_dim * cube_dim;
	// Global work items:
	size_t global_work_size = (((outputDims[0] + cube_dim) * (outputDims[1] + cube_dim) * (outputDims[2] + cube_dim)) / cube_dim_pow3); // = number of cubes = number of kernels to run

	// Round global_work_size up to nearest multiple of local_work_size
	if (global_work_size % local_work_size)
		global_work_size = ((global_work_size / local_work_size) + 1) * local_work_size; // ceil(...)

	unsigned int queueNumber = 0;
	cl::CommandQueue queue = mOulContex->getQueue(queueNumber);
	this->measureAndExecuteKernel(queue, mKernel, global_work_size, local_work_size, mKernelMeasurementName);
	this->measureAndReadBuffer(queue, outputBuffer, outputVolumeSize, outputData->GetScalarPointer(), "vnncl_read_buffer");
	setDeepModified(outputData);
	// Cleaning up
	report(QString("Done, freeing GPU memory"));
	this->freeFrameBlocks(inputBlocks, numBlocks);
	delete[] inputBlocks;

	inputBlocks = NULL;

	return true;
}
void IGTLinkConversionFixture::setValue(vtkImageDataPtr data, int x, int y, int z, unsigned char val)
{
    *reinterpret_cast<unsigned char*>(data->GetScalarPointer(x,y,z)) = val;
}