void checkImagesEqual(vtkImageDataPtr input1, vtkImageDataPtr input2)
{
	REQUIRE(input1.Get() != (vtkImageData*)NULL);
	REQUIRE(input2.Get() != (vtkImageData*)NULL);

	// check spacing, dim, type, origin
	REQUIRE(input1->GetDataDimension() == input2->GetDataDimension());
	REQUIRE(input1->GetScalarType() == input2->GetScalarType());
	REQUIRE(input1->GetNumberOfScalarComponents() == input2->GetNumberOfScalarComponents());
	REQUIRE(Eigen::Array3i(input1->GetDimensions()).isApprox(Eigen::Array3i(input2->GetDimensions())));
	CHECK(Eigen::Array3d(input1->GetSpacing()).isApprox(Eigen::Array3d(input2->GetSpacing()), 1.0E-2));
	CHECK(Eigen::Array3d(input1->GetOrigin()).isApprox(Eigen::Array3d(input2->GetOrigin())));

	vtkImageMathematicsPtr diff = vtkImageMathematicsPtr::New();
	diff->SetOperationToSubtract();
	diff->SetInput1Data(input1);
	diff->SetInput2Data(input2);
	diff->Update();

	vtkImageAccumulatePtr histogram = vtkImageAccumulatePtr::New();
	histogram->SetInputData(0, diff->GetOutput());
	histogram->Update();

	Eigen::Array3d histogramRange = Eigen::Array3d(histogram->GetMax()) - Eigen::Array3d(histogram->GetMin());

	for (int i = 0; i < input1->GetNumberOfScalarComponents(); ++i)
	{
		CHECK(histogramRange[i] < 0.01);
		CHECK(histogramRange[i] > -0.01);
	}
}
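// A minimal usage sketch of the helper above, assuming the Catch test macros (REQUIRE/CHECK)
// and the vtkImageDataPtr smart-pointer typedef already used in this file. The test case name
// and tag are illustrative only; std::memset requires <cstring>.
TEST_CASE("checkImagesEqual: an image compared with itself passes all checks", "[unit]")
{
	vtkImageDataPtr image = vtkImageDataPtr::New();
	image->SetDimensions(4, 4, 4);
	image->SetSpacing(0.5, 0.5, 0.5);
	image->AllocateScalars(VTK_UNSIGNED_CHAR, 1);
	std::memset(image->GetScalarPointer(), 0, 4 * 4 * 4); // zero-fill so the scalar data is defined

	checkImagesEqual(image, image); // identical inputs: the difference histogram range is zero
}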
bool VNNclAlgorithm::reconstruct(ProcessedUSInputDataPtr input, vtkImageDataPtr outputData, float radius, int nClosePlanes)
{
	mMeasurementNames.clear();

	int numBlocks = 10; // FIXME? needs to be the same as the number of input bscans to the voxel_method kernel

	// Split input US into blocks.
	// Splits and copies data from the processed input in the way the kernel will process it, which is per frameBlock.
	frameBlock_t* inputBlocks = new frameBlock_t[numBlocks];
	size_t nPlanes_numberOfInputImages = input->getDimensions()[2];
	this->initializeFrameBlocks(inputBlocks, numBlocks, input);

	// Allocate CL memory for each frame block
	VECTOR_CLASS<cl::Buffer> clBlocks;
	report("Allocating OpenCL input block buffers");
	for (int i = 0; i < numBlocks; i++)
	{
		//TODO why does the context suddenly contain a "dummy" device?
		cl::Buffer buffer = mOulContex->createBuffer(mOulContex->getContext(),
				CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
				inputBlocks[i].length,
				inputBlocks[i].data,
				"block buffer " + QString::number(i).toStdString());
		clBlocks.push_back(buffer);
	}

	// Allocate output memory
	int *outputDims = outputData->GetDimensions();
	size_t outputVolumeSize = outputDims[0] * outputDims[1] * outputDims[2] * sizeof(unsigned char);
	report(QString("Allocating CL output buffer, size %1").arg(outputVolumeSize));

	cl_ulong globalMemUse = 10 * inputBlocks[0].length
			+ outputVolumeSize
			+ sizeof(float) * 16 * nPlanes_numberOfInputImages
			+ sizeof(cl_uchar) * input->getDimensions()[0] * input->getDimensions()[1];
	if (isUsingTooMuchMemory(outputVolumeSize, inputBlocks[0].length, globalMemUse))
		return false;

	cl::Buffer outputBuffer = mOulContex->createBuffer(mOulContex->getContext(), CL_MEM_WRITE_ONLY, outputVolumeSize, NULL, "output volume buffer");

	// Fill the plane matrices
	float *planeMatrices = new float[16 * nPlanes_numberOfInputImages]; // 4x4 matrix = 16 floats per plane
	this->fillPlaneMatrices(planeMatrices, input);

	cl::Buffer clPlaneMatrices = mOulContex->createBuffer(mOulContex->getContext(),
			CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
			nPlanes_numberOfInputImages * sizeof(float) * 16,
			planeMatrices,
			"plane matrices buffer");

	// US probe mask
	cl::Buffer clMask = mOulContex->createBuffer(mOulContex->getContext(),
			CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
			sizeof(cl_uchar) * input->getMask()->GetDimensions()[0] * input->getMask()->GetDimensions()[1],
			input->getMask()->GetScalarPointer(),
			"mask buffer");

	double *out_spacing = outputData->GetSpacing();
	float spacings[2];
	float f_out_spacings[3];
	f_out_spacings[0] = out_spacing[0];
	f_out_spacings[1] = out_spacing[1];
	f_out_spacings[2] = out_spacing[2];

	spacings[0] = input->getSpacing()[0];
	spacings[1] = input->getSpacing()[1];

	//TODO why 4? because float4 is used??
	size_t planes_eqs_size = sizeof(cl_float) * 4 * nPlanes_numberOfInputImages;

	// Find the optimal local work size
	size_t local_work_size;
	unsigned int deviceNumber = 0;
	cl::Device device = mOulContex->getDevice(deviceNumber);
	mKernel.getWorkGroupInfo(device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, &local_work_size);

	size_t close_planes_size = this->calculateSpaceNeededForClosePlanes(mKernel, device, local_work_size, nPlanes_numberOfInputImages, nClosePlanes);

	this->setKernelArguments(
			mKernel,
			outputDims[0],
			outputDims[1],
			outputDims[2],
			f_out_spacings[0],
			f_out_spacings[1],
			f_out_spacings[2],
			input->getDimensions()[0],
			input->getDimensions()[1],
			spacings[0],
			spacings[1],
			clBlocks,
			outputBuffer,
			clPlaneMatrices,
			clMask,
			planes_eqs_size,
			close_planes_size,
			radius);

	report(QString("Using %1 as local workgroup size").arg(local_work_size));

	// We will divide the work into cubes of cube_dim^3 voxels. The global work size is the padded
	// number of voxels divided by the cube volume.
	int cube_dim = 4;
	int cube_dim_pow3 = cube_dim * cube_dim * cube_dim;

	// Global work items:
	size_t global_work_size = ((outputDims[0] + cube_dim) * (outputDims[1] + cube_dim) * (outputDims[2] + cube_dim)) / cube_dim_pow3; // = number of cubes = number of kernel instances to run

	// Round global_work_size up to the nearest multiple of local_work_size
	if (global_work_size % local_work_size)
		global_work_size = ((global_work_size / local_work_size) + 1) * local_work_size; // ceil(...)

	unsigned int queueNumber = 0;
	cl::CommandQueue queue = mOulContex->getQueue(queueNumber);

	this->measureAndExecuteKernel(queue, mKernel, global_work_size, local_work_size, mKernelMeasurementName);
	this->measureAndReadBuffer(queue, outputBuffer, outputVolumeSize, outputData->GetScalarPointer(), "vnncl_read_buffer");
	setDeepModified(outputData);

	// Cleaning up
	report(QString("Done, freeing GPU memory"));
	this->freeFrameBlocks(inputBlocks, numBlocks);
	delete[] inputBlocks;
	inputBlocks = NULL;
	delete[] planeMatrices; // safe to free: the CL buffer was created with CL_MEM_COPY_HOST_PTR
	planeMatrices = NULL;

	return true;
}
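// Standalone sketch (not part of VNNclAlgorithm) of the work-size arithmetic used above: the
// output volume is processed in cubes of cube_dim^3 voxels, and the resulting work-item count is
// rounded up to a multiple of the device's preferred local work size, since OpenCL requires the
// global size to be divisible by the local size when a local size is specified. The function name
// is illustrative only.
static size_t computeGlobalWorkSize(const int outputDims[3], size_t local_work_size, int cube_dim = 4)
{
	int cube_dim_pow3 = cube_dim * cube_dim * cube_dim;
	// Padded voxel count divided by the cube volume = number of cubes (one work item per cube).
	size_t global_work_size = ((outputDims[0] + cube_dim) * (outputDims[1] + cube_dim) * (outputDims[2] + cube_dim)) / cube_dim_pow3;
	if (global_work_size % local_work_size)
		global_work_size = (global_work_size / local_work_size + 1) * local_work_size; // round up
	return global_work_size;
}
// Example: a 256x256x256 output with local_work_size 64 gives 260^3 / 64 = 274625 cubes,
// rounded up to 274688 work items.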