void MultiScaleTransforms::PrepareTransform(double* kernel, double scale)
{
	ao::uvector<double> shape;
	size_t kernelSize;
	MakeShapeFunction(scale, shape, kernelSize);
	
	memset(kernel, 0, sizeof(double) * _width * _height);
	
	FFTConvolver::PrepareSmallKernel(kernel, _width, _height, shape.data(), kernelSize);
}
void MultiScaleTransforms::Transform(const ao::uvector<double*>& images, double* scratch, double scale)
{
	ao::uvector<double> shape;
	size_t kernelSize;
	MakeShapeFunction(scale, shape, kernelSize);
	
	memset(scratch, 0, sizeof(double) * _width * _height);
	
	FFTConvolver::PrepareSmallKernel(scratch, _width, _height, shape.data(), kernelSize);
	for(double*const* imageIter = images.begin(); imageIter!=images.end(); ++imageIter)
		FFTConvolver::ConvolveSameSize(_fftwManager, *imageIter, scratch, _width, _height);
}
void FastMultiScaleClean<ImageSetType>::executeMajorIterationForScale(double currentScale, double nextScale, bool& reachedStopGain, bool& canCleanFurther)
{
	ImageBufferAllocator& allocator = *_dataImageOriginal->Allocator();
	// Scale down the image so that the "scale size" is 10 pixels
	// 10 pixels is so that the peak finding is still reasonably accurate
	_rescaledWidth = size_t(ceil(double(_originalWidth)*(10.0/currentScale))),
	_rescaledHeight = size_t(ceil(double(_originalHeight)*(10.0/currentScale)));
	double rescaleFactor = double(_rescaledWidth) / _originalWidth;
	if(_rescaledWidth > _originalWidth || _rescaledHeight > _originalHeight)
	{
		_rescaledWidth = _originalWidth;
		_rescaledHeight = _originalHeight;
		rescaleFactor = 1.0;
	}
	
	ImageSetType
		largeScaleImage(_rescaledWidth*_rescaledHeight, *_dataImageOriginal),
		nextScaleImage(_rescaledWidth*_rescaledHeight, *_dataImageOriginal),
		currentScaleModel(_rescaledWidth*_rescaledHeight, *_modelImage);
	std::vector<double*> scaledPsfs(_originalPsfs->size());
	_dataImageLargeScale = &largeScaleImage;
	_dataImageNextScale = &nextScaleImage;
	_scaledPsfs = &scaledPsfs;
	
	canCleanFurther = true;
	size_t cpuCount = this->_threadCount;
	
	double thresholdBias = pow(this->_multiscaleThresholdBias, log2(currentScale));
	std::cout << "Threshold bias: " << thresholdBias << '\n';
	double oldSubtractionGain = this->_subtractionGain;
	this->_subtractionGain *= sqrt(thresholdBias);
	
	// Fill the large and next scale images with the rescaled images
	FFTResampler imageResampler(_originalWidth, _originalHeight, _rescaledWidth, _rescaledHeight, cpuCount, false);
	imageResampler.Start();
	for(size_t i=0; i!=_dataImageOriginal->ImageCount(); ++i)
		imageResampler.AddTask(_dataImageOriginal->GetImage(i), largeScaleImage.GetImage(i));
	for(size_t i=0; i!=_originalPsfs->size(); ++i)
	{
		scaledPsfs[i] = allocator.Allocate(_rescaledWidth*_rescaledHeight);
		imageResampler.AddTask((*_originalPsfs)[i], scaledPsfs[i]);
	}
	imageResampler.Finish();
	
	for(size_t i=0; i!=_dataImageOriginal->ImageCount(); ++i)
		memcpy(nextScaleImage.GetImage(i), largeScaleImage.GetImage(i), sizeof(double)*_rescaledWidth*_rescaledHeight);
	for(size_t i=0; i!=currentScaleModel.ImageCount(); ++i)
		memset(currentScaleModel.GetImage(i), 0, sizeof(double)*_rescaledWidth*_rescaledHeight);
	
	// Convolve the large and next scale images and the psfs
	double* kernelImage = allocator.Allocate(_rescaledWidth * _rescaledHeight);
	ao::uvector<double> shape;
	size_t kernelSize;
	MakeShapeFunction(currentScale * rescaleFactor, shape, kernelSize);
	memset(kernelImage, 0, sizeof(double) * _rescaledWidth * _rescaledHeight);
	FFTConvolver::PrepareSmallKernel(kernelImage, _rescaledWidth, _rescaledHeight, shape.data(), kernelSize);
	for(size_t i=0; i!=largeScaleImage.ImageCount(); ++i)
		FFTConvolver::ConvolveSameSize(largeScaleImage.GetImage(i), kernelImage, _rescaledWidth, _rescaledHeight);
	for(size_t i=0; i!=scaledPsfs.size(); ++i) {
		FFTConvolver::ConvolveSameSize(scaledPsfs[i], kernelImage, _rescaledWidth, _rescaledHeight);
		FFTConvolver::ConvolveSameSize(scaledPsfs[i], kernelImage, _rescaledWidth, _rescaledHeight);
	}
	
	MakeShapeFunction(nextScale * rescaleFactor, shape, kernelSize);
	memset(kernelImage, 0, sizeof(double) * _rescaledWidth * _rescaledHeight);
	FFTConvolver::PrepareSmallKernel(kernelImage, _rescaledWidth, _rescaledHeight, shape.data(), kernelSize);
	for(size_t i=0; i!=nextScaleImage.ImageCount(); ++i)
		FFTConvolver::ConvolveSameSize(nextScaleImage.GetImage(i), kernelImage, _rescaledWidth, _rescaledHeight);
	
	//FitsWriter writer;
	//writer.SetImageDimensions(_rescaledWidth, _rescaledHeight);
	//writer.Write("multiscale-kernel.fits", kernelImage);
	//writer.Write("multiscale-current-scale.fits", largeScaleImage.GetImage(0));
	//writer.Write("multiscale-next-scale.fits", nextScaleImage.GetImage(0));
	//writer.Write("multiscale-psf.fits", scaledPsfs[0]);
	//writer.SetImageDimensions(kernelSize, kernelSize);
	//writer.Write("multiscale-shape.fits", shape.data());
		
	size_t componentX=0, componentY=0;
	findPeak(componentX, componentY);
	std::cout << "Initial peak: " << peakDescription(largeScaleImage, componentX, componentY, rescaleFactor) << '\n';
	
	size_t peakIndex = componentX + componentY*_rescaledWidth;
	double peakNormalized = _dataImageLargeScale->JoinedValueNormalized(peakIndex) * rescaleFactor * rescaleFactor;
	double firstThreshold = this->_threshold, stopGainThreshold = fabs(peakNormalized*(1.0-this->_stopGain)/thresholdBias);
	std::cout << "Scale-adjusted threshold: " << firstThreshold*thresholdBias << ", major iteration stops at " << stopGainThreshold*thresholdBias << '\n';
	if(stopGainThreshold > firstThreshold)
	{
		firstThreshold = stopGainThreshold;
		std::cout << "Next major iteration for this scale at: " << stopGainThreshold << '\n';
	}
	else if(this->_stopGain != 1.0) {
		std::cout << "Major iteration threshold reached global threshold of " << this->_threshold << " for this scale.\n";
	}

	std::vector<ao::lane<CleanTask>*> taskLanes(cpuCount);
	std::vector<ao::lane<CleanResult>*> resultLanes(cpuCount);
	boost::thread_group threadGroup;
	for(size_t i=0; i!=cpuCount; ++i)
	{
		taskLanes[i] = new ao::lane<CleanTask>(1);
		resultLanes[i] = new ao::lane<CleanResult>(1);
		CleanThreadData cleanThreadData;
		cleanThreadData.startY = (_rescaledHeight*i)/cpuCount;
		cleanThreadData.endY = _rescaledHeight*(i+1)/cpuCount;
		cleanThreadData.parent = this;
		threadGroup.add_thread(new boost::thread(&FastMultiScaleClean<ImageSetType>::cleanThreadFunc, this, &*taskLanes[i], &*resultLanes[i], cleanThreadData));
	}
	
	while(fabs(peakNormalized) > firstThreshold*thresholdBias && this->_iterationNumber < this->_maxIter && !(largeScaleImage.IsComponentNegative(peakIndex) && this->_stopOnNegativeComponent))
	{
		if(this->_iterationNumber <= 10 ||
			(this->_iterationNumber <= 100 && this->_iterationNumber % 10 == 0) ||
			(this->_iterationNumber <= 1000 && this->_iterationNumber % 100 == 0) ||
			this->_iterationNumber % 1000 == 0)
			std::cout << "Iteration " << this->_iterationNumber << ": " << peakDescription(largeScaleImage, componentX, componentY, rescaleFactor) << '\n';
		
		CleanTask task;
		task.cleanCompX = componentX;
		task.cleanCompY = componentY;
		task.peak = largeScaleImage.Get(peakIndex);
		for(size_t i=0; i!=cpuCount; ++i)
			taskLanes[i]->write(task);
		
		currentScaleModel.AddComponent(largeScaleImage, peakIndex, this->_subtractionGain * rescaleFactor * rescaleFactor);
		
		double peakUnnormalized = 0.0;
		for(size_t i=0; i!=cpuCount; ++i)
		{
			CleanResult result;
			resultLanes[i]->read(result);
			if(result.peakLevelUnnormalized >= peakUnnormalized && std::isfinite(result.peakLevelUnnormalized))
			{
				peakUnnormalized = result.peakLevelUnnormalized;
				componentX = result.nextPeakX;
				componentY = result.nextPeakY;
			}
		}
		if(peakUnnormalized == 0.0)
		{
			std::cout << "No more components found at current scale: continuing to next scale.\n";
			canCleanFurther = false;
			break;
		}
		peakIndex = componentX + componentY*_rescaledWidth;
		peakNormalized = largeScaleImage.JoinedValueNormalized(peakIndex) * rescaleFactor * rescaleFactor;
		
		++this->_iterationNumber;
	}
	for(size_t i=0; i!=cpuCount; ++i)
		taskLanes[i]->write_end();
	threadGroup.join_all();
	for(size_t i=0; i!=cpuCount; ++i)
	{
		delete taskLanes[i];
		delete resultLanes[i];
	}
	std::cout << "Stopped on peak " << peakNormalized << '\n';
	reachedStopGain = fabs(peakNormalized) <= stopGainThreshold*thresholdBias;
	
	for(size_t i=0; i!=scaledPsfs.size(); ++i)
		allocator.Free(scaledPsfs[i]);
	
	MakeShapeFunction(currentScale * rescaleFactor, shape, kernelSize);
	memset(kernelImage, 0, sizeof(double) * _rescaledWidth * _rescaledHeight);
	FFTConvolver::PrepareSmallKernel(kernelImage, _rescaledWidth, _rescaledHeight, shape.data(), kernelSize);
	
	double* convolvedModel = allocator.Allocate(_originalWidth * _originalHeight);
	double* preparedPsf = allocator.Allocate(_originalWidth * _originalHeight);
	FFTResampler modelResampler(_rescaledWidth, _rescaledHeight, _originalWidth, _originalHeight, cpuCount, false);
	for(size_t i=0; i!=currentScaleModel.ImageCount(); ++i)
	{
		FFTConvolver::ConvolveSameSize(currentScaleModel.GetImage(i), kernelImage, _rescaledWidth, _rescaledHeight);
		//writer.SetImageDimensions(_rescaledWidth, _rescaledHeight);
		//writer.Write("multiscale-model-small.fits", currentScaleModel.GetImage(i));
		modelResampler.RunSingle(currentScaleModel.GetImage(i), convolvedModel);
		double *modelPtr = convolvedModel;
		double *dest = _modelImage->GetImage(i), *destEnd = dest + _originalWidth*_originalHeight;
		while(dest != destEnd) {
			*dest += *modelPtr;
			++modelPtr;
			++dest;
		}
		FFTConvolver::PrepareKernel(preparedPsf, (*_originalPsfs)[_modelImage->PSFIndex(i)], _originalWidth, _originalHeight);
		FFTConvolver::ConvolveSameSize(convolvedModel, preparedPsf, _originalWidth, _originalHeight);
		dest = _dataImageOriginal->GetImage(i);
		destEnd = dest + _originalWidth*_originalHeight;
		modelPtr = convolvedModel;
		while(dest != destEnd) {
			*dest -= *modelPtr;
			++modelPtr;
			++dest;
		}
	}
	allocator.Free(preparedPsf);
	allocator.Free(convolvedModel);
	allocator.Free(kernelImage);
	
	this->_subtractionGain = oldSubtractionGain;
}