//===============================
//=====================================================================
/*
 * Per-iteration reporting callback: prints the iteration number, the data and
 * smoothness energies stored in the shared parameter block, the total cost and
 * the root-mean-square of the gradient entries.
 *
 * iter   : iteration counter (only printed)
 * x      : current unknowns (not used here)
 * f      : total cost at x (printed as fData+lambda*fSmooth)
 * g      : gradient at x; only its bounds and values are read
 * params : must point to a globs_LBFGS_ whose fData/fSmooth fields were filled
 *          by the last cost/gradient evaluation
 */
void newiteration_residOpticalFlow_mt(int iter, const ap::real_1d_array& x, double f,const ap::real_1d_array& g, void *params)
{
	globs_LBFGS_ *glob_param=(globs_LBFGS_*)params;

	const int lowG=g.getlowbound();
	const int highG=g.gethighbound();
	const int numG=highG-lowG+1;

	double sumSqG=0.0;
	for(int ii=lowG;ii<=highG;ii++) sumSqG+=g(ii)*g(ii);

	//RMS = sqrt(mean(g_i^2)). The former expression ||g||_2/N was not the RMS it was labelled as.
	//An empty gradient reports 0 instead of dividing by zero.
	const double rmsG=(numG>0)?sqrt(sumSqG/(double)numG):0.0;

	cout<<"Iter="<<iter<<";fData="<<glob_param->fData<<";fSmooth="<<glob_param->fSmooth<<";fData+lambda*fSmooth="<<f<<";RMS(g)="<<rmsG<<endl;
}
/*
 *
 * The cost function considered here is the following (with u(x,y,z),v(x,y,z),w(x,y,z) the flow at each pixel or super-pixel)
 *
 * E(u,v,w)=\sum_{s\in S}\sum_{n\in s} Huber(r(u_s,v_s,w_s),deltaHdataTerm) + \lambda \sum_{s \in S}\sum_{a\in Neigh(s) s.t. a<s} \left[ Huber(u_s-u_a,deltaHsmoothTerm) + Huber(v_s-v_a,deltaHsmoothTerm) + Huber(w_s-w_a,deltaHsmoothTerm) \right]
 *
 * S is the set of superpixels (stored in the partition). Thus, we impose that the flow is the same for all voxels belonging to the same supervoxel
 *
 * r(u,v,w)=I(x+u,y+v,z+w,t+1)-I(x,y,z,t)=imTarget(p+uvw)-imSource(p)
 *
 * I(x+u,y+v,z+w,t+1) and its partial derivatives need to be calculated using interpolation. As usual, it is a trade-off between accuracy and speed.
 */
//Cost/gradient callback for the minimizer.
//
// x      : unknowns, 3 consecutive entries (u,v,w) per partition region, accessed 1-based
// f      : [out] total cost = fData + lambda*fSmooth
// g      : [out] gradient of f with respect to x, same layout as x (zeroed here before accumulation)
// params : must point to a globs_LBFGS_; its fData and fSmooth fields are overwritten on every call
//
//NOTE: every data-term implementation below is currently commented out, so the data term
//contributes 0 to f and g; only the weighted Huber smoothness term over the precomputed
//neighbor list (partitionNeigh) is evaluated.
//On inconsistent input the function prints an error and terminates the process with exit(2).
void funcgrad_residOpticalFlow_mt (ap::real_1d_array x, double& f, ap::real_1d_array& g, void *params)
{
#define INTERP_TRILINEAR //decides the kind of interpolation we want

	globs_LBFGS_ *glob_param=(globs_LBFGS_*)params;
	mylib::Partition* imTargetPartition=glob_param->imTargetPartition;
	mylib::Array* imSource=glob_param->imSource;
	mylib::Array* imTarget=glob_param->imTarget;
	mylib::Array* imTarget_dx=glob_param->imTarget_dx;
	mylib::Array* imTarget_dy=glob_param->imTarget_dy;
	mylib::Array* imTarget_dz=glob_param->imTarget_dz;
	vector<vector<pair<int,double> > >* partitionNeigh=glob_param->partitionNeigh;
	//mylib::Region** regionPvec=glob_param->regionPvec;

	double lambda=glob_param->lambda;
	double deltaHsmoothTerm=glob_param->deltaHsmoothTerm;
	//double deltaHdataTerm=glob_param->deltaHdataTerm;
	//float* scale=glob_param->scale;

	if(imSource->type!=mylib::UINT16_TYPE || imTarget->type!=mylib::UINT16_TYPE)
	{
		cout<<"ERROR: funcgrad_residOpticalFlow: code expects UINT16 images"<<endl;
		exit(2);
	}

	//mylib::uint16* imSourcePtr=(mylib::uint16*)(imSource->data);

	//validate all sizes BEFORE touching g: the memset below writes 3*numPartitions doubles
	int numPartitions=mylib::Get_Partition_Vertex_Count(imTargetPartition);

	int sizeX=x.gethighbound()-x.getlowbound()+1;
	if(3*numPartitions!=sizeX)
	{
		cout<<"ERROR: funcgrad_residOpticalFlow: size of unknowns does not much number of image regions"<<endl;
		exit(2);
	}

	int sizeG=g.gethighbound()-g.getlowbound()+1;
	if(sizeG<sizeX)
	{
		cout<<"ERROR: funcgrad_residOpticalFlow: gradient array is smaller than the number of unknowns"<<endl;
		exit(2);
	}

	if(partitionNeigh==NULL || partitionNeigh->size()<(size_t)numPartitions)
	{
		cout<<"ERROR: funcgrad_residOpticalFlow: neighbor list does not cover all image regions"<<endl;
		exit(2);
	}

	//initialize residual and gradient
	f=0;
	double fSmooth=0;
	memset(g.getcontent(),0,sizeof(double)*3*numPartitions);


	int numPpos=0,numNeighPos=0;
	//int *listedges=NULL;
	//int nedges=0;;
	double fAux=0.0,gAux=0.0,auxU,auxV,auxW;
	//int ndims=imTarget->ndims;

	//--------------------------
	/*
	 //needed if we calculate data term directly in here instead of calling the function calculateDataTermOneRegion
	mylib::Size_Type k;
	mylib::Indx_Type p;
	double auxI,der;
	mylib::Coordinate *c=mylib::Make_Array(mylib::PLAIN_KIND,mylib::DIMN_TYPE,1,&ndims);
	//memset(gAux,0,sizeof(double)*(ndims));
	mylib::Region* regionP=NULL;
	double* xx=new double[imTarget->ndims];
	*/
	//-------------------------

#ifdef INTERP_TRILINEAR
	//Only consumed by the (currently disabled) data term below. A fixed 3-entry stack array
	//replaces the former malloc(ndims) buffer, which was written at indices 0..2 regardless
	//of ndims, was never checked for allocation failure, and cost a heap round trip per call.
	const mylib::Array* imTargetDer[3]={imTarget_dx,imTarget_dy,imTarget_dz};
	(void)imTargetDer;//silence unused-variable warnings while the data term is disabled
#endif
	mylib::Use_Array_Basis(imTarget);//set basis to get all the coordinate indexes

//-------------------------------------------
	/*
	const int numThreads=12;
	double* fVec=new double[numThreads];
	boost::thread_group threads;
	int numPini=0,numPend=0;
	int step=(int)floor(double(numPartitions)/double(numThreads));
    for (int i = 0; i < numThreads-1; ++i)
	{
		numPend+=step;
        threads.create_thread(fDataTermThread(regionPvec,imTarget,imTargetDer,imSourcePtr,x.getcontent(),scale,deltaHdataTerm,fVec[i],g.getcontent(),numPini,numPend));
		numPini+=step;
	}
	threads.create_thread(fDataTermThread(regionPvec,imTarget,imTargetDer,imSourcePtr,x.getcontent(),scale,deltaHdataTerm,fVec[numThreads-1],g.getcontent(),numPini,numPartitions));//residual

    threads.join_all();
	for (int i = 0; i < numThreads; ++i) f+=fVec[i];
	delete[] fVec;
	*/
	//---------------------------------------------------------------------

	/*
	//single thread execution
	for(int numP=0;numP<numPartitions;numP++)
	{
		numPpos=numP*ndims;//for parallelization
		calculateDataTermOneRegion(regionPvec[numP],imTarget,imTargetDer,imSourcePtr,x.getcontent()+numPpos,scale,deltaHdataTerm,fAux,g.getcontent()+numPpos);
		f+=fAux;
		//for(int aa=0;aa<ndims;aa++) g(numPpos+1+aa)+=df[aa];//by passing g directly we avoid double copy
	}
	*/

	//not needed anymore since I precomputed before minimization
	//mylib::Free_Region(regionP);

	//-------------------------------------------------------------------------
	/*
		//calculate smooth term (value and gradient) using only 2n-connectivity regions (very limited)
		listedges=mylib::Get_Partition_Neighbors (imTargetPartition, numP, &nedges);
		mylib::P_Edge* pEdge=NULL;
		for(int ii=0;ii<nedges;ii++)
		{
			//pEdge->region1 < pEdge->region2 ALWAYS
			pEdge=mylib::Get_Partition_Edge (imTargetPartition, listedges[ii]);
			if(numP==(pEdge->region1))//to avoid double counting we only include edges with numP<neigh
			{
				numNeighPos=3*(pEdge->region2);
				auxU=x(numPpos+1)-x(numNeighPos+1);
				auxV=x(numPpos+2)-x(numNeighPos+2);
				auxW=x(numPpos+3)-x(numNeighPos+3);
				HuberCostAndDer(auxU,deltaHsmoothTerm,fAux,gAux);
				fSmooth+=fAux;
				g(numPpos+1)+=lambda*gAux;// *1.0
				g(numNeighPos+1)-=lambda*gAux;

				HuberCostAndDer(auxV,deltaHsmoothTerm,fAux,gAux);
				fSmooth+=fAux;
				g(numPpos+2)+=lambda*gAux;// *1.0
				g(numNeighPos+2)-=lambda*gAux;

				HuberCostAndDer(auxW,deltaHsmoothTerm,fAux,gAux);
				fSmooth+=fAux;
				g(numPpos+3)+=lambda*gAux;// *1.0
				g(numNeighPos+3)-=lambda*gAux;
			}else{
				break;//pEdge->region1 < pEdge->region2 ALWAYS
			}
		}
	 */
	//--------------------------------------------------------------------------------------------

	//calculate smooth term (value and gradient) using regions closer than maxDistance (precalculated ahead of time)
	//each neighbor entry is (index of neighboring region, weight of that edge)
	numPpos=0;
	for(int numP=0;numP<numPartitions;numP++,numPpos+=3)
	{
		const vector<pair<int,double> >& edges=(*partitionNeigh)[numP];
		for(vector<pair<int,double> >::const_iterator iter=edges.begin();iter!=edges.end();++iter)//we do not need to avoid double counting because it was already done during neighbor list creation
		{
			const double edgeWeight=iter->second;
			const double lambdaAux=lambda*edgeWeight;
			numNeighPos=3*(iter->first);

			//flow difference between the two regions for each component (x and g are accessed 1-based)
			auxU=x(numPpos+1)-x(numNeighPos+1);
			auxV=x(numPpos+2)-x(numNeighPos+2);
			auxW=x(numPpos+3)-x(numNeighPos+3);

			//the derivative with respect to the neighbor has the opposite sign, hence += / -=
			HuberCostAndDer(auxU,deltaHsmoothTerm,fAux,gAux);
			fSmooth+=fAux*edgeWeight;
			g(numPpos+1)+=lambdaAux*gAux;//*1.0
			g(numNeighPos+1)-=lambdaAux*gAux;

			HuberCostAndDer(auxV,deltaHsmoothTerm,fAux,gAux);
			fSmooth+=fAux*edgeWeight;
			g(numPpos+2)+=lambdaAux*gAux;//*1.0
			g(numNeighPos+2)-=lambdaAux*gAux;

			HuberCostAndDer(auxW,deltaHsmoothTerm,fAux,gAux);
			fSmooth+=fAux*edgeWeight;
			g(numPpos+3)+=lambdaAux*gAux;//*1.0
			g(numNeighPos+3)-=lambdaAux*gAux;
		}
	}

	//--------------------------------debug------------------------
	/*
	cout<<"DEBUGGING!!! funcgrad_residOpticalFlow"<<endl;
	//cout.precision(20);
	//cout.setf(ios::fixed,ios::floatfield);
	cout<<"fData="<<f<<";fSmooth="<<fSmooth<<";fData+lambda*fSmooth="<<f+lambda*fSmooth<<endl;

	cout<<"x=[";
	for(int ii=0;ii<3*numPartitions;ii+=3)
	{
		cout<<x(ii+1)<<" "<<x(ii+2)<<" "<<x(ii+3)<<";"<<endl;
	}
	cout<<"];"<<endl;

	cout<<"g=[";
	for(int ii=0;ii<3*numPartitions;ii+=3)
	{
		cout<<g(ii+1)<<" "<<g(ii+2)<<" "<<g(ii+3)<<";"<<endl;
	}
	cout<<"];"<<endl;
	//exit(2);
	 */
	//-------------------------------------------------------------

	//--------------------------------------debug: write out gradient based on superpixels-----------------------------------------
	/*
	for(int numP=0;numP<numPartitions;numP++,numPpos+=3)
	{
		//regionP=mylib::Record_P_Vertex(imTargetPartition,numP,1,0);//regions can have holes
		regionP=regionPvec[numP];
		for (k = 0; k < regionP->rastlen; k+=2)
		{
			for (p = regionP->raster[k]; p <= regionP->raster[k+1]; p++)//perform computation on voxel p
			{
				cout<<p<<" "<<numP+1<<" "<<g(3*numP+1)<<" "<<g(3*numP+2)<<" "<<g(3*numP+3)<<";"<<endl;
			}
		}
		//mylib::Free_Region(regionP);
	}
	exit(2);
	 */
	//------------------------------------------------------------------------------------

	//if(xx!=NULL) delete[] xx;

	//save to display iteration info
	glob_param->fData=f;
	glob_param->fSmooth=fSmooth;

	f+=lambda*fSmooth;

}