void optimum_reparam(double *C1, double *C2, int n, int d, double w,
                     bool onlyDP, bool rotated, bool isclosed, int skipm, int autoselectC,
                     double *opt, bool swap, double *fopts, double *comtime)
{
    /* dimensions of input matrices */
    /* opt size is n + d*d +1 */
    /* fopts and comtime are 5 x 1*/
    integer n1, d1;
    n1 = static_cast<integer> (n);
    d1 = static_cast<integer> (d);
    bool swapi;

    std::string methodname = "";
    if (!onlyDP)
        methodname = "RBFGS";

    init_genrand(0);

    CheckMemoryDeleted = new std::map<integer *, integer>;

    integer numofmanis = 3;
    integer numofmani1 = 1;
    integer numofmani2 = 1;
    integer numofmani3 = 1;
    L2SphereVariable FNSV(n);
    OrthGroupVariable OGV(d);
    EucVariable EucV(1);
    ProductElement *Xopt = new ProductElement(numofmanis, &FNSV, numofmani1, &OGV, numofmani2, &EucV, numofmani3);

    integer ns, lms;

    DriverElasticCurvesRO(C1, C2, d1, n1, w, rotated, isclosed, onlyDP, skipm, methodname,
                          autoselectC, Xopt, swapi, fopts, comtime, ns, lms);

    swap = swapi;

    /* get output data */
    integer sizex = n1 + d1 * d1 + 1;
    const double *Xoptptr = Xopt->ObtainReadData();
    integer inc = 1;
    dcopy_(&sizex, const_cast<double *> (Xoptptr), &inc, opt, &inc);

    delete Xopt;

    std::map<integer *, integer>::iterator iter = CheckMemoryDeleted->begin();
    for (iter = CheckMemoryDeleted->begin(); iter != CheckMemoryDeleted->end(); iter++)
    {
        if (iter->second != 1)
            std::cout << "Global address:" << iter->first << ", sharedtimes:" << iter->second << std::endl;
    }
    delete CheckMemoryDeleted;
    return;
}
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
{
	if (nrhs < 9)
	{
		mexErrMsgTxt("The number of arguments should be nine.\n");
	}
	double *C1, *C2;
	double w = 0;
	C1 = mxGetPr(prhs[0]);
	C2 = mxGetPr(prhs[1]);
	/* dimensions of input matrices */
	integer d, n, rotated, isclosed, onlyDP, skipm, autoselectC;
	n = mxGetM(prhs[0]);
	d = mxGetN(prhs[0]);

	std::cout << "(n, d):" << n << "," << d << std::endl;

	if (mxGetM(prhs[1]) != n || mxGetN(prhs[1]) != d)
	{
		mexErrMsgTxt("The size of matrix C2 does not match the size of C1.\n");
	}
	w = mxGetScalar(prhs[2]);
	rotated = static_cast<integer> (mxGetScalar(prhs[3]));
	isclosed = static_cast<integer> (mxGetScalar(prhs[4]));
	onlyDP = static_cast<integer> (mxGetScalar(prhs[5]));
	skipm = static_cast<integer> (mxGetScalar(prhs[6]));
	char methodname[30] = "";
	mxGetString(prhs[7], methodname, 30);
	autoselectC = static_cast<integer> (mxGetScalar(prhs[8]));

	init_genrand(0);

	CheckMemoryDeleted = new std::map<integer *, integer>;

	integer numofmanis = 3;
	integer numofmani1 = 1;
	integer numofmani2 = 1;
	integer numofmani3 = 1;
	L2SphereVariable FNSV(n);
	OrthGroupVariable OGV(d);
	EucVariable EucV(1);
	ProductElement *Xopt = new ProductElement(numofmanis, &FNSV, numofmani1, &OGV, numofmani2, &EucV, numofmani3);

    bool swap;
	plhs[2] = mxCreateDoubleMatrix(5, 1, mxREAL);
	plhs[3] = mxCreateDoubleMatrix(5, 1, mxREAL);
	double *fopts = mxGetPr(plhs[2]), *comtime = mxGetPr(plhs[3]);
	integer ns, lms;

	DriverElasticCurvesRO(C1, C2, d, n, w, rotated != 0, isclosed != 0, onlyDP != 0, skipm, methodname,
		autoselectC, Xopt, swap, fopts, comtime, ns, lms);

	/*create output matrix*/
	integer sizex = n + d * d + 1;
	plhs[0] = mxCreateDoubleMatrix(sizex, 1, mxREAL);
	double *opt = mxGetPr(plhs[0]);
	plhs[1] = mxCreateDoubleScalar(static_cast<double> (swap));
	plhs[4] = mxCreateDoubleScalar(static_cast<double> (ns));
	plhs[5] = mxCreateDoubleScalar(static_cast<double> (lms));

	const double *Xoptptr = Xopt->ObtainReadData();
	integer inc = 1;
	dcopy_(&sizex, const_cast<double *> (Xoptptr), &inc, opt, &inc);

	delete Xopt;
	
	std::map<integer *, integer>::iterator iter = CheckMemoryDeleted->begin();
	for (iter = CheckMemoryDeleted->begin(); iter != CheckMemoryDeleted->end(); iter++)
	{
		if (iter->second != 1)
			std::cout << "Global address:" << iter->first << ", sharedtimes:" << iter->second << std::endl;
	}
	delete CheckMemoryDeleted;
	return;
}
/*
 Registers curve C2 to curve C1 over a set of candidate starting points ("breaks").

 For every break ms[i] the routine shifts C2, optionally rotates it, builds an initial
 reparameterization by dynamic programming and, unless onlyDP is set, refines it with the
 Riemannian solver named by solverstr. The candidate with the smallest cost (evaluated
 with the weight w set to 0) is written to Xopt.

 C1, C2      : curves stored as n x d arrays in column-major order (point index runs fastest).
               For closed curves the first and last point should coincide.
 d, n        : dimension of the ambient space and number of sample points.
 w           : weight passed to ElasticCurvesRO.
 rotated     : when true a rotation is estimated for every break.
 isclosed    : when true the curves are treated as closed and several breaks are tried.
 onlyDP      : when true only dynamic programming is used (no Riemannian solver).
 skipm       : step between breaks (onlyDP) or minimum skip handed to FindInitialBreaksAndNs.
 solverstr   : one of "RBFGS", "LRBFGS", "RCG", "RSD", "RTRSR1", "LRTRSR1", "RTRSD";
               only consulted when onlyDP is false.
 autoselectC : 0 keeps the order of C1 and C2; any other value swaps them when the total
               angle of C1 is smaller than that of C2.
 Xopt        : output with n + d*d + 1 entries: [0, n) the reparameterization,
               [n, n + d*d) the d x d rotation (column-major), and one final entry
               holding the shift ms[i] / (n - 1) (plus the solver's shift when used).
 swap        : output, true when C1 and C2 were exchanged.
 fopts       : output of length 5; smallest cost over all breaks, every 2nd, 4th, 8th and
               16th break respectively (initialized to 1000).
 comtime     : output of length 5; accumulated time for the same five subsets of breaks.
 Nsout       : output, the coarse resolution ns that was used.
 numinitialx : output, the number of breaks that were tried.
*/
void DriverElasticCurvesRO(double *C1, double *C2, integer d, integer n, double w, bool rotated, bool isclosed,
	bool onlyDP, integer skipm, std::string solverstr, integer autoselectC, ProductElement *Xopt, bool &swap, double *fopts, double *comtime, integer &Nsout, integer &numinitialx)
{ // The first and last point of C1 and C2 should be the same if they are viewed as closed curves, i.e., isclosed = true.
	double threshold = M_PI / 2;
	integer minSkip = skipm;
	integer randshift = 0;
	bool computeCD1 = false;

	Solvers *solver = nullptr;
	double TAC1 = ComputeTotalAngle(C1, d, n);
	double TAC2 = ComputeTotalAngle(C2, d, n);
	double *temppt, TACtemp;
	swap = false;

	// autoselectC: 0: keep the order; otherwise exchange the curves when TAC1 < TAC2,
	// so that afterwards TAC1 >= TAC2.
	if (autoselectC != 0)
	{
		if (TAC1 < TAC2)
		{
			temppt = C1;
			C1 = C2;
			C2 = temppt;
			TACtemp = TAC1;
			TAC1 = TAC2;
			TAC2 = TACtemp;
			swap = true;
		}
	}

	// find initial breaks and Ns
	integer *ms = new integer[n];
	integer lms = 0, ns = n;
	if (isclosed)
	{
		if (onlyDP)
		{
			// every skipm-th point (at least every point) is a candidate break
			skipm = (skipm < 1) ? 1 : skipm;
			for (integer i = 0; i < n - 1; i += skipm)
			{
				ms[lms] = i;
				lms++;
			}
		}
		else
		{
			if (autoselectC != 1)
			{
				if (TAC2 > TAC1)
				{
					FindInitialBreaksAndNs(C2, d, n, minSkip, threshold, randshift, ms, lms, ns);
				}
				else
				{
					FindInitialBreaksAndNs(C1, d, n, minSkip, threshold, randshift, ms, lms, ns);
					// breaks were found on C1; mirror them (all but the first) since the shift is applied to C2
					for (integer i = 1; i < lms; i++)
					{
						ms[i] = n - ms[i];
					}
				}
			}
			else
			{
				if (TAC2 < TAC1)
				{
					FindInitialBreaksAndNs(C2, d, n, minSkip, threshold, randshift, ms, lms, ns);
					// override ns: min(n / 3, 30) plus a term growing with the total angle of C1
					ns = static_cast<int> (static_cast<double> (n) / 3);
					ns = (ns > 30) ? 30 : ns;
					ns += static_cast<int> (TAC1 / M_PI * 2.0);
				}
				else
				{
					FindInitialBreaksAndNs(C1, d, n, minSkip, threshold, randshift, ms, lms, ns);
					for (integer i = 1; i < lms; i++)
					{
						ms[i] = n - ms[i];
					}
					ns = static_cast<int> (static_cast<double> (n) / 3);
					ns = (ns > 30) ? 30 : ns;
					ns += static_cast<int> (TAC2 / M_PI * 2.0);
				}
			}
		}
	}
	else
	{
		// open curves: a single break at the first point
		ms[0] = 0;
		lms = 1;
		if (!onlyDP)
		{
			ns = static_cast<int> (static_cast<double> (n) / 3);
			ns = (ns > 30) ? 30 : ns;
			ns += static_cast<int> (TAC2 / M_PI * 2.0);
		}
	}
	Nsout = ns;
	numinitialx = lms;

	// create manifold and initial iterate objects.
	integer numofmanis = 3;
	integer numofmani1 = 1;
	integer numofmani2 = 1;
	integer numofmani3 = 1;
	L2Sphere TNS(n);
	OrthGroup OG(d);
	Euclidean Euc(1);
	ProductManifold *Domain = nullptr;
	Domain = new ProductManifold(numofmanis, &TNS, numofmani1, &OG, numofmani2, &Euc, numofmani3);

	L2SphereVariable TNSV(n);
	OrthGroupVariable OGV(d);
	EucVariable EucV(1);
	ProductElement *InitialX = nullptr;
	InitialX = new ProductElement(numofmanis, &TNSV, numofmani1, &OGV, numofmani2, &EucV, numofmani3);
	double *Xptr = InitialX->ObtainWriteEntireData();

	// initialize rotation (identity) and shift (zero):
	Xptr[n + d * d] = 0;
	for (integer j = 0; j < d; j++)
	{
		Xptr[n + j + j * d] = 1;
		for (integer k = j + 1; k < d; k++)
		{
			Xptr[n + k + j * d] = 0;
			Xptr[n + j + k * d] = 0;
		}
	}

	// find initialX for each break and run the solver
	ElasticCurvesRO *ECRO = nullptr;
	// one workspace, carved into the sub-arrays below
	double *C2shift = new double[5 * d * n + d * d + n + lms + 2 * d * d];
	double *q2shift = C2shift + d * n;
	double *q1 = q2shift + d * n;
	double *O = q1 + d * n;
	double *Rotq2shift = O + d * d;
	double *RotC2shift = Rotq2shift + d * n;
	double *DPgam = RotC2shift + d * n;
	double *msV = DPgam + n;
	double *O2 = msV + lms;
	double *O3 = O2 + d * d; // d * d

	double *C1s = nullptr, *C2s = nullptr, *q1s = nullptr, *q2s = nullptr, *DPgams = nullptr;
	if (!onlyDP)
	{
		C1s = new double[4 * d * ns + ns];
		C2s = C1s + d * ns;
		q1s = C2s + d * ns;
		q2s = q1s + d * ns;
		DPgams = q2s + d * ns; // ns
	}
	double *C2_coefs = nullptr, *q2 = nullptr;
	if (onlyDP)
	{
		C2_coefs = new double[4 * d * (n - 1) + n * d];
		q2 = C2_coefs + 4 * d * (n - 1);
	}

	CurveToQ(C1, d, n, q1, isclosed);

	char *transn = const_cast<char *> ("n"), *transt = const_cast<char *> ("t");
	double one = 1, zero = 0;
	integer dd = d * d, inc = 1;
	unsigned long starttime = getTickCount();
	double minmsV = 10000;

	// O is only filled by FindBestRotation when rotated is true, but it is always used when
	// the best candidate is stored in Xopt. Start from the identity so that the stored
	// rotation is well defined when no rotation is estimated.
	for (integer j = 0; j < dd; j++)
	{
		O[j] = 0;
	}
	for (integer j = 0; j < d; j++)
	{
		O[j + j * d] = 1;
	}

	if (!onlyDP)
	{
		GetCurveSmall(C1, C1s, d, n, ns, isclosed);
		CurveToQ(C1s, d, ns, q1s, isclosed);
	}

	double *Xoptptr = Xopt->ObtainWriteEntireData();
	Xoptptr[n + d * d] = 0;

	// every subset starts with the sentinel cost 1000 and the time elapsed so far
	for (integer i = 0; i < 5; i++)
	{
		fopts[i] = 1000;
		comtime[i] = static_cast<double>(getTickCount() - starttime) / CLK_PS;
	}

	for (integer i = 0; i < lms; i++) //lms
	{
		starttime = getTickCount();
		// obtain initial reparameterization
		ShiftC(C2, d, n, C2shift, ms[i]);
		CurveToQ(C2shift, d, n, q2shift, isclosed);

		if (rotated)
		{
			FindBestRotation(q1, q2shift, d, n, O);
			// apply O^T from the right to the shifted q-function and curve
			dgemm_(transn, transt, &n, &d, &d, &one, q2shift, &n, O, &d, &zero, Rotq2shift, &n);
			dgemm_(transn, transt, &n, &d, &d, &one, C2shift, &n, O, &d, &zero, RotC2shift, &n);
		}
		else
		{
			integer nd = n * d;
			dcopy_(&nd, q2shift, &inc, Rotq2shift, &inc);
			dcopy_(&nd, C2shift, &inc, RotC2shift, &inc);
		}

		if (!onlyDP)
		{
			// dynamic programming on the coarse grid, then resample to n points
			GetCurveSmall(RotC2shift, C2s, d, n, ns, isclosed);
			CurveToQ(C2s, d, ns, q2s, isclosed);
			DynamicProgramming(q2s, q1s, d, ns, DPgams, isclosed);
			ReSampleGamma(DPgams, ns, DPgam, n);
		}
		else
		{
			DynamicProgramming(Rotq2shift, q1, d, n, DPgam, isclosed);

			if (rotated)
			{
				if (computeCD1)
				{
					if (isclosed)
						GradientPeriod(DPgam, n, 1.0 / (n - 1), Xptr);
					else
						Gradient(DPgam, n, 1.0 / (n - 1), Xptr);
					for (integer j = 0; j < n; j++)
					{
						Xptr[j] = sqrt(Xptr[j]);
					}
					ECRO = new ElasticCurvesRO(q1, Rotq2shift, d, n, w, rotated, isclosed);
					ECRO->SetDomain(Domain);
					//Rcpp::Rcout << "CD1 func:" << ECRO->f(InitialX) << std::endl;
					// release the temporary problem; a new one is created below
					delete ECRO;
					ECRO = nullptr;
				}

				// re-estimate the rotation after reparameterizing the rotated, shifted C2 by DPgam
				if (isclosed)
				{
					for (integer j = 0; j < d; j++)
					{
						Spline::SplineUniformPeriodic(RotC2shift + j * n, n, 1.0 / (n - 1), C2_coefs + j * 4 * (n - 1));
					}
				}
				else
				{
					for (integer j = 0; j < d; j++)
					{
						Spline::SplineUniformSlopes(RotC2shift + j * n, n, 1.0 / (n - 1), C2_coefs + j * 4 * (n - 1));
					}
				}
				for (integer j = 0; j < n; j++)
				{
					for (integer k = 0; k < d; k++)
					{
						RotC2shift[j + k * n] = Spline::ValSplineUniform(C2_coefs + k * 4 * (n - 1), n, 1.0 / (n - 1), DPgam[j]);
					}
				}
				CurveToQ(RotC2shift, d, n, q2, isclosed);
				FindBestRotation(q1, q2, d, n, O2);
				// O <- O * O2 accumulates both rotations
				dgemm_(transn, transn, &d, &d, &d, &one, O, &d, O2, &d, &zero, O3, &d);
				dcopy_(&dd, O3, &inc, O, &inc);
				dcopy_(&dd, O2, &inc, Xptr + n, &inc); // used to evaluate the cost function
			}
		}

		// the first n entries of InitialX hold the square root of the derivative of DPgam
		if (isclosed)
			GradientPeriod(DPgam, n, 1.0 / (n - 1), Xptr);
		else
			Gradient(DPgam, n, 1.0 / (n - 1), Xptr);

		for (integer j = 0; j < n; j++)
		{
			Xptr[j] = sqrt(Xptr[j]);
		}

		// Compute reparameterization for q1 and rotated and shifted q2;
		ECRO = new ElasticCurvesRO(q1, Rotq2shift, d, n, w, rotated, isclosed);
		ECRO->SetDomain(Domain);
		if (onlyDP)
		{
			// candidates are compared with the weight w switched off
			ECRO->w = 0;
			msV[i] = ECRO->f(InitialX);
		}
		if (!onlyDP)
		{
			if (solverstr == "RBFGS")
			{
				solver = new RBFGS(ECRO, InitialX);
				dynamic_cast<SolversLS *> (solver)->Initstepsize = 0.001;
			}
			else
			if (solverstr == "LRBFGS")
			{
				solver = new LRBFGS(ECRO, InitialX);
				dynamic_cast<SolversLS *> (solver)->Initstepsize = 0.001;
			}
			else
			if (solverstr == "RCG")
			{
				solver = new RCG(ECRO, InitialX);
				dynamic_cast<SolversLS *> (solver)->Initstepsize = 0.001;
			}
			else
			if (solverstr == "RSD")
			{
				solver = new RSD(ECRO, InitialX);
				dynamic_cast<SolversLS *> (solver)->Initstepsize = 0.001;
			}
			else
			if (solverstr == "RTRSR1")
			{
				solver = new RTRSR1(ECRO, InitialX);
				dynamic_cast<SolversTR *> (solver)->kappa = 0.1;
				dynamic_cast<SolversTR *> (solver)->theta = 1.0;
			}
			else
			if (solverstr == "LRTRSR1")
			{
				solver = new LRTRSR1(ECRO, InitialX);
				dynamic_cast<SolversTR *> (solver)->kappa = 0.1;
				dynamic_cast<SolversTR *> (solver)->theta = 1.0;
			}
			else
			if (solverstr == "RTRSD")
			{
				solver = new RTRSD(ECRO, InitialX);
			}
			else
			{
				// unknown solver name: release everything and return without a result
				Rcpp::Rcout << "This solver is not used in this problem!" << std::endl;
				delete ECRO;
				delete solver;
				delete[] C2shift;
				if (C2_coefs != nullptr)
				{
					delete[] C2_coefs;
				}
				if (C1s != nullptr)
				{
					delete[] C1s;
				}
				delete[] ms;
				delete Domain;
				delete InitialX;
				return;
			}

			solver->Max_Iteration = 500;
			solver->Min_Iteration = 10;
			solver->DEBUG = NOOUTPUT; //--FINALRESULT;//--NOOUTPUT; //ITERRESULT
			solver->Stop_Criterion = FUN_REL;
			solver->Tolerance = 1e-3;
			solver->Run();
			// candidates are compared with the weight w switched off
			ECRO->w = 0;
			msV[i] = ECRO->f(const_cast<Element *> (solver->GetXopt()));
		}
		delete ECRO;
		ECRO = nullptr;

		if (msV[i] < minmsV)
		{
			minmsV = msV[i];
			if (onlyDP)
			{
				for (integer j = 0; j < n; j++)
				{
					Xoptptr[j] = DPgam[j];
				}
				dcopy_(&dd, O, &inc, Xoptptr + n, &inc);
				Xoptptr[n + d * d] = static_cast<double> (ms[i]) / (n - 1);
			}
			else
			{
				solver->GetXopt()->CopyTo(Xopt);
				Xoptptr = Xopt->ObtainWritePartialData();
				// square the first n entries, then integrate them with the trapezoidal rule
				// (step 1 / (n - 1)) to obtain the reparameterization, starting at 0
				for (integer j = 0; j < n; j++)
				{
					Xoptptr[j] *= Xoptptr[j];
				}
				double tmp1 = Xoptptr[0], tmp2 = 0;
				Xoptptr[0] = 0;
				for (integer j = 1; j < n; j++)
				{
					tmp2 = Xoptptr[j];
					Xoptptr[j] = Xoptptr[j - 1] + (tmp1 + tmp2) / 2 / (n - 1);
					tmp1 = tmp2;
				}
				// combine the initial rotation O with the transpose of the solver's rotation
				dgemm_(transn, transt, &d, &d, &d, &one, O, &d, Xoptptr + n, &d, &zero, O2, &d);
				dcopy_(&dd, O2, &inc, Xoptptr + n, &inc);

				// add the shift of this break to the shift found by the solver
				Xoptptr[n + d * d] = Xoptptr[n + d * d] + static_cast<double> (ms[i]) / (n - 1);
			}
		}
		if (!onlyDP)
		{
			delete solver;
			solver = nullptr; // do not keep a dangling pointer across iterations
		}

		// statistics for using every break, every 2nd, 4th, 8th and 16th break
		comtime[0] += static_cast<double> (getTickCount() - starttime) / CLK_PS;
		if(msV[i] < fopts[0])
			fopts[0] = msV[i];
		if(i % 2 == 0)
		{
			comtime[1] += static_cast<double> (getTickCount() - starttime) / CLK_PS;
			if(msV[i] < fopts[1])
				fopts[1] = msV[i];
		}
		if(i % 4 == 0)
		{
			comtime[2] += static_cast<double> (getTickCount() - starttime) / CLK_PS;
			if(msV[i] < fopts[2])
				fopts[2] = msV[i];
		}
		if(i % 8 == 0)
		{
			comtime[3] += static_cast<double> (getTickCount() - starttime) / CLK_PS;
			if(msV[i] < fopts[3])
				fopts[3] = msV[i];
		}
		if(i % 16 == 0)
		{
			comtime[4] += static_cast<double> (getTickCount() - starttime) / CLK_PS;
			if(msV[i] < fopts[4])
				fopts[4] = msV[i];
		}
	}

	delete[] C2shift;
	if (C2_coefs != nullptr)
	{
		delete[] C2_coefs;
	}
	if (C1s != nullptr)
	{
		delete[] C1s;
	}
	delete[] ms;
	delete InitialX;
	delete Domain;
}