int main() {
    printf("\n");
    printf("\n");
    printf("\n");
    printf(
        " HPMPC -- Library for High-Performance implementation of solvers for "
        "MPC.\n");
    printf(
        " Copyright (C) 2014-2015 by Technical University of Denmark. All "
        "rights reserved.\n");
    printf("\n");
    printf(" HPMPC is distributed in the hope that it will be useful,\n");
    printf(" but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
    printf(" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n");
    printf(" See the GNU Lesser General Public License for more details.\n");
    printf("\n");
    printf("\n");
    printf("\n");

#if defined(TARGET_X64_INTEL_HASWELL) ||      \
    defined(TARGET_X64_INTEL_SABDY_BRIDGE) || \
    defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X86_AMD_BULLDOZER)
    _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);  // flush to zero subnormals !!!
                                                 // works only with one thread
                                                 // !!!
#endif

    int ii, jj;

    int rep, nrep = NREP;

    int nx = 8;   // number of states (it has to be even for the mass-spring
                  // system test problem)
    int nu = 3;   // number of inputs (controllers) (it has to be at least 1 and
                  // at most nx/2 for the mass-spring system test problem)
    int N = 15;   // horizon length
    int nb = 11;  // number of box constrained inputs and states
    int ng = 0;   // 4;  // number of general constraints
    int ngN = 4;  // 4;  // number of general constraints at the last stage

    //    int N2 = 3;   // horizon length of partially condensed problem

    int nbu = nu < nb ? nu : nb;
    int nbx = nb - nu > 0 ? nb - nu : 0;

    // stage-wise variant size
    int nxx[N + 1];
#if defined(ELIMINATE_X0)
    nxx[0] = 0;
#else
    nxx[0] = nx;
#endif
    for (ii = 1; ii <= N; ii++) nxx[ii] = nx;

    int nuu[N + 1];
    for (ii = 0; ii < N; ii++) nuu[ii] = nu;
    nuu[N] = 0;

    int nbb[N + 1];
#if defined(ELIMINATE_X0)
    nbb[0] = nbu;
#else
    nbb[0] = nb;
#endif
    for (ii = 1; ii < N; ii++) nbb[ii] = nb;
    nbb[N] = nbx;

    int ngg[N + 1];
    for (ii = 0; ii < N; ii++) ngg[ii] = ng;
    ngg[N] = ngN;

    printf(
        " Test problem: mass-spring system with %d masses and %d controls.\n",
        nx / 2, nu);
    printf("\n");
    printf(
        " MPC problem size: %d states, %d inputs, %d horizon length, %d "
        "two-sided box constraints, %d two-sided general constraints.\n",
        nx, nu, N, nb, ng);
    printf("\n");
    printf(
        " IP method parameters: predictor-corrector IP, double precision, %d "
        "maximum iterations, %5.1e exit tolerance in duality measure.\n",
        MAXITER, TOL);
    printf("\n");
#if defined(TARGET_X64_AVX2)
    printf(" HPMPC built for the AVX2 architecture\n");
#endif
#if defined(TARGET_X64_AVX)
    printf(" HPMPC built for the AVX architecture\n");
#endif
    printf("\n");

    /************************************************
     * dynamical system
     ************************************************/

    // state space matrices & initial state
    double *A;
    d_zeros(&A, nx, nx);  // states update matrix
    double *B;
    d_zeros(&B, nx, nu);  // inputs matrix
    double *b;
    d_zeros(&b, nx, 1);  // states offset
    double *x0;
    d_zeros(&x0, nx, 1);  // initial state

    // mass-spring system
    double Ts = 0.5;  // sampling time
    mass_spring_system(Ts, nx, nu, A, B, b, x0);

    for (jj = 0; jj < nx; jj++) b[jj] = 0.1;

    for (jj = 0; jj < nx; jj++) x0[jj] = 0;
    x0[0] = 2.5;
    x0[1] = 2.5;

//    d_print_mat(nx, nx, A, nx);
//    d_print_mat(nx, nu, B, nx);
//    d_print_mat(nx, 1, b, nx);
//    d_print_mat(nx, 1, x0, nx);

#if defined(ELIMINATE_X0)
    // compute b0 = b + A*x0
    double *b0;
    d_zeros(&b0, nx, 1);
    dcopy_3l(nx, b, 1, b0, 1);
    dgemv_n_3l(nx, nx, A, nx, x0, b0);
    //    d_print_mat(nx, 1, b, nx);
    //    d_print_mat(nx, 1, b0, nx);

    // then A0 is a matrix of size 0x0
    double *A0;
    d_zeros(&A0, 0, 0);
#endif

    /************************************************
     * box constraints
     ************************************************/

    int jj_end;

    int *idxb0;
    int_zeros(&idxb0, nbb[0], 1);
    double *lb0;
    d_zeros(&lb0, nbb[0], 1);
    double *ub0;
    d_zeros(&ub0, nbb[0], 1);
#if defined(ELIMINATE_X0)
    for (jj = 0; jj < nbb[0]; jj++) {
        lb0[jj] = -0.5;  // umin
        ub0[jj] = +0.5;  // umin
        idxb0[jj] = jj;
    }
#else
    jj_end = nbx < nbb[0] ? nbx : nbb[0];
    for (jj = 0; jj < jj_end; jj++) {
//        lb0[jj] = x0[jj - nbu];  // initial state
//        ub0[jj] = x0[jj - nbu];  // initial state
        lb0[jj] = x0[jj];  // initial state
        ub0[jj] = x0[jj];  // initial state
        idxb0[jj] = jj;
    }
    for (; jj < nbb[0]; jj++) {
        lb0[jj] = -0.5;  // umin
        ub0[jj] = +0.5;  // umax
        idxb0[jj] = jj;
    }
#endif
    //    int_print_mat(nbb[0], 1, idxb0, nbb[0]);
    //    d_print_mat(nbb[0], 1, lb0, nbb[0]);

    int *idxb1;
    int_zeros(&idxb1, nbb[1], 1);
    double *lb1;
    d_zeros(&lb1, nbb[1], 1);
    double *ub1;
    d_zeros(&ub1, nbb[1], 1);
    jj_end = nbx < nbb[1] ? nbx : nbb[1];
    for (jj = 0; jj < jj_end; jj++) {
        lb1[jj] = -4.0;  // xmin
        ub1[jj] = +4.0;  // xmax
        idxb1[jj] = jj;
    }
    for (; jj < nbb[1]; jj++) {
        lb1[jj] = -0.5;  // umin
        ub1[jj] = +0.5;  // umax
        idxb1[jj] = jj;
    }
    //    int_print_mat(nbb[1], 1, idxb1, nbb[1]);
    //    d_print_mat(nbb[1], 1, lb1, nbb[1]);

    int *idxbN;
    int_zeros(&idxbN, nbb[N], 1);
    double *lbN;
    d_zeros(&lbN, nbb[N], 1);
    double *ubN;
    d_zeros(&ubN, nbb[N], 1);
    jj_end = nbx < nbb[N] ? nbx : nbb[N];
    for (jj = 0; jj < jj_end; jj++) {
        lbN[jj] = -4.0;  // xmin
        ubN[jj] = +4.0;  // xmax
        idxbN[jj] = jj;
    }
    for (; jj < nbb[N]; jj++) {
        lbN[jj] = -0.5;  // umin
        ubN[jj] = +0.5;  // umax
        idxbN[jj] = jj;
    }
    //    int_print_mat(nbb[N], 1, idxbN, nbb[N]);
    //    d_print_mat(nbb[N], 1, lbN, nbb[N]);

    /************************************************
     * general constraints
     ************************************************/

    double *C;
    d_zeros(&C, ng, nx);
    double *D;
    d_zeros(&D, ng, nu);
    double *lg;
    d_zeros(&lg, ng, 1);
    double *ug;
    d_zeros(&ug, ng, 1);

    double *CN;
    d_zeros(&CN, ngN, nx);
    for (ii = 0; ii < ngN; ii++) CN[ii * (ngN + 1)] = 1.0;
    //    d_print_mat(ngN, nx, CN, ngN);
    double *lgN;
    d_zeros(&lgN, ngN, 1);  // force all states to 0 at the last stage
    double *ugN;
    d_zeros(&ugN, ngN, 1);  // force all states to 0 at the last stage

    /************************************************
     * cost function
     ************************************************/

    double *Q;
    d_zeros(&Q, nx, nx);
    for (ii = 0; ii < nx; ii++) Q[ii * (nx + 1)] = 1.0;

    double *R;
    d_zeros(&R, nu, nu);
    for (ii = 0; ii < nu; ii++) R[ii * (nu + 1)] = 2.0;

    double *S;
    d_zeros(&S, nu, nx);

    double *q;
    d_zeros(&q, nx, 1);
    for (ii = 0; ii < nx; ii++) q[ii] = 0.1;

    double *r;
    d_zeros(&r, nu, 1);
    for (ii = 0; ii < nu; ii++) r[ii] = 0.2;

#if defined(ELIMINATE_X0)
    // Q0 and q0 are matrices of size 0
    double *Q0;
    d_zeros(&Q0, 0, 0);
    double *q0;
    d_zeros(&q0, 0, 1);

    // compute r0 = r + S*x0
    double *r0;
    d_zeros(&r0, nu, 1);
    dcopy_3l(nu, r, 1, r0, 1);
    dgemv_n_3l(nu, nx, S, nu, x0, r0);

    // then S0 is a matrix of size nux0
    double *S0;
    d_zeros(&S0, nu, 0);
#endif

    /************************************************
     * problems data
     ************************************************/

    double *hA[N];
    double *hB[N];
    double *hb[N];
    double *hQ[N + 1];
    double *hS[N];
    double *hR[N];
    double *hq[N + 1];
    double *hr[N];
    double *hlb[N + 1];
    double *hub[N + 1];
    int *hidxb[N + 1];
    double *hC[N + 1];
    double *hD[N];
    double *hlg[N + 1];
    double *hug[N + 1];

#if defined(ELIMINATE_X0)
    hA[0] = A0;
    hb[0] = b0;
    hQ[0] = Q0;
    hS[0] = S0;
    hq[0] = q0;
    hr[0] = r0;
#else
    hA[0] = A;
    hb[0] = b;
    hQ[0] = Q;
    hS[0] = S;
    hq[0] = q;
    hr[0] = r;
#endif
    hB[0] = B;
    hR[0] = R;
    hlb[0] = lb0;
    hub[0] = ub0;
    hidxb[0] = idxb0;
    hC[0] = C;
    hD[0] = D;
    hlg[0] = lg;
    hug[0] = ug;
    for (ii = 1; ii < N; ii++) {
        hA[ii] = A;
        hB[ii] = B;
        hb[ii] = b;
        hQ[ii] = Q;
        hS[ii] = S;
        hR[ii] = R;
        hq[ii] = q;
        hr[ii] = r;
        hlb[ii] = lb1;
        hub[ii] = ub1;
        hidxb[ii] = idxb1;
        hC[ii] = C;
        hD[ii] = D;
        hlg[ii] = lg;
        hug[ii] = ug;
    }
    hQ[N] = Q;  // or maybe initialize to the solution of the DARE???
    hq[N] = q;  // or maybe initialize to the solution of the DARE???
    hlb[N] = lbN;
    hub[N] = ubN;
    hidxb[N] = idxbN;
    hC[N] = CN;
    hlg[N] = lgN;
    hug[N] = ugN;

    /************************************************
     * solution
     ************************************************/

    double *hx[N + 1];
    double *hu[N];
    double *hpi[N];
    double *hlam[N + 1];
    double *ht[N + 1];

    for (ii = 0; ii < N; ii++) {
        d_zeros(&hx[ii], nxx[ii], 1);
        d_zeros(&hu[ii], nuu[ii], 1);
        d_zeros(&hpi[ii], nxx[ii + 1], 1);
        d_zeros(&hlam[ii], 2 * nbb[ii] + 2 * ngg[ii], 1);
        d_zeros(&ht[ii], 2 * nbb[ii] + 2 * ngg[ii], 1);
    }
    d_zeros(&hx[N], nxx[N], 1);
    d_zeros(&hlam[N], 2 * nbb[N] + 2 * ngg[N], 1);
    d_zeros(&ht[N], 2 * nbb[N] + 2 * ngg[N], 1);

    /************************************************
     * create the in and out struct
     ************************************************/

    ocp_qp_in qp_in;
    qp_in.N = N;
    qp_in.nx = (const int *)nxx;
    qp_in.nu = (const int *)nuu;
    qp_in.nb = (const int *)nbb;
    qp_in.nc = (const int *)ngg;
    qp_in.A = (const double **)hA;
    qp_in.B = (const double **)hB;
    qp_in.b = (const double **)hb;
    qp_in.Q = (const double **)hQ;
    qp_in.S = (const double **)hS;
    qp_in.R = (const double **)hR;
    qp_in.q = (const double **)hq;
    qp_in.r = (const double **)hr;
    qp_in.idxb = (const int **)hidxb;
    qp_in.lb = (const double **)hlb;
    qp_in.ub = (const double **)hub;
    qp_in.Cx = (const double **)hC;
    qp_in.Cu = (const double **)hD;
    qp_in.lc = (const double **)hlg;
    qp_in.uc = (const double **)hug;

    ocp_qp_out qp_out;
    qp_out.x = hx;
    qp_out.u = hu;
    qp_out.pi = hpi;
    qp_out.lam = hlam;
    qp_out.t = ht;  // XXX why also the slack variables ???

    /************************************************
     * solver arguments (fully sparse)
     ************************************************/

    // solver arguments
    ocp_qp_condensing_hpipm_args *hpipm_args = ocp_qp_condensing_hpipm_create_arguments(&qp_in);
//    hpipm_args->mu_max = TOL;
//    hpipm_args->iter_max = MAXITER;
//    hpipm_args->alpha_min = MINSTEP;
    hpipm_args->mu0 = 1.0;  // 0.0

    /************************************************
     * work space (fully sparse)
     ************************************************/

    int work_space_size =
        ocp_qp_condensing_hpipm_calculate_workspace_size(&qp_in, hpipm_args);
    printf("\nwork space size: %d bytes\n", work_space_size);
    void *workspace = malloc(work_space_size);

    //    void *mem;
    //    ocp_qp_hpipm_create_memory(&qp_in, hpipm_args, &mem);
    int memory_size =
        ocp_qp_condensing_hpipm_calculate_memory_size(&qp_in, hpipm_args);
    printf("\nmemory: %d bytes\n", memory_size);
    void *memory = malloc(memory_size);

    ocp_qp_condensing_hpipm_memory *hpipm_memory =
        ocp_qp_condensing_hpipm_create_memory(&qp_in, hpipm_args);

    /************************************************
     * call the solver (fully sparse)
     ************************************************/

    int return_value;

    acados_timer timer;
    acados_tic(&timer);

    //  nrep = 1;
    for (rep = 0; rep < nrep; rep++) {
        // call the QP OCP solver
        //        return_value = ocp_qp_hpipm(&qp_in, &qp_out, hpipm_args,
        //        workspace);
        return_value =
            ocp_qp_condensing_hpipm(&qp_in, &qp_out, hpipm_args, hpipm_memory, workspace);
    }

    real_t time = acados_toc(&timer)/nrep;

    if (return_value == ACADOS_SUCCESS)
        printf("\nACADOS status: solution found in %d iterations\n",
               hpipm_memory->iter);

    if (return_value == ACADOS_MAXITER)
        printf("\nACADOS status: maximum number of iterations reached\n");

    if (return_value == ACADOS_MINSTEP)
        printf("\nACADOS status: below minimum step size length\n");

    printf("\nu = \n");
    for (ii = 0; ii < N; ii++) d_print_mat(1, nuu[ii], hu[ii], 1);

    printf("\nx = \n");
    for (ii = 0; ii <= N; ii++) d_print_mat(1, nxx[ii], hx[ii], 1);

    printf("\npi = \n");
    for (ii = 0; ii < N; ii++) d_print_mat(1, nxx[ii+1], hpi[ii], 1);

    printf("\nlam = \n");
    for (ii = 0; ii <= N; ii++) d_print_mat(1, 2*nbb[ii]+2*ngg[ii], hlam[ii], 1);

    printf("\n");
    printf(" inf norm res: %e, %e, %e, %e, %e\n", hpipm_memory->inf_norm_res[0],
           hpipm_memory->inf_norm_res[1], hpipm_memory->inf_norm_res[2],
           hpipm_memory->inf_norm_res[3], hpipm_memory->inf_norm_res[4]);
    printf("\n");
    printf(
        " Solution time for %d IPM iterations, averaged over %d runs: %5.2e "
        "seconds\n", hpipm_memory->iter, nrep, time);
    printf("\n\n");

    /************************************************
     * free memory
     ************************************************/

    d_free(A);
    d_free(B);
    d_free(b);
    d_free(x0);
    d_free(Q);
    d_free(S);
    d_free(R);
    d_free(q);
    d_free(r);
#if defined(ELIMINATE_X0)
    d_free(A0);
    d_free(b0);
    d_free(Q0);
    d_free(S0);
    d_free(q0);
    d_free(r0);
#endif
    int_free(idxb0);
    d_free(lb0);
    d_free(ub0);
    int_free(idxb1);
    d_free(lb1);
    d_free(ub1);
    int_free(idxbN);
    d_free(lbN);
    d_free(ubN);
    d_free(C);
    d_free(D);
    d_free(lg);
    d_free(ug);
    d_free(CN);
    d_free(lgN);
    d_free(ugN);

    for (ii = 0; ii < N; ii++) {
        d_free(hx[ii]);
        d_free(hu[ii]);
        d_free(hpi[ii]);
        d_free(hlam[ii]);
        d_free(ht[ii]);
    }
    d_free(hx[N]);
    d_free(hlam[N]);
    d_free(ht[N]);

    free(workspace);
    free(memory);

    return 0;
}
// Example #2
// 0
/*
 * Primal-dual interior-point method, hard constraints, time variant matrices,
 * time variant size (mpc version).
 *
 * Each iteration of the main loop performs two solves (an affine step used to
 * pick the centering parameter sigma, then the actual step) and updates the
 * iterate; the loop runs while *kk < k_max, mu > mu_tol and alpha >= alpha_min.
 *
 * Parameters (per-stage arrays are indexed 0..N unless noted):
 *   kk           out: number of iterations performed (reset to 0 on entry)
 *   k_max        maximum number of iterations
 *   mu0          initial value of the duality measure mu; also forwarded to
 *                d_init_var_hard_mpc_tv
 *   mu_tol       the loop stops once mu <= mu_tol
 *   alpha_min    the loop stops once the step size alpha drops below this
 *   warm_start   forwarded to d_init_var_hard_mpc_tv
 *   sigma_par    three values are read: [0] is the sigma recorded for the
 *                first affine step; [1] and [2] are read but currently unused
 *   stat         out: 5 doubles per iteration k, at stat[5*k + ...]:
 *                  +0 sigma, +1 affine step size, +2 affine mu,
 *                  +3 final step size, +4 mu after the update
 *                (step sizes are stored before the 0.995 scaling); must hold
 *                at least 5*k_max entries
 *   N            horizon length
 *   nx, nu       per-stage number of states / inputs
 *   nb, idxb     per-stage number of box constraints and the indices of the
 *                constrained variables
 *   ng           per-stage number of general constraints
 *   pBAbt, pQ, pDCt, d
 *                problem data in packed format; the diagonal and last-row
 *                entries of pQ selected by idxb are backed up on entry and
 *                written back before returning
 *   ux, pi, lam, t
 *                primal variables, equality multipliers, inequality
 *                multipliers and slacks, passed to the init/update routines
 *   compute_mult forwarded to the Riccati routines
 *   double_work_memory
 *                work space for doubles (supposed to be aligned to cache line
 *                boundaries)
 *   int_work_memory
 *                work space for ints; 7*(N+1) entries are used
 *
 * Returns 0 if mu <= mu_tol, 1 if the iteration limit was reached, 2 if the
 * step size fell below alpha_min, -1 otherwise (checked in that order).
 */
int d_ip2_hard_mpc_tv(int *kk, int k_max, double mu0, double mu_tol, double alpha_min, int warm_start, double *sigma_par, double *stat, int N, int *nx, int *nu, int *nb, int **idxb, int *ng, double **pBAbt, double **pQ, double **pDCt, double **d, double **ux, int compute_mult, double **pi, double **lam, double **t, double *double_work_memory, int *int_work_memory)
	{
	
	// indices
	int jj, ll, ii, bs0;

	// constants
	const int bs = D_MR;
	const int ncl = D_NCL;
	const int nal = bs*ncl; // number of doubles per cache line



	// matrices size
	// work_space_int_size_per_stage = 7
	int idx;
	int nxM = 0; // max over stages of nx
	int nzM = 0; // max over stages of nu+nx+1
	int ngM = 0; // max over stages of ng
	// seven int arrays of N+1 entries each, carved out of int_work_memory
	int *ptr_int, *anx, *anz, *pnz, *pnb, *png, *cnx, *cnz;
	ptr_int = int_work_memory; // no alignment requirements
	anx = ptr_int; ptr_int += N+1;
	anz = ptr_int; ptr_int += N+1;
	pnz = ptr_int; ptr_int += N+1;
	pnb = ptr_int; ptr_int += N+1;
	png = ptr_int; ptr_int += N+1;
	cnx = ptr_int; ptr_int += N+1;
	cnz = ptr_int; ptr_int += N+1;

	// per-stage sizes rounded up to multiples of nal (a*), bs (p*) and ncl (c*);
	// "nz" stands for nu+nx+1
	for(jj=0; jj<=N; jj++)
		{
		anx[jj] = (nx[jj]+nal-1)/nal*nal;
		anz[jj] = (nu[jj]+nx[jj]+1+nal-1)/nal*nal;
		pnz[jj] = (nu[jj]+nx[jj]+1+bs-1)/bs*bs;
		pnb[jj] = (nb[jj]+bs-1)/bs*bs;
		png[jj] = (ng[jj]+bs-1)/bs*bs;
		cnx[jj] = (nx[jj]+ncl-1)/ncl*ncl;
		cnz[jj] = (nu[jj]+nx[jj]+1+ncl-1)/ncl*ncl;
		if(nx[jj]>nxM) nxM = nx[jj];
		if(nu[jj]+nx[jj]+1>nzM) nzM = nu[jj]+nx[jj]+1;
		if(ng[jj]>ngM) ngM = ng[jj];
		}



	// initialize work space
	// work_space_double_size_per_stage = pnz*cnl + 2*anz + 2*anx + 14*pnb + 10*png
	// work_space_double_size_const_max = pnz*cnxg + pnz
	// (the buffers below are carved sequentially out of double_work_memory, so
	// the order of the following loops defines the memory layout)
	double *ptr;
	ptr = double_work_memory; // supposed to be aligned to cache line boundaries

	double *(pL[N+1]);
	double *(l[N+1]);
	double *work;
	double *(q[N+1]);
	double *(dux[N+1]);
	double *(dpi[N+1]);
	double *(pd[N+1]); // pointer to diagonal of Hessian
	double *(pl[N+1]); // pointer to linear part of Hessian
	double *(bd[N+1]); // backup diagonal of Hessian
	double *(bl[N+1]); // backup linear part of Hessian
	double *diag;
	double *(dlam[N+1]);
	double *(dt[N+1]);
	double *(lamt[N+1]);
	double *(t_inv[N+1]);
	double *(Qx[N+1]);
	double *(qx[N+1]);
	double *(qx2[N+1]);
	double *(Pb[N]);

	// work space
	for(jj=0; jj<=N; jj++)
		{
		pL[jj] = ptr;
		ptr += pnz[jj] * ( cnx[jj]+ncl>cnz[jj] ? cnx[jj]+ncl : cnz[jj] ); // pnz*cnl
		}

	for(jj=0; jj<=N; jj++)
		{
		l[jj] = ptr;
		ptr += anz[jj];
		}

	work = ptr;
	ptr += ((nzM+bs-1)/bs*bs) * ((nxM+ngM+ncl-1)/ncl*ncl); // pnzM*cnxgM

	
	// inputs and states
	for(jj=0; jj<=N; jj++)
		{
		dux[jj] = ptr;
		ptr += anz[jj];
		}

	// equality constr multipliers
	for(jj=0; jj<=N; jj++)
		{
		dpi[jj] = ptr;
		ptr += anx[jj];
		}
	
	// backup of P*b
	for(jj=0; jj<N; jj++)
		{
		Pb[jj] = ptr;
		ptr += anx[jj+1];
		}

	// linear part of cost function
	for(jj=0; jj<=N; jj++)
		{
		q[jj] = ptr;
		ptr += anz[jj];
		// copy row nu+nx (the last one) of the packed matrix pQ[jj] into q[jj]
		for(ll=0; ll<nu[jj]+nx[jj]; ll++) q[jj][ll] = pQ[jj][(nu[jj]+nx[jj])/bs*bs*cnz[jj]+(nu[jj]+nx[jj])%bs+ll*bs];
		}

	// Hessian backup
	for(jj=0; jj<=N; jj++)
		{
		pd[jj] = ptr;
		pl[jj] = ptr + pnb[jj];
		bd[jj] = ptr + 2*pnb[jj];
		bl[jj] = ptr + 3*pnb[jj];
		ptr += 4*pnb[jj];
		// backup
		for(ll=0; ll<nb[jj]; ll++)
			{
			idx = idxb[jj][ll];
			// diagonal element (idx,idx) of the packed matrix pQ[jj]
			bd[jj][ll] = pQ[jj][idx/bs*bs*cnz[jj]+idx%bs+idx*bs];
			bl[jj][ll] = q[jj][idx];
			}
		}

	diag = ptr;
	ptr += (nzM+bs-1)/bs*bs; // pnzM

	// slack variables, Lagrangian multipliers for inequality constraints and work space
	for(jj=0; jj<=N; jj++)
		{
		dlam[jj] = ptr;
		dt[jj]   = ptr + 2*pnb[jj]+2*png[jj];
		ptr += 4*pnb[jj]+4*png[jj];
		}

	for(jj=0; jj<=N; jj++)
		{
		lamt[jj] = ptr;
		ptr += 2*pnb[jj]+2*png[jj];
		}

	for(jj=0; jj<=N; jj++)
		{
		t_inv[jj] = ptr;
		ptr += 2*pnb[jj]+2*png[jj];
		}

	for(jj=0; jj<=N; jj++)
		{
		Qx[jj] = ptr;
		qx[jj] = ptr+pnb[jj]+png[jj];
		qx2[jj] = ptr+2*pnb[jj]+2*png[jj];
		ptr += 3*pnb[jj]+3*png[jj];
		}



	double temp0, temp1;
	double alpha, mu, mu_aff;
	// mu_scal = 1 / (total number of one-sided inequality constraints)
	// NOTE(review): this divides by zero when every nb[jj] and ng[jj] is 0
	double mu_scal = 0.0; 
	for(jj=0; jj<=N; jj++) mu_scal += 2*nb[jj] + 2*ng[jj];
	mu_scal = 1.0 / mu_scal;
	double sigma, sigma_decay, sigma_min;

	// sigma_decay and sigma_min are only used by code that is commented out below
	sigma = sigma_par[0]; //0.4;
	sigma_decay = sigma_par[1]; //0.3;
	sigma_min = sigma_par[2]; //0.01;
	


	// initialize ux & t>0 (slack variable)
	d_init_var_hard_mpc_tv(N, nx, nu, nb, idxb, ng, ux, pi, pDCt, d, t, lam, mu0, warm_start);



	// initialize pi
	for(jj=0; jj<=N; jj++)
		for(ll=0; ll<nx[jj]; ll++)
			dpi[jj][ll] = 0.0;



	// initialize dux (state part of stage 0 copied from ux)
	for(ll=0; ll<nx[0]; ll++)
		dux[0][nu[0]+ll] = ux[0][nu[0]+ll];




	// compute the duality gap
	// (the explicit computation is disabled; mu simply starts from mu0)
	//alpha = 0.0; // needed to compute mu !!!!!
	//d_compute_mu_hard_mpc(N, nx, nu, nb, &mu, mu_scal, alpha, lam, dlam, t, dt);
	mu = mu0;

	// set to zero iteration count
	*kk = 0;	

	// larger than minimum accepted step size
	alpha = 1.0;

	// update hessian in Riccati routine
	// (unused: the call to d_ric_sv_mpc_tv below passes the literal 1)
	const int update_hessian = 1;

	int fast_rsqrt = 0;



	// IP loop		
	while( *kk<k_max && mu>mu_tol && alpha>=alpha_min )
		{
						


		//update cost function matrices and vectors (box constraints)
		d_update_hessian_hard_mpc_tv(N, nx, nu, nb, ng, 0.0, t, t_inv, lam, lamt, dlam, Qx, qx, qx2, bd, bl, pd, pl, d);

#if 0
for(ii=0; ii<=N; ii++)
	d_print_mat(1, nb[ii], pd[ii], 1);
for(ii=0; ii<=N; ii++)
	d_print_mat(1, nb[ii], pl[ii], 1);
for(ii=0; ii<=N; ii++)
	d_print_mat(1, ng[ii], Qx[ii], 1);
for(ii=0; ii<=N; ii++)
	d_print_mat(1, ng[ii], qx[ii], 1);
for(ii=0; ii<=N; ii++)
	d_print_mat(1, ng[ii], qx2[ii], 1);
if(*kk==1)
exit(1);
#endif


		// compute the search direction: factorize and solve the KKT system
		// fast_rsqrt is chosen from the current mu (2 above 1e-2, 1 above 1e-4,
		// else 0) when FAST_RSQRT is defined, and is always 0 otherwise
#if defined(FAST_RSQRT)
		if(mu>1e-2)
			fast_rsqrt = 2;
		else
			{
			if(mu>1e-4)
				fast_rsqrt = 1;
			else
				fast_rsqrt = 0;
			}
#else
		fast_rsqrt = 0;
#endif
		//printf("\n%d %f\n", fast_rsqrt, mu);
		d_ric_sv_mpc_tv(N, nx, nu, pBAbt, pQ, dux, pL, work, diag, 1, Pb, compute_mult, dpi, nb, idxb, pd, pl, ng, pDCt, Qx, qx2, fast_rsqrt);

#if 0
for(ii=0; ii<=N; ii++)
	d_print_pmat(nu[ii]+nx[ii]+1, nu[ii]+nx[ii]+1, bs, pQ[ii], cnz[ii]);
//exit(1);
#endif
#if 0
for(ii=0; ii<=N; ii++)
	d_print_pmat(pnz[ii], cnz[ii], bs, pL[ii], cnz[ii]);
//exit(1);
#endif
#if 0
printf("\ndux\n");
for(ii=0; ii<=N; ii++)
	d_print_mat(1, nu[ii]+nx[ii], dux[ii], 1);
if(*kk==1)
exit(1);
#endif


#if 1

		// compute t_aff & dlam_aff & dt_aff & alpha
		alpha = 1.0;
		d_compute_alpha_hard_mpc_tv(N, nx, nu, nb, idxb, ng, &alpha, t, dt, lam, dlam, lamt, dux, pDCt, d);

		

		// stat[5*k] is overwritten further down with the sigma computed in
		// this iteration
		stat[5*(*kk)] = sigma;
		stat[5*(*kk)+1] = alpha;
			
		// shorten the step slightly
		alpha *= 0.995;
		//printf("\nalpha = %f\n", alpha);



		// compute the affine duality gap
		d_compute_mu_hard_mpc_tv(N, nx, nu, nb, ng, &mu_aff, mu_scal, alpha, lam, dlam, t, dt);

		stat[5*(*kk)+2] = mu_aff;
		//printf("\nmu = %f\n", mu_aff);



		// compute sigma = (mu_aff/mu)^3
		sigma = mu_aff/mu;
		sigma = sigma*sigma*sigma;
		// (lower clamp on sigma currently disabled)
//		if(sigma<sigma_min)
//			sigma = sigma_min;
//printf("\n%f %f %f %f\n", mu_aff, mu, sigma, mu_scal);
//exit(1);

#if 0
for(ii=0; ii<=N; ii++)
	d_print_mat(1, 2*pnb[ii], dt[ii], 1);
for(ii=0; ii<=N; ii++)
	d_print_mat(1, 2*pnb[ii], dlam[ii], 1);
for(ii=0; ii<=N; ii++)
	d_print_mat(1, 2*pnb[ii], t_inv[ii], 1);
for(ii=0; ii<=N; ii++)
	d_print_mat(1, nb[ii], pl[ii], 1);
//exit(1);
#endif


		d_update_gradient_hard_mpc_tv(N, nx, nu, nb, ng, sigma*mu, dt, dlam, t_inv, pl, qx);

#if 0
for(ii=0; ii<=N; ii++)
	d_print_mat(1, nb[ii], pl[ii], 1);
for(ii=0; ii<=N; ii++)
	d_print_mat(1, ng[ii], qx[ii], 1);
if(*kk==1)
exit(1);
#endif


		// copy b into x
		// (row nu+nx of the packed matrix pBAbt[ii] goes into the state part
		// of dux[ii+1])
		for(ii=0; ii<N; ii++)
			for(jj=0; jj<nx[ii+1]; jj++) 
				dux[ii+1][nu[ii+1]+jj] = pBAbt[ii][(nu[ii]+nx[ii])/bs*bs*cnx[ii+1]+(nu[ii]+nx[ii])%bs+bs*jj]; // copy b



		// solve the system
		d_ric_trs_mpc_tv(N, nx, nu, pBAbt, pL, q, l, dux, work, 0, Pb, compute_mult, dpi, nb, idxb, pl, ng, pDCt, qx);

#if 0
printf("\ndux\n");
for(ii=0; ii<=N; ii++)
	d_print_mat(1, nu[ii]+nx[ii], dux[ii], 1);
if(*kk==1)
exit(1);
#endif



#endif


		// compute t & dlam & dt & alpha
		alpha = 1.0;
		d_compute_alpha_hard_mpc_tv(N, nx, nu, nb, idxb, ng, &alpha, t, dt, lam, dlam, lamt, dux, pDCt, d);

		stat[5*(*kk)] = sigma;
		stat[5*(*kk)+3] = alpha;
			
		// shorten the step slightly
		alpha *= 0.995;



		// update x, u, lam, t & compute the duality gap mu

		d_update_var_hard_mpc_tv(N, nx, nu, nb, ng, &mu, mu_scal, alpha, ux, dux, t, dt, lam, dlam, pi, dpi);

		stat[5*(*kk)+4] = mu;
		
		// update sigma (heuristic decay currently disabled)
/*		sigma *= sigma_decay;*/
/*		if(sigma<sigma_min)*/
/*			sigma = sigma_min;*/
/*		if(alpha<0.3)*/
/*			sigma = sigma_par[0];*/



		// increment loop index
		(*kk)++;


		} // end of IP loop
	
	// restore Hessian
	// (write the backed-up diagonal and last-row entries back into pQ)
	for(jj=0; jj<=N; jj++)
		{
		for(ll=0; ll<nb[jj]; ll++)
			{
			idx = idxb[jj][ll];
			pQ[jj][idx/bs*bs*cnz[jj]+idx%bs+idx*bs] = bd[jj][ll];
			pQ[jj][(nu[jj]+nx[jj])/bs*bs*cnz[jj]+(nu[jj]+nx[jj])%bs+idx*bs] = bl[jj][ll];
			}
		}



	// successful exit
	if(mu<=mu_tol)
		return 0;
	
	// max number of iterations reached
	if(*kk>=k_max)
		return 1;
	
	// no improvement
	if(alpha<alpha_min)
		return 2;
	
	// impossible
	// (reached if none of the three tests above holds)
	return -1;

	} // end of ipsolver
// Example #3
// 0
/* primal-dual interior-point method, hard constraints, time variant matrices (mpc version) ; version with A diagonal and nu & nx time-variant*/
int d_ip2_diag_mpc(int *kk, int k_max, double mu0, double mu_tol, double alpha_min, int warm_start, double *sigma_par, double *stat, int N, int *nx, int *nu, int *nb, int **idxb, double **dA, double **pBt, double **pR, double **pSt, double **pQ, double **b, double **d, double **rq, double **ux, int compute_mult, double **pi, double **lam, double **t, double *work_memory)
	{
	
	// indeces
	int jj, ll, ii, bs0, idx;

	// constants
	const int bs = D_MR; //d_get_mr();
	const int ncl = D_NCL;
	const int nal = D_MR*D_NCL; // number of doubles per cache line

//	const int nz   = nx+nu+1;
//	const int nxu  = nx+nu;
//	const int pnz  = bs*((nz+bs-1)/bs);
//	const int pnx  = bs*((nx+bs-1)/bs);
//	const int pnb  = bs*((nb+bs-1)/bs); // simd aligned number of two-sided box constraints !!!!!!!!!!!!!!!!!!
//	const int cnz  = ncl*((nz+ncl-1)/ncl);
//	const int cnx  = ncl*((nx+ncl-1)/ncl);
//	const int anz  = nal*((nz+nal-1)/nal);
//	const int anx  = nal*((nx+nal-1)/nal);
//	const int anb = nal*((2*nb+nal-1)/nal); // cache aligned number of box constraints
	//const int anb = nal*((nb+nal-1)/nal); // cache aligned number of two-sided box constraints !!!!!!!!!!!!!!!!!!

//	const int pad = (ncl-nx%ncl)%ncl; // packing between BAbtL & P
	//const int cnl = cnz<cnx+ncl ? nx+pad+cnx+ncl : nx+pad+cnz;
//	const int cnl = cnz<cnx+ncl ? cnx+ncl : cnz;

	//printf("\n%d %d %d %d %d\n", N, nx, nu, nb, ng);
	//d_print_pmat(nz, nx, bs, pBAbt[0], cnx);
	//d_print_pmat(nz, nx, bs, pBAbt[1], cnx);
	//d_print_pmat(nz, nx, bs, pBAbt[N-1], cnx);
	//d_print_pmat(nz, nz, bs, pQ[0], cnz);
	//d_print_pmat(nz, nz, bs, pQ[1], cnz);
	//d_print_pmat(nz, nz, bs, pQ[N], cnz);
	//d_print_pmat(nx+nu, ng, bs, pDCt[0], cng);
	//d_print_pmat(nx+nu, ng, bs, pDCt[1], cng);
	//d_print_pmat(nx+nu, ng, bs, pDCt[N], cng);
	//d_print_mat(1, 2*pnb+2*png, d[0], 1);
	//d_print_mat(1, 2*pnb+2*png, d[1], 1);
	//d_print_mat(1, 2*pnb+2*png, d[N], 1);
	//d_print_mat(1, nx+nu, ux[0], 1);
	//d_print_mat(1, nx+nu, ux[1], 1);
	//d_print_mat(1, nx+nu, ux[N], 1);
	//exit(1);

	double *ptr;
	ptr = work_memory;

	int *ptr_int, *anu, *anx, *pnu, *pnx, *pnb, *cnu, *cnx;
	ptr_int = (int *) ptr;
	anu = ptr_int; ptr_int += (N+1);
	anx = ptr_int; ptr_int += (N+1);
	pnu = ptr_int; ptr_int += (N+1);
	pnx = ptr_int; ptr_int += (N+1);
	pnb = ptr_int; ptr_int += (N+1);
	cnu = ptr_int; ptr_int += (N+1);
	cnx = ptr_int; ptr_int += (N+1);

	for(jj=0; jj<=N; jj++)
		{
		anu[jj] = (nu[jj]+nal-1)/nal*nal;
		anx[jj] = (nx[jj]+nal-1)/nal*nal;
		pnu[jj] = (nu[jj]+bs-1)/bs*bs;
		pnx[jj] = (nx[jj]+bs-1)/bs*bs;
		pnb[jj] = (nb[jj]+bs-1)/bs*bs;
		cnu[jj] = (nu[jj]+ncl-1)/ncl*ncl;
		cnx[jj] = (nx[jj]+ncl-1)/ncl*ncl;
		}
	
	int pnxM = 0; for(jj=0; jj<=N; jj++) pnxM = pnx[jj]>pnxM ? pnx[jj] : pnxM;
	int pnuM = 0; for(jj=0; jj<=N; jj++) pnuM = pnu[jj]>pnuM ? pnu[jj] : pnuM;
	int cnuM = 0; for(jj=0; jj<=N; jj++) cnuM = cnu[jj]>cnuM ? cnu[jj] : cnuM;
	


	/* align work space */
	size_t align = 64;
	size_t addr = (size_t) ptr_int;
	size_t offset = addr % align;
	ptr_int = ptr_int + offset / sizeof(int);
	ptr = (double *) ptr_int;




	// initialize work space
	double *(pL[N]);
	double *pK;
	double *(pP[N+1]);
	double *(dux[N+1]);
	double *(dpi[N+1]);
	double *(Pb[N]);
	double *(pd[N+1]);
	double *(pl[N+1]);
	double *(bd[N+1]);
	double *(bl[N+1]);
	double *(dlam[N+1]);
	double *(dt[N+1]);
	double *(lamt[N+1]);
	double *(t_inv[N+1]);
	double *work;

//	ptr += (N+1)*(pnx + pnz*cnl + 12*pnz) + 3*pnz;

	// hpL
	for(jj=0; jj<N; jj++)
		{
		pL[jj] = ptr;
		ptr += (pnu[jj]+pnx[jj])*cnu[jj];
		}
	
	// pK
	pK = ptr;
	ptr += pnxM*cnuM;

	// hpP
	for(jj=0; jj<=N; jj++)
		{
		pP[jj] = ptr;
		ptr += pnx[jj]*cnx[jj];
		}

	// inputs and states
	for(jj=0; jj<=N; jj++)
		{
		dux[jj] = ptr;
		ptr += anu[jj]+anx[jj];
		}

	// equality constr multipliers
	for(jj=0; jj<=N; jj++)
		{
		dpi[jj] = ptr;
		ptr += anx[jj];
		}
	
	// backup of P*b
	for(jj=0; jj<N; jj++)
		{
		Pb[jj] = ptr;
		ptr += anx[jj+1];
		}

	// Hessian
	for(jj=0; jj<=N; jj++)
		{
		pd[jj] = ptr; //pQ[jj];
		pl[jj] = ptr + 1*(pnb[jj]);
		bd[jj] = ptr + 2*(pnb[jj]);
		bl[jj] = ptr + 3*(pnb[jj]);
		ptr += 4*(pnb[jj]);
		// backup
		//for(ll=0; ll<nu[jj]; ll++)
		//	bd[jj][ll] = pR[jj][(ll/bs)*bs*cnu[jj]+ll%bs+ll*bs];
		//for(ll=0; ll<nx[jj]; ll++)
		//	bd[jj][nu[jj]+ll] = pQ[jj][(ll/bs)*bs*cnx[jj]+ll%bs+ll*bs];
		for(ll=0; ll<nb[jj] && idxb[jj][ll]<nu[jj]; ll++)
			{
			idx = idxb[jj][ll];
			bd[jj][ll] = pR[jj][idx/bs*bs*cnu[jj]+idx%bs+idx*bs];
			bl[jj][ll] = rq[jj][idx];
			}
		for(; ll<nb[jj]; ll++)
			{
			idx = idxb[jj][ll] - nu[jj];
			bd[jj][ll] = pQ[jj][idx/bs*bs*cnx[jj]+idx%bs+idx*bs];
			bl[jj][ll] = rq[jj][idx];
			}
		//d_print_mat(1, nb[jj], bd[jj], 1);
		}
	//exit(1);

	// slack variables, Lagrangian multipliers for inequality constraints and work space
	for(jj=0; jj<=N; jj++)
		{
		dlam[jj] = ptr;
		dt[jj]   = ptr + 2*pnb[jj];
		ptr += 4*pnb[jj];
		}

	for(jj=0; jj<=N; jj++)
		{
		lamt[jj] = ptr;
		ptr += 2*pnb[jj];
		}

	for(jj=0; jj<=N; jj++)
		{
		t_inv[jj] = ptr;
		ptr += 2*pnb[jj];
		}
	
	work = ptr;
	ptr += pnxM + pnuM;



	double temp0, temp1;
	double alpha, mu, mu_aff;
	double mu_scal = 0.0;
	for(jj=0; jj<=N; jj++) mu_scal += nb[jj];
	mu_scal = 0.5/mu_scal;
	double sigma, sigma_decay, sigma_min;

	sigma = sigma_par[0]; //0.4;
	sigma_decay = sigma_par[1]; //0.3;
	sigma_min = sigma_par[2]; //0.01;
	

	// initialize ux & t>0 (slack variable)
	d_init_var_diag_mpc(N, nx, nu, nb, idxb, ux, pi, d, t, lam, mu0, warm_start);


#if 0
for(ii=0; ii<=N; ii++)
	d_print_mat(1, nu[ii]+nx[ii], ux[ii], 1);
for(ii=0; ii<=N; ii++)
	d_print_mat(1, 2*pnb[ii], t[ii], 1);
for(ii=0; ii<=N; ii++)
	d_print_mat(1, 2*pnb[ii], lam[ii], 1);
exit(1);
#endif

	// initialize pi
	for(jj=0; jj<=N; jj++)
		for(ll=0; ll<nx[jj]; ll++)
			dpi[jj][ll] = 0.0;



	// initialize dux
	for(ll=0; ll<nx[0]; ll++)
		dux[0][nu[0]+ll] = ux[0][nu[0]+ll];



	// compute the duality gap
	//alpha = 0.0; // needed to compute mu !!!!!
	//d_compute_mu_hard_mpc(N, nx, nu, nb, &mu, mu_scal, alpha, lam, dlam, t, dt);
	mu = mu0;

	// set to zero iteration count
	*kk = 0;	

	// larger than minimum accepted step size
	alpha = 1.0;

	// update hessian in Riccati routine
	const int update_hessian = 1;

	//int fast_rsqrt = 0;



	// IP loop		
	while( *kk<k_max && mu>mu_tol && alpha>=alpha_min )
		{
						


		//update cost function matrices and vectors (box constraints)
		d_update_hessian_diag_mpc(N, nx, nu, nb, 0.0, t, t_inv, lam, lamt, dlam, bd, bl, pd, pl, d);

#if 0
for(ii=0; ii<=N; ii++)
	d_print_mat(1, 2*pnb[ii], t[ii], 1);
for(ii=0; ii<=N; ii++)
	d_print_mat(1, 2*pnb[ii], t_inv[ii], 1);
for(ii=0; ii<=N; ii++)
	d_print_mat(1, 2*pnb[ii], lam[ii], 1);
for(ii=0; ii<=N; ii++)
	d_print_mat(1, 2*pnb[ii], lamt[ii], 1);
for(ii=0; ii<=N; ii++)
	d_print_mat(1, 2*pnb[ii], dlam[ii], 1);
for(ii=0; ii<=N; ii++)
	d_print_mat(1, nb[ii], bd[ii], 1);
for(ii=0; ii<=N; ii++)
	d_print_mat(1, nb[ii], pd[ii], 1);
for(ii=0; ii<=N; ii++)
	d_print_mat(1, nb[ii], bl[ii], 1);
for(ii=0; ii<=N; ii++)
	d_print_mat(1, nb[ii], pl[ii], 1);
if(*kk==1)
exit(1);
#endif
#if 0
for(ii=0; ii<=N; ii++)
	d_print_mat(1, nb[ii], pd[ii], 1);
for(ii=0; ii<=N; ii++)
	d_print_mat(1, nb[ii], pl[ii], 1);
//if(*kk==1)
exit(1);
#endif



		// update hessian & jacobian
		for(jj=0; jj<=N; jj++)
			{
			for(ll=0; ll<nb[jj] && idxb[jj][ll]<nu[jj]; ll++)
				{
				idx = idxb[jj][ll];
				pR[jj][idx/bs*bs*cnu[jj]+idx%bs+idx*bs] = pd[jj][ll];
				rq[jj][idx] = pl[jj][ll];
				}
			for(; ll<nb[jj]; ll++)
				{
				idx = idxb[jj][ll] - nu[jj];
				pQ[jj][idx/bs*bs*cnx[jj]+idx%bs+idx*bs] = pd[jj][ll];
				idx = idxb[jj][ll];
				rq[jj][idx] = pl[jj][ll];
				}
			}



#if 0
for(ii=0; ii<N; ii++)
	d_print_pmat(nu[ii], nu[ii], bs, pR[ii], cnu[ii]);
for(ii=0; ii<=N; ii++)
	d_print_pmat(nx[ii], nx[ii], bs, pQ[ii], cnx[ii]);
for(ii=0; ii<=N; ii++)
	d_print_mat(1, nu[ii]+nx[ii], rq[ii], 1);
exit(1);
#endif



		// compute the search direction: factorize and solve the KKT system
		//printf("\n%d %f\n", fast_rsqrt, mu);
		d_ric_diag_trf_mpc(N, nx, nu, dA, pBt, pR, pSt, pQ, pL, pK, pP, work);

#if 0
for(ii=0; ii<=N; ii++)
	d_print_pmat(nx[ii], nx[ii], bs, pP[ii], cnx[ii]);
#endif

		d_ric_diag_trs_mpc(N, nx, nu, dA, pBt, pL, pP, b, rq, dux, 1, Pb, compute_mult, dpi, work);

#if 0
for(ii=0; ii<=N; ii++)
	d_print_pmat(pnu[ii]+pnx[ii], cnu[ii], bs, pL[ii], cnu[ii]);
exit(1);
#endif
#if 0
printf("\ndux\n");
for(ii=0; ii<=N; ii++)
	d_print_mat(1, nu[ii]+nx[ii], dux[ii], 1);
if(*kk==1)
exit(1);
#endif


#if 1
		// compute t_aff & dlam_aff & dt_aff & alpha
		for(jj=0; jj<=N; jj++)
			for(ll=0; ll<2*nb[jj]; ll++)
				dlam[jj][ll] = 0.0;


		alpha = 1.0;
		d_compute_alpha_diag_mpc(N, nx, nu, nb, idxb, &alpha, t, dt, lam, dlam, lamt, dux, d);

		

		stat[5*(*kk)] = sigma;
		stat[5*(*kk)+1] = alpha;
			
		alpha *= 0.995;
		//printf("\nalpha = %f\n", alpha);



		// compute the affine duality gap
		d_compute_mu_diag_mpc(N, nx, nu, nb, &mu_aff, mu_scal, alpha, lam, dlam, t, dt);

		stat[5*(*kk)+2] = mu_aff;
		//printf("\nmu = %f\n", mu_aff);

//mu_aff = 1.346982; // TODO remove !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!


		// compute sigma
		sigma = mu_aff/mu;
		sigma = sigma*sigma*sigma;
//		if(sigma<sigma_min)
//			sigma = sigma_min;
//printf("\n%f %f %f %f\n", mu_aff, mu, sigma, mu_scal);
//exit(1);

#if 0
for(ii=0; ii<=N; ii++)
	d_print_mat(1, 2*pnb[ii], dt[ii], 1);
for(ii=0; ii<=N; ii++)
	d_print_mat(1, 2*pnb[ii], dlam[ii], 1);
for(ii=0; ii<=N; ii++)
	d_print_mat(1, 2*pnb[ii], t_inv[ii], 1);
for(ii=0; ii<=N; ii++)
	d_print_mat(1, nb[ii], pl[ii], 1);
//exit(1);
#endif



		d_update_gradient_diag_mpc(N, nx, nu, nb, sigma*mu, dt, dlam, t_inv, pl);

#if 0
for(ii=0; ii<=N; ii++)
	d_print_mat(1, nb[ii], pl[ii], 1);
if(*kk==1)
exit(1);
#endif


		// update jacobian
		for(jj=0; jj<=N; jj++)
			{
			for(ll=0; ll<nb[jj] && idxb[jj][ll]<nu[jj]; ll++)
				{
				idx = idxb[jj][ll];
				rq[jj][idx] = pl[jj][ll];
				}
			for(; ll<nb[jj]; ll++)
				{
				idx = idxb[jj][ll];
				rq[jj][idx] = pl[jj][ll];
				}
			}




		// solve the system
		d_ric_diag_trs_mpc(N, nx, nu, dA, pBt, pL, pP, b, rq, dux, 0, Pb, compute_mult, dpi, work);
		//d_ric_trs_mpc(nx, nu, N, pBAbt, pL, pl, dux, work, 1, Pb, compute_mult, dpi, nb, ng, ngN, pDCt, qx);
#endif


#if 0
printf("\ndux\n");
for(ii=0; ii<=N; ii++)
	d_print_mat(1, nu[ii]+nx[ii], dux[ii], 1);
if(*kk==1)
exit(1);
#endif




		// compute t & dlam & dt & alpha
		alpha = 1.0;
		d_compute_alpha_diag_mpc(N, nx, nu, nb, idxb, &alpha, t, dt, lam, dlam, lamt, dux, d);
		//printf("\n%f\n", alpha);
		//exit(1);

		stat[5*(*kk)] = sigma;
		stat[5*(*kk)+3] = alpha;
			
		alpha *= 0.995;



		// update x, u, lam, t & compute the duality gap mu

		d_update_var_diag_mpc(N, nx, nu, nb, &mu, mu_scal, alpha, ux, dux, t, dt, lam, dlam, pi, dpi);

#if 0
for(ii=0; ii<=N; ii++)
	d_print_mat(1, nu[ii]+nx[ii], ux[ii], 1);
for(ii=0; ii<=N; ii++)
	d_print_mat(1, 2*pnb[ii], t[ii], 1);
for(ii=0; ii<=N; ii++)
	d_print_mat(1, 2*pnb[ii], lam[ii], 1);
exit(1);
#endif

		stat[5*(*kk)+4] = mu;
		
		// update sigma
//		sigma *= sigma_decay;
//		if(sigma<sigma_min)
//			sigma = sigma_min;
//		if(alpha<0.3)
//			sigma = sigma_par[0];


#if 0
d_print_mat(1, 2*pnb+2*png, lam[0], 1);
d_print_mat(1, 2*pnb+2*png, lam[1], 1);
d_print_mat(1, 2*pnb+2*png, lam[N], 1);
d_print_mat(1, 2*pnb+2*png, t[0], 1);
d_print_mat(1, 2*pnb+2*png, t[1], 1);
d_print_mat(1, 2*pnb+2*png, t[N], 1);
printf("\n%f\n", mu);
exit(1);
#endif

//mu = 13.438997;

		// increment loop index
		(*kk)++;



		} // end of IP loop
	

#if 0
for(ii=0; ii<=N; ii++)
	d_print_mat(1, nu[ii]+nx[ii], rq[ii], 1);
#endif

	// restore Hessian
	//for(jj=0; jj<=N; jj++)
	//	{
	//	for(ll=0; ll<nu[jj]; ll++)
	//		pR[jj][(ll/bs)*bs*cnu[jj]+ll%bs+ll*bs] = bd[jj][ll];
	//	for(ll=0; ll<nx[jj]; ll++)
	//		pQ[jj][(ll/bs)*bs*cnx[jj]+ll%bs+ll*bs] = bd[jj][nu[jj]+ll];
	//	}
	for(jj=0; jj<=N; jj++)
		{
		for(ll=0; ll<nb[jj] && idxb[jj][ll]<nu[jj]; ll++)
			{
			idx = idxb[jj][ll];
			pR[jj][idx/bs*bs*cnu[jj]+idx%bs+idx*bs] = bd[jj][ll];
			rq[jj][idx] = bl[jj][ll];
			}
		for(; ll<nb[jj]; ll++)
			{
			idx = idxb[jj][ll] - nu[jj];
			pQ[jj][idx/bs*bs*cnx[jj]+idx%bs+idx*bs] = bd[jj][ll];
			idx = idxb[jj][ll];
			rq[jj][idx] = bl[jj][ll];
			}
		}

#if 0
for(ii=0; ii<N; ii++)
	d_print_pmat(nu[ii], nu[ii], bs, pR[ii], cnu[ii]);
for(ii=0; ii<=N; ii++)
	d_print_pmat(nx[ii], nx[ii], bs, pQ[ii], cnx[ii]);
for(ii=0; ii<=N; ii++)
	d_print_mat(1, nb[ii], bl[ii], 1);
for(ii=0; ii<=N; ii++)
	d_print_mat(1, nu[ii]+nx[ii], rq[ii], 1);
exit(1);
#endif


	// successful exit
	if(mu<=mu_tol)
		return 0;
	
	// max number of iterations reached
	if(*kk>=k_max)
		return 1;
	
	// no improvement
	if(alpha<alpha_min)
		return 2;
	
	// impossible
	return -1;

	} // end of ipsolver
Beispiel #4
0
/*
 * Test and benchmark driver for the Riccati and interior-point (IPM) solvers.
 *
 * Part 1 ("riccati eye/diag & size-variant") builds a problem whose state
 * dimension changes from stage to stage (nxx[ii] = (N+1-ii)*nx0 + nu0) and
 * runs, in order:
 *   - d_ric_diag_trf_mpc / d_ric_diag_trs_mpc  (+ d_res_diag_mpc residuals),
 *   - d_ric_sv_mpc_tv / d_ric_trs_mpc_tv,
 *   - d_ip2_diag_mpc  (+ d_res_ip_diag_mpc residuals, and a 15-step simulation),
 *   - d_ip2_hard_mpc_tv  (+ d_res_ip_hard_mpc_tv residuals).
 *
 * Part 2 ("normal riccati & IPM") embeds the same data in a fixed-size
 * problem (nx = 25, nu = 1) and runs d_ric_sv_mpc and d_ip2_hard_mpc.
 *
 * Every solver is also timed with gettimeofday() over a fixed number of
 * repetitions; the six average times are printed at the end.
 *
 * Returns 0.
 */
int main()
	{
	
	printf("\n");
	printf("\n");
	printf("\n");
	printf(" HPMPC -- Library for High-Performance implementation of solvers for MPC.\n");
	printf(" Copyright (C) 2014-2015 by Technical University of Denmark. All rights reserved.\n");
	printf("\n");
	printf(" HPMPC is distributed in the hope that it will be useful,\n");
	printf(" but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
	printf(" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n");
	printf(" See the GNU Lesser General Public License for more details.\n");
	printf("\n");
	printf("\n");
	printf("\n");
	
	int ii, jj, ll;

	// placeholder arguments for solver inputs that these tests do not provide
	// (they are passed together with zero constraint counts); initialized to
	// NULL so that no indeterminate pointer value is ever passed to a callee
	double **dummy = NULL;
	int **int_dummy = NULL;

	const int bs = D_MR; // panel height used to round row counts up
	const int ncl = D_NCL; // granularity used to round column counts up
	const int nal = bs*ncl; // number of doubles per cache line
	
	int nx, nu, N, nrep;

	// timing variables (average seconds per call)
	float time_ric_diag, time_ric_full, time_ric_full_tv, time_ip_diag, time_ip_full, time_ip_full_tv;

/************************************************
* test of riccati eye/diag & size-variant
************************************************/
	
#if 1

	// horizon length
	N = 11;

	// base nx and nu
	int nx0 = 2;
	int nu0 = 1;

	// size-varying dimensions: the state size shrinks by nx0 at every stage
	int nxx[N+1];
	for(ii=0; ii<=N; ii++) nxx[ii] = (N+1-ii)*nx0 + nu0;

	// nxx rounded up to a multiple of bs
	int pnxx[N+1];
	for(ii=0; ii<=N; ii++) pnxx[ii] = (nxx[ii]+bs-1)/bs*bs;

	// nxx rounded up to a multiple of ncl
	int cnxx[N+1];
	for(ii=0; ii<=N; ii++) cnxx[ii] = (nxx[ii]+ncl-1)/ncl*ncl;

	// nu0 inputs at every stage except the last one, which has none
	int nuu[N+1];
	for(ii=0; ii<N; ii++) nuu[ii] = nu0;
	nuu[N] = 0;

	int pnuu[N+1];
	for(ii=0; ii<N; ii++) pnuu[ii] = (nuu[ii]+bs-1)/bs*bs;
	pnuu[N] = 0;

	int cnuu[N+1];
	for(ii=0; ii<N; ii++) cnuu[ii] = (nuu[ii]+ncl-1)/ncl*ncl;
	cnuu[N] = 0;



	// factorization
	printf("\nRiccati diag\n\n");

	// data memory space
	double *hdA[N];
	double *hpBt[N];
	double *hpR[N];
	double *hpS[N];
	double *hpQ[N+1];
	double *hpLK[N];
	double *hpP[N+1];
	double *pK;

	for(ii=0; ii<N; ii++)
		{
		d_zeros_align(&hdA[ii], pnxx[ii], 1);
		d_zeros_align(&hpBt[ii], pnuu[ii], cnxx[ii+1]);
		d_zeros_align(&hpR[ii], pnuu[ii], cnuu[ii]);
		d_zeros_align(&hpS[ii], pnxx[ii], cnuu[ii]);
		d_zeros_align(&hpQ[ii], pnxx[ii], cnxx[ii]);
		d_zeros_align(&hpLK[ii], pnuu[ii]+pnxx[ii], cnuu[ii]);
		d_zeros_align(&hpP[ii], pnxx[ii], cnxx[ii]);
		}
	d_zeros_align(&hpQ[N], pnxx[N], cnxx[N]);
	d_zeros_align(&hpP[N], pnxx[N], cnxx[N]);
	d_zeros_align(&pK, pnxx[0], cnuu[0]); // max(nx) x max(nu)

	// dA: all ones
	for(ii=0; ii<N; ii++)
		for(jj=0; jj<nxx[ii+1]; jj++)
			hdA[ii][jj] = 1.0;

	// B: an nu0 x nu0 identity block followed by consecutive slices of BBB
	double *eye_nu0; d_zeros(&eye_nu0, nu0, nu0);
	for(jj=0; jj<nu0; jj++) eye_nu0[jj*(nu0+1)] = 1.0;
	double *ptrB = BBB;
	for(ii=0; ii<N; ii++)
		{
		d_cvt_mat2pmat(nuu[ii], nuu[ii], eye_nu0, nuu[ii], 0, hpBt[ii], cnxx[ii+1]);
		d_cvt_tran_mat2pmat(nxx[ii+1]-nuu[ii], nuu[ii], ptrB, nxx[ii+1]-nuu[ii], 0, hpBt[ii]+nuu[ii]*bs, cnxx[ii+1]);
		ptrB += nxx[ii+1] - nuu[ii];
		}
	free(eye_nu0);

	// R
	// penalty on du (zero diagonal)
	for(ii=0; ii<N; ii++)
		for(jj=0; jj<nuu[ii]; jj++)
			hpR[ii][jj/bs*bs*cnuu[ii]+jj%bs+jj*bs] = 0.0;

	// S (zero)

	// Q: unit diagonal on the first nu0 and on the last nx0 state components
	for(ii=0; ii<=N; ii++)
		{
		// penalty on u
		for(jj=0; jj<nu0; jj++) 
			hpQ[ii][jj/bs*bs*cnxx[ii]+jj%bs+jj*bs] = 1.0;
		// penalty on x
		for(jj=nxx[ii]-nx0; jj<nxx[ii]; jj++) 
			hpQ[ii][jj/bs*bs*cnxx[ii]+jj%bs+jj*bs] = 1.0;
		}

	// work space (also reused by the full Riccati solvers below)
	double *diag; d_zeros_align(&diag, pnxx[0]+pnuu[0], 1);


	// factorization
	printf("\nfactorization ...\n");
	d_ric_diag_trf_mpc(N, nxx, nuu, hdA, hpBt, hpR, hpS, hpQ, hpLK, pK, hpP, diag);
	printf("\nfactorization done\n\n");



	// backward-forward solution

	// data memory space
	double *hrq[N+1];
	double *hux[N+1];
	double *hpi[N+1];
	double *hPb[N];
	double *hb[N];

	for(ii=0; ii<N; ii++)
		{
		d_zeros_align(&hrq[ii], pnuu[ii]+pnxx[ii], 1);
		d_zeros_align(&hux[ii], pnuu[ii]+pnxx[ii], 1);
		d_zeros_align(&hpi[ii], pnxx[ii], 1);
		d_zeros_align(&hPb[ii], pnxx[ii+1], 1);
		d_zeros_align(&hb[ii], pnxx[ii+1], 1);
		}
	d_zeros_align(&hrq[N], pnuu[N]+pnxx[N], 1);
	d_zeros_align(&hux[N], pnuu[N]+pnxx[N], 1);
	d_zeros_align(&hpi[N], pnxx[N], 1);

	double *work_diag; d_zeros_align(&work_diag, pnxx[0], 1);

	// zero linear cost terms and zero affine dynamics term
	for(ii=0; ii<=N; ii++)
		for(jj=0; jj<nuu[ii]; jj++)
			hrq[ii][jj] = 0.0;

	for(ii=0; ii<=N; ii++)
		for(jj=0; jj<nxx[ii]; jj++)
			hrq[ii][nuu[ii]+jj] = 0.0;

	for(ii=0; ii<N; ii++)
		for(jj=0; jj<nxx[ii+1]; jj++)
			hb[ii][jj] = 0.0;

	// x0: inputs zero, then nu0 entries of 7.5097, then (15.0194, 0) pairs
	for(jj=0; jj<nuu[0]; jj++)
		{
		hux[0][jj] = 0.0;
		}
	for(; jj<nuu[0]+nu0; jj++)
		{
		hux[0][jj] = 7.5097;
		}
	for(; jj<nxx[0]; jj+=2)
		{
		hux[0][jj+0] = 15.01940;
		hux[0][jj+1] =  0.0;
		}


	printf("\nbackward-forward solution ...\n");
	d_ric_diag_trs_mpc(N, nxx, nuu, hdA, hpBt, hpLK, hpP, hb, hrq, hux, 1, hPb, 1, hpi, work_diag);
	printf("\nbackward-forward solution done\n\n");

#if 1
	printf("\nux\n");
	for(ii=0; ii<=N; ii++)
		d_print_mat(1, nuu[ii]+nxx[ii], hux[ii], 1);
#endif



	// residuals

	// data memory space
	double *hres_rq[N+1];
	double *hres_b[N];

	for(ii=0; ii<N; ii++)
		{
		d_zeros_align(&hres_rq[ii], pnuu[ii]+pnxx[ii], 1);
		d_zeros_align(&hres_b[ii], pnxx[ii+1], 1);
		}
	d_zeros_align(&hres_rq[N], pnuu[N]+pnxx[N], 1);


	printf("\nresuduals ...\n");
	d_res_diag_mpc(N, nxx, nuu, hdA, hpBt, hpR, hpS, hpQ, hb, hrq, hux, hpi, hres_rq, hres_b, work_diag);
	printf("\nresiduals done\n\n");

#if 1
	printf("\nres_q\n");
	for(ii=0; ii<=N; ii++)
		d_print_mat(1, nuu[ii]+nxx[ii], hres_rq[ii], 1);

	printf("\nres_b\n");
	for(ii=0; ii<N; ii++)
		d_print_mat(1, nxx[ii+1], hres_b[ii], 1);
#endif



	// timing
	struct timeval tv20, tv21;

#if 1
	printf("\ntiming ...\n\n");
	gettimeofday(&tv20, NULL); // start

	nrep = 10000;
	for(ii=0; ii<nrep; ii++)
		{
		d_ric_diag_trf_mpc(N, nxx, nuu, hdA, hpBt, hpR, hpS, hpQ, hpLK, pK, hpP, diag);
		d_ric_diag_trs_mpc(N, nxx, nuu, hdA, hpBt, hpLK, hpP, hb, hrq, hux, 1, hPb, 1, hpi, work_diag);
		}

	gettimeofday(&tv21, NULL); // stop

	time_ric_diag = (float) (tv21.tv_sec-tv20.tv_sec)/(nrep+0.0)+(tv21.tv_usec-tv20.tv_usec)/(nrep*1e6);
	printf("\ntiming done\n\n");
#endif



#if 1
	printf("\nRiccati full\n\n");
	// size-variant full: per-stage size nu + nx + 1 and its rounded versions
	int nzz[N+1];
	for(ii=0; ii<=N; ii++) nzz[ii] = nuu[ii] + nxx[ii] + 1;

	int pnzz[N+1];
	for(ii=0; ii<=N; ii++) pnzz[ii] = (nzz[ii]+bs-1)/bs*bs;

	int cnzz[N+1];
	for(ii=0; ii<=N; ii++) cnzz[ii] = (nzz[ii]+ncl-1)/ncl*ncl;

	int anzz[N+1];
	for(ii=0; ii<=N; ii++) anzz[ii] = (nzz[ii]+nal-1)/nal*nal;

	// column count of the factor at stage ii: max(cnzz[ii], cnxx[ii]+ncl);
	// must be indexed by the stage counter ii (ll holds no value yet here)
	int cnll[N+1];
	for(ii=0; ii<=N; ii++) cnll[ii] = cnzz[ii]<cnxx[ii]+ncl ? cnxx[ii]+ncl : cnzz[ii];

	// all-zero constraint counts passed to the unconstrained Riccati solvers
	int nzero[N+1];
	for(ii=0; ii<=N; ii++) nzero[ii] = 0;

	double *hpBAbt_tv[N];
	double *hpRSQ_tv[N+1];
	double *hpL_tv[N+1];
	double *hl[N+1];

	for(ii=0; ii<N; ii++)
		{
		d_zeros_align(&hpBAbt_tv[ii], pnzz[ii], cnxx[ii+1]);
		d_zeros_align(&hpRSQ_tv[ii], pnzz[ii], cnzz[ii]);
		d_zeros_align(&hpL_tv[ii], pnzz[ii], cnll[ii]);
		d_zeros_align(&hl[ii], anzz[ii], 1);
		}
	d_zeros_align(&hpRSQ_tv[N], pnzz[N], cnzz[N]);
	d_zeros_align(&hpL_tv[N], pnzz[N], cnll[N]);
	d_zeros_align(&hl[N], anzz[N], 1);
	
	double *work_ric_tv; d_zeros_align(&work_ric_tv, pnzz[0], cnxx[0]);

	// assemble the stacked dynamics: rows of hpBt, then a unit diagonal, then b
	for(ii=0; ii<N; ii++)
		{
		// NOTE(review): hpBt[ii] was allocated and filled with cnxx[ii+1] columns,
		// while cnxx[ii] is passed as its stride here -- confirm this is intended
		d_copy_pmat(nuu[ii], nxx[ii+1], bs, hpBt[ii], cnxx[ii], hpBAbt_tv[ii], cnxx[ii+1]);
		for(jj=0; jj<nxx[ii+1]; jj++) hpBAbt_tv[ii][(nuu[ii]+jj)/bs*bs*cnxx[ii+1]+(nuu[ii]+jj)%bs+jj*bs] = 1.0;
		for(jj=0; jj<nxx[ii+1]; jj++) hpBAbt_tv[ii][(nuu[ii]+nxx[ii])/bs*bs*cnxx[ii+1]+(nuu[ii]+nxx[ii])%bs+jj*bs] = hb[ii][jj];
		}
	
	// assemble the stacked cost: same diagonal as hpR/hpQ, with rq in the last row
	for(ii=0; ii<=N; ii++)
		{
		// R
		// penalty on du
		for(jj=0; jj<nuu[ii]; jj++)
			hpRSQ_tv[ii][jj/bs*bs*cnzz[ii]+jj%bs+jj*bs] = 0.0;
		// Q
		// penalty on u
		for(; jj<nuu[ii]+nu0; jj++) 
			hpRSQ_tv[ii][jj/bs*bs*cnzz[ii]+jj%bs+jj*bs] = 1.0;
		// penalty on x
		for(jj=nuu[ii]+nxx[ii]-nx0; jj<nuu[ii]+nxx[ii]; jj++) 
			hpRSQ_tv[ii][jj/bs*bs*cnzz[ii]+jj%bs+jj*bs] = 1.0;
		// r q
		for(jj=0; jj<nuu[ii]+nxx[ii]; jj++) hpRSQ_tv[ii][(nuu[ii]+nxx[ii])/bs*bs*cnzz[ii]+(nuu[ii]+nxx[ii])%bs+jj*bs] = hrq[ii][jj];
		}


	printf("\nfactorization and backward-forward solution ...\n");
	d_ric_sv_mpc_tv(N, nxx, nuu, hpBAbt_tv, hpRSQ_tv, hux, hpL_tv, work_ric_tv, diag, COMPUTE_MULT, hpi, nzero, int_dummy, dummy, dummy, nzero, dummy, dummy, dummy, 0);
	printf("\nfactorization and backward-forward solution done\n\n");

	printf("\nux\n");
	for(ii=0; ii<=N; ii++)
		d_print_mat(1, nuu[ii]+nxx[ii], hux[ii], 1);


	// load b into the state part of the next stage before the solve-only call
	for(ii=0; ii<N; ii++)
		for(jj=0; jj<nxx[ii+1]; jj++)
			hux[ii+1][nuu[ii+1]+jj] = hb[ii][jj];

	printf("\nbackward-forward solution ...\n");
	d_ric_trs_mpc_tv(N, nxx, nuu, hpBAbt_tv, hpL_tv, hrq, hl, hux, work_ric_tv, 1, hPb, COMPUTE_MULT, hpi, nzero, int_dummy, dummy, nzero, dummy, dummy);
	printf("\nbackward-forward solution done\n\n");

	printf("\nux\n");
	for(ii=0; ii<=N; ii++)
		d_print_mat(1, nuu[ii]+nxx[ii], hux[ii], 1);


	printf("\nresuduals ...\n");
	d_res_diag_mpc(N, nxx, nuu, hdA, hpBt, hpR, hpS, hpQ, hb, hrq, hux, hpi, hres_rq, hres_b, work_diag);
	printf("\nresiduals done\n\n");

#if 1
	printf("\nres_q\n");
	for(ii=0; ii<=N; ii++)
		d_print_mat(1, nuu[ii]+nxx[ii], hres_rq[ii], 1);

	printf("\nres_b\n");
	for(ii=0; ii<N; ii++)
		d_print_mat(1, nxx[ii+1], hres_b[ii], 1);
#endif
	
#if 1
	printf("\ntiming ...\n\n");
	gettimeofday(&tv20, NULL); // start

	nrep = 10000;
	for(ii=0; ii<nrep; ii++)
		{
		d_ric_sv_mpc_tv(N, nxx, nuu, hpBAbt_tv, hpRSQ_tv, hux, hpL_tv, work_ric_tv, diag, COMPUTE_MULT, hpi, nzero, int_dummy, dummy, dummy, nzero, dummy, dummy, dummy, 0);
		}

	gettimeofday(&tv21, NULL); // stop

	time_ric_full_tv = (float) (tv21.tv_sec-tv20.tv_sec)/(nrep+0.0)+(tv21.tv_usec-tv20.tv_usec)/(nrep*1e6);
	printf("\ntiming done\n\n");
#endif
	

#endif



#if 1
	// IPM
	printf("\nIPM diag\n\n");

	int kk = -1; // receives the iteration count from the IPM routines
	int kmax = 50;
	double mu0 = 1;
	double mu_tol = 1e-8;
	double alpha_min = 1e-12;
	double sigma_par[] = {0.4, 0.3, 0.01};
	double stat[5*50] = {0}; // 5 statistics per iteration, sized for kmax = 50 iterations

	// number of box constraints per stage
	int nbb[N+1];
	nbb[0] = nu0;
	for(ii=1; ii<N; ii++) nbb[ii] = 2*nu0 + nx0;
	nbb[N] = nu0 + nx0;

	// indices (within [u; x] of each stage) of the box-constrained variables
	int *idxb[N+1];
	for(ii=0; ii<=N; ii++)
		{
		idxb[ii] = (int *) malloc(nbb[ii]*sizeof(int));
		}

	int pnbb[N+1];
	for(ii=0; ii<=N; ii++) pnbb[ii] = (nbb[ii]+bs-1)/bs*bs;

	// data memory space
	double *hd[N+1];
	double *hlam[N+1];
	double *ht[N+1];
	double *hres_d[N+1];
	for(ii=0; ii<=N; ii++)
		{
		d_zeros_align(&hd[ii], 2*pnbb[ii], 1);
		d_zeros_align(&hlam[ii], 2*pnbb[ii], 1);
		d_zeros_align(&ht[ii], 2*pnbb[ii], 1);
		d_zeros_align(&hres_d[ii], 2*pnbb[ii], 1);
		}

	double mu = -1;

	// bounds: hd[ii] stores two values per constrained variable, one at offset
	// ll and one at offset pnbb[ii]+ll
	// (presumably the lower and the upper bound data -- TODO confirm)
	ii = 0; // initial stage: only the inputs are constrained
	ll = 0;
	for(jj=0; jj<nuu[ii]; jj++)
		{
		hd[ii][ll]                  = -20.5;
		hd[ii][pnbb[ii]+ll]         = -20.5;
		idxb[ii][ll] = jj;
		ll++;
		}
	for(ii=1; ii<=N; ii++)
		{
		ll = 0;
		// inputs
		for(jj=0; jj<nuu[ii]; jj++)
			{
			hd[ii][ll]          = -20.5;
			hd[ii][pnbb[ii]+ll] = -20.5;
			idxb[ii][ll] = jj;
			ll++;
			}
		// first nu0 state components
		for(; jj<nuu[ii]+nu0; jj++)
			{
			hd[ii][ll]          = - 2.5;
			hd[ii][pnbb[ii]+ll] = -10.0;
			idxb[ii][ll] = jj;
			ll++;
			}
		// skip nx0*(N-ii) unconstrained components, then constrain the next two
		jj += nx0*(N-ii);
		hd[ii][ll+0]          = - 0.0;
		hd[ii][pnbb[ii]+ll+0] = -20.0;
		idxb[ii][ll] = jj;
		ll++;
		jj++;
		hd[ii][ll+0]          = -10.0;
		hd[ii][pnbb[ii]+ll+0] = -10.0;
		idxb[ii][ll] = jj;
		ll++;
		jj++;
		}

	// x0 (same values as for the Riccati test)
	for(jj=0; jj<nuu[0]; jj++)
		{
		hux[0][jj] = 0.0;
		}
	for(; jj<nuu[0]+nu0; jj++)
		{
		hux[0][jj] = 7.5097;
		}
	for(; jj<nxx[0]; jj+=2)
		{
		hux[0][jj+0] = 15.01940;
		hux[0][jj+1] =  0.0;
		}


	// stage 0 has the largest dimensions
	int pnxM = pnxx[0];
	int pnuM = pnuu[0];
	int cnuM = cnuu[0];

	int anxx[N+1];
	for(ii=0; ii<=N; ii++) anxx[ii] = (nxx[ii]+nal-1)/nal*nal;

	int anuu[N+1];
	for(ii=0; ii<=N; ii++) anuu[ii] = (nuu[ii]+nal-1)/nal*nal;

	// work space for d_ip2_diag_mpc: a double part followed by an int part
	int work_space_ip_double = 0;
	for(ii=0; ii<=N; ii++)
		work_space_ip_double += anuu[ii] + 3*anxx[ii] + (pnuu[ii]+pnxx[ii])*cnuu[ii] + pnxx[ii]*cnxx[ii] + 12*pnbb[ii];
	work_space_ip_double += pnxM*cnuM + pnxM + pnuM;
	int work_space_ip_int = (N+1)*7*sizeof(int);
	work_space_ip_int = (work_space_ip_int+63)/64*64; // round the byte count up to a multiple of 64
	work_space_ip_int /= sizeof(int);
	printf("\nIPM diag work space size: %d double + %d int\n\n", work_space_ip_double, work_space_ip_int);
	double *work_space_ip; d_zeros_align(&work_space_ip, work_space_ip_double+(work_space_ip_int+1)/2, 1); // XXX assume sizeof(double) = 2 * sizeof(int) !!!!!


	printf("\nIPM solution ...\n");
	d_ip2_diag_mpc(&kk, kmax, mu0, mu_tol, alpha_min, 0, sigma_par, stat, N, nxx, nuu, nbb, idxb, hdA, hpBt, hpR, hpS, hpQ, hb, hd, hrq, hux, 1, hpi, hlam, ht, work_space_ip);
	printf("\nIPM solution done\n");


	printf("\nux\n");
	for(ii=0; ii<=N; ii++)
		d_print_mat(1, nuu[ii]+nxx[ii], hux[ii], 1);

	printf("\nlam\n");
	for(ii=0; ii<=N; ii++)
		{
		d_print_mat(1, nbb[ii], hlam[ii], 1);
		d_print_mat(1, nbb[ii], hlam[ii]+pnbb[ii], 1);
		}

	printf("\nt\n");
	for(ii=0; ii<=N; ii++)
		{
		d_print_mat(1, nbb[ii], ht[ii], 1);
		d_print_mat(1, nbb[ii], ht[ii]+pnbb[ii], 1);
		}

	printf("\nstatistics\n\n");
	for(ii=0; ii<kk; ii++)
		printf("%d\t%f\t%f\t%f\t%e\t%f\t%f\t%e\n", ii+1, stat[5*ii+0], stat[5*ii+1], stat[5*ii+2], stat[5*ii+2], stat[5*ii+3], stat[5*ii+4], stat[5*ii+4]);
	printf("\n\n");


	// residuals
	printf("\nresuduals IPM ...\n");
	d_res_ip_diag_mpc(N, nxx, nuu, nbb, idxb, hdA, hpBt, hpR, hpS, hpQ, hb, hrq, hd, hux, hpi, hlam, ht, hres_rq, hres_b, hres_d, &mu, work_diag);
	printf("\nresiduals IPM done\n");

	printf("\nres_rq\n");
	for(ii=0; ii<=N; ii++)
		d_print_mat(1, nuu[ii]+nxx[ii], hres_rq[ii], 1);

	printf("\nres_b\n");
	for(ii=0; ii<N; ii++)
		d_print_mat(1, nxx[ii+1], hres_b[ii], 1);

	printf("\nres_d\n");
	for(ii=0; ii<=N; ii++)
		{
		d_print_mat(1, nbb[ii], hres_d[ii], 1);
		d_print_mat(1, nbb[ii], hres_d[ii]+pnbb[ii], 1);
		}

	printf("\nres_mu\n");
	d_print_mat(1, 1, &mu, 1);


	// timing
	printf("\ntiming ...\n\n");
	gettimeofday(&tv20, NULL); // start

	nrep = 1000;
	for(ii=0; ii<nrep; ii++)
		{
		d_ip2_diag_mpc(&kk, kmax, mu0, mu_tol, alpha_min, 0, sigma_par, stat, N, nxx, nuu, nbb, idxb, hdA, hpBt, hpR, hpS, hpQ, hb, hd, hrq, hux, 1, hpi, hlam, ht, work_space_ip);
		}

	gettimeofday(&tv21, NULL); // stop
	printf("\ntiming done\n\n");

	time_ip_diag = (float) (tv21.tv_sec-tv20.tv_sec)/(nrep+0.0)+(tv21.tv_usec-tv20.tv_usec)/(nrep*1e6);


	// simulation: solve, overwrite the state part of hux[0] with the product
	// computed by dgemv_t_lib from hpBt[0] and the input, then shift the
	// trailing state entries by nx0 positions
	printf("\nsimulation ...\n\n");
	nrep = 15;
	for(ii=0; ii<nrep; ii++)
		{

		d_ip2_diag_mpc(&kk, kmax, mu0, mu_tol, alpha_min, 0, sigma_par, stat, N, nxx, nuu, nbb, idxb, hdA, hpBt, hpR, hpS, hpQ, hb, hd, hrq, hux, 1, hpi, hlam, ht, work_space_ip);

		dgemv_t_lib(nuu[0], nxx[0], hpBt[0], cnxx[0], hux[0], hux[0]+nuu[0], 1);
		for(jj=0; jj<nxx[0]-nx0-nu0; jj++) hux[0][nuu[0]+nxx[0]-jj-1] = hux[0][nuu[0]+nxx[0]-jj-1-nx0];

		printf("\nsimulation step = %d, IPM iterations = %d, mu = %e\n\n", ii, kk, stat[5*(kk-1)+4]);
		d_print_mat(1, nuu[0]+nxx[0], hux[0], 1);

		}
	printf("\nsimulation done\n\n");



#if 1
	// IPM
	printf("\nIPM full\n\n");

	// no general constraints at any stage
	int ngg[N+1];
	for(ii=0; ii<=N; ii++) ngg[ii] = 0;

	// work space sizes are queried from the library
	double *work_ipm_tv_double; d_zeros_align(&work_ipm_tv_double, d_ip2_hard_mpc_tv_work_space_size_double(N, nxx, nuu, nbb, ngg), 1);
	int *work_ipm_tv_int = (int *) malloc(d_ip2_hard_mpc_tv_work_space_size_int(N, nxx, nuu, nbb, ngg)*sizeof(int));


	// x0 (same values as above)
	for(jj=0; jj<nuu[0]; jj++)
		{
		hux[0][jj] = 0.0;
		}
	for(; jj<nuu[0]+nu0; jj++)
		{
		hux[0][jj] = 7.5097;
		}
	for(; jj<nxx[0]; jj+=2)
		{
		hux[0][jj+0] = 15.01940;
		hux[0][jj+1] =  0.0;
		}



	printf("\nIPM solution ...\n");
	d_ip2_hard_mpc_tv(&kk, kmax, mu0, mu_tol, alpha_min, 0, sigma_par, stat, N, nxx, nuu, nbb, idxb, ngg, hpBAbt_tv, hpRSQ_tv, dummy, hd, hux, 1, hpi, hlam, ht, work_ipm_tv_double, work_ipm_tv_int);
	printf("\nIPM solution done\n");



	printf("\nux\n");
	for(ii=0; ii<=N; ii++)
		d_print_mat(1, nuu[ii]+nxx[ii], hux[ii], 1);

	printf("\nlam\n");
	for(ii=0; ii<=N; ii++)
		{
		d_print_mat(1, nbb[ii], hlam[ii], 1);
		d_print_mat(1, nbb[ii], hlam[ii]+pnbb[ii], 1);
		}

	printf("\nt\n");
	for(ii=0; ii<=N; ii++)
		{
		d_print_mat(1, nbb[ii], ht[ii], 1);
		d_print_mat(1, nbb[ii], ht[ii]+pnbb[ii], 1);
		}

	printf("\nstatistics\n\n");
	for(ii=0; ii<kk; ii++)
		printf("%d\t%f\t%f\t%f\t%e\t%f\t%f\t%e\n", ii+1, stat[5*ii+0], stat[5*ii+1], stat[5*ii+2], stat[5*ii+2], stat[5*ii+3], stat[5*ii+4], stat[5*ii+4]);
	printf("\n\n");


	printf("\nresiduals ...\n\n");
	d_res_ip_hard_mpc_tv(N, nxx, nuu, nbb, idxb, ngg, hpBAbt_tv, hpRSQ_tv, hrq, hux, dummy, hd, hpi, hlam, ht, hres_rq, hres_b, hres_d, &mu);
	printf("\nresiduals dones\n\n");

	printf("\nres_rq\n");
	for(ii=0; ii<=N; ii++)
		d_print_mat(1, nuu[ii]+nxx[ii], hres_rq[ii], 1);

	printf("\nres_b\n");
	for(ii=0; ii<N; ii++)
		d_print_mat(1, nxx[ii+1], hres_b[ii], 1);

	printf("\nres_d\n");
	for(ii=0; ii<=N; ii++)
		{
		d_print_mat(1, nbb[ii], hres_d[ii], 1);
		d_print_mat(1, nbb[ii], hres_d[ii]+pnbb[ii], 1);
		}

	printf("\nres_mu\n");
	d_print_mat(1, 1, &mu, 1);



	// timing
	printf("\ntiming ...\n\n");
	gettimeofday(&tv20, NULL); // start

	nrep = 1000;
	for(ii=0; ii<nrep; ii++)
		{
		d_ip2_hard_mpc_tv(&kk, kmax, mu0, mu_tol, alpha_min, 0, sigma_par, stat, N, nxx, nuu, nbb, idxb, ngg, hpBAbt_tv, hpRSQ_tv, dummy, hd, hux, 1, hpi, hlam, ht, work_ipm_tv_double, work_ipm_tv_int);
		}

	gettimeofday(&tv21, NULL); // stop
	printf("\ntiming done\n\n");

	time_ip_full_tv = (float) (tv21.tv_sec-tv20.tv_sec)/(nrep+0.0)+(tv21.tv_usec-tv20.tv_usec)/(nrep*1e6);



	// free the memory of the size-variant full Riccati / IPM tests
	free(work_ric_tv);
	free(work_ipm_tv_double);
	free(work_ipm_tv_int);
	for(ii=0; ii<N; ii++)
		{
		free(hpBAbt_tv[ii]);
		free(hpRSQ_tv[ii]);
		free(hpL_tv[ii]);
		free(hl[ii]);
		}
	free(hpRSQ_tv[N]);
	free(hpL_tv[N]);
	free(hl[N]);

#endif



	// free the memory of the IPM diag test
	for(ii=0; ii<=N; ii++)
		{
		free(idxb[ii]);
		free(hd[ii]);
		free(hlam[ii]);
		free(ht[ii]);
		free(hres_d[ii]);
		}
	free(work_space_ip);
#endif



	// free the memory of the Riccati diag test
	// (diag is released later: the full Riccati test below still uses it)
	for(ii=0; ii<N; ii++)
		{
		free(hdA[ii]);
		free(hpBt[ii]);
		free(hpR[ii]);
		free(hpS[ii]);
		free(hpQ[ii]);
		free(hpLK[ii]);
		free(hpP[ii]);
		free(hrq[ii]);
		free(hux[ii]);
		free(hpi[ii]);
		free(hPb[ii]);
		free(hb[ii]);
		free(hres_rq[ii]);
		free(hres_b[ii]);
		}
	free(hpQ[N]);
	free(hpP[N]);
	free(pK);
	free(hrq[N]);
	free(hux[N]);
	free(hpi[N]);
	free(work_diag);
	free(hres_rq[N]);



/************************************************
* test of normal riccati & IPM
************************************************/
	
	printf("\nRiccati full\n\n");

	// fixed problem size; nxx/nuu from the first test still describe which
	// part of each stage is actually populated
	nx = 25;
	nu = 1;
	N = 11;

	int nz = nx+nu+1;
	int pnz = bs*((nz+bs-1)/bs);
	int pnx = bs*((nx+bs-1)/bs);
	int cnz = ncl*((nx+nu+1+ncl-1)/ncl);
	int cnx = ncl*((nx+ncl-1)/ncl);

	int cnl = cnz<cnx+ncl ? cnx+ncl : cnz;


#if 1

	// dynamics: built column-major in BAb_temp, then transposed into panel format
	double *BAb_temp; d_zeros(&BAb_temp, nx, nu+nx+1);
	double *hpBAbt2[N];

	ptrB = BBB;
	for(ii=0; ii<N; ii++)
		{
		d_zeros_align(&hpBAbt2[ii], pnz, cnx);
		for(jj=0; jj<nx*(nx+nu+1); jj++) BAb_temp[jj] = 0.0;
		for(jj=0; jj<nu; jj++) BAb_temp[jj*(nx+1)] = 1.0;
		d_copy_mat(nxx[ii+1]-1, nuu[ii], ptrB, nxx[ii+1]-1, BAb_temp+1, nx);
		ptrB += nxx[ii+1]-1;
		for(jj=0; jj<nxx[ii+1]; jj++) BAb_temp[nuu[ii]*nx+jj*(nx+1)] = 1.0;
		d_cvt_tran_mat2pmat(nx, nx+nu+1, BAb_temp, nx, 0, hpBAbt2[ii], cnx);
		}

	// cost: unit diagonal on the same components as in the size-variant test
	double *RSQ; d_zeros(&RSQ, nz, nz);
	double *hpRSQ[N+1];

	for(ii=0; ii<=N; ii++)
		{
		d_zeros_align(&hpRSQ[ii], pnz, cnz);
		for(jj=0; jj<nz*nz; jj++) RSQ[jj] = 0.0;
		for(jj=nu; jj<2*nu; jj++) RSQ[jj*(nz+1)] = 1.0;
		for(jj=nu+nxx[ii]-nx0; jj<nu+nxx[ii]; jj++) RSQ[jj*(nz+1)] = 1.0;
		d_cvt_mat2pmat(nz, nz, RSQ, nz, 0, hpRSQ[ii], cnz);
		}

	double *hpL[N+1];
	double *hq2[N+1];
	double *hux2[N+1];
	double *hpi2[N+1];
	double *hPb2[N];
	for(jj=0; jj<N; jj++)
		{
		d_zeros_align(&hq2[jj], pnz, 1); // it has to be pnz !!!
		d_zeros_align(&hpL[jj], pnz, cnl);
		d_zeros_align(&hux2[jj], pnz, 1); // it has to be pnz !!!
		d_zeros_align(&hpi2[jj], pnx, 1);
		d_zeros_align(&hPb2[jj], pnx, 1);
		}
	d_zeros_align(&hpL[N], pnz, cnl);
	d_zeros_align(&hq2[N], pnz, 1); // it has to be pnz !!!
	d_zeros_align(&hux2[N], pnz, 1); // it has to be pnz !!!
	d_zeros_align(&hpi2[N], pnx, 1);

	
	double *work; d_zeros_align(&work, pnz, cnx);


	// x0
	for(jj=0; jj<nx+nu; jj++) hux2[0][jj] = 0.0;
	for(jj=0; jj<nu; jj++)
		{
		hux2[0][nu+jj] = 7.5097;
		}
	for(; jj<nx; jj+=2)
		{
		hux2[0][nu+jj+0] = 15.01940;
		hux2[0][nu+jj+1] =  0.0;
		}

	printf("\nfactorization and backward-forward solution ...\n");
	d_ric_sv_mpc(nx, nu, N, hpBAbt2, hpRSQ, 0, dummy, dummy, hux2, hpL, work, diag, COMPUTE_MULT, hpi2, 0, 0, 0, dummy, dummy, dummy, 0);
	printf("\nfactorization and backward-forward solution done\n\n");

#if 1
	printf("\nux Riccati full\n");
	for(ii=0; ii<=N; ii++)
		d_print_mat(1, nx+nu, hux2[ii], 1);
#endif

	
	// residuals

	double *hres_rq2[N+1];
	double *hres_b2[N];

	for(ii=0; ii<N; ii++)
		{
		d_zeros_align(&hres_rq2[ii], pnz, 1);
		d_zeros_align(&hres_b2[ii], pnx, 1);
		}
	d_zeros_align(&hres_rq2[N], pnz, 1);
	

	printf("\nresuduals ...\n");
	d_res_mpc(nx, nu, N, hpBAbt2, hpRSQ, hq2, hux2, hpi2, hres_rq2, hres_b2);
	printf("\nresiduals done\n\n");

	printf("\nres_q full\n");
	// NOTE(review): the first nu entries of the last stage are printed before
	// stages 0..N-1 (this is what the leftover loop counter selected) -- confirm
	// that this ordering is intended
	d_print_mat(1, nu, hres_rq2[N], 1);
	for(ii=0; ii<N; ii++)
		d_print_mat(1, nx+nu, hres_rq2[ii], 1);

	printf("\nres_b full\n");
	for(ii=0; ii<N; ii++)
		d_print_mat(1, nx, hres_b2[ii], 1);



	// timing

#if 1
	printf("\ntiming ...\n\n");
	gettimeofday(&tv20, NULL); // start

	nrep = 10000;
	for(ii=0; ii<nrep; ii++)
		{
		d_ric_sv_mpc(nx, nu, N, hpBAbt2, hpRSQ, 0, dummy, dummy, hux2, hpL, work, diag, COMPUTE_MULT, hpi2, 0, 0, 0, dummy, dummy, dummy, 0);
		}

	gettimeofday(&tv21, NULL); // stop

	time_ric_full = (float) (tv21.tv_sec-tv20.tv_sec)/(nrep+0.0)+(tv21.tv_usec-tv20.tv_usec)/(nrep*1e6);
	printf("\ntiming done\n\n");
#endif



	printf("\nIPM full\n\n");

	// every input and state is box constrained; no general constraints
	int nb  = nu+nx;
	int ng  = 0;
	int ngN = 0;

	int pnb  = (nb+bs-1)/bs*bs;
	int png  = (ng+bs-1)/bs*bs;
	int pngN = (ngN+bs-1)/bs*bs;



	double *hd2[N+1];
	double *hlam2[N+1];
	double *ht2[N+1];

	for(ii=0; ii<N; ii++)
		{
		d_zeros_align(&hd2[ii], 2*pnb+2*png, 1);
		d_zeros_align(&hlam2[ii],2*pnb+2*png, 1);
		d_zeros_align(&ht2[ii], 2*pnb+2*png, 1);
		}
	d_zeros_align(&hd2[N], 2*pnb+2*pngN, 1);
	d_zeros_align(&hlam2[N],2*pnb+2*pngN, 1);
	d_zeros_align(&ht2[N], 2*pnb+2*pngN, 1);

	// work space // more than enough !!!!!
	double *work_ipm_full; d_zeros_align(&work_ipm_full, hpmpc_ip_hard_mpc_dp_work_space(N, nx, nu, nb, ng, ngN), 1);

	// bounds: same values as in the size-variant test on the corresponding
	// components, -100.0 on all the others
	for(ii=0; ii<=N; ii++)
		{
		for(jj=0; jj<nu; jj++)
			{
			hd2[ii][jj]     = -20.5;
			hd2[ii][pnb+jj] = -20.5;
			}
		for(; jj<2*nu; jj++)
			{
			hd2[ii][jj]     = - 2.5;
			hd2[ii][pnb+jj] = -10.0;
			}
		for(; jj<2*nu+(N-ii)*nx0; jj++)
			{
			hd2[ii][jj]     = -100.0;
			hd2[ii][pnb+jj] = -100.0;
			}
		hd2[ii][jj+0]     =   0.0;
		hd2[ii][pnb+jj+0] = -20.0;
		hd2[ii][jj+1]     = -10.0;
		hd2[ii][pnb+jj+1] = -10.0;
		jj += 2;
		for(; jj<nu+nx; jj++)
			{
			hd2[ii][jj]     = -100.0;
			hd2[ii][pnb+jj] = -100.0;
			}
		}



	printf("\nIPM full solve ...\n\n");
	d_ip2_hard_mpc(&kk, kmax, mu0, mu_tol, alpha_min, 0, sigma_par, stat, nx, nu, N, nb, ng, ngN, hpBAbt2, hpRSQ, dummy, hd2, hux2, 1, hpi2, hlam2, ht2, work_ipm_full);
	printf("\nIPM full solve done\n\n");



#if 1
	printf("\nux IPM full\n");
	for(ii=0; ii<=N; ii++)
		d_print_mat(1, nx+nu, hux2[ii], 1);
#endif
	
	printf("\nstatistics\n\n");
	for(ii=0; ii<kk; ii++)
		printf("%d\t%f\t%f\t%f\t%e\t%f\t%f\t%e\n", ii+1, stat[5*ii+0], stat[5*ii+1], stat[5*ii+2], stat[5*ii+2], stat[5*ii+3], stat[5*ii+4], stat[5*ii+4]);
	printf("\n\n");



	// timing
	printf("\ntiming ...\n\n");
	gettimeofday(&tv20, NULL); // start

	nrep = 1000;
	for(ii=0; ii<nrep; ii++)
		{
		d_ip2_hard_mpc(&kk, kmax, mu0, mu_tol, alpha_min, 0, sigma_par, stat, nx, nu, N, nb, ng, ngN, hpBAbt2, hpRSQ, dummy, hd2, hux2, 1, hpi2, hlam2, ht2, work_ipm_full);
		}

	gettimeofday(&tv21, NULL); // stop
	printf("\ntiming done\n\n");

	time_ip_full = (float) (tv21.tv_sec-tv20.tv_sec)/(nrep+0.0)+(tv21.tv_usec-tv20.tv_usec)/(nrep*1e6);



	// free memory (IPM full)
	free(work_ipm_full);
	for(ii=0; ii<N; ii++)
		{
		free(hd2[ii]);
		free(hlam2[ii]);
		free(ht2[ii]);
		}
	free(hd2[N]);
	free(hlam2[N]);
	free(ht2[N]);


	// free memory (Riccati full)
	free(work);
	free(RSQ);
	free(BAb_temp);
	for(ii=0; ii<N; ii++)
		{
		free(hpBAbt2[ii]);
		free(hpRSQ[ii]);
		free(hpL[ii]);
		free(hux2[ii]);
		free(hpi2[ii]);
		free(hq2[ii]);
		free(hPb2[ii]);
		free(hres_rq2[ii]);
		free(hres_b2[ii]);
		}
	free(hpRSQ[N]);
	free(hpL[N]);
	free(hux2[N]);
	free(hpi2[N]);
	free(hq2[N]);
	free(hres_rq2[N]);

#endif

	// last user of diag was the full Riccati test above
	free(diag);

	printf("\nric diag time = %e\t\tric full time = %e\t\tric full tv time = %e\t\tip diag time = %e\t\tip full time = %e\t\tip full tv time = %e\n\n", time_ric_diag, time_ric_full, time_ric_full_tv, time_ip_diag, time_ip_full, time_ip_full_tv);


#endif

	return 0;

	}
Beispiel #5
0
/*
 * Primal-dual interior-point method, hard constraints, time variant matrices (mpc version).
 *
 * Each pass of the main loop performs, in this order:
 *   1. d_update_hessian_hard_mpc()  - update cost matrices/vectors with the constraint terms
 *   2. d_back_ric_sv()              - factorize and solve the KKT system (search direction in dux/dpi)
 *   3. d_compute_alpha_hard_mpc()   - step size of that direction, then d_compute_mu_hard_mpc()
 *                                     for the affine duality gap mu_aff
 *   4. sigma = (mu_aff/mu)^3 and d_update_gradient_hard_mpc() with sigma*mu
 *   5. d_ric_trs_mpc()              - re-solve using the factorization from step 2
 *   6. d_compute_alpha_hard_mpc() + d_update_var_hard_mpc() - take the step and recompute mu
 * (NOTE(review): this looks like a predictor-corrector scheme - confirm against the helper
 * routines, which are defined elsewhere.)
 *
 * Parameters
 *   kk           out: number of iterations performed (reset to 0 on entry, incremented per pass).
 *   k_max        iteration limit; the loop runs while *kk < k_max.
 *   mu0          initial value of the duality measure mu; also forwarded to d_init_var_hard_mpc().
 *   mu_tol       the loop stops once mu <= mu_tol.
 *   alpha_min    the loop stops once the (scaled) step size alpha drops below this value.
 *   warm_start   forwarded unchanged to d_init_var_hard_mpc().
 *   sigma_par    three entries are read ([0] sigma, [1] sigma_decay, [2] sigma_min); with the
 *                code paths currently enabled, sigma is recomputed from mu_aff/mu in every
 *                iteration before it is used, and the decay/min values are only referenced in
 *                commented-out code.
 *   stat         out: 5 doubles per iteration kk:
 *                  stat[5*kk+0] sigma, stat[5*kk+1] alpha of the first step (before the 0.995
 *                  scaling), stat[5*kk+2] mu_aff, stat[5*kk+3] alpha of the final step (before
 *                  the 0.995 scaling), stat[5*kk+4] mu after the update.
 *                Must therefore provide room for 5*k_max doubles.
 *   nx, nu, N    problem sizes used for all indexing (nz = nx+nu+1); arrays below are indexed
 *                0..N (0..N-1 for pBAbt).
 *   nb, ng, ngN  constraint counts; padded to multiples of bs as pnb, png, pngN. ngN is used
 *                for stage N in place of ng.
 *   pBAbt        per-stage packed matrices, stages 0..N-1; row nx+nu is read as the vector b.
 *   pQ           per-stage packed cost matrices, stages 0..N; the diagonal and row nx+nu are
 *                saved on entry and written back on exit.
 *   pDCt, d      forwarded to the helper routines (constraint data - semantics defined there).
 *   ux           in/out: ux[0][nu..nu+nx-1] is copied into the search direction at stage 0;
 *                ux is updated by d_update_var_hard_mpc().
 *   compute_mult forwarded to the Riccati routines.
 *   pi, lam, t   in/out: passed to d_init_var_hard_mpc() and d_update_var_hard_mpc().
 *   work_memory  caller-provided scratch area, carved up sequentially below; no size check is
 *                done here, so the caller must supply a sufficiently large buffer.
 *
 * Returns
 *    0  mu <= mu_tol on exit
 *    1  otherwise, if the iteration limit was reached (*kk >= k_max)
 *    2  otherwise, if alpha < alpha_min
 *   -1  none of the above (e.g. a comparison involving NaN)
 */
int d_ip2_hard_mpc(int *kk, int k_max, double mu0, double mu_tol, double alpha_min, int warm_start, double *sigma_par, double *stat, int nx, int nu, int N, int nb, int ng, int ngN, double **pBAbt, double **pQ, double **pDCt, double **d, double **ux, int compute_mult, double **pi, double **lam, double **t, double *work_memory)
	{
	
	// NOTE(review): nbu is only referenced inside a disabled (#if 0) block further down
	int nbu = nu<nb ? nu : nb ;

	// indices
	// NOTE(review): bs0 is never used in this function
	int jj, ll, ii, bs0;

	// constants
	const int bs = D_MR; //d_get_mr();
	const int ncl = D_NCL;
	const int nal = bs*ncl; // number of doubles per cache line

	// sizes rounded up: p* to a multiple of bs, c* to a multiple of ncl, a* to a multiple of nal
	// NOTE(review): pnx is computed but not used below
	const int nz   = nx+nu+1;
	const int nxu  = nx+nu;
	const int pnz  = bs*((nz+bs-1)/bs);
	const int pnx  = bs*((nx+bs-1)/bs);
	const int pnb  = bs*((nb+bs-1)/bs); // simd aligned number of two-sided box constraints !!!!!!!!!!!!!!!!!!
	const int png  = bs*((ng+bs-1)/bs); // simd aligned number of two-sided general constraints !!!!!!!!!!!!!!!!!!
	const int pngN = bs*((ngN+bs-1)/bs); // simd aligned number of two-sided general constraints at stage N !!!!!!!!!!!!!!!!!!
	const int cnz  = ncl*((nz+ncl-1)/ncl);
	const int cnx  = ncl*((nx+ncl-1)/ncl);
//	const int cng  = ncl*((ng+ncl-1)/ncl);
	const int cngN = ncl*((ngN+ncl-1)/ncl);
	const int cnxg = ncl*((ng+nx+ncl-1)/ncl);
	const int anz  = nal*((nz+nal-1)/nal);
	const int anx  = nal*((nx+nal-1)/nal);
//	const int anb = nal*((2*nb+nal-1)/nal); // cache aligned number of box constraints
	//const int anb = nal*((nb+nal-1)/nal); // cache aligned number of two-sided box constraints !!!!!!!!!!!!!!!!!!

//	const int pad = (ncl-nx%ncl)%ncl; // packing between BAbtL & P
	//const int cnl = cnz<cnx+ncl ? nx+pad+cnx+ncl : nx+pad+cnz;
	// column stride of the pL work matrices: the larger of cnz and cnx+ncl
	const int cnl = cnz<cnx+ncl ? cnx+ncl : cnz;

	//printf("\n%d %d %d %d %d\n", N, nx, nu, nb, ng);
	//d_print_pmat(nz, nx, bs, pBAbt[0], cnx);
	//d_print_pmat(nz, nx, bs, pBAbt[1], cnx);
	//d_print_pmat(nz, nx, bs, pBAbt[N-1], cnx);
	//d_print_pmat(nz, nz, bs, pQ[0], cnz);
	//d_print_pmat(nz, nz, bs, pQ[1], cnz);
	//d_print_pmat(nz, nz, bs, pQ[N], cnz);
	//d_print_pmat(nx+nu, ng, bs, pDCt[0], cng);
	//d_print_pmat(nx+nu, ng, bs, pDCt[1], cng);
	//d_print_pmat(nx+nu, ng, bs, pDCt[N], cng);
	//d_print_mat(1, 2*pnb+2*png, d[0], 1);
	//d_print_mat(1, 2*pnb+2*png, d[1], 1);
	//d_print_mat(1, 2*pnb+2*png, d[N], 1);
	//d_print_mat(1, nx+nu, ux[0], 1);
	//d_print_mat(1, nx+nu, ux[1], 1);
	//d_print_mat(1, nx+nu, ux[N], 1);
	//exit(1);
	
	
	// Work space layout (carved sequentially out of work_memory, in this order):
	//   dux[0..N]            anz doubles each
	//   dpi[0..N]            anx doubles each
	//   pd/pl/bd/bl[0..N]    4*anz doubles per stage
	//   pL[0..N]             pnz*cnl doubles each
	//   work                 pnz*max(cnxg, cngN) doubles
	//   diag                 anz doubles
	//   dlam/dt[0..N]        4*pnb+4*png per stage (4*pnb+4*pngN at stage N)
	//   lamt, t_inv, Qx/qx   2*pnb+2*png per stage each (2*pnb+2*pngN at stage N)
	//   Pb[0..N-1]           anx doubles each

	// initialize work space
	double *ptr;
	ptr = work_memory;

	double *(dux[N+1]);
	double *(dpi[N+1]);
	double *(pL[N+1]);
	double *(pd[N+1]); // pointer to diagonal of Hessian
	double *(pl[N+1]); // pointer to linear part of Hessian
	double *(bd[N+1]); // backup diagonal of Hessian
	double *(bl[N+1]); // backup linear part of Hessian
	double *work;
	double *diag;
	double *(dlam[N+1]);
	double *(dt[N+1]);
	double *(lamt[N+1]);
	double *(t_inv[N+1]);
	double *(Qx[N+1]);
	double *(qx[N+1]);
	double *(Pb[N]);

//	ptr += (N+1)*(pnx + pnz*cnl + 12*pnz) + 3*pnz;

	// inputs and states
	for(jj=0; jj<=N; jj++)
		{
		dux[jj] = ptr;
		ptr += anz;
		}

	// equality constr multipliers
	for(jj=0; jj<=N; jj++)
		{
		dpi[jj] = ptr;
		ptr += anx;
		}
	
	// Hessian
	for(jj=0; jj<=N; jj++)
		{
		pd[jj] = ptr; //pQ[jj];
		pl[jj] = ptr + anz; //pQ[jj] + ((nu+nx)/bs)*bs*cnz + (nu+nx)%bs;
		bd[jj] = ptr + 2*anz;
		bl[jj] = ptr + 3*anz;
		ptr += 4*anz;
		// backup: save the diagonal of pQ[jj] and its row nx+nu (linear part) so that they can
		// be written back before returning
		for(ll=0; ll<nx+nu; ll++)
			{
			bd[jj][ll] = pQ[jj][(ll/bs)*bs*cnz+ll%bs+ll*bs];
			bl[jj][ll] = pQ[jj][((nx+nu)/bs)*bs*cnz+(nx+nu)%bs+ll*bs];
			}
		}

	// work space
	for(jj=0; jj<=N; jj++)
		{
		pL[jj] = ptr;
		ptr += pnz*cnl;
		}
	
	// scratch matrix handed to the Riccati routines: pnz rows times the larger of cnxg and cngN
	work = ptr;
	//ptr += 2*anz;
	if(cngN<=cnxg)
		ptr += pnz*cnxg;
	else
		ptr += pnz*cngN;

	diag = ptr;
	ptr += anz;

	// slack variables, Lagrangian multipliers for inequality constraints and work space (assume # box constraints <= 2*(nx+nu) < 2*pnz)
	for(jj=0; jj<N; jj++)
		{
		dlam[jj] = ptr;
		dt[jj]   = ptr + 2*pnb+2*png;
		ptr += 4*pnb+4*png;
		}
	dlam[N] = ptr;
	dt[N]   = ptr + 2*pnb+2*pngN;
	ptr += 4*pnb+4*pngN;

	for(jj=0; jj<N; jj++)
		{
		lamt[jj] = ptr;
		ptr += 2*pnb+2*png;
		}
	lamt[N] = ptr;
	ptr += 2*pnb+2*pngN;

	for(jj=0; jj<N; jj++)
		{
		t_inv[jj] = ptr;
		ptr += 2*pnb+2*png;
		}
	t_inv[N] = ptr;
	ptr += 2*pnb+2*pngN;

	// Qx and qx share one slot per stage: qx starts png (pngN at stage N) doubles after Qx
	for(jj=0; jj<N; jj++)
		{
		Qx[jj] = ptr;
		qx[jj] = ptr+png;
		ptr += 2*pnb+2*png;
		}
	Qx[N] = ptr;
	qx[N] = ptr+pngN;
	ptr += 2*pnb+2*pngN;

	// backup of P*b
	for(jj=0; jj<N; jj++)
		{
		Pb[jj] = ptr;
		ptr += anx;
		}



	// NOTE(review): temp0 and temp1 are never used
	double temp0, temp1;
	double alpha, mu, mu_aff;
	// scaling handed to the mu computations: reciprocal of N*2*(nb+ng)+2*ngN
	// NOTE(review): this is a division by zero when nb, ng and ngN are all 0 - confirm callers never do that
	double mu_scal = N*2*(nb+ng)+2*ngN;
	//printf("\nmu_scal = %f\n", mu_scal);
	mu_scal = 1.0/mu_scal;
	//printf("\nmu_scal = %f\n", mu_scal);
	double sigma, sigma_decay, sigma_min;
	//printf("\n%d %d %d\n", ng, ngN, N*2*ng+2*ngN);
	//exit(1);

	// sigma_decay and sigma_min are read here but only referenced in commented-out code below
	sigma = sigma_par[0]; //0.4;
	sigma_decay = sigma_par[1]; //0.3;
	sigma_min = sigma_par[2]; //0.01;
	


	// initialize ux & t>0 (slack variable)
	d_init_var_hard_mpc(N, nx, nu, nb, ng, ngN, ux, pi, pDCt, d, t, lam, mu0, warm_start);


#if 0
d_print_mat(1, 2*pnb+2*png, t[0], 1);
d_print_mat(1, 2*pnb+2*png, t[1], 1);
d_print_mat(1, 2*pnb+2*pngN, t[N], 1);
d_print_mat(1, 2*pnb+2*png, lam[0], 1);
d_print_mat(1, 2*pnb+2*png, lam[1], 1);
d_print_mat(1, 2*pnb+2*pngN, lam[N], 1);
exit(1);
#endif

	// initialize pi
	for(jj=0; jj<=N; jj++)
		for(ll=0; ll<nx; ll++)
			dpi[jj][ll] = 0.0;



	// initialize dux: copy the nx entries of ux[0] that follow the first nu entries
	for(ll=0; ll<nx; ll++)
		dux[0][nu+ll] = ux[0][nu+ll];



	// compute the duality gap
	//alpha = 0.0; // needed to compute mu !!!!!
	//d_compute_mu_hard_mpc(N, nx, nu, nb, &mu, mu_scal, alpha, lam, dlam, t, dt);
	mu = mu0;

	// set to zero iteration count
	*kk = 0;	

	// larger than minimum accepted step size
	alpha = 1.0;

	// update hessian in Riccati routine
	const int update_hessian = 1;

	// NOTE(review): fast_rsqrt is assigned inside the loop but never passed to any routine here
	int fast_rsqrt = 0;



	// IP loop		
	while( *kk<k_max && mu>mu_tol && alpha>=alpha_min )
		{
						


		//update cost function matrices and vectors (box constraints)
		d_update_hessian_hard_mpc(N, nx, nu, nb, ng, ngN, cnz, 0.0, t, t_inv, lam, lamt, dlam, Qx, qx, bd, bl, pd, pl, d);

#if 0
d_print_mat(1, 2*pnb+2*png, pd[0], 1);
d_print_mat(1, 2*pnb+2*png, pd[1], 1);
d_print_mat(1, 2*pnb+2*png, pd[N], 1);
d_print_mat(1, 2*pnb+2*png, pl[0], 1);
d_print_mat(1, 2*pnb+2*png, pl[1], 1);
d_print_mat(1, 2*pnb+2*png, pl[N], 1);
#if 0
d_print_mat(1, 2*pnb+2*png, Qx[0], 1);
d_print_mat(1, 2*pnb+2*png, Qx[1], 1);
d_print_mat(1, 2*pnb+2*pngN, Qx[N], 1);
d_print_mat(1, 2*pnb+2*png, qx[0], 1);
d_print_mat(1, 2*pnb+2*png, qx[1], 1);
d_print_mat(1, 2*pnb+2*pngN, qx[N], 1);
#endif
exit(1);
#endif
#if 0
for(ii=0; ii<=N; ii++)
	d_print_mat(1, nu+nx, pd[ii], 1);
for(ii=0; ii<=N; ii++)
	d_print_mat(1, nu+nx, pl[ii], 1);
for(ii=0; ii<N; ii++)
	d_print_mat(1, ng, Qx[ii], 1);
d_print_mat(1, ngN, Qx[N], 1);
for(ii=0; ii<N; ii++)
	d_print_mat(1, ng, qx[ii], 1);
d_print_mat(1, ngN, qx[N], 1);
if(*kk==1)
exit(1);
#endif



		// compute the search direction: factorize and solve the KKT system
		// (the fast_rsqrt level is chosen from mu when FAST_RSQRT is defined: 2 for mu>1e-2,
		// 1 for mu>1e-4, otherwise 0)
#if defined(FAST_RSQRT)
		if(mu>1e-2)
			fast_rsqrt = 2;
		else
			{
			if(mu>1e-4)
				fast_rsqrt = 1;
			else
				fast_rsqrt = 0;
			}
#else
		fast_rsqrt = 0;
#endif
		//printf("\n%d %f\n", fast_rsqrt, mu);
		d_back_ric_sv(N, nx, nu, pBAbt, pQ, update_hessian, pd, pl, 1, dux, pL, work, diag, 1, Pb, compute_mult, dpi, nb, ng, ngN, pDCt, Qx, qx);

#if 0
for(ii=0; ii<=N; ii++)
	d_print_pmat(nz, nz, bs, pL[ii], cnl);
exit(1);
#endif
#if 0
printf("\ndux\n");
for(ii=0; ii<=N; ii++)
	d_print_mat(1, nx+nu, dux[ii], 1);
if(*kk==1)
exit(1);
#endif

#if 1

		// compute t_aff & dlam_aff & dt_aff & alpha
		// (only the first 2*nb entries of dlam are cleared at every stage)
		for(jj=0; jj<=N; jj++)
			for(ll=0; ll<2*nb; ll++)
				dlam[jj][ll] = 0.0;


		alpha = 1.0;
		d_compute_alpha_hard_mpc(N, nx, nu, nb, ng, ngN, &alpha, t, dt, lam, dlam, lamt, dux, pDCt, d);

		

		// NOTE(review): this sigma entry is overwritten later in the same iteration, after
		// sigma has been recomputed
		stat[5*(*kk)] = sigma;
		stat[5*(*kk)+1] = alpha;
			
		// shrink the step by a fixed factor; the unscaled value is what was recorded in stat
		alpha *= 0.995;



		// compute the affine duality gap
		d_compute_mu_hard_mpc(N, nx, nu, nb, ng, ngN, &mu_aff, mu_scal, alpha, lam, dlam, t, dt);

		stat[5*(*kk)+2] = mu_aff;

//mu_aff = 1.346982; // TODO remove !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!


		// compute sigma: cube of the ratio mu_aff/mu (the sigma_min lower bound is disabled)
		sigma = mu_aff/mu;
		sigma = sigma*sigma*sigma;
//		if(sigma<sigma_min)
//			sigma = sigma_min;



		d_update_gradient_hard_mpc(N, nx, nu, nb, ng, ngN, sigma*mu, dt, dlam, t_inv, pl, qx);

#if 0
for(ii=0; ii<=N; ii++)
	d_print_mat(1, nu+nx, pl[ii], 1);
//for(ii=0; ii<N; ii++)
//	d_print_mat(1, ng, qx[ii], 1);
//d_print_mat(1, ngN, qx[N], 1);
if(*kk==1)
exit(1);
#endif


#if 0
		// first stage
		for(ii=0; ii<2*nbu; ii+=2)
			{
			dlam[0][ii+0] = t_inv[0][ii+0]*(sigma*mu - dlam[0][ii+0]*dt[0][ii+0]); // !!!!!
			dlam[0][ii+1] = t_inv[0][ii+1]*(sigma*mu - dlam[0][ii+1]*dt[0][ii+1]); // !!!!!
			pl[0][ii/2] += dlam[0][ii+1] - dlam[0][ii+0];
			}

		// middle stages
		for(jj=1; jj<N; jj++)
			{
			for(ii=0; ii<2*nb; ii+=2)
				{
				dlam[jj][ii+0] = t_inv[jj][ii+0]*(sigma*mu - dlam[jj][ii+0]*dt[jj][ii+0]); // !!!!!
				dlam[jj][ii+1] = t_inv[jj][ii+1]*(sigma*mu - dlam[jj][ii+1]*dt[jj][ii+1]); // !!!!!
				pl[jj][ii/2] += dlam[jj][ii+1] - dlam[jj][ii+0];
				}
			}

		// last stages
		for(ii=2*nu; ii<2*nb; ii+=2)
			{
			dlam[jj][ii+0] = t_inv[jj][ii+0]*(sigma*mu - dlam[jj][ii+0]*dt[jj][ii+0]); // !!!!!
			dlam[jj][ii+1] = t_inv[jj][ii+1]*(sigma*mu - dlam[jj][ii+1]*dt[jj][ii+1]); // !!!!!
			pl[jj][ii/2] += dlam[jj][ii+1] - dlam[jj][ii+0];
			}
#endif



		// copy b into x: row nx+nu of pBAbt[ii] goes into the last nx entries of dux[ii+1]
		for(ii=0; ii<N; ii++)
			for(jj=0; jj<nx; jj++) 
				dux[ii+1][nu+jj] = pBAbt[ii][((nu+nx)/bs)*bs*cnx+(nu+nx)%bs+bs*jj]; // copy b



		// solve the system (uses pL as produced by d_back_ric_sv above)
		d_ric_trs_mpc(nx, nu, N, pBAbt, pL, pl, dux, work, 0, Pb, compute_mult, dpi, nb, ng, ngN, pDCt, qx);

#if 0
printf("\ndux\n");
for(ii=0; ii<=N; ii++)
	d_print_mat(1, nx+nu, dux[ii], 1);
if(*kk==1)
exit(1);
#endif



#endif


		// compute t & dlam & dt & alpha
		alpha = 1.0;
		d_compute_alpha_hard_mpc(N, nx, nu, nb, ng, ngN, &alpha, t, dt, lam, dlam, lamt, dux, pDCt, d);

		// record the sigma actually used in this iteration and the unscaled final step size
		stat[5*(*kk)] = sigma;
		stat[5*(*kk)+3] = alpha;
			
		alpha *= 0.995;



		// update x, u, lam, t & compute the duality gap mu

		d_update_var_hard_mpc(N, nx, nu, nb, ng, ngN, &mu, mu_scal, alpha, ux, dux, t, dt, lam, dlam, pi, dpi);

		stat[5*(*kk)+4] = mu;
		
		// update sigma
/*		sigma *= sigma_decay;*/
/*		if(sigma<sigma_min)*/
/*			sigma = sigma_min;*/
/*		if(alpha<0.3)*/
/*			sigma = sigma_par[0];*/


#if 0
d_print_mat(1, 2*pnb+2*png, lam[0], 1);
d_print_mat(1, 2*pnb+2*png, lam[1], 1);
d_print_mat(1, 2*pnb+2*png, lam[N], 1);
d_print_mat(1, 2*pnb+2*png, t[0], 1);
d_print_mat(1, 2*pnb+2*png, t[1], 1);
d_print_mat(1, 2*pnb+2*png, t[N], 1);
printf("\n%f\n", mu);
exit(1);
#endif

//mu = 13.438997;

		// increment loop index
		(*kk)++;



		} // end of IP loop
	
	// restore Hessian: write the saved diagonal and linear part back into pQ
	for(jj=0; jj<=N; jj++)
		{
		for(ll=0; ll<nx+nu; ll++)
			{
			pQ[jj][(ll/bs)*bs*cnz+ll%bs+ll*bs] = bd[jj][ll];
			pQ[jj][((nx+nu)/bs)*bs*cnz+(nx+nu)%bs+ll*bs] = bl[jj][ll];
			}
		}



	// The exit conditions are tested in priority order: convergence wins even if the
	// iteration limit was reached in the same pass.

	// successful exit
	if(mu<=mu_tol)
		return 0;
	
	// max number of iterations reached
	if(*kk>=k_max)
		return 1;
	
	// no improvement
	if(alpha<alpha_min)
		return 2;
	
	// impossible
	return -1;

	} // end of ipsolver
Beispiel #6
0
int main()
	{

#if defined(REF_BLAS_OPENBLAS)
	openblas_set_num_threads(1);
#endif
#if defined(REF_BLAS_BLIS)
	omp_set_num_threads(1);
#endif

	printf("\n");
	printf("\n");
	printf("\n");
	printf(" HPMPC -- Library for High-Performance implementation of solvers for MPC.\n");
	printf(" Copyright (C) 2014 by Technical University of Denmark. All rights reserved.\n");
	printf("\n");
	printf(" HPMPC is distributed in the hope that it will be useful,\n");
	printf(" but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
	printf(" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n");
	printf(" See the GNU Lesser General Public License for more details.\n");
	printf("\n");
	printf("\n");
	printf("\n");

	printf("Riccati solver performance test - double precision\n");
	printf("\n");

	// maximum frequency of the processor
	const float GHz_max = GHZ_MAX;
	printf("Frequency used to compute theoretical peak: %5.1f GHz (edit test_param.h to modify this value).\n", GHz_max);
	printf("\n");

	// maximum flops per cycle, double precision
#if defined(TARGET_X64_AVX2)
	const float flops_max = 16;
	printf("Testing solvers for AVX & FMA3 instruction sets, 64 bit: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#elif defined(TARGET_X64_AVX)
	const float flops_max = 8;
	printf("Testing solvers for AVX instruction set, 64 bit: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#elif defined(TARGET_X64_SSE3) || defined(TARGET_AMD_SSE3)
	const float flops_max = 4;
	printf("Testing solvers for SSE3 instruction set, 64 bit: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#elif defined(TARGET_CORTEX_A15)
	const float flops_max = 2;
	printf("Testing solvers for ARMv7a VFPv3 instruction set, oprimized for Cortex A15: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#elif defined(TARGET_CORTEX_A9)
	const float flops_max = 1;
	printf("Testing solvers for ARMv7a VFPv3 instruction set, oprimized for Cortex A9: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#elif defined(TARGET_CORTEX_A7)
	const float flops_max = 0.5;
	printf("Testing solvers for ARMv7a VFPv3 instruction set, oprimized for Cortex A7: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#elif defined(TARGET_X86_ATOM)
	const float flops_max = 1;
	printf("Testing solvers for SSE3 instruction set, 32 bit, optimized for Intel Atom: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#elif defined(TARGET_POWERPC_G2)
	const float flops_max = 1;
	printf("Testing solvers for POWERPC instruction set, 32 bit: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#elif defined(TARGET_C99_4X4)
	const float flops_max = 2;
	printf("Testing reference solvers, 4x4 kernel: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#elif defined(TARGET_C99_4X4_PREFETCH)
	const float flops_max = 2;
	printf("Testing reference solvers, 4x4 kernel with register prefetch: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#elif defined(TARGET_C99_2X2)
	const float flops_max = 2;
	printf("Testing reference solvers, 2x2 kernel: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#endif
	
	FILE *f;
	f = fopen("./test_problems/results/test_blas.m", "w"); // a

#if defined(TARGET_X64_AVX2)
	fprintf(f, "C = 'd_x64_avx2';\n");
	fprintf(f, "\n");
#elif defined(TARGET_X64_AVX)
	fprintf(f, "C = 'd_x64_avx';\n");
	fprintf(f, "\n");
#elif defined(TARGET_X64_SSE3) || defined(TARGET_AMD_SSE3)
	fprintf(f, "C = 'd_x64_sse3';\n");
	fprintf(f, "\n");
#elif defined(TARGET_CORTEX_A9)
	fprintf(f, "C = 'd_ARM_cortex_A9';\n");
	fprintf(f, "\n");
#elif defined(TARGET_CORTEX_A7)
	fprintf(f, "C = 'd_ARM_cortex_A7';\n");
	fprintf(f, "\n");
#elif defined(TARGET_CORTEX_A15)
	fprintf(f, "C = 'd_ARM_cortex_A15';\n");
	fprintf(f, "\n");
#elif defined(TARGET_X86_ATOM)
	fprintf(f, "C = 'd_x86_atom';\n");
	fprintf(f, "\n");
#elif defined(TARGET_POWERPC_G2)
	fprintf(f, "C = 'd_PowerPC_G2';\n");
	fprintf(f, "\n");
#elif defined(TARGET_C99_4X4)
	fprintf(f, "C = 'd_c99_4x4';\n");
	fprintf(f, "\n");
#elif defined(TARGET_C99_4X4_PREFETCH)
	fprintf(f, "C = 'd_c99_4x4';\n");
	fprintf(f, "\n");
#elif defined(TARGET_C99_2X2)
	fprintf(f, "C = 'd_c99_2x2';\n");
	fprintf(f, "\n");
#endif

	fprintf(f, "A = [%f %f];\n", GHz_max, flops_max);
	fprintf(f, "\n");

	fprintf(f, "B = [\n");
	

	printf("\n");
	printf("Tested solvers:\n");
	printf("-sv : Riccati factorization and system solution (prediction step in IP methods)\n");
	printf("-trs: system solution after a previous call to Riccati factorization (correction step in IP methods)\n");
	printf("\n");
	printf("\n");
	
#if defined(TARGET_X64_AVX2) || defined(TARGET_X64_AVX) || defined(TARGET_X64_SSE3) || defined(TARGET_X86_ATOM) || defined(TARGET_AMD_SSE3)
/*	printf("\nflush to zero on\n");*/
	_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); // flush to zero subnormals !!! works only with one thread !!!
#endif

	// to throw floating-point exception
/*#ifndef __APPLE__*/
/*    feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);*/
/*#endif*/
	
	int ii, jj;
	
	const int bs = D_MR; //d_get_mr();
	const int ncl = D_NCL;
	const int nal = bs*ncl; // number of doubles per cache line
	
	int nn[] = {4, 6, 8, 10, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92, 96, 100, 104, 108, 112, 116, 120, 124, 128, 132, 136, 140, 144, 148, 152, 156, 160, 164, 168, 172, 176, 180, 184, 188, 192, 196, 200, 204, 208, 212, 216, 220, 224, 228, 232, 236, 240, 244, 248, 252, 256, 260, 264, 268, 272, 276, 280, 284, 288, 292, 296, 300};
	int nnrep[] = {10000, 10000, 10000, 10000, 10000, 4000, 4000, 2000, 2000, 1000, 1000, 400, 400, 400, 200, 200, 200, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 40, 40, 40, 40, 40, 20, 20, 20, 20, 20, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
	
	int vnx[] = {8, 12, 16, 24, 32, 48, 64, 96, 128, 192, 256, 512, 1024};
	int vnrep[] = {100, 100, 100, 100, 100, 100, 50, 50, 50, 20, 10, 10};
	int vN[] = {4, 8, 12, 16, 24, 32, 48, 64, 96, 128, 192, 256};

	int nx, nw, ny, ndN, N, nrep, Ns;
	int diag_R;

	int ll;
//	int ll_max = 77;
	int ll_max = 1;
	for(ll=0; ll<ll_max; ll++)
		{
		

		FILE* fid;
		double* yy;
		float* yy_temp;

		if(1)
			{
			fid = fopen("./test_problems/mhe_measure.dat", "r");
			if(fid==NULL)
				exit(-1);
			//printf("\nhola\n");
			int dummy_int = fscanf(fid, "%d %d %d %d", &nx, &nw, &ny, &Ns);
			//printf("\n%d %d %d %d\n", nx, nw, ny, Ns);
			yy_temp = (float*) malloc(ny*Ns*sizeof(float));
			yy = (double*) malloc(ny*Ns*sizeof(double));
			for(jj=0; jj<ny*Ns; jj++)
				{
				dummy_int = fscanf(fid, "%e", &yy_temp[jj]);
				yy[jj] = (double) yy_temp[jj];
				//printf("\n%f", yy[jj]);
				}
			//printf("\n");
			fclose(fid);
			#if 1
			N = 15; //Ns-1; // NN;
			nrep = NREP;//nnrep[ll];
			nx = 12;//nn[ll];
			nw = 5;//nn[ll];
			ny = 3;
			ndN = 0; //2;
			diag_R = 0;
			#else
			N = 10; //Ns-1; // NN;
			nrep = nnrep[ll];
			nx = nn[ll];
			nw = nn[ll];
			ny = 3;
			ndN = 0;
			diag_R = 0;
			#endif
			//printf("\nnx = %d; nw =  %d; ny =  %d; ndN = %d; N = %d\n\n", nx, nw, ny, ndN, N);
			}
		else if(ll_max==1)
			{
			nx = NX; // number of states (it has to be even for the mass-spring system test problem)
			nw = NU; // number of inputs (controllers) (it has to be at least 1 and at most nx/2 for the mass-spring system test problem)
			ny = nx/2; // size of measurements vector
			N  = NN; // horizon lenght
			nrep = NREP;
			}
		else
			{
			nx = nn[ll]; // number of states (it has to be even for the mass-spring system test problem)
			nw = 2; // number of inputs (controllers) (it has to be at least 1 and at most nx/2 for the mass-spring system test problem)
			ny = nx/2; // size of measurements vector
			N  = 10; // horizon lenght
			nrep = nnrep[ll];
			}

		int rep;
		
	
		const int nz = nx+ny; // TODO delete
		const int nwx = nw+nx;
		const int anz = nal*((nz+nal-1)/nal);
		const int anx = nal*((nx+nal-1)/nal);
		const int anw = nal*((nw+nal-1)/nal);
		const int any = nal*((ny+nal-1)/nal);
		const int pnz = bs*((nz+bs-1)/bs);
		const int pnx = bs*((nx+bs-1)/bs);
		const int pnw = bs*((nw+bs-1)/bs);
		const int pny = bs*((ny+bs-1)/bs);
		const int pnx2 = bs*((2*nx+bs-1)/bs);
		const int pnwx = bs*((nw+nx+bs-1)/bs);
		const int cnz = ncl*((nz+ncl-1)/ncl);
		const int cnx = ncl*((nx+ncl-1)/ncl);
		const int cnw = ncl*((nw+ncl-1)/ncl);
		const int cny = ncl*((ny+ncl-1)/ncl);
		const int cnx2 = 2*(ncl*((nx+ncl-1)/ncl));
		const int cnwx = ncl*((nw+nx+ncl-1)/ncl);
		const int cnwx1 = ncl*((nw+nx+1+ncl-1)/ncl);
		const int cnf = cnz<cnx+ncl ? cnx+ncl : cnz;

		const int pad = (ncl-(nx+nw)%ncl)%ncl; // packing between AGL & P
		const int cnl = nx+nw+pad+cnx;
		const int pad2 = (ncl-(nx)%ncl)%ncl; // packing between AGL & P
		const int cnl2 = cnz<cnx+ncl ? nx+pad2+cnx+ncl : nx+pad2+cnz;
	
/************************************************
* dynamical system
************************************************/	

		double *A; d_zeros(&A, nx, nx); // states update matrix

		double *B; d_zeros(&B, nx, nw); // inputs matrix

		double *b; d_zeros(&b, nx, 1); // states offset
		double *x0; d_zeros(&x0, nx, 1); // initial state

		double Ts = 0.5; // sampling time
		mass_spring_system(Ts, nx, nw, N, A, B, b, x0);
	
		for(jj=0; jj<nx; jj++)
			b[jj] = 0.0;
	
		for(jj=0; jj<nx; jj++)
			x0[jj] = 0.0;
		x0[0] = 3.5;
		x0[1] = 3.5;
	
		double *C; d_zeros(&C, ny, nx); // inputs matrix
		for(jj=0; jj<ny; jj++)
			C[jj*(ny+1)] = 1.0;

//		d_print_mat(nx, nx, A, nx);
//		d_print_mat(nx, nw, B, nx);
//		d_print_mat(ny, nx, C, ny);
//		d_print_mat(nx, 1, b, nx);
//		d_print_mat(nx, 1, x0, nx);
	
		/* packed into contiguous memory */
		double *pA; d_zeros_align(&pA, pnx, cnx);
		d_cvt_mat2pmat(nx, nx, A, nx, 0, pA, cnx);

		double *pG; d_zeros_align(&pG, pnx, cnw);
		d_cvt_mat2pmat(nx, nw, B, nx, 0, pG, cnw);
		
		double *pC; d_zeros_align(&pC, pny, cnx);
		d_cvt_mat2pmat(ny, nx, C, ny, 0, pC, cnx);
		
		double *pCA; d_zeros_align(&pCA, pnz, cnx);
		d_cvt_mat2pmat(ny, nx, C, ny, 0, pCA, cnx);
		d_cvt_mat2pmat(nx, nx, A, nx, ny, pCA+(ny/bs)*bs+ny%bs, cnx);

//		d_print_pmat(nx, nx, bs, pA, cnx);
//		d_print_pmat(nx, nw, bs, pG, cnw);
//		d_print_pmat(ny, nx, bs, pC, cnx);

/************************************************
* cost function
************************************************/	

		double *R; d_zeros(&R, nw, nw);
		for(jj=0; jj<nw; jj++)
			R[jj*(nw+1)] = 1.0;

		double *Q; d_zeros(&Q, ny, ny);
		for(jj=0; jj<ny; jj++)
			Q[jj*(ny+1)] = 1.0;

		double *Qx; d_zeros(&Qx, nx, nx);
		for(jj=0; jj<ny; jj++)
			for(ii=0; ii<ny; ii++)
				Qx[ii+nx*jj] = Q[ii+ny*jj];

		double *L0; d_zeros(&L0, nx, nx);
		for(jj=0; jj<nx; jj++)
			L0[jj*(nx+1)] = 1.0;

		double *q; d_zeros_align(&q, any, 1);
		for(jj=0; jj<ny; jj++)
			q[jj] = 0.0;

		double *r; d_zeros_align(&r, anw, 1);
		for(jj=0; jj<nw; jj++)
			r[jj] = 1.0;

		double *f; d_zeros_align(&f, anx, 1);
		for(jj=0; jj<nx; jj++)
			f[jj] = jj;//1.0; //b[jj]; //1.0;

		/* packed into contiguous memory */
		double *pR; d_zeros_align(&pR, pnw, cnw);
		d_cvt_mat2pmat(nw, nw, R, nw, 0, pR, cnw);

		double *pQ; d_zeros_align(&pQ, pny, cny);
		d_cvt_mat2pmat(ny, ny, Q, ny, 0, pQ, cny);

//		d_print_pmat(nw, nw, bs, pQ, cnw);
//		d_print_pmat(ny, ny, bs, pR, cny);

/************************************************
* compound quantities
************************************************/	
		
		double *pRG; d_zeros_align(&pRG, pnwx, cnw);
		d_cvt_mat2pmat(nw, nw, R, nw, 0, pRG, cnw);
		d_cvt_mat2pmat(nx, nw, B, nx, nw, pRG+(nw/bs)*bs*cnw+nw%bs, cnw);
		//d_print_pmat(nw+nx, nw, bs, pRG, cnw);

		double *pQA; d_zeros_align(&pQA, pnx2, cnx);
		d_cvt_mat2pmat(ny, ny, Q, ny, 0, pQA, cnx);
		d_cvt_mat2pmat(nx, nx, A, nx, nx, pQA+(nx/bs)*bs*cnx+nx%bs, cnx);
		//d_print_pmat(2*nx, cnx, bs, pQA, cnx);
		//exit(1);

/************************************************
* series of matrices
************************************************/	

		double *(hpA[N]);
		double *(hpCA[N]);
		double *(hpG[N]);
		double *(hpC[N+1]);
		double *(hpR[N]);
		double *(hpQ[N+1]);
		double *(hpLp[N+1]);
		double *(hdLp[N+1]);
		double *(hpLp2[N+1]);
		double *(hpLe[N+1]);
		double *(hq[N]);
		double *(hr[N+1]);
		double *(hf[N]);
		double *(hxe[N+1]);
		double *(hxp[N+1]);
		double *(hw[N]);
		double *(hy[N+1]);
		double *(hlam[N]);

		double *(hpRG[N]);
		double *(hpQA[N+1]);
		double *(hpGLr[N]);
		double *(hpALe[N+1]);
		double *(hrr[N]);
		double *(hqq[N+1]);
		double *(hff[N+1]);
		double *p_hrr; d_zeros_align(&p_hrr, anw, N);
		double *p_hqq; d_zeros_align(&p_hqq, anx, N+1);
		double *p_hff; d_zeros_align(&p_hff, anx, N+1);

		double *p_hxe; d_zeros_align(&p_hxe, anx, N+1);
		double *p_hxp; d_zeros_align(&p_hxp, anx, N+1);
		double *p_hw; d_zeros_align(&p_hw, anw, N);
		double *p_hy; d_zeros_align(&p_hy, any, N+1);
		double *p_hlam; d_zeros_align(&p_hlam, anx, N+1);

		double *(hq_res[N+1]);
		double *(hr_res[N]);
		double *(hf_res[N+1]);
		double *p_hq_res; d_zeros_align(&p_hq_res, anx, N+1);
		double *p_hr_res; d_zeros_align(&p_hr_res, anw, N);
		double *p_hf_res; d_zeros_align(&p_hf_res, anx, N+1);

		for(jj=0; jj<N; jj++)
			{
			hpA[jj] = pA;
			hpCA[jj] = pCA;
			hpG[jj] = pG;
			hpC[jj] = pC;
			hpR[jj] = pR;
			hpQ[jj] = pQ;
			d_zeros_align(&hpLp[jj], pnx, cnl);
			d_zeros_align(&hdLp[jj], anx, 1);
			d_zeros_align(&hpLp2[jj], pnz, cnl2);
			d_zeros_align(&hpLe[jj], pnz, cnf);
			hr[jj] = r;
			hq[jj] = q;
			hf[jj] = f;

			hpRG[jj] = pRG;
			hpQA[jj] = pQA;
			d_zeros_align(&hpGLr[jj], pnwx, cnw);
			d_zeros_align(&hpALe[jj], pnx2, cnx2);
			hrr[jj] = p_hrr+jj*anw;
			hqq[jj] = p_hqq+jj*anx;
			hff[jj] = p_hff+jj*anx;

			hxe[jj] = p_hxe+jj*anx; //d_zeros_align(&hxe[jj], anx, 1);
			hxp[jj] = p_hxp+jj*anx; //d_zeros_align(&hxp[jj], anx, 1);
			hw[jj] = p_hw+jj*anw; //d_zeros_align(&hw[jj], anw, 1);
			hy[jj] = p_hy+jj*any; //d_zeros_align(&hy[jj], any, 1);
			hlam[jj] = p_hlam+jj*anx; //d_zeros_align(&hlambda[jj], anx, 1);

			hq_res[jj] = p_hq_res+jj*anx;
			hr_res[jj] = p_hr_res+jj*anw;
			hf_res[jj] = p_hf_res+jj*anx;
			}

		hpC[N] = pC;
		hpQ[N] = pQ;
		d_zeros_align(&hpLp[N], pnx, cnl);
		d_zeros_align(&hdLp[N], anx, 1);
		d_zeros_align(&hpLp2[N], pnz, cnl2);
		d_zeros_align(&hpLe[N], pnz, cnf);
		hq[N] = q;

		// equality constraints on the states at the last stage
		double *D; d_zeros(&D, ndN, nx);
		for(ii=0; ii<ndN; ii++) D[ii*(ndN+1)] = 1;
		//D[0+ndN*0] = 1;
		//D[1+ndN*(nx-1)] = 1;
		double *d; d_zeros_align(&d, ndN, 1);
		for(ii=0; ii<ndN; ii++) d[ii] = ii;
		//d[0] = 1;
		//d[1] = 0;
		const int pnxdN = bs*((nx+ndN+bs-1)/bs);
		double *pCtQC; d_zeros_align(&pCtQC, pnxdN, cnx);
		d_cvt_mat2pmat(ny, ny, Q, ny, 0, pCtQC, cnx);
		d_cvt_mat2pmat(ndN, nx, D, ndN, nx, pCtQC+nx/bs*bs*cnx+nx%bs, cnx);
		//d_print_pmat(nx+ndN, nx, bs, pCtRC, cnx);
		hpQA[N] = pCtQC; // there is not A_N
		d_zeros_align(&hpALe[N], pnxdN, cnx2); // there is not A_N: pnx not pnx2
		hqq[N] = p_hqq+N*anx;
		hff[N] = p_hff+N*anx;
		const int pndN = bs*((ndN+bs-1)/bs);
		const int cndN = ncl*((ndN+ncl-1)/ncl);
		double *Ld; d_zeros_align(&Ld, pndN, cndN);
		double *d_res; d_zeros_align(&d_res, pndN, 1);



		hxe[N] = p_hxe+N*anx; //d_zeros_align(&hxe[N], anx, 1);
		hxp[N] = p_hxp+N*anx; //d_zeros_align(&hxp[N], anx, 1);
		hy[N] = p_hy+N*any; //d_zeros_align(&hy[N], any, 1);
		hlam[N] = p_hlam+N*anx; //d_zeros_align(&hlambda[jj], anx, 1);

		hf_res[N] = p_hf_res+N*anx;
		hq_res[N] = p_hq_res+N*anx;

		// initialize hpLp[0] with the cholesky factorization of /Pi_p
		d_cvt_mat2pmat(nx, nx, L0, nx, 0, hpLp[0]+(nx+nw+pad)*bs, cnl);
		for(ii=0; ii<nx; ii++) hdLp[0][ii] = 1.0/L0[ii*(nx+1)];
		d_cvt_mat2pmat(nx, nx, L0, nx, ny, hpLp2[0]+(ny/bs)*bs+ny%bs+(nx+pad2+ny)*bs, cnl2);
		dtrtr_l_lib(nx, ny, hpLp2[0]+(ny/bs)*bs*cnl2+ny%bs+(nx+pad2+ny)*bs, cnl2, hpLp2[0]+(nx+pad2+ncl)*bs, cnl2);	
		//d_print_pmat(nx, cnl, bs, hpLp[0], cnl);
		//d_print_pmat(nz, cnl2, bs, hpLp2[0], cnl2);

		// buffer for L0
		double *pL0; d_zeros_align(&pL0, pnx, cnx);
		d_cvt_mat2pmat(nx, nx, L0, nx, 0, pL0, cnx);
		// invert L0 in hpALe[0]
		dtrinv_lib(nx, pL0, cnx, hpALe[0], cnx2);
		double *pL0_inv; d_zeros_align(&pL0_inv, pnx, cnx);
		dtrinv_lib(nx, pL0, cnx, pL0_inv, cnx);
		//d_print_pmat(nx, nx, bs, pL0, cnx);
		//d_print_pmat(nx, nx, bs, pL0_inv, cnx);
		//d_print_pmat(pnx2, cnx2, bs, hpALe[0], cnx2);
		//exit(1);

		//double *work; d_zeros_align(&work, pny*cnx+pnz*cnz+anz+pnz*cnf+pnw*cnw, 1);
		double *work; d_zeros_align(&work, 2*pny*cnx+anz+pnw*cnw+pnx*cnx, 1);
		//printf("\nciao %d %d %d %d %d %d\n", pny, cnx, anz, pnw, cnw, pnx);

		double *work2; d_zeros_align(&work2, 2*pny*cnx+pnw*cnw+pnx*cnw+2*pnx*cnx+anz, 1);

		double *work3; d_zeros_align(&work3, pnx*cnl+anx, 1);
		double *work4; d_zeros_align(&work4, 4*anx+2*(anx+anw), 1);
//		for(jj=0; jj<2*pny*cnx+anz+pnw*cnw+pnx*cnx; jj++)
//			work[jj] = -100.0;

		// measurements
		for(jj=0; jj<=N; jj++)
			for(ii=0; ii<ny; ii++)
				hy[jj][ii] = yy[jj*ny+ii];

		//d_print_mat(ny, N+1, hy[0], any);

		// initial guess
		for(ii=0; ii<nx; ii++)
			x0[ii] = 0.0;
		for(ii=0; ii<nx; ii++)
			hxp[0][ii] = x0[ii];



		// information filter - solution
		double *y_temp; d_zeros_align(&y_temp, any, 1);
		for(ii=0; ii<N; ii++) for(jj=0; jj<nw; jj++) hrr[ii][jj] = r[jj];
		for(ii=0; ii<N; ii++) for(jj=0; jj<nx; jj++) hff[ii][jj] = f[jj];
		for(jj=0; jj<ndN; jj++) hff[N][jj] = d[jj];
		for(ii=0; ii<=N; ii++) 
			{
			for(jj=0; jj<ny; jj++) y_temp[jj] = - q[jj];
			//d_print_mat(1, ny, y_temp, 1);
			dsymv_lib(ny, ny, hpQ[ii], cny, hy[ii], y_temp, y_temp, -1);
			//d_print_mat(1, ny, y_temp, 1);
			dgemv_t_lib(ny, nx, hpC[ii], cnx, y_temp, hqq[ii], hqq[ii], 0);
			//d_print_mat(1, nx, hqq[ii], 1);
			//if(ii==9)
			//exit(1);
			}
		//exit(1);




/************************************************
* new low-level mhe_if interface
************************************************/	

		int nrows = pnx>pnw ? 2*pnx : pnx+pnw;
		int ncols = cnwx1;

		double *pQRAG; d_zeros_align(&pQRAG, nrows, ncols);

		if(nx>=nw)
			{
			d_cvt_mat2pmat(ny, ny, Q, ny, 0, pQRAG, cnwx1);
			d_cvt_mat2pmat(nx, nx, A, nx, 0, pQRAG+pnx*cnwx1, cnwx1);
			d_cvt_mat2pmat(nw, nw, R, nw, 0, pQRAG+(pnx-pnw)*cnwx1+nx*bs, cnwx1);
			d_cvt_mat2pmat(nx, nw, B, nx, 0, pQRAG+pnx*cnwx1+nx*bs, cnwx1);
			//d_print_pmat(nrows, ncols, bs, pQRAG, ncols);
			if(nx>pnx-nx)
				d_cvt_mat2pmat(pnx-nx, nx, A+(nx-pnx+nx), nx, nx, pQRAG+nx/bs*bs*cnwx1+nx%bs, cnwx1);
			else
				d_cvt_mat2pmat(nx, nx, A, nx, nx, pQRAG+nx/bs*bs*cnwx1+nx%bs, cnwx1);
			if(nx>pnw-nw)
				d_cvt_mat2pmat(pnw-nw, nw, B+(nx-pnw+nw), nx, nw, pQRAG+(pnx-pnw+nw/bs*bs)*cnwx1+nw%bs+nx*bs, cnwx1);
			else
				d_cvt_mat2pmat(nx, nw, B, nx, nw, pQRAG+(pnx-pnw+nw/bs*bs)*cnwx1+nw%bs+nx*bs, cnwx1);
			//d_print_pmat(nrows, ncols, bs, pQRAG, ncols);
			}
		else
			{
			d_cvt_mat2pmat(ny, ny, Q, ny, 0, pQRAG+(pnw-pnx)*cnwx1, cnwx1);
			d_cvt_mat2pmat(nx, nx, A, nx, 0, pQRAG+pnw*cnwx1, cnwx1);
			d_cvt_mat2pmat(nw, nw, R, nw, 0, pQRAG+nx*bs, cnwx1);
			d_cvt_mat2pmat(nx, nw, B, nx, 0, pQRAG+pnw*cnwx1+nx*bs, cnwx1);
			//d_print_pmat(nrows, ncols, bs, pQRAG, ncols);
			if(nx>pnx-nx)
				d_cvt_mat2pmat(pnx-nx, nx, A+(nx-pnx+nx), nx, nx, pQRAG+(pnw-pnx+nx/bs*bs)*cnwx1+nx%bs, cnwx1);
			else
				d_cvt_mat2pmat(nx, nx, A, nx, nx, pQRAG+(pnw-pnx+nx/bs*bs)*cnwx1+nx%bs, cnwx1);
			if(nx>pnw-nw)
				d_cvt_mat2pmat(pnw-nw, nw, B+(nx-pnw+nw), nx, nw, pQRAG+nw/bs*bs*cnwx1+nw%bs+nx*bs, cnwx1);
			else
				d_cvt_mat2pmat(nx, nw, B, nx, nw, pQRAG+nw/bs*bs*cnwx1+nw%bs+nx*bs, cnwx1);
			//d_print_pmat(nrows, ncols, bs, pQRAG, ncols);
			}

		double *pQD; d_zeros_align(&pQD, pnx+pndN, cnx);
		d_cvt_mat2pmat(ny, ny, Q, ny, 0, pQD, cnx);
		d_cvt_mat2pmat(ndN, nx, D, ndN, 0, pQD+pnx*cnx, cnx);
		//d_print_pmat(pnx+pndN, cnx, bs, pQD, cnx);
		if(ndN>pnx-nx)
			d_cvt_mat2pmat(pnx-nx, nx, D+(ndN-pnx+nx), ndN, nx, pQD+nx/bs*bs*cnx+nx%bs, cnx);
		else
			d_cvt_mat2pmat(ndN, nx, D, ndN, nx, pQD+nx/bs*bs*cnx+nx%bs, cnx);
		//d_print_pmat(pnx+pndN, cnx, bs, pQD, cnx);
		//exit(1);




		double *(hpQRAG[N+1]);
		double *(hpLAG[N+1]);
		double *(hpLe2[N+1]);

		for(ii=0; ii<N; ii++)	
			{
			hpQRAG[ii] = pQRAG;
			d_zeros_align(&hpLAG[ii], nrows, ncols);
			d_zeros_align(&hpLe2[ii], pnx, cnx);
			}
		hpQRAG[N] = pQD;
		d_zeros_align(&hpLAG[N], pnx+pndN, cnx);
		d_zeros_align(&hpLe2[N], pnx, cnx);
		d_cvt_mat2pmat(nx, nx, L0, nx, 0, hpLe2[0], cnx);
		//d_print_pmat(nx, nx, bs, hpLe2[0], cnx);



		double **dummy;
#if 0

		struct timeval tv10, tv11, tv12;

		// double precision
		gettimeofday(&tv10, NULL); // start

		for(ii=0; ii<1; ii++)
		//for(ii=0; ii<nrep; ii++)
			{

			d_ric_trf_mhe_if(nx, nw, ndN, N, hpQRAG, diag_R, hpLe2, hpLAG, Ld, work3);
			//d_ric_trf_mhe_if(nx, nw, ndN, N, hpQA, hpRG, diag_R, hpALe, hpGLr, Ld, work3);

			}

		gettimeofday(&tv11, NULL); // stop

		for(ii=0; ii<1; ii++)
		//for(ii=0; ii<nrep; ii++)
			{

			d_ric_trs_mhe_if(nx, nw, ndN, N, hpLe2, hpLAG, Ld, hqq, hrr, hff, hxp, hxe, hw, hlam, work3);

			}

		gettimeofday(&tv12, NULL); // stop

		float time_trf_mhe_if_new = (float) (tv11.tv_sec-tv10.tv_sec)/(nrep+0.0)+(tv11.tv_usec-tv10.tv_usec)/(nrep*1e6);
		float time_trs_mhe_if_new = (float) (tv12.tv_sec-tv11.tv_sec)/(nrep+0.0)+(tv12.tv_usec-tv11.tv_usec)/(nrep*1e6);

		printf("\ntime = %e\t%e\n\n", time_trf_mhe_if_new, time_trs_mhe_if_new);




		//exit(1);
#endif


/************************************************
* reference code
************************************************/	

		double *(hA[N]);
		double *(hG[N]);
		double *(hQ[N+1]);
		double *(hR[N]);
		double *(hAGU[N]);
		double *(hUp[N+1]);
		double *(hUe[N+1]);
		double *(hUr[N]);
		double *Ud;
		double *work_ref;

		for(ii=0; ii<N; ii++)
			{
			hA[ii] = A;
			hG[ii] = B;
			hQ[ii] = Qx;
			hR[ii] = R;
			d_zeros(&hAGU[ii], nx, nx+nw);
			d_zeros(&hUp[ii], nx, nx);
			d_zeros(&hUe[ii], nx, nx);
			d_zeros(&hUr[ii], nw, nw);
			}
		hA[N] = D;
		hQ[N] = Qx;
		d_zeros(&hAGU[N], ndN, nx);
		d_zeros(&hUp[N], nx, nx);
		d_zeros(&hUe[N], nx, nx);
		d_zeros(&Ud, ndN, ndN);
		d_zeros(&work_ref, nx+nw, 1);

		for(ii=0; ii<nx*nx; ii++)
			hUp[0][ii] = L0[ii];



		#if 0

		printf("\nfactorization\n");
		d_ric_trf_mhe_if_blas( nx, nw, ndN, N, hA, hG, hQ, hR, hAGU, hUp, hUe, hUr, Ud);

		printf("\nsolution\n");
		d_ric_trs_mhe_if_blas( nx, nw, ndN, N, hAGU, hUp, hUe, hUr, Ud, hqq, hrr, hff, hxp, hxe, hw, hlam, work_ref);

		//d_print_mat(nx, nx, hUe[N], nx);
		//exit(1);

		#endif




/************************************************
* high-level interface
************************************************/	

#if 0
		int kk;

		double *AA; d_zeros(&AA, nx, nx*N);
		//for(ii=0; ii<N; ii++) for(jj=0; jj<nx; jj++) for(ll=0; ll<nx; ll++) AA[ll+nx*jj+nx*nx*ii] = A[ll+nx*jj];
		for(ii=0; ii<N; ii++) for(jj=0; jj<nx; jj++) for(kk=0; kk<nx; kk++) AA[jj+nx*kk+nx*nx*ii] = A[kk+nx*jj];

		double *GG; d_zeros(&GG, nx, nw*N);
		//for(ii=0; ii<N; ii++) for(jj=0; jj<nw; jj++) for(ll=0; ll<nx; ll++) GG[ll+nx*jj+nx*nw*ii] = B[ll+nx*jj];
		for(ii=0; ii<N; ii++) for(jj=0; jj<nw; jj++) for(kk=0; kk<nx; kk++) GG[jj+nw*kk+nx*nw*ii] = B[kk+nx*jj];

		double *ff; d_zeros(&ff, nx, N);
		for(ii=0; ii<N; ii++) for(jj=0; jj<nx; jj++) ff[jj+nx*ii] = f[jj];

		double *DD; d_zeros(&DD, ndN, nx);
		//for(jj=0; jj<nx; jj++) for(ll=0; ll<ndN; ll++) DD[ll+ndN*jj] = D[ll+ndN*jj];
		for(jj=0; jj<nx; jj++) for(kk=0; kk<ndN; kk++) DD[jj+nx*kk] = D[kk+ndN*jj];

		double *dd; d_zeros(&dd, ndN, 1);
		for(kk=0; kk<ndN; kk++) dd[kk] = d[kk];

		double *RR; d_zeros(&RR, nw, nw*N);
		for(ii=0; ii<N; ii++) for(jj=0; jj<nw*nw; jj++) RR[jj+nw*nw*ii] = R[jj];

		double *QQ; d_zeros(&QQ, nx, nx*N);
		for(ii=0; ii<N; ii++) 
			{
			for(jj=0; jj<ny; jj++) for(kk=0; kk<ny; kk++) QQ[kk+nx*jj+nx*nx*ii] = Q[kk+ny*jj];
			//for(jj=ny; jj<nx; jj++) QQ[jj+nx*jj+nx*nx*ii] = 1e-8;
			}

		double *Qf; d_zeros(&Qf, nx, nx);
		for(jj=0; jj<ny; jj++) for(kk=0; kk<ny; kk++) Qf[kk+nx*jj] = Q[kk+ny*jj];

		double *rr; d_zeros(&rr, nw, N);
		for(ii=0; ii<N; ii++) for(jj=0; jj<nw; jj++) rr[jj+nw*ii] = r[jj];

		double *qq; d_zeros(&qq, nx, N);
		for(ii=0; ii<N; ii++) for(jj=0; jj<ny; jj++) qq[jj+nx*ii] = q[jj];
		double *yy_tmp; d_zeros_align(&yy_tmp, any, 1);
		for(ii=0; ii<N; ii++) 
			{
			for(jj=0; jj<ny; jj++) yy_tmp[jj] = - q[jj];
			dsymv_lib(ny, ny, hpQ[ii], cny, hy[ii], yy_tmp, -1);
			dgemv_t_lib(ny, nx, hpC[ii], cnx, yy_tmp, &qq[ii*nx], 0);
			}

		double *qf; d_zeros(&qf, nx, 1);
//		for(jj=0; jj<ny; jj++) qf[jj] = q[jj];
//		if(ndN>0) 
//			{
			for(jj=0; jj<ny; jj++) yy_tmp[jj] = - q[jj];
			dsymv_lib(ny, ny, hpQ[N], cny, hy[N], yy_tmp, -1);
			dgemv_t_lib(ny, nx, hpC[N], cnx, yy_tmp, qf, 0);
//			}

		double *xx0; d_zeros(&xx0, nx, 1);

		double *LL0; d_zeros(&LL0, nx, nx);

		double *xxe; d_zeros(&xxe, nx, N+1);

		double *LLe; d_zeros(&LLe, nx, nx);

		double *ww; d_zeros(&ww, nw, N);

		double *llam; d_zeros(&llam, nx, N+1);

		double *work_high_level; d_zeros(&work_high_level, hpmpc_ric_mhe_if_dp_work_space(nx, nw, ny, ndN, N), 1);

		double *dummy0;

		struct timeval tv00, tv01;

		int error_code;

		printf("\nhigh-level\n");

		// double precision
		gettimeofday(&tv00, NULL); // start

		for(ii=0; ii<nrep; ii++)
			{

			for(jj=0; jj<nx; jj++) xx0[jj] = x0[jj];
			for(jj=0; jj<nx*nx; jj++) LL0[jj] = L0[jj];

			//error_code = fortran_order_riccati_mhe_if( 'd', 2, nx, nw, 0, ndN, N, AA, GG, dummy, ff, DD, dd, RR, QQ, Qf, rr, qq, qf, dummy, xx0, LL0, xxe, LLe, ww, llam, work_high_level);
			error_code = c_order_riccati_mhe_if( 'd', 2, nx, nw, 0, ndN, N, AA, GG, dummy0, ff, DD, dd, RR, QQ, Qf, rr, qq, qf, dummy0, xx0, LL0, xxe, LLe, ww, llam, work_high_level);

			//if(error_code)
			//	break;

			}

		gettimeofday(&tv01, NULL); // stop

		float time_mhe_if_high_level = (float) (tv01.tv_sec-tv00.tv_sec)/(nrep+0.0)+(tv01.tv_usec-tv00.tv_usec)/(nrep*1e6);

		printf("\nhigh-level interface for MHE_if\n\nerror_code: %d, time = %e\n\n", error_code, time_mhe_if_high_level);

		//d_print_mat(nx, N+1, xxe, nx);
		//d_print_mat(nw, N, ww, nw);

		free(AA);
		free(GG);
		free(ff);
		free(DD);
		free(dd);
		free(RR);
		free(QQ);
		free(Qf);
		free(rr);
		free(qq);
		free(qf);
		free(xx0);
		free(LL0);
		free(xxe);
		free(LLe);
		free(ww);
		free(llam);
		free(work_high_level);
		free(yy_tmp);

		//exit(1);
#endif


/************************************************
* call the solver
************************************************/	

		//d_print_mat(nx, nx, A, nx);
		//d_print_mat(nx, nw, B, nx);

		//d_ric_trf_mhe_test(nx, nw, ny, N, hpA, hpG, hpC, hpLp, hpQ, hpR, hpLe, work);
		d_ric_trf_mhe(nx, nw, ny, N, hpA, hpG, hpC, hpLp, hdLp, hpR, hpQ, hpLe, work);

		// estimation
		d_ric_trs_mhe(nx, nw, ny, N, hpA, hpG, hpC, hpLp, hdLp, hpR, hpQ, hpLe, hr, hq, hf, hxp, hxe, hw, hy, 0, hlam, work);

#if 0
		// print solution
		printf("\nx_e\n");
		d_print_mat(nx, N+1, hxe[0], anx);
#endif
	
		// smooth estimation
		d_ric_trs_mhe(nx, nw, ny, N, hpA, hpG, hpC, hpLp, hdLp, hpR, hpQ, hpLe, hr, hq, hf, hxp, hxe, hw, hy, 1, hlam, work);

		//d_print_pmat(nx, nx, bs, hpLp[N-1]+(nx+nw+pad)*bs, cnl);
		//d_print_pmat(nx, nx, bs, hpLp[N]+(nx+nw+pad)*bs, cnl);
		//d_print_pmat(nx, nx, bs, hpLe[N-1]+ncl*bs, cnf);
		//d_print_pmat(nx, nx, bs, hpLe[N]+ncl*bs, cnf);

#if 1
		printf("\nx_s\n");
		//d_print_mat(nx, N+1, hxp[0], anx);
		d_print_mat(nw, N, hw[0], anw);
		d_print_mat(nx, N+1, hxe[0], anx);
		//d_print_mat(nx, N, hlam[0], anx);
#endif

		// information filter - factorization
		//d_ric_trf_mhe_if(nx, nw, ndN, N, hpQA, hpRG, diag_R, hpALe, hpGLr, Ld, work3);
		d_ric_trf_mhe_if(nx, nw, ndN, N, hpQRAG, diag_R, hpLe2, hpLAG, Ld, work3);

		// information filter - solution
		//d_ric_trs_mhe_if(nx, nw, ndN, N, hpALe, hpGLr, Ld, hqq, hrr, hff, hxp, hxe, hw, hlam, work3);
		d_ric_trs_mhe_if(nx, nw, ndN, N, hpLe2, hpLAG, Ld, hqq, hrr, hff, hxp, hxe, hw, hlam, work3);
		//d_ric_trs_mhe(nx, nw, ny, N, hpA, hpG, hpC, hpLp, hdLp, hpQ, hpR, hpLe, hq, hr, hf, hxp, hxe, hw, hy, 1, hlam, work);

		//d_print_pmat(nx, nx, bs, hpALe[N-1], cnx2);
		//d_print_pmat(nx, nx, bs, hpALe[N], cnx2);
		//d_print_pmat(nx, nx, bs, hpALe[N-2]+cnx*bs, cnx2);
		//d_print_pmat(nx, nx, bs, hpALe[N-1]+cnx*bs, cnx2);
		//d_print_pmat(nx, nx, bs, hpALe[N]+cnx*bs, cnx2);
		//d_print_pmat(nx, nx, bs, hpRA[N], cnx);

#if 1
		printf("\nx_s_if\n");
		//d_print_mat(nx, N+1, hxp[0], anx);
		d_print_mat(nw, N, hw[0], anw);
		d_print_mat(nx, N+1, hxe[0], anx);
		//d_print_mat(nx, N, hlam[0], anx);
		//exit(1);
#endif

		//d_print_pmat(nw, nw, bs, hpQ[0], cnw);
		//d_print_pmat(nx, nw, bs, hpG[0], cnw);
		//d_print_mat(nw, 1, hq[0], nw);
		//d_print_mat(nw, 1, hw[0], nw);
		//d_print_mat(nx, 1, hlam[0], nx);
		//exit(3);

#if 1
		int nZ = nw+nx+1;
		int pnZ = (nw+nx+1+bs-1)/bs*bs;
		int cnZ = (nw+nx+1+ncl-1)/ncl*ncl;

		int cnL = cnZ>cnx+ncl ? cnZ : cnx+ncl;

		double *(hpRSQrq[N+1]); 
		for(ii=0; ii<=N; ii++)
			{
			d_zeros_align(&hpRSQrq[ii], pnZ, cnZ);
			d_cvt_mat2pmat(nw, nw, R, nw, 0, hpRSQrq[ii], cnZ);
			d_cvt_mat2pmat(ny, ny, Q, ny, nw, hpRSQrq[ii]+nw/bs*bs*cnZ+nw%bs+nw*bs, cnZ);
			d_cvt_mat2pmat(1, nw, r, 1, nw+nx, hpRSQrq[ii]+(nw+nx)/bs*bs*cnZ+(nw+nx)%bs, cnZ);
			d_cvt_mat2pmat(1, nx, hqq[ii], 1, nw+nx, hpRSQrq[ii]+(nw+nx)/bs*bs*cnZ+(nw+nx)%bs+nw*bs, cnZ);
			//d_print_pmat(nZ, nZ, bs, hpRSQrq[ii], cnZ);
			}

		double *pP0; d_zeros_align(&pP0, pnx, cnx);
		d_cvt_mat2pmat(nx, nx, L0, nx, 0, pP0, cnx);
		//d_print_pmat(nx, nx, bs, pP0, cnx);
		dgead_lib(nx, nx, 1.0, 0, pP0, cnx, nw, hpRSQrq[0]+nw/bs*bs*cnZ+nw%bs+nw*bs, cnZ); 
		//d_print_pmat(nZ, nZ, bs, hpRSQrq[0], cnZ);

		double *pBAbt; d_zeros_align(&pBAbt, pnZ, cnx);
		d_cvt_tran_mat2pmat(nx, nw, B, nx, 0, pBAbt, cnx);
		d_cvt_tran_mat2pmat(nx, nx, A, nx, nw, pBAbt+nw/bs*bs*cnx+nw%bs, cnx);
		d_cvt_mat2pmat(1, nx, f, 1, nw+nx, pBAbt+(nw+nx)/bs*bs*cnx+(nw+nx)%bs, cnx);
		//d_print_pmat(nZ, nx, bs, pBAbt, cnx);

		double *(hpBAbt[N]);
		for(ii=0; ii<N; ii++)
			{
			hpBAbt[ii] = pBAbt;
			}

		double *(hpLam[N+1]);
		for(ii=0; ii<=N; ii++)
			{
			d_zeros_align(&hpLam[ii], pnZ, cnL);
			}

		double *work_ric; d_zeros_align(&work_ric, pnZ, cnx);
		double *diag_ric; d_zeros_align(&diag_ric, pnZ, 1);

		double *hux_mat; d_zeros_align(&hux_mat, pnZ, N+1);
		double *(hux[N+1]);
		for(ii=0; ii<=N; ii++)
			{
			hux[ii] = hux_mat+ii*pnZ;
			}

		double **pdummy;

		d_back_ric_sv(N, nx, nw, hpBAbt, hpRSQrq, 0, pdummy, pdummy, 0, hux, hpLam, work_ric, diag_ric, 0, pdummy, 0, pdummy, 0, 0, 0, pdummy, pdummy, pdummy);

		d_print_mat(nw, N+1, hux_mat, pnZ);
		d_print_mat(nx, N+1, hux_mat+nw, pnZ);

		exit(1);

#endif

		// compute residuals
		double *p0; d_zeros_align(&p0, anx, 1);
		double *x_temp; d_zeros_align(&x_temp, anx, 1);
		dtrmv_u_t_lib(nx, pL0_inv, cnx, x0, x_temp, 0);
		dtrmv_u_n_lib(nx, pL0_inv, cnx, x_temp, p0, 0);
		d_res_mhe_if(nx, nw, ndN, N, hpQA, hpRG, pL0_inv, hqq, hrr, hff, p0, hxe, hw, hlam, hq_res, hr_res, hf_res, work4);

//		printf("\nprint residuals\n\n");
//		d_print_mat(nx, N+1, hq_res[0], anx);
//		d_print_mat(nw, N, hr_res[0], anw);
//		d_print_mat(nx, N, hf_res[0], anx);
//		d_print_mat(ndN, 1, hf_res[0]+N*anx, anx);

		//return 0;
		//exit(1);

		if(0 && PRINTRES)
			{
			// print solution
			printf("\nx_p\n");
			d_print_mat(nx, N+1, hxp[0], anx);
			printf("\nx_s\n");
			d_print_mat(nx, N+1, hxe[0], anx);
			printf("\nw\n");
			d_print_mat(nw, N+1, hw[0], anw);
			//printf("\nL_p\n");
			//d_print_pmat(nx, nx, bs, hpLp[0]+(nx+nw+pad)*bs, cnl);
			//d_print_mat(1, nx, hdLp[0], 1);
			//d_print_pmat(nx, nx, bs, hpLp[1]+(nx+nw+pad)*bs, cnl);
			//d_print_mat(1, nx, hdLp[1], 1);
			//d_print_pmat(nx, nx, bs, hpLp[2]+(nx+nw+pad)*bs, cnl);
			//d_print_mat(1, nx, hdLp[2], 1);
			//d_print_pmat(nx, nx, bs, hpLp[N]+(nx+nw+pad)*bs, cnl);
			//d_print_mat(1, nx, hdLp[N], 1);
			//printf("\nL_p\n");
			//d_print_pmat(nz, nz, bs, hpLp2[0]+(nx+pad2)*bs, cnl2);
			//d_print_pmat(nz, nz, bs, hpLp2[1]+(nx+pad2)*bs, cnl2);
			//d_print_pmat(nz, nz, bs, hpLp2[2]+(nx+pad2)*bs, cnl2);
			//printf("\nL_e\n");
			//d_print_pmat(nz, nz, bs, hpLe[0], cnf);
			//d_print_pmat(nz, nz, bs, hpLe[1], cnf);
			//d_print_pmat(nz, nz, bs, hpLe[2], cnf);
			//d_print_pmat(nx, nx, bs, hpA[0], cnx);
			}


		// timing 
		struct timeval tv0, tv1, tv2, tv3, tv4, tv5, tv6, tv7, tv8;

		// double precision
		gettimeofday(&tv0, NULL); // start

		// factorize
		for(rep=0; rep<nrep; rep++)
			{
			//d_ric_trf_mhe_test(nx, nw, ny, N, hpA, hpG, hpC, hpLp, hpQ, hpR, hpLe, work);
			d_ric_trf_mhe(nx, nw, ny, N, hpA, hpG, hpC, hpLp, hdLp, hpR, hpQ, hpLe, work);
			}

		gettimeofday(&tv1, NULL); // start

		// solve
		for(rep=0; rep<nrep; rep++)
			{
			d_ric_trs_mhe(nx, nw, ny, N, hpA, hpG, hpC, hpLp, hdLp, hpR, hpQ, hpLe, hr, hq, hf, hxp, hxe, hw, hy, 1, hlam, work);
			}

		gettimeofday(&tv2, NULL); // start

		// factorize
		for(rep=0; rep<nrep; rep++)
			{
			//d_print_pmat(nx, nx, bs, hpLe[N]+(ncl)*bs, cnf);
			//d_print_pmat(nx, nx, bs, hpLp[N]+(nx+nw+pad)*bs, cnl);
			//d_ric_trf_mhe_test(nx, nw, ny, N, hpA, hpG, hpC, hpLp, hpQ, hpR, hpLe, work);
			d_ric_trf_mhe_end(nx, nw, ny, N, hpCA, hpG, hpC, hpLp2, hpR, hpQ, hpLe, work2);
			}

		gettimeofday(&tv3, NULL); // start

		// solve
		for(rep=0; rep<nrep; rep++)
			{
			d_ric_trs_mhe_end(nx, nw, ny, N, hpA, hpG, hpC, hpLp2, hpR, hpQ, hpLe, hr, hq, hf, hxp, hxe, hy, work2);
			}

		gettimeofday(&tv4, NULL); // start

		// factorize information filter
		for(rep=0; rep<nrep; rep++)
			{
			//d_ric_trf_mhe_if(nx, nw, ndN, N, hpQA, hpRG, diag_R, hpALe, hpGLr, Ld, work3);
			d_ric_trf_mhe_if(nx, nw, ndN, N, hpQRAG, diag_R, hpLe2, hpLAG, Ld, work3);
			}

		gettimeofday(&tv5, NULL); // start

		// solve information filter
		for(rep=0; rep<nrep; rep++)
			{
			//d_ric_trs_mhe_if(nx, nw, ndN, N, hpALe, hpGLr, Ld, hqq, hrr, hff, hxp, hxe, hw, hlam, work3);
			d_ric_trs_mhe_if(nx, nw, ndN, N, hpLe2, hpLAG, Ld, hqq, hrr, hff, hxp, hxe, hw, hlam, work3);
			}

		gettimeofday(&tv6, NULL); // start

		// factorize information filter (reference BLAS version, only if a REF_BLAS_* target is defined)
		for(rep=0; rep<nrep; rep++)
			{
#if defined(REF_BLAS_OPENBLAS) || defined(REF_BLAS_BLIS) || defined(REF_BLAS_NETLIB)
			//d_ric_trf_mhe_if_blas( nx, nw, ndN, N, hA, hG, hQ, hR, hAGU, hUp, hUe, hUr);
			d_ric_trf_mhe_if_blas( nx, nw, ndN, N, hA, hG, hQ, hR, hAGU, hUp, hUe, hUr, Ud);
#endif
			}

		gettimeofday(&tv7, NULL); // start

		// solution information filter
		for(rep=0; rep<nrep; rep++)
			{
#if defined(REF_BLAS_OPENBLAS) || defined(REF_BLAS_BLIS) || defined(REF_BLAS_NETLIB)
			d_ric_trs_mhe_if_blas( nx, nw, ndN, N, hAGU, hUp, hUe, hUr, Ud, hqq, hrr, hff, hxp, hxe, hw, hlam, work_ref);
#endif
			}

		gettimeofday(&tv8, NULL); // start

		float Gflops_max = flops_max * GHz_max;

		float time_trf = (float) (tv1.tv_sec-tv0.tv_sec)/(nrep+0.0)+(tv1.tv_usec-tv0.tv_usec)/(nrep*1e6);
		float time_trs = (float) (tv2.tv_sec-tv1.tv_sec)/(nrep+0.0)+(tv2.tv_usec-tv1.tv_usec)/(nrep*1e6);
		float time_trf_end = (float) (tv3.tv_sec-tv2.tv_sec)/(nrep+0.0)+(tv3.tv_usec-tv2.tv_usec)/(nrep*1e6);
		float time_trs_end = (float) (tv4.tv_sec-tv3.tv_sec)/(nrep+0.0)+(tv4.tv_usec-tv3.tv_usec)/(nrep*1e6);
		float time_trf_if = (float) (tv5.tv_sec-tv4.tv_sec)/(nrep+0.0)+(tv5.tv_usec-tv4.tv_usec)/(nrep*1e6);
		float time_trs_if = (float) (tv6.tv_sec-tv5.tv_sec)/(nrep+0.0)+(tv6.tv_usec-tv5.tv_usec)/(nrep*1e6);
		float time_trf_if_blas = (float) (tv7.tv_sec-tv6.tv_sec)/(nrep+0.0)+(tv7.tv_usec-tv6.tv_usec)/(nrep*1e6);
		float time_trs_if_blas = (float) (tv8.tv_sec-tv7.tv_sec)/(nrep+0.0)+(tv8.tv_usec-tv7.tv_usec)/(nrep*1e6);

		float flop_trf_if = N*(10.0/3.0*nx*nx*nx+nx*nx*nw)+2.0/3.0*nx*nx*nx+ndN*nx*nx+ndN*ndN*nx+1.0/3.0*ndN*ndN*ndN;
		if(diag_R==0)
			flop_trf_if += N*(nx*nw*nw+1.0/3.0*nw*nw*nw);
		else
			flop_trf_if += N*(nx*nw+1.0/2.0*nw*nw);

		float Gflops_trf_if = flop_trf_if*1e-9/time_trf_if;
		float Gflops_trf_if_blas = flop_trf_if*1e-9/time_trf_if_blas;

		if(ll==0)
			{
			printf("\nnx\tnw\tny\tN\ttrf time\ttrs time\ttrf_e time\ttrs_e time\ttrf_if time\ttrf_if Gflops\ttrf_if percent\ttrs_if time\ttrf_if BLAS\tGflops\t\tpercent\t\ttrs_if BLAS\n\n");
//			fprintf(f, "\nnx\tnu\tN\tsv time\t\tsv Gflops\tsv %%\t\ttrs time\ttrs Gflops\ttrs %%\n\n");
			}
		printf("%d\t%d\t%d\t%d\t%e\t%e\t%e\t%e\t%e\t%f\t%f\t%e\t%e\t%f\t%f\t%e\n", nx, nw, ny, N, time_trf, time_trs, time_trf_end, time_trs_end, time_trf_if, Gflops_trf_if, 100*Gflops_trf_if/Gflops_max, time_trs_if, time_trf_if_blas, Gflops_trf_if_blas, 100*Gflops_trf_if_blas/Gflops_max, time_trs_if_blas);


#if 0
		return 0;


		// moving horizon test

		// window size
		N = 20;

		double *(hhxe[N+1]);
		double *(hhxp[N+1]);
		double *(hhw[N]);
		double *(hhy[N+1]);
		double *(hhlam[N]);

		double *p_hhxe; d_zeros_align(&p_hhxe, anx, N+1);
		double *p_hhxp; d_zeros_align(&p_hhxp, anx, N+1);
		double *p_hhw; d_zeros_align(&p_hhw, anw, N);
		double *p_hhlam; d_zeros_align(&p_hhlam, anx, N);

		// shift measurements and initial prediction
		for(ii=0; ii<N; ii++)
			{
			hhxe[ii] = p_hhxe+ii*anx; //d_zeros_align(&hxe[jj], anx, 1);
			hhxp[ii] = p_hhxp+ii*anx; //d_zeros_align(&hxp[jj], anx, 1);
			hhw[ii] = p_hhw+ii*anw; //d_zeros_align(&hw[jj], anw, 1);
			hhy[ii] = hy[ii]; //d_zeros_align(&hy[jj], any, 1);
			hhlam[ii] = p_hhlam+ii*anx; //d_zeros_align(&hlam[jj], anx, 1);
			}
		hhxe[N] = p_hhxe+N*anx; //d_zeros_align(&hxe[jj], anx, 1);
		hhxp[N] = p_hhxp+N*anx; //d_zeros_align(&hxp[jj], anx, 1);
		hhy[N] = hy[N]; //d_zeros_align(&hy[jj], any, 1);

		// shift initial prediction covariance
		//for(ii=0; ii<pnx*cnl; ii++)
		//	hpLp[0][ii] = hpLp[1][ii];

		d_ric_trf_mhe(nx, nw, ny, N, hpA, hpG, hpC, hpLp, hdLp, hpQ, hpR, hpLe, work);
		d_ric_trs_mhe(nx, nw, ny, N, hpA, hpG, hpC, hpLp, hdLp, hpQ, hpR, hpLe, hq, hr, hf, hhxp, hhxe, hhw, hhy, 1, hhlam, work);

		// zero data
		for(ii=0; ii<Ns*anx; ii++)
			hxe[0][ii] = 0.0;

		for(ii=anx; ii<Ns*anx; ii++)
			hxp[0][ii] = 0.0;

		for(ii=0; ii<(Ns-1)*anw; ii++)
			hw[0][ii] = 0.0;

		for(ii=0; ii<(Ns-1)*anx; ii++)
			hlam[0][ii] = 0.0;

		// save data
		for(ii=0; ii<(N+1); ii++)
			for(jj=0; jj<nx; jj++)
				hxe[ii][jj] = hhxe[ii][jj];

		for(ii=0; ii<(N+1); ii++)
			for(jj=0; jj<nx; jj++)
				hxp[ii][jj] = hhxp[ii][jj];

		for(ii=0; ii<N; ii++)
			for(jj=0; jj<nw; jj++)
				hw[ii][jj] = hhw[ii][jj];
		//d_print_mat(nw, N, hw[0], anw);

		for(ii=0; ii<N; ii++)
			for(jj=0; jj<nx; jj++)
				hlam[ii][jj] = hhlam[ii][jj];



		for(jj=1; jj<Ns-N; jj++)
			{

			//break;
			
			// shift measurements and initial prediction
			for(ii=0; ii<=N; ii++)
				{
				hhy[ii] = hy[ii+jj];
				}

			// shift initial prediction and relative covariance
			for(ii=0; ii<nx; ii++)
				hhxp[0][ii] = hhxp[1][ii];
			for(ii=0; ii<pnx*cnl; ii++)
				hpLp[0][ii] = hpLp[1][ii];

			//d_print_mat(nx, N+1, hhxp[0], anx);

			//d_print_pmat(nx, nx, bs, hpLp[1]+(nx+nw+pad)*bs, cnl);
			//d_print_pmat(nz, nz, bs, hpLe[1], cnf);
			//d_print_pmat(nx, nx, bs, hpLp[2]+(nx+nw+pad)*bs, cnl);
			//d_print_pmat(nz, nz, bs, hpLe[2], cnf);

			d_ric_trf_mhe(nx, nw, ny, N, hpA, hpG, hpC, hpLp, hdLp, hpQ, hpR, hpLe, work);
			d_ric_trs_mhe(nx, nw, ny, N, hpA, hpG, hpC, hpLp, hdLp, hpQ, hpR, hpLe, hq, hr, hf, hhxp, hhxe, hhw, hhy, 1, hhlam, work);

			//d_print_mat(nx, N+1, hhxp[0], anx);

			//d_print_pmat(nx, nx, bs, hpLp[0]+(nx+nw+pad)*bs, cnl);
			//d_print_pmat(nz, nz, bs, hpLe[0], cnf);
			//d_print_pmat(nx, nx, bs, hpLp[1]+(nx+nw+pad)*bs, cnl);
			//d_print_pmat(nz, nz, bs, hpLe[1], cnf);

			// save data
			for(ii=0; ii<nx; ii++)
				hxe[N+jj][ii] = hhxe[N][ii];

			for(ii=0; ii<nx; ii++)
				hxp[N+jj][ii] = hhxp[N][ii];

			if(jj<Ns-N-1)
				for(ii=0; ii<nw; ii++)
					hw[N+jj][ii] = hhw[N-1][ii];

			if(jj<Ns-N-1)
				for(ii=0; ii<nx; ii++)
					hlam[N+jj][ii] = hhlam[N-1][ii];

			//break;

			}

		// print solution
		if(PRINTRES)
			{
			printf("\nx_p\n");
			d_print_mat(nx, Ns, hxp[0], anx);
			printf("\nx_e\n");
			d_print_mat(nx, Ns, hxe[0], anx);
			//printf("\nL_e\n");
			//d_print_pmat(nx, nx, bs, hpLp[Ns-1]+(nx+nw+pad)*bs, cnl);
			}

#endif

/************************************************
* return
************************************************/

		free(A);
		free(B);
		free(C);
		free(b);
		free(D);
		free(d);
		free(x0);
		free(Q);
		free(Qx);
		free(R);
		free(q);
		free(r);
		free(f);
		free(L0);
		free(pA);
		free(pG);
		free(pC);
		free(pQ);
		free(pR);
		free(pQA);
		free(pRG);
		free(work);
		free(work2);
		free(work3);
		free(work4);
		free(p_hxe);
		free(p_hxp);
		free(p_hy);
		free(p_hw);
		free(p_hlam);
		//free(p_hhxe);
		//free(p_hhxp);
		//free(p_hhw);
		//free(p_hhlam);
		free(x_temp);
		free(y_temp);
		free(p0);
		free(p_hr_res);
		free(p_hq_res);
		free(p_hf_res);
		free(pL0_inv);
		free(hpLp[0]);
		free(hdLp[0]);
		free(hpLe[0]);
		for(jj=0; jj<N; jj++)
			{
			free(hpLp[jj+1]);
			free(hdLp[jj+1]);
			free(hpLe[jj+1]);
			free(hpGLr[jj]);
			free(hpALe[jj]);
			free(hpLp2[jj]);
			}
		free(hpALe[N]);


		free(pQRAG);
		free(pQD);
		for(ii=0; ii<N; ii++)
			{
			free(hpLAG[ii]);
			free(hpLe2[ii]);
			}
		free(hpLAG[N]);
		free(hpLe2[N]);

		for(ii=0; ii<N; ii++)
			{
			free(hAGU[ii]);
			free(hUp[ii]);
			free(hUe[ii]);
			free(hUr[ii]);
			}
		free(hUp[N]);
		free(hUe[N]);
		free(Ud);
		free(work_ref);


		} // increase size

	fprintf(f, "];\n");
	fclose(f);


	return 0;

	}
/* Example #7 */
/* 0 */
int main()
	{
	
	printf("\n");
	printf("\n");
	printf("\n");
	printf(" HPMPC -- Library for High-Performance implementation of solvers for MPC.\n");
	printf(" Copyright (C) 2014-2015 by Technical University of Denmark. All rights reserved.\n");
	printf("\n");
	printf(" HPMPC is distributed in the hope that it will be useful,\n");
	printf(" but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
	printf(" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n");
	printf(" See the GNU Lesser General Public License for more details.\n");
	printf("\n");
	printf("\n");
	printf("\n");

#if defined(TARGET_X64_AVX2) || defined(TARGET_X64_AVX) || defined(TARGET_X64_SSE3) || defined(TARGET_X86_ATOM) || defined(TARGET_AMD_SSE3)
/*	printf("\nflush subnormals to zero\n");*/
	_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); // flush to zero subnormals !!! works only with one thread !!!
#endif

	int ii, jj, idx;
	
	int rep, nrep=NREP;

	int nx = NX; // number of states (it has to be even for the mass-spring system test problem)
	int nu = NU; // number of inputs (controllers) (it has to be at least 1 and at most nx/2 for the mass-spring system test problem)
	int N  = NN; // horizon lenght
//	int nb = NB; // number of box constrained inputs and states
	int nh = nu;//nu+nx/2; // number of hard box constraints
	int ns = nx;//nx/2;//nx; // number of soft box constraints
	int nb = nh + ns;

	int nhu = nu<nh ? nu : nh ;

	printf(" Test problem: mass-spring system with %d masses and %d controls.\n", nx/2, nu);
	printf("\n");
	printf(" MPC problem size: %d states, %d inputs, %d horizon length, %d two-sided box constraints on inputs and states, %d two-sided soft constraints on states.\n", nx, nu, N, nh, ns);
	printf("\n");
#if IP == 1
	printf(" IP method parameters: primal-dual IP, double precision, %d maximum iterations, %5.1e exit tolerance in duality measure (edit file test_d_ip_box.c to change them).\n", K_MAX, MU_TOL);
#elif IP == 2
	printf(" IP method parameters: predictor-corrector IP, double precision, %d maximum iterations, %5.1e exit tolerance in duality measure (edit file test_d_ip_box.c to change them).\n", K_MAX, MU_TOL);
#else
	printf(" Wrong value for IP solver choice: %d\n", IP);
#endif

	int info = 0;
		
	const int bs = D_MR; //d_get_mr();
	const int ncl = D_NCL;
	const int nal = bs*ncl; // number of doubles per cache line
	
	const int nz = nx+nu+1;
	const int pnz = bs*((nz+bs-1)/bs);
	const int pnx = bs*((nx+bs-1)/bs);
	const int pnu = bs*((nu+bs-1)/bs);
	const int pnb = bs*((2*nb+bs-1)/bs); // packed number of box constraints
	const int cnz = ncl*((nx+nu+1+ncl-1)/ncl);
	const int cnx = ncl*((nx+ncl-1)/ncl);
	const int anz = nal*((nz+nal-1)/nal);
	const int anx = nal*((nx+nal-1)/nal);

//	const int pad = (ncl-nx%ncl)%ncl; // packing between BAbtL & P
//	const int cnl = cnz<cnx+ncl ? nx+pad+cnx+ncl : nx+pad+cnz;
	const int cnl = cnz<cnx+ncl ? cnx+ncl : cnz;
	

/************************************************
* dynamical system
************************************************/	

	double *A; d_zeros(&A, nx, nx); // states update matrix

	double *B; d_zeros(&B, nx, nu); // inputs matrix

	double *b; d_zeros(&b, nx, 1); // states offset
	double *x0; d_zeros(&x0, nx, 1); // initial state

	double Ts = 0.5; // sampling time
	mass_spring_system(Ts, nx, nu, N, A, B, b, x0);
	
	for(jj=0; jj<nx; jj++)
		b[jj] = 0.0;
	
	for(jj=0; jj<nx; jj++)
		x0[jj] = 0;
	x0[0] = 3.5;
	x0[1] = 3.5;
	
//	d_print_mat(nx, nx, A, nx);
//	d_print_mat(nx, nu, B, nx);
//	d_print_mat(nx, 1, b, nx);
//	d_print_mat(nx, 1, x0, nx);
	
	/* packed */
/*	double *BAb; d_zeros(&BAb, nx, nz);*/

/*	dmcopy(nx, nu, B, nx, BAb, nx);*/
/*	dmcopy(nx, nx, A, nx, BAb+nu*nx, nx);*/
/*	dmcopy(nx, 1 , b, nx, BAb+(nu+nx)*nx, nx);*/
	
	/* transposed */
/*	double *BAbt; d_zeros_align(&BAbt, pnz, pnz);*/
/*	for(ii=0; ii<nx; ii++)*/
/*		for(jj=0; jj<nz; jj++)*/
/*			{*/
/*			BAbt[jj+pnz*ii] = BAb[ii+nx*jj];*/
/*			}*/

	/* packed into contiguous memory */
	double *pBAbt; d_zeros_align(&pBAbt, pnz, cnx);
/*	d_cvt_mat2pmat(nz, nx, BAbt, pnz, 0, pBAbt, cnx);*/
/*	d_cvt_tran_mat2pmat(nx, nz, BAb, nx, 0, pBAbt, cnx);*/

	d_cvt_tran_mat2pmat(nx, nu, B, nx, 0, pBAbt, cnx);
	d_cvt_tran_mat2pmat(nx, nx, A, nx, nu, pBAbt+nu/bs*cnx*bs+nu%bs, cnx);
	for (jj = 0; jj<nx; jj++)
		pBAbt[(nx+nu)/bs*cnx*bs+(nx+nu)%bs+jj*bs] = b[jj];

/*	d_print_pmat (nz, nx, bs, pBAbt, cnx);*/
/*	exit(1);*/

/************************************************
* box constraints
************************************************/	

	double *db; d_zeros_align(&db, 2*nb, 1);
	jj=0;
	for( ; jj<2*nhu; jj++)
		db[jj] = - 0.5;   // umin
	for( ; jj<2*nh; jj++)
		db[jj] = - 4.0;   // xmin_hard
	for( ; jj<2*nb; jj++)
		db[jj] = - 1.0;   // xmin_soft

/************************************************
* cost function
************************************************/	

	double *Q; d_zeros(&Q, nz, nz);
	for(ii=0; ii<nu; ii++) Q[ii*(nz+1)] = 2.0;
	for(; ii<nz; ii++) Q[ii*(nz+1)] = 0.0;
	for(ii=0; ii<nu; ii++) Q[nx+nu+ii*nz] = 0.2;
	for(; ii<nz; ii++) Q[nx+nu+ii*nz] = 0.1;
/*	Q[(nx+nu)*(pnz+1)] = 1e35; // large enough (not needed any longer) */
	
	/* packed into contiguous memory */
	double *pQ; d_zeros_align(&pQ, pnz, cnz);
	d_cvt_mat2pmat(nz, nz, Q, nz, 0, pQ, cnz);

	// cost function of the soft constrained slack variables
	double *Z; d_zeros_align(&Z, pnb, 1);
	for(ii=0; ii<2*ns; ii++) Z[2*nh+ii] = 0.0;
	//for(ii=0; ii<nx; ii++) Z[2*nu+2*ii+0] = 100.0;
	double *z; d_zeros_align(&z, pnb, 1);
	for(ii=0; ii<2*ns; ii++) z[2*nh+ii] = 100.0;

	// maximum element in cost functions
	double mu0 = 1.0;
	for(ii=0; ii<nu+nx; ii++)
		for(jj=0; jj<nu+nx; jj++)
			mu0 = fmax(mu0, Q[jj+nz*ii]);
	for(ii=0; ii<2*ns; ii++)
		{
		mu0 = fmax(mu0, Z[2*nh+ii]);
		mu0 = fmax(mu0, z[2*nh+ii]);
		}
	//printf("\n mu0 = %f\n", mu0);

/************************************************
* matrices series
************************************************/	

	double *hpQ[N+1];
	double *hq[N+1];
	double *hZ[N+1];
	double *hz[N+1];
	double *hux[N+1];
	double *hpi[N+1];
	double *hlam[N+1];
	double *ht[N+1];
	double *hpBAbt[N];
	double *hdb[N+1];
	double *hrb[N];
	double *hrq[N+1];
	double *hrd[N+1];
	double *hrz[N+1];

	for(jj=0; jj<N; jj++)
		{
		//d_zeros_align(&hpQ[jj], pnz, cnz);
		hpQ[jj] = pQ;
		}
	//d_zeros_align(&hpQ[N], pnz, pnz);
	hpQ[N] = pQ;

	for(jj=0; jj<N; jj++)
		{
		d_zeros_align(&hq[jj], anz, 1);
		hZ[jj] = Z;
		hz[jj] = z;
		d_zeros_align(&hux[jj], anz, 1);
		d_zeros_align(&hpi[jj], anx, 1);
		d_zeros_align(&hlam[jj],2*pnb, 1); // TODO pnb
		d_zeros_align(&ht[jj], 2*pnb, 1); // TODO pnb
		hpBAbt[jj] = pBAbt;
		hdb[jj] = db;
		d_zeros_align(&hrb[jj], anx, 1);
		d_zeros_align(&hrq[jj], anz, 1);
		d_zeros_align(&hrd[jj], pnb, 1); // TODO pnb
		d_zeros_align(&hrz[jj], pnb, 1); // TODO pnb
		}
	d_zeros_align(&hq[N], anz, 1);
	hZ[N] = Z;
	hz[N] = z;
	d_zeros_align(&hux[N], anz, 1);
	d_zeros_align(&hpi[N], anx, 1);
	d_zeros_align(&hlam[N], 2*pnb, 1); // TODO pnb
	d_zeros_align(&ht[N], 2*pnb, 1); // TODO pnb
	hdb[N] = db;
	d_zeros_align(&hrq[N], anz, 1);
	d_zeros_align(&hrd[N], pnb, 1); // TODO pnb
	d_zeros_align(&hrz[N], pnb, 1); // TODO pnb
	
	// starting guess
	for(jj=0; jj<nx; jj++) hux[0][nu+jj]=x0[jj];

/************************************************
* riccati-like iteration
************************************************/

//	double *work; d_zeros_align(&work, (N+1)*(pnz*cnl + 5*anz + 10*pnb + 2*anx) + 3*anz, 1); // work space
	double *work; d_zeros_align(&work, (N+1)*(pnz*cnl + pnz + 5*anz + 10*pnb + 2*anx) + anz + pnz*cnx, 1); // work space
/*	for(jj=0; jj<( (N+1)*(pnz*cnl + 4*anz + 4*pnb + 2*anx) + 3*anz ); jj++) work[jj] = -1.0;*/
	int kk = 0; // acutal number of iterations
/*	char prec = PREC; // double/single precision*/
/*	double sp_thr = SP_THR; // threshold to switch between double and single precision*/
	int k_max = K_MAX; // maximum number of iterations in the IP method
	double mu_tol = MU_TOL; // tolerance in the duality measure
	double alpha_min = ALPHA_MIN; // minimum accepted step length
	double sigma[] = {0.4, 0.3, 0.01}; // control primal-dual IP behaviour
	double *stat; d_zeros(&stat, 5, k_max); // stats from the IP routine
	int compute_mult = COMPUTE_MULT;
	int warm_start = WARM_START;
	double mu = -1.0;
	int hpmpc_status;
	


	/* initialize the cost function */
//	for(ii=0; ii<N; ii++)
//		{
//		for(jj=0; jj<pnz*cnz; jj++) hpQ[ii][jj]=pQ[jj];
//		}
//	for(jj=0; jj<pnz*cnz; jj++) hpQ[N][jj]=pQ[jj];



	// initial states
	double xx0[] = {3.5, 3.5, 3.66465, 2.15833, 1.81327, -0.94207, 1.86531, -2.35760, 2.91534, 1.79890, -1.49600, -0.76600, -2.60268, 1.92456, 1.66630, -2.28522, 3.12038, 1.83830, 1.93519, -1.87113};



	/* warm up */

	// initialize states and inputs
	for(ii=0; ii<=N; ii++)
		for(jj=0; jj<nx+nu; jj++)
			hux[ii][jj] = 0;

	hux[0][nu+0] = xx0[0];
	hux[0][nu+1] = xx0[1];

	// call the IP solver
//	if(FREE_X0==0)
//		{
		if(IP==1)
			hpmpc_status = d_ip_soft_mpc(&kk, k_max, mu0, mu_tol, alpha_min, warm_start, sigma, stat, nx, nu, N, nh, ns, hpBAbt, hpQ, hZ, hz, hdb, hux, compute_mult, hpi, hlam, ht, work);
		else
			hpmpc_status = d_ip2_soft_mpc(&kk, k_max, mu0, mu_tol, alpha_min, warm_start, sigma, stat, nx, nu, N, nh, ns, hpBAbt, hpQ, hZ, hz, hdb, hux, compute_mult, hpi, hlam, ht, work);
//		}
//	else
//		{
//		if(IP==1)
//			hpmpc_status = d_ip_box_mhe_old(&kk, k_max, mu_tol, alpha_min, warm_start, sigma, stat, nx, nu, N, nb, hpBAbt, hpQ, hdb, hux, compute_mult, hpi, hlam, ht, work);
//		else
//			hpmpc_status = d_ip2_box_mhe_old(&kk, k_max, mu_tol, alpha_min, warm_start, sigma, stat, nx, nu, N, nb, hpBAbt, hpQ, hdb, hux, compute_mult, hpi, hlam, ht, work);
//		}

#if 0
	if(PRINTSTAT==1)
		{

		printf("\n");
		printf("\n");
		printf(" Print IP statistics of the last run (soft-constraints solver)\n");
		printf("\n");

		for(jj=0; jj<kk; jj++)
			printf("k = %d\tsigma = %f\talpha = %f\tmu = %f\t\tmu = %e\talpha = %f\tmu = %f\tmu = %e\n", jj, stat[5*jj], stat[5*jj+1], stat[5*jj+2], stat[5*jj+2], stat[5*jj+3], stat[5*jj+4], stat[5*jj+4]);
		printf("\n");
		
		}

	if(PRINTRES==1)
		{

		printf("\n");
		printf("\n");
		printf(" Print solution\n");
		printf("\n");

		printf("\nu = \n\n");
		for(ii=0; ii<N; ii++)
			d_print_mat(1, nu, hux[ii], 1);
		
		printf("\nx = \n\n");
		for(ii=0; ii<=N; ii++)
			d_print_mat(1, nx, hux[ii]+nu, 1);
		
		printf("\nlam = \n\n");
		for(ii=0; ii<=N; ii++)
			d_print_mat(1, 2*nb, hlam[ii], 1);
		
		}
#endif



	int kk_avg = 0;
	int kks_avg = 0;

	/* timing */
	struct timeval tv0, tv1, tv2, tv3, tv4, tv5;

	// use general constraints to solve the soft-box-constrained problem
	#if 1 
	int nus = nu + 2*nx; // number of inputs and slack variables
	int nbs = nus;
	int ngs = nx;
	const int nzs = nx+nus+1;
	const int cnzs = ncl*((nzs+ncl-1)/ncl);
	const int cngs = ncl*((ngs+ncl-1)/ncl);
	const int cnxgs= ncl*((ngs+nx+ncl-1)/ncl);
	const int pnzs = bs*((nzs+bs-1)/bs);
	const int pnbs = bs*((nbs+bs-1)/bs); // simd aligned number of one-sided box constraints !!!!!!!!!!!!
	const int pngs = bs*((ngs+bs-1)/bs); // simd aligned number of one-sided box constraints !!!!!!!!!!!!
	const int cnls = cnzs<cnx+ncl ? cnx+ncl : cnzs;
	const int anzs = nal*((nzs+nal-1)/nal);
	double *pBAbts; d_zeros_align(&pBAbts, pnzs, cnx);
	d_cvt_tran_mat2pmat(nx, nu, B, nx, 0, pBAbts, cnx);
	d_cvt_tran_mat2pmat(nx, nx, A, nx, nus, pBAbts+nus/bs*cnx*bs+nus%bs, cnx);
	for(jj=0; jj<nx; jj++)
		pBAbts[(nx+nus)/bs*cnx*bs+(nx+nus)%bs+jj*bs] = b[jj];
	//d_print_pmat (nzs, nx, bs, pBAbts, cnx);
	double *ds; d_zeros_align(&ds, 2*pnbs+2*pngs, 1);
	for(jj=0; jj<nu; jj++)
		{
		ds[jj]      = - 0.5; //   umin
		ds[pnbs+jj] = - 0.5; // - umax
		}
	for(; jj<nus; jj++)
		{
		ds[jj]      =    0.0; //   smin
		ds[pnbs+jj] = - 10.0; // - smax
		}
	for(jj=0; jj<ngs; jj++)
		{
		ds[2*pnbs+jj]      = - 1.0; //   xmin
		ds[2*pnbs+pngs+jj] = - 1.0; // - xmax
		}
	//d_print_mat(1, 2*pnbs+2*pngs, ds, 1);
	double *Cs; d_zeros(&Cs, ngs, nx);
	double *Ds; d_zeros(&Ds, ngs, nus);
	for(jj=0; jj<nx; jj++)
		{
		Cs[jj+jj*ngs] = 1.0;
		Ds[jj+(nu+jj)*ngs] = 1.0;
		Ds[jj+(nu+nx+jj)*ngs] = - 1.0;
		}
	double *pDCts; d_zeros_align(&pDCts, pnzs, cngs);
	d_cvt_tran_mat2pmat(ngs, nus, Ds, ngs, 0, pDCts, cngs);
	d_cvt_tran_mat2pmat(ngs, nx, Cs, ngs, nus, pDCts+nus/bs*cngs*bs+nus%bs, cngs);
	//d_print_pmat(nus+nx, ngs, bs, pDCts, cngs);
	double *Qs; d_zeros(&Qs, nzs, nzs);
	d_copy_mat(nu, nu, Q, nz, Qs, nzs);
	d_copy_mat(nx+1, nu, Q+nu, nz, Qs+nus, nzs);
	d_copy_mat(nx+1, nx, Q+nu*(nz+1), nz, Qs+nus*(nzs+1), nzs);
	for(jj=0; jj<nx; jj++)
		{
		Qs[(nu+jj)*(nzs+1)] = Z[2*nh+2*jj+0]; // TODO change when updated IP !!!!!
		Qs[(nu+nx+jj)*(nzs+1)] = Z[2*nh+2*jj+1]; // TODO change when updated IP !!!!!
		Qs[nus+nx+(nu+jj)*nzs] = z[2*nh+2*jj+0]; // TODO change when updated IP !!!!!
		Qs[nus+nx+(nu+nx+jj)*nzs] = z[2*nh+2*jj+1]; // TODO change when updated IP !!!!!
		}
	double *pQs; d_zeros_align(&pQs, pnzs, cnzs);
	d_cvt_mat2pmat(nzs, nzs, Qs, nzs, 0, pQs, cnzs);
	//d_print_pmat(nzs, nzs, bs, pQs, cnzs);
	double *hpQs[N+1];
	double *huxs[N+1];
	double *hpis[N+1];
	double *hlams[N+1];
	double *hts[N+1];
	double *hpBAbts[N];
	double *hpDCts[N+1];
	double *hds[N+1];
	for(jj=0; jj<N; jj++)
		{
		hpQs[jj] = pQs;
		hpBAbts[jj] = pBAbts;
		hpDCts[jj] = pDCts;
		hds[jj] = ds;
		d_zeros_align(&huxs[jj], pnzs, 1);
		d_zeros_align(&hpis[jj], pnx, 1);
		d_zeros_align(&hlams[jj], 2*pnbs+2*pngs, 1);
		d_zeros_align(&hts[jj], 2*pnbs+2*pngs, 1);
		}
	hpQs[N] = pQs;
	d_zeros_align(&hpDCts[N], pnzs, cngs);
	d_zeros_align(&hds[N], 2*pnbs+2*pngs, 1);
	d_zeros_align(&huxs[N], pnzs, 1);
	d_zeros_align(&hpis[N], pnx, 1);
	d_zeros_align(&hlams[N] ,2*pnbs+2*pngs, 1);
	d_zeros_align(&hts[N], 2*pnbs+2*pngs, 1);
	double *works; d_zeros_align(&works, (N+1)*(pnzs*cnls + pnzs + 5*anzs + 10*(pnbs+pngs) + 2*anx) + anzs + pnzs*cnxgs, 1); // work space 

	gettimeofday(&tv0, NULL); // start

	for(rep=0; rep<nrep; rep++)
		{

		// initialize states and inputs
		for(ii=0; ii<=N; ii++)
			for(jj=0; jj<nx+nus; jj++)
				huxs[ii][jj] = 0;

		idx = rep%10;
		huxs[0][nus+0] = xx0[2*idx];
		huxs[0][nus+1] = xx0[2*idx+1];

		if(IP==1)
			hpmpc_status = d_ip_hard_mpc(&kk, k_max, mu0, mu_tol, alpha_min, warm_start, sigma, stat, nx, nus, N, nbs, ngs, ngs, hpBAbts, hpQs, hpDCts, hds, huxs, compute_mult, hpis, hlams, hts, works);
		else
			hpmpc_status = d_ip2_hard_mpc(&kk, k_max, mu0, mu_tol, alpha_min, warm_start, sigma, stat, nx, nus, N, nbs, ngs, ngs, hpBAbts, hpQs, hpDCts, hds, huxs, compute_mult, hpis, hlams, hts, works);

		kks_avg += kk;

		}


	gettimeofday(&tv1, NULL); // stop

	if(PRINTSTAT==1)
		{

		printf("\n");
		printf("\n");
		printf(" Print IP statistics of the last run (general-constraints solver)\n");
		printf("\n");

		for(jj=0; jj<kk; jj++)
			printf("k = %d\tsigma = %f\talpha = %f\tmu = %f\t\tmu = %e\talpha = %f\tmu = %f\tmu = %e\n", jj, stat[5*jj], stat[5*jj+1], stat[5*jj+2], stat[5*jj+2], stat[5*jj+3], stat[5*jj+4], stat[5*jj+4]);
		printf("\n");
		
		}

	if(PRINTRES==1)
		{

		printf("\n");
		printf("\n");
		printf(" Print solution\n");
		printf("\n");

		printf("\nus = \n\n");
		for(ii=0; ii<N; ii++)
			d_print_mat(1, nus, huxs[ii], 1);
		
		printf("\nxs = \n\n");
		for(ii=0; ii<=N; ii++)
			d_print_mat(1, nx, huxs[ii]+nus, 1);
	
		}


	for(jj=0; jj<N; jj++)
		{
		free(huxs[jj]);
		free(hpis[jj]);
		free(hlams[jj]);
		free(hts[jj]);
		}
	free(hpDCts[N]);
	free(hds[N]);
	free(huxs[N]);
	free(hpis[N]);
	free(hlams[N]);
	free(hts[N]);
	free(works);
	//exit(1);
	#endif



	gettimeofday(&tv2, NULL); // start



	for(rep=0; rep<nrep; rep++)
		{

		idx = rep%10;
//		x0[0] = xx0[2*idx];
//		x0[1] = xx0[2*idx+1];

		// initialize states and inputs
		for(ii=0; ii<=N; ii++)
			for(jj=0; jj<nx+nu; jj++)
				hux[ii][jj] = 0;

		hux[0][nu+0] = xx0[2*idx];
		hux[0][nu+1] = xx0[2*idx+1];

		// call the IP solver
//		if(FREE_X0==0)
//			{
			if(IP==1)
				hpmpc_status = d_ip_soft_mpc(&kk, k_max, mu0, mu_tol, alpha_min, warm_start, sigma, stat, nx, nu, N, nh, ns, hpBAbt, hpQ, hZ, hz, hdb, hux, compute_mult, hpi, hlam, ht, work);
			else
				hpmpc_status = d_ip2_soft_mpc(&kk, k_max, mu0, mu_tol, alpha_min, warm_start, sigma, stat, nx, nu, N, nh, ns, hpBAbt, hpQ, hZ, hz, hdb, hux, compute_mult, hpi, hlam, ht, work);
//			}
//		else
//			{
//			if(IP==1)
//				hpmpc_status = d_ip_box_mhe_old(&kk, k_max, mu_tol, alpha_min, warm_start, sigma, stat, nx, nu, N, nb, hpBAbt, hpQ, hdb, hux, compute_mult, hpi, hlam, ht, work);
//			else
//				hpmpc_status = d_ip2_box_mhe_old(&kk, k_max, mu_tol, alpha_min, warm_start, sigma, stat, nx, nu, N, nb, hpBAbt, hpQ, hdb, hux, compute_mult, hpi, hlam, ht, work);
//			}

		kk_avg += kk;

		}
	
	gettimeofday(&tv3, NULL); // stop
	


	// restore linear part of cost function 
	for(ii=0; ii<N; ii++)
		{
		for(jj=0; jj<nx+nu; jj++) hq[ii][jj] = Q[nx+nu+nz*jj];
		}
	for(jj=0; jj<nx+nu; jj++) hq[N][jj] = Q[nx+nu+nz*jj];

	// residuals computation
//	if(FREE_X0==0)
		d_res_ip_soft_mpc(nx, nu, N, nh, ns, hpBAbt, hpQ, hq, hZ, hz, hux, hdb, hpi, hlam, ht, hrq, hrb, hrd, hrz, &mu);
//	else
//		d_res_ip_box_mhe_old(nx, nu, N, nb, hpBAbt, hpQ, hq, hux, hdb, hpi, hlam, ht, hrq, hrb, hrd, &mu);


	if(PRINTSTAT==1)
		{

		printf("\n");
		printf("\n");
		printf(" Print IP statistics of the last run (soft-constraints solver)\n");
		printf("\n");

		for(jj=0; jj<kk; jj++)
			printf("k = %d\tsigma = %f\talpha = %f\tmu = %f\t\tmu = %e\talpha = %f\tmu = %f\tmu = %e\n", jj, stat[5*jj], stat[5*jj+1], stat[5*jj+2], stat[5*jj+2], stat[5*jj+3], stat[5*jj+4], stat[5*jj+4]);
		printf("\n");
		
		}

	if(PRINTRES==1)
		{

		printf("\n");
		printf("\n");
		printf(" Print solution\n");
		printf("\n");

		printf("\nu = \n\n");
		for(ii=0; ii<N; ii++)
			d_print_mat(1, nu, hux[ii], 1);
		
		printf("\nx = \n\n");
		for(ii=0; ii<=N; ii++)
			d_print_mat(1, nx, hux[ii]+nu, 1);
		
		printf("\nlam = \n\n");
		for(ii=0; ii<=N; ii++)
			d_print_mat(1, 2*nb, hlam[ii], 1);
		
		}

	if(PRINTRES==1 && COMPUTE_MULT==1)
		{
		// print the residuals computed by d_res_ip_soft_mpc
		printf("\n");
		printf("\n");
		printf(" Print residuals\n\n");
		printf("\n");
		printf("\n");
		printf("rq = \n\n");
//		if(FREE_X0==0)
//			{
			d_print_mat(1, nu, hrq[0], 1);
			for(ii=1; ii<=N; ii++)
/*				d_print_mat_e(1, nx+nu, hrq[ii], 1);*/
				d_print_mat(1, nx+nu, hrq[ii], 1);
//			}
//		else
//			{
//			for(ii=0; ii<=N; ii++)
///*				d_print_mat_e(1, nx+nu, hrq[ii], 1);*/
//				d_print_mat(1, nx+nu, hrq[ii], 1);
//			}
		printf("rz = \n\n");
		for(ii=0; ii<=N; ii++)
//			d_print_mat_e(1, 2*nb-2*nu, hrz[ii]+2*nu, 1);
			d_print_mat(1, 2*nb-2*nu, hrz[ii]+2*nu, 1);
		printf("\n");
		printf("\n");
		printf("\n");
		printf("\n");
		printf("rb = \n\n");
		for(ii=0; ii<N; ii++)
/*			d_print_mat_e(1, nx, hrb[ii], 1);*/
			d_print_mat(1, nx, hrb[ii], 1);
		printf("\n");
		printf("\n");
		printf("rd = \n\n");
		for(ii=0; ii<=N; ii++)
/*			d_print_mat_e(1, 2*nb, hrd[ii], 1);*/
			d_print_mat(1, 2*nb, hrd[ii], 1);
		printf("\n");
		printf("\n");
		printf("mu = %e\n\n", mu);
		
		}

/*	printf("\nnx\tnu\tN\tkernel\n\n");*/
/*	printf("\n%d\t%d\t%d\t%e\n\n", nx, nu, N, time);*/
	


/**************************************************************************************************
*
*	time-variant nx and nu, sparse box and soft constraints format
*
**************************************************************************************************/

	// problem size
	int nx_tv[N+1];
	int nu_tv[N+1];
	int nb_tv[N+1];
	int ng_tv[N+1];
	int ns_tv[N+1];
	int nz_tv[N+1]; // vector of zeros

	// first stage
	nx_tv[0] = 0;
	nu_tv[0] = nu;
	nb_tv[0] = nu;
	ng_tv[0] = 0;
	ns_tv[0] = 0;
	nz_tv[0] = 0;

	// middle stages
	for(ii=1; ii<N; ii++)
		{
		nx_tv[ii] = nx;
		nu_tv[ii] = nu;
		nb_tv[ii] = nu;
		ng_tv[ii] = 0;
		ns_tv[ii] = nx;
		nz_tv[ii] = 0;
		}
	
	// last stage
	nx_tv[N] = nx;
	nu_tv[N] = 0;
	nb_tv[N] = 0;
	ng_tv[N] = 0;
	ns_tv[N] = nx;
	nz_tv[N] = 0;


	// matrix sizes
	int pnz_tv[N+1];
	int pnx_tv[N+1];
	int pnb_tv[N+1];
	int png_tv[N+1];
	int pns_tv[N+1];
	int cnz_tv[N+1];
	int cnx_tv[N+1];
	int cnl_tv[N+1];

	for(ii=0; ii<=N; ii++)
		{
		pnz_tv[ii] = (nu_tv[ii]+nx_tv[ii]+1+bs-1)/bs*bs;
		pnx_tv[ii] = (nx_tv[ii]+bs-1)/bs*bs;
		pnb_tv[ii] = (nb_tv[ii]+bs-1)/bs*bs;
		png_tv[ii] = (ng_tv[ii]+bs-1)/bs*bs;
		pns_tv[ii] = (ns_tv[ii]+bs-1)/bs*bs;
		cnz_tv[ii] = (nu_tv[ii]+nx_tv[ii]+1+ncl-1)/ncl*ncl;
		cnx_tv[ii] = (nx_tv[ii]+ncl-1)/ncl*ncl;
		cnl_tv[ii] = cnz_tv[ii]<cnx_tv[ii]+ncl ? cnx_tv[ii]+ncl : cnz_tv[ii];
		}
	
//	for(ii=0; ii<=N; ii++)
//		printf("\n%d\t%d\t%d\t%d\t%d\t%d\t%d\n", pnz_tv[ii], pnx_tv[ii], pnb_tv[ii], pns_tv[ii], cnz_tv[ii], cnx_tv[ii], cnl_tv[ii]);



	// state-space matrices
	//d_print_mat(nx, nx, A, nx);
	//d_print_mat(nx, nu, B, nx);
	//for(ii=0; ii<nx; ii++) b[ii] = 1.0;
	//d_print_mat(nx, 1, b, nx);
	//d_print_mat(nx, 1, x0, nx);

	// compute b0
	double *pA; d_zeros_align(&pA, pnx, cnx);
	d_cvt_mat2pmat(nx, nx, A, nx, 0, pA, cnx);
	double *b0; d_zeros_align(&b0, pnx, 1);
	dgemv_n_lib(nx, nx, pA, cnx, x0, 1, b, b0);
	//d_print_pmat(nx, nx, bs, pA, cnx);
	//d_print_mat(nx, 1, b0, nx);

	double *pBAbt0; d_zeros_align(&pBAbt0, pnz_tv[0], cnx_tv[1]);
	d_cvt_tran_mat2pmat(nx, nu, B, nx, 0, pBAbt0, cnx_tv[1]);
	d_cvt_tran_mat2pmat(nx, 1, b0, nx, nu, pBAbt0+nu/bs*bs*cnx_tv[1]+nu%bs, cnx_tv[1]);
	//d_print_pmat(nu_tv[0]+nx_tv[0]+1, nx_tv[1], bs, pBAbt0, cnx_tv[1]);

	double *pBAbt1; d_zeros_align(&pBAbt1, pnz_tv[1], cnx_tv[2]);
	d_cvt_tran_mat2pmat(nx, nu, B, nx, 0, pBAbt1, cnx_tv[2]);
	d_cvt_tran_mat2pmat(nx, nx, A, nx, nu, pBAbt1+nu/bs*bs*cnx_tv[2]+nu%bs, cnx_tv[2]);
	d_cvt_tran_mat2pmat(nx, 1, b, nx, nu+nx, pBAbt1+(nu+nx)/bs*bs*cnx_tv[2]+(nu+nx)%bs, cnx_tv[2]);
//	d_print_pmat(nu_tv[1]+nx_tv[1]+1, nx_tv[2], bs, pBAbt1, cnx_tv[2]);
	
	double *(hpBAbt_tv[N]);
	hpBAbt_tv[0] = pBAbt0;
	for(ii=1; ii<N; ii++)
		hpBAbt_tv[ii] = pBAbt1;
	

	// cost function matrices
	//for(ii=nu; ii<nu+nx; ii++) Q[ii*(nz+1)] = 1.0; // TODO remove !!!!
	//d_print_mat(nz, nz, Q, nz);

	double *q; d_zeros_align(&q, pnz, 1);
	for(ii=0; ii<nu; ii++) q[ii] = Q[nu+nx+ii*nz];
	//d_print_mat(nu, 1, q, nu);

	double *pS; d_zeros_align(&pS, pnu, cnx);
	d_cvt_tran_mat2pmat(nx, nu, Q+nu, nz, 0, pS, cnx);
	//d_print_pmat(nu, nx, bs, pS, cnx);

	double *q0; d_zeros_align(&q0, pnz_tv[0], 1);
	dgemv_n_lib(nu, nx, pS, cnx, x0, 1, q, q0);
	//d_print_mat(nu, 1, q0, nu);

	double *pQ0; d_zeros_align(&pQ0, pnz_tv[0], cnz_tv[0]);
	d_cvt_mat2pmat(nu, nu, Q, nz, 0, pQ0, cnz_tv[0]);
	d_cvt_tran_mat2pmat(nu, 1, q0, nu, nu, pQ0+nu/bs*bs*cnz_tv[0]+nu%bs, cnz_tv[0]);
	//d_print_pmat(nu_tv[0]+nx_tv[0]+1, nu_tv[0]+nx_tv[0]+1, bs, pQ0, pnz_tv[0]);
	
	double *pQ1; d_zeros_align(&pQ1, pnz_tv[1], cnz_tv[1]);
	d_cvt_mat2pmat(nz, nz, Q, nz, 0, pQ1, cnz_tv[1]);
	//d_print_pmat(nu_tv[1]+nx_tv[1]+1, nu_tv[1]+nx_tv[1]+1, bs, pQ1, pnz_tv[1]);

	double *pQN; d_zeros_align(&pQN, pnz_tv[N], cnz_tv[N]);
	d_cvt_mat2pmat(nx+1, nx+1, Q+nu*(nz+1), nz, 0, pQN, cnz_tv[N]);
	//d_print_pmat(nu_tv[N]+nx_tv[N]+1, nu_tv[N]+nx_tv[N]+1, bs, pQN, cnz_tv[N]);

	double *(hpQ_tv[N+1]);
	hpQ_tv[0] = pQ0;
	for(ii=1; ii<N; ii++)
		hpQ_tv[ii] = pQ1;
	hpQ_tv[N] = pQN;
	


	double *(hpL_tv[N+1]);
	for(ii=0; ii<=N; ii++)
		d_zeros_align(&hpL_tv[ii], pnz_tv[ii], cnl_tv[ii]);

	double *(hdL_tv[N+1]);
	for(ii=0; ii<=N; ii++)
		d_zeros_align(&hdL_tv[ii], pnz_tv[ii], 1);



	double *hux_tv[N+1];
	for(ii=0; ii<=N; ii++)
		d_zeros_align(&hux_tv[ii], (nu_tv[ii]+nx_tv[ii]+bs-1)/bs*bs, 1);
	
	double *hpi_tv[N+1];
	for(ii=0; ii<=N; ii++)
		d_zeros_align(&hpi_tv[ii], pnx_tv[ii], 1);
	

	// dummy variables
	int **pdummyi;
	double **pdummyd;
	

#if 0
	// work space
	double *ric_tv_work; d_zeros_align(&ric_tv_work, d_ric_sv_mpc_tv_work_space_size_double(N, nx_tv, nu_tv, nz_tv, nz_tv), 1);
	double *ric_tv_diag; d_zeros_align(&ric_tv_diag, pnz, 1);

	// call the Riccati solver
	d_back_ric_sv_tv(N, nx_tv, nu_tv, hpBAbt_tv, hpQ_tv, hux_tv, hpL_tv, hdL_tv, ric_tv_work, ric_tv_diag, 0, pdummyd, 1, hpi_tv, nz_tv, pdummyi, pdummyd, pdummyd, nz_tv, pdummyd, pdummyd, pdummyd);

	// print solution
	for(ii=0; ii<=N; ii++)
		d_print_mat(1, nu_tv[ii]+nx_tv[ii], hux_tv[ii], 1);
#endif
	

	// constraints
	int *idxb0 = (int *) malloc((nb_tv[0]+ns_tv[0])*sizeof(int));
	double *db0; d_zeros_align(&db0, 2*pnb_tv[0]+2*pns_tv[0], 1);
	int nbu0;
	nbu0 = nu_tv[0]<nb_tv[0] ? nu_tv[0] : nb_tv[0];
	idx = 0;
	for(jj=0; jj<nbu0; jj++)
		{
		idxb0[idx] = idx;
		db0[0*pnb_tv[0]+jj] = - 0.5; // umin_hard
		db0[1*pnb_tv[0]+jj] = - 0.5; // umax_hard
		idx++;
		}

	int *idxb1 = (int *) malloc((nb_tv[1]+ns_tv[1])*sizeof(int));
	double *db1; d_zeros_align(&db1, 2*pnb_tv[1]+2*pns_tv[1], 1);
	nbu0 = nu_tv[1]<nb_tv[1] ? nu_tv[1] : nb_tv[1];
	idx = 0;
	for(jj=0; jj<nbu0; jj++)
		{
		idxb1[idx] = idx;
		db1[0*pnb_tv[1]+jj] = - 0.5; // umin_hard
		db1[1*pnb_tv[1]+jj] = - 0.5; // umax_hard
		idx++;
		}
	for(jj=nu_tv[1]; jj<nb_tv[1]; jj++)
		{
		idxb1[idx] = idx;
		db1[0*pnb_tv[1]+jj] = - 4.0; // xmin_hard
		db1[1*pnb_tv[1]+jj] = - 4.0; // xmax_hard
		idx++;
		}
	for(jj=0; jj<ns_tv[1]; jj++)
		{
		idxb1[idx] = idx;
		db1[2*pnb_tv[1]+0*pns_tv[1]+jj] = - 1.0; // xmin_soft
		db1[2*pnb_tv[1]+1*pns_tv[1]+jj] = - 1.0; // xmax soft
		idx++;
		}

	int *idxbN = (int *) malloc((nb_tv[N]+ns_tv[N])*sizeof(int));
	double *dbN; d_zeros_align(&dbN, 2*pnb_tv[N]+2*pns_tv[N], 1);
	idx = 0;
	for(jj=nu_tv[N]; jj<nb_tv[N]; jj++)
		{
		idxbN[idx] = idx;
		dbN[0*pnb_tv[N]+jj] = - 4.0; // xmin_hard
		dbN[1*pnb_tv[N]+jj] = - 4.0; // xmax_hard
		idx++;
		}
	for(jj=0; jj<ns_tv[N]; jj++)
		{
		idxbN[idx] = idx;
		dbN[2*pnb_tv[N]+0*pns_tv[N]+jj] = - 1.0; // xmin_soft
		dbN[2*pnb_tv[N]+1*pns_tv[N]+jj] = - 1.0; // xmax soft
		idx++;
		}
	
	int *idxb_tv[N+1];
	double *hdb_tv[N+1];
	idxb_tv[0] = idxb0;
	hdb_tv[0] = db0;
	for(ii=1; ii<N; ii++)
		{
		idxb_tv[ii] = idxb1;
		hdb_tv[ii] = db1;
		}
	idxb_tv[N] = idxbN;
	hdb_tv[N] = dbN;

#if 0
	for(ii=0; ii<=N; ii++)
		{
		for(jj=0; jj<nb_tv[ii]+ns_tv[ii]; jj++)
			printf("\t%d", idxb_tv[ii][jj]);
		printf("\n");
		}
#endif
	

	// cost function of the soft constraint slack variables
	double *Z1; d_zeros_align(&Z1, 2*pns_tv[1], 1);
	for(ii=0; ii<ns_tv[1]; ii++)
		{
		Z1[0*pns_tv[1]+ii] = 0.0;
		Z1[1*pns_tv[1]+ii] = 0.0;
		}
	double *z1; d_zeros_align(&z1, 2*pns_tv[1], 1);
	for(ii=0; ii<ns_tv[1]; ii++)
		{
		z1[0*pns_tv[1]+ii] = 100.0;
		z1[1*pns_tv[1]+ii] = 100.0;
		}
	
	double *hZ_tv[N+1];
	double *hz_tv[N+1];
	for(ii=0; ii<=N; ii++)
		{
		hZ_tv[ii] = Z1;
		hz_tv[ii] = z1;
		}

	// maximum element in cost functions
	mu0 = 1.0;
	for(ii=0; ii<nu+nx; ii++)
		for(jj=0; jj<nu+nx; jj++)
			mu0 = fmax(mu0, Q[jj+nz*ii]);
	for(ii=0; ii<ns; ii++)
		{
		mu0 = fmax(mu0, Z[0*pns_tv[1]+ii]);
		mu0 = fmax(mu0, Z[1*pns_tv[1]+ii]);
		mu0 = fmax(mu0, z[0*pns_tv[1]+ii]);
		mu0 = fmax(mu0, z[1*pns_tv[1]+ii]);
		}
	//printf("\n mu0 = %f\n", mu0);

	// lagrangian multipliers and slack variables
	double *hlam_tv[N+1];
	double *ht_tv[N+1];
	for(ii=0; ii<=N; ii++)
		{
		d_zeros_align(&hlam_tv[ii], 2*pnb_tv[ii]+2*png_tv[ii]+4*pns_tv[ii], 1);
		d_zeros_align(&ht_tv[ii], 2*pnb_tv[ii]+2*png_tv[ii]+4*pns_tv[ii], 1);
		}



	// ip soft work space
	double *ip_soft_tv_work; d_zeros_align(&ip_soft_tv_work, d_ip2_soft_mpc_tv_work_space_size_double(N, nx_tv, nu_tv, nb_tv, ng_tv, ns_tv), 1);

	// call the ip soft solver
	d_ip2_soft_mpc_tv(&kk, k_max, mu0, mu_tol, alpha_min, warm_start, sigma, stat, N, nx_tv, nu_tv, nb_tv, idxb_tv, ng_tv, ns_tv, hpBAbt_tv, hpQ_tv, hZ_tv, hz_tv, pdummyd, hdb_tv, hux_tv, 1, hpi_tv, hlam_tv, ht_tv, ip_soft_tv_work);



	int kk_avg_tv = 0;

	gettimeofday(&tv4, NULL); // start



	for(rep=0; rep<nrep; rep++)
		{

		idx = rep%10;
//		x0[0] = xx0[2*idx];
//		x0[1] = xx0[2*idx+1];

		// initialize states and inputs
//		for(ii=0; ii<=N; ii++)
//			for(jj=0; jj<nx+nu; jj++)
//				hux[ii][jj] = 0;

		x0[0] = xx0[2*idx];
		x0[1] = xx0[2*idx+1];

		// update the terms that depend on the initial state: b0 (stored in pBAbt0) and q0 (stored in pQ0)
		dgemv_n_lib(nx, nx, pA, cnx, x0, 1, b, b0);
		d_cvt_tran_mat2pmat(nx, 1, b0, nx, nu, pBAbt0+nu/bs*bs*cnx_tv[1]+nu%bs, cnx_tv[1]);
		dgemv_n_lib(nu, nx, pS, cnx, x0, 1, q, q0);
		d_cvt_tran_mat2pmat(nu, 1, q0, nu, nu, pQ0+nu/bs*bs*cnz_tv[0]+nu%bs, cnz_tv[0]);

		// call the IP solver
		d_ip2_soft_mpc_tv(&kk, k_max, mu0, mu_tol, alpha_min, warm_start, sigma, stat, N, nx_tv, nu_tv, nb_tv, idxb_tv, ng_tv, ns_tv, hpBAbt_tv, hpQ_tv, hZ_tv, hz_tv, pdummyd, hdb_tv, hux_tv, 1, hpi_tv, hlam_tv, ht_tv, ip_soft_tv_work);

		kk_avg_tv += kk;

		}
	
	gettimeofday(&tv5, NULL); // stop
	

	
	double *hrq_tv[N+1];
	double *hrb_tv[N];
	double *hrd_tv[N+1];
	double *hrz_tv[N+1];
	double *hq_tv[N+1];

	for(ii=0; ii<N; ii++)
		{
		d_zeros_align(&hrq_tv[ii], pnz_tv[ii], 1);
		d_zeros_align(&hrb_tv[ii], pnx_tv[ii+1], 1);
		d_zeros_align(&hrd_tv[ii], 2*pnb_tv[ii]+2*png_tv[ii]+2*pns_tv[ii], 1);
		d_zeros_align(&hrz_tv[ii], 2*pns_tv[ii], 1);
		d_zeros_align(&hq_tv[ii], pnz_tv[ii], 1);
		}
	d_zeros_align(&hrq_tv[N], pnz_tv[N], 1);
	d_zeros_align(&hrd_tv[N], 2*pnb_tv[N]+2*png_tv[N]+2*pns_tv[N], 1);
	d_zeros_align(&hrz_tv[N], 2*pns_tv[N], 1);
	d_zeros_align(&hq_tv[N], pnz_tv[N], 1);


	// restore linear part of cost function 
	for(ii=0; ii<=N; ii++)
		{
		drowex_lib(nu_tv[ii]+nx_tv[ii], hpQ_tv[ii]+(nu_tv[ii]+nx_tv[ii])/bs*bs*cnz_tv[ii]+(nu_tv[ii]+nx_tv[ii])%bs, hq_tv[ii]);
		}



	// residuals computation
//	d_res_ip_soft_mpc(nx, nu, N, nh, ns, hpBAbt, hpQ, hq, hZ, hz, hux, hdb, hpi, hlam, ht, hrq, hrb, hrd, hrz, &mu);
	d_res_ip_soft_mpc_tv(N, nx_tv, nu_tv, nb_tv, idxb_tv, ng_tv, ns_tv, hpBAbt_tv, hpQ_tv, hq_tv, hZ_tv, hz_tv, hux_tv, pdummyd, hdb_tv, hpi_tv, hlam_tv, ht_tv, hrq_tv, hrb_tv, hrd_tv, hrz_tv, &mu);




	if(PRINTSTAT==1)
		{

		printf("\n");
		printf("\n");
		printf(" Print IP statistics of the last run (soft-constraints time-variant solver)\n");
		printf("\n");

		for(jj=0; jj<kk; jj++)
			printf("k = %d\tsigma = %f\talpha = %f\tmu = %f\t\tmu = %e\talpha = %f\tmu = %f\tmu = %e\n", jj, stat[5*jj], stat[5*jj+1], stat[5*jj+2], stat[5*jj+2], stat[5*jj+3], stat[5*jj+4], stat[5*jj+4]);
		printf("\n");
		
		}

	if(PRINTRES==1)
		{

		printf("\n");
		printf("\n");
		printf(" Print solution\n");
		printf("\n");

		// print solution
		printf("\nhux_tv = \n\n");
		for(ii=0; ii<=N; ii++)
			d_print_mat(1, nu_tv[ii]+nx_tv[ii], hux_tv[ii], 1);
		
		}

	if(PRINTRES==1 && COMPUTE_MULT==1)
		{
		// print the residuals computed by d_res_ip_soft_mpc_tv
		printf("\n");
		printf("\n");
		printf(" Print residuals\n\n");
		printf("\n");
		printf("\n");
		printf("rq = \n\n");
		for(ii=0; ii<=N; ii++)
			d_print_mat(1, nu_tv[ii]+nx_tv[ii], hrq_tv[ii], 1);
		printf("\n");
		printf("\n");
		printf("rz = \n\n");
		for(ii=0; ii<=N; ii++)
			d_print_mat(1, 2*pns_tv[ii], hrz_tv[ii], 1);
		printf("\n");
		printf("\n");
		printf("rb = \n\n");
		for(ii=0; ii<N; ii++)
			d_print_mat(1, nx_tv[ii], hrb_tv[ii], 1);
		printf("\n");
		printf("\n");
		printf("rd = \n\n");
		for(ii=0; ii<=N; ii++)
			d_print_mat(1, 2*pnb_tv[ii]+2*png_tv[ii]+2*pns_tv[ii], hrd_tv[ii], 1);
		printf("\n");
		printf("\n");
		printf("mu = %e\n\n", mu);
		
		}



	// free memory
	free(pA);
	free(b0);
	free(pBAbt0);
	free(pBAbt1);
	free(pQ0);
	free(pQ1);
	free(pQN);
	free(idxb0);
	free(idxb1);
	free(idxbN);
	free(db0);
	free(db1);
	free(dbN);
	free(Z1);
	free(z1);
	for(ii=0; ii<=N; ii++) free(hpL_tv[ii]);
	for(ii=0; ii<=N; ii++) free(hdL_tv[ii]);
	for(ii=0; ii<=N; ii++) free(hux_tv[ii]);
	for(ii=0; ii<=N; ii++) free(hpi_tv[ii]);
	for(ii=0; ii<=N; ii++) free(hlam_tv[ii]);
	for(ii=0; ii<=N; ii++) free(ht_tv[ii]);
	for(ii=0; ii<=N; ii++) free(hrq_tv[ii]);
	for(ii=0; ii<N; ii++) free(hrb_tv[ii]);
	for(ii=0; ii<=N; ii++) free(hrd_tv[ii]);
	for(ii=0; ii<=N; ii++) free(hrz_tv[ii]);
	for(ii=0; ii<=N; ii++) free(hq_tv[ii]);



/**************************************************************************************************
*	printing timings
**************************************************************************************************/

	double times = (tv1.tv_sec-tv0.tv_sec)/(nrep+0.0)+(tv1.tv_usec-tv0.tv_usec)/(nrep*1e6);
	double time = (tv3.tv_sec-tv2.tv_sec)/(nrep+0.0)+(tv3.tv_usec-tv2.tv_usec)/(nrep*1e6);
	double time_tv = (tv5.tv_sec-tv4.tv_sec)/(nrep+0.0)+(tv5.tv_usec-tv4.tv_usec)/(nrep*1e6);
	
/*	printf("\nnx\tnu\tN\tkernel\n\n");*/
/*	printf("\n%d\t%d\t%d\t%e\n\n", nx, nu, N, time);*/
	
	printf("\n");
	printf(" Average number of iterations over %d runs: %5.1f (soft-constraints solver)\n", nrep, kk_avg / (double) nrep);
	printf(" Average number of iterations over %d runs: %5.1f (general-constraints solver)\n", nrep, kks_avg / (double) nrep);
	printf(" Average number of iterations over %d runs: %5.1f (soft-constraints time-variant solver)\n", nrep, kk_avg_tv / (double) nrep);
	printf("\n");
	printf(" Average solution time over %d runs: %5.2e seconds (soft-constraints solver)\n", nrep, time);
	printf(" Average solution time over %d runs: %5.2e seconds (general-constraints solver)\n", nrep, times);
	printf(" Average solution time over %d runs: %5.2e seconds (soft-constraints time-variant solver)\n", nrep, time_tv);
	printf("\n");



/************************************************
* free memory and return
************************************************/

	free(A);
	free(B);
	free(b);
	free(x0);
/*	free(BAb);*/
/*	free(BAbt);*/
	free(pBAbt);
	free(db);
	free(Q);
	free(pQ);
	free(Z);
	free(z);
	free(work);
	free(stat);
	for(jj=0; jj<N; jj++)
		{
//		free(hpQ[jj]);
		free(hq[jj]);
		free(hux[jj]);
		free(hpi[jj]);
		free(hlam[jj]);
		free(ht[jj]);
		free(hrb[jj]);
		free(hrq[jj]);
		free(hrd[jj]);
		free(hrz[jj]);
		}
//	free(hpQ[N]);
	free(hq[N]);
	free(hux[N]);
	free(hpi[N]);
	free(hlam[N]);
	free(ht[N]);
	free(hrq[N]);
	free(hrd[N]);
	free(hrz[N]);



	return 0;

	}
Beispiel #8
0
int main() {

#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SABDY_BRIDGE) ||  \
    defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X86_AMD_BULLDOZER)
    _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);  // flush to zero subnormals !!!
                                                 // works only with one thread
                                                 // !!!
#endif

    int ii, jj;

    int rep, nrep = NREP;

    int nx = 8;  // number of states (it has to be even for the mass-spring
                  // system test problem)
    int nu = 3;  // number of inputs (controllers) (it has to be at least 1 and
                  // at most nx/2 for the mass-spring system test problem)
    int N = 15;   // horizon length
    int nb = 11;  // number of box constrained inputs and states
    int ng = 0;  // 4;  // number of general constraints
    int ngN = 4;  // 4;  // number of general constraints at the last stage

    int nbu = nu < nb ? nu : nb;
    int nbx = nb - nu > 0 ? nb - nu : 0;

    // stage-wise variant size
    int nxx[N + 1];
#if defined(ELIMINATE_X0)
    nxx[0] = 0;
#else
    nxx[0] = nx;
#endif
    for (ii = 1; ii <= N; ii++) nxx[ii] = nx;

    int nuu[N + 1];
    for (ii = 0; ii < N; ii++) nuu[ii] = nu;
    nuu[N] = 0;

    int nbb[N + 1];
#if defined(ELIMINATE_X0)
    nbb[0] = nbu;
#else
    nbb[0] = nb;
#endif
    for (ii = 1; ii < N; ii++) nbb[ii] = nb;
    nbb[N] = nbx;

    int ngg[N + 1];
    for (ii = 0; ii < N; ii++) ngg[ii] = ng;
    ngg[N] = ngN;

    printf(
        " Test problem: mass-spring system with %d masses and %d controls.\n",
        nx / 2, nu);
    printf("\n");
    printf(
        " MPC problem size: %d states, %d inputs, %d horizon length, %d "
        "two-sided box constraints, %d two-sided general constraints.\n",
        nx, nu, N, nb, ng);
    printf("\n");
    printf("qpDUNES\n");
    printf("\n");

    /************************************************
    * dynamical system
    ************************************************/

    // state space matrices & initial state
    double *A;
    d_zeros(&A, nx, nx);  // states update matrix
    double *B;
    d_zeros(&B, nx, nu);  // inputs matrix
    double *b;
    d_zeros(&b, nx, 1);  // states offset
    double *x0;
    d_zeros(&x0, nx, 1);  // initial state

    // mass-spring system
    double Ts = 0.5;  // sampling time
    mass_spring_system(Ts, nx, nu, A, B, b, x0);

    for (jj = 0; jj < nx; jj++) b[jj] = 0.1;

    for (jj = 0; jj < nx; jj++) x0[jj] = 0;
    x0[0] = 2.5;
    x0[1] = 2.5;

    //    d_print_mat(nx, nx, A, nx);
    //    d_print_mat(nx, nu, B, nx);
    //    d_print_mat(nx, 1, b, nx);
    //    d_print_mat(nx, 1, x0, nx);

#if defined(ELIMINATE_X0)
    // compute b0 = b + A*x0
    double *b0;
    d_zeros(&b0, nx, 1);
    dcopy_3l(nx, b, 1, b0, 1);
    dgemv_n_3l(nx, nx, A, nx, x0, b0);
    //    d_print_mat(nx, 1, b, nx);
    //    d_print_mat(nx, 1, b0, nx);

    // then A0 is a matrix of size 0x0
    double *A0;
    d_zeros(&A0, 0, 0);
#endif

    /************************************************
    * box constraints
    ************************************************/

#if defined (FLIP_BOUNDS)
    int jj_end;
#endif

    int *idxb0;
    int_zeros(&idxb0, nbb[0], 1);
    double *lb0;
    d_zeros(&lb0, nbb[0], 1);
    double *ub0;
    d_zeros(&ub0, nbb[0], 1);
#if defined(ELIMINATE_X0)
    for (jj = 0; jj < nbb[0]; jj++) {
        lb0[jj] = - 0.5;  // umin
        ub0[jj] = + 0.5;  // umin
        idxb0[jj] = jj;
    }
#else
#if defined (FLIP_BOUNDS)
jj_end = nbu < nbb[0] ? nbu : nbb[0];
for (jj = 0; jj < jj_end; jj++) {
    lb0[jj] = - 0.5;  // umin
    ub0[jj] = + 0.5;  // umax
    idxb0[jj] = jj;
}
for ( ; jj < nbb[0]; jj++) {
    lb0[jj] = x0[jj-nbu];  // initial state
    ub0[jj] = x0[jj-nbu];  // initial state
    idxb0[jj] = jj;
}
#else
for (jj = 0; jj < nxx[0]; jj++) {
    lb0[jj] = x0[jj];  // initial state
    ub0[jj] = x0[jj];  // initial state
    idxb0[jj] = jj;
}
for (jj = 0; jj < nbu; jj++) {
    lb0[jj+nxx[0]] = -0.5;  // umin
    ub0[jj+nxx[0]] = 0.5;   // umax
    idxb0[jj+nxx[0]] = nxx[0]+jj;
}
#endif
#endif
    //    int_print_mat(nbb[0], 1, idxb0, nbb[0]);
    //    d_print_mat(nbb[0], 1, lb0, nbb[0]);

    int *idxb1;
    int_zeros(&idxb1, nbb[1], 1);
    double *lb1;
    d_zeros(&lb1, nbb[1], 1);
    double *ub1;
    d_zeros(&ub1, nbb[1], 1);
#if defined (FLIP_BOUNDS)
    jj_end = nbu < nbb[1] ? nbu : nbb[1];
    for (jj = 0; jj < jj_end; jj++) {
        lb1[jj] = - 0.5;  // umin
        ub1[jj] = + 0.5;  // umax
        idxb1[jj] = jj;
    }
    for ( ; jj < nbb[1]; jj++) {
        lb1[jj] = - 4.0;  // xmin
        ub1[jj] = + 4.0;  // xmax
        idxb1[jj] = jj;
    }
#else
    for (jj = 0; jj < nbx; jj++) {
        lb1[jj] = -4.0;  // xmin
        ub1[jj] = 4.0;   // xmax
        idxb1[jj] = jj;
    }
    for (; jj < nb; jj++) {
        lb1[jj] = -0.5;  // umin
        ub1[jj] = 0.5;   // umax
        idxb1[jj] = jj;
    }
#endif
    //    int_print_mat(nbb[1], 1, idxb1, nbb[1]);
    //    d_print_mat(nbb[1], 1, lb1, nbb[1]);

    int *idxbN;
    int_zeros(&idxbN, nbb[N], 1);
    double *lbN;
    d_zeros(&lbN, nbb[N], 1);
    double *ubN;
    d_zeros(&ubN, nbb[N], 1);
#if defined (FLIP_BOUNDS)
    jj_end = nbu < nbb[N] ? nbu : nbb[N];
    for (jj = 0; jj < jj_end; jj++) {
        lbN[jj] = - 0.5;  // umin
        ubN[jj] = + 0.5;  // umax
        idxbN[jj] = jj;
    }
    for ( ; jj < nbb[N]; jj++) {
        lbN[jj] = - 4.0;  // xmin
        ubN[jj] = + 4.0;  // xmax
        idxbN[jj] = jj;
    }
#else
    for (jj = 0; jj < nbx; jj++) {
        lbN[jj] = -4.0;  // xmin
        ubN[jj] = 4.0;   // xmax
        idxbN[jj] = jj;
    }
#endif
    //    int_print_mat(nbb[N], 1, idxbN, nbb[N]);
    //    d_print_mat(nbb[N], 1, lbN, nbb[N]);

    /************************************************
    * general constraints
    ************************************************/

    double *C;
    d_zeros(&C, ng, nx);
    double *D;
    d_zeros(&D, ng, nu);
    double *lg;
    d_zeros(&lg, ng, 1);
    double *ug;
    d_zeros(&ug, ng, 1);

    double *CN;
    d_zeros(&CN, ngN, nx);
    for (ii = 0; ii < ngN; ii++) CN[ii * (ngN + 1)] = 1.0;
    //    d_print_mat(ngN, nx, CN, ngN);
    double *lgN;
    d_zeros(&lgN, ngN, 1);  // force all states to 0 at the last stage
    double *ugN;
    d_zeros(&ugN, ngN, 1);  // force all states to 0 at the last stage

    /************************************************
    * cost function
    ************************************************/

    double *Q;
    d_zeros(&Q, nx, nx);
    for (ii = 0; ii < nx; ii++) Q[ii * (nx + 1)] = 1.0;

    double *R;
    d_zeros(&R, nu, nu);
    for (ii = 0; ii < nu; ii++) R[ii * (nu + 1)] = 2.0;

    double *S;
    d_zeros(&S, nu, nx);

    double *q;
    d_zeros(&q, nx, 1);
    for (ii = 0; ii < nx; ii++) q[ii] = 0.1;

    double *r;
    d_zeros(&r, nu, 1);
    for (ii = 0; ii < nu; ii++) r[ii] = 0.2;

#if defined(ELIMINATE_X0)
    // Q0 and q0 are matrices of size 0
    double *Q0;
    d_zeros(&Q0, 0, 0);
    double *q0;
    d_zeros(&q0, 0, 1);

    // compute r0 = r + S*x0
    double *r0;
    d_zeros(&r0, nu, 1);
    dcopy_3l(nu, r, 1, r0, 1);
    dgemv_n_3l(nu, nx, S, nu, x0, r0);

    // then S0 is a matrix of size nu x 0
    double *S0;
    d_zeros(&S0, nu, 0);
#endif

    /************************************************
    * problems data
    ************************************************/

    double *hA[N];
    double *hB[N];
    double *hb[N];
    double *hQ[N + 1];
    double *hS[N];
    double *hR[N];
    double *hq[N + 1];
    double *hr[N];
    double *hlb[N + 1];
    double *hub[N + 1];
    int *hidxb[N + 1];
    double *hC[N + 1];
    double *hD[N];
    double *hlg[N + 1];
    double *hug[N + 1];

#if defined(ELIMINATE_X0)
    hA[0] = A0;
    hb[0] = b0;
    hQ[0] = Q0;
    hS[0] = S0;
    hq[0] = q0;
    hr[0] = r0;
#else
    hA[0] = A;
    hb[0] = b;
    hQ[0] = Q;
    hS[0] = S;
    hq[0] = q;
    hr[0] = r;
#endif
    hB[0] = B;
    hR[0] = R;
    hlb[0] = lb0;
    hub[0] = ub0;
    hidxb[0] = idxb0;
    hC[0] = C;
    hD[0] = D;
    hlg[0] = lg;
    hug[0] = ug;
    for (ii = 1; ii < N; ii++) {
        hA[ii] = A;
        hB[ii] = B;
        hb[ii] = b;
        hQ[ii] = Q;
        hS[ii] = S;
        hR[ii] = R;
        hq[ii] = q;
        hr[ii] = r;
        hlb[ii] = lb1;
        hub[ii] = ub1;
        hidxb[ii] = idxb1;
        hC[ii] = C;
        hD[ii] = D;
        hlg[ii] = lg;
        hug[ii] = ug;
    }
    hQ[N] = Q;  // or maybe initialize to the solution of the DARE???
    hq[N] = q;  // or maybe initialize to the solution of the DARE???
    hlb[N] = lbN;
    hub[N] = ubN;
    hidxb[N] = idxbN;
    hC[N] = CN;
    hlg[N] = lgN;
    hug[N] = ugN;

    /************************************************
    * solution
    ************************************************/

    double *hx[N + 1];
    double *hu[N];
    double *hpi[N];
    double *hlam[N+1];
    double *ht[N+1];

    for (ii = 0; ii < N; ii++) {
        d_zeros(&hx[ii], nxx[ii], 1);
        d_zeros(&hu[ii], nuu[ii], 1);
        d_zeros(&hpi[ii], nxx[ii+1], 1);
        d_zeros(&hlam[ii], 2*nbb[ii]+2*ngg[ii], 1);
        d_zeros(&ht[ii], 2*nbb[ii]+2*ngg[ii], 1);
    }
    d_zeros(&hx[N], nxx[N], 1);
    d_zeros(&hlam[N], 2*nbb[N]+2*ngg[N], 1);
    d_zeros(&ht[N], 2*nbb[N]+2*ngg[N], 1);

    /************************************************
    * XXX initial guess
    ************************************************/

    double *hux_in[N+1];
    double *hlam_in[N+1];
    double *ht_in[N+1];

    for (ii = 0; ii <= N; ii++) {
        d_zeros(&hux_in[ii], nuu[ii]+nxx[ii], 1);
        d_zeros(&hlam_in[ii], 2*nbb[ii]+2*ngg[ii], 1);
        d_zeros(&ht_in[ii], 2*nbb[ii]+2*ngg[ii], 1);
    }

    /************************************************
    * create the in and out struct
    ************************************************/

    ocp_qp_in qp_in;
    qp_in.N = N;
    qp_in.nx = (const int *) nxx;
    qp_in.nu = (const int *) nuu;
    qp_in.nb = (const int *) nbb;
    qp_in.nc = (const int *) ngg;
    qp_in.A = (const double **) hA;
    qp_in.B = (const double **) hB;
    qp_in.b = (const double **) hb;
    qp_in.Q = (const double **) hQ;
    qp_in.S = (const double **) hS;
    qp_in.R = (const double **) hR;
    qp_in.q = (const double **) hq;
    qp_in.r = (const double **) hr;
    qp_in.idxb = (const int **) hidxb;
    qp_in.lb = (const double **) hlb;
    qp_in.ub = (const double **) hub;
    qp_in.Cx = (const double **) hC;
    qp_in.Cu = (const double **) hD;
    qp_in.lc = (const double **) hlg;
    qp_in.uc = (const double **) hug;

    ocp_qp_out qp_out;
    qp_out.x = hx;
    qp_out.u = hu;
    qp_out.pi = hpi;
    qp_out.lam = hlam;
    qp_out.t = ht;  // XXX why also the slack variables ???

    /************************************************
    * solver arguments
    ************************************************/

    ocp_qp_qpdunes_args *args = ocp_qp_qpdunes_create_arguments(QPDUNES_LINEAR_MPC);

    /************************************************
    * workspace
    ************************************************/

    ocp_qp_qpdunes_memory *mem = NULL;

    int_t work_space_size = ocp_qp_qpdunes_calculate_workspace_size(&qp_in, args);
    void *work = (void*)malloc(work_space_size);

    /************************************************
    * call the solver
    ************************************************/

    int return_value = 0;

    acados_timer timer;
    acados_tic(&timer);

//  nrep = 1;
    for (rep = 0; rep < nrep; rep++) {
        // NOTE(dimitris): creating memory again to avoid warm start
        mem = ocp_qp_qpdunes_create_memory(&qp_in, args);

        // call the QP OCP solver
        ocp_qp_qpdunes(&qp_in, &qp_out, args, mem, work);
    }

    real_t time = acados_toc(&timer)/nrep;

    if (return_value == ACADOS_SUCCESS)
        printf("\nACADOS status: solution found\n");

    if (return_value == ACADOS_MAXITER)
        printf("\nACADOS status: maximum number of iterations reached\n");

    if (return_value == ACADOS_MINSTEP)
        printf("\nACADOS status: below minimum step size length\n");

    printf("\nu = \n");
    for (ii = 0; ii < N; ii++) d_print_mat(1, nuu[ii], hu[ii], 1);

    printf("\nx = \n");
    for (ii = 0; ii <= N; ii++) d_print_mat(1, nxx[ii], hx[ii], 1);

    printf("\n");
    printf(" Average solution time over %d runs: %5.2e seconds\n", nrep, time);
    printf("\n\n");

    /************************************************
    * free memory
    ************************************************/

    d_free(A);
    d_free(B);
    d_free(b);
    d_free(x0);
    d_free(Q);
    d_free(S);
    d_free(R);
    d_free(q);
    d_free(r);
#if defined(ELIMINATE_X0)
    d_free(A0);
    d_free(b0);
    d_free(Q0);
    d_free(S0);
    d_free(q0);
    d_free(r0);
#endif
    int_free(idxb0);
    d_free(lb0);
    d_free(ub0);
    int_free(idxb1);
    d_free(lb1);
    d_free(ub1);
    int_free(idxbN);
    d_free(lbN);
    d_free(ubN);
    d_free(C);
    d_free(D);
    d_free(lg);
    d_free(ug);
    d_free(CN);
    d_free(lgN);
    d_free(ugN);

    for (ii = 0; ii < N; ii++) {
        d_free(hx[ii]);
        d_free(hu[ii]);
        d_free(hpi[ii]);
        d_free(hlam[ii]);
        d_free(ht[ii]);
    }
    d_free(hx[N]);
    d_free(hlam[N]);
    d_free(ht[N]);

    ocp_qp_qpdunes_free_memory(mem);
    free(work);

    return 0;
}
Beispiel #9
0
/*
 * Test/benchmark driver for the HPMPC interior-point (IP) solvers on a
 * box-constrained MPC problem built from the mass-spring test system.
 *
 * What the program does, in order:
 *   1. prints the license banner and the problem/solver parameters;
 *   2. builds the dynamics (A, B, b, x0) with mass_spring_system() and packs
 *      them into pBAbt;
 *   3. sets up the box-constraint vector db and the cost matrix Q (packed
 *      into pQ);
 *   4. allocates the per-stage arrays handed to the solver;
 *   5. calls the solver once as a warm up, then nrep times while timing,
 *      cycling the first two initial-state entries through the pairs in xx0;
 *   6. prints the average iteration count and solution time, computes the
 *      residuals and optionally prints statistics, solution and residuals;
 *   7. frees every buffer allocated here.
 *
 * Problem sizes and solver options come from macros defined outside this
 * function (NX, NU, NN, NB, NREP, IP, K_MAX, MU_TOL, ALPHA_MIN, COMPUTE_MULT,
 * WARM_START, FREE_X0, PRINTSTAT, PRINTRES, D_MR, D_NCL).
 *
 * Returns 0 unconditionally; the solver status is stored in hpmpc_status but
 * never inspected.
 */
int main()
	{
	
	printf("\n");
	printf("\n");
	printf("\n");
	printf(" HPMPC -- Library for High-Performance implementation of solvers for MPC.\n");
	printf(" Copyright (C) 2014 by Technical University of Denmark. All rights reserved.\n");
	printf("\n");
	printf(" HPMPC is distributed in the hope that it will be useful,\n");
	printf(" but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
	printf(" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n");
	printf(" See the GNU Lesser General Public License for more details.\n");
	printf("\n");
	printf("\n");
	printf("\n");

#if defined(TARGET_X64_AVX2) || defined(TARGET_X64_AVX) || defined(TARGET_X64_SSE3) || defined(TARGET_X86_ATOM) || defined(TARGET_AMD_SSE3)
/*	printf("\nflush subnormals to zero\n");*/
	_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); // flush to zero subnormals !!! works only with one thread !!!
#endif

	int ii, jj, idx;
	
	int rep, nrep=NREP; // nrep: number of timed solver calls

	int nx = NX; // number of states (it has to be even for the mass-spring system test problem)
	int nu = NU; // number of inputs (controllers) (it has to be at least 1 and at most nx/2 for the mass-spring system test problem)
	int N  = NN; // horizon length
	int nb = NB; // number of box constrained inputs and states

	printf(" Test problem: mass-spring system with %d masses and %d controls.\n", nx/2, nu);
	printf("\n");
	printf(" MPC problem size: %d states, %d inputs, %d horizon length, %d two-sided box constraints.\n", nx, nu, N, nb);
	printf("\n");
#if IP == 1
	printf(" IP method parameters: primal-dual IP, double precision, %d maximum iterations, %5.1e exit tolerance in duality measure (edit file test_d_ip_box.c to change them).\n", K_MAX, MU_TOL);
#elif IP == 2
	printf(" IP method parameters: predictor-corrector IP, double precision, %d maximum iterations, %5.1e exit tolerance in duality measure (edit file test_d_ip_box.c to change them).\n", K_MAX, MU_TOL);
#else
	// NOTE: execution continues after this message; the runtime dispatch below then takes the IP!=1 branch
	printf(" Wrong value for IP solver choice: %d\n", IP);
#endif

	// sizes rounded up to multiples of bs (p*), ncl (c*) and nal (a*)
	const int bs = D_MR; //d_get_mr();
	const int ncl = D_NCL;
	const int nal = bs*ncl; // number of doubles per cache line
	
	const int nz = nx+nu+1;
	const int pnz = bs*((nz+bs-1)/bs);
	const int cnz = ncl*((nx+nu+1+ncl-1)/ncl);
	const int cnx = ncl*((nx+ncl-1)/ncl);
	const int pnb = bs*((2*nb+bs-1)/bs); // packed number of box constraints (currently unused, see the "TODO pnb" notes below)
	const int anz = nal*((nz+nal-1)/nal);
	const int anx = nal*((nx+nal-1)/nal);
	const int anb = nal*((2*nb+nal-1)/nal); // cache aligned number of box constraints

	const int pad = (ncl-nx%ncl)%ncl; // packing between BAbtL & P
	const int cnl = cnz<cnx+ncl ? nx+pad+cnx+ncl : nx+pad+cnz; // only used to size the solver work space
	
/************************************************
* dynamical system
************************************************/	

	double *A; d_zeros(&A, nx, nx); // states update matrix

	double *B; d_zeros(&B, nx, nu); // inputs matrix

	double *b; d_zeros(&b, nx, 1); // states offset
	double *x0; d_zeros(&x0, nx, 1); // initial state

	double Ts = 0.5; // sampling time
	mass_spring_system(Ts, nx, nu, N, A, B, b, x0);
	
	// overwrite the offset and the initial state filled in by mass_spring_system()
	for(jj=0; jj<nx; jj++)
		b[jj] = 0.1;
	
	for(jj=0; jj<nx; jj++)
		x0[jj] = 0;
	x0[0] = 3.5;
	x0[1] = 3.5;
	
//	d_print_mat(nx, nx, A, nx);
//	d_print_mat(nx, nu, B, nx);
//	d_print_mat(nx, 1, b, nx);
//	d_print_mat(nx, 1, x0, nx);
	
	/* packed */
/*	double *BAb; d_zeros(&BAb, nx, nz);*/

/*	dmcopy(nx, nu, B, nx, BAb, nx);*/
/*	dmcopy(nx, nx, A, nx, BAb+nu*nx, nx);*/
/*	dmcopy(nx, 1 , b, nx, BAb+(nu+nx)*nx, nx);*/
	
	/* transposed */
/*	double *BAbt; d_zeros_align(&BAbt, pnz, pnz);*/
/*	for(ii=0; ii<nx; ii++)*/
/*		for(jj=0; jj<nz; jj++)*/
/*			{*/
/*			BAbt[jj+pnz*ii] = BAb[ii+nx*jj];*/
/*			}*/

	/* packed into contiguous memory */
	double *pBAbt; d_zeros_align(&pBAbt, pnz, cnx);
/*	d_cvt_mat2pmat(nz, nx, 0, bs, BAbt, pnz, pBAbt, cnx);*/
/*	d_cvt_tran_mat2pmat(nx, nz, 0, bs, BAb, nx, pBAbt, cnx);*/

	// B goes at row offset 0, A at row offset nu, b is written by hand at row nx+nu
	// NOTE(review): presumably d_cvt_tran_mat2pmat stores the transpose in the library's packed format - confirm against the HPMPC sources
	d_cvt_tran_mat2pmat(nx, nu, 0, bs, B, nx, pBAbt, cnx);
	d_cvt_tran_mat2pmat(nx, nx, nu, bs, A, nx, pBAbt+nu/bs*cnx*bs+nu%bs, cnx);
	for (jj = 0; jj<nx; jj++)
		pBAbt[(nx+nu)/bs*cnx*bs+(nx+nu)%bs+jj*bs] = b[jj];

/*	d_print_pmat (nz, nx, bs, pBAbt, cnx);*/
/*	exit(1);*/

/************************************************
* box constraints
************************************************/	

	// first 2*nu entries (inputs) are set to -0.5, the remaining ones up to 2*nb (states) to -4.0
	// NOTE(review): the exact layout/sign convention of db is defined by the solver - confirm there
	double *db; d_zeros_align(&db, 2*nb, 1);
	for(jj=0; jj<2*nu; jj++)
		db[jj] = - 0.5;   // umin
	for(; jj<2*nb; jj++)
		db[jj] = - 4.0;   // xmin

/************************************************
* cost function
************************************************/	

	// column-major pnz x pnz matrix: diagonal 2.0 for the first nu entries and 1.0 for the others,
	// row nx+nu holds the linear term 0.1 (this also overwrites the diagonal entry of that row)
	double *Q; d_zeros_align(&Q, pnz, pnz);
	for(ii=0; ii<nu; ii++) Q[ii*(pnz+1)] = 2.0;
	for(; ii<pnz; ii++) Q[ii*(pnz+1)] = 1.0;
	for(ii=0; ii<nz; ii++) Q[nx+nu+ii*pnz] = 0.1;
/*	Q[(nx+nu)*(pnz+1)] = 1e35; // large enough (not needed any longer) */
	
	/* packed into contiguous memory */
	double *pQ; d_zeros_align(&pQ, pnz, cnz);
	d_cvt_mat2pmat(nz, nz, 0, bs, Q, pnz, pQ, cnz);

/************************************************
* matrices series
************************************************/	

	// per-stage pointer arrays passed to the solver and to the residual routines
	double *(hpQ[N+1]);
	double *(hq[N+1]);
	double *(hux[N+1]);
	double *(hpi[N+1]);
	double *(hlam[N+1]);
	double *(ht[N+1]);
	double *(hpBAbt[N]);
	double *(hdb[N+1]);
	double *(hrb[N]);
	double *(hrq[N+1]);
	double *(hrd[N+1]);

	// every stage, the last one included, is later filled with all pnz*cnz
	// elements of pQ, so each buffer has to be pnz x cnz; allocating the last
	// stage as pnz x pnz would be overrun by that copy whenever cnz > pnz
	for(jj=0; jj<=N; jj++)
		{
		d_zeros_align(&hpQ[jj], pnz, cnz);
		}

	for(jj=0; jj<N; jj++)
		{
		d_zeros_align(&hq[jj], anz, 1);
		d_zeros_align(&hux[jj], anz, 1);
		d_zeros_align(&hpi[jj], anx, 1);
		d_zeros_align(&hlam[jj],anb, 1); // TODO pnb
		d_zeros_align(&ht[jj], anb, 1); // TODO pnb
		hpBAbt[jj] = pBAbt; // dynamics and bounds are shared by all stages
		hdb[jj] = db;
		d_zeros_align(&hrb[jj], anx, 1);
		d_zeros_align(&hrq[jj], anz, 1);
		d_zeros_align(&hrd[jj], anb, 1); // TODO pnb
		}
	d_zeros_align(&hq[N], anz, 1);
	d_zeros_align(&hux[N], anz, 1);
	d_zeros_align(&hpi[N], anx, 1);
	d_zeros_align(&hlam[N], anb, 1); // TODO pnb
	d_zeros_align(&ht[N], anb, 1); // TODO pnb
	hdb[N] = db;
	d_zeros_align(&hrq[N], anz, 1);
	d_zeros_align(&hrd[N], anb, 1); // TODO pnb
	
	// starting guess
	for(jj=0; jj<nx; jj++) hux[0][nu+jj]=x0[jj];

/************************************************
* riccati-like iteration
************************************************/

	double *work; d_zeros_align(&work, (N+1)*(pnz*cnl + 4*anz + 4*anb + 2*anx) + 3*anz, 1); // work space
/*	for(jj=0; jj<( (N+1)*(pnz*cnl + 4*anz + 4*anb + 2*anx) + 3*anz ); jj++) work[jj] = -1.0;*/
	int kk = 0; // actual number of iterations
/*	char prec = PREC; // double/single precision*/
/*	double sp_thr = SP_THR; // threshold to switch between double and single precision*/
	int k_max = K_MAX; // maximum number of iterations in the IP method
	double mu_tol = MU_TOL; // tolerance in the duality measure
	double alpha_min = ALPHA_MIN; // minimum accepted step length
	double sigma[] = {0.4, 0.3, 0.01}; // control primal-dual IP behaviour
	double *stat; d_zeros(&stat, 5, k_max); // stats from the IP routine (5 values per iteration)
	int compute_mult = COMPUTE_MULT;
	int warm_start = WARM_START;
	double mu = -1.0;
	int hpmpc_status; // return value of the last solver call (stored, never checked)
	


	/* initialize the cost function */
	for(ii=0; ii<=N; ii++)
		{
		for(jj=0; jj<pnz*cnz; jj++) hpQ[ii][jj]=pQ[jj];
		}



	// initial states: 10 pairs (x0[0], x0[1]); the timing loop cycles through them
	double xx0[] = {3.5, 3.5, 3.66465, 2.15833, 1.81327, -0.94207, 1.86531, -2.35760, 2.91534, 1.79890, -1.49600, -0.76600, -2.60268, 1.92456, 1.66630, -2.28522, 3.12038, 1.83830, 1.93519, -1.87113};



	/* warm up */

	// initialize states and inputs
	for(ii=0; ii<=N; ii++)
		for(jj=0; jj<nx+nu; jj++)
			hux[ii][jj] = 0;

	hux[0][nu+0] = xx0[0];
	hux[0][nu+1] = xx0[1];

	// call the IP solver (FREE_X0 selects the mpc/mhe_old routine, IP the algorithm variant)
	if(FREE_X0==0)
		{
		if(IP==1)
			hpmpc_status = d_ip_box_mpc(&kk, k_max, mu_tol, alpha_min, warm_start, sigma, stat, nx, nu, N, nb, hpBAbt, hpQ, hdb, hux, compute_mult, hpi, hlam, ht, work);
		else
			hpmpc_status = d_ip2_box_mpc(&kk, k_max, mu_tol, alpha_min, warm_start, sigma, stat, nx, nu, N, nb, hpBAbt, hpQ, hdb, hux, compute_mult, hpi, hlam, ht, work);
		}
	else
		{
		if(IP==1)
			hpmpc_status = d_ip_box_mhe_old(&kk, k_max, mu_tol, alpha_min, warm_start, sigma, stat, nx, nu, N, nb, hpBAbt, hpQ, hdb, hux, compute_mult, hpi, hlam, ht, work);
		else
			hpmpc_status = d_ip2_box_mhe_old(&kk, k_max, mu_tol, alpha_min, warm_start, sigma, stat, nx, nu, N, nb, hpBAbt, hpQ, hdb, hux, compute_mult, hpi, hlam, ht, work);
		}


	int kk_avg = 0; // running sum of the iteration counts, divided by nrep when printed

	/* timing */
	struct timeval tv0, tv1;
	gettimeofday(&tv0, NULL); // start

	for(rep=0; rep<nrep; rep++)
		{

		// pick one of the 10 initial-state pairs stored in xx0
		idx = rep%10;
		x0[0] = xx0[2*idx];
		x0[1] = xx0[2*idx+1];

		// initialize states and inputs
		for(ii=0; ii<=N; ii++)
			for(jj=0; jj<nx+nu; jj++)
				hux[ii][jj] = 0;

		hux[0][nu+0] = xx0[2*idx];
		hux[0][nu+1] = xx0[2*idx+1];

		// call the IP solver
		if(FREE_X0==0)
			{
			if(IP==1)
				hpmpc_status = d_ip_box_mpc(&kk, k_max, mu_tol, alpha_min, warm_start, sigma, stat, nx, nu, N, nb, hpBAbt, hpQ, hdb, hux, compute_mult, hpi, hlam, ht, work);
			else
				hpmpc_status = d_ip2_box_mpc(&kk, k_max, mu_tol, alpha_min, warm_start, sigma, stat, nx, nu, N, nb, hpBAbt, hpQ, hdb, hux, compute_mult, hpi, hlam, ht, work);
			}
		else
			{
			if(IP==1)
				hpmpc_status = d_ip_box_mhe_old(&kk, k_max, mu_tol, alpha_min, warm_start, sigma, stat, nx, nu, N, nb, hpBAbt, hpQ, hdb, hux, compute_mult, hpi, hlam, ht, work);
			else
				hpmpc_status = d_ip2_box_mhe_old(&kk, k_max, mu_tol, alpha_min, warm_start, sigma, stat, nx, nu, N, nb, hpBAbt, hpQ, hdb, hux, compute_mult, hpi, hlam, ht, work);
			}

		kk_avg += kk;

		}
	
	gettimeofday(&tv1, NULL); // stop
	


	// average wall-clock time per solver call, in seconds
	double time = (tv1.tv_sec-tv0.tv_sec)/(nrep+0.0)+(tv1.tv_usec-tv0.tv_usec)/(nrep*1e6);
	
/*	printf("\nnx\tnu\tN\tkernel\n\n");*/
/*	printf("\n%d\t%d\t%d\t%e\n\n", nx, nu, N, time);*/
	
	printf("\n");
	printf(" Average number of iterations over %d runs: %5.1f\n", nrep, kk_avg / (double) nrep);
	printf("\n");
	printf(" Average solution time over %d runs: %5.2e seconds\n", nrep, time);
	printf("\n");



	// restore linear part of cost function (row nx+nu of Q) into the per-stage vectors hq
	for(ii=0; ii<=N; ii++)
		{
		for(jj=0; jj<nx+nu; jj++) hq[ii][jj] = Q[nx+nu+pnz*jj];
		}

	// residuals computation
	if(FREE_X0==0)
		d_res_ip_box_mpc(nx, nu, N, nb, hpBAbt, hpQ, hq, hux, hdb, hpi, hlam, ht, hrq, hrb, hrd, &mu);
	else
		d_res_ip_box_mhe_old(nx, nu, N, nb, hpBAbt, hpQ, hq, hux, hdb, hpi, hlam, ht, hrq, hrb, hrd, &mu);


	if(PRINTSTAT==1)
		{

		printf("\n");
		printf("\n");
		printf(" Print IP statistics of the last run\n");
		printf("\n");

		// one line per iteration; entries 2 and 4 of each record are printed twice (%f and %e)
		for(jj=0; jj<kk; jj++)
			printf("k = %d\tsigma = %f\talpha = %f\tmu = %f\t\tmu = %e\talpha = %f\tmu = %f\tmu = %e\n", jj, stat[5*jj], stat[5*jj+1], stat[5*jj+2], stat[5*jj+2], stat[5*jj+3], stat[5*jj+4], stat[5*jj+4]);
		printf("\n");
		
		}

	if(PRINTRES==1)
		{

		printf("\n");
		printf("\n");
		printf(" Print solution\n");
		printf("\n");

		// the first nu entries of each hux[ii] are the inputs of stage ii
		printf("\nu = \n\n");
		for(ii=0; ii<N; ii++)
			d_print_mat(1, nu, hux[ii], 1);
		
		printf("\nlam = \n\n");
		for(ii=0; ii<=N; ii++)
			d_print_mat(1, 2*nb, hlam[ii], 1);
		
		}

	if(PRINTRES==1 && COMPUTE_MULT==1)
		{
		// print result 
		printf("\n");
		printf("\n");
		printf(" Print residuals\n\n");
		printf("\n");
		printf("\n");
		printf("rq = \n\n");
		if(FREE_X0==0)
			{
			// only the first nu entries are printed for stage 0 in this case
			d_print_mat(1, nu, hrq[0], 1);
			for(ii=1; ii<=N; ii++)
/*				d_print_mat_e(1, nx+nu, hrq[ii], 1);*/
				d_print_mat(1, nx+nu, hrq[ii], 1);
			}
		else
			{
			for(ii=0; ii<=N; ii++)
/*				d_print_mat_e(1, nx+nu, hrq[ii], 1);*/
				d_print_mat(1, nx+nu, hrq[ii], 1);
			}
		printf("\n");
		printf("\n");
		printf("rb = \n\n");
		for(ii=0; ii<N; ii++)
/*			d_print_mat_e(1, nx, hrb[ii], 1);*/
			d_print_mat(1, nx, hrb[ii], 1);
		printf("\n");
		printf("\n");
		printf("rd = \n\n");
		for(ii=0; ii<=N; ii++)
/*			d_print_mat_e(1, 2*nb, hrd[ii], 1);*/
			d_print_mat(1, 2*nb, hrd[ii], 1);
		printf("\n");
		printf("\n");
		printf("mu = %e\n\n", mu);
		
		}

/*	printf("\nnx\tnu\tN\tkernel\n\n");*/
/*	printf("\n%d\t%d\t%d\t%e\n\n", nx, nu, N, time);*/
	
/************************************************
* free memory and return
************************************************/

	free(A);
	free(B);
	free(b);
	free(x0);
/*	free(BAb);*/
/*	free(BAbt);*/
	free(pBAbt);
	free(db);
	free(Q);
	free(pQ);
	free(work);
	free(stat);
	// hpBAbt[] and hdb[] only alias pBAbt and db (freed above), so they are not freed per stage
	for(jj=0; jj<N; jj++)
		{
		free(hpQ[jj]);
		free(hq[jj]);
		free(hux[jj]);
		free(hpi[jj]);
		free(hlam[jj]);
		free(ht[jj]);
		free(hrb[jj]);
		free(hrq[jj]);
		free(hrd[jj]);
		}
	free(hpQ[N]);
	free(hq[N]);
	free(hux[N]);
	free(hpi[N]);
	free(hlam[N]);
	free(ht[N]);
	free(hrq[N]);
	free(hrd[N]);



	return 0;

	}
Beispiel #10
0
int main()
	{
	
	printf("\n");
	printf("\n");
	printf("\n");
	printf(" HPMPC -- Library for High-Performance implementation of solvers for MPC.\n");
	printf(" Copyright (C) 2014-2015 by Technical University of Denmark. All rights reserved.\n");
	printf("\n");
	printf(" HPMPC is distributed in the hope that it will be useful,\n");
	printf(" but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
	printf(" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n");
	printf(" See the GNU Lesser General Public License for more details.\n");
	printf("\n");
	printf("\n");
	printf("\n");
	
#if defined(TARGET_X64_AVX2) || defined(TARGET_X64_AVX) || defined(TARGET_X64_SSE3) || defined(TARGET_X86_ATOM) || defined(TARGET_AMD_SSE3)
	_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); // flush to zero subnormals !!! works only with one thread !!!
#endif

	int ii, jj;
	
	int rep, nrep=1000;//NREP;

	int nx = NX; // number of states (it has to be even for the mass-spring system test problem)
	int nu = NU; // number of inputs (controllers) (it has to be at least 1 and at most nx/2 for the mass-spring system test problem)
	int N  = NN; // horizon lenght
	int nb  = nu+nx; // number of box constrained inputs and states
	int ng  = nx; //4;  // number of general constraints
	int ngN = nx; // number of general constraints at the last stage

# define USE_IPM_RES 1
	
//	int M = 32; // where the equality constraint hold

	int nbu = nu<nb ? nu : nb ;
	int nbx = nb-nu>0 ? nb-nu : 0;

#define KEEP_X0 0

	// stage-wise variant size
	int nx_v[N+1];
#if KEEP_X0
	nx_v[0] = nx;
#else
	nx_v[0] = 0;
#endif
	for(ii=1; ii<=N; ii++)
		nx_v[ii] = nx;

	int nu_v[N+1];
	for(ii=0; ii<N; ii++)
		nu_v[ii] = nu;
	nu_v[N] = 0;

	int nb_v[N+1];
#if KEEP_X0
	nb_v[0] = nb;
#else
	nb_v[0] = nbu;
#endif
	for(ii=1; ii<N; ii++)
		nb_v[ii] = nb;
	nb_v[N] = nbx;

	int ng_v[N+1];
	for(ii=0; ii<N; ii++)
		ng_v[ii] = ng;
	ng_v[N] = ngN;
//	ng_v[M] = nx; // XXX
	



	printf(" Test problem: mass-spring system with %d masses and %d controls.\n", nx/2, nu);
	printf("\n");
	printf(" MPC problem size: %d states, %d inputs, %d horizon length, %d two-sided box constraints, %d two-sided general constraints.\n", nx, nu, N, nb, ng);
	printf("\n");
#if IP == 1
	printf(" IP method parameters: primal-dual IP, double precision, %d maximum iterations, %5.1e exit tolerance in duality measure (edit file test_param.c to change them).\n", K_MAX, MU_TOL);
#elif IP == 2
	printf(" IP method parameters: predictor-corrector IP, double precision, %d maximum iterations, %5.1e exit tolerance in duality measure (edit file test_param.c to change them).\n", K_MAX, MU_TOL);
#else
	printf(" Wrong value for IP solver choice: %d\n", IP);
#endif

	int info = 0;
		
	const int bs  = D_MR; //d_get_mr();
	const int ncl = D_NCL;

	int pnz = (nu+nx+1+bs-1)/bs*bs;
	int pnu = (nu+bs-1)/bs*bs;
	int pnu1 = (nu+1+bs-1)/bs*bs;
	int pnx = (nx+bs-1)/bs*bs;
	int pnx1 = (nx+1+bs-1)/bs*bs;
	int pnux = (nu+nx+bs-1)/bs*bs;
	int cnx = (nx+ncl-1)/ncl*ncl;
	int cnu = (nu+ncl-1)/ncl*ncl;
	int cnux = (nu+nx+ncl-1)/ncl*ncl;

	int pnb_v[N+1]; 
	int png_v[N+1]; 
	int pnx_v[N+1]; 
	int pnz_v[N+1]; 
	int pnux_v[N+1]; 
	int cnx_v[N+1]; 
	int cnux_v[N+1]; 
	int cng_v[N+1]; 

	for(ii=0; ii<N; ii++) 
		{
		pnb_v[ii] = (nb_v[ii]+bs-1)/bs*bs;
		png_v[ii] = (ng_v[ii]+bs-1)/bs*bs;
		pnx_v[ii] = (nx_v[ii]+bs-1)/bs*bs;
		pnz_v[ii] = (nu_v[ii]+nx_v[ii]+1+bs-1)/bs*bs;
		pnux_v[ii] = (nu_v[ii]+nx_v[ii]+bs-1)/bs*bs;
		cnx_v[ii] = (nx_v[ii]+ncl-1)/ncl*ncl;
		cnux_v[ii] = (nu_v[ii]+nx_v[ii]+ncl-1)/ncl*ncl;
		cng_v[ii] = (ng_v[ii]+ncl-1)/ncl*ncl;
		}
	ii = N;
	pnb_v[ii] = (nb_v[ii]+bs-1)/bs*bs;
	png_v[ii] = (ng_v[ii]+bs-1)/bs*bs;
	pnx_v[ii] = (nx_v[ii]+bs-1)/bs*bs;
	pnz_v[ii] = (nx_v[ii]+1+bs-1)/bs*bs;
	pnux_v[ii] = (nx_v[ii]+bs-1)/bs*bs;
	cnx_v[ii] = (nx_v[ii]+ncl-1)/ncl*ncl;
	cnux_v[ii] = (nx_v[ii]+ncl-1)/ncl*ncl;
	cng_v[ii] = (ng_v[ii]+ncl-1)/ncl*ncl;


/************************************************
* dynamical system
************************************************/	

	double *A; d_zeros(&A, nx, nx); // states update matrix

	double *B; d_zeros(&B, nx, nu); // inputs matrix

	double *b; d_zeros_align(&b, nx, 1); // states offset
	double *x0; d_zeros_align(&x0, nx, 1); // initial state

	double Ts = 0.5; // sampling time
	mass_spring_system(Ts, nx, nu, N, A, B, b, x0);
	
	for(jj=0; jj<nx; jj++)
		b[jj] = 0.1;
	
	for(jj=0; jj<nx; jj++)
		x0[jj] = 0;
	x0[0] = 2.5;
	x0[1] = 2.5;

	double *pA; d_zeros_align(&pA, pnx, cnx);
	d_cvt_mat2pmat(nx, nx, A, nx, 0, pA, cnx);
	double *b0; d_zeros_align(&b0, pnx, 1);
	for(ii=0; ii<nx; ii++) b0[ii] = b[ii];
#if ! KEEP_X0
	dgemv_n_lib(nx, nx, pA, cnx, x0, 1, b0, b0);
#endif

	double *pBAbt0; 
	d_zeros_align(&pBAbt0, pnz_v[0], cnx_v[1]);
	d_cvt_tran_mat2pmat(nx_v[1], nu_v[0], B, nx_v[1], 0, pBAbt0, cnx_v[1]);
	d_cvt_tran_mat2pmat(nx_v[1], nx_v[0], A, nx_v[1], nu_v[0], pBAbt0+nu_v[0]/bs*bs*cnx_v[1]+nu_v[0]%bs, cnx_v[1]);
	d_cvt_tran_mat2pmat(nx_v[1], 1, b0, nx_v[1], nu_v[0]+nx_v[0], pBAbt0+(nu_v[0]+nx_v[0])/bs*bs*cnx_v[1]+(nu_v[0]+nx_v[0])%bs, cnx_v[1]);

	double *pBAbt1; 
	if(N>1)
		{
		d_zeros_align(&pBAbt1, pnz_v[1], cnx_v[2]);
		d_cvt_tran_mat2pmat(nx_v[2], nu_v[1], B, nx_v[2], 0, pBAbt1, cnx_v[2]);
		d_cvt_tran_mat2pmat(nx_v[2], nx_v[1], A, nx_v[2], nu_v[1], pBAbt1+nu_v[1]/bs*bs*cnx_v[2]+nu_v[1]%bs, cnx_v[2]);
		d_cvt_tran_mat2pmat(nx_v[2], 1, b, nx_v[2], nu_v[1]+nx_v[1], pBAbt1+(nu_v[1]+nx_v[1])/bs*bs*cnx_v[2]+(nu_v[1]+nx_v[1])%bs, cnx_v[2]);
		}

#if 0
d_print_pmat(nu_v[0]+nx_v[0]+1, nx_v[1], bs, pBAbt0, cnx_v[1]);
d_print_pmat(nu_v[1]+nx_v[1]+1, nx_v[2], bs, pBAbt1, cnx_v[2]);
exit(2);
#endif

/************************************************
* box & general constraints
************************************************/	

	int *idx0; i_zeros(&idx0, nb_v[0], 1);
	double *d0; d_zeros_align(&d0, 2*pnb_v[0]+2*png_v[0], 1);
#if KEEP_X0
	for(jj=0; jj<nbu; jj++)
		{
		d0[jj]          = - 0.5;   //   umin
		d0[pnb_v[0]+jj] =   0.5;   //   umax
		idx0[jj] = jj;
		}
	for(; jj<nb; jj++)
		{
		d0[jj]          =   x0[jj-nu];   //   xmin
		d0[pnb_v[0]+jj] =   x0[jj-nu];   //   xmax
		idx0[jj] = jj;
		}
#else
	for(jj=0; jj<nbu; jj++)
		{
		d0[jj]          = - 0.5;   //   umin
		d0[pnb_v[0]+jj] =   0.5;   //   umax
		idx0[jj] = jj;
		}
#endif
	for(jj=0; jj<ng_v[0]; jj++)
		{
		d0[2*pnb_v[0]+jj]          = - 100.0;   //   xmin
		d0[2*pnb_v[0]+png_v[0]+jj] =   100.0;   //   xmax
		}
#if 0
	i_print_mat(1, nb_v[0], idx0, 1);
	d_print_mat(1, 2*pnb_v[0]+2*png_v[0], d0, 1);
	exit(2);
#endif

	int *idx1; i_zeros(&idx1, nb_v[1], 1);
	double *d1; d_zeros_align(&d1, 2*pnb_v[1]+2*png_v[1], 1);
	for(jj=0; jj<nbu; jj++)
		{
		d1[jj]          = - 0.5;   //   umin
		d1[pnb_v[1]+jj] =   0.5;   //   umax
		idx1[jj] = jj;
		}
	for(; jj<nb; jj++)
		{
		d1[jj]          = - 10.0;   //   xmin
		d1[pnb_v[1]+jj] =   10.0;   //   xmax
		idx1[jj] = jj;
		}
	for(jj=0; jj<ng_v[1]; jj++)
		{
		d1[2*pnb_v[1]+jj]          = - 100.0;   //   xmin
		d1[2*pnb_v[1]+png_v[1]+jj] =   100.0;   //   xmax
		}
//	i_print_mat(nb, 1, idx1, nb);

	int *idxN; i_zeros(&idxN, nb_v[N], 1);
	double *dN; d_zeros_align(&dN, 2*pnb_v[N]+2*png_v[N], 1);
	for(jj=0; jj<nbx; jj++)
		{
		dN[jj]          = - 10.0;   //   xmin
		dN[pnb_v[N]+jj] =   10.0;   //   xmax
		idxN[jj] = jj;
		}
	for(jj=0; jj<ng_v[N]; jj++)
		{
		dN[2*pnb_v[N]+jj]          = - 0.0;   //   xmin
		dN[2*pnb_v[N]+png_v[N]+jj] =   0.0;   //   xmax
		}
//	d_print_mat(1, 2*pnb+2*png, d, 1);
//	d_print_mat(1, 2*pnb_v[N]+2*png_v[N], dN, 1);
//	exit(1);
	
//	double *dM; d_zeros_align(&dM, 2*pnb_v[M]+2*png_v[M], 1);
//	for(jj=0; jj<nbu; jj++)
//		{
//		dM[jj]          = - 0.5;   //   umin
//		dM[pnb_v[1]+jj] =   0.5;   //   umax
//		}
//	for(; jj<nb; jj++)
//		{
//		dM[jj]          = - 4.0;   //   xmin
//		dM[pnb_v[1]+jj] =   4.0;   //   xmax
//		}
//	for(jj=0; jj<ng_v[M]; jj++)
//		{
//		dM[2*pnb_v[M]+jj]          = - 0.5;   //   xmin
//		dM[2*pnb_v[M]+png_v[M]+jj] = - 0.5;   //   xmax
//		}

	double *C; d_zeros(&C, ng, nx);
	for(ii=0; ii<ng; ii++)
		C[ii*(ng+1)] = 1.0;
	double *D; d_zeros(&D, ng, nu);

	// first stage
	double *pDCt0; d_zeros_align(&pDCt0, pnux_v[0], cng_v[0]);
	// middle stage
	double *DC1; d_zeros(&DC1, ng_v[1], nu_v[1]+nx_v[1]);
	for(jj=0; jj<ng_v[1]; jj++) DC1[jj+(nu_v[1]+jj)*ng_v[1]] = 1.0;
//	d_print_mat(ng_v[1], nu_v[1]+nx_v[1], DC1, ng_v[1]);
	double *pDCt1; d_zeros_align(&pDCt1, pnux_v[1], cng_v[1]);
	d_cvt_tran_mat2pmat(ng_v[1], nu_v[1]+nx_v[1], DC1, ng_v[1], 0, pDCt1, cng_v[1]);
//	d_print_pmat(nu_v[1]+nx_v[1], ng_v[1], bs, pDCt1, cng_v[1]);
//	exit(2);
	// last stage
	double *DCN; d_zeros(&DCN, ng_v[N], nx_v[N]);
	for(jj=0; jj<ng_v[N]; jj++) DCN[jj*(ng_v[N]+1)] = 1.0;
//	d_print_mat(ng_v[N], nx_v[N], DCN, ng_v[N]);
	double *pDCtN; d_zeros_align(&pDCtN, pnx_v[N], cng_v[N]);
	d_cvt_tran_mat2pmat(ng_v[N], nx_v[N], DCN, ng_v[N], 0, pDCtN, cng_v[N]);
//	d_print_pmat(nx_v[N], ng_v[N], bs, pDCtN, cng_v[N]);
	// constrained stage
//	double *DCM; d_zeros(&DCM, ng_v[M], nu_v[M]+nx_v[M]);
//	for(jj=0; jj<ng_v[M]; jj++) DCM[jj+(jj+nu_v[M])*ng_v[M]] = 1.0;
//	d_print_mat(ng_v[M], nu_v[M]+nx_v[M], DCM, ng_v[M]);
//	double *pDCtM; d_zeros_align(&pDCtM, pnux_v[M], cng_v[M]);
//	d_cvt_tran_mat2pmat(ng_v[M], nu_v[M]+nx_v[M], DCM, ng_v[M], 0, pDCtM, cng_v[M]);
//	d_print_pmat(nu_v[M]+nx_v[M], ng_v[M], bs, pDCtM, cng_v[M]);
//	exit(2);

/************************************************
* cost function
************************************************/	
	
	double *Q; d_zeros(&Q, nx, nx);
	for(ii=0; ii<nx; ii++) Q[ii*(nx+1)] = 1.0;

	double *R; d_zeros(&R, nu, nu);
	for(ii=0; ii<nu; ii++) R[ii*(nu+1)] = 2.0;

	double *S; d_zeros(&S, nu, nx); // S=0, so no need to update r0

	double *q; d_zeros(&q, nx, 1);
	for(ii=0; ii<nx; ii++) q[ii] = 0.1;

	double *r; d_zeros(&r, nu, 1);
	for(ii=0; ii<nu; ii++) r[ii] = 0.2;

#if KEEP_X0
	double  *pRSQ0; d_zeros_align(&pRSQ0, pnz, cnux);
	d_cvt_mat2pmat(nu, nu, R, nu, 0, pRSQ0, cnux);
	d_cvt_tran_mat2pmat(nu, nx, S, nu, nu, pRSQ0+nu/bs*bs*cnux+nu%bs, cnux);
	d_cvt_tran_mat2pmat(nu, 1, r, nu, nu+nx, pRSQ0+(nu+nx)/bs*bs*cnux+(nu+nx)%bs, cnux);
	d_cvt_mat2pmat(nx, nx, Q, nx, nu, pRSQ0+nu/bs*bs*cnux+nu%bs+nu*bs, cnux);
	d_cvt_tran_mat2pmat(nx, 1, q, nx, nu+nx, pRSQ0+(nu+nx)/bs*bs*cnux+(nu+nx)%bs+nu*bs, cnux);
//	d_print_pmat(nu+nx+1, nu+nx, bs, pRSQ0, cnux);
	double *rq0; d_zeros_align(&rq0, pnux, 1);
	d_copy_mat(nu, 1, r, nu, rq0, pnux);
	d_copy_mat(nx, 1, q, nx, rq0+nu, pnux);
#else
	double  *pRSQ0; d_zeros_align(&pRSQ0, pnu1, cnu);
	d_cvt_mat2pmat(nu, nu, R, nu, 0, pRSQ0, cnu);
	d_cvt_tran_mat2pmat(nu, 1, r, nu, nu, pRSQ0+nu/bs*bs*cnu+nu%bs, cnu);
//	d_print_pmat(nu+1, nu, bs, pRSQ0, cnu);
	double *rq0; d_zeros_align(&rq0, pnu, 1);
	d_copy_mat(nu, 1, r, nu, rq0, pnu);
#endif

	double  *pRSQ1; d_zeros_align(&pRSQ1, pnz, cnux);
	d_cvt_mat2pmat(nu, nu, R, nu, 0, pRSQ1, cnux);
	d_cvt_tran_mat2pmat(nu, nx, S, nu, nu, pRSQ1+nu/bs*bs*cnux+nu%bs, cnux);
	d_cvt_tran_mat2pmat(nu, 1, r, nu, nu+nx, pRSQ1+(nu+nx)/bs*bs*cnux+(nu+nx)%bs, cnux);
	d_cvt_mat2pmat(nx, nx, Q, nx, nu, pRSQ1+nu/bs*bs*cnux+nu%bs+nu*bs, cnux);
	d_cvt_tran_mat2pmat(nx, 1, q, nx, nu+nx, pRSQ1+(nu+nx)/bs*bs*cnux+(nu+nx)%bs+nu*bs, cnux);
//	d_print_pmat(nu+nx+1, nu+nx, bs, pRSQ1, cnux);
	double *rq1; d_zeros_align(&rq1, pnux, 1);
	d_copy_mat(nu, 1, r, nu, rq1, pnux);
	d_copy_mat(nx, 1, q, nx, rq1+nu, pnux);

	double  *pRSQN; d_zeros_align(&pRSQN, pnx1, cnx);
	d_cvt_mat2pmat(nx, nx, Q, nx, 0, pRSQN, cnx);
	d_cvt_tran_mat2pmat(nx, 1, q, nx, nx, pRSQN+(nx)/bs*bs*cnx+(nx)%bs, cnx);
//	d_print_pmat(nx+1, nx, bs, pRSQN, cnx);
	double *rqN; d_zeros_align(&rqN, pnx, 1);
	d_copy_mat(nx, 1, q, nx, rqN, pnx);


	// maximum element in cost functions
	double mu0 = 2.0;

/************************************************
* high level interface work space
************************************************/	

// Disabled (#if 0) set-up of the buffers taken by the older
// fortran_order_d_ip_mpc_hard_tv / fortran_order_d_solve_kkt_new_rhs_mpc_hard_tv
// calls, which are themselves commented out further down in this function.
// NOTE(review): this region cannot simply be switched back on: lgN, ugN and
// inf_norm_res are declared again, in the same scope, by the enabled
// high-level interface code below.
#if 0
	// dynamics, constraint and cost data copied into single buffers holding
	// one block per stage (presumably d_rep_mat replicates its source N
	// times -- TODO confirm against its definition)
	double *rA; d_zeros(&rA, nx, N*nx);
	d_rep_mat(N, nx, nx, A, nx, rA, nx);

	double *rB; d_zeros(&rB, nx, N*nu);
	d_rep_mat(N, nx, nu, B, nx, rB, nx);

	// rC has room for N+1 blocks; the copies are written starting at offset
	// nx*ng, i.e. after the first block
	double *rC; d_zeros(&rC, ng, (N+1)*nx);
	d_rep_mat(N, ng, nx, C, ng, rC+nx*ng, ng);

	double *CN = DCN;

	double *rD; d_zeros(&rD, ng, N*nu);
	d_rep_mat(N, ng, nu, D, ng, rD, ng);

	double *rb; d_zeros(&rb, nx, N*1);
	d_rep_mat(N, nx, 1, b, nx, rb, nx);

	double *rQ; d_zeros(&rQ, nx, N*nx);
	d_rep_mat(N, nx, nx, Q, nx, rQ, nx);

	// the final-stage cost terms rQf / rqf are plain copies of Q / q
	double *rQf; d_zeros(&rQf, nx, nx);
	d_copy_mat(nx, nx, Q, nx, rQf, nx);

	double *rS; d_zeros(&rS, nu, N*nx);
	d_rep_mat(N, nu, nx, S, nu, rS, nu);

	double *rR; d_zeros(&rR, nu, N*nu);
	d_rep_mat(N, nu, nu, R, nu, rR, nu);

	double *rq; d_zeros(&rq, nx, N);
	d_rep_mat(N, nx, 1, q, nx, rq, nx);

	double *rqf; d_zeros(&rqf, nx, 1);
	d_copy_mat(nx, 1, q, nx, rqf, nx);

	double *rr; d_zeros(&rr, nu, N);
	d_rep_mat(N, nu, 1, r, nu, rr, nu);

	// lower box bounds: read from the start of d1
	double *lb; d_zeros(&lb, nb, 1);
	for(ii=0; ii<nb; ii++)
		lb[ii] = d1[ii];
	double *rlb; d_zeros(&rlb, nb, N+1);
	d_rep_mat(N+1, nb, 1, lb, nb, rlb, nb);
//	d_print_mat(nb, N+1, rlb, nb);

	// lower general bounds: read from d1 at offset 2*pnb_v[1]
	double *lg; d_zeros(&lg, ng, 1);
	for(ii=0; ii<ng; ii++)
		lg[ii] = d1[2*pnb_v[1]+ii];
	double *rlg; d_zeros(&rlg, ng, N);
	d_rep_mat(N, ng, 1, lg, ng, rlg, ng);
//	d_print_mat(ng, N, rlg, ng);

	// lower general bounds of the last stage: read from dN at offset 2*pnb_v[N]
	double *lgN; d_zeros(&lgN, ngN, 1);
	for(ii=0; ii<ngN; ii++)
		lgN[ii] = dN[2*pnb_v[N]+ii];
//	d_print_mat(ngN, 1, lgN, ngN);

	// upper box bounds: read from d1 at offset pnb_v[1]
	double *ub; d_zeros(&ub, nb, 1);
	for(ii=0; ii<nb; ii++)
		ub[ii] = d1[pnb_v[1]+ii];
	double *rub; d_zeros(&rub, nb, N+1);
	d_rep_mat(N+1, nb, 1, ub, nb, rub, nb);
//	d_print_mat(nb, N+1, rub, nb);

	// upper general bounds: read from d1 at offset 2*pnb_v[1]+png_v[1]
	double *ug; d_zeros(&ug, ng, 1);
	for(ii=0; ii<ng; ii++)
		ug[ii] = d1[2*pnb_v[1]+png_v[1]+ii];
	double *rug; d_zeros(&rug, ng, N);
	d_rep_mat(N, ng, 1, ug, ng, rug, ng);
//	d_print_mat(ng, N, rug, ng);

	// upper general bounds of the last stage: read from dN at offset
	// 2*pnb_v[N]+png_v[N]
	double *ugN; d_zeros(&ugN, ngN, 1);
	for(ii=0; ii<ngN; ii++)
		ugN[ii] = dN[2*pnb_v[N]+png_v[N]+ii];
//	d_print_mat(ngN, 1, ugN, ngN);

	// solution buffers; the first nx entries of rx are initialised from x0
	double *rx; d_zeros(&rx, nx, N+1);
	d_copy_mat(nx, 1, x0, nx, rx, nx);

	double *ru; d_zeros(&ru, nu, N);

	double *rpi; d_zeros(&rpi, nx, N);

	// rlam and rt are each sized for N stages with 2*(nb+ng) entries plus
	// one last stage with 2*(nb+ngN) entries
	double *rlam; d_zeros(&rlam, N*2*(nb+ng)+2*(nb+ngN), 1);

	double *rt; d_zeros(&rt, N*2*(nb+ng)+2*(nb+ngN), 1);

	// solver work space, sized in bytes by the library query function
	double *rwork = (double *) malloc(hpmpc_d_ip_mpc_hard_tv_work_space_size_bytes(N, nx, nu, nb, ng, ngN));

	double inf_norm_res[4] = {}; // infinity norm of residuals: rq, rb, rd, mu
#endif

/************************************************
* low level interface work space
************************************************/	

	// Per-stage pointer tables handed to the low-level solver and residual
	// routines further down. Tables with N entries are indexed by stages
	// 0..N-1, tables with N+1 entries also have an entry for the last stage N.
	double *hpBAbt[N];
	double *hpDCt[N+1];
	double *hb[N];
	double *hpRSQ[N+1];
	double *hrq[N+1];
	double *hd[N+1];
	int *idx[N+1];
	double *hux[N+1];
	double *hpi[N];
	double *hlam[N+1];
	double *ht[N+1];
	double *hrb[N];
	double *hrrq[N+1];
	double *hrd[N+1];

	// stage 0: problem data points at the dedicated first-stage buffers
	// (no copy is made); the solution / multiplier / residual vectors are
	// allocated here and released in the free-memory section at the end
	hpBAbt[0] = pBAbt0;
	hpDCt[0] = pDCt0;
	hb[0] = b0;
	hpRSQ[0] = pRSQ0;
	hrq[0] = rq0;
	hd[0] = d0;
	idx[0] = idx0;
	d_zeros_align(&hux[0], pnux_v[0], 1);
	d_zeros_align(&hpi[0], pnx_v[1], 1);
	d_zeros_align(&hlam[0], 2*pnb_v[0]+2*png_v[0], 1);
	d_zeros_align(&ht[0], 2*pnb_v[0]+2*png_v[0], 1);
	d_zeros_align(&hrb[0], pnx_v[1], 1);
	d_zeros_align(&hrrq[0], pnz_v[0], 1);
	d_zeros_align(&hrd[0], 2*pnb_v[0]+2*png_v[0], 1);

	// stages 1..N-1: every stage aliases the same data buffers (pBAbt1,
	// pDCt1, b, pRSQ1, rq1, d1, idx1) but gets its own output vectors
	for(ii=1; ii<N; ii++)
		{
		hpBAbt[ii] = pBAbt1;
//		d_zeros_align(&hpBAbt[ii], pnz_v[ii], cnx_v[ii+1]); for(jj=0; jj<pnz_v[ii]*cnx_v[ii+1]; jj++) hpBAbt[ii][jj] = pBAbt1[jj];
		hpDCt[ii] = pDCt1;
		hb[ii] = b;
		hpRSQ[ii] = pRSQ1;
//		d_zeros_align(&hpRSQ[ii], pnz_v[ii], cnux_v[ii]); for(jj=0; jj<pnz_v[ii]*cnux_v[ii]; jj++) hpRSQ[ii][jj] = pRSQ1[jj];
		hrq[ii] = rq1;
		hd[ii] = d1;
		idx[ii] = idx1;
		d_zeros_align(&hux[ii], pnux_v[ii], 1);
		d_zeros_align(&hpi[ii], pnx_v[ii+1], 1);
		d_zeros_align(&hlam[ii], 2*pnb_v[ii]+2*png_v[ii], 1);
		d_zeros_align(&ht[ii], 2*pnb_v[ii]+2*png_v[ii], 1);
		d_zeros_align(&hrb[ii], pnx_v[ii+1], 1);
		d_zeros_align(&hrrq[ii], pnz_v[ii], 1);
		d_zeros_align(&hrd[ii], 2*pnb_v[ii]+2*png_v[ii], 1);
		}

	// last stage N: there is no hpBAbt / hb / hpi / hrb entry for this stage
	hpDCt[N] = pDCtN;
	hpRSQ[N] = pRSQN;
	hrq[N] = rqN;
	hd[N] = dN;
	idx[N] = idxN;
	// NOTE(review): sized with pnx here while stages 0..N-1 use pnux_v[ii];
	// confirm that pnx covers the nu_v[N]+nx_v[N] entries zeroed and printed
	// for this stage further down
	d_zeros_align(&hux[N], pnx, 1);
	d_zeros_align(&hlam[N], 2*pnb_v[N]+2*png_v[N], 1);
	d_zeros_align(&ht[N], 2*pnb_v[N]+2*png_v[N], 1);
	d_zeros_align(&hrrq[N], pnz_v[N], 1);
	d_zeros_align(&hrd[N], 2*pnb_v[N]+2*png_v[N], 1);

//	hpDCt[M] = pDCtM;
//	hd[M] = dM;

	// written through &mu by d_res_mpc_hard_tv in the residual computation
	double mu = 0.0;

	// solver work space: the size query returns bytes, d_zeros_align is
	// called with a count of doubles, hence the division by sizeof(double)
#if USE_IPM_RES
	double *work; d_zeros_align(&work, d_ip2_res_mpc_hard_tv_work_space_size_bytes(N, nx_v, nu_v, nb_v, ng_v)/sizeof(double), 1);
#else
	double *work; d_zeros_align(&work, d_ip2_mpc_hard_tv_work_space_size_bytes(N, nx_v, nu_v, nb_v, ng_v)/sizeof(double), 1);
#endif

/************************************************
* (new) high level interface work space
************************************************/	

	// box constraints
	double *lb0; d_zeros(&lb0, nb_v[0], 1);
	for(ii=0; ii<nb_v[0]; ii++)
		lb0[ii] = d0[ii];
	double *ub0; d_zeros(&ub0, nb_v[0], 1);
	for(ii=0; ii<nb_v[0]; ii++)
		ub0[ii] = d0[pnb_v[0]+ii];
	double *lb1; d_zeros(&lb1, nb_v[1], 1);
	for(ii=0; ii<nb_v[1]; ii++)
		lb1[ii] = d1[ii];
	double *ub1; d_zeros(&ub1, nb_v[1], 1);
	for(ii=0; ii<nb_v[1]; ii++)
		ub1[ii] = d1[pnb_v[1]+ii];
	double *lbN; d_zeros(&lbN, nb_v[N], 1);
	for(ii=0; ii<nb_v[N]; ii++)
		lbN[ii] = dN[ii];
	double *ubN; d_zeros(&ubN, nb_v[N], 1);
	for(ii=0; ii<nb_v[N]; ii++)
		ubN[ii] = dN[pnb_v[N]+ii];

	// general constraints
	double *lg0; d_zeros(&lg0, ng_v[0], 1);
	for(ii=0; ii<ng_v[0]; ii++)
		lg0[ii] = d0[2*pnb_v[0]+ii];
	double *ug0; d_zeros(&ug0, ng_v[0], 1);
	for(ii=0; ii<ng_v[0]; ii++)
		ug0[ii] = d0[2*pnb_v[0]+png_v[0]+ii];
	double *lg1; d_zeros(&lg1, ng_v[1], 1);
	for(ii=0; ii<ng_v[1]; ii++)
		lg1[ii] = d1[2*pnb_v[1]+ii];
	double *ug1; d_zeros(&ug1, ng_v[1], 1);
	for(ii=0; ii<ng_v[1]; ii++)
		ug1[ii] = d1[2*pnb_v[1]+png_v[1]+ii];
	double *lgN; d_zeros(&lgN, ng_v[N], 1);
	for(ii=0; ii<ng_v[N]; ii++)
		lgN[ii] = dN[2*pnb_v[N]+ii];
	double *ugN; d_zeros(&ugN, ng_v[N], 1);
	for(ii=0; ii<ng_v[N]; ii++)
		ugN[ii] = dN[2*pnb_v[N]+png_v[N]+ii];

	// data matrices
	double *hA[N];
	double *hB[N];
	double *hC[N+1];
	double *hD[N];
	double *hQ[N+1];
	double *hS[N];
	double *hR[N];
	double *hq[N+1];
	double *hr[N];
	double *hlb[N+1];
	double *hub[N+1];
	double *hlg[N+1];
	double *hug[N+1];
	double *hx[N+1];
	double *hu[N];
	double *hpi1[N];
	double *hlam1[N+1];
	double *ht1[N+1];
	double inf_norm_res[4] = {}; // infinity norm of residuals: rq, rb, rd, mu

	ii = 0;
	hA[0] = A;
	hB[0] = B;
	hC[0] = C;
	hD[0] = D;
	hQ[0] = Q;
	hS[0] = S;
	hR[0] = R;
	hq[0] = q;
	hr[0] = r;
	hlb[0] = lb0;
	hub[0] = ub0;
	hlg[0] = lg0;
	hug[0] = ug0;
	d_zeros(&hx[0], nx_v[0], 1);
	d_zeros(&hu[0], nu_v[0], 1);
	d_zeros(&hpi1[0], nx_v[1], 1);
	d_zeros(&hlam1[0], 2*nb_v[0]+2*ng_v[0], 1);
	d_zeros(&ht1[0], 2*nb_v[0]+2*ng_v[0], 1);
	for(ii=1; ii<N; ii++)
		{
		hA[ii] = A;
		hB[ii] = B;
		hC[ii] = C;
		hD[ii] = D;
		hQ[ii] = Q;
		hS[ii] = S;
		hR[ii] = R;
		hq[ii] = q;
		hr[ii] = r;
		hlb[ii] = lb1;
		hub[ii] = ub1;
		hlg[ii] = lg1;
		hug[ii] = ug1;
		d_zeros(&hx[ii], nx_v[ii], 1);
		d_zeros(&hu[ii], nu_v[ii], 1);
		d_zeros(&hpi1[ii], nx_v[ii+1], 1);
		d_zeros(&hlam1[ii], 2*nb_v[ii]+2*ng_v[ii], 1);
		d_zeros(&ht1[ii], 2*nb_v[ii]+2*ng_v[ii], 1);
		}
	ii = N;
	hC[N] = C;
	hQ[N] = Q;
	hq[N] = q;
	hlb[N] = lbN;
	hub[N] = ubN;
	hlg[N] = lgN;
	hug[N] = ugN;
	d_zeros(&hx[N], nx_v[N], 1);
	d_zeros(&hlam1[N], 2*nb_v[N]+2*ng_v[N], 1);
	d_zeros(&ht1[N], 2*nb_v[N]+2*ng_v[N], 1);

	// work space
#if 0
	printf("work space in bytes: %d\n", hpmpc_d_ip_ocp_hard_tv_work_space_size_bytes(N, nx_v, nu_v, nb_v, ng_v));
	exit(3);
#endif
	void *work1 = malloc(hpmpc_d_ip_ocp_hard_tv_work_space_size_bytes(N, nx_v, nu_v, nb_v, ng_v));
	double *ptr_work1 = (double *) work1;

/************************************************
* solvers common stuff
************************************************/	

	// Settings and scratch variables shared by the high-level and the
	// low-level solver calls below.

	// receives the return value of the IPM solver calls; it is not read
	// again in the remainder of this function
	int hpmpc_status;
	// kk: iteration count written by the solver through &kk;
	// kk_avg: sum of kk over the nrep repetitions, used for the average
	int kk, kk_avg;
	int k_max = 10; // passed to the solvers; also sizes stat below
	double mu_tol = 1e-20; // passed to the solvers
	double alpha_min = 1e-8; // passed to the low-level solver only
	// NOTE(review): original note was "read initial guess from x and u";
	// the flag is set to 0 here, which presumably disables that -- confirm
	int warm_start = 0;
	// per-iteration statistics, 5 values per iteration (printed as
	// stat[5*jj] .. stat[5*jj+4] after each solve)
	double *stat; d_zeros(&stat, k_max, 5);
	int compute_res = 1; // enables the residual computation and print-out of the low-level section
	int compute_mult = 1; // passed to the low-level solver calls

	// time stamps: tv0/tv1 bracket the IPM solves, tv2/tv3 the low-level
	// "resolve final kkt" loop
	struct timeval tv0, tv1, tv2, tv3;
	double time;

	// NOTE(review): not referenced anywhere in the remainder of this function
	double **dummy;

/************************************************
* call the solver (high-level interface)
************************************************/	

// Benchmark of the high-level interface: nrep timed calls of the IPM solver,
// print-out of the solution, then nrep timed calls of the "new right hand
// side" KKT re-solve on the same data.
#if 1
	// only referenced by the commented-out (older interface) calls below
	int time_invariant = 0; // time-invariance flag of the older interface
	int free_x0 = 0; // x0-as-optimization-variable flag of the older interface

	gettimeofday(&tv0, NULL); // start

	kk_avg = 0;

	for(rep=0; rep<nrep; rep++)
		{

//		hpmpc_status = fortran_order_d_ip_mpc_hard_tv(&kk, k_max, mu0, mu_tol, N, nx, nu, nb, ng, ngN, time_invariant, free_x0, warm_start, rA, rB, rb, rQ, rQf, rS, rR, rq, rqf, rr, rlb, rub, rC, rD, rlg, rug, CN, lgN, ugN, rx, ru, rpi, rlam, rt, inf_norm_res, rwork, stat);
		hpmpc_status = fortran_order_d_ip_ocp_hard_tv(&kk, k_max, mu0, mu_tol, N, nx_v, nu_v, nb_v, ng_v, warm_start, hA, hB, hb, hQ, hS, hR, hq, hr, hlb, hub, hC, hD, hlg, hug, hx, hu, hpi1, hlam1, ht1, inf_norm_res, work1, stat);

		kk_avg += kk;

		}
	
	gettimeofday(&tv1, NULL); // stop

	printf("\nsolution from high-level interface\n\n");
//	d_print_mat(nx, N+1, rx, nx);
//	d_print_mat(nu, N, ru, nu);
	for(ii=0; ii<=N; ii++)
		d_print_mat(1, nx_v[ii], hx[ii], 1);
	for(ii=0; ii<N; ii++)
		d_print_mat(1, nu_v[ii], hu[ii], 1);

	printf("\ninfinity norm of residuals\n\n");
	d_print_mat_e(1, 4, inf_norm_res, 1);

	// average wall-clock time per call, in seconds
	time = (tv1.tv_sec-tv0.tv_sec)/(nrep+0.0)+(tv1.tv_usec-tv0.tv_usec)/(nrep*1e6);

	// one line per iteration of the last run; stat[5*jj+2] and stat[5*jj+4]
	// are each printed twice, once as %f and once as %e
	printf("\nstatistics from last run\n\n");
	for(jj=0; jj<kk; jj++)
		printf("k = %d\tsigma = %f\talpha = %f\tmu = %f\t\tmu = %e\talpha = %f\tmu = %f\tmu = %e\n", jj, stat[5*jj], stat[5*jj+1], stat[5*jj+2], stat[5*jj+2], stat[5*jj+3], stat[5*jj+4], stat[5*jj+4]);
	printf("\n");
	
	printf("\n");
	printf(" Average number of iterations over %d runs: %5.1f\n", nrep, kk_avg / (double) nrep);
	printf(" Average solution time over %d runs: %5.2e seconds\n", nrep, time);
	printf("\n\n");

	gettimeofday(&tv0, NULL); // start

	kk_avg = 0;

	for(rep=0; rep<nrep; rep++)
		{

//		fortran_order_d_solve_kkt_new_rhs_mpc_hard_tv(N, nx, nu, nb, ng, ngN, time_invariant, free_x0, rA, rB, rb, rQ, rQf, rS, rR, rq, rqf, rr, rlb, rub, rC, rD, rlg, rug, CN, lgN, ugN, rx, ru, rpi, rlam, rt, inf_norm_res, rwork);
		fortran_order_d_solve_kkt_new_rhs_ocp_hard_tv(N, nx_v, nu_v, nb_v, ng_v, hA, hB, hb, hQ, hS, hR, hq, hr, hlb, hub, hC, hD, hlg, hug, hx, hu, hpi1, hlam1, ht1, inf_norm_res, work1);

		// NOTE(review): kk is not an argument of the call above, so this
		// keeps adding the iteration count of the last IPM run; kk_avg is
		// reset again before it is next printed
		kk_avg += kk;

		}
	
	gettimeofday(&tv1, NULL); // stop

	printf("\nsolution from high-level interface (resolve final kkt)\n\n");
//	d_print_mat(nx, N+1, rx, nx);
//	d_print_mat(nu, N, ru, nu);
	for(ii=0; ii<=N; ii++)
		d_print_mat(1, nx_v[ii], hx[ii], 1);
	for(ii=0; ii<N; ii++)
		d_print_mat(1, nu_v[ii], hu[ii], 1);

	printf("\ninfinity norm of residuals\n\n");
	d_print_mat_e(1, 4, inf_norm_res, 1);

	// average wall-clock time per call, in seconds
	time = (tv1.tv_sec-tv0.tv_sec)/(nrep+0.0)+(tv1.tv_usec-tv0.tv_usec)/(nrep*1e6);

	printf(" Average solution time over %d runs: %5.2e seconds\n", nrep, time);
#endif

/************************************************
* call the solver (low-level interface)
************************************************/	

//	for(ii=0; ii<N; ii++)
//		d_print_pmat(nu_v[ii]+nx_v[ii]+1, nx_v[ii+1], bs, hpBAbt[ii], cnx_v[ii+1]);
//	exit(3);

	// Benchmark of the low-level interface: nrep timed calls of the IPM
	// solver (tv0..tv1), print-out of solution and residuals, then nrep timed
	// calls of the "new right hand side" KKT re-solve (tv2..tv3) followed by
	// the same print-outs and the timing summary.
	gettimeofday(&tv0, NULL); // start

	kk_avg = 0;

	printf("\nsolution...\n");
	for(rep=0; rep<nrep; rep++)
		{

#if USE_IPM_RES
		hpmpc_status = d_ip2_res_mpc_hard_tv(&kk, k_max, mu0, mu_tol, alpha_min, warm_start, stat, N, nx_v, nu_v, nb_v, idx, ng_v, hpBAbt, hpRSQ, hpDCt, hd, hux, compute_mult, hpi, hlam, ht, work);
#else
		hpmpc_status = d_ip2_mpc_hard_tv(&kk, k_max, mu0, mu_tol, alpha_min, warm_start, stat, N, nx_v, nu_v, nb_v, idx, ng_v, hpBAbt, hpRSQ, hpDCt, hd, hux, compute_mult, hpi, hlam, ht, work);
#endif
		
		kk_avg += kk;

		}
	printf("\ndone\n");

	gettimeofday(&tv1, NULL); // stop

	printf("\nsolution from low-level interface (original problem)\n\n");
	printf("\nux\n\n");
	for(ii=0; ii<=N; ii++)
		d_print_mat(1, nu_v[ii]+nx_v[ii], hux[ii], 1);
	printf("\npi\n\n");
	for(ii=0; ii<N; ii++)
		d_print_mat(1, nx_v[ii+1], hpi[ii], 1);
//	printf("\nux\n\n");
//	for(ii=0; ii<=N; ii++)
//		d_print_mat(1, 2*pnb_v[ii]+2*png_v[ii], hlam[ii], 1);
//	printf("\nux\n\n");
//	for(ii=0; ii<=N; ii++)
//		d_print_mat(1, 2*pnb_v[ii]+2*png_v[ii], ht[ii], 1);
	
	// residuals
	if(compute_res)
		{
		// compute residuals
		d_res_mpc_hard_tv(N, nx_v, nu_v, nb_v, idx, ng_v, hpBAbt, hb, hpRSQ, hrq, hux, hpDCt, hd, hpi, hlam, ht, hrrq, hrb, hrd, &mu);

		// print residuals
		printf("\nhrrq\n\n");
		for(ii=0; ii<=N; ii++)
			d_print_mat_e(1, nu_v[ii]+nx_v[ii], hrrq[ii], 1);

		printf("\nhrb\n\n");
		for(ii=0; ii<N; ii++)
			d_print_mat_e(1, nx_v[ii+1], hrb[ii], 1);

		// only the first nb_v[ii] entries of hrd[ii] are printed here
		printf("\nhrd low\n\n");
		for(ii=0; ii<=N; ii++)
			d_print_mat_e(1, nb_v[ii], hrd[ii], 1);

		// nb_v[ii] entries starting at offset pnb_v[ii]
		printf("\nhrd up\n\n");
		for(ii=0; ii<=N; ii++)
			d_print_mat_e(1, nb_v[ii], hrd[ii]+pnb_v[ii], 1);

		}



	// zero the solution again
	for(ii=0; ii<=N; ii++)
		for(jj=0; jj<nu_v[ii]+nx_v[ii]; jj++) hux[ii][jj] = 0.0;

	// modify constraints
#if 0
	for(jj=0; jj<nbx; jj++)
		{
		dN[jj]          = - 4.0;   //   xmin
		dN[pnb_v[N]+jj] =   4.0;   //   xmax
		idxN[jj] = jj;
		}
	for(jj=0; jj<ng_v[N]; jj++)
		{
		dN[2*pnb_v[N]+jj]          =   0.1;   //   xmin
		dN[2*pnb_v[N]+png_v[N]+jj] =   0.1;   //   xmax
		}
#endif

#if 0
for(ii=0; ii<=N; ii++)
	d_print_pmat(nu_v[ii]+nx_v[ii]+1, nu_v[ii]+nx_v[ii], bs, hpRSQ[ii], cnux_v[ii]);
for(ii=0; ii<=N; ii++)
	d_print_mat(1, nu_v[ii]+nx_v[ii], hrq[ii], 1);
exit(1);
#endif

	gettimeofday(&tv2, NULL); // start

	printf("\nsolution...\n");
	for(rep=0; rep<nrep; rep++)
		{

#if USE_IPM_RES
		d_kkt_solve_new_rhs_res_mpc_hard_tv(N, nx_v, nu_v, nb_v, idx, ng_v, hpBAbt, hb, hpRSQ, hrq, hpDCt, hd, hux, compute_mult, hpi, hlam, ht, work);
#else
		d_kkt_solve_new_rhs_mpc_hard_tv(N, nx_v, nu_v, nb_v, idx, ng_v, hpBAbt, hb, hpRSQ, hrq, hpDCt, hd, hux, compute_mult, hpi, hlam, ht, work);
#endif

		}
	printf("\ndone\n");

	gettimeofday(&tv3, NULL); // stop

	printf("\nsolution from low-level interface (resolve final kkt)\n\n");
	printf("\nux\n\n");
	for(ii=0; ii<=N; ii++)
		d_print_mat(1, nu_v[ii]+nx_v[ii], hux[ii], 1);
	printf("\npi\n\n");
	for(ii=0; ii<N; ii++)
		d_print_mat(1, nx_v[ii+1], hpi[ii], 1);
//	printf("\nux\n\n");
//	for(ii=0; ii<=N; ii++)
//		d_print_mat(1, 2*pnb_v[ii]+2*png_v[ii], hlam[ii], 1);
//	printf("\nux\n\n");
//	for(ii=0; ii<=N; ii++)
//		d_print_mat(1, 2*pnb_v[ii]+2*png_v[ii], ht[ii], 1);

	// residuals
	if(compute_res)
		{
		// compute residuals
		d_res_mpc_hard_tv(N, nx_v, nu_v, nb_v, idx, ng_v, hpBAbt, hb, hpRSQ, hrq, hux, hpDCt, hd, hpi, hlam, ht, hrrq, hrb, hrd, &mu);

		// print residuals
		printf("\nhrrq\n\n");
		for(ii=0; ii<=N; ii++)
			d_print_mat_e(1, nu_v[ii]+nx_v[ii], hrrq[ii], 1);

		printf("\nhrb\n\n");
		for(ii=0; ii<N; ii++)
			d_print_mat_e(1, nx_v[ii+1], hrb[ii], 1);

		// only the first nb_v[ii] entries of hrd[ii] are printed here
		printf("\nhrd low\n\n");
		for(ii=0; ii<=N; ii++)
			d_print_mat_e(1, nb_v[ii], hrd[ii], 1);

		// nb_v[ii] entries starting at offset pnb_v[ii]
		printf("\nhrd up\n\n");
		for(ii=0; ii<=N; ii++)
			d_print_mat_e(1, nb_v[ii], hrd[ii]+pnb_v[ii], 1);

		}

	// average wall-clock time per call, in seconds: tv0..tv1 bracket the
	// IPM loop, tv2..tv3 the KKT re-solve loop
	double time_ipm = (tv1.tv_sec-tv0.tv_sec)/(nrep+0.0)+(tv1.tv_usec-tv0.tv_usec)/(nrep*1e6);
	double time_final = (tv3.tv_sec-tv2.tv_sec)/(nrep+0.0)+(tv3.tv_usec-tv2.tv_usec)/(nrep*1e6);

	// one line per iteration of the last IPM run; stat[5*jj+2] and
	// stat[5*jj+4] are each printed twice, once as %f and once as %e
	printf("\nstatistics from last run\n\n");
	for(jj=0; jj<kk; jj++)
		printf("k = %d\tsigma = %f\talpha = %f\tmu = %f\t\tmu = %e\talpha = %f\tmu = %f\tmu = %e\n", jj, stat[5*jj], stat[5*jj+1], stat[5*jj+2], stat[5*jj+2], stat[5*jj+3], stat[5*jj+4], stat[5*jj+4]);
	printf("\n");
	
	printf("\n");
	printf(" Average number of iterations over %d runs: %5.1f\n", nrep, kk_avg / (double) nrep);
	printf(" Average solution time over %d runs: %5.2e seconds (IPM)\n", nrep, time_ipm);
	printf(" Average solution time over %d runs: %5.2e seconds (resolve final kkt)\n", nrep, time_final);
	printf("\n\n");

/************************************************
* compute residuals
************************************************/	

/************************************************
* free memory
************************************************/	

	// Release everything allocated in this function. Buffers obtained from
	// d_zeros / malloc are released with free(), buffers obtained from
	// d_zeros_align with d_free_align(). The per-stage pointer tables
	// themselves are automatic arrays; only the vectors allocated per stage
	// are released, never the shared data buffers they alias.

	// problem data
	free(A);
	free(B);
	d_free_align(b);
	d_free_align(x0);
	free(C);
	free(D);
	free(Q);
	free(S);
	free(R);
	free(q);
	free(r);

	// low level interface
	d_free_align(pA);
	d_free_align(b0);
	d_free_align(pBAbt0);
	d_free_align(pBAbt1);
	d_free_align(d0);
	d_free_align(d1);
	d_free_align(dN);
	d_free_align(pDCt0);
	d_free_align(pDCt1);
	free(DCN);
	d_free_align(pDCtN);
	free(idx0);
	free(idx1);
	free(idxN);
	d_free_align(pRSQ0);
	d_free_align(pRSQ1);
	d_free_align(pRSQN);
	d_free_align(rq0);
	d_free_align(rq1);
	d_free_align(rqN);
	d_free_align(work);
	free(stat);
	// stages 0..N-1 own all seven vectors; the last stage N has no hpi / hrb
	for(ii=0; ii<N; ii++)
		{
		d_free_align(hux[ii]);
		d_free_align(hpi[ii]);
		d_free_align(hlam[ii]);
		d_free_align(ht[ii]);
		d_free_align(hrb[ii]);
		d_free_align(hrrq[ii]);
		d_free_align(hrd[ii]);
		}
	d_free_align(hux[N]);
	d_free_align(hlam[N]);
	d_free_align(ht[N]);
	d_free_align(hrrq[N]);
	d_free_align(hrd[N]);
	
	// counterpart of the disabled (#if 0) older high-level work space
#if 0
	// high level interface
	free(rA);
	free(rB);
	free(rC);
	free(rD);
	free(rb);
	free(rQ);
	free(rQf);
	free(rS);
	free(rR);
	free(rq);
	free(rqf);
	free(rr);
	free(lb);
	free(rlb);
	free(lg);
	free(rlg);
	free(lgN);
	free(ub);
	free(rub);
	free(ug);
	free(rug);
	free(ugN);
	free(rx);
	free(ru);
	free(rpi);
	free(rlam);
	free(rt);
	free(rwork);
#endif
	
	// new high level interface
	free(lb0);
	free(ub0);
	free(lb1);
	free(ub1);
	free(lbN);
	free(ubN);
	free(lg0);
	free(ug0);
	free(lg1);
	free(ug1);
	// last-stage general bounds: allocated with d_zeros next to the vectors
	// above, but previously released only inside the disabled region
	free(lgN);
	free(ugN);
	free(work1);
	// stages 0..N-1 own all five vectors; the last stage N has no hu / hpi1
	for(ii=0; ii<N; ii++)
		{
		free(hx[ii]);
		free(hu[ii]);
		free(hpi1[ii]);
		free(hlam1[ii]);
		free(ht1[ii]);
		}
	free(hx[N]);
	free(hlam1[N]);
	free(ht1[N]);

	return 0;
	
	}