Exemplo n.º 1
0
// Fills the residual vectors hrq[0..N] and hrb[0..N-1] of an MPC problem in
// which every stage has the same number of states (nx) and inputs (nu).
//
//   hpBAbt, hpQ : per-stage matrices in the library's packed layout
//   hq          : per-stage linear cost term
//   hux         : per-stage vector, inputs first and states from offset nu
//   hpi         : per-stage multipliers, read at stage ii and ii+1
//   hrq, hrb    : outputs
//
// At stage 0 only the first nu entries of hrq[0] are initialised here; at
// stage N only the entries from offset nu onwards are.
//
// NOTE(review): dgemv_t_lib / dgemv_n_lib / dgemv_nt_lib / dsymv_lib are
// defined elsewhere. With the -1 flag they are used as in-place updates of
// the output vector (presumably y <- y - op(A)*x) -- confirm against the
// kernel documentation.
//
// The function keeps its scratch buffer in static storage, so concurrent
// calls share it.
void d_res_mpc(int nx, int nu, int N, double **hpBAbt, double **hpQ, double **hq, double **hux, double **hpi, double **hrq, double **hrb)
	{

	const int bs = D_MR; //d_get_mr();
	const int ncl = D_NCL;

	const int nxu = nx+nu;

	// column strides used when addressing the packed matrices
	const int cnz = ncl*((nxu+1+ncl-1)/ncl);
	const int cnx = ncl*((nx+ncl-1)/ncl);

	// nu split into a multiple of bs and a remainder
	const int nu_al = (nu/bs)*bs;
	const int nu_rem = nu%bs;

	// offset of the first element of row nxu inside a packed hpBAbt matrix
	const int b_off = (nxu/bs)*bs*cnx + nxu%bs;

	static double temp[D_MR] = {};

	int stage, k;
	double *ux, *rq, *rb;

	// ---- stage 0 ----
	ux = hux[0];
	rq = hrq[0];
	rb = hrb[0];

	for(k=0; k<nu; k++)
		rq[k] = - hq[0][k];

	// the trailing nu%bs inputs are zeroed around the kernel call and
	// restored right after it
	for(k=0; k<nu_rem; k++)
		{
		temp[k] = ux[nu_al+k];
		ux[nu_al+k] = 0.0;
		}
	dgemv_t_lib(nx, nu, hpQ[0]+nu_al*cnz+nu_rem, cnz, ux+nu, -1, rq, rq);
	for(k=0; k<nu_rem; k++)
		ux[nu_al+k] = temp[k];

	dsymv_lib(nu, nu, hpQ[0], cnz, ux, -1, rq, rq);
	dgemv_n_lib(nu, nx, hpBAbt[0], cnx, hpi[1], -1, rq, rq);

	// next state minus row nxu of hpBAbt[0], then the kernel update
	for(k=0; k<nx; k++)
		rb[k] = hux[1][nu+k] - hpBAbt[0][b_off+bs*k];
	dgemv_t_lib(nxu, nx, hpBAbt[0], cnx, ux, -1, rb, rb);

	// ---- stages 1 .. N-1 ----
	for(stage=1; stage<N; stage++)
		{
		ux = hux[stage];
		rq = hrq[stage];
		rb = hrb[stage];

		for(k=0; k<nu; k++)
			rq[k] = - hq[stage][k];
		for(k=0; k<nx; k++)
			rq[nu+k] = hpi[stage][k] - hq[stage][nu+k];
		dsymv_lib(nxu, nxu, hpQ[stage], cnz, ux, -1, rq, rq);

		for(k=0; k<nx; k++)
			rb[k] = hux[stage+1][nu+k] - hpBAbt[stage][b_off+bs*k];

		// one kernel call updates both rq and rb
		dgemv_nt_lib(nxu, nx, hpBAbt[stage], cnx, hpi[stage+1], ux, -1, rq, rb, rq, rb);
		}

	// ---- stage N ----
	ux = hux[N];
	rq = hrq[N];

	for(k=0; k<nx; k++)
		rq[nu+k] = hpi[N][k] - hq[N][nu+k];

	// sub-block of hpQ[N] starting at row/column nu_al
	dsymv_lib(nx+nu_rem, nx+nu_rem, hpQ[N]+nu_al*cnz+nu_al*bs, cnz, ux+nu_al, -1, rq+nu_al, rq+nu_al);

	}
Exemplo n.º 2
0
// Residuals and duality measure for an MPC problem with stage-varying sizes
// and hard box, general and soft box constraints.
//
//   N            : last stage index (stages run 0..N)
//   nx, nu       : per-stage number of states / inputs
//   nb, ng, ns   : per-stage number of box / general / soft constraints
//   idxb         : per-stage index lists into hux[ii]; entries 0..nb-1 are
//                  used for the box constraints, entries nu..nu+ns-1 for the
//                  soft constraints
//   hpBAbt, hpQ, hpDCt : per-stage matrices in the library's packed layout
//   hq, hZ, hz, hd     : per-stage cost / bound data
//   hux, hpi, hlam, ht : per-stage primal variables, equality multipliers,
//                        inequality multipliers and slacks
//   hrq, hrb, hrd, hrz : outputs (hrb only for stages 0..N-1)
//   mu           : output, mu[0] = sum of all lam*t products / (2*nb_tot),
//                  where nb_tot is the sum of nb+ng+ns over all stages (the
//                  division is skipped when nb_tot is 0)
//
// Per stage, hlam/ht/hd/hrd are laid out as
//   [ box lower | box upper | general lower | general upper | soft 0..3 ]
// with sub-block lengths pnb, pnb, png, png, pns (each padded to a multiple
// of bs).
//
// The last stage now adds the soft-constraint multipliers to hrq exactly as
// the other stages do: + lam(soft 0) - lam(soft 1). The previous code used
// - lam(soft 2) + lam(soft 3) there, although only the soft 0/1 constraints
// contain ux (see the hrd formulas below); soft 2/3 appear only in hrz.
//
// NOTE(review): the soft-constraint indices are read from idxb[ii][nu0+jj],
// i.e. offset by the number of inputs and not by nb0 -- confirm against the
// layout the caller builds.
// NOTE(review): each soft constraint adds four lam*t products to mu but is
// counted once in nb_tot -- confirm that this normalisation is intended.
// NOTE(review): dgemv_t_lib / dgemv_n_lib / dgemv_nt_lib / dsymv_lib are
// defined elsewhere; their flag arguments (0, 1, -1) presumably select
// y <- op(A)*x, y <- y + op(A)*x and y <- y - op(A)*x -- confirm.
void d_res_mpc_soft_tv(int N, int *nx, int *nu, int *nb, int **idxb, int *ng, int *ns, double **hpBAbt, double **hpQ, double **hq, double **hZ, double **hz, double **hux, double **hpDCt, double **hd, double **hpi, double **hlam, double **ht, double **hrq, double **hrb, double **hrd, double **hrz, double *mu)
	{

	const int bs = D_MR;
	const int ncl = D_NCL;

	int ii, jj;

	int nu0, nu1, nx0, nx1, nux0, cnz0, cnx1, nb0, pnb, ng0, png, cng, ns0, pns, nb_tot;
	int og, os; // offsets of the general / soft sub-blocks
	int nu_al; // nu0 rounded down to a multiple of bs


	// initialize mu
	nb_tot = 0;
	mu[0] = 0;


	for(ii=0; ii<=N; ii++)
		{

		// sizes of the current stage
		nu0 = nu[ii];
		nx0 = nx[ii];
		nux0 = nu0+nx0;
		cnz0 = (nux0+1+ncl-1)/ncl*ncl;
		nb0 = nb[ii];
		pnb = (nb0+bs-1)/bs*bs;
		ng0 = ng[ii];
		png = (ng0+bs-1)/bs*bs;
		cng = (ng0+ncl-1)/ncl*ncl;
		ns0 = ns[ii];
		pns = (ns0+bs-1)/bs*bs;
		og = 2*pnb;
		os = 2*pnb+2*png;
		nb_tot += nb0 + ng0 + ns0;

		// accumulate the complementarity products
		for(jj=0; jj<nb0; jj++)
			mu[0] += hlam[ii][jj] * ht[ii][jj] + hlam[ii][pnb+jj] * ht[ii][pnb+jj];
		for(jj=0; jj<ng0; jj++)
			mu[0] += hlam[ii][og+jj] * ht[ii][og+jj] + hlam[ii][og+png+jj] * ht[ii][og+png+jj];
		for(jj=0; jj<ns0; jj++)
			mu[0] += hlam[ii][os+0*pns+jj] * ht[ii][os+0*pns+jj] + hlam[ii][os+1*pns+jj] * ht[ii][os+1*pns+jj] + hlam[ii][os+2*pns+jj] * ht[ii][os+2*pns+jj] + hlam[ii][os+3*pns+jj] * ht[ii][os+3*pns+jj];

		// constraint residuals: box
		for(jj=0; jj<nb0; jj++)
			{
			hrd[ii][jj]     =   hux[ii][idxb[ii][jj]] - hd[ii][jj]     - ht[ii][jj];
			hrd[ii][pnb+jj] = - hux[ii][idxb[ii][jj]] - hd[ii][pnb+jj] - ht[ii][pnb+jj];
			}
		// constraint residuals: general (kernel result lands in the lower
		// half, the upper half starts from its negation)
		if(ng0>0)
			{
			dgemv_t_lib(nux0, ng0, hpDCt[ii], cng, hux[ii], 0, hrd[ii]+og, hrd[ii]+og);
			for(jj=0; jj<ng0; jj++)
				{
				hrd[ii][og+png+jj] = - hrd[ii][og+jj];
				hrd[ii][og+jj] += - hd[ii][og+jj] - ht[ii][og+jj];
				hrd[ii][og+png+jj] += - hd[ii][og+png+jj] - ht[ii][og+png+jj];
				}
			}
		// constraint residuals: soft (slacks of sub-blocks 2 and 3 enter
		// the constraints of sub-blocks 0 and 1)
		for(jj=0; jj<ns0; jj++)
			{
			hrd[ii][os+0*pns+jj] = ht[ii][os+2*pns+jj] + hux[ii][idxb[ii][nu0+jj]] - hd[ii][os+0*pns+jj] - ht[ii][os+0*pns+jj];
			hrd[ii][os+1*pns+jj] = ht[ii][os+3*pns+jj] - hux[ii][idxb[ii][nu0+jj]] - hd[ii][os+1*pns+jj] - ht[ii][os+1*pns+jj];
			}

		if(ii<N)
			{

			// sizes of the next stage
			nu1 = nu[ii+1];
			nx1 = nx[ii+1];
			cnx1 = (nx1+ncl-1)/ncl*ncl;

			for(jj=0; jj<nu0; jj++)
				hrq[ii][jj] = - hq[ii][jj];
			for(jj=0; jj<nx0; jj++)
				hrq[ii][nu0+jj] = - hq[ii][nu0+jj] + hpi[ii][jj];
			dsymv_lib(nux0, nux0, hpQ[ii], cnz0, hux[ii], -1, hrq[ii], hrq[ii]);
			for(jj=0; jj<nb0; jj++)
				hrq[ii][idxb[ii][jj]] += hlam[ii][jj] - hlam[ii][pnb+jj];
			if(ng0>0)
				{
				// TODO work space + one dgemv call
				dgemv_n_lib(nux0, ng0, hpDCt[ii], cng, hlam[ii]+og, 1, hrq[ii], hrq[ii]);
				dgemv_n_lib(nux0, ng0, hpDCt[ii], cng, hlam[ii]+og+png, -1, hrq[ii], hrq[ii]);
				}
			for(jj=0; jj<ns0; jj++)
				hrq[ii][idxb[ii][nu0+jj]] += hlam[ii][os+0*pns+jj] - hlam[ii][os+1*pns+jj];

			// next state minus row nux0 of hpBAbt[ii], then one kernel call
			// updates both hrq[ii] and hrb[ii]
			for(jj=0; jj<nx1; jj++)
				hrb[ii][jj] = hux[ii+1][nu1+jj] - hpBAbt[ii][nux0/bs*bs*cnx1+nux0%bs+bs*jj];
			dgemv_nt_lib(nux0, nx1, hpBAbt[ii], cnx1, hpi[ii+1], hux[ii], -1, -1, hrq[ii], hrb[ii], hrq[ii], hrb[ii]);

			}
		else
			{

			// last stage: only the entries from offset nu0 onwards are
			// initialised, and only a trailing sub-block of hpQ is applied
			nu_al = nu0/bs*bs;

			for(jj=0; jj<nx0; jj++)
				hrq[ii][nu0+jj] = hpi[ii][jj] - hq[ii][nu0+jj];
			for(jj=0; jj<nb0; jj++)
				hrq[ii][idxb[ii][jj]] += hlam[ii][jj] - hlam[ii][pnb+jj];
			dsymv_lib(nx0+nu0%bs, nx0+nu0%bs, hpQ[ii]+nu_al*cnz0+nu_al*bs, cnz0, hux[ii]+nu_al, -1, hrq[ii]+nu_al, hrq[ii]+nu_al);
			if(ng0>0)
				{
				// TODO work space + one dgemv call
				dgemv_n_lib(nux0, ng0, hpDCt[ii], cng, hlam[ii]+og, 1, hrq[ii], hrq[ii]);
				dgemv_n_lib(nux0, ng0, hpDCt[ii], cng, hlam[ii]+og+png, -1, hrq[ii], hrq[ii]);
				}
			// same multipliers and signs as in the stages above
			for(jj=0; jj<ns0; jj++)
				hrq[ii][idxb[ii][nu0+jj]] += hlam[ii][os+0*pns+jj] - hlam[ii][os+1*pns+jj];

			}

		// residuals associated with the soft-constraint slack variables
		for(jj=0; jj<ns0; jj++)
			{
			hrz[ii][0*pns+jj] = hz[ii][0*pns+jj] + hZ[ii][0*pns+jj]*ht[ii][os+2*pns+jj] - hlam[ii][os+0*pns+jj] - hlam[ii][os+2*pns+jj];
			hrz[ii][1*pns+jj] = hz[ii][1*pns+jj] + hZ[ii][1*pns+jj]*ht[ii][os+3*pns+jj] - hlam[ii][os+1*pns+jj] - hlam[ii][os+3*pns+jj];
			}

		}


	// normalize mu
	if(nb_tot!=0)
		mu[0] /= 2.0*nb_tot;

	}
Exemplo n.º 3
0
// Fills the residual vectors hrq[0..N] and hrb[0..N-1] of an MPC problem
// whose number of states nx[ii] and inputs nu[ii] may change from stage to
// stage.
//
//   hpBAbt, hpQ : per-stage matrices in the library's packed layout
//   hq          : per-stage linear cost term
//   hux         : per-stage vector, inputs first and states from offset nu[ii]
//   hpi         : per-stage multipliers, read at stage ii and ii+1
//   hrq, hrb    : outputs
//
// At stage 0 only the first nu[0] entries of hrq[0] are initialised here; at
// stage N only the entries from offset nu[N] onwards are.
//
// NOTE(review): dgemv_t_lib / dgemv_n_lib / dgemv_nt_lib / dsymv_lib are
// defined elsewhere. With the -1 flag they are used as in-place updates of
// the output vector (presumably y <- y - op(A)*x) -- confirm against the
// kernel documentation.
//
// The function keeps its scratch buffer in static storage, so concurrent
// calls share it.
void d_res_mpc_tv(int N, int *nx, int *nu, double **hpBAbt, double **hpQ, double **hq, double **hux, double **hpi, double **hrq, double **hrb)
	{

	const int bs = D_MR;
	const int ncl = D_NCL;

	static double temp[D_MR] = {};

	int stage, k;

	int nu_cur, nu_nxt, nx_cur, nx_nxt, nux, cnz_cur, cnx_nxt;
	int nu_al, nu_rem; // nu_cur split into a multiple of bs and a remainder
	int b_off; // offset of row nux inside the packed hpBAbt matrix

	double *ux, *rq, *rb;


	// ---- stage 0 ----
	nu_cur = nu[0];
	nu_nxt = nu[1];
	nx_cur = nx[0];
	nx_nxt = nx[1];
	nux = nu_cur+nx_cur;
	cnx_nxt = (nx_nxt+ncl-1)/ncl*ncl;
	cnz_cur = (nux+1+ncl-1)/ncl*ncl;
	nu_al = nu_cur/bs*bs;
	nu_rem = nu_cur%bs;

	ux = hux[0];
	rq = hrq[0];
	rb = hrb[0];

	for(k=0; k<nu_cur; k++)
		rq[k] = - hq[0][k];

	if(nx_cur>0)
		{
		// the trailing nu%bs inputs are zeroed around the kernel call and
		// restored right after it
		for(k=0; k<nu_rem; k++)
			{
			temp[k] = ux[nu_al+k];
			ux[nu_al+k] = 0.0;
			}
		dgemv_t_lib(nx_cur+nu_rem, nu_cur, hpQ[0]+nu_al*cnz_cur, cnz_cur, ux+nu_al, -1, rq, rq);
		for(k=0; k<nu_rem; k++)
			ux[nu_al+k] = temp[k];
		}

	dsymv_lib(nu_cur, nu_cur, hpQ[0], cnz_cur, ux, -1, rq, rq);
	dgemv_n_lib(nu_cur, nx_nxt, hpBAbt[0], cnx_nxt, hpi[1], -1, rq, rq);

	// next state minus row nux of hpBAbt[0], then the kernel update
	b_off = nux/bs*bs*cnx_nxt + nux%bs;
	for(k=0; k<nx_nxt; k++)
		rb[k] = hux[1][nu_nxt+k] - hpBAbt[0][b_off+bs*k];
	dgemv_t_lib(nux, nx_nxt, hpBAbt[0], cnx_nxt, ux, -1, rb, rb);


	// ---- stages 1 .. N-1 ----
	for(stage=1; stage<N; stage++)
		{
		// shift the sizes by one stage
		nu_cur = nu_nxt;
		nx_cur = nx_nxt;
		nu_nxt = nu[stage+1];
		nx_nxt = nx[stage+1];
		nux = nu_cur+nx_cur;
		cnx_nxt = (nx_nxt+ncl-1)/ncl*ncl;
		cnz_cur = (nux+1+ncl-1)/ncl*ncl;

		ux = hux[stage];
		rq = hrq[stage];
		rb = hrb[stage];

		for(k=0; k<nu_cur; k++)
			rq[k] = - hq[stage][k];
		for(k=0; k<nx_cur; k++)
			rq[nu_cur+k] = - hq[stage][nu_cur+k] + hpi[stage][k];
		dsymv_lib(nux, nux, hpQ[stage], cnz_cur, ux, -1, rq, rq);

		b_off = nux/bs*bs*cnx_nxt + nux%bs;
		for(k=0; k<nx_nxt; k++)
			rb[k] = hux[stage+1][nu_nxt+k] - hpBAbt[stage][b_off+bs*k];

		// one kernel call updates both rq and rb
		dgemv_nt_lib(nux, nx_nxt, hpBAbt[stage], cnx_nxt, hpi[stage+1], ux, -1, rq, rb, rq, rb);
		}


	// ---- stage N ----
	nu_cur = nu_nxt;
	nx_cur = nx_nxt;
	cnz_cur = (nu_cur+nx_cur+1+ncl-1)/ncl*ncl;
	nu_al = nu_cur/bs*bs;
	nu_rem = nu_cur%bs;

	ux = hux[N];
	rq = hrq[N];

	for(k=0; k<nx_cur; k++)
		rq[nu_cur+k] = hpi[N][k] - hq[N][nu_cur+k];

	// sub-block of hpQ[N] starting at row/column nu_al
	dsymv_lib(nx_cur+nu_rem, nx_cur+nu_rem, hpQ[N]+nu_al*cnz_cur+nu_al*bs, cnz_cur, ux+nu_al, -1, rq+nu_al, rq+nu_al);

	}
Exemplo n.º 4
0
// Fills the residual vectors hres_rq[0..N] and hres_b[0..N-1] of an MPC
// problem with stage-varying sizes, where the state-transition term is
// applied elementwise through the vector hdA[ii] and the cost is given by
// the separate blocks hpR, hpSt and hpQ.
//
//   nx, nu           : per-stage number of states / inputs
//   hdA              : per-stage vector, multiplied entry by entry with the
//                      first min(nx[ii], nx[ii+1]) states or multipliers
//   hpBt, hpR, hpSt, hpQ : per-stage matrices in the library's packed layout
//   hb, hrq          : per-stage offset of the dynamics / linear cost term
//   hux              : per-stage vector, inputs first and states from
//                      offset nu[ii]
//   hpi              : per-stage multipliers, read at stage ii and ii+1
//   hres_rq, hres_b  : outputs
//   work             : scratch vector; it receives a copy of the states of
//                      the stage being processed, so it must hold at least
//                      max(nx[ii]) doubles
//
// At stage 0 only the first nu[0] entries of hres_rq[0] are computed; at
// stage N only the entries from offset nu[N] onwards.
//
// NOTE(review): dgemv_t_lib / dgemv_n_lib / dsymv_lib are defined
// elsewhere. With the -1 flag they are used as in-place updates of the
// output vector (presumably y <- y - op(A)*x) -- confirm against the kernel
// documentation.
void d_res_diag_mpc(int N, int *nx, int *nu, double **hdA, double **hpBt, double **hpR, double **hpSt, double **hpQ, double **hb, double **hrq, double **hux, double **hpi, double **hres_rq, double **hres_b, double *work)
	{

	const int ncl = D_NCL;

	int stage, k;

	int nu_cur, nu_nxt, nx_cur, nx_nxt, nx_min, cnu_cur, cnx_cur, cnx_nxt;

	double *ux, *res_q, *res_b;


	// ---- stage 0 ----
	nu_cur = nu[0];
	nu_nxt = nu[1];
	nx_cur = nx[0];
	nx_nxt = nx[1];
	cnu_cur = ncl*((nu_cur+ncl-1)/ncl);
	cnx_nxt = ncl*((nx_nxt+ncl-1)/ncl);
	if(nx_cur<nx_nxt)
		nx_min = nx_cur;
	else
		nx_min = nx_nxt;

	ux = hux[0];
	res_q = hres_rq[0];
	res_b = hres_b[0];

	// input part of the cost residual; work receives the current states
	for(k=0; k<nu_cur; k++)
		res_q[k] = - hrq[0][k];
	for(k=0; k<nx_cur; k++)
		work[k] = ux[nu_cur+k];
	dgemv_t_lib(nx_cur, nu_cur, hpSt[0], cnu_cur, work, -1, res_q, res_q);
	dsymv_lib(nu_cur, nu_cur, hpR[0], cnu_cur, ux, -1, res_q, res_q);
	dgemv_n_lib(nu_cur, nx_nxt, hpBt[0], cnx_nxt, hpi[1], -1, res_q, res_q);

	// dynamics residual
	for(k=0; k<nx_nxt; k++)
		res_b[k] = hux[1][nu_nxt+k] - hb[0][k];
	for(k=0; k<nx_min; k++)
		res_b[k] -= hdA[0][k] * work[k];
	dgemv_t_lib(nu_cur, nx_nxt, hpBt[0], cnx_nxt, ux, -1, res_b, res_b);


	// ---- stages 1 .. N-1 ----
	for(stage=1; stage<N; stage++)
		{
		// shift the sizes by one stage
		nu_cur = nu_nxt;
		nx_cur = nx_nxt;
		cnx_cur = cnx_nxt;
		nu_nxt = nu[stage+1];
		nx_nxt = nx[stage+1];
		cnu_cur = ncl*((nu_cur+ncl-1)/ncl);
		cnx_nxt = ncl*((nx_nxt+ncl-1)/ncl);
		if(nx_cur<nx_nxt)
			nx_min = nx_cur;
		else
			nx_min = nx_nxt;

		ux = hux[stage];
		res_q = hres_rq[stage];
		res_b = hres_b[stage];

		// input part of the cost residual; work receives the current states
		for(k=0; k<nu_cur; k++)
			res_q[k] = - hrq[stage][k];
		for(k=0; k<nx_cur; k++)
			work[k] = ux[nu_cur+k];
		dgemv_t_lib(nx_cur, nu_cur, hpSt[stage], cnu_cur, work, -1, res_q, res_q);
		dsymv_lib(nu_cur, nu_cur, hpR[stage], cnu_cur, ux, -1, res_q, res_q);
		dgemv_n_lib(nu_cur, nx_nxt, hpBt[stage], cnx_nxt, hpi[stage+1], -1, res_q, res_q);

		// state part of the cost residual
		for(k=0; k<nx_cur; k++)
			res_q[nu_cur+k] = hpi[stage][k] - hrq[stage][nu_cur+k];
		for(k=0; k<nx_min; k++)
			res_q[nu_cur+k] -= hdA[stage][k] * hpi[stage+1][k];
		dgemv_n_lib(nx_cur, nu_cur, hpSt[stage], cnu_cur, ux, -1, res_q+nu_cur, res_q+nu_cur);
		dsymv_lib(nx_cur, nx_cur, hpQ[stage], cnx_cur, work, -1, res_q+nu_cur, res_q+nu_cur);

		// dynamics residual
		for(k=0; k<nx_nxt; k++)
			res_b[k] = hux[stage+1][nu_nxt+k] - hb[stage][k];
		for(k=0; k<nx_min; k++)
			res_b[k] -= hdA[stage][k] * work[k];
		dgemv_t_lib(nu_cur, nx_nxt, hpBt[stage], cnx_nxt, ux, -1, res_b, res_b);
		}


	// ---- stage N ----
	nu_cur = nu_nxt;
	nx_cur = nx_nxt;
	cnx_cur = cnx_nxt;

	for(k=0; k<nx_cur; k++)
		hres_rq[N][nu_cur+k] = hpi[N][k] - hrq[N][nu_cur+k];
	for(k=0; k<nx_cur; k++)
		work[k] = hux[N][nu_cur+k];
	dsymv_lib(nx_cur, nx_cur, hpQ[N], cnx_cur, work, -1, hres_rq[N]+nu_cur, hres_rq[N]+nu_cur);

	}
Exemplo n.º 5
0
int main()
	{
	
	printf("\n");
	printf("\n");
	printf("\n");
	printf(" HPMPC -- Library for High-Performance implementation of solvers for MPC.\n");
	printf(" Copyright (C) 2014-2015 by Technical University of Denmark. All rights reserved.\n");
	printf("\n");
	printf(" HPMPC is distributed in the hope that it will be useful,\n");
	printf(" but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
	printf(" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n");
	printf(" See the GNU Lesser General Public License for more details.\n");
	printf("\n");
	printf("\n");
	printf("\n");

#if defined(TARGET_X64_AVX2) || defined(TARGET_X64_AVX) || defined(TARGET_X64_SSE3) || defined(TARGET_X86_ATOM) || defined(TARGET_AMD_SSE3)
/*	printf("\nflush subnormals to zero\n");*/
	_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); // flush to zero subnormals !!! works only with one thread !!!
#endif

	int ii, jj, idx;
	
	int rep, nrep=NREP;

	int nx = NX; // number of states (it has to be even for the mass-spring system test problem)
	int nu = NU; // number of inputs (controllers) (it has to be at least 1 and at most nx/2 for the mass-spring system test problem)
	int N  = NN; // horizon lenght
//	int nb = NB; // number of box constrained inputs and states
	int nh = nu;//nu+nx/2; // number of hard box constraints
	int ns = nx;//nx/2;//nx; // number of soft box constraints
	int nb = nh + ns;

	int nhu = nu<nh ? nu : nh ;

	printf(" Test problem: mass-spring system with %d masses and %d controls.\n", nx/2, nu);
	printf("\n");
	printf(" MPC problem size: %d states, %d inputs, %d horizon length, %d two-sided box constraints on inputs and states, %d two-sided soft constraints on states.\n", nx, nu, N, nh, ns);
	printf("\n");
#if IP == 1
	printf(" IP method parameters: primal-dual IP, double precision, %d maximum iterations, %5.1e exit tolerance in duality measure (edit file test_d_ip_box.c to change them).\n", K_MAX, MU_TOL);
#elif IP == 2
	printf(" IP method parameters: predictor-corrector IP, double precision, %d maximum iterations, %5.1e exit tolerance in duality measure (edit file test_d_ip_box.c to change them).\n", K_MAX, MU_TOL);
#else
	printf(" Wrong value for IP solver choice: %d\n", IP);
#endif

	int info = 0;
		
	const int bs = D_MR; //d_get_mr();
	const int ncl = D_NCL;
	const int nal = bs*ncl; // number of doubles per cache line
	
	const int nz = nx+nu+1;
	const int pnz = bs*((nz+bs-1)/bs);
	const int pnx = bs*((nx+bs-1)/bs);
	const int pnu = bs*((nu+bs-1)/bs);
	const int pnb = bs*((2*nb+bs-1)/bs); // packed number of box constraints
	const int cnz = ncl*((nx+nu+1+ncl-1)/ncl);
	const int cnx = ncl*((nx+ncl-1)/ncl);
	const int anz = nal*((nz+nal-1)/nal);
	const int anx = nal*((nx+nal-1)/nal);

//	const int pad = (ncl-nx%ncl)%ncl; // packing between BAbtL & P
//	const int cnl = cnz<cnx+ncl ? nx+pad+cnx+ncl : nx+pad+cnz;
	const int cnl = cnz<cnx+ncl ? cnx+ncl : cnz;
	

/************************************************
* dynamical system
************************************************/	

	double *A; d_zeros(&A, nx, nx); // states update matrix

	double *B; d_zeros(&B, nx, nu); // inputs matrix

	double *b; d_zeros(&b, nx, 1); // states offset
	double *x0; d_zeros(&x0, nx, 1); // initial state

	double Ts = 0.5; // sampling time
	mass_spring_system(Ts, nx, nu, N, A, B, b, x0);
	
	for(jj=0; jj<nx; jj++)
		b[jj] = 0.0;
	
	for(jj=0; jj<nx; jj++)
		x0[jj] = 0;
	x0[0] = 3.5;
	x0[1] = 3.5;
	
//	d_print_mat(nx, nx, A, nx);
//	d_print_mat(nx, nu, B, nx);
//	d_print_mat(nx, 1, b, nx);
//	d_print_mat(nx, 1, x0, nx);
	
	/* packed */
/*	double *BAb; d_zeros(&BAb, nx, nz);*/

/*	dmcopy(nx, nu, B, nx, BAb, nx);*/
/*	dmcopy(nx, nx, A, nx, BAb+nu*nx, nx);*/
/*	dmcopy(nx, 1 , b, nx, BAb+(nu+nx)*nx, nx);*/
	
	/* transposed */
/*	double *BAbt; d_zeros_align(&BAbt, pnz, pnz);*/
/*	for(ii=0; ii<nx; ii++)*/
/*		for(jj=0; jj<nz; jj++)*/
/*			{*/
/*			BAbt[jj+pnz*ii] = BAb[ii+nx*jj];*/
/*			}*/

	/* packed into contiguous memory */
	double *pBAbt; d_zeros_align(&pBAbt, pnz, cnx);
/*	d_cvt_mat2pmat(nz, nx, BAbt, pnz, 0, pBAbt, cnx);*/
/*	d_cvt_tran_mat2pmat(nx, nz, BAb, nx, 0, pBAbt, cnx);*/

	d_cvt_tran_mat2pmat(nx, nu, B, nx, 0, pBAbt, cnx);
	d_cvt_tran_mat2pmat(nx, nx, A, nx, nu, pBAbt+nu/bs*cnx*bs+nu%bs, cnx);
	for (jj = 0; jj<nx; jj++)
		pBAbt[(nx+nu)/bs*cnx*bs+(nx+nu)%bs+jj*bs] = b[jj];

/*	d_print_pmat (nz, nx, bs, pBAbt, cnx);*/
/*	exit(1);*/

/************************************************
* box constraints
************************************************/	

	double *db; d_zeros_align(&db, 2*nb, 1);
	jj=0;
	for( ; jj<2*nhu; jj++)
		db[jj] = - 0.5;   // umin
	for( ; jj<2*nh; jj++)
		db[jj] = - 4.0;   // xmin_hard
	for( ; jj<2*nb; jj++)
		db[jj] = - 1.0;   // xmin_soft

/************************************************
* cost function
************************************************/	

	double *Q; d_zeros(&Q, nz, nz);
	for(ii=0; ii<nu; ii++) Q[ii*(nz+1)] = 2.0;
	for(; ii<nz; ii++) Q[ii*(nz+1)] = 0.0;
	for(ii=0; ii<nu; ii++) Q[nx+nu+ii*nz] = 0.2;
	for(; ii<nz; ii++) Q[nx+nu+ii*nz] = 0.1;
/*	Q[(nx+nu)*(pnz+1)] = 1e35; // large enough (not needed any longer) */
	
	/* packed into contiguous memory */
	double *pQ; d_zeros_align(&pQ, pnz, cnz);
	d_cvt_mat2pmat(nz, nz, Q, nz, 0, pQ, cnz);

	// cost function of the soft constrained slack variables
	double *Z; d_zeros_align(&Z, pnb, 1);
	for(ii=0; ii<2*ns; ii++) Z[2*nh+ii] = 0.0;
	//for(ii=0; ii<nx; ii++) Z[2*nu+2*ii+0] = 100.0;
	double *z; d_zeros_align(&z, pnb, 1);
	for(ii=0; ii<2*ns; ii++) z[2*nh+ii] = 100.0;

	// maximum element in cost functions
	double mu0 = 1.0;
	for(ii=0; ii<nu+nx; ii++)
		for(jj=0; jj<nu+nx; jj++)
			mu0 = fmax(mu0, Q[jj+nz*ii]);
	for(ii=0; ii<2*ns; ii++)
		{
		mu0 = fmax(mu0, Z[2*nh+ii]);
		mu0 = fmax(mu0, z[2*nh+ii]);
		}
	//printf("\n mu0 = %f\n", mu0);

/************************************************
* matrices series
************************************************/	

	double *hpQ[N+1];
	double *hq[N+1];
	double *hZ[N+1];
	double *hz[N+1];
	double *hux[N+1];
	double *hpi[N+1];
	double *hlam[N+1];
	double *ht[N+1];
	double *hpBAbt[N];
	double *hdb[N+1];
	double *hrb[N];
	double *hrq[N+1];
	double *hrd[N+1];
	double *hrz[N+1];

	for(jj=0; jj<N; jj++)
		{
		//d_zeros_align(&hpQ[jj], pnz, cnz);
		hpQ[jj] = pQ;
		}
	//d_zeros_align(&hpQ[N], pnz, pnz);
	hpQ[N] = pQ;

	for(jj=0; jj<N; jj++)
		{
		d_zeros_align(&hq[jj], anz, 1);
		hZ[jj] = Z;
		hz[jj] = z;
		d_zeros_align(&hux[jj], anz, 1);
		d_zeros_align(&hpi[jj], anx, 1);
		d_zeros_align(&hlam[jj],2*pnb, 1); // TODO pnb
		d_zeros_align(&ht[jj], 2*pnb, 1); // TODO pnb
		hpBAbt[jj] = pBAbt;
		hdb[jj] = db;
		d_zeros_align(&hrb[jj], anx, 1);
		d_zeros_align(&hrq[jj], anz, 1);
		d_zeros_align(&hrd[jj], pnb, 1); // TODO pnb
		d_zeros_align(&hrz[jj], pnb, 1); // TODO pnb
		}
	d_zeros_align(&hq[N], anz, 1);
	hZ[N] = Z;
	hz[N] = z;
	d_zeros_align(&hux[N], anz, 1);
	d_zeros_align(&hpi[N], anx, 1);
	d_zeros_align(&hlam[N], 2*pnb, 1); // TODO pnb
	d_zeros_align(&ht[N], 2*pnb, 1); // TODO pnb
	hdb[N] = db;
	d_zeros_align(&hrq[N], anz, 1);
	d_zeros_align(&hrd[N], pnb, 1); // TODO pnb
	d_zeros_align(&hrz[N], pnb, 1); // TODO pnb
	
	// starting guess
	for(jj=0; jj<nx; jj++) hux[0][nu+jj]=x0[jj];

/************************************************
* riccati-like iteration
************************************************/

//	double *work; d_zeros_align(&work, (N+1)*(pnz*cnl + 5*anz + 10*pnb + 2*anx) + 3*anz, 1); // work space
	double *work; d_zeros_align(&work, (N+1)*(pnz*cnl + pnz + 5*anz + 10*pnb + 2*anx) + anz + pnz*cnx, 1); // work space
/*	for(jj=0; jj<( (N+1)*(pnz*cnl + 4*anz + 4*pnb + 2*anx) + 3*anz ); jj++) work[jj] = -1.0;*/
	int kk = 0; // acutal number of iterations
/*	char prec = PREC; // double/single precision*/
/*	double sp_thr = SP_THR; // threshold to switch between double and single precision*/
	int k_max = K_MAX; // maximum number of iterations in the IP method
	double mu_tol = MU_TOL; // tolerance in the duality measure
	double alpha_min = ALPHA_MIN; // minimum accepted step length
	double sigma[] = {0.4, 0.3, 0.01}; // control primal-dual IP behaviour
	double *stat; d_zeros(&stat, 5, k_max); // stats from the IP routine
	int compute_mult = COMPUTE_MULT;
	int warm_start = WARM_START;
	double mu = -1.0;
	int hpmpc_status;
	


	/* initizile the cost function */
//	for(ii=0; ii<N; ii++)
//		{
//		for(jj=0; jj<pnz*cnz; jj++) hpQ[ii][jj]=pQ[jj];
//		}
//	for(jj=0; jj<pnz*cnz; jj++) hpQ[N][jj]=pQ[jj];



	// initial states
	double xx0[] = {3.5, 3.5, 3.66465, 2.15833, 1.81327, -0.94207, 1.86531, -2.35760, 2.91534, 1.79890, -1.49600, -0.76600, -2.60268, 1.92456, 1.66630, -2.28522, 3.12038, 1.83830, 1.93519, -1.87113};



	/* warm up */

	// initialize states and inputs
	for(ii=0; ii<=N; ii++)
		for(jj=0; jj<nx+nu; jj++)
			hux[ii][jj] = 0;

	hux[0][nu+0] = xx0[0];
	hux[0][nu+1] = xx0[1];

	// call the IP solver
//	if(FREE_X0==0)
//		{
		if(IP==1)
			hpmpc_status = d_ip_soft_mpc(&kk, k_max, mu0, mu_tol, alpha_min, warm_start, sigma, stat, nx, nu, N, nh, ns, hpBAbt, hpQ, hZ, hz, hdb, hux, compute_mult, hpi, hlam, ht, work);
		else
			hpmpc_status = d_ip2_soft_mpc(&kk, k_max, mu0, mu_tol, alpha_min, warm_start, sigma, stat, nx, nu, N, nh, ns, hpBAbt, hpQ, hZ, hz, hdb, hux, compute_mult, hpi, hlam, ht, work);
//		}
//	else
//		{
//		if(IP==1)
//			hpmpc_status = d_ip_box_mhe_old(&kk, k_max, mu_tol, alpha_min, warm_start, sigma, stat, nx, nu, N, nb, hpBAbt, hpQ, hdb, hux, compute_mult, hpi, hlam, ht, work);
//		else
//			hpmpc_status = d_ip2_box_mhe_old(&kk, k_max, mu_tol, alpha_min, warm_start, sigma, stat, nx, nu, N, nb, hpBAbt, hpQ, hdb, hux, compute_mult, hpi, hlam, ht, work);
//		}

#if 0
	if(PRINTSTAT==1)
		{

		printf("\n");
		printf("\n");
		printf(" Print IP statistics of the last run (soft-constraints solver)\n");
		printf("\n");

		for(jj=0; jj<kk; jj++)
			printf("k = %d\tsigma = %f\talpha = %f\tmu = %f\t\tmu = %e\talpha = %f\tmu = %f\tmu = %e\n", jj, stat[5*jj], stat[5*jj+1], stat[5*jj+2], stat[5*jj+2], stat[5*jj+3], stat[5*jj+4], stat[5*jj+4]);
		printf("\n");
		
		}

	if(PRINTRES==1)
		{

		printf("\n");
		printf("\n");
		printf(" Print solution\n");
		printf("\n");

		printf("\nu = \n\n");
		for(ii=0; ii<N; ii++)
			d_print_mat(1, nu, hux[ii], 1);
		
		printf("\nx = \n\n");
		for(ii=0; ii<=N; ii++)
			d_print_mat(1, nx, hux[ii]+nu, 1);
		
		printf("\nlam = \n\n");
		for(ii=0; ii<=N; ii++)
			d_print_mat(1, 2*nb, hlam[ii], 1);
		
		}
#endif



	int kk_avg = 0;
	int kks_avg = 0;

	/* timing */
	struct timeval tv0, tv1, tv2, tv3, tv4, tv5;

	// use general constraint to solve the soft-box-constrainted problem
	#if 1 
	int nus = nu + 2*nx; // number of inputs and slack variables
	int nbs = nus;
	int ngs = nx;
	const int nzs = nx+nus+1;
	const int cnzs = ncl*((nzs+ncl-1)/ncl);
	const int cngs = ncl*((ngs+ncl-1)/ncl);
	const int cnxgs= ncl*((ngs+nx+ncl-1)/ncl);
	const int pnzs = bs*((nzs+bs-1)/bs);
	const int pnbs = bs*((nbs+bs-1)/bs); // simd aligned number of one-sided box constraints !!!!!!!!!!!!
	const int pngs = bs*((ngs+bs-1)/bs); // simd aligned number of one-sided box constraints !!!!!!!!!!!!
	const int cnls = cnzs<cnx+ncl ? cnx+ncl : cnzs;
	const int anzs = nal*((nzs+nal-1)/nal);
	double *pBAbts; d_zeros_align(&pBAbts, pnzs, cnx);
	d_cvt_tran_mat2pmat(nx, nu, B, nx, 0, pBAbts, cnx);
	d_cvt_tran_mat2pmat(nx, nx, A, nx, nus, pBAbts+nus/bs*cnx*bs+nus%bs, cnx);
	for(jj=0; jj<nx; jj++)
		pBAbts[(nx+nus)/bs*cnx*bs+(nx+nus)%bs+jj*bs] = b[jj];
	//d_print_pmat (nzs, nx, bs, pBAbts, cnx);
	double *ds; d_zeros_align(&ds, 2*pnbs+2*pngs, 1);
	for(jj=0; jj<nu; jj++)
		{
		ds[jj]      = - 0.5; //   umin
		ds[pnbs+jj] = - 0.5; // - umax
		}
	for(; jj<nus; jj++)
		{
		ds[jj]      =    0.0; //   smin
		ds[pnbs+jj] = - 10.0; // - smax
		}
	for(jj=0; jj<ngs; jj++)
		{
		ds[2*pnbs+jj]      = - 1.0; //   xmin
		ds[2*pnbs+pngs+jj] = - 1.0; // - xmax
		}
	//d_print_mat(1, 2*pnbs+2*pngs, ds, 1);
	double *Cs; d_zeros(&Cs, ngs, nx);
	double *Ds; d_zeros(&Ds, ngs, nus);
	for(jj=0; jj<nx; jj++)
		{
		Cs[jj+jj*ngs] = 1.0;
		Ds[jj+(nu+jj)*ngs] = 1.0;
		Ds[jj+(nu+nx+jj)*ngs] = - 1.0;
		}
	double *pDCts; d_zeros_align(&pDCts, pnzs, cngs);
	d_cvt_tran_mat2pmat(ngs, nus, Ds, ngs, 0, pDCts, cngs);
	d_cvt_tran_mat2pmat(ngs, nx, Cs, ngs, nus, pDCts+nus/bs*cngs*bs+nus%bs, cngs);
	//d_print_pmat(nus+nx, ngs, bs, pDCts, cngs);
	double *Qs; d_zeros(&Qs, nzs, nzs);
	d_copy_mat(nu, nu, Q, nz, Qs, nzs);
	d_copy_mat(nx+1, nu, Q+nu, nz, Qs+nus, nzs);
	d_copy_mat(nx+1, nx, Q+nu*(nz+1), nz, Qs+nus*(nzs+1), nzs);
	for(jj=0; jj<nx; jj++)
		{
		Qs[(nu+jj)*(nzs+1)] = Z[2*nh+2*jj+0]; // TODO change when updated IP !!!!!
		Qs[(nu+nx+jj)*(nzs+1)] = Z[2*nh+2*jj+1]; // TODO change when updated IP !!!!!
		Qs[nus+nx+(nu+jj)*nzs] = z[2*nh+2*jj+0]; // TODO change when updated IP !!!!!
		Qs[nus+nx+(nu+nx+jj)*nzs] = z[2*nh+2*jj+1]; // TODO change when updated IP !!!!!
		}
	double *pQs; d_zeros_align(&pQs, pnzs, cnzs);
	d_cvt_mat2pmat(nzs, nzs, Qs, nzs, 0, pQs, cnzs);
	//d_print_pmat(nzs, nzs, bs, pQs, cnzs);
	double *hpQs[N+1];
	double *huxs[N+1];
	double *hpis[N+1];
	double *hlams[N+1];
	double *hts[N+1];
	double *hpBAbts[N];
	double *hpDCts[N+1];
	double *hds[N+1];
	for(jj=0; jj<N; jj++)
		{
		hpQs[jj] = pQs;
		hpBAbts[jj] = pBAbts;
		hpDCts[jj] = pDCts;
		hds[jj] = ds;
		d_zeros_align(&huxs[jj], pnzs, 1);
		d_zeros_align(&hpis[jj], pnx, 1);
		d_zeros_align(&hlams[jj], 2*pnbs+2*pngs, 1);
		d_zeros_align(&hts[jj], 2*pnbs+2*pngs, 1);
		}
	hpQs[N] = pQs;
	d_zeros_align(&hpDCts[N], pnzs, cngs);
	d_zeros_align(&hds[N], 2*pnbs+2*pngs, 1);
	d_zeros_align(&huxs[N], pnzs, 1);
	d_zeros_align(&hpis[N], pnx, 1);
	d_zeros_align(&hlams[N] ,2*pnbs+2*pngs, 1);
	d_zeros_align(&hts[N], 2*pnbs+2*pngs, 1);
	double *works; d_zeros_align(&works, (N+1)*(pnzs*cnls + pnzs + 5*anzs + 10*(pnbs+pngs) + 2*anx) + anzs + pnzs*cnxgs, 1); // work space 

	gettimeofday(&tv0, NULL); // start

	for(rep=0; rep<nrep; rep++)
		{

		// initialize states and inputs
		for(ii=0; ii<=N; ii++)
			for(jj=0; jj<nx+nus; jj++)
				huxs[ii][jj] = 0;

		idx = rep%10;
		huxs[0][nus+0] = xx0[2*idx];
		huxs[0][nus+1] = xx0[2*idx+1];

		if(IP==1)
			hpmpc_status = d_ip_hard_mpc(&kk, k_max, mu0, mu_tol, alpha_min, warm_start, sigma, stat, nx, nus, N, nbs, ngs, ngs, hpBAbts, hpQs, hpDCts, hds, huxs, compute_mult, hpis, hlams, hts, works);
		else
			hpmpc_status = d_ip2_hard_mpc(&kk, k_max, mu0, mu_tol, alpha_min, warm_start, sigma, stat, nx, nus, N, nbs, ngs, ngs, hpBAbts, hpQs, hpDCts, hds, huxs, compute_mult, hpis, hlams, hts, works);

		kks_avg += kk;

		}


	gettimeofday(&tv1, NULL); // stop

	if(PRINTSTAT==1)
		{

		printf("\n");
		printf("\n");
		printf(" Print IP statistics of the last run (general-constraints solver)\n");
		printf("\n");

		for(jj=0; jj<kk; jj++)
			printf("k = %d\tsigma = %f\talpha = %f\tmu = %f\t\tmu = %e\talpha = %f\tmu = %f\tmu = %e\n", jj, stat[5*jj], stat[5*jj+1], stat[5*jj+2], stat[5*jj+2], stat[5*jj+3], stat[5*jj+4], stat[5*jj+4]);
		printf("\n");
		
		}

	if(PRINTRES==1)
		{

		printf("\n");
		printf("\n");
		printf(" Print solution\n");
		printf("\n");

		printf("\nus = \n\n");
		for(ii=0; ii<N; ii++)
			d_print_mat(1, nus, huxs[ii], 1);
		
		printf("\nxs = \n\n");
		for(ii=0; ii<=N; ii++)
			d_print_mat(1, nx, huxs[ii]+nus, 1);
	
		}


	for(jj=0; jj<N; jj++)
		{
		free(huxs[jj]);
		free(hpis[jj]);
		free(hlams[jj]);
		free(hts[jj]);
		}
	free(hpDCts[N]);
	free(hds[N]);
	free(huxs[N]);
	free(hpis[N]);
	free(hlams[N]);
	free(hts[N]);
	free(works);
	//exit(1);
	#endif



	gettimeofday(&tv2, NULL); // start



	for(rep=0; rep<nrep; rep++)
		{

		idx = rep%10;
//		x0[0] = xx0[2*idx];
//		x0[1] = xx0[2*idx+1];

		// initialize states and inputs
		for(ii=0; ii<=N; ii++)
			for(jj=0; jj<nx+nu; jj++)
				hux[ii][jj] = 0;

		hux[0][nu+0] = xx0[2*idx];
		hux[0][nu+1] = xx0[2*idx+1];

		// call the IP solver
//		if(FREE_X0==0)
//			{
			if(IP==1)
				hpmpc_status = d_ip_soft_mpc(&kk, k_max, mu0, mu_tol, alpha_min, warm_start, sigma, stat, nx, nu, N, nh, ns, hpBAbt, hpQ, hZ, hz, hdb, hux, compute_mult, hpi, hlam, ht, work);
			else
				hpmpc_status = d_ip2_soft_mpc(&kk, k_max, mu0, mu_tol, alpha_min, warm_start, sigma, stat, nx, nu, N, nh, ns, hpBAbt, hpQ, hZ, hz, hdb, hux, compute_mult, hpi, hlam, ht, work);
//			}
//		else
//			{
//			if(IP==1)
//				hpmpc_status = d_ip_box_mhe_old(&kk, k_max, mu_tol, alpha_min, warm_start, sigma, stat, nx, nu, N, nb, hpBAbt, hpQ, hdb, hux, compute_mult, hpi, hlam, ht, work);
//			else
//				hpmpc_status = d_ip2_box_mhe_old(&kk, k_max, mu_tol, alpha_min, warm_start, sigma, stat, nx, nu, N, nb, hpBAbt, hpQ, hdb, hux, compute_mult, hpi, hlam, ht, work);
//			}

		kk_avg += kk;

		}
	
	gettimeofday(&tv3, NULL); // stop
	


	// restore linear part of cost function 
	for(ii=0; ii<N; ii++)
		{
		for(jj=0; jj<nx+nu; jj++) hq[ii][jj] = Q[nx+nu+nz*jj];
		}
	for(jj=0; jj<nx+nu; jj++) hq[N][jj] = Q[nx+nu+nz*jj];

	// residuals computation
//	if(FREE_X0==0)
		d_res_ip_soft_mpc(nx, nu, N, nh, ns, hpBAbt, hpQ, hq, hZ, hz, hux, hdb, hpi, hlam, ht, hrq, hrb, hrd, hrz, &mu);
//	else
//		d_res_ip_box_mhe_old(nx, nu, N, nb, hpBAbt, hpQ, hq, hux, hdb, hpi, hlam, ht, hrq, hrb, hrd, &mu);


	if(PRINTSTAT==1)
		{

		printf("\n");
		printf("\n");
		printf(" Print IP statistics of the last run (soft-constraints solver)\n");
		printf("\n");

		for(jj=0; jj<kk; jj++)
			printf("k = %d\tsigma = %f\talpha = %f\tmu = %f\t\tmu = %e\talpha = %f\tmu = %f\tmu = %e\n", jj, stat[5*jj], stat[5*jj+1], stat[5*jj+2], stat[5*jj+2], stat[5*jj+3], stat[5*jj+4], stat[5*jj+4]);
		printf("\n");
		
		}

	if(PRINTRES==1)
		{

		printf("\n");
		printf("\n");
		printf(" Print solution\n");
		printf("\n");

		printf("\nu = \n\n");
		for(ii=0; ii<N; ii++)
			d_print_mat(1, nu, hux[ii], 1);
		
		printf("\nx = \n\n");
		for(ii=0; ii<=N; ii++)
			d_print_mat(1, nx, hux[ii]+nu, 1);
		
		printf("\nlam = \n\n");
		for(ii=0; ii<=N; ii++)
			d_print_mat(1, 2*nb, hlam[ii], 1);
		
		}

	if(PRINTRES==1 && COMPUTE_MULT==1)
		{
		// print result 
		// print result 
		printf("\n");
		printf("\n");
		printf(" Print residuals\n\n");
		printf("\n");
		printf("\n");
		printf("rq = \n\n");
//		if(FREE_X0==0)
//			{
			d_print_mat(1, nu, hrq[0], 1);
			for(ii=1; ii<=N; ii++)
/*				d_print_mat_e(1, nx+nu, hrq[ii], 1);*/
				d_print_mat(1, nx+nu, hrq[ii], 1);
//			}
//		else
//			{
//			for(ii=0; ii<=N; ii++)
///*				d_print_mat_e(1, nx+nu, hrq[ii], 1);*/
//				d_print_mat(1, nx+nu, hrq[ii], 1);
//			}
		printf("rz = \n\n");
		for(ii=0; ii<=N; ii++)
//			d_print_mat_e(1, 2*nb-2*nu, hrz[ii]+2*nu, 1);
			d_print_mat(1, 2*nb-2*nu, hrz[ii]+2*nu, 1);
		printf("\n");
		printf("\n");
		printf("\n");
		printf("\n");
		printf("rb = \n\n");
		for(ii=0; ii<N; ii++)
/*			d_print_mat_e(1, nx, hrb[ii], 1);*/
			d_print_mat(1, nx, hrb[ii], 1);
		printf("\n");
		printf("\n");
		printf("rd = \n\n");
		for(ii=0; ii<=N; ii++)
/*			d_print_mat_e(1, 2*nb, hrd[ii], 1);*/
			d_print_mat(1, 2*nb, hrd[ii], 1);
		printf("\n");
		printf("\n");
		printf("mu = %e\n\n", mu);
		
		}

/*	printf("\nnx\tnu\tN\tkernel\n\n");*/
/*	printf("\n%d\t%d\t%d\t%e\n\n", nx, nu, N, time);*/
	


/**************************************************************************************************
*
*	time-variant nx and nu, sparse box and soft constraints format
*
**************************************************************************************************/

	// problem size
	int nx_tv[N+1];
	int nu_tv[N+1];
	int nb_tv[N+1];
	int ng_tv[N+1];
	int ns_tv[N+1];
	int nz_tv[N+1]; // vector of zeros

	// first stage
	nx_tv[0] = 0;
	nu_tv[0] = nu;
	nb_tv[0] = nu;
	ng_tv[0] = 0;
	ns_tv[0] = 0;
	nz_tv[0] = 0;

	// middle stages
	for(ii=1; ii<N; ii++)
		{
		nx_tv[ii] = nx;
		nu_tv[ii] = nu;
		nb_tv[ii] = nu;
		ng_tv[ii] = 0;
		ns_tv[ii] = nx;
		nz_tv[ii] = 0;
		}
	
	// last stage
	nx_tv[N] = nx;
	nu_tv[N] = 0;
	nb_tv[N] = 0;
	ng_tv[N] = 0;
	ns_tv[N] = nx;
	nz_tv[N] = 0;


	// matrix sizes
	int pnz_tv[N+1];
	int pnx_tv[N+1];
	int pnb_tv[N+1];
	int png_tv[N+1];
	int pns_tv[N+1];
	int cnz_tv[N+1];
	int cnx_tv[N+1];
	int cnl_tv[N+1];

	for(ii=0; ii<=N; ii++)
		{
		pnz_tv[ii] = (nu_tv[ii]+nx_tv[ii]+1+bs-1)/bs*bs;
		pnx_tv[ii] = (nx_tv[ii]+bs-1)/bs*bs;
		pnb_tv[ii] = (nb_tv[ii]+bs-1)/bs*bs;
		png_tv[ii] = (ng_tv[ii]+bs-1)/bs*bs;
		pns_tv[ii] = (ns_tv[ii]+bs-1)/bs*bs;
		cnz_tv[ii] = (nu_tv[ii]+nx_tv[ii]+1+ncl-1)/ncl*ncl;
		cnx_tv[ii] = (nx_tv[ii]+ncl-1)/ncl*ncl;
		cnl_tv[ii] = cnz_tv[ii]<cnx_tv[ii]+ncl ? cnx_tv[ii]+ncl : cnz_tv[ii];
		}
	
//	for(ii=0; ii<=N; ii++)
//		printf("\n%d\t%d\t%d\t%d\t%d\t%d\t%d\n", pnz_tv[ii], pnx_tv[ii], pnb_tv[ii], pns_tv[ii], cnz_tv[ii], cnx_tv[ii], cnl_tv[ii]);



	// state-space matrices
	//d_print_mat(nx, nx, A, nx);
	//d_print_mat(nx, nu, B, nx);
	//for(ii=0; ii<nx; ii++) b[ii] = 1.0;
	//d_print_mat(nx, 1, b, nx);
	//d_print_mat(nx, 1, x0, nx);

	// compute b0
	double *pA; d_zeros_align(&pA, pnx, cnx);
	d_cvt_mat2pmat(nx, nx, A, nx, 0, pA, cnx);
	double *b0; d_zeros_align(&b0, pnx, 1);
	dgemv_n_lib(nx, nx, pA, cnx, x0, 1, b, b0);
	//d_print_pmat(nx, nx, bs, pA, cnx);
	//d_print_mat(nx, 1, b0, nx);

	double *pBAbt0; d_zeros_align(&pBAbt0, pnz_tv[0], cnx_tv[1]);
	d_cvt_tran_mat2pmat(nx, nu, B, nx, 0, pBAbt0, cnx_tv[1]);
	d_cvt_tran_mat2pmat(nx, 1, b0, nx, nu, pBAbt0+nu/bs*bs*cnx_tv[1]+nu%bs, cnx_tv[1]);
	//d_print_pmat(nu_tv[0]+nx_tv[0]+1, nx_tv[1], bs, pBAbt0, cnx_tv[1]);

	double *pBAbt1; d_zeros_align(&pBAbt1, pnz_tv[1], cnx_tv[2]);
	d_cvt_tran_mat2pmat(nx, nu, B, nx, 0, pBAbt1, cnx_tv[2]);
	d_cvt_tran_mat2pmat(nx, nx, A, nx, nu, pBAbt1+nu/bs*bs*cnx_tv[2]+nu%bs, cnx_tv[2]);
	d_cvt_tran_mat2pmat(nx, 1, b, nx, nu+nx, pBAbt1+(nu+nx)/bs*bs*cnx_tv[2]+(nu+nx)%bs, cnx_tv[2]);
//	d_print_pmat(nu_tv[1]+nx_tv[1]+1, nx_tv[2], bs, pBAbt1, cnx_tv[2]);
	
	double *(hpBAbt_tv[N]);
	hpBAbt_tv[0] = pBAbt0;
	for(ii=1; ii<N; ii++)
		hpBAbt_tv[ii] = pBAbt1;
	

	// cost function matrices
	//for(ii=nu; ii<nu+nx; ii++) Q[ii*(nz+1)] = 1.0; // TODO remove !!!!
	//d_print_mat(nz, nz, Q, nz);

	double *q; d_zeros_align(&q, pnz, 1);
	for(ii=0; ii<nu; ii++) q[ii] = Q[nu+nx+ii*nz];
	//d_print_mat(nu, 1, q, nu);

	double *pS; d_zeros_align(&pS, pnu, cnx);
	d_cvt_tran_mat2pmat(nx, nu, Q+nu, nz, 0, pS, cnx);
	//d_print_pmat(nu, nx, bs, pS, cnx);

	double *q0; d_zeros_align(&q0, pnz_tv[0], 1);
	dgemv_n_lib(nu, nx, pS, cnx, x0, 1, q, q0);
	//d_print_mat(nu, 1, q0, nu);

	double *pQ0; d_zeros_align(&pQ0, pnz_tv[0], cnz_tv[0]);
	d_cvt_mat2pmat(nu, nu, Q, nz, 0, pQ0, cnz_tv[0]);
	d_cvt_tran_mat2pmat(nu, 1, q0, nu, nu, pQ0+nu/bs*bs*cnz_tv[0]+nu%bs, cnz_tv[0]);
	//d_print_pmat(nu_tv[0]+nx_tv[0]+1, nu_tv[0]+nx_tv[0]+1, bs, pQ0, pnz_tv[0]);
	
	double *pQ1; d_zeros_align(&pQ1, pnz_tv[1], cnz_tv[1]);
	d_cvt_mat2pmat(nz, nz, Q, nz, 0, pQ1, cnz_tv[1]);
	//d_print_pmat(nu_tv[1]+nx_tv[1]+1, nu_tv[1]+nx_tv[1]+1, bs, pQ1, pnz_tv[1]);

	double *pQN; d_zeros_align(&pQN, pnz_tv[N], cnz_tv[N]);
	d_cvt_mat2pmat(nx+1, nx+1, Q+nu*(nz+1), nz, 0, pQN, cnz_tv[N]);
	//d_print_pmat(nu_tv[N]+nx_tv[N]+1, nu_tv[N]+nx_tv[N]+1, bs, pQN, cnz_tv[N]);

	double *(hpQ_tv[N+1]);
	hpQ_tv[0] = pQ0;
	for(ii=1; ii<N; ii++)
		hpQ_tv[ii] = pQ1;
	hpQ_tv[N] = pQN;
	


	double *(hpL_tv[N+1]);
	for(ii=0; ii<=N; ii++)
		d_zeros_align(&hpL_tv[ii], pnz_tv[ii], cnl_tv[ii]);

	double *(hdL_tv[N+1]);
	for(ii=0; ii<=N; ii++)
		d_zeros_align(&hdL_tv[ii], pnz_tv[ii], 1);



	double *hux_tv[N+1];
	for(ii=0; ii<=N; ii++)
		d_zeros_align(&hux_tv[ii], (nu_tv[ii]+nx_tv[ii]+bs-1)/bs*bs, 1);
	
	double *hpi_tv[N+1];
	for(ii=0; ii<=N; ii++)
		d_zeros_align(&hpi_tv[ii], pnx_tv[ii], 1);
	

	// dummy variables
	int **pdummyi;
	double **pdummyd;
	

#if 0
	// work space
	double *ric_tv_work; d_zeros_align(&ric_tv_work, d_ric_sv_mpc_tv_work_space_size_double(N, nx_tv, nu_tv, nz_tv, nz_tv), 1);
	double *ric_tv_diag; d_zeros_align(&ric_tv_diag, pnz, 1);

	// call the Riccati solver
	d_back_ric_sv_tv(N, nx_tv, nu_tv, hpBAbt_tv, hpQ_tv, hux_tv, hpL_tv, hdL_tv, ric_tv_work, ric_tv_diag, 0, pdummyd, 1, hpi_tv, nz_tv, pdummyi, pdummyd, pdummyd, nz_tv, pdummyd, pdummyd, pdummyd);

	// print solution
	for(ii=0; ii<=N; ii++)
		d_print_mat(1, nu_tv[ii]+nx_tv[ii], hux_tv[ii], 1);
#endif
	

	// constraints
	int *idxb0 = (int *) malloc((nb_tv[0]+ns_tv[0])*sizeof(int));
	double *db0; d_zeros_align(&db0, 2*pnb_tv[0]+2*pns_tv[0], 1);
	int nbu0;
	nbu0 = nu_tv[0]<nb_tv[0] ? nu_tv[0] : nb_tv[0];
	idx = 0;
	for(jj=0; jj<nbu0; jj++)
		{
		idxb0[idx] = idx;
		db0[0*pnb_tv[0]+jj] = - 0.5; // umin_hard
		db0[1*pnb_tv[0]+jj] = - 0.5; // umax_hard
		idx++;
		}

	int *idxb1 = (int *) malloc((nb_tv[1]+ns_tv[1])*sizeof(int));
	double *db1; d_zeros_align(&db1, 2*pnb_tv[1]+2*pns_tv[1], 1);
	nbu0 = nu_tv[1]<nb_tv[1] ? nu_tv[1] : nb_tv[1];
	idx = 0;
	for(jj=0; jj<nbu0; jj++)
		{
		idxb1[idx] = idx;
		db1[0*pnb_tv[1]+jj] = - 0.5; // umin_hard
		db1[1*pnb_tv[1]+jj] = - 0.5; // umax_hard
		idx++;
		}
	for(jj=nu_tv[1]; jj<nb_tv[1]; jj++)
		{
		idxb1[idx] = idx;
		db1[0*pnb_tv[1]+jj] = - 4.0; // xmin_hard
		db1[1*pnb_tv[1]+jj] = - 4.0; // xmax_hard
		idx++;
		}
	for(jj=0; jj<ns_tv[1]; jj++)
		{
		idxb1[idx] = idx;
		db1[2*pnb_tv[1]+0*pns_tv[1]+jj] = - 1.0; // xmin_soft
		db1[2*pnb_tv[1]+1*pns_tv[1]+jj] = - 1.0; // xmax soft
		idx++;
		}

	int *idxbN = (int *) malloc((nb_tv[N]+ns_tv[N])*sizeof(int));
	double *dbN; d_zeros_align(&dbN, 2*pnb_tv[N]+2*pns_tv[N], 1);
	idx = 0;
	for(jj=nu_tv[N]; jj<nb_tv[N]; jj++)
		{
		idxbN[idx] = idx;
		dbN[0*pnb_tv[N]+jj] = - 4.0; // xmin_hard
		dbN[1*pnb_tv[N]+jj] = - 4.0; // xmax_hard
		idx++;
		}
	for(jj=0; jj<ns_tv[N]; jj++)
		{
		idxbN[idx] = idx;
		dbN[2*pnb_tv[N]+0*pns_tv[N]+jj] = - 1.0; // xmin_soft
		dbN[2*pnb_tv[N]+1*pns_tv[N]+jj] = - 1.0; // xmax soft
		idx++;
		}
	
	int *idxb_tv[N+1];
	double *hdb_tv[N+1];
	idxb_tv[0] = idxb0;
	hdb_tv[0] = db0;
	for(ii=1; ii<N; ii++)
		{
		idxb_tv[ii] = idxb1;
		hdb_tv[ii] = db1;
		}
	idxb_tv[N] = idxbN;
	hdb_tv[N] = dbN;

#if 0
	for(ii=0; ii<=N; ii++)
		{
		for(jj=0; jj<nb_tv[ii]+ns_tv[ii]; jj++)
			printf("\t%d", idxb_tv[ii][jj]);
		printf("\n");
		}
#endif
	

	// cost function of the soft constraint slack variables
	double *Z1; d_zeros_align(&Z1, 2*pns_tv[1], 1);
	for(ii=0; ii<ns_tv[1]; ii++)
		{
		Z1[0*pns_tv[1]+ii] = 0.0;
		Z1[1*pns_tv[1]+ii] = 0.0;
		}
	double *z1; d_zeros_align(&z1, 2*pns_tv[1], 1);
	for(ii=0; ii<ns_tv[1]; ii++)
		{
		z1[0*pns_tv[1]+ii] = 100.0;
		z1[1*pns_tv[1]+ii] = 100.0;
		}
	
	double *hZ_tv[N+1];
	double *hz_tv[N+1];
	for(ii=0; ii<=N; ii++)
		{
		hZ_tv[ii] = Z1;
		hz_tv[ii] = z1;
		}

	// maximum element in cost functions
	mu0 = 1.0;
	for(ii=0; ii<nu+nx; ii++)
		for(jj=0; jj<nu+nx; jj++)
			mu0 = fmax(mu0, Q[jj+nz*ii]);
	for(ii=0; ii<ns; ii++)
		{
		mu0 = fmax(mu0, Z[0*pns_tv[1]+ii]);
		mu0 = fmax(mu0, Z[1*pns_tv[1]+ii]);
		mu0 = fmax(mu0, z[0*pns_tv[1]+ii]);
		mu0 = fmax(mu0, z[1*pns_tv[1]+ii]);
		}
	//printf("\n mu0 = %f\n", mu0);

	// lagrangian multipliers and slack variables
	double *hlam_tv[N+1];
	double *ht_tv[N+1];
	for(ii=0; ii<=N; ii++)
		{
		d_zeros_align(&hlam_tv[ii], 2*pnb_tv[ii]+2*png_tv[ii]+4*pns_tv[ii], 1);
		d_zeros_align(&ht_tv[ii], 2*pnb_tv[ii]+2*png_tv[ii]+4*pns_tv[ii], 1);
		}



	// ip soft work space
	double *ip_soft_tv_work; d_zeros_align(&ip_soft_tv_work, d_ip2_soft_mpc_tv_work_space_size_double(N, nx_tv, nu_tv, nb_tv, ng_tv, ns_tv), 1);

	// call the ip soft solver
	d_ip2_soft_mpc_tv(&kk, k_max, mu0, mu_tol, alpha_min, warm_start, sigma, stat, N, nx_tv, nu_tv, nb_tv, idxb_tv, ng_tv, ns_tv, hpBAbt_tv, hpQ_tv, hZ_tv, hz_tv, pdummyd, hdb_tv, hux_tv, 1, hpi_tv, hlam_tv, ht_tv, ip_soft_tv_work);



	int kk_avg_tv = 0;

	gettimeofday(&tv4, NULL); // start



	for(rep=0; rep<nrep; rep++)
		{

		idx = rep%10;
//		x0[0] = xx0[2*idx];
//		x0[1] = xx0[2*idx+1];

		// initialize states and inputs
//		for(ii=0; ii<=N; ii++)
//			for(jj=0; jj<nx+nu; jj++)
//				hux[ii][jj] = 0;

		x0[0] = xx0[2*idx];
		x0[1] = xx0[2*idx+1];

		// update initial state embedded in b and r
		dgemv_n_lib(nx, nx, pA, cnx, x0, 1, b, b0);
		d_cvt_tran_mat2pmat(nx, 1, b0, nx, nu, pBAbt0+nu/bs*bs*cnx_tv[1]+nu%bs, cnx_tv[1]);
		dgemv_n_lib(nu, nx, pS, cnx, x0, 1, q, q0);
		d_cvt_tran_mat2pmat(nu, 1, q0, nu, nu, pQ0+nu/bs*bs*cnz_tv[0]+nu%bs, cnz_tv[0]);

		// call the IP solver
		d_ip2_soft_mpc_tv(&kk, k_max, mu0, mu_tol, alpha_min, warm_start, sigma, stat, N, nx_tv, nu_tv, nb_tv, idxb_tv, ng_tv, ns_tv, hpBAbt_tv, hpQ_tv, hZ_tv, hz_tv, pdummyd, hdb_tv, hux_tv, 1, hpi_tv, hlam_tv, ht_tv, ip_soft_tv_work);

		kk_avg_tv += kk;

		}
	
	gettimeofday(&tv5, NULL); // stop
	

	
	double *hrq_tv[N+1];
	double *hrb_tv[N];
	double *hrd_tv[N+1];
	double *hrz_tv[N+1];
	double *hq_tv[N+1];

	for(ii=0; ii<N; ii++)
		{
		d_zeros_align(&hrq_tv[ii], pnz_tv[ii], 1);
		d_zeros_align(&hrb_tv[ii], pnx_tv[ii+1], 1);
		d_zeros_align(&hrd_tv[ii], 2*pnb_tv[ii]+2*png_tv[ii]+2*pns_tv[ii], 1);
		d_zeros_align(&hrz_tv[ii], 2*pns_tv[ii], 1);
		d_zeros_align(&hq_tv[ii], pnz_tv[ii], 1);
		}
	d_zeros_align(&hrq_tv[N], pnz_tv[N], 1);
	d_zeros_align(&hrd_tv[N], 2*pnb_tv[N]+2*png_tv[N]+2*pns_tv[N], 1);
	d_zeros_align(&hrz_tv[N], 2*pns_tv[N], 1);
	d_zeros_align(&hq_tv[N], pnz_tv[N], 1);


	// restore linear part of cost function 
	for(ii=0; ii<=N; ii++)
		{
		drowex_lib(nu_tv[ii]+nx_tv[ii], hpQ_tv[ii]+(nu_tv[ii]+nx_tv[ii])/bs*bs*cnz_tv[ii]+(nu_tv[ii]+nx_tv[ii])%bs, hq_tv[ii]);
		}



	// residuals computation
//	d_res_ip_soft_mpc(nx, nu, N, nh, ns, hpBAbt, hpQ, hq, hZ, hz, hux, hdb, hpi, hlam, ht, hrq, hrb, hrd, hrz, &mu);
	d_res_ip_soft_mpc_tv(N, nx_tv, nu_tv, nb_tv, idxb_tv, ng_tv, ns_tv, hpBAbt_tv, hpQ_tv, hq_tv, hZ_tv, hz_tv, hux_tv, pdummyd, hdb_tv, hpi_tv, hlam_tv, ht_tv, hrq_tv, hrb_tv, hrd_tv, hrz_tv, &mu);




	if(PRINTSTAT==1)
		{

		printf("\n");
		printf("\n");
		printf(" Print IP statistics of the last run (soft-constraints time-variant solver)\n");
		printf("\n");

		for(jj=0; jj<kk; jj++)
			printf("k = %d\tsigma = %f\talpha = %f\tmu = %f\t\tmu = %e\talpha = %f\tmu = %f\tmu = %e\n", jj, stat[5*jj], stat[5*jj+1], stat[5*jj+2], stat[5*jj+2], stat[5*jj+3], stat[5*jj+4], stat[5*jj+4]);
		printf("\n");
		
		}

	if(PRINTRES==1)
		{

		printf("\n");
		printf("\n");
		printf(" Print solution\n");
		printf("\n");

		// print solution
		printf("\nhux_tv = \n\n");
		for(ii=0; ii<=N; ii++)
			d_print_mat(1, nu_tv[ii]+nx_tv[ii], hux_tv[ii], 1);
		
		}

	if(PRINTRES==1 && COMPUTE_MULT==1)
		{
		// print result 
		// print result 
		printf("\n");
		printf("\n");
		printf(" Print residuals\n\n");
		printf("\n");
		printf("\n");
		printf("rq = \n\n");
		for(ii=0; ii<=N; ii++)
			d_print_mat(1, nu_tv[ii]+nx_tv[ii], hrq_tv[ii], 1);
		printf("\n");
		printf("\n");
		printf("rz = \n\n");
		for(ii=0; ii<=N; ii++)
			d_print_mat(1, 2*pns_tv[ii], hrz_tv[ii], 1);
		printf("\n");
		printf("\n");
		printf("rb = \n\n");
		for(ii=0; ii<N; ii++)
			d_print_mat(1, nx_tv[ii], hrb_tv[ii], 1);
		printf("\n");
		printf("\n");
		printf("rd = \n\n");
		for(ii=0; ii<=N; ii++)
			d_print_mat(1, 2*pnb_tv[ii]+2*png_tv[ii]+2*pns_tv[ii], hrd_tv[ii], 1);
		printf("\n");
		printf("\n");
		printf("mu = %e\n\n", mu);
		
		}



	// free memory
	free(pA);
	free(b0);
	free(pBAbt0);
	free(pBAbt1);
	free(pQ0);
	free(pQ1);
	free(pQN);
	free(idxb0);
	free(idxb1);
	free(idxbN);
	free(db0);
	free(db1);
	free(dbN);
	free(Z1);
	free(z1);
	for(ii=0; ii<=N; ii++) free(hpL_tv[ii]);
	for(ii=0; ii<=N; ii++) free(hdL_tv[ii]);
	for(ii=0; ii<=N; ii++) free(hux_tv[ii]);
	for(ii=0; ii<=N; ii++) free(hpi_tv[ii]);
	for(ii=0; ii<=N; ii++) free(hlam_tv[ii]);
	for(ii=0; ii<=N; ii++) free(ht_tv[ii]);
	for(ii=0; ii<=N; ii++) free(hrq_tv[ii]);
	for(ii=0; ii<N; ii++) free(hrb_tv[ii]);
	for(ii=0; ii<=N; ii++) free(hrd_tv[ii]);
	for(ii=0; ii<=N; ii++) free(hrz_tv[ii]);
	for(ii=0; ii<=N; ii++) free(hq_tv[ii]);



/**************************************************************************************************
*	printing timings
**************************************************************************************************/

	double times = (tv1.tv_sec-tv0.tv_sec)/(nrep+0.0)+(tv1.tv_usec-tv0.tv_usec)/(nrep*1e6);
	double time = (tv3.tv_sec-tv2.tv_sec)/(nrep+0.0)+(tv3.tv_usec-tv2.tv_usec)/(nrep*1e6);
	double time_tv = (tv5.tv_sec-tv4.tv_sec)/(nrep+0.0)+(tv5.tv_usec-tv4.tv_usec)/(nrep*1e6);
	
/*	printf("\nnx\tnu\tN\tkernel\n\n");*/
/*	printf("\n%d\t%d\t%d\t%e\n\n", nx, nu, N, time);*/
	
	printf("\n");
	printf(" Average number of iterations over %d runs: %5.1f (soft-constraints solver)\n", nrep, kk_avg / (double) nrep);
	printf(" Average number of iterations over %d runs: %5.1f (general-constraints solver)\n", nrep, kks_avg / (double) nrep);
	printf(" Average number of iterations over %d runs: %5.1f (soft-constraints time-variant solver)\n", nrep, kk_avg_tv / (double) nrep);
	printf("\n");
	printf(" Average solution time over %d runs: %5.2e seconds (soft-constraints solver)\n", nrep, time);
	printf(" Average solution time over %d runs: %5.2e seconds (general-constraints solver)\n", nrep, times);
	printf(" Average solution time over %d runs: %5.2e seconds (soft-constraints time-variant solver)\n", nrep, time_tv);
	printf("\n");



/************************************************
* free memory and return
************************************************/

	free(A);
	free(B);
	free(b);
	free(x0);
/*	free(BAb);*/
/*	free(BAbt);*/
	free(pBAbt);
	free(db);
	free(Q);
	free(pQ);
	free(Z);
	free(z);
	free(work);
	free(stat);
	for(jj=0; jj<N; jj++)
		{
//		free(hpQ[jj]);
		free(hq[jj]);
		free(hux[jj]);
		free(hpi[jj]);
		free(hlam[jj]);
		free(ht[jj]);
		free(hrb[jj]);
		free(hrq[jj]);
		free(hrd[jj]);
		free(hrz[jj]);
		}
//	free(hpQ[N]);
	free(hq[N]);
	free(hux[N]);
	free(hpi[N]);
	free(hlam[N]);
	free(ht[N]);
	free(hrq[N]);
	free(hrd[N]);
	free(hrz[N]);



	return 0;

	}
Exemplo n.º 6
0
/*
 * BLAS performance test, double precision.
 *
 * For every matrix size n taken from the nn[] table, each routine under test is called nrep times
 * back to back and the whole batch is timed with gettimeofday(). The average time per call is
 * turned into a Gflops figure using a nominal flop count, and that figure together with its
 * percentage of the theoretical peak (flops_max*GHz_max) is printed to stdout and written as one
 * row of the matrix B in ./test_problems/results/test_blas.m.
 *
 * Returns 0 on success, 1 if the results file cannot be opened.
 */
int main()
	{
		
#if defined(REF_BLAS_OPENBLAS)
	openblas_set_num_threads(1);
#endif
#if defined(REF_BLAS_BLIS)
	omp_set_num_threads(1);
#endif

	printf("\n");
	printf("\n");
	printf("\n");
	printf(" HPMPC -- Library for High-Performance implementation of solvers for MPC.\n");
	printf(" Copyright (C) 2014-2015 by Technical University of Denmark. All rights reserved.\n");
	printf("\n");
	printf(" HPMPC is distributed in the hope that it will be useful,\n");
	printf(" but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
	printf(" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n");
	printf(" See the GNU Lesser General Public License for more details.\n");
	printf("\n");
	printf("\n");
	printf("\n");

	printf("BLAS performance test - double precision\n");
	printf("\n");

	// maximum frequency of the processor
	const float GHz_max = GHZ_MAX;
	printf("Frequency used to compute theoretical peak: %5.1f GHz (edit test_param.h to modify this value).\n", GHz_max);
	printf("\n");

	// maximum flops per cycle, double precision (selected by the compile-time target)
#if defined(TARGET_X64_AVX2)
	const float flops_max = 16;
	printf("Testing BLAS version for AVX2 & FMA3 instruction sets, 64 bit: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#elif defined(TARGET_X64_AVX)
	const float flops_max = 8;
	printf("Testing BLAS version for AVX instruction set, 64 bit: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#elif defined(TARGET_X64_SSE3) || defined(TARGET_AMD_SSE3)
	const float flops_max = 4;
	printf("Testing BLAS version for SSE3 instruction set, 64 bit: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#elif defined(TARGET_CORTEX_A15)
	const float flops_max = 2;
	printf("Testing solvers for ARMv7a VFPv3 instruction set, oprimized for Cortex A15: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#elif defined(TARGET_CORTEX_A9)
	const float flops_max = 1;
	printf("Testing solvers for ARMv7a VFPv3 instruction set, oprimized for Cortex A9: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#elif defined(TARGET_CORTEX_A7)
	const float flops_max = 0.5;
	printf("Testing solvers for ARMv7a VFPv3 instruction set, oprimized for Cortex A7: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#elif defined(TARGET_X86_ATOM)
	const float flops_max = 1;
	printf("Testing BLAS version for SSE3 instruction set, 32 bit, optimized for Intel Atom: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#elif defined(TARGET_POWERPC_G2)
	const float flops_max = 1;
	printf("Testing BLAS version for POWERPC instruction set, 32 bit: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#elif defined(TARGET_C99_4X4)
	const float flops_max = 2;
	printf("Testing reference BLAS version, 4x4 kernel: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#elif defined(TARGET_C99_4X4_PREFETCH)
	const float flops_max = 2;
	printf("Testing reference BLAS version, 4x4 kernel with register prefetch: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#elif defined(TARGET_C99_2X2)
	const float flops_max = 2;
	printf("Testing reference BLAS version, 2x2 kernel: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#endif
	
	// results file; it is truncated on every run
	FILE *f;
	f = fopen("./test_problems/results/test_blas.m", "w");
	if(f==NULL)
		{
		// without this check every fprintf below would dereference a NULL stream
		fprintf(stderr, "\nERROR: cannot open ./test_problems/results/test_blas.m for writing\n");
		return 1;
		}

	// C = name of the target the test was built for
#if defined(TARGET_X64_AVX2)
	fprintf(f, "C = 'd_x64_avx2';\n");
	fprintf(f, "\n");
#elif defined(TARGET_X64_AVX)
	fprintf(f, "C = 'd_x64_avx';\n");
	fprintf(f, "\n");
#elif defined(TARGET_X64_SSE3) || defined(TARGET_AMD_SSE3)
	fprintf(f, "C = 'd_x64_sse3';\n");
	fprintf(f, "\n");
#elif defined(TARGET_CORTEX_A9)
	fprintf(f, "C = 'd_ARM_cortex_A9';\n");
	fprintf(f, "\n");
#elif defined(TARGET_CORTEX_A7)
	fprintf(f, "C = 'd_ARM_cortex_A7';\n");
	fprintf(f, "\n");
#elif defined(TARGET_CORTEX_A15)
	fprintf(f, "C = 'd_ARM_cortex_A15';\n");
	fprintf(f, "\n");
#elif defined(TARGET_X86_ATOM)
	fprintf(f, "C = 'd_x86_atom';\n");
	fprintf(f, "\n");
#elif defined(TARGET_POWERPC_G2)
	fprintf(f, "C = 'd_PowerPC_G2';\n");
	fprintf(f, "\n");
#elif defined(TARGET_C99_4X4)
	fprintf(f, "C = 'd_c99_4x4';\n");
	fprintf(f, "\n");
#elif defined(TARGET_C99_4X4_PREFETCH)
	fprintf(f, "C = 'd_c99_4x4';\n");
	fprintf(f, "\n");
#elif defined(TARGET_C99_2X2)
	fprintf(f, "C = 'd_c99_2x2';\n");
	fprintf(f, "\n");
#endif

	// A = [frequency in GHz, flops per cycle]
	fprintf(f, "A = [%f %f];\n", GHz_max, flops_max);
	fprintf(f, "\n");

	// B = one row of results per tested size, closed after the size loop
	fprintf(f, "B = [\n");
	


	int i, rep, ll;
	
	const int bsd = D_MR; //d_get_mr();

	printf("\nn\t  kernel_dgemm\t  dgemm\t\t  dsyrk_dpotrf\t  dtrmm\t\t  dtrtr\t\t  dgemv_n\t  dgemv_t\t  dtrmv_n\t  dtrmv_t\t  dtrsv_n\t  dtrsv_t\t  dsymv\t\t  dgemv_nt\t\t  dsyrk+dpotrf\t  BLAS dgemm\t  BLAS dgemv_n\t  BLAS dgemv_t\n");
	printf("\nn\t Gflops\t    %%\t Gflops\t    %%\t Gflops\t    %%\t Gflops\t    %%\t Gflops\t    %%\t Gflops\t    %%\t Gflops\t    %%\t Gflops\t    %%\t Gflops\t    %%\t Gflops\t    %%\t Gflops\t    %%\t Gflops\t    %%\t Gflops\t    %%\t Gflops\t    %%\t Gflops\t    %%\t Gflops\t    %%\t Gflops\t    %%\n\n");
	
#if 1
	// matrix sizes and matching repetition counts (fewer repetitions for larger sizes)
	int nn[] = {4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92, 96, 100, 104, 108, 112, 116, 120, 124, 128, 132, 136, 140, 144, 148, 152, 156, 160, 164, 168, 172, 176, 180, 184, 188, 192, 196, 200, 204, 208, 212, 216, 220, 224, 228, 232, 236, 240, 244, 248, 252, 256, 260, 264, 268, 272, 276, 280, 284, 288, 292, 296, 300, 304, 308, 312, 316, 320, 324, 328, 332, 336, 340, 344, 348, 352, 356, 360, 364, 368, 372, 376, 380, 384, 388, 392, 396, 400, 404, 408, 412, 416, 420, 424, 428, 432, 436, 440, 444, 448, 452, 456, 460, 500, 550, 600, 650, 700};
	int nnrep[] = {10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 400, 400, 400, 400, 400, 200, 200, 200, 200, 200, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 20, 20, 20, 20, 20, 20, 20, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 4, 4, 4, 4, 4};
	
	// only the first 75 table entries (n = 4 ... 300) are run;
	// use 115 or 120 as the bound to cover more of the table
	for(ll=0; ll<75; ll++)

		{

		int n = nn[ll];
		int nrep = nnrep[ll];

#else
	// alternative sweep over every size from 1 to 24 with a fixed repetition count
	int nn[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24};
	
	for(ll=0; ll<24; ll++)

		{

		int n = nn[ll];
		int nrep = 40000; //nnrep[ll];
#endif


#if defined(REF_BLAS_BLIS)
		f77_int n77 = n;
#endif
	
		// plain n x n matrices, used as input to the panel conversion and by the reference BLAS
		double *A; d_zeros(&A, n, n);
		double *B; d_zeros(&B, n, n);
		double *C; d_zeros(&C, n, n);
		double *M; d_zeros(&M, n, n);

		// arguments passed by address to the Fortran-style reference BLAS calls
		char c_n = 'n';
		char c_t = 't';
		int i_1 = 1;
#if defined(REF_BLAS_BLIS)
		f77_int i77_1 = i_1;
#endif
		double d_1 = 1;
		double d_0 = 0;
	
		// A[i] = i, B = identity, M = all ones
		for(i=0; i<n*n; i++)
			A[i] = i;
	
		for(i=0; i<n; i++)
			B[i*(n+1)] = 1;
	
		for(i=0; i<n*n; i++)
			M[i] = 1;
	
		int pnd = ((n+bsd-1)/bsd)*bsd; // n rounded up to a multiple of bsd
		int cnd = ((n+D_NCL-1)/D_NCL)*D_NCL; // n rounded up to a multiple of D_NCL
		int cnd2 = 2*((n+D_NCL-1)/D_NCL)*D_NCL; // twice cnd
		int pad = (D_NCL-n%D_NCL)%D_NCL; // n+pad == cnd

		// aligned work matrices and vectors for the library routines
		double *pA; d_zeros_align(&pA, pnd, cnd);
		double *pB; d_zeros_align(&pB, pnd, cnd);
		double *pC; d_zeros_align(&pC, pnd, cnd);
		double *pD; d_zeros_align(&pD, pnd, cnd);
		double *pE; d_zeros_align(&pE, pnd, cnd2);
		double *pF; d_zeros_align(&pF, 2*pnd, cnd);
		double *pL; d_zeros_align(&pL, pnd, cnd);
		double *pM; d_zeros_align(&pM, pnd, cnd);
		double *x; d_zeros_align(&x, pnd, 1);
		double *y; d_zeros_align(&y, pnd, 1);
		double *x2; d_zeros_align(&x2, pnd, 1);
		double *y2; d_zeros_align(&y2, pnd, 1);
		double *diag; d_zeros_align(&diag, pnd, 1);
	
		d_cvt_mat2pmat(n, n, A, n, 0, pA, cnd);
		d_cvt_mat2pmat(n, n, B, n, 0, pB, cnd);
		d_cvt_mat2pmat(n, n, B, n, 0, pD, cnd);
		d_cvt_mat2pmat(n, n, A, n, 0, pE, cnd2);
		d_cvt_mat2pmat(n, n, M, n, 0, pM, cnd);
	
		for(i=0; i<pnd*cnd; i++) pC[i] = -1;
		
		for(i=0; i<pnd; i++) x[i] = 1;
		for(i=0; i<pnd; i++) x2[i] = 1;

		/* timing: tvm1 is the first time stamp; each later stamp closes one section and opens the next */
		struct timeval tvm1, tv0, tv1, tv2, tv3, tv4, tv5, tv6, tv7, tv8, tv9, tv10, tv11, tv12, tv13, tv14, tv15, tv16;

		/* warm up */
		for(rep=0; rep<nrep; rep++)
			{
			dgemm_nt_lib(n, n, n, pA, cnd, pB, cnd, 1, pC, cnd, pC, cnd, 1, 1);
			}

		gettimeofday(&tvm1, NULL); // start

		// dgemm_nn (reported in the kernel_dgemm column)
		for(rep=0; rep<nrep; rep++)
			{
			dgemm_nn_lib(n, n, n, pA, cnd, pB, cnd, 0, pC, cnd, pC, cnd, 0, 0);
			}

		gettimeofday(&tv0, NULL);

		// dgemm_nt
		for(rep=0; rep<nrep; rep++)
			{
			dgemm_nt_lib(n, n, n, pA, cnd, pB, cnd, 0, pC, cnd, pC, cnd, 0, 0);
			}
	
		gettimeofday(&tv1, NULL);

		// fused dsyrk + dpotrf
		for(rep=0; rep<nrep; rep++)
			{
			dsyrk_dpotrf_lib_new(n, n, n, pA, cnd, pA, cnd, 1, pD, cnd, pC, cnd, diag);
			}
	
		gettimeofday(&tv2, NULL);

		// dtrmm
		for(rep=0; rep<nrep; rep++)
			{
			dtrmm_nt_u_lib(n, n, pA, cnd, pB, cnd, pC, cnd);
			}
	
		gettimeofday(&tv3, NULL);

		// dtrtr
		for(rep=0; rep<nrep; rep++)
			{
			dtrtr_l_lib(n, 0, pA, cnd, pC, cnd); // triangular matrix transpose
			}
	
		gettimeofday(&tv4, NULL);

		// dgemv_n
		for(rep=0; rep<nrep; rep++)
			{
			dgemv_n_lib(n, n, pA, cnd, x, 0, y, y);
			}
	
		gettimeofday(&tv5, NULL);

		// dgemv_t
		for(rep=0; rep<nrep; rep++)
			{
			dgemv_t_lib(n, n, pA, cnd, x, 0, y, y);
			}
	
		gettimeofday(&tv6, NULL);

		// dtrmv_n
		for(rep=0; rep<nrep; rep++)
			{
			dtrmv_u_n_lib(n, pA, cnd, x, 0, y);
			}
	
		gettimeofday(&tv7, NULL);

		// dtrmv_t
		for(rep=0; rep<nrep; rep++)
			{
			dtrmv_u_t_lib(n, pA, cnd, x, 0, y);
			}
	
		gettimeofday(&tv8, NULL);

		// dtrsv_n on the 2n x n matrix pF
		// NOTE(review): pF is never filled after the zero allocation -- confirm this is intended
		for(rep=0; rep<nrep; rep++)
			{
			dtrsv_n_lib(2*n, n, 1, pF, cnd, x);
			}
	
		gettimeofday(&tv9, NULL);

		// dtrsv_t on the 2n x n matrix pF
		for(rep=0; rep<nrep; rep++)
			{
			dtrsv_t_lib(2*n, n, 1, pF, cnd, x);
			}
	
		gettimeofday(&tv10, NULL);

		// dsymv
		for(rep=0; rep<nrep; rep++)
			{
			dsymv_lib(n, n, pA, cnd, x, 0, y, y);
			}
	
		gettimeofday(&tv11, NULL);

		// dgemv_nt
		for(rep=0; rep<nrep; rep++)
			{
			dgemv_nt_lib(n, n, pA, cnd, x, x2, 0, y, y2, y, y2);
			}
	
		gettimeofday(&tv12, NULL);

		// separate dsyrk followed by dpotrf; the dsyrk output is placed at offset (n+pad)*bsd inside pE
		// (presumably the second block of cnd columns of the 2*cnd wide matrix -- TODO confirm)
		for(rep=0; rep<nrep; rep++)
			{
			dsyrk_nt_lib(n, n, n, pE, cnd2, pE, cnd2, 1, pD, cnd, pE+(n+pad)*bsd, cnd2);
			dpotrf_lib_new(n, n, pE+(n+pad)*bsd, cnd2, pE+(n+pad)*bsd, cnd2, diag);
			}
	
		gettimeofday(&tv13, NULL);
	
		// reference BLAS dgemm; the loop body is empty when no REF_BLAS_* macro is defined
		// NOTE(review): the OpenBLAS/Netlib call multiplies A by M, the BLIS call multiplies A by B -- confirm intended
		for(rep=0; rep<nrep; rep++)
			{
#if defined(REF_BLAS_OPENBLAS) || defined(REF_BLAS_NETLIB)
			dgemm_(&c_n, &c_n, &n, &n, &n, &d_1, A, &n, M, &n, &d_0, C, &n);
#endif
#if defined(REF_BLAS_BLIS)
			dgemm_(&c_n, &c_n, &n77, &n77, &n77, &d_1, A, &n77, B, &n77, &d_0, C, &n77);
#endif
			}

		gettimeofday(&tv14, NULL);

		// reference BLAS dgemv, not transposed
		for(rep=0; rep<nrep; rep++)
			{
#if defined(REF_BLAS_OPENBLAS) || defined(REF_BLAS_NETLIB)
			dgemv_(&c_n, &n, &n, &d_1, A, &n, x2, &i_1, &d_0, y, &i_1);
#endif
#if defined(REF_BLAS_BLIS)
			dgemv_(&c_n, &n77, &n77, &d_1, A, &n77, x2, &i77_1, &d_0, y, &i77_1);
#endif
			}

		gettimeofday(&tv15, NULL);

		// reference BLAS dgemv, transposed
		for(rep=0; rep<nrep; rep++)
			{
#if defined(REF_BLAS_OPENBLAS) || defined(REF_BLAS_NETLIB)
			dgemv_(&c_t, &n, &n, &d_1, A, &n, x2, &i_1, &d_0, y, &i_1);
#endif
#if defined(REF_BLAS_BLIS)
			dgemv_(&c_t, &n77, &n77, &d_1, A, &n77, x2, &i77_1, &d_0, y, &i77_1);
#endif
			}

		gettimeofday(&tv16, NULL);



		// theoretical peak, used as the 100% reference
		float Gflops_max = flops_max * GHz_max;

		// for every section: average seconds per call, nominal flop count, resulting Gflops
		float time_dgemm_kernel = (float) (tv0.tv_sec-tvm1.tv_sec)/(nrep+0.0)+(tv0.tv_usec-tvm1.tv_usec)/(nrep*1e6);
		float flop_dgemm_kernel = 2.0*n*n*n;
		float Gflops_dgemm_kernel = 1e-9*flop_dgemm_kernel/time_dgemm_kernel;

		float time_dgemm = (float) (tv1.tv_sec-tv0.tv_sec)/(nrep+0.0)+(tv1.tv_usec-tv0.tv_usec)/(nrep*1e6);
		float flop_dgemm = 2.0*n*n*n;
		float Gflops_dgemm = 1e-9*flop_dgemm/time_dgemm;

		float time_dsyrk_dpotrf = (float) (tv2.tv_sec-tv1.tv_sec)/(nrep+0.0)+(tv2.tv_usec-tv1.tv_usec)/(nrep*1e6);
		float flop_dsyrk_dpotrf = 1.0*n*n*n + 1.0/3.0*n*n*n;
		float Gflops_dsyrk_dpotrf = 1e-9*flop_dsyrk_dpotrf/time_dsyrk_dpotrf;

		float time_dtrmm = (float) (tv3.tv_sec-tv2.tv_sec)/(nrep+0.0)+(tv3.tv_usec-tv2.tv_usec)/(nrep*1e6);
		float flop_dtrmm = 1.0*n*n*n;
		float Gflops_dtrmm = 1e-9*flop_dtrmm/time_dtrmm;
	
		float time_dtrtr = (float) (tv4.tv_sec-tv3.tv_sec)/(nrep+0.0)+(tv4.tv_usec-tv3.tv_usec)/(nrep*1e6);
		float flop_dtrtr = 0.5*n*n;
		float Gflops_dtrtr = 1e-9*flop_dtrtr/time_dtrtr;

		float time_dgemv_n = (float) (tv5.tv_sec-tv4.tv_sec)/(nrep+0.0)+(tv5.tv_usec-tv4.tv_usec)/(nrep*1e6);
		float flop_dgemv_n = 2.0*n*n;
		float Gflops_dgemv_n = 1e-9*flop_dgemv_n/time_dgemv_n;

		float time_dgemv_t = (float) (tv6.tv_sec-tv5.tv_sec)/(nrep+0.0)+(tv6.tv_usec-tv5.tv_usec)/(nrep*1e6);
		float flop_dgemv_t = 2.0*n*n;
		float Gflops_dgemv_t = 1e-9*flop_dgemv_t/time_dgemv_t;

		float time_dtrmv_n = (float) (tv7.tv_sec-tv6.tv_sec)/(nrep+0.0)+(tv7.tv_usec-tv6.tv_usec)/(nrep*1e6);
		float flop_dtrmv_n = 1.0*n*n;
		float Gflops_dtrmv_n = 1e-9*flop_dtrmv_n/time_dtrmv_n;

		float time_dtrmv_t = (float) (tv8.tv_sec-tv7.tv_sec)/(nrep+0.0)+(tv8.tv_usec-tv7.tv_usec)/(nrep*1e6);
		float flop_dtrmv_t = 1.0*n*n;
		float Gflops_dtrmv_t = 1e-9*flop_dtrmv_t/time_dtrmv_t;

		float time_dtrsv_n = (float) (tv9.tv_sec-tv8.tv_sec)/(nrep+0.0)+(tv9.tv_usec-tv8.tv_usec)/(nrep*1e6);
		float flop_dtrsv_n = 3.0*n*n;
		float Gflops_dtrsv_n = 1e-9*flop_dtrsv_n/time_dtrsv_n;

		float time_dtrsv_t = (float) (tv10.tv_sec-tv9.tv_sec)/(nrep+0.0)+(tv10.tv_usec-tv9.tv_usec)/(nrep*1e6);
		float flop_dtrsv_t = 3.0*n*n;
		float Gflops_dtrsv_t = 1e-9*flop_dtrsv_t/time_dtrsv_t;

		float time_dsymv = (float) (tv11.tv_sec-tv10.tv_sec)/(nrep+0.0)+(tv11.tv_usec-tv10.tv_usec)/(nrep*1e6);
		float flop_dsymv = 2.0*n*n;
		float Gflops_dsymv = 1e-9*flop_dsymv/time_dsymv;

		float time_dgemv_nt = (float) (tv12.tv_sec-tv11.tv_sec)/(nrep+0.0)+(tv12.tv_usec-tv11.tv_usec)/(nrep*1e6);
		float flop_dgemv_nt = 4.0*n*n;
		float Gflops_dgemv_nt = 1e-9*flop_dgemv_nt/time_dgemv_nt;

		float time_dsyrk_dpotrf2 = (float) (tv13.tv_sec-tv12.tv_sec)/(nrep+0.0)+(tv13.tv_usec-tv12.tv_usec)/(nrep*1e6);
		float flop_dsyrk_dpotrf2 = 1.0*n*n*n + 1.0/3.0*n*n*n;
		float Gflops_dsyrk_dpotrf2 = 1e-9*flop_dsyrk_dpotrf2/time_dsyrk_dpotrf2;

		float time_dgemm_blas = (float) (tv14.tv_sec-tv13.tv_sec)/(nrep+0.0)+(tv14.tv_usec-tv13.tv_usec)/(nrep*1e6);
		float flop_dgemm_blas = 2.0*n*n*n;
		float Gflops_dgemm_blas = 1e-9*flop_dgemm_blas/time_dgemm_blas;

		float time_dgemv_n_blas = (float) (tv15.tv_sec-tv14.tv_sec)/(nrep+0.0)+(tv15.tv_usec-tv14.tv_usec)/(nrep*1e6);
		float flop_dgemv_n_blas = 2.0*n*n;
		float Gflops_dgemv_n_blas = 1e-9*flop_dgemv_n_blas/time_dgemv_n_blas;

		float time_dgemv_t_blas = (float) (tv16.tv_sec-tv15.tv_sec)/(nrep+0.0)+(tv16.tv_usec-tv15.tv_usec)/(nrep*1e6);
		float flop_dgemv_t_blas = 2.0*n*n;
		float Gflops_dgemv_t_blas = 1e-9*flop_dgemv_t_blas/time_dgemv_t_blas;

		// results in the same order as the columns of the table header
		const float Gflops_all[] = {
			Gflops_dgemm_kernel, Gflops_dgemm, Gflops_dsyrk_dpotrf, Gflops_dtrmm, Gflops_dtrtr,
			Gflops_dgemv_n, Gflops_dgemv_t, Gflops_dtrmv_n, Gflops_dtrmv_t, Gflops_dtrsv_n, Gflops_dtrsv_t,
			Gflops_dsymv, Gflops_dgemv_nt, Gflops_dsyrk_dpotrf2,
			Gflops_dgemm_blas, Gflops_dgemv_n_blas, Gflops_dgemv_t_blas};
		const int n_res = sizeof(Gflops_all)/sizeof(Gflops_all[0]);

		// one row per size on stdout: n, then (Gflops, percentage of peak) for every routine
		printf("%d", n);
		for(i=0; i<n_res; i++)
			printf("\t%7.2f\t%7.2f", Gflops_all[i], 100.0*Gflops_all[i]/Gflops_max);
		printf("\n");

		// the same row goes into the results file
		fprintf(f, "%d", n);
		for(i=0; i<n_res; i++)
			fprintf(f, "\t%7.2f\t%7.2f", Gflops_all[i], 100.0*Gflops_all[i]/Gflops_max);
		fprintf(f, "\n");

		// release everything allocated for this size (C and diag used to be leaked on every iteration)
		free(A);
		free(B);
		free(C);
		free(M);
		free(pA);
		free(pB);
		free(pC);
		free(pD);
		free(pE);
		free(pF);
		free(pL);
		free(pM);
		free(x);
		free(y);
		free(x2);
		free(y2);
		free(diag);
		
		}

	printf("\n");

	fprintf(f, "];\n");
	fclose(f);

	return 0;
	
	}
Exemplo n.º 7
0
/*
 * Test driver for the hard-constrained, stage-wise variant-size MPC solvers.
 *
 * Steps performed, in order:
 *   1. print the banner and the problem/solver description;
 *   2. build the mass-spring test problem (dynamics, box and general
 *      constraints, quadratic cost) in both plain column-major form and in
 *      the packed "pmat" form used by the low-level interface;
 *   3. solve it nrep times through the high-level interface
 *      (fortran_order_d_ip_ocp_hard_tv) and re-solve the final KKT system
 *      (fortran_order_d_solve_kkt_new_rhs_ocp_hard_tv);
 *   4. solve it nrep times through the low-level interface
 *      (d_ip2_res_mpc_hard_tv or d_ip2_mpc_hard_tv, selected by USE_IPM_RES)
 *      and re-solve the final KKT system;
 *   5. print solutions, residuals, iteration statistics and average timings;
 *   6. release every buffer allocated here.
 *
 * Returns 0 on completion, 1 if the high-level work space cannot be allocated.
 *
 * NOTE(review): the stage-1 and middle-stage setup indexes nx_v[2], nb_v[1],
 * etc., so the code presumably expects N >= 2 -- confirm against the NN used
 * by the build.
 */
int main()
	{

	printf("\n");
	printf("\n");
	printf("\n");
	printf(" HPMPC -- Library for High-Performance implementation of solvers for MPC.\n");
	printf(" Copyright (C) 2014-2015 by Technical University of Denmark. All rights reserved.\n");
	printf("\n");
	printf(" HPMPC is distributed in the hope that it will be useful,\n");
	printf(" but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
	printf(" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n");
	printf(" See the GNU Lesser General Public License for more details.\n");
	printf("\n");
	printf("\n");
	printf("\n");

#if defined(TARGET_X64_AVX2) || defined(TARGET_X64_AVX) || defined(TARGET_X64_SSE3) || defined(TARGET_X86_ATOM) || defined(TARGET_AMD_SSE3)
	_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); // flush to zero subnormals !!! works only with one thread !!!
#endif

	int ii, jj;

	int rep, nrep=1000;//NREP;

	int nx = NX; // number of states (it has to be even for the mass-spring system test problem)
	int nu = NU; // number of inputs (controllers) (it has to be at least 1 and at most nx/2 for the mass-spring system test problem)
	int N  = NN; // horizon length
	int nb  = nu+nx; // number of box constrained inputs and states
	int ng  = nx; //4;  // number of general constraints
	int ngN = nx; // number of general constraints at the last stage

	// selects the d_ip2_res_* variants of the low-level solver calls below
# define USE_IPM_RES 1

//	int M = 32; // where the equality constraint hold

	int nbu = nu<nb ? nu : nb ; // number of box-constrained inputs
	int nbx = nb-nu>0 ? nb-nu : 0; // number of box-constrained states

	// when 0, stage 0 has no state variables and x0 is folded into b0 (see the dgemv_n_lib call below)
#define KEEP_X0 0

	// stage-wise variant size
	int nx_v[N+1];
#if KEEP_X0
	nx_v[0] = nx;
#else
	nx_v[0] = 0;
#endif
	for(ii=1; ii<=N; ii++)
		nx_v[ii] = nx;

	int nu_v[N+1];
	for(ii=0; ii<N; ii++)
		nu_v[ii] = nu;
	nu_v[N] = 0;

	int nb_v[N+1];
#if KEEP_X0
	nb_v[0] = nb;
#else
	nb_v[0] = nbu;
#endif
	for(ii=1; ii<N; ii++)
		nb_v[ii] = nb;
	nb_v[N] = nbx;

	int ng_v[N+1];
	for(ii=0; ii<N; ii++)
		ng_v[ii] = ng;
	ng_v[N] = ngN;
//	ng_v[M] = nx; // XXX




	printf(" Test problem: mass-spring system with %d masses and %d controls.\n", nx/2, nu);
	printf("\n");
	printf(" MPC problem size: %d states, %d inputs, %d horizon length, %d two-sided box constraints, %d two-sided general constraints.\n", nx, nu, N, nb, ng);
	printf("\n");
#if IP == 1
	printf(" IP method parameters: primal-dual IP, double precision, %d maximum iterations, %5.1e exit tolerance in duality measure (edit file test_param.c to change them).\n", K_MAX, MU_TOL);
#elif IP == 2
	printf(" IP method parameters: predictor-corrector IP, double precision, %d maximum iterations, %5.1e exit tolerance in duality measure (edit file test_param.c to change them).\n", K_MAX, MU_TOL);
#else
	printf(" Wrong value for IP solver choice: %d\n", IP);
#endif

	const int bs  = D_MR; //d_get_mr();
	const int ncl = D_NCL;

	// sizes rounded up to a multiple of bs (p* prefix) or of ncl (c* prefix)
	int pnz = (nu+nx+1+bs-1)/bs*bs;
	int pnu = (nu+bs-1)/bs*bs;
	int pnu1 = (nu+1+bs-1)/bs*bs;
	int pnx = (nx+bs-1)/bs*bs;
	int pnx1 = (nx+1+bs-1)/bs*bs;
	int pnux = (nu+nx+bs-1)/bs*bs;
	int cnx = (nx+ncl-1)/ncl*ncl;
	int cnu = (nu+ncl-1)/ncl*ncl;
	int cnux = (nu+nx+ncl-1)/ncl*ncl;

	// same roundings, per stage
	int pnb_v[N+1];
	int png_v[N+1];
	int pnx_v[N+1];
	int pnz_v[N+1];
	int pnux_v[N+1];
	int cnx_v[N+1];
	int cnux_v[N+1];
	int cng_v[N+1];

	for(ii=0; ii<N; ii++)
		{
		pnb_v[ii] = (nb_v[ii]+bs-1)/bs*bs;
		png_v[ii] = (ng_v[ii]+bs-1)/bs*bs;
		pnx_v[ii] = (nx_v[ii]+bs-1)/bs*bs;
		pnz_v[ii] = (nu_v[ii]+nx_v[ii]+1+bs-1)/bs*bs;
		pnux_v[ii] = (nu_v[ii]+nx_v[ii]+bs-1)/bs*bs;
		cnx_v[ii] = (nx_v[ii]+ncl-1)/ncl*ncl;
		cnux_v[ii] = (nu_v[ii]+nx_v[ii]+ncl-1)/ncl*ncl;
		cng_v[ii] = (ng_v[ii]+ncl-1)/ncl*ncl;
		}
	// last stage: no inputs
	ii = N;
	pnb_v[ii] = (nb_v[ii]+bs-1)/bs*bs;
	png_v[ii] = (ng_v[ii]+bs-1)/bs*bs;
	pnx_v[ii] = (nx_v[ii]+bs-1)/bs*bs;
	pnz_v[ii] = (nx_v[ii]+1+bs-1)/bs*bs;
	pnux_v[ii] = (nx_v[ii]+bs-1)/bs*bs;
	cnx_v[ii] = (nx_v[ii]+ncl-1)/ncl*ncl;
	cnux_v[ii] = (nx_v[ii]+ncl-1)/ncl*ncl;
	cng_v[ii] = (ng_v[ii]+ncl-1)/ncl*ncl;


/************************************************
* dynamical system
************************************************/

	double *A; d_zeros(&A, nx, nx); // states update matrix

	double *B; d_zeros(&B, nx, nu); // inputs matrix

	double *b; d_zeros_align(&b, nx, 1); // states offset
	double *x0; d_zeros_align(&x0, nx, 1); // initial state

	double Ts = 0.5; // sampling time
	mass_spring_system(Ts, nx, nu, N, A, B, b, x0);

	// overwrite the offset and the initial state returned by mass_spring_system
	for(jj=0; jj<nx; jj++)
		b[jj] = 0.1;

	for(jj=0; jj<nx; jj++)
		x0[jj] = 0;
	x0[0] = 2.5;
	x0[1] = 2.5;

	double *pA; d_zeros_align(&pA, pnx, cnx);
	d_cvt_mat2pmat(nx, nx, A, nx, 0, pA, cnx);
	double *b0; d_zeros_align(&b0, pnx, 1);
	for(ii=0; ii<nx; ii++) b0[ii] = b[ii];
#if ! KEEP_X0
	// presumably b0 = b + A*x0, i.e. the initial state is eliminated from stage 0 -- confirm dgemv_n_lib argument semantics
	dgemv_n_lib(nx, nx, pA, cnx, x0, 1, b0, b0);
#endif

	// stage 0: packed [B'; A'; b0'] with the stage-0 sizes
	double *pBAbt0;
	d_zeros_align(&pBAbt0, pnz_v[0], cnx_v[1]);
	d_cvt_tran_mat2pmat(nx_v[1], nu_v[0], B, nx_v[1], 0, pBAbt0, cnx_v[1]);
	d_cvt_tran_mat2pmat(nx_v[1], nx_v[0], A, nx_v[1], nu_v[0], pBAbt0+nu_v[0]/bs*bs*cnx_v[1]+nu_v[0]%bs, cnx_v[1]);
	d_cvt_tran_mat2pmat(nx_v[1], 1, b0, nx_v[1], nu_v[0]+nx_v[0], pBAbt0+(nu_v[0]+nx_v[0])/bs*bs*cnx_v[1]+(nu_v[0]+nx_v[0])%bs, cnx_v[1]);

	// stages 1..N-1: packed [B'; A'; b']; only allocated when N>1, so start from NULL
	double *pBAbt1 = NULL;
	if(N>1)
		{
		d_zeros_align(&pBAbt1, pnz_v[1], cnx_v[2]);
		d_cvt_tran_mat2pmat(nx_v[2], nu_v[1], B, nx_v[2], 0, pBAbt1, cnx_v[2]);
		d_cvt_tran_mat2pmat(nx_v[2], nx_v[1], A, nx_v[2], nu_v[1], pBAbt1+nu_v[1]/bs*bs*cnx_v[2]+nu_v[1]%bs, cnx_v[2]);
		d_cvt_tran_mat2pmat(nx_v[2], 1, b, nx_v[2], nu_v[1]+nx_v[1], pBAbt1+(nu_v[1]+nx_v[1])/bs*bs*cnx_v[2]+(nu_v[1]+nx_v[1])%bs, cnx_v[2]);
		}

#if 0
d_print_pmat(nu_v[0]+nx_v[0]+1, nx_v[1], bs, pBAbt0, cnx_v[1]);
d_print_pmat(nu_v[1]+nx_v[1]+1, nx_v[2], bs, pBAbt1, cnx_v[2]);
exit(2);
#endif

/************************************************
* box & general constraints
************************************************/

	// layout of each d vector: [box lower | box upper | general lower | general upper],
	// with the box parts padded to pnb_v[.] and the general parts to png_v[.]

	int *idx0; i_zeros(&idx0, nb_v[0], 1);
	double *d0; d_zeros_align(&d0, 2*pnb_v[0]+2*png_v[0], 1);
#if KEEP_X0
	for(jj=0; jj<nbu; jj++)
		{
		d0[jj]          = - 0.5;   //   umin
		d0[pnb_v[0]+jj] =   0.5;   //   umax
		idx0[jj] = jj;
		}
	for(; jj<nb; jj++)
		{
		d0[jj]          =   x0[jj-nu];   //   xmin
		d0[pnb_v[0]+jj] =   x0[jj-nu];   //   xmax
		idx0[jj] = jj;
		}
#else
	for(jj=0; jj<nbu; jj++)
		{
		d0[jj]          = - 0.5;   //   umin
		d0[pnb_v[0]+jj] =   0.5;   //   umax
		idx0[jj] = jj;
		}
#endif
	for(jj=0; jj<ng_v[0]; jj++)
		{
		d0[2*pnb_v[0]+jj]          = - 100.0;   //   xmin
		d0[2*pnb_v[0]+png_v[0]+jj] =   100.0;   //   xmax
		}
#if 0
	i_print_mat(1, nb_v[0], idx0, 1);
	d_print_mat(1, 2*pnb_v[0]+2*png_v[0], d0, 1);
	exit(2);
#endif

	int *idx1; i_zeros(&idx1, nb_v[1], 1);
	double *d1; d_zeros_align(&d1, 2*pnb_v[1]+2*png_v[1], 1);
	for(jj=0; jj<nbu; jj++)
		{
		d1[jj]          = - 0.5;   //   umin
		d1[pnb_v[1]+jj] =   0.5;   //   umax
		idx1[jj] = jj;
		}
	for(; jj<nb; jj++)
		{
		d1[jj]          = - 10.0;   //   xmin
		d1[pnb_v[1]+jj] =   10.0;   //   xmax
		idx1[jj] = jj;
		}
	for(jj=0; jj<ng_v[1]; jj++)
		{
		d1[2*pnb_v[1]+jj]          = - 100.0;   //   xmin
		d1[2*pnb_v[1]+png_v[1]+jj] =   100.0;   //   xmax
		}
//	i_print_mat(nb, 1, idx1, nb);

	int *idxN; i_zeros(&idxN, nb_v[N], 1);
	double *dN; d_zeros_align(&dN, 2*pnb_v[N]+2*png_v[N], 1);
	for(jj=0; jj<nbx; jj++)
		{
		dN[jj]          = - 10.0;   //   xmin
		dN[pnb_v[N]+jj] =   10.0;   //   xmax
		idxN[jj] = jj;
		}
	// lower == upper == 0 on the last-stage general constraints
	for(jj=0; jj<ng_v[N]; jj++)
		{
		dN[2*pnb_v[N]+jj]          = - 0.0;   //   xmin
		dN[2*pnb_v[N]+png_v[N]+jj] =   0.0;   //   xmax
		}
//	d_print_mat(1, 2*pnb+2*png, d, 1);
//	d_print_mat(1, 2*pnb_v[N]+2*png_v[N], dN, 1);
//	exit(1);

//	double *dM; d_zeros_align(&dM, 2*pnb_v[M]+2*png_v[M], 1);
//	for(jj=0; jj<nbu; jj++)
//		{
//		dM[jj]          = - 0.5;   //   umin
//		dM[pnb_v[1]+jj] =   0.5;   //   umax
//		}
//	for(; jj<nb; jj++)
//		{
//		dM[jj]          = - 4.0;   //   xmin
//		dM[pnb_v[1]+jj] =   4.0;   //   xmax
//		}
//	for(jj=0; jj<ng_v[M]; jj++)
//		{
//		dM[2*pnb_v[M]+jj]          = - 0.5;   //   xmin
//		dM[2*pnb_v[M]+png_v[M]+jj] = - 0.5;   //   xmax
//		}

	// C: ones on the diagonal (ng == nx here), D: all zeros
	double *C; d_zeros(&C, ng, nx);
	for(ii=0; ii<ng; ii++)
		C[ii*(ng+1)] = 1.0;
	double *D; d_zeros(&D, ng, nu);

	// first stage
	double *pDCt0; d_zeros_align(&pDCt0, pnux_v[0], cng_v[0]);
	// middle stage
	double *DC1; d_zeros(&DC1, ng_v[1], nu_v[1]+nx_v[1]);
	for(jj=0; jj<ng_v[1]; jj++) DC1[jj+(nu_v[1]+jj)*ng_v[1]] = 1.0;
//	d_print_mat(ng_v[1], nu_v[1]+nx_v[1], DC1, ng_v[1]);
	double *pDCt1; d_zeros_align(&pDCt1, pnux_v[1], cng_v[1]);
	d_cvt_tran_mat2pmat(ng_v[1], nu_v[1]+nx_v[1], DC1, ng_v[1], 0, pDCt1, cng_v[1]);
//	d_print_pmat(nu_v[1]+nx_v[1], ng_v[1], bs, pDCt1, cng_v[1]);
//	exit(2);
	// last stage
	double *DCN; d_zeros(&DCN, ng_v[N], nx_v[N]);
	for(jj=0; jj<ng_v[N]; jj++) DCN[jj*(ng_v[N]+1)] = 1.0;
//	d_print_mat(ng_v[N], nx_v[N], DCN, ng_v[N]);
	double *pDCtN; d_zeros_align(&pDCtN, pnx_v[N], cng_v[N]);
	d_cvt_tran_mat2pmat(ng_v[N], nx_v[N], DCN, ng_v[N], 0, pDCtN, cng_v[N]);
//	d_print_pmat(nx_v[N], ng_v[N], bs, pDCtN, cng_v[N]);
	// constrained stage
//	double *DCM; d_zeros(&DCM, ng_v[M], nu_v[M]+nx_v[M]);
//	for(jj=0; jj<ng_v[M]; jj++) DCM[jj+(jj+nu_v[M])*ng_v[M]] = 1.0;
//	d_print_mat(ng_v[M], nu_v[M]+nx_v[M], DCM, ng_v[M]);
//	double *pDCtM; d_zeros_align(&pDCtM, pnux_v[M], cng_v[M]);
//	d_cvt_tran_mat2pmat(ng_v[M], nu_v[M]+nx_v[M], DCM, ng_v[M], 0, pDCtM, cng_v[M]);
//	d_print_pmat(nu_v[M]+nx_v[M], ng_v[M], bs, pDCtM, cng_v[M]);
//	exit(2);

/************************************************
* cost function
************************************************/

	// Q = I, R = 2*I, S = 0, q = 0.1, r = 0.2
	double *Q; d_zeros(&Q, nx, nx);
	for(ii=0; ii<nx; ii++) Q[ii*(nx+1)] = 1.0;

	double *R; d_zeros(&R, nu, nu);
	for(ii=0; ii<nu; ii++) R[ii*(nu+1)] = 2.0;

	double *S; d_zeros(&S, nu, nx); // S=0, so no need to update r0

	double *q; d_zeros(&q, nx, 1);
	for(ii=0; ii<nx; ii++) q[ii] = 0.1;

	double *r; d_zeros(&r, nu, 1);
	for(ii=0; ii<nu; ii++) r[ii] = 0.2;

	// stage 0 cost (inputs only when x0 is eliminated)
#if KEEP_X0
	double  *pRSQ0; d_zeros_align(&pRSQ0, pnz, cnux);
	d_cvt_mat2pmat(nu, nu, R, nu, 0, pRSQ0, cnux);
	d_cvt_tran_mat2pmat(nu, nx, S, nu, nu, pRSQ0+nu/bs*bs*cnux+nu%bs, cnux);
	d_cvt_tran_mat2pmat(nu, 1, r, nu, nu+nx, pRSQ0+(nu+nx)/bs*bs*cnux+(nu+nx)%bs, cnux);
	d_cvt_mat2pmat(nx, nx, Q, nx, nu, pRSQ0+nu/bs*bs*cnux+nu%bs+nu*bs, cnux);
	d_cvt_tran_mat2pmat(nx, 1, q, nx, nu+nx, pRSQ0+(nu+nx)/bs*bs*cnux+(nu+nx)%bs+nu*bs, cnux);
//	d_print_pmat(nu+nx+1, nu+nx, bs, pRSQ0, cnux);
	double *rq0; d_zeros_align(&rq0, pnux, 1);
	d_copy_mat(nu, 1, r, nu, rq0, pnux);
	d_copy_mat(nx, 1, q, nx, rq0+nu, pnux);
#else
	double  *pRSQ0; d_zeros_align(&pRSQ0, pnu1, cnu);
	d_cvt_mat2pmat(nu, nu, R, nu, 0, pRSQ0, cnu);
	d_cvt_tran_mat2pmat(nu, 1, r, nu, nu, pRSQ0+nu/bs*bs*cnu+nu%bs, cnu);
//	d_print_pmat(nu+1, nu, bs, pRSQ0, cnu);
	double *rq0; d_zeros_align(&rq0, pnu, 1);
	d_copy_mat(nu, 1, r, nu, rq0, pnu);
#endif

	// middle-stage cost
	double  *pRSQ1; d_zeros_align(&pRSQ1, pnz, cnux);
	d_cvt_mat2pmat(nu, nu, R, nu, 0, pRSQ1, cnux);
	d_cvt_tran_mat2pmat(nu, nx, S, nu, nu, pRSQ1+nu/bs*bs*cnux+nu%bs, cnux);
	d_cvt_tran_mat2pmat(nu, 1, r, nu, nu+nx, pRSQ1+(nu+nx)/bs*bs*cnux+(nu+nx)%bs, cnux);
	d_cvt_mat2pmat(nx, nx, Q, nx, nu, pRSQ1+nu/bs*bs*cnux+nu%bs+nu*bs, cnux);
	d_cvt_tran_mat2pmat(nx, 1, q, nx, nu+nx, pRSQ1+(nu+nx)/bs*bs*cnux+(nu+nx)%bs+nu*bs, cnux);
//	d_print_pmat(nu+nx+1, nu+nx, bs, pRSQ1, cnux);
	double *rq1; d_zeros_align(&rq1, pnux, 1);
	d_copy_mat(nu, 1, r, nu, rq1, pnux);
	d_copy_mat(nx, 1, q, nx, rq1+nu, pnux);

	// last-stage cost (states only)
	double  *pRSQN; d_zeros_align(&pRSQN, pnx1, cnx);
	d_cvt_mat2pmat(nx, nx, Q, nx, 0, pRSQN, cnx);
	d_cvt_tran_mat2pmat(nx, 1, q, nx, nx, pRSQN+(nx)/bs*bs*cnx+(nx)%bs, cnx);
//	d_print_pmat(nx+1, nx, bs, pRSQN, cnx);
	double *rqN; d_zeros_align(&rqN, pnx, 1);
	d_copy_mat(nx, 1, q, nx, rqN, pnx);


	// maximum element in cost functions
	double mu0 = 2.0;

/************************************************
* high level interface work space
************************************************/

#if 0
	double *rA; d_zeros(&rA, nx, N*nx);
	d_rep_mat(N, nx, nx, A, nx, rA, nx);

	double *rB; d_zeros(&rB, nx, N*nu);
	d_rep_mat(N, nx, nu, B, nx, rB, nx);

	double *rC; d_zeros(&rC, ng, (N+1)*nx);
	d_rep_mat(N, ng, nx, C, ng, rC+nx*ng, ng);

	double *CN = DCN;

	double *rD; d_zeros(&rD, ng, N*nu);
	d_rep_mat(N, ng, nu, D, ng, rD, ng);

	double *rb; d_zeros(&rb, nx, N*1);
	d_rep_mat(N, nx, 1, b, nx, rb, nx);

	double *rQ; d_zeros(&rQ, nx, N*nx);
	d_rep_mat(N, nx, nx, Q, nx, rQ, nx);

	double *rQf; d_zeros(&rQf, nx, nx);
	d_copy_mat(nx, nx, Q, nx, rQf, nx);

	double *rS; d_zeros(&rS, nu, N*nx);
	d_rep_mat(N, nu, nx, S, nu, rS, nu);

	double *rR; d_zeros(&rR, nu, N*nu);
	d_rep_mat(N, nu, nu, R, nu, rR, nu);

	double *rq; d_zeros(&rq, nx, N);
	d_rep_mat(N, nx, 1, q, nx, rq, nx);

	double *rqf; d_zeros(&rqf, nx, 1);
	d_copy_mat(nx, 1, q, nx, rqf, nx);

	double *rr; d_zeros(&rr, nu, N);
	d_rep_mat(N, nu, 1, r, nu, rr, nu);

	double *lb; d_zeros(&lb, nb, 1);
	for(ii=0; ii<nb; ii++)
		lb[ii] = d1[ii];
	double *rlb; d_zeros(&rlb, nb, N+1);
	d_rep_mat(N+1, nb, 1, lb, nb, rlb, nb);
//	d_print_mat(nb, N+1, rlb, nb);

	double *lg; d_zeros(&lg, ng, 1);
	for(ii=0; ii<ng; ii++)
		lg[ii] = d1[2*pnb_v[1]+ii];
	double *rlg; d_zeros(&rlg, ng, N);
	d_rep_mat(N, ng, 1, lg, ng, rlg, ng);
//	d_print_mat(ng, N, rlg, ng);

	double *lgN; d_zeros(&lgN, ngN, 1);
	for(ii=0; ii<ngN; ii++)
		lgN[ii] = dN[2*pnb_v[N]+ii];
//	d_print_mat(ngN, 1, lgN, ngN);

	double *ub; d_zeros(&ub, nb, 1);
	for(ii=0; ii<nb; ii++)
		ub[ii] = d1[pnb_v[1]+ii];
	double *rub; d_zeros(&rub, nb, N+1);
	d_rep_mat(N+1, nb, 1, ub, nb, rub, nb);
//	d_print_mat(nb, N+1, rub, nb);

	double *ug; d_zeros(&ug, ng, 1);
	for(ii=0; ii<ng; ii++)
		ug[ii] = d1[2*pnb_v[1]+png_v[1]+ii];
	double *rug; d_zeros(&rug, ng, N);
	d_rep_mat(N, ng, 1, ug, ng, rug, ng);
//	d_print_mat(ng, N, rug, ng);

	double *ugN; d_zeros(&ugN, ngN, 1);
	for(ii=0; ii<ngN; ii++)
		ugN[ii] = dN[2*pnb_v[N]+png_v[N]+ii];
//	d_print_mat(ngN, 1, ugN, ngN);

	double *rx; d_zeros(&rx, nx, N+1);
	d_copy_mat(nx, 1, x0, nx, rx, nx);

	double *ru; d_zeros(&ru, nu, N);

	double *rpi; d_zeros(&rpi, nx, N);

	double *rlam; d_zeros(&rlam, N*2*(nb+ng)+2*(nb+ngN), 1);

	double *rt; d_zeros(&rt, N*2*(nb+ng)+2*(nb+ngN), 1);

	double *rwork = (double *) malloc(hpmpc_d_ip_mpc_hard_tv_work_space_size_bytes(N, nx, nu, nb, ng, ngN));

	double inf_norm_res[4] = {}; // infinity norm of residuals: rq, rb, rd, mu
#endif

/************************************************
* low level interface work space
************************************************/

	// per-stage pointer tables; problem data is shared between stages,
	// solution and residual vectors are allocated per stage
	double *hpBAbt[N];
	double *hpDCt[N+1];
	double *hb[N];
	double *hpRSQ[N+1];
	double *hrq[N+1];
	double *hd[N+1];
	int *idx[N+1];
	double *hux[N+1];
	double *hpi[N];
	double *hlam[N+1];
	double *ht[N+1];
	double *hrb[N];
	double *hrrq[N+1];
	double *hrd[N+1];
	hpBAbt[0] = pBAbt0;
	hpDCt[0] = pDCt0;
	hb[0] = b0;
	hpRSQ[0] = pRSQ0;
	hrq[0] = rq0;
	hd[0] = d0;
	idx[0] = idx0;
	d_zeros_align(&hux[0], pnux_v[0], 1);
	d_zeros_align(&hpi[0], pnx_v[1], 1);
	d_zeros_align(&hlam[0], 2*pnb_v[0]+2*png_v[0], 1);
	d_zeros_align(&ht[0], 2*pnb_v[0]+2*png_v[0], 1);
	d_zeros_align(&hrb[0], pnx_v[1], 1);
	d_zeros_align(&hrrq[0], pnz_v[0], 1);
	d_zeros_align(&hrd[0], 2*pnb_v[0]+2*png_v[0], 1);
	for(ii=1; ii<N; ii++)
		{
		hpBAbt[ii] = pBAbt1;
//		d_zeros_align(&hpBAbt[ii], pnz_v[ii], cnx_v[ii+1]); for(jj=0; jj<pnz_v[ii]*cnx_v[ii+1]; jj++) hpBAbt[ii][jj] = pBAbt1[jj];
		hpDCt[ii] = pDCt1;
		hb[ii] = b;
		hpRSQ[ii] = pRSQ1;
//		d_zeros_align(&hpRSQ[ii], pnz_v[ii], cnux_v[ii]); for(jj=0; jj<pnz_v[ii]*cnux_v[ii]; jj++) hpRSQ[ii][jj] = pRSQ1[jj];
		hrq[ii] = rq1;
		hd[ii] = d1;
		idx[ii] = idx1;
		d_zeros_align(&hux[ii], pnux_v[ii], 1);
		d_zeros_align(&hpi[ii], pnx_v[ii+1], 1);
		d_zeros_align(&hlam[ii], 2*pnb_v[ii]+2*png_v[ii], 1);
		d_zeros_align(&ht[ii], 2*pnb_v[ii]+2*png_v[ii], 1);
		d_zeros_align(&hrb[ii], pnx_v[ii+1], 1);
		d_zeros_align(&hrrq[ii], pnz_v[ii], 1);
		d_zeros_align(&hrd[ii], 2*pnb_v[ii]+2*png_v[ii], 1);
		}
	hpDCt[N] = pDCtN;
	hpRSQ[N] = pRSQN;
	hrq[N] = rqN;
	hd[N] = dN;
	idx[N] = idxN;
	d_zeros_align(&hux[N], pnx, 1);
	d_zeros_align(&hlam[N], 2*pnb_v[N]+2*png_v[N], 1);
	d_zeros_align(&ht[N], 2*pnb_v[N]+2*png_v[N], 1);
	d_zeros_align(&hrrq[N], pnz_v[N], 1);
	d_zeros_align(&hrd[N], 2*pnb_v[N]+2*png_v[N], 1);

//	hpDCt[M] = pDCtM;
//	hd[M] = dM;

	double mu = 0.0;

	// the work space size functions return bytes; d_zeros_align takes a count of doubles
#if USE_IPM_RES
	double *work; d_zeros_align(&work, d_ip2_res_mpc_hard_tv_work_space_size_bytes(N, nx_v, nu_v, nb_v, ng_v)/sizeof(double), 1);
#else
	double *work; d_zeros_align(&work, d_ip2_mpc_hard_tv_work_space_size_bytes(N, nx_v, nu_v, nb_v, ng_v)/sizeof(double), 1);
#endif

/************************************************
* (new) high level interface work space
************************************************/

	// box constraints (unpadded copies of the bounds stored in d0, d1, dN)
	double *lb0; d_zeros(&lb0, nb_v[0], 1);
	for(ii=0; ii<nb_v[0]; ii++)
		lb0[ii] = d0[ii];
	double *ub0; d_zeros(&ub0, nb_v[0], 1);
	for(ii=0; ii<nb_v[0]; ii++)
		ub0[ii] = d0[pnb_v[0]+ii];
	double *lb1; d_zeros(&lb1, nb_v[1], 1);
	for(ii=0; ii<nb_v[1]; ii++)
		lb1[ii] = d1[ii];
	double *ub1; d_zeros(&ub1, nb_v[1], 1);
	for(ii=0; ii<nb_v[1]; ii++)
		ub1[ii] = d1[pnb_v[1]+ii];
	double *lbN; d_zeros(&lbN, nb_v[N], 1);
	for(ii=0; ii<nb_v[N]; ii++)
		lbN[ii] = dN[ii];
	double *ubN; d_zeros(&ubN, nb_v[N], 1);
	for(ii=0; ii<nb_v[N]; ii++)
		ubN[ii] = dN[pnb_v[N]+ii];

	// general constraints
	double *lg0; d_zeros(&lg0, ng_v[0], 1);
	for(ii=0; ii<ng_v[0]; ii++)
		lg0[ii] = d0[2*pnb_v[0]+ii];
	double *ug0; d_zeros(&ug0, ng_v[0], 1);
	for(ii=0; ii<ng_v[0]; ii++)
		ug0[ii] = d0[2*pnb_v[0]+png_v[0]+ii];
	double *lg1; d_zeros(&lg1, ng_v[1], 1);
	for(ii=0; ii<ng_v[1]; ii++)
		lg1[ii] = d1[2*pnb_v[1]+ii];
	double *ug1; d_zeros(&ug1, ng_v[1], 1);
	for(ii=0; ii<ng_v[1]; ii++)
		ug1[ii] = d1[2*pnb_v[1]+png_v[1]+ii];
	double *lgN; d_zeros(&lgN, ng_v[N], 1);
	for(ii=0; ii<ng_v[N]; ii++)
		lgN[ii] = dN[2*pnb_v[N]+ii];
	double *ugN; d_zeros(&ugN, ng_v[N], 1);
	for(ii=0; ii<ng_v[N]; ii++)
		ugN[ii] = dN[2*pnb_v[N]+png_v[N]+ii];

	// data matrices
	double *hA[N];
	double *hB[N];
	double *hC[N+1];
	double *hD[N];
	double *hQ[N+1];
	double *hS[N];
	double *hR[N];
	double *hq[N+1];
	double *hr[N];
	double *hlb[N+1];
	double *hub[N+1];
	double *hlg[N+1];
	double *hug[N+1];
	double *hx[N+1];
	double *hu[N];
	double *hpi1[N];
	double *hlam1[N+1];
	double *ht1[N+1];
	double inf_norm_res[4] = {0}; // infinity norm of residuals: rq, rb, rd, mu

	ii = 0;
	hA[0] = A;
	hB[0] = B;
	hC[0] = C;
	hD[0] = D;
	hQ[0] = Q;
	hS[0] = S;
	hR[0] = R;
	hq[0] = q;
	hr[0] = r;
	hlb[0] = lb0;
	hub[0] = ub0;
	hlg[0] = lg0;
	hug[0] = ug0;
	d_zeros(&hx[0], nx_v[0], 1);
	d_zeros(&hu[0], nu_v[0], 1);
	d_zeros(&hpi1[0], nx_v[1], 1);
	d_zeros(&hlam1[0], 2*nb_v[0]+2*ng_v[0], 1);
	d_zeros(&ht1[0], 2*nb_v[0]+2*ng_v[0], 1);
	for(ii=1; ii<N; ii++)
		{
		hA[ii] = A;
		hB[ii] = B;
		hC[ii] = C;
		hD[ii] = D;
		hQ[ii] = Q;
		hS[ii] = S;
		hR[ii] = R;
		hq[ii] = q;
		hr[ii] = r;
		hlb[ii] = lb1;
		hub[ii] = ub1;
		hlg[ii] = lg1;
		hug[ii] = ug1;
		d_zeros(&hx[ii], nx_v[ii], 1);
		d_zeros(&hu[ii], nu_v[ii], 1);
		d_zeros(&hpi1[ii], nx_v[ii+1], 1);
		d_zeros(&hlam1[ii], 2*nb_v[ii]+2*ng_v[ii], 1);
		d_zeros(&ht1[ii], 2*nb_v[ii]+2*ng_v[ii], 1);
		}
	ii = N;
	hC[N] = C;
	hQ[N] = Q;
	hq[N] = q;
	hlb[N] = lbN;
	hub[N] = ubN;
	hlg[N] = lgN;
	hug[N] = ugN;
	d_zeros(&hx[N], nx_v[N], 1);
	d_zeros(&hlam1[N], 2*nb_v[N]+2*ng_v[N], 1);
	d_zeros(&ht1[N], 2*nb_v[N]+2*ng_v[N], 1);

	// work space
#if 0
	printf("work space in bytes: %d\n", hpmpc_d_ip_ocp_hard_tv_work_space_size_bytes(N, nx_v, nu_v, nb_v, ng_v));
	exit(3);
#endif
	void *work1 = malloc(hpmpc_d_ip_ocp_hard_tv_work_space_size_bytes(N, nx_v, nu_v, nb_v, ng_v));
	if(work1==NULL)
		{
		// the solver would otherwise be handed a NULL work space; the process exits here, so no cleanup is attempted
		fprintf(stderr, "\nerror: cannot allocate the high-level interface work space\n\n");
		return 1;
		}

/************************************************
* solvers common stuff
************************************************/

	int hpmpc_status;
	int kk, kk_avg;
	int k_max = 10;
	double mu_tol = 1e-20;
	double alpha_min = 1e-8;
	int warm_start = 0; // read initial guess from x and u
	double *stat; d_zeros(&stat, k_max, 5); // 5 values per iteration, printed below
	int compute_res = 1;
	int compute_mult = 1;

	struct timeval tv0, tv1, tv2, tv3;
	double time;

/************************************************
* call the solver (high-level interface)
************************************************/

#if 1
	int time_invariant = 0; // assume the problem to be time invariant
	int free_x0 = 0; // assume x0 as optimization variable

	gettimeofday(&tv0, NULL); // start

	kk_avg = 0;

	for(rep=0; rep<nrep; rep++)
		{

//		hpmpc_status = fortran_order_d_ip_mpc_hard_tv(&kk, k_max, mu0, mu_tol, N, nx, nu, nb, ng, ngN, time_invariant, free_x0, warm_start, rA, rB, rb, rQ, rQf, rS, rR, rq, rqf, rr, rlb, rub, rC, rD, rlg, rug, CN, lgN, ugN, rx, ru, rpi, rlam, rt, inf_norm_res, rwork, stat);
		hpmpc_status = fortran_order_d_ip_ocp_hard_tv(&kk, k_max, mu0, mu_tol, N, nx_v, nu_v, nb_v, ng_v, warm_start, hA, hB, hb, hQ, hS, hR, hq, hr, hlb, hub, hC, hD, hlg, hug, hx, hu, hpi1, hlam1, ht1, inf_norm_res, work1, stat);

		kk_avg += kk;

		}

	gettimeofday(&tv1, NULL); // stop

	printf("\nsolution from high-level interface\n\n");
//	d_print_mat(nx, N+1, rx, nx);
//	d_print_mat(nu, N, ru, nu);
	for(ii=0; ii<=N; ii++)
		d_print_mat(1, nx_v[ii], hx[ii], 1);
	for(ii=0; ii<N; ii++)
		d_print_mat(1, nu_v[ii], hu[ii], 1);

	printf("\ninfinity norm of residuals\n\n");
	d_print_mat_e(1, 4, inf_norm_res, 1);

	// average wall-clock time per run, in seconds
	time = (tv1.tv_sec-tv0.tv_sec)/(nrep+0.0)+(tv1.tv_usec-tv0.tv_usec)/(nrep*1e6);

	printf("\nstatistics from last run\n\n");
	for(jj=0; jj<kk; jj++)
		printf("k = %d\tsigma = %f\talpha = %f\tmu = %f\t\tmu = %e\talpha = %f\tmu = %f\tmu = %e\n", jj, stat[5*jj], stat[5*jj+1], stat[5*jj+2], stat[5*jj+2], stat[5*jj+3], stat[5*jj+4], stat[5*jj+4]);
	printf("\n");

	printf("\n");
	printf(" Average number of iterations over %d runs: %5.1f\n", nrep, kk_avg / (double) nrep);
	printf(" Average solution time over %d runs: %5.2e seconds\n", nrep, time);
	printf("\n\n");

	gettimeofday(&tv0, NULL); // start

	// no iteration count is accumulated here: this call does not take kk
	for(rep=0; rep<nrep; rep++)
		{

//		fortran_order_d_solve_kkt_new_rhs_mpc_hard_tv(N, nx, nu, nb, ng, ngN, time_invariant, free_x0, rA, rB, rb, rQ, rQf, rS, rR, rq, rqf, rr, rlb, rub, rC, rD, rlg, rug, CN, lgN, ugN, rx, ru, rpi, rlam, rt, inf_norm_res, rwork);
		fortran_order_d_solve_kkt_new_rhs_ocp_hard_tv(N, nx_v, nu_v, nb_v, ng_v, hA, hB, hb, hQ, hS, hR, hq, hr, hlb, hub, hC, hD, hlg, hug, hx, hu, hpi1, hlam1, ht1, inf_norm_res, work1);

		}

	gettimeofday(&tv1, NULL); // stop

	printf("\nsolution from high-level interface (resolve final kkt)\n\n");
//	d_print_mat(nx, N+1, rx, nx);
//	d_print_mat(nu, N, ru, nu);
	for(ii=0; ii<=N; ii++)
		d_print_mat(1, nx_v[ii], hx[ii], 1);
	for(ii=0; ii<N; ii++)
		d_print_mat(1, nu_v[ii], hu[ii], 1);

	printf("\ninfinity norm of residuals\n\n");
	d_print_mat_e(1, 4, inf_norm_res, 1);

	time = (tv1.tv_sec-tv0.tv_sec)/(nrep+0.0)+(tv1.tv_usec-tv0.tv_usec)/(nrep*1e6);

	printf(" Average solution time over %d runs: %5.2e seconds\n", nrep, time);
#endif

/************************************************
* call the solver (low-level interface)
************************************************/

//	for(ii=0; ii<N; ii++)
//		d_print_pmat(nu_v[ii]+nx_v[ii]+1, nx_v[ii+1], bs, hpBAbt[ii], cnx_v[ii+1]);
//	exit(3);

	gettimeofday(&tv0, NULL); // start

	kk_avg = 0;

	printf("\nsolution...\n");
	for(rep=0; rep<nrep; rep++)
		{

#if USE_IPM_RES
		hpmpc_status = d_ip2_res_mpc_hard_tv(&kk, k_max, mu0, mu_tol, alpha_min, warm_start, stat, N, nx_v, nu_v, nb_v, idx, ng_v, hpBAbt, hpRSQ, hpDCt, hd, hux, compute_mult, hpi, hlam, ht, work);
#else
		hpmpc_status = d_ip2_mpc_hard_tv(&kk, k_max, mu0, mu_tol, alpha_min, warm_start, stat, N, nx_v, nu_v, nb_v, idx, ng_v, hpBAbt, hpRSQ, hpDCt, hd, hux, compute_mult, hpi, hlam, ht, work);
#endif

		kk_avg += kk;

		}
	printf("\ndone\n");

	gettimeofday(&tv1, NULL); // stop

	printf("\nsolution from low-level interface (original problem)\n\n");
	printf("\nux\n\n");
	for(ii=0; ii<=N; ii++)
		d_print_mat(1, nu_v[ii]+nx_v[ii], hux[ii], 1);
	printf("\npi\n\n");
	for(ii=0; ii<N; ii++)
		d_print_mat(1, nx_v[ii+1], hpi[ii], 1);
//	printf("\nux\n\n");
//	for(ii=0; ii<=N; ii++)
//		d_print_mat(1, 2*pnb_v[ii]+2*png_v[ii], hlam[ii], 1);
//	printf("\nux\n\n");
//	for(ii=0; ii<=N; ii++)
//		d_print_mat(1, 2*pnb_v[ii]+2*png_v[ii], ht[ii], 1);

	// residuals
	if(compute_res)
		{
		// compute residuals
		d_res_mpc_hard_tv(N, nx_v, nu_v, nb_v, idx, ng_v, hpBAbt, hb, hpRSQ, hrq, hux, hpDCt, hd, hpi, hlam, ht, hrrq, hrb, hrd, &mu);

		// print residuals
		printf("\nhrrq\n\n");
		for(ii=0; ii<=N; ii++)
			d_print_mat_e(1, nu_v[ii]+nx_v[ii], hrrq[ii], 1);

		printf("\nhrb\n\n");
		for(ii=0; ii<N; ii++)
			d_print_mat_e(1, nx_v[ii+1], hrb[ii], 1);

		printf("\nhrd low\n\n");
		for(ii=0; ii<=N; ii++)
			d_print_mat_e(1, nb_v[ii], hrd[ii], 1);

		printf("\nhrd up\n\n");
		for(ii=0; ii<=N; ii++)
			d_print_mat_e(1, nb_v[ii], hrd[ii]+pnb_v[ii], 1);

		}



	// zero the solution again
	for(ii=0; ii<=N; ii++)
		for(jj=0; jj<nu_v[ii]+nx_v[ii]; jj++) hux[ii][jj] = 0.0;

	// modify constraints
#if 0
	for(jj=0; jj<nbx; jj++)
		{
		dN[jj]          = - 4.0;   //   xmin
		dN[pnb_v[N]+jj] =   4.0;   //   xmax
		idxN[jj] = jj;
		}
	for(jj=0; jj<ng_v[N]; jj++)
		{
		dN[2*pnb_v[N]+jj]          =   0.1;   //   xmin
		dN[2*pnb_v[N]+png_v[N]+jj] =   0.1;   //   xmax
		}
#endif

#if 0
for(ii=0; ii<=N; ii++)
	d_print_pmat(nu_v[ii]+nx_v[ii]+1, nu_v[ii]+nx_v[ii], bs, hpRSQ[ii], cnux_v[ii]);
for(ii=0; ii<=N; ii++)
	d_print_mat(1, nu_v[ii]+nx_v[ii], hrq[ii], 1);
exit(1);
#endif

	gettimeofday(&tv2, NULL); // start

	printf("\nsolution...\n");
	for(rep=0; rep<nrep; rep++)
		{

#if USE_IPM_RES
		d_kkt_solve_new_rhs_res_mpc_hard_tv(N, nx_v, nu_v, nb_v, idx, ng_v, hpBAbt, hb, hpRSQ, hrq, hpDCt, hd, hux, compute_mult, hpi, hlam, ht, work);
#else
		d_kkt_solve_new_rhs_mpc_hard_tv(N, nx_v, nu_v, nb_v, idx, ng_v, hpBAbt, hb, hpRSQ, hrq, hpDCt, hd, hux, compute_mult, hpi, hlam, ht, work);
#endif

		}
	printf("\ndone\n");

	gettimeofday(&tv3, NULL); // stop

	printf("\nsolution from low-level interface (resolve final kkt)\n\n");
	printf("\nux\n\n");
	for(ii=0; ii<=N; ii++)
		d_print_mat(1, nu_v[ii]+nx_v[ii], hux[ii], 1);
	printf("\npi\n\n");
	for(ii=0; ii<N; ii++)
		d_print_mat(1, nx_v[ii+1], hpi[ii], 1);
//	printf("\nux\n\n");
//	for(ii=0; ii<=N; ii++)
//		d_print_mat(1, 2*pnb_v[ii]+2*png_v[ii], hlam[ii], 1);
//	printf("\nux\n\n");
//	for(ii=0; ii<=N; ii++)
//		d_print_mat(1, 2*pnb_v[ii]+2*png_v[ii], ht[ii], 1);

	// residuals
	if(compute_res)
		{
		// compute residuals
		d_res_mpc_hard_tv(N, nx_v, nu_v, nb_v, idx, ng_v, hpBAbt, hb, hpRSQ, hrq, hux, hpDCt, hd, hpi, hlam, ht, hrrq, hrb, hrd, &mu);

		// print residuals
		printf("\nhrrq\n\n");
		for(ii=0; ii<=N; ii++)
			d_print_mat_e(1, nu_v[ii]+nx_v[ii], hrrq[ii], 1);

		printf("\nhrb\n\n");
		for(ii=0; ii<N; ii++)
			d_print_mat_e(1, nx_v[ii+1], hrb[ii], 1);

		printf("\nhrd low\n\n");
		for(ii=0; ii<=N; ii++)
			d_print_mat_e(1, nb_v[ii], hrd[ii], 1);

		printf("\nhrd up\n\n");
		for(ii=0; ii<=N; ii++)
			d_print_mat_e(1, nb_v[ii], hrd[ii]+pnb_v[ii], 1);

		}

	// average wall-clock time per run, in seconds
	double time_ipm = (tv1.tv_sec-tv0.tv_sec)/(nrep+0.0)+(tv1.tv_usec-tv0.tv_usec)/(nrep*1e6);
	double time_final = (tv3.tv_sec-tv2.tv_sec)/(nrep+0.0)+(tv3.tv_usec-tv2.tv_usec)/(nrep*1e6);

	printf("\nstatistics from last run\n\n");
	for(jj=0; jj<kk; jj++)
		printf("k = %d\tsigma = %f\talpha = %f\tmu = %f\t\tmu = %e\talpha = %f\tmu = %f\tmu = %e\n", jj, stat[5*jj], stat[5*jj+1], stat[5*jj+2], stat[5*jj+2], stat[5*jj+3], stat[5*jj+4], stat[5*jj+4]);
	printf("\n");

	printf("\n");
	printf(" Average number of iterations over %d runs: %5.1f\n", nrep, kk_avg / (double) nrep);
	printf(" Average solution time over %d runs: %5.2e seconds (IPM)\n", nrep, time_ipm);
	printf(" Average solution time over %d runs: %5.2e seconds (resolve final kkt)\n", nrep, time_final);
	printf("\n\n");

/************************************************
* compute residuals
************************************************/

/************************************************
* free memory
************************************************/

	// problem data
	free(A);
	free(B);
	d_free_align(b);
	d_free_align(x0);
	free(C);
	free(D);
	free(Q);
	free(S);
	free(R);
	free(q);
	free(r);

	// low level interface
	d_free_align(pA);
	d_free_align(b0);
	d_free_align(pBAbt0);
	if(N>1) // pBAbt1 is only allocated when N>1
		d_free_align(pBAbt1);
	d_free_align(d0);
	d_free_align(d1);
	d_free_align(dN);
	d_free_align(pDCt0);
	free(DC1);
	d_free_align(pDCt1);
	free(DCN);
	d_free_align(pDCtN);
	free(idx0);
	free(idx1);
	free(idxN);
	d_free_align(pRSQ0);
	d_free_align(pRSQ1);
	d_free_align(pRSQN);
	d_free_align(rq0);
	d_free_align(rq1);
	d_free_align(rqN);
	d_free_align(work);
	free(stat);
	for(ii=0; ii<N; ii++)
		{
		d_free_align(hux[ii]);
		d_free_align(hpi[ii]);
		d_free_align(hlam[ii]);
		d_free_align(ht[ii]);
		d_free_align(hrb[ii]);
		d_free_align(hrrq[ii]);
		d_free_align(hrd[ii]);
		}
	d_free_align(hux[N]);
	d_free_align(hlam[N]);
	d_free_align(ht[N]);
	d_free_align(hrrq[N]);
	d_free_align(hrd[N]);

#if 0
	// high level interface
	free(rA);
	free(rB);
	free(rC);
	free(rD);
	free(rb);
	free(rQ);
	free(rQf);
	free(rS);
	free(rR);
	free(rq);
	free(rqf);
	free(rr);
	free(lb);
	free(rlb);
	free(lg);
	free(rlg);
	free(lgN);
	free(ub);
	free(rub);
	free(ug);
	free(rug);
	free(ugN);
	free(rx);
	free(ru);
	free(rpi);
	free(rlam);
	free(rt);
	free(rwork);
#endif

	// new high level interface
	free(lb0);
	free(ub0);
	free(lb1);
	free(ub1);
	free(lbN);
	free(ubN);
	free(lg0);
	free(ug0);
	free(lg1);
	free(ug1);
	free(lgN);
	free(ugN);
	free(work1);
	for(ii=0; ii<N; ii++)
		{
		free(hx[ii]);
		free(hu[ii]);
		free(hpi1[ii]);
		free(hlam1[ii]);
		free(ht1[ii]);
		}
	free(hx[N]);
	free(hlam1[N]);
	free(ht1[N]);

	return 0;

	}