C++ (Cpp) dsymv_lib 예제들

프로그래밍 언어: C++ (Cpp)

메소드/함수: dsymv_lib

hotexamples.com에서의 예제들: 8

C++ (Cpp) dsymv_lib - 8개의 예제가 발견되었습니다. 이것들은 오픈소스 프로젝트에서 추출된 C++ (Cpp)의 dsymv_lib에 대한 실세계 최고 등급의 예제들입니다. 예제들을 평가하여 예제의 품질 향상에 도움을 줄 수 있습니다.

예제 #1

파일 보기

파일: d_res.c 프로젝트: wuyou33/hpmpc

/*
 * d_res_mpc - fill the residual vectors hrq / hrb of an N-stage MPC problem
 * in which every stage has nx states and nu inputs.
 *
 * Inputs (one pointer per stage):
 *   hpBAbt[ii]  packed matrix read by the dgemv_* kernels with leading
 *               dimension cnx; row nx+nu of it is subtracted when hrb[ii]
 *               is initialised.
 *   hpQ[ii]     packed matrix read by dsymv_lib / dgemv_t_lib with leading
 *               dimension cnz.
 *   hq[ii]      vector subtracted when hrq[ii] is initialised.
 *   hux[ii]     stage vector; entries [0,nu) and [nu,nu+nx) are read
 *               separately below.
 *   hpi[ii]     vector added into the [nu,nu+nx) part of hrq[ii] (ii>=1) and
 *               passed to the kernels as hpi[ii+1].
 * Outputs:
 *   hrq[ii]     ii=0: entries [0,nu); 0<ii<N: entries [0,nu+nx);
 *               ii=N: see the note on the last block.
 *   hrb[ii]     ii=0..N-1: entries [0,nx).
 *
 * NOTE(review): the integer argument -1 passed to the *_lib kernels is
 * presumably "accumulate with a minus sign" (y <- y - A*x); confirm against
 * the kernel library, it is not visible from this file.
 */
void d_res_mpc(int nx, int nu, int N, double **hpBAbt, double **hpQ, double **hq, double **hux, double **hpi, double **hrq, double **hrb)
	{

	const int bs = D_MR; //d_get_mr();
	const int ncl = D_NCL;

	// leading dimensions: column counts rounded up to a multiple of ncl
	const int cnz = ncl*((nx+nu+1+ncl-1)/ncl);
	const int cnx = ncl*((nx+ncl-1)/ncl);

	// scratch for the (at most bs-1) entries of hux[0] that are temporarily
	// zeroed below; automatic storage keeps the routine reentrant, and every
	// slot is written before it is read
	double temp[D_MR] = {0};

	int ii, jj;
	
	int nxu = nx+nu;

	// first block: only the first nu entries of hrq[0] are produced
	for(jj=0; jj<nu; jj++) hrq[0][jj] = - hq[0][jj];
	// save and clear the entries of hux[0] in [(nu/bs)*bs, nu) around the
	// dgemv_t call, then put them back
	for(jj=0; jj<nu%bs; jj++) { temp[jj] = hux[0][(nu/bs)*bs+jj]; hux[0][(nu/bs)*bs+jj] = 0.0; }
	dgemv_t_lib(nx, nu, hpQ[0]+(nu/bs)*bs*cnz+nu%bs, cnz, hux[0]+nu, -1, hrq[0], hrq[0]);
	for(jj=0; jj<nu%bs; jj++) hux[0][(nu/bs)*bs+jj] = temp[jj];
	dsymv_lib(nu, nu, hpQ[0], cnz, hux[0], -1, hrq[0], hrq[0]);
	dgemv_n_lib(nu, nx, hpBAbt[0], cnx, hpi[1], -1, hrq[0], hrq[0]);
	// hrb[0] starts from hux[1][nu+jj] minus element (row nxu, column jj) of
	// hpBAbt[0]; the index arithmetic addresses row nxu inside panels of
	// height bs with panel stride bs*cnx
	for(jj=0; jj<nx; jj++) hrb[0][jj] = hux[1][nu+jj] - hpBAbt[0][(nxu/bs)*bs*cnx+nxu%bs+bs*jj];
	dgemv_t_lib(nxu, nx, hpBAbt[0], cnx, hux[0], -1, hrb[0], hrb[0]);

	// middle blocks: full hrq[ii] (nu+nx entries) and hrb[ii] (nx entries)
	for(ii=1; ii<N; ii++)
		{
		for(jj=0; jj<nu; jj++) hrq[ii][jj] = - hq[ii][jj];
		for(jj=0; jj<nx; jj++) hrq[ii][nu+jj] = hpi[ii][jj] - hq[ii][nu+jj];
		dsymv_lib(nxu, nxu, hpQ[ii], cnz, hux[ii], -1, hrq[ii], hrq[ii]);
		for(jj=0; jj<nx; jj++) hrb[ii][jj] = hux[ii+1][nu+jj] - hpBAbt[ii][(nxu/bs)*bs*cnx+nxu%bs+bs*jj];
		// one call updates both hrq[ii] and hrb[ii]
		dgemv_nt_lib(nxu, nx, hpBAbt[ii], cnx, hpi[ii+1], hux[ii], -1, hrq[ii], hrb[ii], hrq[ii], hrb[ii]);
		}

	// last block: only the [nu,nu+nx) part of hrq[N] is initialised here.
	// NOTE(review): the dsymv_lib call starts at offset (nu/bs)*bs, so when
	// nu%bs != 0 it also reads/updates hrq[N] entries in [(nu/bs)*bs, nu)
	// that this function never initialises - confirm callers ignore them.
	for(jj=0; jj<nx; jj++) hrq[N][nu+jj] = hpi[N][jj] - hq[N][nu+jj];
	dsymv_lib(nx+nu%bs, nx+nu%bs, hpQ[N]+(nu/bs)*bs*cnz+(nu/bs)*bs*bs, cnz, hux[N]+(nu/bs)*bs, -1, hrq[N]+(nu/bs)*bs, hrq[N]+(nu/bs)*bs);
	
	}

예제 #2

파일 보기

파일: d_res_ip_soft.c 프로젝트: mkotlyar/hpmpc

/*
 * d_res_mpc_soft_tv - residuals for an N-stage problem with stage-varying
 * sizes, hard bounds (nb), general constraints (ng) and soft bounds (ns).
 *
 * Per stage ii the multiplier / slack vectors hlam[ii], ht[ii] (and hd[ii],
 * hrd[ii]) are laid out as consecutive segments, as fixed by the indexing
 * below:
 *   [0,pnb) [pnb,2*pnb)                 two sides of the nb hard bounds
 *   2*pnb + [0,png) [png,2*png)         two sides of the ng general constraints
 *   2*pnb+2*png + k*pns, k=0..3         four segments of length pns for the
 *                                       ns soft bounds
 * where pnb/png/pns are nb/ng/ns rounded up to a multiple of D_MR.
 *
 * Outputs:
 *   hrq[ii]  nu+nx entries for ii<N; for ii=N see the last block.
 *   hrb[ii]  nx[ii+1] entries, ii<N.
 *   hrd[ii]  constraint residuals, same segment layout as hd[ii]
 *            (first two soft segments only).
 *   hrz[ii]  two segments of length pns for the soft bounds.
 *   mu[0]    sum of hlam*ht over all constraint entries, divided by
 *            2*(sum of nb+ng+ns) when that sum is non-zero.
 *
 * NOTE(review): the integer arguments (0, 1, -1) passed to the *_lib kernels
 * presumably select "overwrite / add / subtract"; confirm against the kernel
 * library, it is not visible from this file.
 */
void d_res_mpc_soft_tv(int N, int *nx, int *nu, int *nb, int **idxb, int *ng, int *ns, double **hpBAbt, double **hpQ, double **hq, double **hZ, double **hz, double **hux, double **hpDCt, double **hd, double **hpi, double **hlam, double **ht, double **hrq, double **hrb, double **hrd, double **hrz, double *mu)
	{

	const int bs = D_MR;
	const int ncl = D_NCL;

	int ii, jj;
	
	int nu0, nu1, cnz0, nx0, nx1, cnx1, nb0, pnb, ng0, png, cng, ns0, pns, nb_tot;


	// initialize mu
	nb_tot = 0;
	mu[0] = 0;



	nu1 = nu[0];
	nx1 = nx[0];
	cnx1  = (nx1+ncl-1)/ncl*ncl;
	// first blocks
	for(ii=0; ii<N; ii++)
		{
		// sizes of the current (…0) and next (…1) stage
		nu0 = nu1;
		nu1 = nu[ii+1];
		nx0 = nx1;
		nx1 = nx[ii+1];
		cnx1  = (nx1+ncl-1)/ncl*ncl;
		cnz0  = (nu0+nx0+1+ncl-1)/ncl*ncl;
		nb0 = nb[ii];
		pnb = (nb0+bs-1)/bs*bs;
		ng0 = ng[ii];
		png = (ng0+bs-1)/bs*bs;
		cng = (ng0+ncl-1)/ncl*ncl;
		ns0 = ns[ii];
		pns = (ns0+bs-1)/bs*bs;
		nb_tot += nb0 + ng0 + ns0;

		// accumulate hlam*ht over every constraint segment
		for(jj=0; jj<nb0; jj++)
			mu[0] += hlam[ii][jj] * ht[ii][jj] + hlam[ii][pnb+jj] * ht[ii][pnb+jj];
		for(jj=0; jj<ng0; jj++) 
			mu[0] += hlam[ii][2*pnb+jj] * ht[ii][2*pnb+jj] + hlam[ii][2*pnb+png+jj] * ht[ii][2*pnb+png+jj];
		for(jj=0; jj<ns0; jj++) 
			mu[0] += hlam[ii][2*pnb+2*png+0*pns+jj] * ht[ii][2*pnb+2*png+0*pns+jj] + hlam[ii][2*pnb+2*png+1*pns+jj] * ht[ii][2*pnb+2*png+1*pns+jj] + hlam[ii][2*pnb+2*png+2*pns+jj] * ht[ii][2*pnb+2*png+2*pns+jj] + hlam[ii][2*pnb+2*png+3*pns+jj] * ht[ii][2*pnb+2*png+3*pns+jj];

		// constraint residuals: hard bounds
		for(jj=0; jj<nb0; jj++)
			{
			hrd[ii][jj]     =   hux[ii][idxb[ii][jj]] - hd[ii][jj]     - ht[ii][jj];
			hrd[ii][pnb+jj] = - hux[ii][idxb[ii][jj]] - hd[ii][pnb+jj] - ht[ii][pnb+jj];
			}
		// constraint residuals: general constraints (kernel result is reused
		// with flipped sign for the second side)
		if(ng0>0)
			{
			dgemv_t_lib(nu0+nx0, ng0, hpDCt[ii], cng, hux[ii], 0, hrd[ii]+2*pnb, hrd[ii]+2*pnb);
			for(jj=0; jj<ng0; jj++)
				{
				hrd[ii][2*pnb+png+jj] = - hrd[ii][2*pnb+jj];
				hrd[ii][2*pnb+jj] += - hd[ii][2*pnb+jj] - ht[ii][2*pnb+jj];
				hrd[ii][2*pnb+png+jj] += - hd[ii][2*pnb+png+jj] - ht[ii][2*pnb+png+jj];
				}
			}
		// constraint residuals: soft bounds; the bounded variable is selected
		// through idxb[ii][nu0+jj]
		for(jj=0; jj<ns0; jj++)
			{
			hrd[ii][2*pnb+2*png+0*pns+jj] = ht[ii][2*pnb+2*png+2*pns+jj] + hux[ii][idxb[ii][nu0+jj]] - hd[ii][2*pnb+2*png+0*pns+jj] - ht[ii][2*pnb+2*png+0*pns+jj];
			hrd[ii][2*pnb+2*png+1*pns+jj] = ht[ii][2*pnb+2*png+3*pns+jj] - hux[ii][idxb[ii][nu0+jj]] - hd[ii][2*pnb+2*png+1*pns+jj] - ht[ii][2*pnb+2*png+1*pns+jj];
			}

		// gradient residual hrq[ii]
		for(jj=0; jj<nu0; jj++) 
			hrq[ii][jj] = - hq[ii][jj];
		for(jj=0; jj<nx0; jj++) 
			hrq[ii][nu0+jj] = - hq[ii][nu0+jj] + hpi[ii][jj];
		dsymv_lib(nu0+nx0, nu0+nx0, hpQ[ii], cnz0, hux[ii], -1, hrq[ii], hrq[ii]);
		for(jj=0; jj<nb0; jj++) 
			hrq[ii][idxb[ii][jj]] += hlam[ii][jj] - hlam[ii][pnb+jj];
		if(ng0>0)
			{
			// TODO work space + one dgemv call
			dgemv_n_lib(nu0+nx0, ng0, hpDCt[ii], cng, hlam[ii]+2*pnb, 1, hrq[ii], hrq[ii]);
			dgemv_n_lib(nu0+nx0, ng0, hpDCt[ii], cng, hlam[ii]+2*pnb+png, -1, hrq[ii], hrq[ii]);
			}
		for(jj=0; jj<ns0; jj++) 
			hrq[ii][idxb[ii][nu0+jj]] += hlam[ii][2*pnb+2*png+0*pns+jj] - hlam[ii][2*pnb+2*png+1*pns+jj];
		// dynamics residual hrb[ii]; the subtracted hpBAbt entry is
		// (row nu0+nx0, column jj) in panels of height bs
		for(jj=0; jj<nx1; jj++) 
			hrb[ii][jj] = hux[ii+1][nu1+jj] - hpBAbt[ii][(nu0+nx0)/bs*bs*cnx1+(nu0+nx0)%bs+bs*jj];
		dgemv_nt_lib(nu0+nx0, nx1, hpBAbt[ii], cnx1, hpi[ii+1], hux[ii], -1, -1, hrq[ii], hrb[ii], hrq[ii], hrb[ii]);

		// residual hrz[ii] of the soft-bound segments
		for(jj=0; jj<ns0; jj++) 
			{ 
			hrz[ii][0*pns+jj] = hz[ii][0*pns+jj] + hZ[ii][0*pns+jj]*ht[ii][2*pnb+2*png+2*pns+jj] - hlam[ii][2*pnb+2*png+0*pns+jj] - hlam[ii][2*pnb+2*png+2*pns+jj]; 
			hrz[ii][1*pns+jj] = hz[ii][1*pns+jj] + hZ[ii][1*pns+jj]*ht[ii][2*pnb+2*png+3*pns+jj] - hlam[ii][2*pnb+2*png+1*pns+jj] - hlam[ii][2*pnb+2*png+3*pns+jj]; 
			}

		}
	

	// last block
	ii = N;
	nu0 = nu1;
	nx0 = nx1;
	cnz0  = (nu0+nx0+1+ncl-1)/ncl*ncl;
	nb0 = nb[ii];
	pnb = (nb0+bs-1)/bs*bs;
	ng0 = ng[ii];
	png = (ng0+bs-1)/bs*bs;
	cng = (ng0+ncl-1)/ncl*ncl;
	ns0 = ns[ii];
	pns = (ns0+bs-1)/bs*bs;
	nb_tot += nb0 + ng0 + ns0;

	for(jj=0; jj<nb0; jj++)
		mu[0] += hlam[ii][jj] * ht[ii][jj] + hlam[ii][pnb+jj] * ht[ii][pnb+jj];
	for(jj=0; jj<ng0; jj++) 
		mu[0] += hlam[ii][2*pnb+jj] * ht[ii][2*pnb+jj] + hlam[ii][2*pnb+png+jj] * ht[ii][2*pnb+png+jj];
	for(jj=0; jj<ns0; jj++) 
		mu[0] += hlam[ii][2*pnb+2*png+0*pns+jj] * ht[ii][2*pnb+2*png+0*pns+jj] + hlam[ii][2*pnb+2*png+1*pns+jj] * ht[ii][2*pnb+2*png+1*pns+jj] + hlam[ii][2*pnb+2*png+2*pns+jj] * ht[ii][2*pnb+2*png+2*pns+jj] + hlam[ii][2*pnb+2*png+3*pns+jj] * ht[ii][2*pnb+2*png+3*pns+jj];

	for(jj=0; jj<nb0; jj++)
		{
		hrd[ii][jj]     =   hux[ii][idxb[ii][jj]] - hd[ii][jj]     - ht[ii][jj];
		hrd[ii][pnb+jj] = - hux[ii][idxb[ii][jj]] - hd[ii][pnb+jj] - ht[ii][pnb+jj];
		}
	if(ng0>0)
		{
		dgemv_t_lib(nu0+nx0, ng0, hpDCt[ii], cng, hux[ii], 0, hrd[ii]+2*pnb, hrd[ii]+2*pnb);
		for(jj=0; jj<ng0; jj++)
			{
			hrd[ii][2*pnb+png+jj] = - hrd[ii][2*pnb+jj];
			hrd[ii][2*pnb+jj] += - hd[ii][2*pnb+jj] - ht[ii][2*pnb+jj];
			hrd[ii][2*pnb+png+jj] += - hd[ii][2*pnb+png+jj] - ht[ii][2*pnb+png+jj];
			}
		}
	for(jj=0; jj<ns0; jj++)
		{
		hrd[ii][2*pnb+2*png+0*pns+jj] = ht[ii][2*pnb+2*png+2*pns+jj] + hux[ii][idxb[ii][nu0+jj]] - hd[ii][2*pnb+2*png+0*pns+jj] - ht[ii][2*pnb+2*png+0*pns+jj];
		hrd[ii][2*pnb+2*png+1*pns+jj] = ht[ii][2*pnb+2*png+3*pns+jj] - hux[ii][idxb[ii][nu0+jj]] - hd[ii][2*pnb+2*png+1*pns+jj] - ht[ii][2*pnb+2*png+1*pns+jj];
		}


	// gradient residual of the last stage: only the [nu0,nu0+nx0) part is
	// initialised here.
	// NOTE(review): entries [0,nu0) of hrq[N] are accumulated into below
	// without being initialised by this function - confirm nu[N]==0 or that
	// callers pre-set them.
	for(jj=0; jj<nx0; jj++) 
		hrq[ii][nu0+jj] = hpi[ii][jj] - hq[ii][nu0+jj];
	for(jj=0; jj<nb0; jj++) 
		hrq[ii][idxb[ii][jj]] += hlam[ii][jj] - hlam[ii][pnb+jj];
	dsymv_lib(nx0+nu0%bs, nx0+nu0%bs, hpQ[ii]+nu0/bs*bs*cnz0+nu0/bs*bs*bs, cnz0, hux[ii]+nu0/bs*bs, -1, hrq[ii]+nu0/bs*bs, hrq[ii]+nu0/bs*bs);
	if(ng0>0)
		{
		// TODO work space + one dgemv call
		dgemv_n_lib(nu0+nx0, ng0, hpDCt[ii], cng, hlam[ii]+2*pnb, 1, hrq[ii], hrq[ii]);
		dgemv_n_lib(nu0+nx0, ng0, hpDCt[ii], cng, hlam[ii]+2*pnb+png, -1, hrq[ii], hrq[ii]);
		}
	// soft-bound contribution: uses the 0*pns / 1*pns multiplier segments,
	// i.e. the ones paired with the hrd soft-bound rows that contain hux,
	// exactly as in the stage loop. (The 2*pns / 3*pns segments only appear
	// in hrz, never together with hux.)
	for(jj=0; jj<ns0; jj++) 
		hrq[ii][idxb[ii][nu0+jj]] += hlam[ii][2*pnb+2*png+0*pns+jj] - hlam[ii][2*pnb+2*png+1*pns+jj];
	
	for(jj=0; jj<ns0; jj++) 
		{ 
		hrz[ii][0*pns+jj] = hz[ii][0*pns+jj] + hZ[ii][0*pns+jj]*ht[ii][2*pnb+2*png+2*pns+jj] - hlam[ii][2*pnb+2*png+0*pns+jj] - hlam[ii][2*pnb+2*png+2*pns+jj]; 
		hrz[ii][1*pns+jj] = hz[ii][1*pns+jj] + hZ[ii][1*pns+jj]*ht[ii][2*pnb+2*png+3*pns+jj] - hlam[ii][2*pnb+2*png+1*pns+jj] - hlam[ii][2*pnb+2*png+3*pns+jj]; 
		}



	// normalize mu (skipped when there are no constraints at all)
	if(nb_tot!=0)
		mu[0] /= 2.0*nb_tot;

	}

예제 #3

파일 보기

파일: d_res.c 프로젝트: wuyou33/hpmpc

/*
 * d_res_mpc_tv - fill the residual vectors hrq / hrb of an N-stage MPC
 * problem whose state/input sizes nx[ii], nu[ii] may change from stage to
 * stage.
 *
 * Inputs (one pointer per stage): hpBAbt, hpQ (packed matrices handed to the
 * *_lib kernels), hq, hux, hpi (vectors).
 * Outputs:
 *   hrq[0]   entries [0,nu[0]);
 *   hrq[ii]  entries [0,nu[ii]+nx[ii]) for 0<ii<N;
 *   hrq[N]   see the note on the last block;
 *   hrb[ii]  entries [0,nx[ii+1]) for ii<N.
 *
 * NOTE(review): the integer argument -1 passed to the *_lib kernels is
 * presumably "accumulate with a minus sign" (y <- y - A*x); confirm against
 * the kernel library, it is not visible from this file.
 */
void d_res_mpc_tv(int N, int *nx, int *nu, double **hpBAbt, double **hpQ, double **hq, double **hux, double **hpi, double **hrq, double **hrb)
	{

	const int bs = D_MR;
	const int ncl = D_NCL;

	// scratch for the (at most bs-1) entries of hux[0] that are temporarily
	// zeroed below; automatic storage keeps the routine reentrant, and every
	// slot is written before it is read
	double temp[D_MR] = {0};

	int ii, jj;
	
	int nu0, nu1, cnz0, nx0, nx1, cnx1;


	// first block
	ii = 0;
	nu0 = nu[ii];
	nu1 = nu[ii+1];
	nx0 = nx[ii];
	nx1 = nx[ii+1];
	// leading dimensions: column counts rounded up to a multiple of ncl
	cnx1  = (nx1+ncl-1)/ncl*ncl;
	cnz0  = (nu0+nx0+1+ncl-1)/ncl*ncl;
	
	for(jj=0; jj<nu0; jj++) 
		hrq[ii][jj] = - hq[ii][jj];
	if(nx0>0)
		{
		// The dgemv_t call below starts at the row nu0/bs*bs, i.e. it also
		// covers the nu0%bs entries of hux[0] in [nu0/bs*bs, nu0). They are
		// zeroed for the duration of the call so that they contribute
		// nothing, and restored afterwards.
		for(jj=0; jj<nu0%bs; jj++) 
			{ 
			temp[jj] = hux[ii][nu0/bs*bs+jj]; 
			hux[ii][nu0/bs*bs+jj] = 0.0; 
			}
		dgemv_t_lib(nx0+nu0%bs, nu0, hpQ[ii]+nu0/bs*bs*cnz0, cnz0, hux[ii]+nu0/bs*bs, -1, hrq[ii], hrq[ii]);
		for(jj=0; jj<nu0%bs; jj++) 
			hux[ii][nu0/bs*bs+jj] = temp[jj];
		}
	dsymv_lib(nu0, nu0, hpQ[ii], cnz0, hux[ii], -1, hrq[ii], hrq[ii]);
	dgemv_n_lib(nu0, nx1, hpBAbt[ii], cnx1, hpi[ii+1], -1, hrq[ii], hrq[ii]);
	
	// hrb[0] starts from hux[1][nu1+jj] minus element (row nu0+nx0, column jj)
	// of hpBAbt[0], addressed inside panels of height bs
	for(jj=0; jj<nx1; jj++) 
		hrb[ii][jj] = hux[ii+1][nu1+jj] - hpBAbt[ii][(nu0+nx0)/bs*bs*cnx1+(nu0+nx0)%bs+bs*jj];
	dgemv_t_lib(nu0+nx0, nx1, hpBAbt[ii], cnx1, hux[ii], -1, hrb[ii], hrb[ii]);



	// middle blocks
	for(ii=1; ii<N; ii++)
		{
		// shift "next" sizes into "current" and read the new "next" ones
		nu0 = nu1;
		nu1 = nu[ii+1];
		nx0 = nx1;
		nx1 = nx[ii+1];
		cnx1  = (nx1+ncl-1)/ncl*ncl;
		cnz0  = (nu0+nx0+1+ncl-1)/ncl*ncl;

		for(jj=0; jj<nu0; jj++) 
			hrq[ii][jj] = - hq[ii][jj];
		for(jj=0; jj<nx0; jj++) 
			hrq[ii][nu0+jj] = - hq[ii][nu0+jj] + hpi[ii][jj];
		dsymv_lib(nu0+nx0, nu0+nx0, hpQ[ii], cnz0, hux[ii], -1, hrq[ii], hrq[ii]);

		for(jj=0; jj<nx1; jj++) 
			hrb[ii][jj] = hux[ii+1][nu1+jj] - hpBAbt[ii][(nu0+nx0)/bs*bs*cnx1+(nu0+nx0)%bs+bs*jj];
		// one call updates both hrq[ii] and hrb[ii]
		dgemv_nt_lib(nu0+nx0, nx1, hpBAbt[ii], cnx1, hpi[ii+1], hux[ii], -1, hrq[ii], hrb[ii], hrq[ii], hrb[ii]);

		}
	


	// last block: only the [nu0,nu0+nx0) part of hrq[N] is initialised here.
	// NOTE(review): the dsymv_lib call starts at offset nu0/bs*bs, so when
	// nu0%bs != 0 it also reads/updates hrq[N] entries in [nu0/bs*bs, nu0)
	// that this function never initialises - confirm callers ignore them.
	ii = N;
	nu0 = nu1;
	nx0 = nx1;
	cnz0  = (nu0+nx0+1+ncl-1)/ncl*ncl;

	for(jj=0; jj<nx0; jj++) 
		hrq[ii][nu0+jj] = hpi[ii][jj] - hq[ii][nu0+jj];
	dsymv_lib(nx0+nu0%bs, nx0+nu0%bs, hpQ[ii]+nu0/bs*bs*cnz0+nu0/bs*bs*bs, cnz0, hux[ii]+nu0/bs*bs, -1, hrq[ii]+nu0/bs*bs, hrq[ii]+nu0/bs*bs);

	}

예제 #4

파일 보기

파일: d_res.c 프로젝트: wuyou33/hpmpc

/*
 * d_res_mhe_if - fill the residual vectors hrq / hrr / hrf for an N-stage
 * estimation problem with nx states, nw disturbances and ndN extra rows at
 * the last stage.
 *
 * Outputs:
 *   hrq[ii]  nx entries, ii = 0..N
 *   hrr[ii]  nw entries, ii = 0..N-1
 *   hrf[ii]  nx entries, ii = 0..N-1; hrf[N] gets ndN entries when ndN>0
 *
 * work must provide at least 4*anx + 2*(anw+anx) doubles, where anx/anw are
 * nx/nw rounded up to a multiple of D_MR*D_NCL; it is split into four scratch
 * vectors below.  The last stage stores nx+ndN entries in a 2*anx slot
 * (the original code notes the assumption nx >= ndN).
 *
 * NOTE(review): the integer arguments (0, 1) passed to the *_lib kernels
 * presumably select "overwrite" / "accumulate"; confirm against the kernel
 * library, it is not visible from this file.
 */
void d_res_mhe_if(int nx, int nw, int ndN, int N, double **hpQA, double **hpRG, double *L0_inv, double **hq, double **hr, double **hf, double *p0, double **hx, double **hw, double **hlam, double **hrq, double **hrr, double **hrf, double *work)
{
	const int bs = D_MR;
	const int ncl = D_NCL;
	const int nal = bs*ncl;

	/* vector lengths rounded up to nal, matrix leading dimensions to ncl */
	const int anx = nal*((nx+nal-1)/nal);
	const int anw = nal*((nw+nal-1)/nal);
	const int cnx = ncl*((nx+ncl-1)/ncl);
	const int cnw = ncl*((nw+ncl-1)/ncl);

	/* four scratch vectors laid out back to back in the caller's workspace */
	double *xl_in = work;                  /* [x ; lam] input,  2*anx   */
	double *xl_out = xl_in + 2*anx;        /* matching output,  2*anx   */
	double *wl_in = xl_out + 2*anx;        /* [w ; lam] input,  anw+anx */
	double *wl_out = wl_in + (anw+anx);    /* matching output,  anw+anx */

	int stage, k;

	/* ---- stage 0 ---- */
	for (k = 0; k < nx; k++)
		hrq[0][k] = hq[0][k] - p0[k];
	for (k = 0; k < nw; k++)
		hrr[0][k] = hr[0][k];
	for (k = 0; k < nx; k++)
		hrf[0][k] = hf[0][k] - hx[1][k];

	/* two triangular products with L0_inv: hx[0] -> xl_in, then xl_in is
	   accumulated into hrq[0] */
	dtrmv_u_t_lib(nx, L0_inv, cnx, hx[0], 0, xl_in);
	dtrmv_u_n_lib(nx, L0_inv, cnx, xl_in, 1, hrq[0]);

	/* stack [hx[0] ; hlam[0]], multiply by hpQA[0], split the result */
	for (k = 0; k < nx; k++)
		xl_in[k] = hx[0][k];
	for (k = 0; k < nx; k++)
		xl_in[nx+k] = hlam[0][k];
	dsymv_lib(2*nx, nx, hpQA[0], cnx, xl_in, 0, xl_out, xl_out);
	for (k = 0; k < nx; k++)
		hrq[0][k] += xl_out[k];
	for (k = 0; k < nx; k++)
		hrf[0][k] += xl_out[nx+k];

	/* stack [hw[0] ; hlam[0]], multiply by hpRG[0], split the result */
	for (k = 0; k < nw; k++)
		wl_in[k] = hw[0][k];
	for (k = 0; k < nx; k++)
		wl_in[nw+k] = hlam[0][k];
	dsymv_lib(nw+nx, nw, hpRG[0], cnw, wl_in, 0, wl_out, wl_out);
	for (k = 0; k < nw; k++)
		hrr[0][k] += wl_out[k];
	for (k = 0; k < nx; k++)
		hrf[0][k] += wl_out[nw+k];

	/* ---- stages 1 .. N-1 ---- */
	for (stage = 1; stage < N; stage++) {
		for (k = 0; k < nx; k++)
			hrq[stage][k] = hq[stage][k] - hlam[stage-1][k];
		for (k = 0; k < nw; k++)
			hrr[stage][k] = hr[stage][k];
		for (k = 0; k < nx; k++)
			hrf[stage][k] = hf[stage][k] - hx[stage+1][k];

		for (k = 0; k < nx; k++)
			xl_in[k] = hx[stage][k];
		for (k = 0; k < nx; k++)
			xl_in[nx+k] = hlam[stage][k];
		dsymv_lib(2*nx, nx, hpQA[stage], cnx, xl_in, 0, xl_out, xl_out);
		for (k = 0; k < nx; k++)
			hrq[stage][k] += xl_out[k];
		for (k = 0; k < nx; k++)
			hrf[stage][k] += xl_out[nx+k];

		for (k = 0; k < nw; k++)
			wl_in[k] = hw[stage][k];
		for (k = 0; k < nx; k++)
			wl_in[nw+k] = hlam[stage][k];
		dsymv_lib(nw+nx, nw, hpRG[stage], cnw, wl_in, 0, wl_out, wl_out);
		for (k = 0; k < nw; k++)
			hrr[stage][k] += wl_out[k];
		for (k = 0; k < nx; k++)
			hrf[stage][k] += wl_out[nw+k];
	}

	/* ---- stage N ---- */
	for (k = 0; k < nx; k++)
		hrq[N][k] = hq[N][k] - hlam[N-1][k];

	if (ndN > 0) {
		/* extra rows present: work on stacked vectors of length nx+ndN,
		   seeding the output with hrq[N] and -hf[N] before accumulating */
		for (k = 0; k < nx; k++)
			xl_in[k] = hx[N][k];
		for (k = 0; k < ndN; k++)
			xl_in[nx+k] = hlam[N][k];
		for (k = 0; k < nx; k++)
			xl_out[k] = hrq[N][k];
		for (k = 0; k < ndN; k++)
			xl_out[nx+k] = - hf[N][k];
		dsymv_lib(nx+ndN, nx, hpQA[N], cnx, xl_in, 1, xl_out, xl_out);
		for (k = 0; k < nx; k++)
			hrq[N][k] = xl_out[k];
		for (k = 0; k < ndN; k++)
			hrf[N][k] = xl_out[nx+k];
	} else {
		/* no extra rows: accumulate directly into hrq[N] */
		dsymv_lib(nx, nx, hpQA[N], cnx, hx[N], 1, hrq[N], hrq[N]);
	}
}

예제 #5

파일 보기

파일: d_res.c 프로젝트: wuyou33/hpmpc

/*
 * d_res_diag_mpc - fill the residual vectors hres_rq / hres_b of an N-stage
 * problem with stage-varying sizes, where hdA[ii] is applied element-wise
 * (as a vector) instead of through a matrix kernel.
 *
 * Outputs:
 *   hres_rq[0]   entries [0,nu[0]);
 *   hres_rq[ii]  entries [0,nu[ii]+nx[ii]) for 0<ii<N;
 *   hres_rq[N]   entries [nu[N],nu[N]+nx[N]);
 *   hres_b[ii]   entries [0,nx[ii+1]) for ii<N.
 *
 * work is used as a scratch copy of the [nu,nu+nx) part of hux[ii]; it must
 * hold at least max over stages of nx[ii] doubles.
 *
 * NOTE(review): the integer argument -1 passed to the *_lib kernels is
 * presumably "accumulate with a minus sign" (y <- y - A*x); confirm against
 * the kernel library, it is not visible from this file.
 */
void d_res_diag_mpc(int N, int *nx, int *nu, double **hdA, double **hpBt, double **hpR, double **hpSt, double **hpQ, double **hb, double **hrq, double **hux, double **hpi, double **hres_rq, double **hres_b, double *work)
{
	const int ncl = D_NCL;

	int stage, k;
	int nu_cur, nu_next;        /* input counts of this / the next stage */
	int nx_cur, nx_next;        /* state counts of this / the next stage */
	int nx_min;                 /* min(nx_cur, nx_next): length hdA is applied over */
	int cnu_cur, cnx_cur, cnx_next; /* sizes rounded up to a multiple of ncl */

	/* ---- stage 0: input part of hres_rq and hres_b only ---- */
	stage = 0;
	nu_cur = nu[stage];
	nu_next = nu[stage+1];
	nx_cur = nx[stage];
	nx_next = nx[stage+1];
	cnu_cur = ncl*((nu_cur+ncl-1)/ncl);
	cnx_next = ncl*((nx_next+ncl-1)/ncl);
	nx_min = (nx_cur<nx_next) ? nx_cur : nx_next;

	for (k = 0; k < nu_cur; k++)
		hres_rq[stage][k] = - hrq[stage][k];
	/* copy the [nu,nu+nx) part of hux into work; the kernels read it from there */
	for (k = 0; k < nx_cur; k++)
		work[k] = hux[stage][nu_cur+k];
	dgemv_t_lib(nx_cur, nu_cur, hpSt[stage], cnu_cur, work, -1, hres_rq[stage], hres_rq[stage]);
	dsymv_lib(nu_cur, nu_cur, hpR[stage], cnu_cur, hux[stage], -1, hres_rq[stage], hres_rq[stage]);
	dgemv_n_lib(nu_cur, nx_next, hpBt[stage], cnx_next, hpi[stage+1], -1, hres_rq[stage], hres_rq[stage]);

	for (k = 0; k < nx_next; k++)
		hres_b[stage][k] = hux[stage+1][nu_next+k] - hb[stage][k];
	for (k = 0; k < nx_min; k++)
		hres_b[stage][k] -= hdA[stage][k] * work[k];
	dgemv_t_lib(nu_cur, nx_next, hpBt[stage], cnx_next, hux[stage], -1, hres_b[stage], hres_b[stage]);

	/* ---- stages 1 .. N-1: input part, state part, then hres_b ---- */
	for (stage = 1; stage < N; stage++) {
		double *rq_u;   /* hres_rq[stage], entries [0,nu)     */
		double *rq_x;   /* hres_rq[stage], entries [nu,nu+nx) */
		double *rb;     /* hres_b[stage]                      */

		nu_cur = nu_next;
		nu_next = nu[stage+1];
		nx_cur = nx_next;
		nx_next = nx[stage+1];
		cnu_cur = ncl*((nu_cur+ncl-1)/ncl);
		cnx_cur = cnx_next;
		cnx_next = ncl*((nx_next+ncl-1)/ncl);
		nx_min = (nx_cur<nx_next) ? nx_cur : nx_next;

		rq_u = hres_rq[stage];
		rq_x = hres_rq[stage] + nu_cur;
		rb = hres_b[stage];

		for (k = 0; k < nu_cur; k++)
			rq_u[k] = - hrq[stage][k];
		for (k = 0; k < nx_cur; k++)
			work[k] = hux[stage][nu_cur+k];
		dgemv_t_lib(nx_cur, nu_cur, hpSt[stage], cnu_cur, work, -1, rq_u, rq_u);
		dsymv_lib(nu_cur, nu_cur, hpR[stage], cnu_cur, hux[stage], -1, rq_u, rq_u);
		dgemv_n_lib(nu_cur, nx_next, hpBt[stage], cnx_next, hpi[stage+1], -1, rq_u, rq_u);

		for (k = 0; k < nx_cur; k++)
			rq_x[k] = hpi[stage][k] - hrq[stage][nu_cur+k];
		for (k = 0; k < nx_min; k++)
			rq_x[k] -= hdA[stage][k] * hpi[stage+1][k];
		dgemv_n_lib(nx_cur, nu_cur, hpSt[stage], cnu_cur, hux[stage], -1, rq_x, rq_x);
		dsymv_lib(nx_cur, nx_cur, hpQ[stage], cnx_cur, work, -1, rq_x, rq_x);

		for (k = 0; k < nx_next; k++)
			rb[k] = hux[stage+1][nu_next+k] - hb[stage][k];
		for (k = 0; k < nx_min; k++)
			rb[k] -= hdA[stage][k] * work[k];
		dgemv_t_lib(nu_cur, nx_next, hpBt[stage], cnx_next, hux[stage], -1, rb, rb);
	}

	/* ---- stage N: state part of hres_rq only ---- */
	stage = N;
	nu_cur = nu_next;
	nx_cur = nx_next;
	cnx_cur = cnx_next;

	for (k = 0; k < nx_cur; k++)
		hres_rq[stage][nu_cur+k] = hpi[stage][k] - hrq[stage][nu_cur+k];
	for (k = 0; k < nx_cur; k++)
		work[k] = hux[stage][nu_cur+k];
	dsymv_lib(nx_cur, nx_cur, hpQ[stage], cnx_cur, work, -1, hres_rq[stage]+nu_cur, hres_rq[stage]+nu_cur);
}

예제 #6

파일 보기

파일: test_d_ric_mhe.c 프로젝트: wuyou33/hpmpc

int main()
	{

#if defined(REF_BLAS_OPENBLAS)
	openblas_set_num_threads(1);
#endif
#if defined(REF_BLAS_BLIS)
	omp_set_num_threads(1);
#endif

	printf("\n");
	printf("\n");
	printf("\n");
	printf(" HPMPC -- Library for High-Performance implementation of solvers for MPC.\n");
	printf(" Copyright (C) 2014 by Technical University of Denmark. All rights reserved.\n");
	printf("\n");
	printf(" HPMPC is distributed in the hope that it will be useful,\n");
	printf(" but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
	printf(" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n");
	printf(" See the GNU Lesser General Public License for more details.\n");
	printf("\n");
	printf("\n");
	printf("\n");

	printf("Riccati solver performance test - double precision\n");
	printf("\n");

	// maximum frequency of the processor
	const float GHz_max = GHZ_MAX;
	printf("Frequency used to compute theoretical peak: %5.1f GHz (edit test_param.h to modify this value).\n", GHz_max);
	printf("\n");

	// maximum flops per cycle, double precision
#if defined(TARGET_X64_AVX2)
	const float flops_max = 16;
	printf("Testing solvers for AVX & FMA3 instruction sets, 64 bit: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#elif defined(TARGET_X64_AVX)
	const float flops_max = 8;
	printf("Testing solvers for AVX instruction set, 64 bit: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#elif defined(TARGET_X64_SSE3) || defined(TARGET_AMD_SSE3)
	const float flops_max = 4;
	printf("Testing solvers for SSE3 instruction set, 64 bit: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#elif defined(TARGET_CORTEX_A15)
	const float flops_max = 2;
	printf("Testing solvers for ARMv7a VFPv3 instruction set, oprimized for Cortex A15: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#elif defined(TARGET_CORTEX_A9)
	const float flops_max = 1;
	printf("Testing solvers for ARMv7a VFPv3 instruction set, oprimized for Cortex A9: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#elif defined(TARGET_CORTEX_A7)
	const float flops_max = 0.5;
	printf("Testing solvers for ARMv7a VFPv3 instruction set, oprimized for Cortex A7: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#elif defined(TARGET_X86_ATOM)
	const float flops_max = 1;
	printf("Testing solvers for SSE3 instruction set, 32 bit, optimized for Intel Atom: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#elif defined(TARGET_POWERPC_G2)
	const float flops_max = 1;
	printf("Testing solvers for POWERPC instruction set, 32 bit: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#elif defined(TARGET_C99_4X4)
	const float flops_max = 2;
	printf("Testing reference solvers, 4x4 kernel: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#elif defined(TARGET_C99_4X4_PREFETCH)
	const float flops_max = 2;
	printf("Testing reference solvers, 4x4 kernel with register prefetch: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#elif defined(TARGET_C99_2X2)
	const float flops_max = 2;
	printf("Testing reference solvers, 2x2 kernel: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#endif
	
	FILE *f;
	f = fopen("./test_problems/results/test_blas.m", "w"); // a

#if defined(TARGET_X64_AVX2)
	fprintf(f, "C = 'd_x64_avx2';\n");
	fprintf(f, "\n");
#elif defined(TARGET_X64_AVX)
	fprintf(f, "C = 'd_x64_avx';\n");
	fprintf(f, "\n");
#elif defined(TARGET_X64_SSE3) || defined(TARGET_AMD_SSE3)
	fprintf(f, "C = 'd_x64_sse3';\n");
	fprintf(f, "\n");
#elif defined(TARGET_CORTEX_A9)
	fprintf(f, "C = 'd_ARM_cortex_A9';\n");
	fprintf(f, "\n");
#elif defined(TARGET_CORTEX_A7)
	fprintf(f, "C = 'd_ARM_cortex_A7';\n");
	fprintf(f, "\n");
#elif defined(TARGET_CORTEX_A15)
	fprintf(f, "C = 'd_ARM_cortex_A15';\n");
	fprintf(f, "\n");
#elif defined(TARGET_X86_ATOM)
	fprintf(f, "C = 'd_x86_atom';\n");
	fprintf(f, "\n");
#elif defined(TARGET_POWERPC_G2)
	fprintf(f, "C = 'd_PowerPC_G2';\n");
	fprintf(f, "\n");
#elif defined(TARGET_C99_4X4)
	fprintf(f, "C = 'd_c99_4x4';\n");
	fprintf(f, "\n");
#elif defined(TARGET_C99_4X4_PREFETCH)
	fprintf(f, "C = 'd_c99_4x4';\n");
	fprintf(f, "\n");
#elif defined(TARGET_C99_2X2)
	fprintf(f, "C = 'd_c99_2x2';\n");
	fprintf(f, "\n");
#endif

	fprintf(f, "A = [%f %f];\n", GHz_max, flops_max);
	fprintf(f, "\n");

	fprintf(f, "B = [\n");
	

	printf("\n");
	printf("Tested solvers:\n");
	printf("-sv : Riccati factorization and system solution (prediction step in IP methods)\n");
	printf("-trs: system solution after a previous call to Riccati factorization (correction step in IP methods)\n");
	printf("\n");
	printf("\n");
	
#if defined(TARGET_X64_AVX2) || defined(TARGET_X64_AVX) || defined(TARGET_X64_SSE3) || defined(TARGET_X86_ATOM) || defined(TARGET_AMD_SSE3)
/*	printf("\nflush to zero on\n");*/
	_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); // flush to zero subnormals !!! works only with one thread !!!
#endif

	// to throw floating-point exception
/*#ifndef __APPLE__*/
/*    feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);*/
/*#endif*/
	
	int ii, jj;
	
	const int bs = D_MR; //d_get_mr();
	const int ncl = D_NCL;
	const int nal = bs*ncl; // number of doubles per cache line
	
	int nn[] = {4, 6, 8, 10, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92, 96, 100, 104, 108, 112, 116, 120, 124, 128, 132, 136, 140, 144, 148, 152, 156, 160, 164, 168, 172, 176, 180, 184, 188, 192, 196, 200, 204, 208, 212, 216, 220, 224, 228, 232, 236, 240, 244, 248, 252, 256, 260, 264, 268, 272, 276, 280, 284, 288, 292, 296, 300};
	int nnrep[] = {10000, 10000, 10000, 10000, 10000, 4000, 4000, 2000, 2000, 1000, 1000, 400, 400, 400, 200, 200, 200, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 40, 40, 40, 40, 40, 20, 20, 20, 20, 20, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
	
	int vnx[] = {8, 12, 16, 24, 32, 48, 64, 96, 128, 192, 256, 512, 1024};
	int vnrep[] = {100, 100, 100, 100, 100, 100, 50, 50, 50, 20, 10, 10};
	int vN[] = {4, 8, 12, 16, 24, 32, 48, 64, 96, 128, 192, 256};

	int nx, nw, ny, ndN, N, nrep, Ns;
	int diag_R;

	int ll;
//	int ll_max = 77;
	int ll_max = 1;
	for(ll=0; ll<ll_max; ll++)
		{
		

		FILE* fid;
		double* yy;
		float* yy_temp;

		if(1)
			{
			fid = fopen("./test_problems/mhe_measure.dat", "r");
			if(fid==NULL)
				exit(-1);
			//printf("\nhola\n");
			int dummy_int = fscanf(fid, "%d %d %d %d", &nx, &nw, &ny, &Ns);
			//printf("\n%d %d %d %d\n", nx, nw, ny, Ns);
			yy_temp = (float*) malloc(ny*Ns*sizeof(float));
			yy = (double*) malloc(ny*Ns*sizeof(double));
			for(jj=0; jj<ny*Ns; jj++)
				{
				dummy_int = fscanf(fid, "%e", &yy_temp[jj]);
				yy[jj] = (double) yy_temp[jj];
				//printf("\n%f", yy[jj]);
				}
			//printf("\n");
			fclose(fid);
			#if 1
			N = 15; //Ns-1; // NN;
			nrep = NREP;//nnrep[ll];
			nx = 12;//nn[ll];
			nw = 5;//nn[ll];
			ny = 3;
			ndN = 0; //2;
			diag_R = 0;
			#else
			N = 10; //Ns-1; // NN;
			nrep = nnrep[ll];
			nx = nn[ll];
			nw = nn[ll];
			ny = 3;
			ndN = 0;
			diag_R = 0;
			#endif
			//printf("\nnx = %d; nw =  %d; ny =  %d; ndN = %d; N = %d\n\n", nx, nw, ny, ndN, N);
			}
		else if(ll_max==1)
			{
			nx = NX; // number of states (it has to be even for the mass-spring system test problem)
			nw = NU; // number of inputs (controllers) (it has to be at least 1 and at most nx/2 for the mass-spring system test problem)
			ny = nx/2; // size of measurements vector
			N  = NN; // horizon lenght
			nrep = NREP;
			}
		else
			{
			nx = nn[ll]; // number of states (it has to be even for the mass-spring system test problem)
			nw = 2; // number of inputs (controllers) (it has to be at least 1 and at most nx/2 for the mass-spring system test problem)
			ny = nx/2; // size of measurements vector
			N  = 10; // horizon lenght
			nrep = nnrep[ll];
			}

		int rep;
		
	
		const int nz = nx+ny; // TODO delete
		const int nwx = nw+nx;
		const int anz = nal*((nz+nal-1)/nal);
		const int anx = nal*((nx+nal-1)/nal);
		const int anw = nal*((nw+nal-1)/nal);
		const int any = nal*((ny+nal-1)/nal);
		const int pnz = bs*((nz+bs-1)/bs);
		const int pnx = bs*((nx+bs-1)/bs);
		const int pnw = bs*((nw+bs-1)/bs);
		const int pny = bs*((ny+bs-1)/bs);
		const int pnx2 = bs*((2*nx+bs-1)/bs);
		const int pnwx = bs*((nw+nx+bs-1)/bs);
		const int cnz = ncl*((nz+ncl-1)/ncl);
		const int cnx = ncl*((nx+ncl-1)/ncl);
		const int cnw = ncl*((nw+ncl-1)/ncl);
		const int cny = ncl*((ny+ncl-1)/ncl);
		const int cnx2 = 2*(ncl*((nx+ncl-1)/ncl));
		const int cnwx = ncl*((nw+nx+ncl-1)/ncl);
		const int cnwx1 = ncl*((nw+nx+1+ncl-1)/ncl);
		const int cnf = cnz<cnx+ncl ? cnx+ncl : cnz;

		const int pad = (ncl-(nx+nw)%ncl)%ncl; // packing between AGL & P
		const int cnl = nx+nw+pad+cnx;
		const int pad2 = (ncl-(nx)%ncl)%ncl; // packing between AGL & P
		const int cnl2 = cnz<cnx+ncl ? nx+pad2+cnx+ncl : nx+pad2+cnz;
	
/************************************************
* dynamical system
************************************************/	

		double *A; d_zeros(&A, nx, nx); // states update matrix

		double *B; d_zeros(&B, nx, nw); // inputs matrix

		double *b; d_zeros(&b, nx, 1); // states offset
		double *x0; d_zeros(&x0, nx, 1); // initial state

		double Ts = 0.5; // sampling time
		mass_spring_system(Ts, nx, nw, N, A, B, b, x0);
	
		for(jj=0; jj<nx; jj++)
			b[jj] = 0.0;
	
		for(jj=0; jj<nx; jj++)
			x0[jj] = 0.0;
		x0[0] = 3.5;
		x0[1] = 3.5;
	
		double *C; d_zeros(&C, ny, nx); // inputs matrix
		for(jj=0; jj<ny; jj++)
			C[jj*(ny+1)] = 1.0;

//		d_print_mat(nx, nx, A, nx);
//		d_print_mat(nx, nw, B, nx);
//		d_print_mat(ny, nx, C, ny);
//		d_print_mat(nx, 1, b, nx);
//		d_print_mat(nx, 1, x0, nx);
	
		/* packed into contiguous memory */
		double *pA; d_zeros_align(&pA, pnx, cnx);
		d_cvt_mat2pmat(nx, nx, A, nx, 0, pA, cnx);

		double *pG; d_zeros_align(&pG, pnx, cnw);
		d_cvt_mat2pmat(nx, nw, B, nx, 0, pG, cnw);
		
		double *pC; d_zeros_align(&pC, pny, cnx);
		d_cvt_mat2pmat(ny, nx, C, ny, 0, pC, cnx);
		
		double *pCA; d_zeros_align(&pCA, pnz, cnx);
		d_cvt_mat2pmat(ny, nx, C, ny, 0, pCA, cnx);
		d_cvt_mat2pmat(nx, nx, A, nx, ny, pCA+(ny/bs)*bs+ny%bs, cnx);

//		d_print_pmat(nx, nx, bs, pA, cnx);
//		d_print_pmat(nx, nw, bs, pG, cnw);
//		d_print_pmat(ny, nx, bs, pC, cnx);

/************************************************
* cost function
************************************************/	

		double *R; d_zeros(&R, nw, nw);
		for(jj=0; jj<nw; jj++)
			R[jj*(nw+1)] = 1.0;

		double *Q; d_zeros(&Q, ny, ny);
		for(jj=0; jj<ny; jj++)
			Q[jj*(ny+1)] = 1.0;

		double *Qx; d_zeros(&Qx, nx, nx);
		for(jj=0; jj<ny; jj++)
			for(ii=0; ii<ny; ii++)
				Qx[ii+nx*jj] = Q[ii+ny*jj];

		double *L0; d_zeros(&L0, nx, nx);
		for(jj=0; jj<nx; jj++)
			L0[jj*(nx+1)] = 1.0;

		double *q; d_zeros_align(&q, any, 1);
		for(jj=0; jj<ny; jj++)
			q[jj] = 0.0;

		double *r; d_zeros_align(&r, anw, 1);
		for(jj=0; jj<nw; jj++)
			r[jj] = 1.0;

		double *f; d_zeros_align(&f, anx, 1);
		for(jj=0; jj<nx; jj++)
			f[jj] = jj;//1.0; //b[jj]; //1.0;

		/* packed into contiguous memory */
		double *pR; d_zeros_align(&pR, pnw, cnw);
		d_cvt_mat2pmat(nw, nw, R, nw, 0, pR, cnw);

		double *pQ; d_zeros_align(&pQ, pny, cny);
		d_cvt_mat2pmat(ny, ny, Q, ny, 0, pQ, cny);

//		d_print_pmat(nw, nw, bs, pQ, cnw);
//		d_print_pmat(ny, ny, bs, pR, cny);

/************************************************
* compound quantities
************************************************/	
		
		double *pRG; d_zeros_align(&pRG, pnwx, cnw);
		d_cvt_mat2pmat(nw, nw, R, nw, 0, pRG, cnw);
		d_cvt_mat2pmat(nx, nw, B, nx, nw, pRG+(nw/bs)*bs*cnw+nw%bs, cnw);
		//d_print_pmat(nw+nx, nw, bs, pRG, cnw);

		double *pQA; d_zeros_align(&pQA, pnx2, cnx);
		d_cvt_mat2pmat(ny, ny, Q, ny, 0, pQA, cnx);
		d_cvt_mat2pmat(nx, nx, A, nx, nx, pQA+(nx/bs)*bs*cnx+nx%bs, cnx);
		//d_print_pmat(2*nx, cnx, bs, pQA, cnx);
		//exit(1);

/************************************************
* series of matrices
************************************************/	

		double *(hpA[N]);
		double *(hpCA[N]);
		double *(hpG[N]);
		double *(hpC[N+1]);
		double *(hpR[N]);
		double *(hpQ[N+1]);
		double *(hpLp[N+1]);
		double *(hdLp[N+1]);
		double *(hpLp2[N+1]);
		double *(hpLe[N+1]);
		double *(hq[N]);
		double *(hr[N+1]);
		double *(hf[N]);
		double *(hxe[N+1]);
		double *(hxp[N+1]);
		double *(hw[N]);
		double *(hy[N+1]);
		double *(hlam[N]);

		double *(hpRG[N]);
		double *(hpQA[N+1]);
		double *(hpGLr[N]);
		double *(hpALe[N+1]);
		double *(hrr[N]);
		double *(hqq[N+1]);
		double *(hff[N+1]);
		double *p_hrr; d_zeros_align(&p_hrr, anw, N);
		double *p_hqq; d_zeros_align(&p_hqq, anx, N+1);
		double *p_hff; d_zeros_align(&p_hff, anx, N+1);

		double *p_hxe; d_zeros_align(&p_hxe, anx, N+1);
		double *p_hxp; d_zeros_align(&p_hxp, anx, N+1);
		double *p_hw; d_zeros_align(&p_hw, anw, N);
		double *p_hy; d_zeros_align(&p_hy, any, N+1);
		double *p_hlam; d_zeros_align(&p_hlam, anx, N+1);

		double *(hq_res[N+1]);
		double *(hr_res[N]);
		double *(hf_res[N+1]);
		double *p_hq_res; d_zeros_align(&p_hq_res, anx, N+1);
		double *p_hr_res; d_zeros_align(&p_hr_res, anw, N);
		double *p_hf_res; d_zeros_align(&p_hf_res, anx, N+1);

		for(jj=0; jj<N; jj++)
			{
			hpA[jj] = pA;
			hpCA[jj] = pCA;
			hpG[jj] = pG;
			hpC[jj] = pC;
			hpR[jj] = pR;
			hpQ[jj] = pQ;
			d_zeros_align(&hpLp[jj], pnx, cnl);
			d_zeros_align(&hdLp[jj], anx, 1);
			d_zeros_align(&hpLp2[jj], pnz, cnl2);
			d_zeros_align(&hpLe[jj], pnz, cnf);
			hr[jj] = r;
			hq[jj] = q;
			hf[jj] = f;

			hpRG[jj] = pRG;
			hpQA[jj] = pQA;
			d_zeros_align(&hpGLr[jj], pnwx, cnw);
			d_zeros_align(&hpALe[jj], pnx2, cnx2);
			hrr[jj] = p_hrr+jj*anw;
			hqq[jj] = p_hqq+jj*anx;
			hff[jj] = p_hff+jj*anx;

			hxe[jj] = p_hxe+jj*anx; //d_zeros_align(&hxe[jj], anx, 1);
			hxp[jj] = p_hxp+jj*anx; //d_zeros_align(&hxp[jj], anx, 1);
			hw[jj] = p_hw+jj*anw; //d_zeros_align(&hw[jj], anw, 1);
			hy[jj] = p_hy+jj*any; //d_zeros_align(&hy[jj], any, 1);
			hlam[jj] = p_hlam+jj*anx; //d_zeros_align(&hlambda[jj], anx, 1);

			hq_res[jj] = p_hq_res+jj*anx;
			hr_res[jj] = p_hr_res+jj*anw;
			hf_res[jj] = p_hf_res+jj*anx;
			}

		hpC[N] = pC;
		hpQ[N] = pQ;
		d_zeros_align(&hpLp[N], pnx, cnl);
		d_zeros_align(&hdLp[N], anx, 1);
		d_zeros_align(&hpLp2[N], pnz, cnl2);
		d_zeros_align(&hpLe[N], pnz, cnf);
		hq[N] = q;

		// equality constraints on the states at the last stage
		double *D; d_zeros(&D, ndN, nx);
		for(ii=0; ii<ndN; ii++) D[ii*(ndN+1)] = 1;
		//D[0+ndN*0] = 1;
		//D[1+ndN*(nx-1)] = 1;
		double *d; d_zeros_align(&d, ndN, 1);
		for(ii=0; ii<ndN; ii++) d[ii] = ii;
		//d[0] = 1;
		//d[1] = 0;
		const int pnxdN = bs*((nx+ndN+bs-1)/bs);
		double *pCtQC; d_zeros_align(&pCtQC, pnxdN, cnx);
		d_cvt_mat2pmat(ny, ny, Q, ny, 0, pCtQC, cnx);
		d_cvt_mat2pmat(ndN, nx, D, ndN, nx, pCtQC+nx/bs*bs*cnx+nx%bs, cnx);
		//d_print_pmat(nx+ndN, nx, bs, pCtRC, cnx);
		hpQA[N] = pCtQC; // there is not A_N
		d_zeros_align(&hpALe[N], pnxdN, cnx2); // there is not A_N: pnx not pnx2
		hqq[N] = p_hqq+N*anx;
		hff[N] = p_hff+N*anx;
		const int pndN = bs*((ndN+bs-1)/bs);
		const int cndN = ncl*((ndN+ncl-1)/ncl);
		double *Ld; d_zeros_align(&Ld, pndN, cndN);
		double *d_res; d_zeros_align(&d_res, pndN, 1);



		hxe[N] = p_hxe+N*anx; //d_zeros_align(&hxe[N], anx, 1);
		hxp[N] = p_hxp+N*anx; //d_zeros_align(&hxp[N], anx, 1);
		hy[N] = p_hy+N*any; //d_zeros_align(&hy[N], any, 1);
		hlam[N] = p_hlam+N*anx; //d_zeros_align(&hlambda[jj], anx, 1);

		hf_res[N] = p_hf_res+N*anx;
		hq_res[N] = p_hq_res+N*anx;

		// initialize hpLp[0] with the cholesky factorization of /Pi_p
		d_cvt_mat2pmat(nx, nx, L0, nx, 0, hpLp[0]+(nx+nw+pad)*bs, cnl);
		for(ii=0; ii<nx; ii++) hdLp[0][ii] = 1.0/L0[ii*(nx+1)];
		d_cvt_mat2pmat(nx, nx, L0, nx, ny, hpLp2[0]+(ny/bs)*bs+ny%bs+(nx+pad2+ny)*bs, cnl2);
		dtrtr_l_lib(nx, ny, hpLp2[0]+(ny/bs)*bs*cnl2+ny%bs+(nx+pad2+ny)*bs, cnl2, hpLp2[0]+(nx+pad2+ncl)*bs, cnl2);	
		//d_print_pmat(nx, cnl, bs, hpLp[0], cnl);
		//d_print_pmat(nz, cnl2, bs, hpLp2[0], cnl2);

		// buffer for L0
		double *pL0; d_zeros_align(&pL0, pnx, cnx);
		d_cvt_mat2pmat(nx, nx, L0, nx, 0, pL0, cnx);
		// invert L0 in hpALe[0]
		dtrinv_lib(nx, pL0, cnx, hpALe[0], cnx2);
		double *pL0_inv; d_zeros_align(&pL0_inv, pnx, cnx);
		dtrinv_lib(nx, pL0, cnx, pL0_inv, cnx);
		//d_print_pmat(nx, nx, bs, pL0, cnx);
		//d_print_pmat(nx, nx, bs, pL0_inv, cnx);
		//d_print_pmat(pnx2, cnx2, bs, hpALe[0], cnx2);
		//exit(1);

		//double *work; d_zeros_align(&work, pny*cnx+pnz*cnz+anz+pnz*cnf+pnw*cnw, 1);
		double *work; d_zeros_align(&work, 2*pny*cnx+anz+pnw*cnw+pnx*cnx, 1);
		//printf("\nciao %d %d %d %d %d %d\n", pny, cnx, anz, pnw, cnw, pnx);

		double *work2; d_zeros_align(&work2, 2*pny*cnx+pnw*cnw+pnx*cnw+2*pnx*cnx+anz, 1);

		double *work3; d_zeros_align(&work3, pnx*cnl+anx, 1);
		double *work4; d_zeros_align(&work4, 4*anx+2*(anx+anw), 1);
//		for(jj=0; jj<2*pny*cnx+anz+pnw*cnw+pnx*cnx; jj++)
//			work[jj] = -100.0;

		// measurements
		for(jj=0; jj<=N; jj++)
			for(ii=0; ii<ny; ii++)
				hy[jj][ii] = yy[jj*ny+ii];

		//d_print_mat(ny, N+1, hy[0], any);

		// initial guess
		for(ii=0; ii<nx; ii++)
			x0[ii] = 0.0;
		for(ii=0; ii<nx; ii++)
			hxp[0][ii] = x0[ii];



		// information filter - solution
		double *y_temp; d_zeros_align(&y_temp, any, 1);
		for(ii=0; ii<N; ii++) for(jj=0; jj<nw; jj++) hrr[ii][jj] = r[jj];
		for(ii=0; ii<N; ii++) for(jj=0; jj<nx; jj++) hff[ii][jj] = f[jj];
		for(jj=0; jj<ndN; jj++) hff[N][jj] = d[jj];
		for(ii=0; ii<=N; ii++) 
			{
			for(jj=0; jj<ny; jj++) y_temp[jj] = - q[jj];
			//d_print_mat(1, ny, y_temp, 1);
			dsymv_lib(ny, ny, hpQ[ii], cny, hy[ii], y_temp, y_temp, -1);
			//d_print_mat(1, ny, y_temp, 1);
			dgemv_t_lib(ny, nx, hpC[ii], cnx, y_temp, hqq[ii], hqq[ii], 0);
			//d_print_mat(1, nx, hqq[ii], 1);
			//if(ii==9)
			//exit(1);
			}
		//exit(1);




/************************************************
* new low-level mhe_if interface
************************************************/	

		int nrows = pnx>pnw ? 2*pnx : pnx+pnw;
		int ncols = cnwx1;

		double *pQRAG; d_zeros_align(&pQRAG, nrows, ncols);

		if(nx>=nw)
			{
			d_cvt_mat2pmat(ny, ny, Q, ny, 0, pQRAG, cnwx1);
			d_cvt_mat2pmat(nx, nx, A, nx, 0, pQRAG+pnx*cnwx1, cnwx1);
			d_cvt_mat2pmat(nw, nw, R, nw, 0, pQRAG+(pnx-pnw)*cnwx1+nx*bs, cnwx1);
			d_cvt_mat2pmat(nx, nw, B, nx, 0, pQRAG+pnx*cnwx1+nx*bs, cnwx1);
			//d_print_pmat(nrows, ncols, bs, pQRAG, ncols);
			if(nx>pnx-nx)
				d_cvt_mat2pmat(pnx-nx, nx, A+(nx-pnx+nx), nx, nx, pQRAG+nx/bs*bs*cnwx1+nx%bs, cnwx1);
			else
				d_cvt_mat2pmat(nx, nx, A, nx, nx, pQRAG+nx/bs*bs*cnwx1+nx%bs, cnwx1);
			if(nx>pnw-nw)
				d_cvt_mat2pmat(pnw-nw, nw, B+(nx-pnw+nw), nx, nw, pQRAG+(pnx-pnw+nw/bs*bs)*cnwx1+nw%bs+nx*bs, cnwx1);
			else
				d_cvt_mat2pmat(nx, nw, B, nx, nw, pQRAG+(pnx-pnw+nw/bs*bs)*cnwx1+nw%bs+nx*bs, cnwx1);
			//d_print_pmat(nrows, ncols, bs, pQRAG, ncols);
			}
		else
			{
			d_cvt_mat2pmat(ny, ny, Q, ny, 0, pQRAG+(pnw-pnx)*cnwx1, cnwx1);
			d_cvt_mat2pmat(nx, nx, A, nx, 0, pQRAG+pnw*cnwx1, cnwx1);
			d_cvt_mat2pmat(nw, nw, R, nw, 0, pQRAG+nx*bs, cnwx1);
			d_cvt_mat2pmat(nx, nw, B, nx, 0, pQRAG+pnw*cnwx1+nx*bs, cnwx1);
			//d_print_pmat(nrows, ncols, bs, pQRAG, ncols);
			if(nx>pnx-nx)
				d_cvt_mat2pmat(pnx-nx, nx, A+(nx-pnx+nx), nx, nx, pQRAG+(pnw-pnx+nx/bs*bs)*cnwx1+nx%bs, cnwx1);
			else
				d_cvt_mat2pmat(nx, nx, A, nx, nx, pQRAG+(pnw-pnx+nx/bs*bs)*cnwx1+nx%bs, cnwx1);
			if(nx>pnw-nw)
				d_cvt_mat2pmat(pnw-nw, nw, B+(nx-pnw+nw), nx, nw, pQRAG+nw/bs*bs*cnwx1+nw%bs+nx*bs, cnwx1);
			else
				d_cvt_mat2pmat(nx, nw, B, nx, nw, pQRAG+nw/bs*bs*cnwx1+nw%bs+nx*bs, cnwx1);
			//d_print_pmat(nrows, ncols, bs, pQRAG, ncols);
			}

		double *pQD; d_zeros_align(&pQD, pnx+pndN, cnx);
		d_cvt_mat2pmat(ny, ny, Q, ny, 0, pQD, cnx);
		d_cvt_mat2pmat(ndN, nx, D, ndN, 0, pQD+pnx*cnx, cnx);
		//d_print_pmat(pnx+pndN, cnx, bs, pQD, cnx);
		if(ndN>pnx-nx)
			d_cvt_mat2pmat(pnx-nx, nx, D+(ndN-pnx+nx), ndN, nx, pQD+nx/bs*bs*cnx+nx%bs, cnx);
		else
			d_cvt_mat2pmat(ndN, nx, D, ndN, nx, pQD+nx/bs*bs*cnx+nx%bs, cnx);
		//d_print_pmat(pnx+pndN, cnx, bs, pQD, cnx);
		//exit(1);




		double *(hpQRAG[N+1]);
		double *(hpLAG[N+1]);
		double *(hpLe2[N+1]);

		for(ii=0; ii<N; ii++)	
			{
			hpQRAG[ii] = pQRAG;
			d_zeros_align(&hpLAG[ii], nrows, ncols);
			d_zeros_align(&hpLe2[ii], pnx, cnx);
			}
		hpQRAG[N] = pQD;
		d_zeros_align(&hpLAG[N], pnx+pndN, cnx);
		d_zeros_align(&hpLe2[N], pnx, cnx);
		d_cvt_mat2pmat(nx, nx, L0, nx, 0, hpLe2[0], cnx);
		//d_print_pmat(nx, nx, bs, hpLe2[0], cnx);



		double **dummy;
#if 0

		struct timeval tv10, tv11, tv12;

		// double precision
		gettimeofday(&tv10, NULL); // start

		for(ii=0; ii<1; ii++)
		//for(ii=0; ii<nrep; ii++)
			{

			d_ric_trf_mhe_if(nx, nw, ndN, N, hpQRAG, diag_R, hpLe2, hpLAG, Ld, work3);
			//d_ric_trf_mhe_if(nx, nw, ndN, N, hpQA, hpRG, diag_R, hpALe, hpGLr, Ld, work3);

			}

		gettimeofday(&tv11, NULL); // stop

		for(ii=0; ii<1; ii++)
		//for(ii=0; ii<nrep; ii++)
			{

			d_ric_trs_mhe_if(nx, nw, ndN, N, hpLe2, hpLAG, Ld, hqq, hrr, hff, hxp, hxe, hw, hlam, work3);

			}

		gettimeofday(&tv12, NULL); // stop

		float time_trf_mhe_if_new = (float) (tv11.tv_sec-tv10.tv_sec)/(nrep+0.0)+(tv11.tv_usec-tv10.tv_usec)/(nrep*1e6);
		float time_trs_mhe_if_new = (float) (tv12.tv_sec-tv11.tv_sec)/(nrep+0.0)+(tv12.tv_usec-tv11.tv_usec)/(nrep*1e6);

		printf("\ntime = %e\t%e\n\n", time_trf_mhe_if_new, time_trs_mhe_if_new);




		//exit(1);
#endif


/************************************************
* reference code
************************************************/	

		double *(hA[N]);
		double *(hG[N]);
		double *(hQ[N+1]);
		double *(hR[N]);
		double *(hAGU[N]);
		double *(hUp[N+1]);
		double *(hUe[N+1]);
		double *(hUr[N]);
		double *Ud;
		double *work_ref;

		for(ii=0; ii<N; ii++)
			{
			hA[ii] = A;
			hG[ii] = B;
			hQ[ii] = Qx;
			hR[ii] = R;
			d_zeros(&hAGU[ii], nx, nx+nw);
			d_zeros(&hUp[ii], nx, nx);
			d_zeros(&hUe[ii], nx, nx);
			d_zeros(&hUr[ii], nw, nw);
			}
		hA[N] = D;
		hQ[N] = Qx;
		d_zeros(&hAGU[N], ndN, nx);
		d_zeros(&hUp[N], nx, nx);
		d_zeros(&hUe[N], nx, nx);
		d_zeros(&Ud, ndN, ndN);
		d_zeros(&work_ref, nx+nw, 1);

		for(ii=0; ii<nx*nx; ii++)
			hUp[0][ii] = L0[ii];



		#if 0

		printf("\nfactorization\n");
		d_ric_trf_mhe_if_blas( nx, nw, ndN, N, hA, hG, hQ, hR, hAGU, hUp, hUe, hUr, Ud);

		printf("\nsolution\n");
		d_ric_trs_mhe_if_blas( nx, nw, ndN, N, hAGU, hUp, hUe, hUr, Ud, hqq, hrr, hff, hxp, hxe, hw, hlam, work_ref);

		//d_print_mat(nx, nx, hUe[N], nx);
		//exit(1);

		#endif




/************************************************
* high-level interface
************************************************/	

#if 0
		int kk;

		double *AA; d_zeros(&AA, nx, nx*N);
		//for(ii=0; ii<N; ii++) for(jj=0; jj<nx; jj++) for(ll=0; ll<nx; ll++) AA[ll+nx*jj+nx*nx*ii] = A[ll+nx*jj];
		for(ii=0; ii<N; ii++) for(jj=0; jj<nx; jj++) for(kk=0; kk<nx; kk++) AA[jj+nx*kk+nx*nx*ii] = A[kk+nx*jj];

		double *GG; d_zeros(&GG, nx, nw*N);
		//for(ii=0; ii<N; ii++) for(jj=0; jj<nw; jj++) for(ll=0; ll<nx; ll++) GG[ll+nx*jj+nx*nw*ii] = B[ll+nx*jj];
		for(ii=0; ii<N; ii++) for(jj=0; jj<nw; jj++) for(kk=0; kk<nx; kk++) GG[jj+nw*kk+nx*nw*ii] = B[kk+nx*jj];

		double *ff; d_zeros(&ff, nx, N);
		for(ii=0; ii<N; ii++) for(jj=0; jj<nx; jj++) ff[jj+nx*ii] = f[jj];

		double *DD; d_zeros(&DD, ndN, nx);
		//for(jj=0; jj<nx; jj++) for(ll=0; ll<ndN; ll++) DD[ll+ndN*jj] = D[ll+ndN*jj];
		for(jj=0; jj<nx; jj++) for(kk=0; kk<ndN; kk++) DD[jj+nx*kk] = D[kk+ndN*jj];

		double *dd; d_zeros(&dd, ndN, 1);
		for(kk=0; kk<ndN; kk++) dd[kk] = d[kk];

		double *RR; d_zeros(&RR, nw, nw*N);
		for(ii=0; ii<N; ii++) for(jj=0; jj<nw*nw; jj++) RR[jj+nw*nw*ii] = R[jj];

		double *QQ; d_zeros(&QQ, nx, nx*N);
		for(ii=0; ii<N; ii++) 
			{
			for(jj=0; jj<ny; jj++) for(kk=0; kk<ny; kk++) QQ[kk+nx*jj+nx*nx*ii] = Q[kk+ny*jj];
			//for(jj=ny; jj<nx; jj++) QQ[jj+nx*jj+nx*nx*ii] = 1e-8;
			}

		double *Qf; d_zeros(&Qf, nx, nx);
		for(jj=0; jj<ny; jj++) for(kk=0; kk<ny; kk++) Qf[kk+nx*jj] = Q[kk+ny*jj];

		double *rr; d_zeros(&rr, nw, N);
		for(ii=0; ii<N; ii++) for(jj=0; jj<nw; jj++) rr[jj+nw*ii] = r[jj];

		double *qq; d_zeros(&qq, nx, N);
		for(ii=0; ii<N; ii++) for(jj=0; jj<ny; jj++) qq[jj+nx*ii] = q[jj];
		double *yy_tmp; d_zeros_align(&yy_tmp, any, 1);
		for(ii=0; ii<N; ii++) 
			{
			for(jj=0; jj<ny; jj++) yy_tmp[jj] = - q[jj];
			dsymv_lib(ny, ny, hpQ[ii], cny, hy[ii], yy_tmp, -1);
			dgemv_t_lib(ny, nx, hpC[ii], cnx, yy_tmp, &qq[ii*nx], 0);
			}

		double *qf; d_zeros(&qf, nx, 1);
//		for(jj=0; jj<ny; jj++) qf[jj] = q[jj];
//		if(ndN>0) 
//			{
			for(jj=0; jj<ny; jj++) yy_tmp[jj] = - q[jj];
			dsymv_lib(ny, ny, hpQ[N], cny, hy[N], yy_tmp, -1);
			dgemv_t_lib(ny, nx, hpC[N], cnx, yy_tmp, qf, 0);
//			}

		double *xx0; d_zeros(&xx0, nx, 1);

		double *LL0; d_zeros(&LL0, nx, nx);

		double *xxe; d_zeros(&xxe, nx, N+1);

		double *LLe; d_zeros(&LLe, nx, nx);

		double *ww; d_zeros(&ww, nw, N);

		double *llam; d_zeros(&llam, nx, N+1);

		double *work_high_level; d_zeros(&work_high_level, hpmpc_ric_mhe_if_dp_work_space(nx, nw, ny, ndN, N), 1);

		double *dummy0;

		struct timeval tv00, tv01;

		int error_code;

		printf("\nhigh-level\n");

		// double precision
		gettimeofday(&tv00, NULL); // start

		for(ii=0; ii<nrep; ii++)
			{

			for(jj=0; jj<nx; jj++) xx0[jj] = x0[jj];
			for(jj=0; jj<nx*nx; jj++) LL0[jj] = L0[jj];

			//error_code = fortran_order_riccati_mhe_if( 'd', 2, nx, nw, 0, ndN, N, AA, GG, dummy, ff, DD, dd, RR, QQ, Qf, rr, qq, qf, dummy, xx0, LL0, xxe, LLe, ww, llam, work_high_level);
			error_code = c_order_riccati_mhe_if( 'd', 2, nx, nw, 0, ndN, N, AA, GG, dummy0, ff, DD, dd, RR, QQ, Qf, rr, qq, qf, dummy0, xx0, LL0, xxe, LLe, ww, llam, work_high_level);

			//if(error_code)
			//	break;

			}

		gettimeofday(&tv01, NULL); // stop

		float time_mhe_if_high_level = (float) (tv01.tv_sec-tv00.tv_sec)/(nrep+0.0)+(tv01.tv_usec-tv00.tv_usec)/(nrep*1e6);

		printf("\nhigh-level interface for MHE_if\n\nerror_code: %d, time = %e\n\n", error_code, time_mhe_if_high_level);

		//d_print_mat(nx, N+1, xxe, nx);
		//d_print_mat(nw, N, ww, nw);

		free(AA);
		free(GG);
		free(ff);
		free(DD);
		free(dd);
		free(RR);
		free(QQ);
		free(Qf);
		free(rr);
		free(qq);
		free(qf);
		free(xx0);
		free(LL0);
		free(xxe);
		free(LLe);
		free(ww);
		free(llam);
		free(work_high_level);
		free(yy_tmp);

		//exit(1);
#endif


/************************************************
* call the solver
************************************************/	

		//d_print_mat(nx, nx, A, nx);
		//d_print_mat(nx, nw, B, nx);

		//d_ric_trf_mhe_test(nx, nw, ny, N, hpA, hpG, hpC, hpLp, hpQ, hpR, hpLe, work);
		d_ric_trf_mhe(nx, nw, ny, N, hpA, hpG, hpC, hpLp, hdLp, hpR, hpQ, hpLe, work);

		// estimation
		d_ric_trs_mhe(nx, nw, ny, N, hpA, hpG, hpC, hpLp, hdLp, hpR, hpQ, hpLe, hr, hq, hf, hxp, hxe, hw, hy, 0, hlam, work);

#if 0
		// print solution
		printf("\nx_e\n");
		d_print_mat(nx, N+1, hxe[0], anx);
#endif
	
		// smooth estimation
		d_ric_trs_mhe(nx, nw, ny, N, hpA, hpG, hpC, hpLp, hdLp, hpR, hpQ, hpLe, hr, hq, hf, hxp, hxe, hw, hy, 1, hlam, work);

		//d_print_pmat(nx, nx, bs, hpLp[N-1]+(nx+nw+pad)*bs, cnl);
		//d_print_pmat(nx, nx, bs, hpLp[N]+(nx+nw+pad)*bs, cnl);
		//d_print_pmat(nx, nx, bs, hpLe[N-1]+ncl*bs, cnf);
		//d_print_pmat(nx, nx, bs, hpLe[N]+ncl*bs, cnf);

#if 1
		printf("\nx_s\n");
		//d_print_mat(nx, N+1, hxp[0], anx);
		d_print_mat(nw, N, hw[0], anw);
		d_print_mat(nx, N+1, hxe[0], anx);
		//d_print_mat(nx, N, hlam[0], anx);
#endif

		// information filter - factorization
		//d_ric_trf_mhe_if(nx, nw, ndN, N, hpQA, hpRG, diag_R, hpALe, hpGLr, Ld, work3);
		d_ric_trf_mhe_if(nx, nw, ndN, N, hpQRAG, diag_R, hpLe2, hpLAG, Ld, work3);

		// information filter - solution
		//d_ric_trs_mhe_if(nx, nw, ndN, N, hpALe, hpGLr, Ld, hqq, hrr, hff, hxp, hxe, hw, hlam, work3);
		d_ric_trs_mhe_if(nx, nw, ndN, N, hpLe2, hpLAG, Ld, hqq, hrr, hff, hxp, hxe, hw, hlam, work3);
		//d_ric_trs_mhe(nx, nw, ny, N, hpA, hpG, hpC, hpLp, hdLp, hpQ, hpR, hpLe, hq, hr, hf, hxp, hxe, hw, hy, 1, hlam, work);

		//d_print_pmat(nx, nx, bs, hpALe[N-1], cnx2);
		//d_print_pmat(nx, nx, bs, hpALe[N], cnx2);
		//d_print_pmat(nx, nx, bs, hpALe[N-2]+cnx*bs, cnx2);
		//d_print_pmat(nx, nx, bs, hpALe[N-1]+cnx*bs, cnx2);
		//d_print_pmat(nx, nx, bs, hpALe[N]+cnx*bs, cnx2);
		//d_print_pmat(nx, nx, bs, hpRA[N], cnx);

#if 1
		printf("\nx_s_if\n");
		//d_print_mat(nx, N+1, hxp[0], anx);
		d_print_mat(nw, N, hw[0], anw);
		d_print_mat(nx, N+1, hxe[0], anx);
		//d_print_mat(nx, N, hlam[0], anx);
		//exit(1);
#endif

		//d_print_pmat(nw, nw, bs, hpQ[0], cnw);
		//d_print_pmat(nx, nw, bs, hpG[0], cnw);
		//d_print_mat(nw, 1, hq[0], nw);
		//d_print_mat(nw, 1, hw[0], nw);
		//d_print_mat(nx, 1, hlam[0], nx);
		//exit(3);

#if 1
		int nZ = nw+nx+1;
		int pnZ = (nw+nx+1+bs-1)/bs*bs;
		int cnZ = (nw+nx+1+ncl-1)/ncl*ncl;

		int cnL = cnZ>cnx+ncl ? cnZ : cnx+ncl;

		double *(hpRSQrq[N+1]); 
		for(ii=0; ii<=N; ii++)
			{
			d_zeros_align(&hpRSQrq[ii], pnZ, cnZ);
			d_cvt_mat2pmat(nw, nw, R, nw, 0, hpRSQrq[ii], cnZ);
			d_cvt_mat2pmat(ny, ny, Q, ny, nw, hpRSQrq[ii]+nw/bs*bs*cnZ+nw%bs+nw*bs, cnZ);
			d_cvt_mat2pmat(1, nw, r, 1, nw+nx, hpRSQrq[ii]+(nw+nx)/bs*bs*cnZ+(nw+nx)%bs, cnZ);
			d_cvt_mat2pmat(1, nx, hqq[ii], 1, nw+nx, hpRSQrq[ii]+(nw+nx)/bs*bs*cnZ+(nw+nx)%bs+nw*bs, cnZ);
			//d_print_pmat(nZ, nZ, bs, hpRSQrq[ii], cnZ);
			}

		double *pP0; d_zeros_align(&pP0, pnx, cnx);
		d_cvt_mat2pmat(nx, nx, L0, nx, 0, pP0, cnx);
		//d_print_pmat(nx, nx, bs, pP0, cnx);
		dgead_lib(nx, nx, 1.0, 0, pP0, cnx, nw, hpRSQrq[0]+nw/bs*bs*cnZ+nw%bs+nw*bs, cnZ); 
		//d_print_pmat(nZ, nZ, bs, hpRSQrq[0], cnZ);

		double *pBAbt; d_zeros_align(&pBAbt, pnZ, cnx);
		d_cvt_tran_mat2pmat(nx, nw, B, nx, 0, pBAbt, cnx);
		d_cvt_tran_mat2pmat(nx, nx, A, nx, nw, pBAbt+nw/bs*bs*cnx+nw%bs, cnx);
		d_cvt_mat2pmat(1, nx, f, 1, nw+nx, pBAbt+(nw+nx)/bs*bs*cnx+(nw+nx)%bs, cnx);
		//d_print_pmat(nZ, nx, bs, pBAbt, cnx);

		double *(hpBAbt[N]);
		for(ii=0; ii<N; ii++)
			{
			hpBAbt[ii] = pBAbt;
			}

		double *(hpLam[N+1]);
		for(ii=0; ii<=N; ii++)
			{
			d_zeros_align(&hpLam[ii], pnZ, cnL);
			}

		double *work_ric; d_zeros_align(&work_ric, pnZ, cnx);
		double *diag_ric; d_zeros_align(&diag_ric, pnZ, 1);

		double *hux_mat; d_zeros_align(&hux_mat, pnZ, N+1);
		double *(hux[N+1]);
		for(ii=0; ii<=N; ii++)
			{
			hux[ii] = hux_mat+ii*pnZ;
			}

		double **pdummy;

		d_back_ric_sv(N, nx, nw, hpBAbt, hpRSQrq, 0, pdummy, pdummy, 0, hux, hpLam, work_ric, diag_ric, 0, pdummy, 0, pdummy, 0, 0, 0, pdummy, pdummy, pdummy);

		d_print_mat(nw, N+1, hux_mat, pnZ);
		d_print_mat(nx, N+1, hux_mat+nw, pnZ);

		exit(1);

#endif

		// compute residuals
		double *p0; d_zeros_align(&p0, anx, 1);
		double *x_temp; d_zeros_align(&x_temp, anx, 1);
		dtrmv_u_t_lib(nx, pL0_inv, cnx, x0, x_temp, 0);
		dtrmv_u_n_lib(nx, pL0_inv, cnx, x_temp, p0, 0);
		d_res_mhe_if(nx, nw, ndN, N, hpQA, hpRG, pL0_inv, hqq, hrr, hff, p0, hxe, hw, hlam, hq_res, hr_res, hf_res, work4);

//		printf("\nprint residuals\n\n");
//		d_print_mat(nx, N+1, hq_res[0], anx);
//		d_print_mat(nw, N, hr_res[0], anw);
//		d_print_mat(nx, N, hf_res[0], anx);
//		d_print_mat(ndN, 1, hf_res[0]+N*anx, anx);

		//return 0;
		//exit(1);

		if(0 && PRINTRES)
			{
			// print solution
			printf("\nx_p\n");
			d_print_mat(nx, N+1, hxp[0], anx);
			printf("\nx_s\n");
			d_print_mat(nx, N+1, hxe[0], anx);
			printf("\nw\n");
			d_print_mat(nw, N+1, hw[0], anw);
			//printf("\nL_p\n");
			//d_print_pmat(nx, nx, bs, hpLp[0]+(nx+nw+pad)*bs, cnl);
			//d_print_mat(1, nx, hdLp[0], 1);
			//d_print_pmat(nx, nx, bs, hpLp[1]+(nx+nw+pad)*bs, cnl);
			//d_print_mat(1, nx, hdLp[1], 1);
			//d_print_pmat(nx, nx, bs, hpLp[2]+(nx+nw+pad)*bs, cnl);
			//d_print_mat(1, nx, hdLp[2], 1);
			//d_print_pmat(nx, nx, bs, hpLp[N]+(nx+nw+pad)*bs, cnl);
			//d_print_mat(1, nx, hdLp[N], 1);
			//printf("\nL_p\n");
			//d_print_pmat(nz, nz, bs, hpLp2[0]+(nx+pad2)*bs, cnl2);
			//d_print_pmat(nz, nz, bs, hpLp2[1]+(nx+pad2)*bs, cnl2);
			//d_print_pmat(nz, nz, bs, hpLp2[2]+(nx+pad2)*bs, cnl2);
			//printf("\nL_e\n");
			//d_print_pmat(nz, nz, bs, hpLe[0], cnf);
			//d_print_pmat(nz, nz, bs, hpLe[1], cnf);
			//d_print_pmat(nz, nz, bs, hpLe[2], cnf);
			//d_print_pmat(nx, nx, bs, hpA[0], cnx);
			}


		// timing 
		struct timeval tv0, tv1, tv2, tv3, tv4, tv5, tv6, tv7, tv8;

		// double precision
		gettimeofday(&tv0, NULL); // start

		// factorize
		for(rep=0; rep<nrep; rep++)
			{
			//d_ric_trf_mhe_test(nx, nw, ny, N, hpA, hpG, hpC, hpLp, hpQ, hpR, hpLe, work);
			d_ric_trf_mhe(nx, nw, ny, N, hpA, hpG, hpC, hpLp, hdLp, hpR, hpQ, hpLe, work);
			}

		gettimeofday(&tv1, NULL); // start

		// solve
		for(rep=0; rep<nrep; rep++)
			{
			d_ric_trs_mhe(nx, nw, ny, N, hpA, hpG, hpC, hpLp, hdLp, hpR, hpQ, hpLe, hr, hq, hf, hxp, hxe, hw, hy, 1, hlam, work);
			}

		gettimeofday(&tv2, NULL); // start

		// factorize
		for(rep=0; rep<nrep; rep++)
			{
			//d_print_pmat(nx, nx, bs, hpLe[N]+(ncl)*bs, cnf);
			//d_print_pmat(nx, nx, bs, hpLp[N]+(nx+nw+pad)*bs, cnl);
			//d_ric_trf_mhe_test(nx, nw, ny, N, hpA, hpG, hpC, hpLp, hpQ, hpR, hpLe, work);
			d_ric_trf_mhe_end(nx, nw, ny, N, hpCA, hpG, hpC, hpLp2, hpR, hpQ, hpLe, work2);
			}

		gettimeofday(&tv3, NULL); // start

		// solve
		for(rep=0; rep<nrep; rep++)
			{
			d_ric_trs_mhe_end(nx, nw, ny, N, hpA, hpG, hpC, hpLp2, hpR, hpQ, hpLe, hr, hq, hf, hxp, hxe, hy, work2);
			}

		gettimeofday(&tv4, NULL); // start

		// factorize information filter
		for(rep=0; rep<nrep; rep++)
			{
			//d_ric_trf_mhe_if(nx, nw, ndN, N, hpQA, hpRG, diag_R, hpALe, hpGLr, Ld, work3);
			d_ric_trf_mhe_if(nx, nw, ndN, N, hpQRAG, diag_R, hpLe2, hpLAG, Ld, work3);
			}

		gettimeofday(&tv5, NULL); // start

		// solve information filter
		for(rep=0; rep<nrep; rep++)
			{
			//d_ric_trs_mhe_if(nx, nw, ndN, N, hpALe, hpGLr, Ld, hqq, hrr, hff, hxp, hxe, hw, hlam, work3);
			d_ric_trs_mhe_if(nx, nw, ndN, N, hpLe2, hpLAG, Ld, hqq, hrr, hff, hxp, hxe, hw, hlam, work3);
			}

		gettimeofday(&tv6, NULL); // start

		// factorize information filter
		for(rep=0; rep<nrep; rep++)
			{
#if defined(REF_BLAS_OPENBLAS) || defined(REF_BLAS_BLIS) || defined(REF_BLAS_NETLIB)
			//d_ric_trf_mhe_if_blas( nx, nw, ndN, N, hA, hG, hQ, hR, hAGU, hUp, hUe, hUr);
			d_ric_trf_mhe_if_blas( nx, nw, ndN, N, hA, hG, hQ, hR, hAGU, hUp, hUe, hUr, Ud);
#endif
			}

		gettimeofday(&tv7, NULL); // start

		// solution information filter
		for(rep=0; rep<nrep; rep++)
			{
#if defined(REF_BLAS_OPENBLAS) || defined(REF_BLAS_BLIS) || defined(REF_BLAS_NETLIB)
			d_ric_trs_mhe_if_blas( nx, nw, ndN, N, hAGU, hUp, hUe, hUr, Ud, hqq, hrr, hff, hxp, hxe, hw, hlam, work_ref);
#endif
			}

		gettimeofday(&tv8, NULL); // start

		float Gflops_max = flops_max * GHz_max;

		float time_trf = (float) (tv1.tv_sec-tv0.tv_sec)/(nrep+0.0)+(tv1.tv_usec-tv0.tv_usec)/(nrep*1e6);
		float time_trs = (float) (tv2.tv_sec-tv1.tv_sec)/(nrep+0.0)+(tv2.tv_usec-tv1.tv_usec)/(nrep*1e6);
		float time_trf_end = (float) (tv3.tv_sec-tv2.tv_sec)/(nrep+0.0)+(tv3.tv_usec-tv2.tv_usec)/(nrep*1e6);
		float time_trs_end = (float) (tv4.tv_sec-tv3.tv_sec)/(nrep+0.0)+(tv4.tv_usec-tv3.tv_usec)/(nrep*1e6);
		float time_trf_if = (float) (tv5.tv_sec-tv4.tv_sec)/(nrep+0.0)+(tv5.tv_usec-tv4.tv_usec)/(nrep*1e6);
		float time_trs_if = (float) (tv6.tv_sec-tv5.tv_sec)/(nrep+0.0)+(tv6.tv_usec-tv5.tv_usec)/(nrep*1e6);
		float time_trf_if_blas = (float) (tv7.tv_sec-tv6.tv_sec)/(nrep+0.0)+(tv7.tv_usec-tv6.tv_usec)/(nrep*1e6);
		float time_trs_if_blas = (float) (tv8.tv_sec-tv7.tv_sec)/(nrep+0.0)+(tv8.tv_usec-tv7.tv_usec)/(nrep*1e6);

		float flop_trf_if = N*(10.0/3.0*nx*nx*nx+nx*nx*nw)+2.0/3.0*nx*nx*nx+ndN*nx*nx+ndN*ndN*nx+1.0/3.0*ndN*ndN*ndN;
		if(diag_R==0)
			flop_trf_if += N*(nx*nw*nw+1.0/3.0*nw*nw*nw);
		else
			flop_trf_if += N*(nx*nw+1.0/2.0*nw*nw);

		float Gflops_trf_if = flop_trf_if*1e-9/time_trf_if;
		float Gflops_trf_if_blas = flop_trf_if*1e-9/time_trf_if_blas;

		if(ll==0)
			{
			printf("\nnx\tnw\tny\tN\ttrf time\ttrs time\ttrf_e time\ttrs_e time\ttrf_if time\ttrf_if Gflops\ttrf_if percent\ttrs_if time\ttrf_if BLAS\tGflops\t\tpercent\t\ttrs_if BLAS\n\n");
//			fprintf(f, "\nnx\tnu\tN\tsv time\t\tsv Gflops\tsv %%\t\ttrs time\ttrs Gflops\ttrs %%\n\n");
			}
		printf("%d\t%d\t%d\t%d\t%e\t%e\t%e\t%e\t%e\t%f\t%f\t%e\t%e\t%f\t%f\t%e\n", nx, nw, ny, N, time_trf, time_trs, time_trf_end, time_trs_end, time_trf_if, Gflops_trf_if, 100*Gflops_trf_if/Gflops_max, time_trs_if, time_trf_if_blas, Gflops_trf_if_blas, 100*Gflops_trf_if_blas/Gflops_max, time_trs_if_blas);


#if 0
		return 0;


		// moving horizon test

		// window size
		N = 20;

		double *(hhxe[N+1]);
		double *(hhxp[N+1]);
		double *(hhw[N]);
		double *(hhy[N+1]);
		double *(hhlam[N]);

		double *p_hhxe; d_zeros_align(&p_hhxe, anx, N+1);
		double *p_hhxp; d_zeros_align(&p_hhxp, anx, N+1);
		double *p_hhw; d_zeros_align(&p_hhw, anw, N);
		double *p_hhlam; d_zeros_align(&p_hhlam, anx, N);

		// shift measurements and initial prediction
		for(ii=0; ii<N; ii++)
			{
			hhxe[ii] = p_hhxe+ii*anx; //d_zeros_align(&hxe[jj], anx, 1);
			hhxp[ii] = p_hhxp+ii*anx; //d_zeros_align(&hxp[jj], anx, 1);
			hhw[ii] = p_hhw+ii*anw; //d_zeros_align(&hw[jj], anw, 1);
			hhy[ii] = hy[ii]; //d_zeros_align(&hy[jj], any, 1);
			hhlam[ii] = p_hhlam+ii*anx; //d_zeros_align(&hlam[jj], anx, 1);
			}
		hhxe[N] = p_hhxe+N*anx; //d_zeros_align(&hxe[jj], anx, 1);
		hhxp[N] = p_hhxp+N*anx; //d_zeros_align(&hxp[jj], anx, 1);
		hhy[N] = hy[N]; //d_zeros_align(&hy[jj], any, 1);

		// shift initial prediction covariance
		//for(ii=0; ii<pnx*cnl; ii++)
		//	hpLp[0][ii] = hpLp[1][ii];

		d_ric_trf_mhe(nx, nw, ny, N, hpA, hpG, hpC, hpLp, hdLp, hpQ, hpR, hpLe, work);
		d_ric_trs_mhe(nx, nw, ny, N, hpA, hpG, hpC, hpLp, hdLp, hpQ, hpR, hpLe, hq, hr, hf, hhxp, hhxe, hhw, hhy, 1, hhlam, work);

		// zero data
		for(ii=0; ii<Ns*anx; ii++)
			hxe[0][ii] = 0.0;

		for(ii=anx; ii<Ns*anx; ii++)
			hxp[0][ii] = 0.0;

		for(ii=0; ii<(Ns-1)*anw; ii++)
			hw[0][ii] = 0.0;

		for(ii=0; ii<(Ns-1)*anx; ii++)
			hlam[0][ii] = 0.0;

		// save data
		for(ii=0; ii<(N+1); ii++)
			for(jj=0; jj<nx; jj++)
				hxe[ii][jj] = hhxe[ii][jj];

		for(ii=0; ii<(N+1); ii++)
			for(jj=0; jj<nx; jj++)
				hxp[ii][jj] = hhxp[ii][jj];

		for(ii=0; ii<N; ii++)
			for(jj=0; jj<nw; jj++)
				hw[ii][jj] = hhw[ii][jj];
		//d_print_mat(nw, N, hw[0], anw);

		for(ii=0; ii<N; ii++)
			for(jj=0; jj<nx; jj++)
				hlam[ii][jj] = hhlam[ii][jj];



		for(jj=1; jj<Ns-N; jj++)
			{

			//break;
			
			// shift measurements and initial prediction
			for(ii=0; ii<=N; ii++)
				{
				hhy[ii] = hy[ii+jj];
				}

			// shift initial prediction and relative covariance
			for(ii=0; ii<nx; ii++)
				hhxp[0][ii] = hhxp[1][ii];
			for(ii=0; ii<pnx*cnl; ii++)
				hpLp[0][ii] = hpLp[1][ii];

			//d_print_mat(nx, N+1, hhxp[0], anx);

			//d_print_pmat(nx, nx, bs, hpLp[1]+(nx+nw+pad)*bs, cnl);
			//d_print_pmat(nz, nz, bs, hpLe[1], cnf);
			//d_print_pmat(nx, nx, bs, hpLp[2]+(nx+nw+pad)*bs, cnl);
			//d_print_pmat(nz, nz, bs, hpLe[2], cnf);

			d_ric_trf_mhe(nx, nw, ny, N, hpA, hpG, hpC, hpLp, hdLp, hpQ, hpR, hpLe, work);
			d_ric_trs_mhe(nx, nw, ny, N, hpA, hpG, hpC, hpLp, hdLp, hpQ, hpR, hpLe, hq, hr, hf, hhxp, hhxe, hhw, hhy, 1, hhlam, work);

			//d_print_mat(nx, N+1, hhxp[0], anx);

			//d_print_pmat(nx, nx, bs, hpLp[0]+(nx+nw+pad)*bs, cnl);
			//d_print_pmat(nz, nz, bs, hpLe[0], cnf);
			//d_print_pmat(nx, nx, bs, hpLp[1]+(nx+nw+pad)*bs, cnl);
			//d_print_pmat(nz, nz, bs, hpLe[1], cnf);

			// save data
			for(ii=0; ii<nx; ii++)
				hxe[N+jj][ii] = hhxe[N][ii];

			for(ii=0; ii<nx; ii++)
				hxp[N+jj][ii] = hhxp[N][ii];

			if(jj<Ns-N-1)
				for(ii=0; ii<nw; ii++)
					hw[N+jj][ii] = hhw[N-1][ii];

			if(jj<Ns-N-1)
				for(ii=0; ii<nx; ii++)
					hlam[N+jj][ii] = hhlam[N-1][ii];

			//break;

			}

		// print solution
		if(PRINTRES)
			{
			printf("\nx_p\n");
			d_print_mat(nx, Ns, hxp[0], anx);
			printf("\nx_e\n");
			d_print_mat(nx, Ns, hxe[0], anx);
			//printf("\nL_e\n");
			//d_print_pmat(nx, nx, bs, hpLp[Ns-1]+(nx+nw+pad)*bs, cnl);
			}

#endif

/************************************************
* return
************************************************/

		free(A);
		free(B);
		free(C);
		free(b);
		free(D);
		free(d);
		free(x0);
		free(Q);
		free(Qx);
		free(R);
		free(q);
		free(r);
		free(f);
		free(L0);
		free(pA);
		free(pG);
		free(pC);
		free(pQ);
		free(pR);
		free(pQA);
		free(pRG);
		free(work);
		free(work2);
		free(work3);
		free(work4);
		free(p_hxe);
		free(p_hxp);
		free(p_hy);
		free(p_hw);
		free(p_hlam);
		//free(p_hhxe);
		//free(p_hhxp);
		//free(p_hhw);
		//free(p_hhlam);
		free(x_temp);
		free(y_temp);
		free(p0);
		free(p_hr_res);
		free(p_hq_res);
		free(p_hf_res);
		free(pL0_inv);
		free(hpLp[0]);
		free(hdLp[0]);
		free(hpLe[0]);
		for(jj=0; jj<N; jj++)
			{
			free(hpLp[jj+1]);
			free(hdLp[jj+1]);
			free(hpLe[jj+1]);
			free(hpGLr[jj]);
			free(hpALe[jj]);
			free(hpLp2[jj]);
			}
		free(hpALe[N]);


		free(pQRAG);
		free(pQD);
		for(ii=0; ii<N; ii++)
			{
			free(hpLAG[ii]);
			free(hpLe2[ii]);
			}
		free(hpLAG[N]);
		free(hpLe2[N]);

		for(ii=0; ii<N; ii++)
			{
			free(hAGU[ii]);
			free(hUp[ii]);
			free(hUe[ii]);
			free(hUr[ii]);
			}
		free(hUp[N]);
		free(hUe[N]);
		free(Ud);
		free(work_ref);


		} // increase size

	fprintf(f, "];\n");
	fclose(f);


	return 0;

	}

예제 #7

파일 보기

파일: test_blas_d.c 프로젝트: wuyou33/hpmpc

int main()
	{
		
#if defined(REF_BLAS_OPENBLAS)
	openblas_set_num_threads(1);
#endif
#if defined(REF_BLAS_BLIS)
	omp_set_num_threads(1);
#endif

	printf("\n");
	printf("\n");
	printf("\n");
	printf(" HPMPC -- Library for High-Performance implementation of solvers for MPC.\n");
	printf(" Copyright (C) 2014-2015 by Technical University of Denmark. All rights reserved.\n");
	printf("\n");
	printf(" HPMPC is distributed in the hope that it will be useful,\n");
	printf(" but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
	printf(" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n");
	printf(" See the GNU Lesser General Public License for more details.\n");
	printf("\n");
	printf("\n");
	printf("\n");

	printf("BLAS performance test - double precision\n");
	printf("\n");

	// maximum frequency of the processor
	const float GHz_max = GHZ_MAX;
	printf("Frequency used to compute theoretical peak: %5.1f GHz (edit test_param.h to modify this value).\n", GHz_max);
	printf("\n");

	// maximum flops per cycle, double precision
#if defined(TARGET_X64_AVX2)
	const float flops_max = 16;
	printf("Testing BLAS version for AVX2 & FMA3 instruction sets, 64 bit: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#elif defined(TARGET_X64_AVX)
	const float flops_max = 8;
	printf("Testing BLAS version for AVX instruction set, 64 bit: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#elif defined(TARGET_X64_SSE3) || defined(TARGET_AMD_SSE3)
	const float flops_max = 4;
	printf("Testing BLAS version for SSE3 instruction set, 64 bit: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#elif defined(TARGET_CORTEX_A15)
	const float flops_max = 2;
	printf("Testing solvers for ARMv7a VFPv3 instruction set, oprimized for Cortex A15: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#elif defined(TARGET_CORTEX_A9)
	const float flops_max = 1;
	printf("Testing solvers for ARMv7a VFPv3 instruction set, oprimized for Cortex A9: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#elif defined(TARGET_CORTEX_A7)
	const float flops_max = 0.5;
	printf("Testing solvers for ARMv7a VFPv3 instruction set, oprimized for Cortex A7: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#elif defined(TARGET_X86_ATOM)
	const float flops_max = 1;
	printf("Testing BLAS version for SSE3 instruction set, 32 bit, optimized for Intel Atom: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#elif defined(TARGET_POWERPC_G2)
	const float flops_max = 1;
	printf("Testing BLAS version for POWERPC instruction set, 32 bit: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#elif defined(TARGET_C99_4X4)
	const float flops_max = 2;
	printf("Testing reference BLAS version, 4x4 kernel: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#elif defined(TARGET_C99_4X4_PREFETCH)
	const float flops_max = 2;
	printf("Testing reference BLAS version, 4x4 kernel with register prefetch: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#elif defined(TARGET_C99_2X2)
	const float flops_max = 2;
	printf("Testing reference BLAS version, 2x2 kernel: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#endif
	
	FILE *f;
	f = fopen("./test_problems/results/test_blas.m", "w"); // a

#if defined(TARGET_X64_AVX2)
	fprintf(f, "C = 'd_x64_avx2';\n");
	fprintf(f, "\n");
#elif defined(TARGET_X64_AVX)
	fprintf(f, "C = 'd_x64_avx';\n");
	fprintf(f, "\n");
#elif defined(TARGET_X64_SSE3) || defined(TARGET_AMD_SSE3)
	fprintf(f, "C = 'd_x64_sse3';\n");
	fprintf(f, "\n");
#elif defined(TARGET_CORTEX_A9)
	fprintf(f, "C = 'd_ARM_cortex_A9';\n");
	fprintf(f, "\n");
#elif defined(TARGET_CORTEX_A7)
	fprintf(f, "C = 'd_ARM_cortex_A7';\n");
	fprintf(f, "\n");
#elif defined(TARGET_CORTEX_A15)
	fprintf(f, "C = 'd_ARM_cortex_A15';\n");
	fprintf(f, "\n");
#elif defined(TARGET_X86_ATOM)
	fprintf(f, "C = 'd_x86_atom';\n");
	fprintf(f, "\n");
#elif defined(TARGET_POWERPC_G2)
	fprintf(f, "C = 'd_PowerPC_G2';\n");
	fprintf(f, "\n");
#elif defined(TARGET_C99_4X4)
	fprintf(f, "C = 'd_c99_4x4';\n");
	fprintf(f, "\n");
#elif defined(TARGET_C99_4X4_PREFETCH)
	fprintf(f, "C = 'd_c99_4x4';\n");
	fprintf(f, "\n");
#elif defined(TARGET_C99_2X2)
	fprintf(f, "C = 'd_c99_2x2';\n");
	fprintf(f, "\n");
#endif

	fprintf(f, "A = [%f %f];\n", GHz_max, flops_max);
	fprintf(f, "\n");

	fprintf(f, "B = [\n");
	


	int i, j, rep, ll;
	
	const int bsd = D_MR; //d_get_mr();

/*	int info = 0;*/
	
	printf("\nn\t  kernel_dgemm\t  dgemm\t\t  dsyrk_dpotrf\t  dtrmm\t\t  dtrtr\t\t  dgemv_n\t  dgemv_t\t  dtrmv_n\t  dtrmv_t\t  dtrsv_n\t  dtrsv_t\t  dsymv\t\t  dgemv_nt\t\t  dsyrk+dpotrf\t  BLAS dgemm\t  BLAS dgemv_n\t  BLAS dgemv_t\n");
	printf("\nn\t Gflops\t    %%\t Gflops\t    %%\t Gflops\t    %%\t Gflops\t    %%\t Gflops\t    %%\t Gflops\t    %%\t Gflops\t    %%\t Gflops\t    %%\t Gflops\t    %%\t Gflops\t    %%\t Gflops\t    %%\t Gflops\t    %%\t Gflops\t    %%\t Gflops\t    %%\t Gflops\t    %%\t Gflops\t    %%\t Gflops\t    %%\n\n");
	
#if 1
	int nn[] = {4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92, 96, 100, 104, 108, 112, 116, 120, 124, 128, 132, 136, 140, 144, 148, 152, 156, 160, 164, 168, 172, 176, 180, 184, 188, 192, 196, 200, 204, 208, 212, 216, 220, 224, 228, 232, 236, 240, 244, 248, 252, 256, 260, 264, 268, 272, 276, 280, 284, 288, 292, 296, 300, 304, 308, 312, 316, 320, 324, 328, 332, 336, 340, 344, 348, 352, 356, 360, 364, 368, 372, 376, 380, 384, 388, 392, 396, 400, 404, 408, 412, 416, 420, 424, 428, 432, 436, 440, 444, 448, 452, 456, 460, 500, 550, 600, 650, 700};
	int nnrep[] = {10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 400, 400, 400, 400, 400, 200, 200, 200, 200, 200, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 20, 20, 20, 20, 20, 20, 20, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 4, 4, 4, 4, 4};
	
	for(ll=0; ll<75; ll++)
//	for(ll=0; ll<115; ll++)
//	for(ll=0; ll<120; ll++)

		{

		int n = nn[ll];
		int nrep = nnrep[ll];

#else
	int nn[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24};
	
	for(ll=0; ll<24; ll++)

		{

		int n = nn[ll];
		int nrep = 40000; //nnrep[ll];
#endif


#if defined(REF_BLAS_BLIS)
		f77_int n77 = n;
#endif
	
		double *A; d_zeros(&A, n, n);
		double *B; d_zeros(&B, n, n);
		double *C; d_zeros(&C, n, n);
		double *M; d_zeros(&M, n, n);

		char c_n = 'n';
		char c_t = 't';
		int i_1 = 1;
#if defined(REF_BLAS_BLIS)
		f77_int i77_1 = i_1;
#endif
		double d_1 = 1;
		double d_0 = 0;
	
		for(i=0; i<n*n; i++)
			A[i] = i;
	
		for(i=0; i<n; i++)
			B[i*(n+1)] = 1;
	
		for(i=0; i<n*n; i++)
			M[i] = 1;
	
		int pnd = ((n+bsd-1)/bsd)*bsd;	
		int cnd = ((n+D_NCL-1)/D_NCL)*D_NCL;	
		int cnd2 = 2*((n+D_NCL-1)/D_NCL)*D_NCL;	
		int pad = (D_NCL-n%D_NCL)%D_NCL;

		double *pA; d_zeros_align(&pA, pnd, cnd);
		double *pB; d_zeros_align(&pB, pnd, cnd);
		double *pC; d_zeros_align(&pC, pnd, cnd);
		double *pD; d_zeros_align(&pD, pnd, cnd);
		double *pE; d_zeros_align(&pE, pnd, cnd2);
		double *pF; d_zeros_align(&pF, 2*pnd, cnd);
		double *pL; d_zeros_align(&pL, pnd, cnd);
		double *pM; d_zeros_align(&pM, pnd, cnd);
		double *x; d_zeros_align(&x, pnd, 1);
		double *y; d_zeros_align(&y, pnd, 1);
		double *x2; d_zeros_align(&x2, pnd, 1);
		double *y2; d_zeros_align(&y2, pnd, 1);
		double *diag; d_zeros_align(&diag, pnd, 1);
	
		d_cvt_mat2pmat(n, n, A, n, 0, pA, cnd);
		d_cvt_mat2pmat(n, n, B, n, 0, pB, cnd);
		d_cvt_mat2pmat(n, n, B, n, 0, pD, cnd);
		d_cvt_mat2pmat(n, n, A, n, 0, pE, cnd2);
		d_cvt_mat2pmat(n, n, M, n, 0, pM, cnd);
/*		d_cvt_mat2pmat(n, n, B, n, 0, pE+n*bsd, pnd);*/
		
/*		d_print_pmat(n, 2*n, bsd, pE, 2*pnd);*/
/*		exit(2);*/
	
		for(i=0; i<pnd*cnd; i++) pC[i] = -1;
		
		for(i=0; i<pnd; i++) x[i] = 1;
		for(i=0; i<pnd; i++) x2[i] = 1;

		double *dummy;

		/* timing */
		struct timeval tvm1, tv0, tv1, tv2, tv3, tv4, tv5, tv6, tv7, tv8, tv9, tv10, tv11, tv12, tv13, tv14, tv15, tv16;

		/* warm up */
		for(rep=0; rep<nrep; rep++)
			{
			dgemm_nt_lib(n, n, n, pA, cnd, pB, cnd, 1, pC, cnd, pC, cnd, 1, 1);
			}

		gettimeofday(&tvm1, NULL); // start
	
		for(rep=0; rep<nrep; rep++)
			{

			//dgemm_kernel_nt_lib(n, n, n, pA, cnd, pB, cnd, pC, cnd, pC, cnd, 0, 0, 0);
			dgemm_nn_lib(n, n, n, pA, cnd, pB, cnd, 0, pC, cnd, pC, cnd, 0, 0);

			}

		gettimeofday(&tv0, NULL); // start
	
		for(rep=0; rep<nrep; rep++)
			{

			dgemm_nt_lib(n, n, n, pA, cnd, pB, cnd, 0, pC, cnd, pC, cnd, 0, 0);

			}
	
		gettimeofday(&tv1, NULL); // stop

		for(rep=0; rep<nrep; rep++)
			{

			//dsyrk_dpotrf_lib(n, n, n, pA, cnd, 1, pD, cnd, pC, cnd, diag, 0);
			dsyrk_dpotrf_lib_new(n, n, n, pA, cnd, pA, cnd, 1, pD, cnd, pC, cnd, diag);

			}
	
		gettimeofday(&tv2, NULL); // stop

		for(rep=0; rep<nrep; rep++)
			{

			dtrmm_nt_u_lib(n, n, pA, cnd, pB, cnd, pC, cnd);

			}
	
		gettimeofday(&tv3, NULL); // stop

		for(rep=0; rep<nrep; rep++)
			{

			dtrtr_l_lib(n, 0, pA, cnd, pC, cnd); // triangualr matrix transpose
			//dgetr_lib(n, n, 0, pA, cnd, 0, pC, cnd); // general matrix transpose

			}
	
		gettimeofday(&tv4, NULL); // stop

		for(rep=0; rep<nrep; rep++)
			{

			dgemv_n_lib(n, n, pA, cnd, x, 0, y, y);

			}
	
		gettimeofday(&tv5, NULL); // stop

		for(rep=0; rep<nrep; rep++)
			{

			dgemv_t_lib(n, n, pA, cnd, x, 0, y, y);

			}
	
		gettimeofday(&tv6, NULL); // stop

		for(rep=0; rep<nrep; rep++)
			{

			dtrmv_u_n_lib(n, pA, cnd, x, 0, y);

			}
	
		gettimeofday(&tv7, NULL); // stop


		for(rep=0; rep<nrep; rep++)
			{

			dtrmv_u_t_lib(n, pA, cnd, x, 0, y);

			}
	
		gettimeofday(&tv8, NULL); // stop


		for(rep=0; rep<nrep; rep++)
			{

			dtrsv_n_lib(2*n, n, 1, pF, cnd, x);

			}
	
		gettimeofday(&tv9, NULL); // stop

		for(rep=0; rep<nrep; rep++)
			{

			dtrsv_t_lib(2*n, n, 1, pF, cnd, x);

			}
	
		gettimeofday(&tv10, NULL); // stop

		for(rep=0; rep<nrep; rep++)
			{

			dsymv_lib(n, n, pA, cnd, x, 0, y, y);

			}
	
		gettimeofday(&tv11, NULL); // stop

		for(rep=0; rep<nrep; rep++)
			{

			dgemv_nt_lib(n, n, pA, cnd, x, x2, 0, y, y2, y, y2);

			}
	
		gettimeofday(&tv12, NULL); // stop

		for(rep=0; rep<nrep; rep++)
			{

			dsyrk_nt_lib(n, n, n, pE, cnd2, pE, cnd2, 1, pD, cnd, pE+(n+pad)*bsd, cnd2);
			//dpotrf_lib(n, n, pE+(n+pad)*bsd, cnd2, pE+(n+pad)*bsd, cnd2, diag);
			dpotrf_lib_new(n, n, pE+(n+pad)*bsd, cnd2, pE+(n+pad)*bsd, cnd2, diag);
			//d_print_pmat(pnd, cnd2, bsd, pE, cnd2);
			//exit(1);
			//break;

			}
	
		gettimeofday(&tv13, NULL); // stop
	
		for(rep=0; rep<nrep; rep++)
			{
#if defined(REF_BLAS_OPENBLAS) || defined(REF_BLAS_NETLIB)
			dgemm_(&c_n, &c_n, &n, &n, &n, &d_1, A, &n, M, &n, &d_0, C, &n);
#endif
#if defined(REF_BLAS_BLIS)
			dgemm_(&c_n, &c_n, &n77, &n77, &n77, &d_1, A, &n77, B, &n77, &d_0, C, &n77);
#endif
			}

		gettimeofday(&tv14, NULL); // stop

		for(rep=0; rep<nrep; rep++)
			{
#if defined(REF_BLAS_OPENBLAS) || defined(REF_BLAS_NETLIB)
			dgemv_(&c_n, &n, &n, &d_1, A, &n, x2, &i_1, &d_0, y, &i_1);
#endif
#if defined(REF_BLAS_BLIS)
			dgemv_(&c_n, &n77, &n77, &d_1, A, &n77, x2, &i77_1, &d_0, y, &i77_1);
#endif
			}

		gettimeofday(&tv15, NULL); // stop

		for(rep=0; rep<nrep; rep++)
			{
#if defined(REF_BLAS_OPENBLAS) || defined(REF_BLAS_NETLIB)
			dgemv_(&c_t, &n, &n, &d_1, A, &n, x2, &i_1, &d_0, y, &i_1);
#endif
#if defined(REF_BLAS_BLIS)
			dgemv_(&c_t, &n77, &n77, &d_1, A, &n77, x2, &i77_1, &d_0, y, &i77_1);
#endif
			}

		gettimeofday(&tv16, NULL); // stop



		float Gflops_max = flops_max * GHz_max;

		float time_dgemm_kernel = (float) (tv0.tv_sec-tvm1.tv_sec)/(nrep+0.0)+(tv0.tv_usec-tvm1.tv_usec)/(nrep*1e6);
		float flop_dgemm_kernel = 2.0*n*n*n;
		float Gflops_dgemm_kernel = 1e-9*flop_dgemm_kernel/time_dgemm_kernel;

		float time_dgemm = (float) (tv1.tv_sec-tv0.tv_sec)/(nrep+0.0)+(tv1.tv_usec-tv0.tv_usec)/(nrep*1e6);
		float flop_dgemm = 2.0*n*n*n;
		float Gflops_dgemm = 1e-9*flop_dgemm/time_dgemm;

		float time_dsyrk_dpotrf = (float) (tv2.tv_sec-tv1.tv_sec)/(nrep+0.0)+(tv2.tv_usec-tv1.tv_usec)/(nrep*1e6);
		float flop_dsyrk_dpotrf = 1.0*n*n*n + 1.0/3.0*n*n*n;
		float Gflops_dsyrk_dpotrf = 1e-9*flop_dsyrk_dpotrf/time_dsyrk_dpotrf;

		float time_dtrmm = (float) (tv3.tv_sec-tv2.tv_sec)/(nrep+0.0)+(tv3.tv_usec-tv2.tv_usec)/(nrep*1e6);
		float flop_dtrmm = 1.0*n*n*n;
		float Gflops_dtrmm = 1e-9*flop_dtrmm/time_dtrmm;
	
		float time_dtrtr = (float) (tv4.tv_sec-tv3.tv_sec)/(nrep+0.0)+(tv4.tv_usec-tv3.tv_usec)/(nrep*1e6);
		float flop_dtrtr = 0.5*n*n;
		float Gflops_dtrtr = 1e-9*flop_dtrtr/time_dtrtr;

		float time_dgemv_n = (float) (tv5.tv_sec-tv4.tv_sec)/(nrep+0.0)+(tv5.tv_usec-tv4.tv_usec)/(nrep*1e6);
		float flop_dgemv_n = 2.0*n*n;
		float Gflops_dgemv_n = 1e-9*flop_dgemv_n/time_dgemv_n;

		float time_dgemv_t = (float) (tv6.tv_sec-tv5.tv_sec)/(nrep+0.0)+(tv6.tv_usec-tv5.tv_usec)/(nrep*1e6);
		float flop_dgemv_t = 2.0*n*n;
		float Gflops_dgemv_t = 1e-9*flop_dgemv_t/time_dgemv_t;

		float time_dtrmv_n = (float) (tv7.tv_sec-tv6.tv_sec)/(nrep+0.0)+(tv7.tv_usec-tv6.tv_usec)/(nrep*1e6);
		float flop_dtrmv_n = 1.0*n*n;
		float Gflops_dtrmv_n = 1e-9*flop_dtrmv_n/time_dtrmv_n;

		float time_dtrmv_t = (float) (tv8.tv_sec-tv7.tv_sec)/(nrep+0.0)+(tv8.tv_usec-tv7.tv_usec)/(nrep*1e6);
		float flop_dtrmv_t = 1.0*n*n;
		float Gflops_dtrmv_t = 1e-9*flop_dtrmv_t/time_dtrmv_t;

		float time_dtrsv_n = (float) (tv9.tv_sec-tv8.tv_sec)/(nrep+0.0)+(tv9.tv_usec-tv8.tv_usec)/(nrep*1e6);
		float flop_dtrsv_n = 3.0*n*n;
		float Gflops_dtrsv_n = 1e-9*flop_dtrsv_n/time_dtrsv_n;

		float time_dtrsv_t = (float) (tv10.tv_sec-tv9.tv_sec)/(nrep+0.0)+(tv10.tv_usec-tv9.tv_usec)/(nrep*1e6);
		float flop_dtrsv_t = 3.0*n*n;
		float Gflops_dtrsv_t = 1e-9*flop_dtrsv_t/time_dtrsv_t;

		float time_dsymv = (float) (tv11.tv_sec-tv10.tv_sec)/(nrep+0.0)+(tv11.tv_usec-tv10.tv_usec)/(nrep*1e6);
		float flop_dsymv = 2.0*n*n;
		float Gflops_dsymv = 1e-9*flop_dsymv/time_dsymv;

		float time_dgemv_nt = (float) (tv12.tv_sec-tv11.tv_sec)/(nrep+0.0)+(tv12.tv_usec-tv11.tv_usec)/(nrep*1e6);
		float flop_dgemv_nt = 4.0*n*n;
		float Gflops_dgemv_nt = 1e-9*flop_dgemv_nt/time_dgemv_nt;

		float time_dsyrk_dpotrf2 = (float) (tv13.tv_sec-tv12.tv_sec)/(nrep+0.0)+(tv13.tv_usec-tv12.tv_usec)/(nrep*1e6);
		float flop_dsyrk_dpotrf2 = 1.0*n*n*n + 1.0/3.0*n*n*n;
		float Gflops_dsyrk_dpotrf2 = 1e-9*flop_dsyrk_dpotrf2/time_dsyrk_dpotrf2;

		float time_dgemm_blas = (float) (tv14.tv_sec-tv13.tv_sec)/(nrep+0.0)+(tv14.tv_usec-tv13.tv_usec)/(nrep*1e6);
		float flop_dgemm_blas = 2.0*n*n*n;
		float Gflops_dgemm_blas = 1e-9*flop_dgemm_blas/time_dgemm_blas;

		float time_dgemv_n_blas = (float) (tv15.tv_sec-tv14.tv_sec)/(nrep+0.0)+(tv15.tv_usec-tv14.tv_usec)/(nrep*1e6);
		float flop_dgemv_n_blas = 2.0*n*n;
		float Gflops_dgemv_n_blas = 1e-9*flop_dgemv_n_blas/time_dgemv_n_blas;

		float time_dgemv_t_blas = (float) (tv16.tv_sec-tv15.tv_sec)/(nrep+0.0)+(tv16.tv_usec-tv15.tv_usec)/(nrep*1e6);
		float flop_dgemv_t_blas = 2.0*n*n;
		float Gflops_dgemv_t_blas = 1e-9*flop_dgemv_t_blas/time_dgemv_t_blas;

		printf("%d\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\n", n, Gflops_dgemm_kernel, 100.0*Gflops_dgemm_kernel/Gflops_max, Gflops_dgemm, 100.0*Gflops_dgemm/Gflops_max, Gflops_dsyrk_dpotrf, 100.0*Gflops_dsyrk_dpotrf/Gflops_max, Gflops_dtrmm, 100.0*Gflops_dtrmm/Gflops_max, Gflops_dtrtr, 100.0*Gflops_dtrtr/Gflops_max, Gflops_dgemv_n, 100.0*Gflops_dgemv_n/Gflops_max, Gflops_dgemv_t, 100.0*Gflops_dgemv_t/Gflops_max, Gflops_dtrmv_n, 100.0*Gflops_dtrmv_n/Gflops_max, Gflops_dtrmv_t, 100.0*Gflops_dtrmv_t/Gflops_max, Gflops_dtrsv_n, 100.0*Gflops_dtrsv_n/Gflops_max, Gflops_dtrsv_t, 100.0*Gflops_dtrsv_t/Gflops_max, Gflops_dsymv, 100.0*Gflops_dsymv/Gflops_max, Gflops_dgemv_nt, 100.0*Gflops_dgemv_nt/Gflops_max, Gflops_dsyrk_dpotrf2, 100.0*Gflops_dsyrk_dpotrf2/Gflops_max, Gflops_dgemm_blas, 100.0*Gflops_dgemm_blas/Gflops_max, Gflops_dgemv_n_blas, 100.0*Gflops_dgemv_n_blas/Gflops_max, Gflops_dgemv_t_blas, 100.0*Gflops_dgemv_t_blas/Gflops_max);

	fprintf(f, "%d\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\n", n, Gflops_dgemm_kernel, 100.0*Gflops_dgemm_kernel/Gflops_max, Gflops_dgemm, 100.0*Gflops_dgemm/Gflops_max, Gflops_dsyrk_dpotrf, 100.0*Gflops_dsyrk_dpotrf/Gflops_max, Gflops_dtrmm, 100.0*Gflops_dtrmm/Gflops_max, Gflops_dtrtr, 100.0*Gflops_dtrtr/Gflops_max, Gflops_dgemv_n, 100.0*Gflops_dgemv_n/Gflops_max, Gflops_dgemv_t, 100.0*Gflops_dgemv_t/Gflops_max, Gflops_dtrmv_n, 100.0*Gflops_dtrmv_n/Gflops_max, Gflops_dtrmv_t, 100.0*Gflops_dtrmv_t/Gflops_max, Gflops_dtrsv_n, 100.0*Gflops_dtrsv_n/Gflops_max, Gflops_dtrsv_t, 100.0*Gflops_dtrsv_t/Gflops_max, Gflops_dsymv, 100.0*Gflops_dsymv/Gflops_max, Gflops_dgemv_nt, 100.0*Gflops_dgemv_nt/Gflops_max, Gflops_dsyrk_dpotrf2, 100.0*Gflops_dsyrk_dpotrf2/Gflops_max, Gflops_dgemm_blas, 100.0*Gflops_dgemm_blas/Gflops_max, Gflops_dgemv_n_blas, 100.0*Gflops_dgemv_n_blas/Gflops_max, Gflops_dgemv_t_blas, 100.0*Gflops_dgemv_t_blas/Gflops_max);

		free(A);
		free(B);
		free(M);
		free(pA);
		free(pB);
		free(pC);
		free(pD);
		free(pE);
		free(pF);
		free(pL);
		free(pM);
		free(x);
		free(y);
		free(x2);
		free(y2);
		
		}

	printf("\n");

	fprintf(f, "];\n");
	fclose(f);

	return 0;
	
	}

예제 #8

파일 보기

파일: d_res_ip_res_hard.c 프로젝트: mkotlyar/hpmpc

/*
 * Box-constraint part of the residuals of stage ii.
 *
 * Does nothing and returns 0 when nb0 <= 0; in that case none of the
 * per-stage arrays is dereferenced.  Otherwise, with pnb = nb0 rounded up to
 * a multiple of D_MR, for every jj in [0, nb0) and k = idxb[ii][jj]:
 *   hrq[ii][k]      += -hlam[ii][jj] + hlam[ii][pnb+jj]
 *   hrd[ii][jj]      = hd[ii][jj]     - hux[ii][k] + ht[ii][jj]
 *   hrd[ii][pnb+jj]  = hd[ii][pnb+jj] - hux[ii][k] - ht[ii][pnb+jj]
 *   hrm[ii][jj]      = hlam[ii][jj]     * ht[ii][jj]
 *   hrm[ii][pnb+jj]  = hlam[ii][pnb+jj] * ht[ii][pnb+jj]
 * and both hrm entries are added to *mu_acc.
 *
 * Returns nb0, i.e. the number of constraint pairs that were accumulated.
 */
static int d_res_res_hard_box_stage(int ii, int nb0, int **idxb, double **hd, double **hux, double **hlam, double **ht, double **hrq, double **hrd, double **hrm, double *mu_acc)
	{

	const int bs = D_MR;

	int jj, idx, pnb;
	int *idx_b;
	double *d, *ux, *lam, *t, *rq, *rd, *rm;

	if(nb0<=0)
		return 0;

	pnb = (nb0+bs-1)/bs*bs; // offset of the second half of each constraint vector

	idx_b = idxb[ii];
	d   = hd[ii];
	ux  = hux[ii];
	lam = hlam[ii];
	t   = ht[ii];
	rq  = hrq[ii];
	rd  = hrd[ii];
	rm  = hrm[ii];

	for(jj=0; jj<nb0; jj++)
		{
		idx = idx_b[jj];

		rq[idx] += - lam[jj] + lam[pnb+jj];

		rd[jj]     = d[jj]     - ux[idx] + t[jj];
		rd[pnb+jj] = d[pnb+jj] - ux[idx] - t[pnb+jj];

		rm[jj]     = lam[jj]     * t[jj];
		rm[pnb+jj] = lam[pnb+jj] * t[pnb+jj];
		*mu_acc += rm[jj] + rm[pnb+jj];
		}

	return nb0;

	}



/*
 * General-constraint part of the residuals of stage ii.
 *
 * Does nothing and returns 0 when ng0 <= 0; in that case none of the
 * per-stage arrays is dereferenced.  Otherwise, with pnb / png = nb0 / ng0
 * rounded up to a multiple of D_MR, the two halves of the general-constraint
 * data live at offsets 2*pnb and 2*pnb+png of hd[ii], hlam[ii], ht[ii],
 * hrd[ii] and hrm[ii].
 *
 * work[0..ng0) receives the difference of the two multiplier halves and
 * work[png..png+ng0) is used as the second output vector of dgemv_nt_lib,
 * which is then subtracted from both halves of hrd.
 * NOTE(review): dgemv_nt_lib is not visible here; presumably the flags 1, 0
 * mean "accumulate into hrq[ii]" and "overwrite work+png" - confirm against
 * its prototype, as well as how many entries of work it may touch.
 *
 * nux0 is the first size argument forwarded to dgemv_nt_lib (the caller
 * passes nu+nx of the stage).
 *
 * Returns ng0, i.e. the number of constraint pairs that were accumulated
 * into *mu_acc.
 */
static int d_res_res_hard_gen_stage(int ii, int nux0, int nb0, int ng0, double **hpDCt, double **hd, double **hux, double **hlam, double **ht, double *work, double **hrq, double **hrd, double **hrm, double *mu_acc)
	{

	const int bs = D_MR;
	const int ncl = D_NCL;

	int jj, pnb, png, cng;
	double *d_l, *d_u, *lam_l, *lam_u, *t_l, *t_u, *rd_l, *rd_u, *rm_l, *rm_u, *res;

	if(ng0<=0)
		return 0;

	pnb = (nb0+bs-1)/bs*bs;
	png = (ng0+bs-1)/bs*bs;
	cng = (ng0+ncl-1)/ncl*ncl;

	// first ("_l") and second ("_u") half of the general-constraint data
	d_l   = hd[ii]   + 2*pnb;	d_u   = d_l   + png;
	lam_l = hlam[ii] + 2*pnb;	lam_u = lam_l + png;
	t_l   = ht[ii]   + 2*pnb;	t_u   = t_l   + png;
	rd_l  = hrd[ii]  + 2*pnb;	rd_u  = rd_l  + png;
	rm_l  = hrm[ii]  + 2*pnb;	rm_u  = rm_l  + png;
	res   = work + png;

	for(jj=0; jj<ng0; jj++)
		{
		work[jj] = lam_u[jj] - lam_l[jj];

		rd_l[jj] = d_l[jj] + t_l[jj];
		rd_u[jj] = d_u[jj] - t_u[jj];

		rm_l[jj] = lam_l[jj] * t_l[jj];
		rm_u[jj] = lam_u[jj] * t_u[jj];
		*mu_acc += rm_l[jj] + rm_u[jj];
		}

	dgemv_nt_lib(nux0, ng0, hpDCt[ii], cng, work, hux[ii], 1, 0, hrq[ii], res, hrq[ii], res);

	for(jj=0; jj<ng0; jj++)
		{
		rd_l[jj] -= res[jj];
		rd_u[jj] -= res[jj];
		}

	return ng0;

	}



/*
 * Residuals of a hard-constrained MPC problem whose sizes may change
 * stage-wise (nx, nu, nb, ng are per-stage arrays).
 *
 * For every stage ii = 0..N the routine fills
 *   hrq[ii] : started from hq[ii] (minus hpi[ii-1] on the state part for
 *             ii > 0), then updated by the box multipliers, by dsymv_lib with
 *             hpQ[ii] and hux[ii], by dgemv_nt_lib with hpBAbt[ii] and hpi[ii]
 *             (ii < N only) and by dgemv_nt_lib with hpDCt[ii];
 *   hrb[ii] : (ii < N only) started from hb[ii] - hux[ii+1][nu[ii+1]..] and
 *             updated by dgemv_nt_lib with hpBAbt[ii] and hux[ii];
 *   hrd[ii] : constraint residuals built from hd, hux and ht;
 *   hrm[ii] : element-wise products hlam[ii] * ht[ii].
 * mu[0] is set to the sum of all hrm entries divided by twice the number of
 * constraint pairs; it is left untouched when there are no constraints.
 *
 * Layout of hd / hlam / ht / hrd / hrm per stage, with pnb and png being
 * nb[ii] and ng[ii] rounded up to a multiple of D_MR:
 *   [0, nb)  [pnb, pnb+nb)  [2*pnb, 2*pnb+ng)  [2*pnb+png, 2*pnb+png+ng)
 *
 * work is scratch space used only when a stage has ng[ii] > 0.
 *
 * NOTE(review): dsymv_lib / dgemv_nt_lib are defined elsewhere; the flag
 * value 1 presumably means "add to the supplied vector" - confirm.
 * NOTE(review): the last stage reads hpi[N-1], so N >= 1 is required.
 * NOTE(review): at the last stage only hrq[N][nu[N]..nu[N]+nx[N]) is
 * initialised here and dsymv_lib is applied from row/column nu[N]/D_MR*D_MR
 * on; presumably nu[N] is expected to be 0 - verify against the callers.
 */
void d_res_res_mpc_hard_tv(int N, int const * nx, int const * nu, int const * nb, int **idxb, int const * ng, double **hpBAbt, double **hb, double **hpQ, double **hq, double **hux, double **hpDCt, double **hd, double **hpi, double **hlam, double **ht, double *work, double **hrq, double **hrb, double **hrd, double **hrm, double *mu)
	{

	const int bs = D_MR;
	const int ncl = D_NCL;

	int ii, jj;

	int nu0, nu1, nx0, nx1, cnx1, cnux0, nu_al;

	// running number of constraint pairs and running sum of lam*t products
	int nb_tot = 0;
	double mu2 = 0;



	// first stage: no multiplier from a previous stage enters res_q
	ii = 0;
	nu0 = nu[ii];
	nu1 = nu[ii+1];
	nx0 = nx[ii];
	nx1 = nx[ii+1];
	cnx1  = (nx1+ncl-1)/ncl*ncl;
	cnux0 = (nu0+nx0+ncl-1)/ncl*ncl;

	for(jj=0; jj<nu0+nx0; jj++)
		hrq[ii][jj] = hq[ii][jj];

	nb_tot += d_res_res_hard_box_stage(ii, nb[ii], idxb, hd, hux, hlam, ht, hrq, hrd, hrm, &mu2);

	dsymv_lib(nu0+nx0, nu0+nx0, hpQ[ii], cnux0, hux[ii], 1, hrq[ii], hrq[ii]);

	for(jj=0; jj<nx1; jj++)
		hrb[ii][jj] = hb[ii][jj] - hux[ii+1][nu1+jj];

	dgemv_nt_lib(nu0+nx0, nx1, hpBAbt[ii], cnx1, hpi[ii], hux[ii], 1, 1, hrq[ii], hrb[ii], hrq[ii], hrb[ii]);

	nb_tot += d_res_res_hard_gen_stage(ii, nu0+nx0, nb[ii], ng[ii], hpDCt, hd, hux, hlam, ht, work, hrq, hrd, hrm, &mu2);



	// middle stages
	for(ii=1; ii<N; ii++)
		{
		// sizes of the next stage of the previous iteration become the current ones
		nu0 = nu1;
		nu1 = nu[ii+1];
		nx0 = nx1;
		nx1 = nx[ii+1];
		cnx1  = (nx1+ncl-1)/ncl*ncl;
		cnux0 = (nu0+nx0+ncl-1)/ncl*ncl;

		for(jj=0; jj<nu0; jj++)
			hrq[ii][jj] = + hq[ii][jj];

		for(jj=0; jj<nx0; jj++)
			hrq[ii][nu0+jj] = + hq[ii][nu0+jj] - hpi[ii-1][jj];

		nb_tot += d_res_res_hard_box_stage(ii, nb[ii], idxb, hd, hux, hlam, ht, hrq, hrd, hrm, &mu2);

		dsymv_lib(nu0+nx0, nu0+nx0, hpQ[ii], cnux0, hux[ii], 1, hrq[ii], hrq[ii]);

		for(jj=0; jj<nx1; jj++)
			hrb[ii][jj] = hb[ii][jj] - hux[ii+1][nu1+jj];

		dgemv_nt_lib(nu0+nx0, nx1, hpBAbt[ii], cnx1, hpi[ii], hux[ii], 1, 1, hrq[ii], hrb[ii], hrq[ii], hrb[ii]);

		nb_tot += d_res_res_hard_gen_stage(ii, nu0+nx0, nb[ii], ng[ii], hpDCt, hd, hux, hlam, ht, work, hrq, hrd, hrm, &mu2);

		}



	// last stage: no dynamics, hence no res_b
	ii = N;
	nu0 = nu1;
	nx0 = nx1;
	cnux0 = (nu0+nx0+ncl-1)/ncl*ncl;
	nu_al = nu0/bs*bs; // nu0 rounded down to a multiple of the panel height

	// res_q
	for(jj=0; jj<nx0; jj++)
		hrq[ii][nu0+jj] = - hpi[ii-1][jj] + hq[ii][nu0+jj];

	nb_tot += d_res_res_hard_box_stage(ii, nb[ii], idxb, hd, hux, hlam, ht, hrq, hrd, hrm, &mu2);

	// symmetric product restricted to the trailing block that starts at row/column nu_al
	dsymv_lib(nx0+nu0%bs, nx0+nu0%bs, hpQ[ii]+nu_al*cnux0+nu_al*bs, cnux0, hux[ii]+nu_al, 1, hrq[ii]+nu_al, hrq[ii]+nu_al);

	nb_tot += d_res_res_hard_gen_stage(ii, nu0+nx0, nb[ii], ng[ii], hpDCt, hd, hux, hlam, ht, work, hrq, hrd, hrm, &mu2);



	// normalize mu (skipped, leaving mu[0] untouched, when there is no constraint at all)
	if(nb_tot!=0)
		{
		mu2 /= 2.0*nb_tot;
		mu[0] = mu2;
		}



	return;

	}