示例#1
0
/*
 * Test driver for the single-precision ADMM solver s_admm_box_mpc() on the
 * mass-spring test problem with box constraints on inputs and states.
 *
 * The program builds the problem data (dynamics, bounds, cost), calls the
 * solver once as warm up, then performs NREP timed solves cycling through ten
 * initial states, and prints the average iteration count and solution time.
 * Depending on PRINTSTAT / PRINTRES it also prints the per-iteration solver
 * statistics of the last run and the computed inputs.
 *
 * Problem sizes and solver settings come from the macros NX, NU, NN, NB,
 * NREP, K_MAX_ADMM, TOL and COMPUTE_MULT_ADMM.
 *
 * Returns 0.
 */
int main()
	{

	printf("\n");
	printf("\n");
	printf("\n");
	printf(" HPMPC -- Library for High-Performance implementation of solvers for MPC.\n");
	printf(" Copyright (C) 2014 by Technical University of Denmark. All rights reserved.\n");
	printf("\n");
	printf(" HPMPC is distributed in the hope that it will be useful,\n");
	printf(" but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
	printf(" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n");
	printf(" See the GNU Lesser General Public License for more details.\n");
	printf("\n");
	printf("\n");
	printf("\n");

#if defined(TARGET_X64_AVX2) || defined(TARGET_X64_AVX) || defined(TARGET_X64_SSE3) || defined(TARGET_X86_ATOM) || defined(TARGET_AMD_SSE3)
	_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); // flush to zero subnormals !!! works only with one thread !!!
#endif

	int ii, jj, idx;

	int rep, nrep=NREP;

	int nx = NX; // number of states (it has to be even for the mass-spring system test problem)
	int nu = NU; // number of inputs (controllers) (it has to be at least 1 and at most nx/2 for the mass-spring system test problem)
	int N  = NN; // horizon length
	int nb = NB; // number of box constrained inputs and states

	printf(" Test problem: mass-spring system with %d masses and %d controls.\n", nx/2, nu);
	printf("\n");
	printf(" MPC problem size: %d states, %d inputs, %d horizon length, %d two-sided box constraints.\n", nx, nu, N, nb);
	printf("\n");
	printf(" ADMM method parameters: single precision, %d maximum iterations, %5.1e exit tolerance in primal and duality measure (edit file test_admm_ip_box.c to change them).\n", K_MAX_ADMM, TOL);

	/* sizes rounded up to multiples of the kernel block size (bs), of ncl and of their product (nal) */
	const int bs = S_MR;
	const int ncl = S_NCL;
	const int nal = bs*ncl;

	const int nz = nx+nu+1;
	const int pnz = bs*((nz+bs-1)/bs);
	const int cnz = ncl*((nx+nu+1+ncl-1)/ncl);
	const int cnx = ncl*((nx+ncl-1)/ncl);
	const int anz = nal*((nz+nal-1)/nal);
	const int anx = nal*((nx+nal-1)/nal);
	const int anb = nal*((2*nb+nal-1)/nal); // rounded-up number of box constraints

	const int pad = (ncl-nx%ncl)%ncl; // packing between BAbtL & P
	const int cnl = cnz<cnx+ncl ? nx+pad+cnx+ncl : nx+pad+cnz;

/************************************************
* dynamical system
************************************************/

	double *A; d_zeros(&A, nx, nx); // states update matrix

	double *B; d_zeros(&B, nx, nu); // inputs matrix

	double *b; d_zeros(&b, nx, 1); // states offset
	double *x0; d_zeros(&x0, nx, 1); // initial state

	double Ts = 0.5; // sampling time
	mass_spring_system(Ts, nx, nu, N, A, B, b, x0);

	/* overwrite offset and initial state returned by the generator */
	for(jj=0; jj<nx; jj++)
		b[jj] = 0.1;

	for(jj=0; jj<nx; jj++)
		x0[jj] = 0;
	x0[0] = 3.5;
	x0[1] = 3.5;

	/* [B A b]' converted to single precision and packed into contiguous memory */
	float *pBAbt; s_zeros_align(&pBAbt, pnz, cnx);

	cvt_tran_d2s_mat2pmat(nx, nu, 0, bs, B, nx, pBAbt, cnx);
	cvt_tran_d2s_mat2pmat(nx, nx, nu, bs, A, nx, pBAbt+nu/bs*cnx*bs+nu%bs, cnx);
	/* b goes into row nx+nu of the packed matrix, one entry per column */
	for (jj = 0; jj<nx; jj++)
		pBAbt[(nx+nu)/bs*cnx*bs+(nx+nu)%bs+jj*bs] = (float) b[jj];

/************************************************
* box constraints
************************************************/

	float *lb; s_zeros_align(&lb, nx+nu, 1);
	for(jj=0; jj<nu; jj++)
		lb[jj] = - 0.5;   // umin
	for(; jj<nu+nx; jj++)
		lb[jj] = - 4.0;   // xmin

	float *ub; s_zeros_align(&ub, nx+nu, 1);
	for(jj=0; jj<nu; jj++)
		ub[jj] = 0.5;   // umax
	for(; jj<nu+nx; jj++)
		ub[jj] = 4.0;   // xmax

/************************************************
* cost function
************************************************/

	/* diagonal weights: 2.0 on the inputs, 1.0 elsewhere; row nx+nu holds the linear term 0.1 */
	float *Q; s_zeros_align(&Q, pnz, pnz);
	for(ii=0; ii<nu; ii++) Q[ii*(pnz+1)] = 2.0;
	for(; ii<pnz; ii++) Q[ii*(pnz+1)] = 1.0;
	for(ii=0; ii<nz; ii++) Q[nx+nu+ii*pnz] = 0.1;

	/* packed into contiguous memory */
	float *pQ; s_zeros_align(&pQ, pnz, cnz);
	s_cvt_mat2pmat(nz, nz, 0, bs, Q, pnz, pQ, cnz);

/************************************************
* matrices series
************************************************/

	float *(hpQ[N+1]);
	float *(hq[N+1]);
	float *(hux[N+1]);
	float *(hpi[N+1]);
	float *(hlam[N+1]);
	float *(ht[N+1]);
	float *(hpBAbt[N]);
	float *(hlb[N+1]);
	float *(hub[N+1]);
	float *(hrb[N]);
	float *(hrq[N+1]);
	float *(hrd[N+1]);
	float *(hux_v[N+1]);
	float *(hux_w[N+1]);

	for(jj=0; jj<N; jj++)
		{
		s_zeros_align(&hpQ[jj], pnz, cnz);
		}
	/* The last stage is filled below with pnz*cnz entries like every other
	   stage, so it must hold at least that many; the previous pnz*pnz size is
	   kept as a lower bound. */
	s_zeros_align(&hpQ[N], pnz, cnz>pnz ? cnz : pnz);

	for(jj=0; jj<N; jj++)
		{
		s_zeros_align(&hq[jj], anz, 1);
		s_zeros_align(&hux[jj], anz, 1);
		s_zeros_align(&hpi[jj], anx, 1);
		s_zeros_align(&hlam[jj],anb, 1); // TODO pnb
		s_zeros_align(&ht[jj], anb, 1); // TODO pnb
		/* dynamics and bounds are the same at every stage: share the buffers */
		hpBAbt[jj] = pBAbt;
		hlb[jj] = lb;
		hub[jj] = ub;
		s_zeros_align(&hrb[jj], anx, 1);
		s_zeros_align(&hrq[jj], anz, 1);
		s_zeros_align(&hrd[jj], anb, 1); // TODO pnb
		s_zeros_align(&hux_v[jj], anz, 1);
		s_zeros_align(&hux_w[jj], anz, 1);
		}
	s_zeros_align(&hq[N], anz, 1);
	s_zeros_align(&hux[N], anz, 1);
	s_zeros_align(&hpi[N], anx, 1);
	s_zeros_align(&hlam[N], anb, 1); // TODO pnb
	s_zeros_align(&ht[N], anb, 1); // TODO pnb
	hlb[N] = lb;
	hub[N] = ub;
	s_zeros_align(&hrq[N], anz, 1);
	s_zeros_align(&hrd[N], anb, 1); // TODO pnb
	s_zeros_align(&hux_v[N], anz, 1);
	s_zeros_align(&hux_w[N], anz, 1);

/************************************************
* ADMM solver set up
************************************************/

	float *work; s_zeros_align(&work, (N+1)*(pnz*cnl + 4*anz + 2*anx) + 3*anz, 1); // work space
	int kk = 0; // actual number of iterations
	int k_max = K_MAX_ADMM; // maximum number of iterations in the ADMM method
	float tol = TOL*sqrt(N*(nx+nu)); // exit tolerance, scaled with the problem size
	float rho = 2.0; // penalty parameter
	float alpha = 1.5; // relaxation parameter
	float *stat; s_zeros(&stat, 5, k_max); // stats from the ADMM routine (5 values per iteration)
	int compute_mult = COMPUTE_MULT_ADMM;
	int warm_start = 0;

	/* initialize the cost function: same packed Q at every stage */
	for(ii=0; ii<N; ii++)
		{
		for(jj=0; jj<pnz*cnz; jj++) hpQ[ii][jj]=pQ[jj];
		}
	for(jj=0; jj<pnz*cnz; jj++) hpQ[N][jj]=pQ[jj];

	// initial states: ten (x0[0], x0[1]) pairs, one pair per timed run modulo 10
	float xx0[] = {3.5, 3.5, 3.66465, 2.15833, 1.81327, -0.94207, 1.86531, -2.35760, 2.91534, 1.79890, -1.49600, -0.76600, -2.60268, 1.92456, 1.66630, -2.28522, 3.12038, 1.83830, 1.93519, -1.87113};

	/* warm up */

	// initialize states and inputs
	for(ii=0; ii<=N; ii++)
		for(jj=0; jj<nx+nu; jj++)
			hux[ii][jj] = 0;

	hux[0][nu+0] = xx0[0];
	hux[0][nu+1] = xx0[1];

	// call the ADMM solver (sixth argument is 1 here and 0 in the timed runs)
	s_admm_box_mpc(&kk, k_max, tol, tol, warm_start, 1, rho, alpha, stat, nx, nu, N, hpBAbt, hpQ, hlb, hub, hux, hux_v, hux_w, compute_mult, hpi, work);

	int kk_avg = 0; // sum of the iteration counts over the timed runs

	/* timing */
	struct timeval tv0, tv1;
	gettimeofday(&tv0, NULL); // start

	for(rep=0; rep<nrep; rep++)
		{

		idx = rep%10;
		x0[0] = xx0[2*idx];
		x0[1] = xx0[2*idx+1];

		// initialize states and inputs
		for(ii=0; ii<=N; ii++)
			for(jj=0; jj<nx+nu; jj++)
				hux[ii][jj] = 0;

		hux[0][nu+0] = xx0[2*idx];
		hux[0][nu+1] = xx0[2*idx+1];

		// call the ADMM solver
		s_admm_box_mpc(&kk, k_max, tol, tol, warm_start, 0, rho, alpha, stat, nx, nu, N, hpBAbt, hpQ, hlb, hub, hux, hux_v, hux_w, compute_mult, hpi, work);

		kk_avg += kk;

		}

	gettimeofday(&tv1, NULL); // stop

	// average wall-clock time of one solve, in seconds
	float time = (tv1.tv_sec-tv0.tv_sec)/(nrep+0.0)+(tv1.tv_usec-tv0.tv_usec)/(nrep*1e6);

	printf("\n");
	printf(" Average number of iterations over %d runs: %5.1f\n", nrep, kk_avg / (float) nrep);
	printf("\n");
	printf(" Average solution time over %d runs: %5.2e seconds\n", nrep, time);
	printf("\n");

	// restore linear part of cost function
	for(ii=0; ii<N; ii++)
		{
		for(jj=0; jj<nx+nu; jj++) hq[ii][jj] = Q[nx+nu+pnz*jj];
		}
	for(jj=0; jj<nx+nu; jj++) hq[N][jj] = Q[nx+nu+pnz*jj];

	if(PRINTSTAT==1)
		{

		printf("\n");
		printf("\n");
		printf(" Print ADMM statistics of the last run\n");
		printf("\n");

		for(jj=0; jj<kk; jj++)
			printf("k = %d\t\tp_res = %f\td_res = %f\n", jj, stat[5*jj], stat[5*jj+1]);
		printf("\n");

		}

	if(PRINTRES==1)
		{

		printf("\n");
		printf("\n");
		printf(" Print solution\n");
		printf("\n");

		printf("\nu = \n\n");
		for(ii=0; ii<N; ii++)
			s_print_mat(1, nu, hux[ii], 1);

		}

	/* Disabled on purpose through the leading 0: no residuals are computed in
	   this program, so hrq/hrb/hrd still hold their initial zeros. */
	if(0 && PRINTRES==1 && COMPUTE_MULT_ADMM==1)
		{
		printf("\n");
		printf("\n");
		printf(" Print residuals\n\n");
		printf("\n");
		printf("\n");
		printf("rq = \n\n");
		s_print_mat(1, nu, hrq[0], 1);
		for(ii=1; ii<=N; ii++)
			s_print_mat(1, nx+nu, hrq[ii], 1);
		printf("\n");
		printf("\n");
		printf("rb = \n\n");
		for(ii=0; ii<N; ii++)
			s_print_mat(1, nx, hrb[ii], 1);
		printf("\n");
		printf("\n");
		printf("rd = \n\n");
		for(ii=0; ii<=N; ii++)
			s_print_mat(1, 2*nb, hrd[ii], 1);
		printf("\n");
		printf("\n");

		}

/************************************************
* free memory and return
************************************************/

	free(A);
	free(B);
	free(b);
	free(x0);
	free(pBAbt);
	free(lb);
	free(ub);
	free(Q);
	free(pQ);
	free(work);
	free(stat);
	/* hpBAbt, hlb and hub only alias pBAbt, lb and ub: nothing to free per stage */
	for(jj=0; jj<N; jj++)
		{
		free(hpQ[jj]);
		free(hq[jj]);
		free(hux[jj]);
		free(hpi[jj]);
		free(hlam[jj]);
		free(ht[jj]);
		free(hrb[jj]);
		free(hrq[jj]);
		free(hrd[jj]);
		free(hux_v[jj]);
		free(hux_w[jj]);
		}
	free(hpQ[N]);
	free(hq[N]);
	free(hux[N]);
	free(hpi[N]);
	free(hlam[N]);
	free(ht[N]);
	free(hrq[N]);
	free(hrd[N]);
	free(hux_v[N]);
	free(hux_w[N]);

	return 0;

	}
示例#2
0
/* Number of routines timed for every matrix size. */
enum { S_BLAS_TEST_NUM_ROUTINES = 12 };

/*
 * Average wall-clock time of one repetition, in seconds, between two
 * gettimeofday() samples taken around nrep repetitions.
 */
static float s_blas_test_time_per_rep(const struct timeval *start, const struct timeval *stop, int nrep)
	{
	return (float) (stop->tv_sec-start->tv_sec)/(nrep+0.0)+(stop->tv_usec-start->tv_usec)/(nrep*1e6);
	}

/*
 * Write one result row to out: the size n followed, for every routine, by its
 * Gflops and its percentage of the theoretical peak Gflops_max.
 */
static void s_blas_test_print_row(FILE *out, int n, const float *Gflops, float Gflops_max)
	{
	int k;

	fprintf(out, "%d", n);
	for(k=0; k<S_BLAS_TEST_NUM_ROUTINES; k++)
		fprintf(out, "\t%7.2f\t%7.2f", Gflops[k], 100.0*Gflops[k]/Gflops_max);
	fprintf(out, "\n");
	}

/*
 * Performance test of the single-precision BLAS-like routines.
 *
 * For matrix sizes n = 4, 8, ..., 300 each routine is called nnrep[] times and
 * the resulting Gflops (absolute and as a percentage of the theoretical peak)
 * are printed to stdout and written, as a MATLAB-style matrix B, to
 * ./test_problems/results/test_blas.m.
 *
 * Returns 0 on success, 1 if the result file cannot be opened.
 */
int main()
	{

	printf("\n");
	printf("\n");
	printf("\n");
	printf(" HPMPC -- Library for High-Performance implementation of solvers for MPC.\n");
	printf(" Copyright (C) 2014 by Technical University of Denmark. All rights reserved.\n");
	printf("\n");
	printf(" HPMPC is distributed in the hope that it will be useful,\n");
	printf(" but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
	printf(" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n");
	printf(" See the GNU Lesser General Public License for more details.\n");
	printf("\n");
	printf("\n");
	printf("\n");

	printf("BLAS performance test - single precision\n");
	printf("\n");

	// maximum frequency of the processor
	const float GHz_max = GHZ_MAX;
	printf("Frequency used to compute theoretical peak: %5.1f GHz (edit test_param.h to modify this value).\n", GHz_max);
	printf("\n");

	// maximum flops per cycle, single precision
#if defined(TARGET_X64_AVX2)
	const float flops_max = 32;
	printf("Testing BLAS version for AVX2 & FMA3 instruction sets, 64 bit: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#elif defined(TARGET_X64_AVX)
	const float flops_max = 16;
	printf("Testing BLAS version for AVX instruction set, 64 bit: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#elif defined(TARGET_X64_SSE3) || defined(TARGET_AMD_SSE3)
	const float flops_max = 8;
	printf("Testing BLAS version for SSE3 instruction set, 64 bit: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#elif defined(TARGET_CORTEX_A15)
	const float flops_max = 8;
	printf("Testing solvers for ARMv7a NEON instruction set, oprimized for Cortex A15: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#elif defined(TARGET_CORTEX_A9)
	const float flops_max = 4;
	printf("Testing solvers for ARMv7a NEON instruction set, oprimized for Cortex A9: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#elif defined(TARGET_CORTEX_A7)
	const float flops_max = 2;
	printf("Testing solvers for ARMv7a NEON instruction set, oprimized for Cortex A7: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#elif defined(TARGET_X86_ATOM)
	const float flops_max = 4;
	printf("Testing BLAS version for SSE3 instruction set, 32 bit, optimized for Intel Atom: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#elif defined(TARGET_POWERPC_G2)
	const float flops_max = 2;
	printf("Testing BLAS version for POWERPC instruction set, 32 bit: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#elif defined(TARGET_C99_4X4)
	const float flops_max = 2;
	printf("Testing reference BLAS version, 4x4 kernel: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#elif defined(TARGET_C99_2X2)
	const float flops_max = 2;
	printf("Testing reference BLAS version, 2x2 kernel: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#endif
	printf("\n");
	printf("\n");
	printf("\n");

	FILE *f;
	f = fopen("./test_problems/results/test_blas.m", "w"); // a
	if(f==NULL)
		{
		/* without the result file every fprintf below would dereference NULL */
		perror("./test_problems/results/test_blas.m");
		return 1;
		}

	/* target label written to the result file */
#if defined(TARGET_X64_AVX2)
	fprintf(f, "C = 's_x64_avx2';\n");
	fprintf(f, "\n");
#elif defined(TARGET_X64_AVX)
	fprintf(f, "C = 's_x64_avx';\n");
	fprintf(f, "\n");
#elif defined(TARGET_X64_SSE3)
	fprintf(f, "C = 's_x64_sse3';\n");
	fprintf(f, "\n");
#elif defined(TARGET_CORTEX_A15)
	fprintf(f, "C = 's_ARM_cortex_A15';\n");
	fprintf(f, "\n");
#elif defined(TARGET_CORTEX_A9)
	fprintf(f, "C = 's_ARM_cortex_A9';\n");
	fprintf(f, "\n");
#elif defined(TARGET_CORTEX_A7)
	fprintf(f, "C = 's_ARM_cortex_A7';\n");
	fprintf(f, "\n");
#elif defined(TARGET_X86_ATOM)
	fprintf(f, "C = 's_x86_atom';\n");
	fprintf(f, "\n");
#elif defined(TARGET_POWERPC_G2)
	fprintf(f, "C = 's_PowerPC_G2';\n");
	fprintf(f, "\n");
#elif defined(TARGET_C99_4X4)
	fprintf(f, "C = 's_c99_4x4';\n");
	fprintf(f, "\n");
#elif defined(TARGET_C99_2X2)
	fprintf(f, "C = 's_c99_2x2';\n");
	fprintf(f, "\n");
#endif

	fprintf(f, "A = [%f %f];\n", GHz_max, flops_max);
	fprintf(f, "\n");

	fprintf(f, "B = [\n");

	int i, rep, ll;

	const int bss = S_MR; // block size of the single-precision packed format

	printf("\nn\tGflops dgemm %%\tGflops dsyrk %%\tGflops dtrmm %%\tGflops dpotrf %%\tGflops dgemv_n%%\tGflops dgemv_t%%\tGflops dsymv %%\tGflops dtrmv_n%%\tGflops dtrmv_t%%\tGflops dmvmv%%\n\n");

	/* matrix sizes and the matching number of repetitions */
	int nn[] = {4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92, 96, 100, 104, 108, 112, 116, 120, 124, 128, 132, 136, 140, 144, 148, 152, 156, 160, 164, 168, 172, 176, 180, 184, 188, 192, 196, 200, 204, 208, 212, 216, 220, 224, 228, 232, 236, 240, 244, 248, 252, 256, 260, 264, 268, 272, 276, 280, 284, 288, 292, 296, 300, 304, 308, 312, 316, 320, 324, 328, 332};
	int nnrep[] = {10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 400, 400, 400, 400, 400, 200, 200, 200, 200, 200, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100};

	/* only the first 75 sizes (n up to 300) are run */
	for(ll=0; ll<75; ll++)

		{

		int n = nn[ll];
		int nrep = nnrep[ll];

		double *A; d_zeros(&A, n, n);
		double *B; d_zeros(&B, n, n);
		double *C; d_zeros(&C, n, n);
		float *sA; s_zeros(&sA, n, n);
		float *sB; s_zeros(&sB, n, n);
		float *sC; s_zeros(&sC, n, n);

		/* A = 0, 1, 2, ... in memory order; B = identity */
		for(i=0; i<n*n; i++)
			A[i] = i;

		for(i=0; i<n; i++)
			B[i*(n+1)] = 1;

		for(i=0; i<n*n; i++)
			sA[i] = i;

		for(i=0; i<n; i++)
			sB[i*(n+1)] = 1;

		/* n rounded up to the block size, and n and 2*n rounded up to S_NCL */
		int pns = ((n+bss-1)/bss)*bss;
		int cns = ((n+S_NCL-1)/S_NCL)*S_NCL;
		int cns2 = ((2*n+S_NCL-1)/S_NCL)*S_NCL;

		float *pA; s_zeros_align(&pA, pns, cns);
		float *pB; s_zeros_align(&pB, pns, cns);
		float *pC; s_zeros_align(&pC, pns, cns);
		float *pD; s_zeros_align(&pD, pns, cns);
		float *pE; s_zeros_align(&pE, pns, cns2);
		float *pF; s_zeros_align(&pF, 2*pns, cns);
		float *pL; s_zeros_align(&pL, pns, cns);
		float *x; s_zeros_align(&x, pns, 1);
		float *y; s_zeros_align(&y, pns, 1);
		float *x2; s_zeros_align(&x2, 2*pns, 1);
		float *y2; s_zeros_align(&y2, 2*pns, 1);
		float *diag; s_zeros_align(&diag, pns, 1);

		s_cvt_mat2pmat(n, n, 0, bss, sA, n, pA, cns);
		s_cvt_mat2pmat(n, n, 0, bss, sB, n, pB, cns);
		s_cvt_mat2pmat(n, n, 0, bss, sB, n, pD, cns);
		s_cvt_mat2pmat(n, n, 0, bss, sA, n, pE, cns2);

		for(i=0; i<pns*cns; i++) pC[i] = -1;

		for(i=0; i<pns; i++) x[i] = 1;
		for(i=0; i<pns; i++) x2[i] = 1;

		/* timing: tv[k] and tv[k+1] bracket the k-th timed routine */
		struct timeval tv[S_BLAS_TEST_NUM_ROUTINES+1];

		/* warm up */
		for(rep=0; rep<nrep; rep++)
			{
			sgemm_nt_lib(n, n, n, pA, cns, pB, cns, pC, cns, 0);
			}

		gettimeofday(&tv[0], NULL); // start

		for(rep=0; rep<nrep; rep++)
			{
			sgemm_nt_lib(n, n, n, pA, cns, pB, cns, pC, cns, 0);
			}

		gettimeofday(&tv[1], NULL); // stop

		for(rep=0; rep<nrep; rep++)
			{
			ssyrk_spotrf_lib(n, n, n, pE, cns2, pD, cns, diag);
			}

		gettimeofday(&tv[2], NULL); // stop

		for(rep=0; rep<nrep; rep++)
			{
			strmm_lib(n, n, pA, cns, pB, cns, pC, cns);
			}

		gettimeofday(&tv[3], NULL); // stop

		for(rep=0; rep<nrep; rep++)
			{
			strtr_l_lib(n, 0, pA, cns, pC, cns); // triangular matrix transpose
			}

		gettimeofday(&tv[4], NULL); // stop

		for(rep=0; rep<nrep; rep++)
			{
			sgemv_n_lib(n, n, pA, cns, x, y, 0);
			}

		gettimeofday(&tv[5], NULL); // stop

		for(rep=0; rep<nrep; rep++)
			{
			sgemv_t_lib(n, n, 0, pA, cns, x, y, 0);
			}

		gettimeofday(&tv[6], NULL); // stop

		for(rep=0; rep<nrep; rep++)
			{
			strmv_u_n_lib(n, pA, cns, x, y, 0);
			}

		gettimeofday(&tv[7], NULL); // stop

		for(rep=0; rep<nrep; rep++)
			{
			strmv_u_t_lib(n, pA, cns, x, y, 0);
			}

		gettimeofday(&tv[8], NULL); // stop

		for(rep=0; rep<nrep; rep++)
			{
			strsv_sgemv_n_lib(n, 2*n, pF, cns, x2);
			}

		gettimeofday(&tv[9], NULL); // stop

		for(rep=0; rep<nrep; rep++)
			{
			strsv_sgemv_t_lib(n, 2*n, pF, cns, x2);
			}

		gettimeofday(&tv[10], NULL); // stop

		for(rep=0; rep<nrep; rep++)
			{
			ssymv_lib(n, 0, pA, cns, x, y, 0);
			}

		gettimeofday(&tv[11], NULL); // stop

		for(rep=0; rep<nrep; rep++)
			{
			smvmv_lib(n, n, 0, pA, cns, x, y, x2, y2, 0);
			}

		gettimeofday(&tv[12], NULL); // stop

		float Gflops_max = flops_max * GHz_max;

		/* flop count of one call, in the order the routines were timed above */
		float flop[S_BLAS_TEST_NUM_ROUTINES];
		flop[0] = 2.0*n*n*n; // sgemm_nt_lib
		flop[1] = 1.0*n*n*n + 1.0/3.0*n*n*n; // ssyrk_spotrf_lib
		flop[2] = 1.0*n*n*n; // strmm_lib
		flop[3] = 0.5*n*n; // strtr_l_lib: 0.5*n*n elements
		flop[4] = 2.0*n*n; // sgemv_n_lib
		flop[5] = 2.0*n*n; // sgemv_t_lib
		flop[6] = 1.0*n*n; // strmv_u_n_lib
		flop[7] = 1.0*n*n; // strmv_u_t_lib
		flop[8] = 3.0*n*n; // strsv_sgemv_n_lib
		flop[9] = 3.0*n*n; // strsv_sgemv_t_lib
		flop[10] = 2.0*n*n; // ssymv_lib
		flop[11] = 4.0*n*n; // smvmv_lib

		float Gflops[S_BLAS_TEST_NUM_ROUTINES];
		for(i=0; i<S_BLAS_TEST_NUM_ROUTINES; i++)
			{
			float time_per_rep = s_blas_test_time_per_rep(&tv[i], &tv[i+1], nrep);
			Gflops[i] = 1e-9*flop[i]/time_per_rep;
			}

		/* same row on the terminal and in the result file */
		s_blas_test_print_row(stdout, n, Gflops, Gflops_max);
		s_blas_test_print_row(f, n, Gflops, Gflops_max);

		/* release everything allocated for this size */
		free(A);
		free(B);
		free(C);
		free(sA);
		free(sB);
		free(sC);
		free(pA);
		free(pB);
		free(pC);
		free(pD);
		free(pE);
		free(pF);
		free(pL);
		free(x);
		free(y);
		free(x2);
		free(y2);
		free(diag);

		}

	printf("\n");

	fprintf(f, "];\n");
	fclose(f);

	return 0;

	}
示例#3
0
/*
 * Scratch benchmark for a single kernel on one fixed size (n = 16).
 *
 * Builds a few double- and single-precision test matrices, converts them to
 * the packed format, calls ssyrk_spotrf_lib() nrep times, and prints the
 * resulting Gflops followed by some of the packed single-precision matrices.
 *
 * Returns 0.
 */
int main()
	{

	int i, rep;

	const int bs = D_MR; // block size of the double-precision packed format
	const int bss = S_MR; // block size of the single-precision packed format

	printf("\nbs = %d\n\n", bss);

	int n = 16;
	int nrep = 1000000;

	double *A; d_zeros(&A, n, n);
	double *B; d_zeros(&B, n, n);
	double *C; d_zeros(&C, n, n);
	double *L; d_zeros(&L, n, n);
	float *sA; s_zeros(&sA, n, n);
	float *sB; s_zeros(&sB, n, n);

	/* A = 0, 1, 2, ... in memory order */
	for(i=0; i<n*n; i++)
		{
		A[i] = i;
		sA[i] = i;
		}

	/* B = 2 on the diagonal */
	B[0] = 2;
	sB[0] = 2;
	for(i=1; i<n-1; i++)
		{
		B[i*(n+1)+0] = 2;
		sB[i*(n+1)+0] = 2;
		}
	B[n*n-1] = 2;
	sB[n*n-1] = 2;

	/* C = 2 on the diagonal, 1 on the entries right after it in memory */
	for(i=0; i<n; i++)
		C[i*(n+1)] = 2;
	for(i=0; i<n-1; i++)
		C[1+i*(n+1)] = 1;

	/* n rounded up to the block sizes, and n and 2*n rounded up to S_NCL */
	int pn = ((n+bs-1)/bs)*bs;
	int pns = ((n+bss-1)/bss)*bss;
	int cns = ((n+S_NCL-1)/S_NCL)*S_NCL;
	int cns2 = ((2*n+S_NCL-1)/S_NCL)*S_NCL;

	double *pA; d_zeros_align(&pA, pn, pn);
	double *pB; d_zeros_align(&pB, pn, pn);
	double *pC; d_zeros_align(&pC, pn, pn);
	double *pL; d_zeros_align(&pL, pn, pn);
	float *spA; s_zeros_align(&spA, pns, cns);
	float *spB; s_zeros_align(&spB, pns, cns);
	float *spC; s_zeros_align(&spC, pns, cns);
	float *spD; s_zeros_align(&spD, pns, cns);
	float *spE; s_zeros_align(&spE, pns, cns2);
	float *diag; s_zeros_align(&diag, pns, 1);

	d_cvt_mat2pmat(n, n, 0, bs, A, n, pA, pn);
	d_cvt_mat2pmat(n, n, 0, bs, B, n, pB, pn);
	s_cvt_mat2pmat(n, n, 0, bss, sA, n, spA, cns);
	s_cvt_mat2pmat(n, n, 0, bss, sB, n, spB, cns);
	s_cvt_mat2pmat(n, n, 0, bss, sB, n, spC, cns);
	s_cvt_mat2pmat(n, n, 0, bss, sB, n, spD, cns);
	s_cvt_mat2pmat(n, n, 0, bss, sA, n, spE, cns2);

	double *x; d_zeros_align(&x, n, 1);
	double *y; d_zeros_align(&y, n, 1);

	x[2] = 1;

	/* timing */
	struct timeval tv0, tv1;
	gettimeofday(&tv0, NULL); // start

	/* routine under test; swap the call to benchmark a different kernel */
	for(rep=0; rep<nrep; rep++)
		{

		ssyrk_spotrf_lib(n, n, n, spE, cns2, spD, cns, diag);

		}

	gettimeofday(&tv1, NULL); // stop

	// average wall-clock time of one call, in seconds
	float time = (float) (tv1.tv_sec-tv0.tv_sec)/(nrep+0.0)+(tv1.tv_usec-tv0.tv_usec)/(nrep*1e6);
	// NOTE(review): 2*n^3 is the flop count of a gemm, while the routine timed
	// above is ssyrk_spotrf_lib -- confirm the intended count.
	float flop = 2.0*n*n*n;
	float Gflops = 1e-9*flop/time;
	float Gflops_max = 1*1; // placeholder peak, so the percentage printed is 100*Gflops

	printf("\nn\tGflops\t\t%%\n%d\t%f\t%f\n\n", n, Gflops, 100.0*Gflops/Gflops_max);

	/* print the matrices only when they are small enough to read */
	if(n<=24)
		{
		s_print_pmat(n, n, bss, spA, cns);
		s_print_pmat(n, n, bss, spB, cns);
		s_print_pmat(n, n, bss, spE+n*bss, cns2);
		}

	/* release everything allocated above */
	free(A);
	free(B);
	free(C);
	free(L);
	free(sA);
	free(sB);
	free(pA);
	free(pB);
	free(pC);
	free(pL);
	free(spA);
	free(spB);
	free(spC);
	free(spD);
	free(spE);
	free(diag);
	free(x);
	free(y);

	return 0;

	}