Exemplo n.º 1
0
Arquivo: math.c Projeto: acados/acados
/*
 * Overwrites the row-by-row matrix A, in place, with an approximation of its
 * matrix exponential.
 *
 * The value returned by onenorm(row, row, A) selects the strategy:
 *  - if it does not exceed the largest threshold, padeapprox() is called once
 *    with the first degree whose threshold is large enough;
 *  - otherwise A is scaled by 2^-s, padeapprox() is applied with degree 13,
 *    and the result is multiplied by itself s times.
 */
void expm(int row, double *A)
{
    /* Pade degrees and the largest norm each of them is used for. */
    int degrees[] = {3, 5, 7, 9, 13};
    double theta[] = {0.01495585217958292, 0.2539398330063230, 0.9504178996162932,
                      2.097847961257068, 5.371920351148152};
    int n_theta = 5;

    double norm_a = onenorm(row, row, A);

    if (norm_a <= theta[4])
    {
        /* Small norm: a single Pade approximant is enough. */
        int k = 0;
        while (k < n_theta)
        {
            if (norm_a <= theta[k])
            {
                padeapprox(degrees[k], row, A);
                return;
            }
            k++;
        }
        return;
    }

    /* Large norm: pick s such that norm_a / 2^s fits under theta[4]. */
    int n_squarings;
    double mantissa = frexp(norm_a / (theta[4]), &n_squarings);
    if (mantissa == 0.5)
    {
        /* The ratio is an exact power of two, so one halving fewer is needed. */
        n_squarings = n_squarings - 1;
    }
    double scale = pow(2, -n_squarings);
    int n_elem = row * row;

    dscal_3l(n_elem, scale, A);
    padeapprox(degrees[4], row, A);

    /* Scratch matrix that receives each product before it is copied back. */
    double *product = malloc(row * row * sizeof(double));
    for (int idx = 0; idx < row * row; idx++)
    {
        product[idx] = 0.0;
    }

    /* Undo the scaling: replace A by A*A, n_squarings times. */
    for (int sq = 0; sq < n_squarings; sq++)
    {
        dgemm_nn_3l(row, row, row, A, row, A, row, product, row);
        dmcopy(row, row, product, row, A, row);
    }
    free(product);
}
Exemplo n.º 2
0
/************************************************
Mass-spring system: nx/2 masses connected each other with springs (in a row),
and the first and the last one to walls. The controls act on the first nu
masses. The system is sampled with sampling time Ts.

Parameters:
  Ts  sampling time used to scale the continuous-time matrix before expm().
  nx  number of states; nx/2 is used as the number of masses.
  nu  number of inputs. A nu-by-nu identity is written into Bc starting at
      row offset nx/2, so nu presumably must not exceed nx/2 (TODO confirm
      against dmcopy's contract).
  A   output, nx*nx doubles: receives expm(Ts * Ac).
  B   output, nx*nu doubles: receives the result of dgesv_3l applied to
      Ac and (A - I) * Bc (expected to be the solution of Ac * X = (A - I) * Bc).
  b   output, nx doubles: set to zero.
  x0  output, nx doubles: {5, 10, 15, 20} when nx == 4, all ones otherwise.

All temporaries allocated here are released before returning. The status
reported by dgesv_3l through `info` is not inspected.
************************************************/
void mass_spring_system(double Ts, int nx, int nu, double *A, double *B,
                        double *b, double *x0) {
    int nx2 = nx * nx;

    int info = 0;

    int pp = nx / 2;  // number of masses

    /************************************************
     * build the continuous time system
     ************************************************/

    // T: tridiagonal pp x pp matrix with -2 on the diagonal and 1 next to it
    double *T;
    d_zeros(&T, pp, pp);
    int ii;
    for (ii = 0; ii < pp; ii++) T[ii * (pp + 1)] = -2;
    for (ii = 0; ii < pp - 1; ii++) T[ii * (pp + 1) + 1] = 1;
    for (ii = 1; ii < pp; ii++) T[ii * (pp + 1) - 1] = 1;

    double *Z;
    d_zeros(&Z, pp, pp);
    double *I;
    d_zeros(&I, pp, pp);
    for (ii = 0; ii < pp; ii++) I[ii * (pp + 1)] = 1.0;  // = eye(pp);

    // Ac is assembled from four pp x pp blocks (Z, T, I, Z) placed at
    // offsets 0, pp, pp*nx and pp*(nx+1) with leading dimension nx.
    double *Ac;
    d_zeros(&Ac, nx, nx);
    dmcopy(pp, pp, Z, pp, Ac, nx);
    dmcopy(pp, pp, T, pp, Ac + pp, nx);
    dmcopy(pp, pp, I, pp, Ac + pp * nx, nx);
    dmcopy(pp, pp, Z, pp, Ac + pp * (nx + 1), nx);
    free(T);
    free(Z);
    free(I);

    d_zeros(&I, nu, nu);
    for (ii = 0; ii < nu; ii++) I[ii * (nu + 1)] = 1.0;  // I = eye(nu);
    double *Bc;
    d_zeros(&Bc, nx, nu);
    dmcopy(nu, nu, I, nu, Bc + pp, nx);
    free(I);

    /************************************************
     * compute the discrete time system
     ************************************************/

    // b is cleared by copying a freshly zeroed vector into it
    double *bb;
    d_zeros(&bb, nx, 1);
    dmcopy(nx, 1, bb, nx, b, nx);

    // A = expm(Ts * Ac)
    dmcopy(nx, nx, Ac, nx, A, nx);
    dscal_3l(nx2, Ts, A);
    expm(nx, A);

    // B = (A - I) * Bc, then handed to dgesv_3l together with Ac
    d_zeros(&T, nx, nx);
    d_zeros(&I, nx, nx);
    for (ii = 0; ii < nx; ii++) I[ii * (nx + 1)] = 1.0;  // I = eye(nx);
    dmcopy(nx, nx, A, nx, T, nx);
    daxpy_3l(nx2, -1.0, I, T);
    dgemm_nn_3l(nx, nu, nx, T, nx, Bc, nx, B, nx);
    // T and I were re-allocated above for this step; release them here
    // (they were previously leaked on every call).
    free(T);
    free(I);

    int *ipiv = malloc(nx * sizeof(int));
    dgesv_3l(nx, nu, Ac, nx, ipiv, B, nx, &info);
    free(ipiv);

    free(Ac);
    free(Bc);
    free(bb);

    /************************************************
     * initial state
     ************************************************/

    if (nx == 4) {
        x0[0] = 5;
        x0[1] = 10;
        x0[2] = 15;
        x0[3] = 20;
    } else {
        int jj;
        for (jj = 0; jj < nx; jj++) x0[jj] = 1;
    }
}
Exemplo n.º 3
0
/*
 * Riccati solver performance test, single precision.
 *
 * For each of the 77 problem sizes in nn[], builds a mass-spring test problem,
 * packs the data, runs sricposv_mpc ("sv") and sricpotrs_mpc ("trs") once for
 * optional result printing, then times nnrep[] repetitions of each and prints
 * the time per call, the achieved Gflops and the percentage of the theoretical
 * peak (flops_max * GHz_max).
 *
 * flops_max is only declared inside the TARGET_* preprocessor chain below, so
 * one of those macros has to be defined for this function to compile.
 *
 * Returns 0.
 */
int main()
	{
	
	
	printf("\n");
	printf("\n");
	printf("\n");
	printf(" HPMPC -- Library for High-Performance implementation of solvers for MPC.\n");
	printf(" Copyright (C) 2014 by Technical University of Denmark. All rights reserved.\n");
	printf("\n");
	printf(" HPMPC is distributed in the hope that it will be useful,\n");
	printf(" but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
	printf(" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n");
	printf(" See the GNU Lesser General Public License for more details.\n");
	printf("\n");
	printf("\n");
	printf("\n");

	printf("Riccati solver performance test - single precision\n");
	printf("\n");

	// maximum frequency of the processor
	const float GHz_max = 2.9;
	printf("Frequency used to compute theoretical peak: %5.1f GHz (edit test_dricposv.c to modify this value).\n", GHz_max);
	printf("\n");

	// maximum flops per cycle, single precision
#if defined(TARGET_X64_AVX)
	const float flops_max = 16;
	printf("Testing solvers for AVX instruction set, 64 bit: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#elif defined(TARGET_X64_SSE3) || defined(TARGET_AMD_SSE3)
	const float flops_max = 8;
	printf("Testing solvers for SSE3 instruction set, 64 bit: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#elif defined(TARGET_CORTEXA9)
	const float flops_max = 4;
	printf("Testing solvers for ARMv7a NEON instruction set: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#elif defined(TARGET_X86_ATOM)
	const float flops_max = 4;
	printf("Testing solvers for SSE3 instruction set, 32 bit, optimized for Intel Atom: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#elif defined(TARGET_POWERPC_G2)
	const float flops_max = 2;
	printf("Testing solvers for POWERPC instruction set, 32 bit: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#elif defined(TARGET_C99_4X4)
	const float flops_max = 2;
	printf("Testing reference solvers, 4x4 kernel: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#elif defined(TARGET_C99_2X2)
	const float flops_max = 2;
	printf("Testing reference solvers, 2x2 kernel: theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
#endif

	printf("\n");
	printf("Tested solvers:\n");
	printf("-sv : Riccati factorization and system solution (prediction step in IP methods)\n");
	printf("-trs: system solution after a previous call to Riccati factorization (correction step in IP methods)\n");
	printf("\n");
	printf("\n");

#if defined(TARGET_X64_AVX) || defined(TARGET_X64_SSE3) || defined(TARGET_X86_ATOM) || defined(TARGET_AMD_SSE3)
	printf("\nflush to zero on\n");
	_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); // flush to zero subnormals !!! works only with one thread !!!
#endif

	int ii, jj;
	
	// panel sizes used by the packed matrix formats (double / single precision)
	const int bsd = D_MR;
	const int bss = S_MR;
	
	int info = 0;

	// problem sizes (number of states) and the number of timed repetitions for each of them
	int nn[] = {4, 6, 8, 10, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92, 96, 100, 104, 108, 112, 116, 120, 124, 128, 132, 136, 140, 144, 148, 152, 156, 160, 164, 168, 172, 176, 180, 184, 188, 192, 196, 200, 204, 208, 212, 216, 220, 224, 228, 232, 236, 240, 244, 248, 252, 256, 260, 264, 268, 272, 276, 280, 284, 288, 292, 296, 300};
	int nnrep[] = {10000, 10000, 10000, 10000, 10000, 4000, 4000, 2000, 2000, 1000, 1000, 400, 400, 400, 200, 200, 200, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 40, 40, 40, 40, 40, 20, 20, 20, 20, 20, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};

	int ll;
	for(ll=0; ll<77; ll++)

		{

		int nx = nn[ll]; // number of states (it has to be even for the mass-spring system test problem)
		int nu = 2; // number of inputs (controllers) (it has to be at least 1 and at most nx/2 for the mass-spring system test problem)
		int N  = 10; // horizon length
		int nrep = nnrep[ll];

		int rep;
	
		int nz = nx+nu+1;
		// padded size: a multiple of bss that is not smaller than nz
		int pnz = bss*((nz+bss-nu%bss+bss-1)/bss);
	
/************************************************
* dynamical system
************************************************/	

		double *A; d_zeros(&A, nx, nx); // states update matrix

		double *B; d_zeros(&B, nx, nu); // inputs matrix

		double *b; d_zeros(&b, nx, 1); // states offset
		double *x0; d_zeros(&x0, nx, 1); // initial state

		double Ts = 0.5; // sampling time
		mass_spring_system(Ts, nx, nu, N, A, B, b, x0);
	
		// override the offset and the initial state produced above
		for(jj=0; jj<nx; jj++)
			b[jj] = 0.1;
	
		for(jj=0; jj<nx; jj++)
			x0[jj] = 0;
		x0[0] = 3.5;
		x0[1] = 3.5;
	
	/* packed: BAb = [B A b] */
		double *BAb; d_zeros(&BAb, nx, nz);

		dmcopy(nx, nu, B, nx, BAb, nx);
		dmcopy(nx, nx, A, nx, BAb+nu*nx, nx);
		dmcopy(nx, 1 , b, nx, BAb+(nu+nx)*nx, nx);
	
	/* transposed */
		double *BAbt; d_zeros_align(&BAbt, pnz, pnz);
		for(ii=0; ii<nx; ii++)
			for(jj=0; jj<nz; jj++)
				{
				BAbt[jj+pnz*ii] = BAb[ii+nx*jj];
				}

	/* packed into contiguous memory */
		double *pBAbt; d_zeros_align(&pBAbt, pnz, pnz);
		d_cvt_mat2pmat(nz, nx, 0, bsd, BAbt, pnz, pBAbt, pnz);

		float *psBAbt; s_zeros_align(&psBAbt, pnz, pnz);
		s_cvt_d2s_pmat(nz, nx, bsd, pBAbt, pnz, bss, psBAbt, pnz);

/************************************************
* cost function
************************************************/	

		double *Q; d_zeros_align(&Q, pnz, pnz);
		for(ii=0; ii<nu; ii++) Q[ii*(pnz+1)] = 2.0;
		for(; ii<pnz; ii++) Q[ii*(pnz+1)] = 1.0;
		for(ii=0; ii<nz; ii++) Q[nx+nu+ii*pnz] = 1.0;
		Q[(nx+nu)*(pnz+1)] = 1e6;

		/* packed into contiguous memory */
		float *pQ; s_zeros_align(&pQ, pnz, pnz);
		cvt_d2s_mat2pmat(nz, nz, 0, bss, Q, pnz, pQ, pnz);

	/* matrices series */
		float *(hpQ[N+1]);
		float *(hq[N+1]);
		float *(hux[N+1]);
		float *(hpi[N+1]);
		float *(hpBAbt[N]);
		float *(hrb[N]);
		float *(hrq[N+1]);
		for(jj=0; jj<N; jj++)
			{
			s_zeros_align(&hpQ[jj], pnz, pnz);
			s_zeros_align(&hq[jj], pnz, 1);
			s_zeros_align(&hux[jj], pnz, 1);
			s_zeros_align(&hpi[jj], nx, 1);
			hpBAbt[jj] = psBAbt; // every stage shares the same packed dynamics
			s_zeros_align(&hrb[jj], nx, 1);
			s_zeros_align(&hrq[jj], nx+nu, 1);
			}
		s_zeros_align(&hpQ[N], pnz, pnz);
		s_zeros_align(&hq[N], pnz, 1);
		s_zeros_align(&hux[N], pnz, 1);
		s_zeros_align(&hpi[N], nx, 1);
		s_zeros_align(&hrq[N], nx+nu, 1);
	
		// starting guess
		for(jj=0; jj<nx; jj++) hux[0][nu+jj] = (float) x0[jj];
	
		float *pL; s_zeros_align(&pL, pnz, pnz);
	
		float *pBAbtL; s_zeros_align(&pBAbtL, pnz, pnz);

/************************************************
* riccati-like iteration
************************************************/

		// predictor

		// restore cost function 
		for(ii=0; ii<N; ii++)
			{
			for(jj=0; jj<pnz*pnz; jj++) hpQ[ii][jj]=pQ[jj];
			}
		for(jj=0; jj<pnz*pnz; jj++) hpQ[N][jj]=pQ[jj];

		// call the solver
		sricposv_mpc(nx, nu, N, pnz, hpBAbt, hpQ, hux, pL, pBAbtL, COMPUTE_MULT, hpi, &info);

		if(PRINTRES==1)
			{
			/* print result */
			printf("\n\nsv\n\n");
			for(ii=0; ii<N; ii++)
				s_print_mat(1, nu, hux[ii], 1);
			}
		if(PRINTRES==1 && COMPUTE_MULT==1)
			{
			// print result 
			printf("\n\nsv\n\n");
			for(ii=0; ii<N; ii++)
				s_print_mat(1, nx, hpi[ii+1], 1);
			}

		// corrector
	
		// clear solution 
		for(ii=0; ii<N; ii++)
			{
			for(jj=0; jj<nu; jj++) hux[ii][jj] = 0;
			for(jj=0; jj<nx; jj++) hux[ii+1][nu+jj] = 0;
			}

		// restore linear part of cost function 
		for(ii=0; ii<N; ii++)
			{
			for(jj=0; jj<nx+nu; jj++) hq[ii][jj] = Q[nx+nu+pnz*jj];
			}
		for(jj=0; jj<nx+nu; jj++) hq[N][jj] = Q[nx+nu+pnz*jj];

		// call the solver 
		sricpotrs_mpc(nx, nu, N, pnz, hpBAbt, hpQ, hq, hux, pBAbtL, COMPUTE_MULT, hpi);

		if(PRINTRES==1)
			{
			// print result 
			printf("\n\ntrs\n\n");
			for(ii=0; ii<N; ii++)
				s_print_mat(1, nu, hux[ii], 1);
			}
		if(PRINTRES==1 && COMPUTE_MULT==1)
			{
			// print result 
			printf("\n\ntrs\n\n");
			for(ii=0; ii<N; ii++)
				s_print_mat(1, nx, hpi[ii+1], 1);
			}

		// restore cost function 
		for(ii=0; ii<N; ii++)
			{
			for(jj=0; jj<pnz*pnz; jj++) hpQ[ii][jj]=pQ[jj];
			}
		for(jj=0; jj<pnz*pnz; jj++) hpQ[N][jj]=pQ[jj];

		// restore linear part of cost function 
		for(ii=0; ii<N; ii++)
			{
			for(jj=0; jj<nx+nu; jj++) hq[ii][jj] = Q[nx+nu+pnz*jj];
			}
		for(jj=0; jj<nx+nu; jj++) hq[N][jj] = Q[nx+nu+pnz*jj];

		// residuals computation
		sres(nx, nu, N, pnz, hpBAbt, hpQ, hq, hux, hpi, hrq, hrb);

		if(PRINTRES==1 && COMPUTE_MULT==1)
			{
			// print result 
			printf("\n\nres\n\n");
			for(ii=0; ii<N; ii++)
				s_print_mat(1, nx+nu, hrq[ii], 1);
			for(ii=0; ii<N; ii++)
				s_print_mat(1, nx, hrb[ii], 1);
			}



		// timing 
		struct timeval tv0, tv1, tv2;

		gettimeofday(&tv0, NULL); // start of the sv timing

		// factorization + solution (sv), repeated nrep times
		for(rep=0; rep<nrep; rep++)
			{
			// restore cost function 
			for(ii=0; ii<N; ii++)
				{
				for(jj=0; jj<pnz*pnz; jj++) hpQ[ii][jj]=pQ[jj];
				}
			for(jj=0; jj<pnz*pnz; jj++) hpQ[N][jj]=pQ[jj];

			// call the solver 
			sricposv_mpc(nx, nu, N, pnz, hpBAbt, hpQ, hux, pL, pBAbtL, COMPUTE_MULT, hpi, &info);
			}
			
		gettimeofday(&tv1, NULL); // end of the sv timing, start of the trs timing

		// solution only (trs), repeated nrep times
		for(rep=0; rep<nrep; rep++)
			{
			// clear solution 
			for(ii=0; ii<N; ii++)
				{
				for(jj=0; jj<nu; jj++) hux[ii][jj] = 0;
				for(jj=0; jj<nx; jj++) hux[ii+1][nu+jj] = 0;
				}

			// restore linear part of cost function 
			for(ii=0; ii<N; ii++)
				{
				for(jj=0; jj<nx+nu; jj++) hq[ii][jj] = Q[nx+nu+pnz*jj];
				}
			for(jj=0; jj<nx+nu; jj++) hq[N][jj] = Q[nx+nu+pnz*jj];

			// call the solver 
			sricpotrs_mpc(nx, nu, N, pnz, hpBAbt, hpQ, hq, hux, pBAbtL, COMPUTE_MULT, hpi);
			}
		
		gettimeofday(&tv2, NULL); // end of the trs timing



		// average time per call and corresponding flop counts
		float time_sv = (float) (tv1.tv_sec-tv0.tv_sec)/(nrep+0.0)+(tv1.tv_usec-tv0.tv_usec)/(nrep*1e6);
		float flop_sv = (1.0/3.0*nx*nx*nx+3.0/2.0*nx*nx) + N*(7.0/3.0*nx*nx*nx+4.0*nx*nx*nu+2.0*nx*nu*nu+1.0/3.0*nu*nu*nu+13.0/2.0*nx*nx+9.0*nx*nu+5.0/2.0*nu*nu);
		if(COMPUTE_MULT==1)
			flop_sv += N*2*nx*nx;
		float Gflops_sv = 1e-9*flop_sv/time_sv;
	
		float time_trs = (float) (tv2.tv_sec-tv1.tv_sec)/(nrep+0.0)+(tv2.tv_usec-tv1.tv_usec)/(nrep*1e6);
		float flop_trs = N*(8.0*nx*nx+8.0*nx*nu+2.0*nu*nu);
		if(COMPUTE_MULT==1)
			flop_trs += N*2*nx*nx;
		float Gflops_trs = 1e-9*flop_trs/time_trs;
		
		float Gflops_max = flops_max * GHz_max;

		// table header: a literal percent sign has to be written as %% in a printf format
		if(ll==0)
			printf("\nnx\tnu\tN\tsv time\t\tsv Gflops\tsv %%\t\ttrs time\ttrs Gflops\ttrs %%\n\n");
		printf("%d\t%d\t%d\t%e\t%f\t%f\t%e\t%f\t%f\n", nx, nu, N, time_sv, Gflops_sv, 100.0*Gflops_sv/Gflops_max, time_trs, Gflops_trs, 100.0*Gflops_trs/Gflops_max);

/************************************************
* release everything allocated in this iteration
************************************************/

		free(A);
		free(B);
		free(b);
		free(x0);
		free(BAb);
		free(BAbt);
		free(pBAbt);
		free(psBAbt); // hpBAbt[] only aliases this buffer, so it is freed once
		free(Q);
		free(pQ);
		free(pL);
		free(pBAbtL);
		for(jj=0; jj<N; jj++)
			{
			free(hpQ[jj]);
			free(hq[jj]);
			free(hux[jj]);
			free(hpi[jj]);
			free(hrb[jj]);
			free(hrq[jj]);
			}
		free(hpQ[N]);
		free(hq[N]);
		free(hux[N]);
		free(hpi[N]);
		free(hrq[N]);
	


		} // increase size

	printf("\n");
	printf("\n");
	printf("\n");

	return 0;

	}
Exemplo n.º 4
0
Arquivo: math.c Projeto: acados/acados
/* Allocates a zero-filled row-by-row matrix of doubles.
 * Prints a message and terminates the program if the allocation fails. */
static double *pade_zeros(int row)
{
    size_t count = (size_t) row * (size_t) row;
    double *M = calloc(count, sizeof(double));
    if (M == NULL && count > 0)
    {
        printf("%s\n", "Memory allocation failed in padeapprox");
        exit(1);
    }
    return M;
}

/* Allocates a row-by-row identity matrix (same failure policy as pade_zeros). */
static double *pade_eye(int row)
{
    double *M = pade_zeros(row);
    for (int ii = 0; ii < row; ii++) M[ii * (row + 1)] = 1.0;
    return M;
}

/*
 * Computes the Pade approximation of degree m of the matrix A.
 *
 * m    degree of the approximant; must be 3, 5, 7, 9 or 13. Any other value
 *      prints an error message and terminates the program with exit(1).
 * row  number of rows (and columns) of A.
 * A    row*row doubles; overwritten with the result.
 *
 * With c[] the coefficient table of the chosen degree, the function builds
 *     U = A * (sum of c[odd k]  * A^(k-1))
 *     V =      sum of c[even k] * A^k
 * and then calls dgesv_3l with D = V - U and right-hand side A = V + U, which
 * is expected to leave the solution of (V - U) * X = (V + U) in A. The status
 * returned through `info` is not inspected.
 *
 * The code relies on dgemm_nn_3l overwriting its output matrix: U and temp
 * are reused as outputs while still holding earlier intermediate values.
 */
void padeapprox(int m, int row, double *A)
{
    if (m != 3 && m != 5 && m != 7 && m != 9 && m != 13)
    {
        printf("%s\n", "Wrong Pade approximatin degree");
        exit(1);
    }

    int row2 = row * row;

    double *U = pade_zeros(row);
    double *V = pade_zeros(row);
    /* Scratch matrix. It starts zeroed for every degree; previously the
     * degree-3 path multiplied uninitialised memory by zero, which can
     * produce NaN. */
    double *temp = pade_zeros(row);

    /* Even powers of A. Only the ones needed by the requested degree are
     * computed; the others stay NULL, which free() accepts. */
    double *A0 = pade_eye(row);
    double *A2 = pade_zeros(row);
    double *A4 = NULL;
    double *A6 = NULL;
    double *A8 = NULL;

    dgemm_nn_3l(row, row, row, A, row, A, row, A2, row);
    if (m >= 5)
    {
        A4 = pade_zeros(row);
        dgemm_nn_3l(row, row, row, A2, row, A2, row, A4, row);
    }
    if (m >= 7)
    {
        A6 = pade_zeros(row);
        dgemm_nn_3l(row, row, row, A4, row, A2, row, A6, row);
    }
    if (m == 9)
    {
        A8 = pade_zeros(row);
        dgemm_nn_3l(row, row, row, A6, row, A2, row, A8, row);
    }

    if (m == 3)
    {
        double c[] = {120, 60, 12, 1};
        /* U = A * (c3*A^2 + c1*I) */
        daxpy_3l(row2, c[3], A2, temp);
        daxpy_3l(row2, c[1], A0, temp);
        dgemm_nn_3l(row, row, row, A, row, temp, row, U, row);
        /* V = c2*A^2 + c0*I */
        daxpy_3l(row2, c[2], A2, V);
        daxpy_3l(row2, c[0], A0, V);
    }
    else if (m == 5)
    {
        double c[] = {30240, 15120, 3360, 420, 30, 1};
        /* U = A * (A^4 + c3*A^2 + c1*I); the leading coefficient c5 is 1 */
        dmcopy(row, row, A4, row, temp, row);
        daxpy_3l(row2, c[3], A2, temp);
        daxpy_3l(row2, c[1], A0, temp);
        dgemm_nn_3l(row, row, row, A, row, temp, row, U, row);
        /* V = c4*A^4 + c2*A^2 + c0*I */
        dmcopy(row, row, A4, row, V, row);
        dscal_3l(row2, c[4], V);
        daxpy_3l(row2, c[2], A2, V);
        daxpy_3l(row2, c[0], A0, V);
    }
    else if (m == 7)
    {
        double c[] = {17297280, 8648640, 1995840, 277200, 25200, 1512, 56, 1};
        /* U = A * (c7*A^6 + c5*A^4 + c3*A^2 + c1*I) */
        daxpy_3l(row2, c[3], A2, temp);
        daxpy_3l(row2, c[1], A0, temp);
        daxpy_3l(row2, c[5], A4, temp);
        daxpy_3l(row2, c[7], A6, temp);
        dgemm_nn_3l(row, row, row, A, row, temp, row, U, row);
        /* V = c6*A^6 + c4*A^4 + c2*A^2 + c0*I */
        daxpy_3l(row2, c[2], A2, V);
        daxpy_3l(row2, c[0], A0, V);
        daxpy_3l(row2, c[4], A4, V);
        daxpy_3l(row2, c[6], A6, V);
    }
    else if (m == 9)
    {
        double c[] = {17643225600, 8821612800, 2075673600, 302702400, 30270240,
                      2162160,     110880,     3960,       90,        1};
        /* U = A * (A^8 + c7*A^6 + c5*A^4 + c3*A^2 + c1*I); c9 is 1 */
        dmcopy(row, row, A8, row, temp, row);
        daxpy_3l(row2, c[3], A2, temp);
        daxpy_3l(row2, c[1], A0, temp);
        daxpy_3l(row2, c[5], A4, temp);
        daxpy_3l(row2, c[7], A6, temp);
        dgemm_nn_3l(row, row, row, A, row, temp, row, U, row);
        /* V = c8*A^8 + c6*A^6 + c4*A^4 + c2*A^2 + c0*I */
        dmcopy(row, row, A8, row, V, row);
        dscal_3l(row2, c[8], V);
        daxpy_3l(row2, c[2], A2, V);
        daxpy_3l(row2, c[0], A0, V);
        daxpy_3l(row2, c[4], A4, V);
        daxpy_3l(row2, c[6], A6, V);
    }
    else
    {
        /* m == 13 */
        double c[] = {64764752532480000,
                      32382376266240000,
                      7771770303897600,
                      1187353796428800,
                      129060195264000,
                      10559470521600,
                      670442572800,
                      33522128640,
                      1323241920,
                      40840800,
                      960960,
                      16380,
                      182,
                      1};
        /* U = A * (A^6*(c13*A^6 + c11*A^4 + c9*A^2)
         *          + c7*A^6 + c5*A^4 + c3*A^2 + c1*I) */
        dmcopy(row, row, A2, row, U, row);
        dscal_3l(row2, c[9], U);
        daxpy_3l(row2, c[11], A4, U);
        daxpy_3l(row2, c[13], A6, U);
        dgemm_nn_3l(row, row, row, A6, row, U, row, temp, row);
        daxpy_3l(row2, c[7], A6, temp);
        daxpy_3l(row2, c[5], A4, temp);
        daxpy_3l(row2, c[3], A2, temp);
        daxpy_3l(row2, c[1], A0, temp);
        dgemm_nn_3l(row, row, row, A, row, temp, row, U, row);
        /* V = A^6*(c12*A^6 + c10*A^4 + c8*A^2)
         *     + c6*A^6 + c4*A^4 + c2*A^2 + c0*I
         * The c8 factor has to be applied to temp, which holds A^2 here.
         * It used to be applied to V, which is still all zeros at this point,
         * so the A^2 term silently got coefficient 1 instead of c8. */
        dmcopy(row, row, A2, row, temp, row);
        dscal_3l(row2, c[8], temp);
        daxpy_3l(row2, c[12], A6, temp);
        daxpy_3l(row2, c[10], A4, temp);
        dgemm_nn_3l(row, row, row, A6, row, temp, row, V, row);
        daxpy_3l(row2, c[6], A6, V);
        daxpy_3l(row2, c[4], A4, V);
        daxpy_3l(row2, c[2], A2, V);
        daxpy_3l(row2, c[0], A0, V);
    }

    free(A0);
    free(A2);
    free(A4);
    free(A6);
    free(A8);
    free(temp);

    /* A = V + U (right-hand side), D = V - U (system matrix) */
    double *D = pade_zeros(row);
    dmcopy(row, row, V, row, A, row);
    daxpy_3l(row2, 1.0, U, A);
    dmcopy(row, row, V, row, D, row);
    daxpy_3l(row2, -1.0, U, D);

    int *ipiv = calloc(row, sizeof(int));
    if (ipiv == NULL && row > 0)
    {
        printf("%s\n", "Memory allocation failed in padeapprox");
        exit(1);
    }
    int info = 0;
    dgesv_3l(row, row, D, row, ipiv, A, row, &info);

    free(ipiv);
    free(D);
    free(U);
    free(V);
}