int main() { printf("\n"); printf("\n"); printf("\n"); printf(" HPMPC -- Library for High-Performance implementation of solvers for MPC.\n"); printf(" Copyright (C) 2014 by Technical University of Denmark. All rights reserved.\n"); printf("\n"); printf(" HPMPC is distributed in the hope that it will be useful,\n"); printf(" but WITHOUT ANY WARRANTY; without even the implied warranty of\n"); printf(" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n"); printf(" See the GNU Lesser General Public License for more details.\n"); printf("\n"); printf("\n"); printf("\n"); printf("BLAS performance test - single precision\n"); printf("\n"); // maximum frequency of the processor const float GHz_max = GHZ_MAX; printf("Frequency used to compute theoretical peak: %5.1f GHz (edit test_param.h to modify this value).\n", GHz_max); printf("\n"); // maximum flops per cycle, single precision #if defined(TARGET_X64_AVX2) const float flops_max = 32; printf("Testing BLAS version for AVX2 & FMA3 instruction sets, 64 bit: theoretical peak %5.1f Gflops\n", flops_max*GHz_max); #elif defined(TARGET_X64_AVX) const float flops_max = 16; printf("Testing BLAS version for AVX instruction set, 64 bit: theoretical peak %5.1f Gflops\n", flops_max*GHz_max); #elif defined(TARGET_X64_SSE3) || defined(TARGET_AMD_SSE3) const float flops_max = 8; printf("Testing BLAS version for SSE3 instruction set, 64 bit: theoretical peak %5.1f Gflops\n", flops_max*GHz_max); #elif defined(TARGET_CORTEX_A15) const float flops_max = 8; printf("Testing solvers for ARMv7a NEON instruction set, oprimized for Cortex A15: theoretical peak %5.1f Gflops\n", flops_max*GHz_max); #elif defined(TARGET_CORTEX_A9) const float flops_max = 4; printf("Testing solvers for ARMv7a NEON instruction set, oprimized for Cortex A9: theoretical peak %5.1f Gflops\n", flops_max*GHz_max); #elif defined(TARGET_CORTEX_A7) const float flops_max = 2; printf("Testing solvers for ARMv7a NEON instruction set, oprimized for Cortex A7: theoretical peak %5.1f Gflops\n", flops_max*GHz_max); #elif defined(TARGET_X86_ATOM) const float flops_max = 4; printf("Testing BLAS version for SSE3 instruction set, 32 bit, optimized for Intel Atom: theoretical peak %5.1f Gflops\n", flops_max*GHz_max); #elif defined(TARGET_POWERPC_G2) const float flops_max = 2; printf("Testing BLAS version for POWERPC instruction set, 32 bit: theoretical peak %5.1f Gflops\n", flops_max*GHz_max); #elif defined(TARGET_C99_4X4) const float flops_max = 2; printf("Testing reference BLAS version, 4x4 kernel: theoretical peak %5.1f Gflops\n", flops_max*GHz_max); #elif defined(TARGET_C99_2X2) const float flops_max = 2; printf("Testing reference BLAS version, 2x2 kernel: theoretical peak %5.1f Gflops\n", flops_max*GHz_max); #endif printf("\n"); printf("\n"); printf("\n"); FILE *f; f = fopen("./test_problems/results/test_blas.m", "w"); // a #if defined(TARGET_X64_AVX2) fprintf(f, "C = 's_x64_avx2';\n"); fprintf(f, "\n"); #elif defined(TARGET_X64_AVX) fprintf(f, "C = 's_x64_avx';\n"); fprintf(f, "\n"); #elif defined(TARGET_X64_SSE3) fprintf(f, "C = 's_x64_sse3';\n"); fprintf(f, "\n"); #elif defined(TARGET_CORTEX_A15) fprintf(f, "C = 's_ARM_cortex_A15';\n"); fprintf(f, "\n"); #elif defined(TARGET_CORTEX_A9) fprintf(f, "C = 's_ARM_cortex_A9';\n"); fprintf(f, "\n"); #elif defined(TARGET_CORTEX_A7) fprintf(f, "C = 's_ARM_cortex_A7';\n"); fprintf(f, "\n"); #elif defined(TARGET_X86_ATOM) fprintf(f, "C = 's_x86_atom';\n"); fprintf(f, "\n"); #elif defined(TARGET_POWERPC_G2) fprintf(f, "C = 's_PowerPC_G2';\n"); fprintf(f, "\n"); #elif defined(TARGET_C99_4X4) fprintf(f, "C = 's_c99_2x2';\n"); fprintf(f, "\n"); #elif defined(TARGET_C99_2X2) fprintf(f, "C = 's_c99_4x4';\n"); fprintf(f, "\n"); #endif fprintf(f, "A = [%f %f];\n", GHz_max, flops_max); fprintf(f, "\n"); fprintf(f, "B = [\n"); int i, j, rep, ll; const int bsd = D_MR; //d_get_mr(); const int bss = S_MR; //s_get_mr(); /* int info = 0;*/ printf("\nn\tGflops dgemm %%\tGflops dsyrk %%\tGflops dtrmm %%\tGflops dpotrf %%\tGflops dgemv_n%%\tGflops dgemv_t%%\tGflops dsymv %%\tGflops dtrmv_n%%\tGflops dtrmv_t%%\tGflops dmvmv%%\n\n"); int nn[] = {4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92, 96, 100, 104, 108, 112, 116, 120, 124, 128, 132, 136, 140, 144, 148, 152, 156, 160, 164, 168, 172, 176, 180, 184, 188, 192, 196, 200, 204, 208, 212, 216, 220, 224, 228, 232, 236, 240, 244, 248, 252, 256, 260, 264, 268, 272, 276, 280, 284, 288, 292, 296, 300, 304, 308, 312, 316, 320, 324, 328, 332}; int nnrep[] = {10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 400, 400, 400, 400, 400, 200, 200, 200, 200, 200, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100}; for(ll=0; ll<75; ll++) { int n = nn[ll]; int nrep = nnrep[ll]; double *A; d_zeros(&A, n, n); double *B; d_zeros(&B, n, n); double *C; d_zeros(&C, n, n); float *sA; s_zeros(&sA, n, n); float *sB; s_zeros(&sB, n, n); float *sC; s_zeros(&sC, n, n); for(i=0; i<n*n; i++) A[i] = i; for(i=0; i<n; i++) B[i*(n+1)] = 1; for(i=0; i<n*n; i++) sA[i] = i; for(i=0; i<n; i++) sB[i*(n+1)] = 1; int pns = ((n+bss-1)/bss)*bss; int cns = ((n+S_NCL-1)/S_NCL)*S_NCL; int cns2 = ((2*n+S_NCL-1)/S_NCL)*S_NCL; float *pA; s_zeros_align(&pA, pns, cns); float *pB; s_zeros_align(&pB, pns, cns); float *pC; s_zeros_align(&pC, pns, cns); float *pD; s_zeros_align(&pD, pns, cns); float *pE; s_zeros_align(&pE, pns, cns2); float *pF; s_zeros_align(&pF, 2*pns, cns); float *pL; s_zeros_align(&pL, pns, cns); float *x; s_zeros_align(&x, pns, 1); float *y; s_zeros_align(&y, pns, 1); float *x2; s_zeros_align(&x2, 2*pns, 1); float *y2; s_zeros_align(&y2, 2*pns, 1); float *diag; s_zeros_align(&diag, pns, 1); s_cvt_mat2pmat(n, n, 0, bss, sA, n, pA, cns); s_cvt_mat2pmat(n, n, 0, bss, sB, n, pB, cns); s_cvt_mat2pmat(n, n, 0, bss, sB, n, pD, cns); s_cvt_mat2pmat(n, n, 0, bss, sA, n, pE, cns2); for(i=0; i<pns*cns; i++) pC[i] = -1; for(i=0; i<pns; i++) x[i] = 1; for(i=0; i<pns; i++) x2[i] = 1; /* timing */ struct timeval tv0, tv1, tv2, tv3, tv4, tv5, tv6, tv7, tv8, tv9, tv10, tv11, tv12; /* warm up */ for(rep=0; rep<nrep; rep++) { sgemm_nt_lib(n, n, n, pA, cns, pB, cns, pC, cns, 0); } gettimeofday(&tv0, NULL); // start for(rep=0; rep<nrep; rep++) { sgemm_nt_lib(n, n, n, pA, cns, pB, cns, pC, cns, 0); } gettimeofday(&tv1, NULL); // stop for(rep=0; rep<nrep; rep++) { ssyrk_spotrf_lib(n, n, n, pE, cns2, pD, cns, diag); } gettimeofday(&tv2, NULL); // stop for(rep=0; rep<nrep; rep++) { strmm_lib(n, n, pA, cns, pB, cns, pC, cns); } gettimeofday(&tv3, NULL); // stop for(rep=0; rep<nrep; rep++) { strtr_l_lib(n, 0, pA, cns, pC, cns); // triangualr matrix transpose } gettimeofday(&tv4, NULL); // stop for(rep=0; rep<nrep; rep++) { sgemv_n_lib(n, n, pA, cns, x, y, 0); } gettimeofday(&tv5, NULL); // stop for(rep=0; rep<nrep; rep++) { sgemv_t_lib(n, n, 0, pA, cns, x, y, 0); } gettimeofday(&tv6, NULL); // stop for(rep=0; rep<nrep; rep++) { strmv_u_n_lib(n, pA, cns, x, y, 0); } gettimeofday(&tv7, NULL); // stop for(rep=0; rep<nrep; rep++) { strmv_u_t_lib(n, pA, cns, x, y, 0); } gettimeofday(&tv8, NULL); // stop for(rep=0; rep<nrep; rep++) { strsv_sgemv_n_lib(n, 2*n, pF, cns, x2); } gettimeofday(&tv9, NULL); // stop for(rep=0; rep<nrep; rep++) { strsv_sgemv_t_lib(n, 2*n, pF, cns, x2); } gettimeofday(&tv10, NULL); // stop for(rep=0; rep<nrep; rep++) { ssymv_lib(n, 0, pA, cns, x, y, 0); } gettimeofday(&tv11, NULL); // stop for(rep=0; rep<nrep; rep++) { smvmv_lib(n, n, 0, pA, cns, x, y, x2, y2, 0); } gettimeofday(&tv12, NULL); // stop float Gflops_max = flops_max * GHz_max; float time_dgemm = (float) (tv1.tv_sec-tv0.tv_sec)/(nrep+0.0)+(tv1.tv_usec-tv0.tv_usec)/(nrep*1e6); float flop_dgemm = 2.0*n*n*n; float Gflops_dgemm = 1e-9*flop_dgemm/time_dgemm; float time_dsyrk_dpotrf = (float) (tv2.tv_sec-tv1.tv_sec)/(nrep+0.0)+(tv2.tv_usec-tv1.tv_usec)/(nrep*1e6); float flop_dsyrk_dpotrf = 1.0*n*n*n + 1.0/3.0*n*n*n; float Gflops_dsyrk_dpotrf = 1e-9*flop_dsyrk_dpotrf/time_dsyrk_dpotrf; float time_dtrmm = (float) (tv3.tv_sec-tv2.tv_sec)/(nrep+0.0)+(tv3.tv_usec-tv2.tv_usec)/(nrep*1e6); float flop_dtrmm = 1.0*n*n*n; float Gflops_dtrmm = 1e-9*flop_dtrmm/time_dtrmm; float time_dtrtr = (float) (tv4.tv_sec-tv3.tv_sec)/(nrep+0.0)+(tv4.tv_usec-tv3.tv_usec)/(nrep*1e6); float flop_dtrtr = 0.5*n*n; // 0.5*n*n elements float Gflops_dtrtr = 1e-9*flop_dtrtr/time_dtrtr; float time_dgemv_n = (float) (tv5.tv_sec-tv4.tv_sec)/(nrep+0.0)+(tv5.tv_usec-tv4.tv_usec)/(nrep*1e6); float flop_dgemv_n = 2.0*n*n; float Gflops_dgemv_n = 1e-9*flop_dgemv_n/time_dgemv_n; float time_dgemv_t = (float) (tv6.tv_sec-tv5.tv_sec)/(nrep+0.0)+(tv6.tv_usec-tv5.tv_usec)/(nrep*1e6); float flop_dgemv_t = 2.0*n*n; float Gflops_dgemv_t = 1e-9*flop_dgemv_t/time_dgemv_t; float time_dtrmv_n = (float) (tv7.tv_sec-tv6.tv_sec)/(nrep+0.0)+(tv7.tv_usec-tv6.tv_usec)/(nrep*1e6); float flop_dtrmv_n = 1.0*n*n; float Gflops_dtrmv_n = 1e-9*flop_dtrmv_n/time_dtrmv_n; float time_dtrmv_t = (float) (tv8.tv_sec-tv7.tv_sec)/(nrep+0.0)+(tv8.tv_usec-tv7.tv_usec)/(nrep*1e6); float flop_dtrmv_t = 1.0*n*n; float Gflops_dtrmv_t = 1e-9*flop_dtrmv_t/time_dtrmv_t; float time_dtrsv_n = (float) (tv9.tv_sec-tv8.tv_sec)/(nrep+0.0)+(tv9.tv_usec-tv8.tv_usec)/(nrep*1e6); float flop_dtrsv_n = 3.0*n*n; float Gflops_dtrsv_n = 1e-9*flop_dtrsv_n/time_dtrsv_n; float time_dtrsv_t = (float) (tv10.tv_sec-tv9.tv_sec)/(nrep+0.0)+(tv10.tv_usec-tv9.tv_usec)/(nrep*1e6); float flop_dtrsv_t = 3.0*n*n; float Gflops_dtrsv_t = 1e-9*flop_dtrsv_t/time_dtrsv_t; float time_dsymv = (float) (tv11.tv_sec-tv10.tv_sec)/(nrep+0.0)+(tv11.tv_usec-tv10.tv_usec)/(nrep*1e6); float flop_dsymv = 2.0*n*n; float Gflops_dsymv = 1e-9*flop_dsymv/time_dsymv; float time_dmvmv = (float) (tv12.tv_sec-tv11.tv_sec)/(nrep+0.0)+(tv12.tv_usec-tv11.tv_usec)/(nrep*1e6); float flop_dmvmv = 4.0*n*n; float Gflops_dmvmv = 1e-9*flop_dmvmv/time_dmvmv; printf("%d\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\n", n, Gflops_dgemm, 100.0*Gflops_dgemm/Gflops_max, Gflops_dsyrk_dpotrf, 100.0*Gflops_dsyrk_dpotrf/Gflops_max, Gflops_dtrmm, 100.0*Gflops_dtrmm/Gflops_max, Gflops_dtrtr, 100.0*Gflops_dtrtr/Gflops_max, Gflops_dgemv_n, 100.0*Gflops_dgemv_n/Gflops_max, Gflops_dgemv_t, 100.0*Gflops_dgemv_t/Gflops_max, Gflops_dtrmv_n, 100.0*Gflops_dtrmv_n/Gflops_max, Gflops_dtrmv_t, 100.0*Gflops_dtrmv_t/Gflops_max, Gflops_dtrsv_n, 100.0*Gflops_dtrsv_n/Gflops_max, Gflops_dtrsv_t, 100.0*Gflops_dtrsv_t/Gflops_max, Gflops_dsymv, 100.0*Gflops_dsymv/Gflops_max, Gflops_dmvmv, 100.0*Gflops_dmvmv/Gflops_max); fprintf(f, "%d\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\n", n, Gflops_dgemm, 100.0*Gflops_dgemm/Gflops_max, Gflops_dsyrk_dpotrf, 100.0*Gflops_dsyrk_dpotrf/Gflops_max, Gflops_dtrmm, 100.0*Gflops_dtrmm/Gflops_max, Gflops_dtrtr, 100.0*Gflops_dtrtr/Gflops_max, Gflops_dgemv_n, 100.0*Gflops_dgemv_n/Gflops_max, Gflops_dgemv_t, 100.0*Gflops_dgemv_t/Gflops_max, Gflops_dtrmv_n, 100.0*Gflops_dtrmv_n/Gflops_max, Gflops_dtrmv_t, 100.0*Gflops_dtrmv_t/Gflops_max, Gflops_dtrsv_n, 100.0*Gflops_dtrsv_n/Gflops_max, Gflops_dtrsv_t, 100.0*Gflops_dtrsv_t/Gflops_max, Gflops_dsymv, 100.0*Gflops_dsymv/Gflops_max, Gflops_dmvmv, 100.0*Gflops_dmvmv/Gflops_max); free(A); free(B); free(pA); free(pB); free(pC); free(pD); free(pE); free(pF); free(pL); free(x); free(y); free(x2); free(y2); } printf("\n"); fprintf(f, "];\n"); fclose(f); return 0; }
int main() { int i, j, rep; const int bs = D_MR; //d_get_mr(); const int bss = S_MR; //s_get_mr(); printf("\nbs = %d\n\n", bss); int n = 16; int nrep = 1000000; double *A; d_zeros(&A, n, n); double *B; d_zeros(&B, n, n); double *C; d_zeros(&C, n, n); double *L; d_zeros(&L, n, n); float *sA; s_zeros(&sA, n, n); float *sB; s_zeros(&sB, n, n); for(i=0; i<n*n; i++) { A[i] = i; sA[i] = i; } B[0] = 2; /* B[1] = 1;*/ sB[0] = 2; /* sB[1] = 1;*/ for(i=1; i<n-1; i++) { /* B[i*(n+1)-1] = 1;*/ B[i*(n+1)+0] = 2; /* B[i*(n+1)+1] = 1;*/ /* sB[i*(n+1)-1] = 1;*/ sB[i*(n+1)+0] = 2; /* sB[i*(n+1)+1] = 1;*/ } /* B[n*n-2] = 1;*/ B[n*n-1] = 2; /* sB[n*n-2] = 1;*/ sB[n*n-1] = 2; for(i=0; i<n; i++) C[i*(n+1)] = 2; for(i=0; i<n-1; i++) C[1+i*(n+1)] = 1; /*sB[1*(n+1)] = 2;*/ /* d_print_mat(n, n, C, n);*/ int pn = ((n+bs-1)/bs)*bs;//+4; int pns = ((n+bss-1)/bss)*bss;//+4; int cns = ((n+S_NCL-1)/S_NCL)*S_NCL;//+4; int cns2 = ((2*n+S_NCL-1)/S_NCL)*S_NCL; double *pA; d_zeros_align(&pA, pn, pn); double *pB; d_zeros_align(&pB, pn, pn); double *pC; d_zeros_align(&pC, pn, pn); double *pL; d_zeros_align(&pL, pn, pn); float *spA; s_zeros_align(&spA, pns, cns); float *spB; s_zeros_align(&spB, pns, cns); float *spC; s_zeros_align(&spC, pns, cns); float *spD; s_zeros_align(&spD, pns, cns); float *spE; s_zeros_align(&spE, pns, cns2); float *diag; s_zeros_align(&diag, pns, 1); d_cvt_mat2pmat(n, n, 0, bs, A, n, pA, pn); d_cvt_mat2pmat(n, n, 0, bs, B, n, pB, pn); s_cvt_mat2pmat(n, n, 0, bss, sA, n, spA, cns); s_cvt_mat2pmat(n, n, 0, bss, sB, n, spB, cns); s_cvt_mat2pmat(n, n, 0, bss, sB, n, spC, cns); s_cvt_mat2pmat(n, n, 0, bss, sB, n, spD, cns); s_cvt_mat2pmat(n, n, 0, bss, sA, n, spE, cns2); double *x; d_zeros_align(&x, n, 1); double *y; d_zeros_align(&y, n, 1); x[2] = 1; /* for(i=0; i<pn*pn; i++) pC[i] = -1;*/ /* for(i=0; i<pn*pn; i++) spC[i] = -1;*/ // d_print_pmat(pn, pn, bs, pA, pn); // d_print_pmat(pn, pn, bs, pB, pn); // d_print_pmat(pn, pn, bs, pC, pn); // d_print_mat(n, n, B, n); // double *x; d_zeros_align(&x, pn, 1); // double *y; d_zeros_align(&y, pn, 1); // x[3] = 1.0; /* d_cvt_mat2pmat(n, n, bs-n%bs, bs, C, n, pC+((bs-n%bs))%bs*(bs+1), pn);*/ /* d_print_pmat(pn, pn, bs, pC, pn);*/ /* s_print_pmat(n, n, bss, spD, cns);*/ /* s_print_pmat(n, n+4, bss, spE, cns2);*/ /* timing */ struct timeval tv0, tv1; gettimeofday(&tv0, NULL); // start /* d_print_pmat(n, n, bs, pC, pn);*/ for(rep=0; rep<nrep; rep++) { /* sgemm_nt_lib(n, n, n, spA, cns, spB, cns, spC, cns, 0);*/ ssyrk_spotrf_lib(n, n, n, spE, cns2, spD, cns, diag); /* strtr_l_lib(11, 3, spA+3, cns, spC, cns);*/ /* sgemm_nt_lib(n, n, n, spB, pns, spA, pns, spC, pns, 0);*/ /* dgemm_nt_lib(n, n, n, pA, pn, pB, pn, pC, pn, 0);*/ /* dgemm_nt_lib(n, n, n, pB, pn, pA, pn, pC, pn, 0);*/ /* dtrmm_pup_nn_lib(n, n, pA, pn, B, n, pC, pn);*/ /* dsyrk_ppp_lib(n, n, pA, pn, pC, pn);*/ /* dgemm_ppp_nt_lib(n, n, n, pA, pn, pA, pn, pB, pn, 0);*/ /* dtrmm_ppp_lib(n, n, 0, pA, pn, pB, pn, pC, pn);*/ /* dpotrf_dcopy_lib(n, 0, pC, pn, pL, pn);*/ /* dgemm_pup_nn_lib(n, n, n, pA, pn, B, n, pC, pn, 0);*/ /* dgemm_ppp_nt_lib(n, n, n, pA, pn, pA, pn, pC+(bs-n)*(bs+1), pn, 1);*/ /* d_print_pmat(pn, pn, bs, pC, pn);*/ /* dpotrf_p_dcopy_u_lib(n, (bs-n%bs)%bs, pC+((bs-n%bs))%bs*(bs+1), pn, L, n);*/ /* d_print_pmat(pn, pn, bs, pC, pn);*/ /* d_print_mat(n, n, L, n);*/ /* exit(2);*/ // dgemm_nt_lib(n, n, n, A, n, B, n, C, n, 0); // dgemm_nt_lib_asm(n, n, n, pA, pn, pB, pn, pC, pn, 0); // sgemm_nt_lib_neon(n, n, n, spA, pns, spB, pns, spC, pns, 0); // dsymm_nt_lib(n, n, A, n, B, n, C, n); // dpotrf_lib(n, B, n); // dgemm_nt_lib2(n, pB, pA, pC, pn); // dgemv_n_lib(n-1, n, 1, pn, pA+1, x, y); // dtrmv_n_lib(n-1, 1, pA+1, pn, x, y); /* dtrmv_t_lib(n-1, 1, pA+1, pn, x, y);*/ } gettimeofday(&tv1, NULL); // stop float time = (float) (tv1.tv_sec-tv0.tv_sec)/(nrep+0.0)+(tv1.tv_usec-tv0.tv_usec)/(nrep*1e6); float flop = 2.0*n*n*n; /* float flop = 1.0*n*n;*/ // float flop = 1.0/3.0*n*n*n; float Gflops = 1e-9*flop/time; float Gflops_max = 1*1; printf("\nn\tGflops\t\t%%\n%d\t%f\t%f\n\n", n, Gflops, 100.0*Gflops/Gflops_max); if(n<=24) { // d_print_pmat(pn, pn, bs, pC, pn); // d_print_pmat(n, n, bs, pB, pn); /* d_print_pmat(n, n, bs, pA, pn);*/ /* d_print_mat(n, n, B, n);*/ /* d_print_pmat(n, n, bs, pB, pn);*/ /* d_print_pmat(n, n, bs, pC, pn);*/ /* d_print_pmat(n, n, bs, pL, pn);*/ s_print_pmat(n, n, bss, spA, cns); s_print_pmat(n, n, bss, spB, cns); /* s_print_pmat(n, n, bss, spC, cns);*/ s_print_pmat(n, n, bss, spE+n*bss, cns2); /* d_print_mat(n, 1, y, pn);*/ } return 0; }