/*
 * Times a single scalar double-precision addition and prints the operands,
 * the result and the rate reported by perf_mflops.
 */
int main()
{
    perf_t t_begin, t_end;
    double lhs = 1;
    double rhs = 1;
    double sum;
    double rate;
    long flop_count = 1; /* exactly one addition is timed */

    /* Placeholder kept from the original: warm-up runs, possible cache flush, ... */

    /* Timed region: one scalar addition. */
    perf(&t_begin);
    sum = lhs + rhs;
    perf(&t_end);

    /* Sanity check of the computed value. */
    printf("%lf = %lf + %lf\n", sum, lhs, rhs);

    /*
     * Rate computation. perf_mflops is fed the second argument of perf_diff,
     * so the elapsed time is presumably stored there - confirm in the perf
     * helper's header.
     */
    perf_diff(&t_begin, &t_end);
    rate = perf_mflops(&t_end, flop_count);
    printf("Mflop/s : %lf \n", rate);

    return 0;
}
/*
 * Times one cblas_ddot call on two vectors of 100000 doubles and prints the
 * dot product followed by the rate reported by perf_mflops.
 *
 * Both vectors are initialised with matrix_init(ptr, size, 1). If either
 * allocation fails, a diagnostic is written to stderr and the function
 * returns without running the benchmark.
 */
void test_question_five(){
    double performance;
    double res;
    perf_t start,stop;
    int size = 100000;
    long flop = 2*size; /* one multiply and one add per element */
    double *A = malloc(sizeof *A * size);
    double *B = malloc(sizeof *B * size);

    if (A == NULL || B == NULL) {
        fprintf(stderr, "test_question_five: out of memory\n");
        free(A); /* free(NULL) is a no-op, so this is safe on partial failure */
        free(B);
        return;
    }

    matrix_init(A,size,1);
    matrix_init(B,size,1);

    /* Placeholder kept from the original: warm-up runs, possible cache flush, ... */

    /* Timed region: a single dot product. */
    perf(&start);
    res = cblas_ddot(size,A,1,B,1);
    perf(&stop);

    /* Sanity check of the computed value. */
    printf("Result %lf\n",res);

    /* Rate computation from the measured interval. */
    perf_diff(&start, &stop);
    performance = perf_mflops(&stop, flop);
    printf("Mflop/s : %lf \n", performance);

    free(A);
    free(B);
}
/*
 * Sweeps cblas_ddot over growing vector lengths and prints one
 * "<length> <Mflop/s>" line per length.
 *
 * The length starts at 50 and grows by 25% (rounded down) each step, up to
 * and including the allocated capacity of 1000000 elements. If either
 * allocation fails, a diagnostic is written to stderr and the function
 * returns without running the sweep.
 */
void test_question_six(){
    double performance;
    double res;
    perf_t start,stop;
    int size = 1000000;
    int size_temp;
    long flop;
    double *A = malloc(sizeof *A * size);
    double *B = malloc(sizeof *B * size);

    if (A == NULL || B == NULL) {
        fprintf(stderr, "test_question_six: out of memory\n");
        free(A); /* free(NULL) is a no-op, so this is safe on partial failure */
        free(B);
        return;
    }

    matrix_init(A,size,1);
    matrix_init(B,size,1);

    /*
     * Integer growth step: n + n/4 yields the same sequence as truncating
     * n + 0.25*n back to int, without the round trip through double.
     */
    for (size_temp = 50; size_temp <= size; size_temp += size_temp / 4) {
        flop = 2*size_temp; /* one multiply and one add per element */

        /* Timed region: a dot product over the first size_temp elements. */
        perf(&start);
        res = cblas_ddot(size_temp,A,1,B,1);
        perf(&stop);
        (void)res; /* only the timing is reported */

        perf_diff(&start, &stop);
        performance = perf_mflops(&stop, flop);
        printf("%d %lf \n",size_temp, performance);
    }

    free(A);
    free(B);
}
int main(){ int size; double performance, performance1; perf_t start,stop,start1,stop1; unsigned long long flop; printf("################################################################################\n"); printf(" Performance test \n"); printf("################################################################################\n"); printf("I - TRSM\n"); printf("________\n\n"); printf("Initiating matrix ..."); fflush(stdout); size = 4000; flop = (unsigned long long ) size*size*size; double * A = (double*) malloc(sizeof(double)*size*size); double * B = (double*) malloc(sizeof(double)*size*size); initiate_matrix(size,size,A); initiate_matrix(size,size,B); printf("[OK]\n"); printf("Running cblas_trsm ..."); fflush(stdout); perf(&start); cblas_dtrsm(CblasColMajor, CblasLeft, 121, CblasNoTrans, CblasNonUnit,size,size,1.,A,size,B,size); perf(&stop); perf_diff(&start, &stop); performance = perf_mflops(&stop, flop); printf("[OK]\n"); printf("Running PRCD trsm ..."); fflush(stdout); perf(&start1); trsm_l(size,A,B,LOWER,size); perf(&stop1); perf_diff(&start1, &stop1); performance1= perf_mflops(&stop1, flop); printf("[OK]\n"); printf("Time Cblas version : "); perf_printh(&stop); printf("Time PRCD version : "); perf_printh(&stop1); printf("Matrix size : %dx%d\n",size,size); printf("Cblas version : %lf Mflop/s\n",performance); printf("PRCD version : %lf Mflop/s\n",performance1); free(A); free(B); printf("\n\nII - LU seq\n"); printf("___________\n\n"); printf("Initiating matrix ..."); fflush(stdout); size = 4000; flop = (unsigned long long) 2*size*size*size/3; A = (double*) malloc(sizeof(double)*size*size); initiate_matrix(size,size,A); printf("[OK]\n"); printf("Running cblas_lu ..."); fflush(stdout); perf(&start); //lapack_int * aa = malloc(sizeof(lapack_int)); // LAPACKE_dgetrf(LAPACK_COL_MAJOR, size, size, A, size, aa ); // cblas_dtrsm(CblasColMajor, CblasLeft, 121, CblasNoTrans, CblasNonUnit,size,size,1.,A,size,B,size); perf(&stop); perf_diff(&start, &stop); performance = 
perf_mflops(&stop, flop); printf("[OK]\n"); printf("Running PRCD simple_lu ..."); fflush(stdout); perf(&start1); simple_lu(size,A,size); perf(&stop1); perf_diff(&start1, &stop1); performance1= perf_mflops(&stop1, flop); printf("[OK]\n"); printf("Matrix size : %dx%d\n",size,size); // printf("Cblas version : %lf Mflop/s\n",performance); printf("PRCD version : %lf Mflop/s\n",performance1); free(A); printf("\n\nIII - Best block size on the architecture\n"); printf("_________________________________________\n\n"); printf("Initiating matrix ..."); fflush(stdout); printf("[OK]\n"); int values[10] = {100,200,300,500,600,700,800,900,1000,1100}; int i; for ( i = 0; i < 10 ; ++i){ size = values[i]; flop = (unsigned long long ) 2*size*size*size/3; A = (double*) malloc(sizeof(double)*size*size); initiate_matrix(size,size,A); fflush(stdout); perf(&start1); simple_lu(size,A,size); perf(&stop1); perf_diff(&start1, &stop1); performance1= perf_mflops(&stop1, flop); printf("BlockSize %d %lf Mflop/s\n",size,performance1); free(A); } printf("\n\nIV - Block version of LU seq\n"); printf("____________________________\n"); printf("Initiating matrix ..."); fflush(stdout); size = 4000; flop = (unsigned long long) 2*size*size*size/3; A = (double*) malloc(sizeof(double)*size*size); initiate_matrix(size,size,A); printf("[OK]\n"); printf("Running cblas_lu ..."); fflush(stdout); perf(&start); // LAPACKE_dgetrf(LAPACK_COL_MAJOR, size, size, A, size, aa ); // cblas_dtrsm(CblasColMajor, CblasLeft, 121, CblasNoTrans, CblasNonUnit,size,size,1.,A,size,B,size); perf(&stop); perf_diff(&start, &stop); performance = perf_mflops(&stop, flop); printf("[OK]\n"); printf("Running PRCD simple_lu block using block of size %d...",200); fflush(stdout); perf(&start1); lu_distributed(size,A,200); perf(&stop1); perf_diff(&start1, &stop1); performance1= perf_mflops(&stop1, flop); printf("[OK]\n"); printf("Matrix size : %dx%d\n",size,size); // printf("Cblas version : %lf Mflop/s\n",performance); printf("PRCD version : 
%lf Mflop/s\n",performance1); free(A); printf("\n\nIV - Block version of LU parallel\n"); printf("_________________________________\n"); printf("Initiating matrix ..."); fflush(stdout); size = 4000; flop = (unsigned long long) 2*size*size*size/3; A = (double*) malloc(sizeof(double)*size*size); initiate_matrix(size,size,A); printf("[OK]\n"); printf("Running cblas_lu ..."); fflush(stdout); perf(&start); // LAPACKE_dgetrf(LAPACK_COL_MAJOR, size, size, A, size, aa ); // cblas_dtrsm(CblasColMajor, CblasLeft, 121, CblasNoTrans, CblasNonUnit,size,size,1.,A,size,B,size); perf(&stop); perf_diff(&start, &stop); performance = perf_mflops(&stop, flop); printf("[OK]\n"); printf("Running PRCD parallel lu block using block of size %d...",200); fflush(stdout); perf(&start1); lu_distributed_parallel(size,A,200); perf(&stop1); perf_diff(&start1, &stop1); performance1= perf_mflops(&stop1, flop); printf("[OK]\n"); printf("Matrix size : %dx%d\n",size,size); // printf("Cblas version : %lf Mflop/s\n",performance); printf("PRCD version : %lf Mflop/s\n",performance1); free(A); printf("Initiating matrix ..."); fflush(stdout); size = 20000; flop = (unsigned long long) 2*size*size*size/3; A = (double*) malloc(sizeof(double)*size*size); initiate_matrix(size,size,A); printf("[OK]\n"); printf("Running cblas_lu ..."); fflush(stdout); perf(&start); // LAPACKE_dgetrf(LAPACK_COL_MAJOR, size, size, A, size, aa ); // cblas_dtrsm(CblasColMajor, CblasLeft, 121, CblasNoTrans, CblasNonUnit,size,size,1.,A,size,B,size); perf(&stop); perf_diff(&start, &stop); performance = perf_mflops(&stop, flop); printf("[OK]\n"); printf("Running PRCD parallel lu block using block of size %d...",200); fflush(stdout); perf(&start1); lu_distributed_parallel(size,A,200); perf(&stop1); perf_diff(&start1, &stop1); performance1= perf_mflops(&stop1, flop); printf("[OK]\n"); printf("Matrix size : %dx%d\n",size,size); // printf("Cblas version : %lf Mflop/s\n",performance); printf("PRCD version : %lf Mflop/s\n",performance1); 
free(A); return EXIT_SUCCESS; }
int main(int argc, char* argv[]) { char dummy[L2_CACHE_SIZE]; // Tests de performances de ddot int size = 50; blas_t *matriceD, *matriceE; alloc_vecteur(&matriceD, size); alloc_vecteur(&matriceE, size); printf("Tests de performance de la fonction ddot\n"); perf_t *t1, *t2,*t3, *t4,*t5, *t6,*t7, *t8, *t9, *t10; t1 = malloc(sizeof(perf_t)); t2 = malloc(sizeof(perf_t)); t3 = malloc(sizeof(perf_t)); t4 = malloc(sizeof(perf_t)); t5 = malloc(sizeof(perf_t)); t6 = malloc(sizeof(perf_t)); t7 = malloc(sizeof(perf_t)); t8 = malloc(sizeof(perf_t)); t9 = malloc(sizeof(perf_t)); t10 = malloc(sizeof(perf_t)); double mflops, mflops1,mflops2,mflops3,mflops4, mflops5; char command[200]; system("rm results/ddot_perf.txt"); for(size = 50; size < 100000000; size += size/4) { printf("M: %d ", size); if(size != 50) { free(matriceD); free(matriceE); alloc_vecteur(&matriceD, size); alloc_vecteur(&matriceE, size); } memset(dummy, 0, sizeof(dummy)); perf(t1); blas_t res = cblas_ddot(size, matriceD, 1, matriceE, 1); perf(t2); perf_diff(t1, t2); mflops = perf_mflops(t2, 2 * size); printf("Mflops/s: %le\n", mflops); sprintf(command, "echo %d %lf >> results/ddot_perf.txt", size, mflops); system(command); } // Test de performance dgemm ////////////////////////////////////////// long m = 100; blas_t *matriceA, *matriceB, *matriceC; alloc_matrice(&matriceA, m, m); alloc_matrice(&matriceB, m, m); matriceC = calloc(m*m,sizeof(blas_t)); system("rm results/dgemm_perf.txt"); for(; m< 1000; m+=20) { printf("M: %d ", m); if(m != 100) { free(matriceA); free(matriceB); free(matriceC); alloc_matrice(&matriceA, m, m); alloc_matrice(&matriceB, m, m); alloc_matrice(&matriceC, m, m); } memset(dummy, 0, sizeof(dummy)); perf(t1); cblas_dgemm_scalaire( CblasNoTrans, CblasNoTrans ,m, m, m, 1, matriceA, m, matriceB, m, 1, matriceC, m); perf(t2); perf_diff(t1, t2); mflops1 = perf_mflops(t2, m * m * m * 3 + m * m ); perf(t3); cblas_dgemm_scalaire1(matriceC, m, matriceA, m, matriceB, m, m); perf(t4); perf_diff(t3, t4); 
mflops2 = perf_mflops(t4, m * m * m * 3); perf(t5); cblas_dgemm_scalaire2(matriceC, m, matriceA, m, matriceB, m, m); perf(t6); perf_diff(t5, t6); mflops3 = perf_mflops(t6, m * m * m * 3); perf(t7); cblas_dgemm_scalaire3(matriceC, m, matriceA, m, matriceB, m, m); perf(t8); perf_diff(t7, t8); mflops4 = perf_mflops(t8, m * m * m * 3); perf(t9); cblas_dgemm(CblasColMajor, CblasTrans, CblasNoTrans, m, m,m, 1, matriceA, m, matriceB, m, 1, matriceC, m); perf(t10); perf_diff(t9, t10); mflops5 = perf_mflops(t10, m * m * m * 3); sprintf(command, "echo %d %lf %lf %lf %lf %lf >> results/dgemm_perf.txt", m * m, mflops1, mflops2, mflops3, mflops4, mflops5); system(command); printf("Mflops/s : %d %lf %lf %lf %lf %lf\n", m * m, mflops1, mflops2, mflops3, mflops4, mflops5 ); } free(matriceA); free(matriceB); free(matriceC); free(matriceD); free(matriceE); return EXIT_SUCCESS; }