/*
 * MATLAB MEX gateway that forwards its right-hand-side arguments to
 * FLA_Axpyt_external().
 *
 *   nlhs, plhs : left-hand-side count/array; not used by this gateway.
 *   nrhs, prhs : right-hand-side count/array. NRHS arguments are converted
 *                into NINT integer attributes and NOBJ FLAME objects. When
 *                exactly NRHS+1 arguments are passed, the extra argument
 *                receives the elapsed time reported by FLA_Clock() around
 *                the call.
 */
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
{
  const int want_timing = ( nrhs == NRHS + 1 );
  double   *elapsed;
  FLA_Obj   operands[NOBJ];
  int       int_args[NINT];

  FLA_Init();

  /* Validate the argument count, then translate the MATLAB arguments into
     their FLAME C counterparts. */
  FLA_M2C_CheckNumArgs(NRHS, nrhs);
  FLA_M2C_ConvertArgs(NRHS, prhs, NINT, int_args, operands);

  /* The optional trailing argument is the destination for the timing. */
  if (want_timing) {
    elapsed  = FLA_M2C_ConvertDoublePtr(prhs[NRHS]);
    *elapsed = FLA_Clock();
  }

  FLA_Axpyt_external(int_args[0], operands[0], operands[1], operands[2]);

  if (want_timing)
    *elapsed = FLA_Clock() - *elapsed;

  FLA_Finalize();
}
int main(int argc, char *argv[]) { int m, n, k, nfirst, nlast, ninc, i, irep, nrepeats, nb_alg, check;; double dtime, dtime_best, gflops, max_gflops, diff, d_n; FLA_Obj A, B, C, Cref, Cold; /* Initialize FLAME */ FLA_Init( ); /* Every time trial is repeated "repeat" times */ printf( "%% number of repeats:" ); scanf( "%d", &nrepeats ); printf( "%% %d\n", nrepeats ); /* Enter the max GFLOPS attainable */ printf( "%% enter max GFLOPS:" ); scanf( "%lf", &max_gflops ); printf( "%% %lf\n", max_gflops ); /* Enter the algorithmic block size */ printf( "%% enter nb_alg:" ); scanf( "%d", &nb_alg ); printf( "%% %d\n", nb_alg ); /* Timing trials for matrix sizes n=nfirst to nlast in increments of ninc will be performed */ printf( "%% enter nfirst, nlast, ninc:" ); scanf( "%d%d%d", &nfirst, &nlast, &ninc ); printf( "%% %d %d %d\n", nfirst, nlast, ninc ); i = 1; for ( n=nfirst; n<= nlast; n+=ninc ){ /* Allocate space for the matrices */ FLA_Obj_create( FLA_DOUBLE, n, n, 1, n, &A ); FLA_Obj_create( FLA_DOUBLE, n, n, 1, n, &B ); FLA_Obj_create( FLA_DOUBLE, n, n, 1, n, &C ); FLA_Obj_create( FLA_DOUBLE, n, n, 1, n, &Cref ); FLA_Obj_create( FLA_DOUBLE, n, n, 1, n, &Cold ); /* Generate random matrices L and B */ FLA_Random_matrix( A ); FLA_Random_matrix( B ); FLA_Random_matrix( Cold ); gflops = 2.0 * n * n * n * 1.0e-09; /* Time FLA_Symm */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Cold, Cref ); dtime = FLA_Clock(); FLA_Symm( FLA_LEFT, FLA_LOWER_TRIANGULAR, FLA_ONE, A, B, FLA_ONE, Cref ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? 
dtime : dtime_best ); } printf( "data_FLAME( %d, 1:2 ) = [ %d %le ];\n", i, n, gflops / dtime_best ); fflush( stdout ); /* Time the your implementations */ #if TEST_UNB_VAR1==TRUE /* Variant 1 unblocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Cold, C ); dtime = FLA_Clock(); Symm_unb_var1( A, B, C ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( C, Cref ); printf( "data_unb_var1( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); #endif #if TEST_BLK_VAR1==TRUE /* Variant 1 blocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Cold, C ); dtime = FLA_Clock(); Symm_blk_var1( A, B, C, nb_alg ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( C, Cref ); printf( "data_blk_var1( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); #endif #if TEST_UNB_VAR2==TRUE /* Variant 2 unblocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Cold, C ); dtime = FLA_Clock(); Symm_unb_var2( A, B, C ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( C, Cref ); printf( "data_unb_var2( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); #endif #if TEST_BLK_VAR2==TRUE /* Variant 2 blocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Cold, C ); dtime = FLA_Clock(); Symm_blk_var2( A, B, C, nb_alg ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? 
dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( C, Cref ); printf( "data_blk_var2( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); #endif #if TEST_UNB_VAR3==TRUE /* Variant 3 unblocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Cold, C ); dtime = FLA_Clock(); Symm_unb_var3( A, B, C ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( C, Cref ); printf( "data_unb_var3( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); #endif #if TEST_BLK_VAR3==TRUE /* Variant 3 blocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Cold, C ); dtime = FLA_Clock(); Symm_blk_var3( A, B, C, nb_alg ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( C, Cref ); printf( "data_blk_var3( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); #endif #if TEST_UNB_VAR4==TRUE /* Variant 4 unblocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Cold, C ); dtime = FLA_Clock(); Symm_unb_var4( A, B, C ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( C, Cref ); printf( "data_unb_var4( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); #endif #if TEST_BLK_VAR4==TRUE /* Variant 4 blocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Cold, C ); dtime = FLA_Clock(); Symm_blk_var4( A, B, C, nb_alg ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? 
dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( C, Cref ); printf( "data_blk_var4( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); #endif #if TEST_UNB_VAR5==TRUE /* Variant 5 unblocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Cold, C ); dtime = FLA_Clock(); Symm_unb_var5( A, B, C ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( C, Cref ); printf( "data_unb_var5( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); #endif #if TEST_BLK_VAR5==TRUE /* Variant 5 blocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Cold, C ); dtime = FLA_Clock(); Symm_blk_var5( A, B, C, nb_alg ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( C, Cref ); printf( "data_blk_var5( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); #endif #if TEST_UNB_VAR6==TRUE /* Variant 6 unblocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Cold, C ); dtime = FLA_Clock(); Symm_unb_var6( A, B, C ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( C, Cref ); printf( "data_unb_var6( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); #endif #if TEST_BLK_VAR6==TRUE /* Variant 6 blocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Cold, C ); dtime = FLA_Clock(); Symm_blk_var6( A, B, C, nb_alg ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? 
dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( C, Cref ); printf( "data_blk_var6( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); #endif #if TEST_UNB_VAR7==TRUE /* Variant 7 unblocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Cold, C ); dtime = FLA_Clock(); Symm_unb_var7( A, B, C ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( C, Cref ); printf( "data_unb_var7( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); #endif #if TEST_BLK_VAR7==TRUE /* Variant 4 blocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Cold, C ); dtime = FLA_Clock(); Symm_blk_var7( A, B, C, nb_alg ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( C, Cref ); printf( "data_blk_var7( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); #endif #if TEST_UNB_VAR8==TRUE /* Variant 8 unblocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Cold, C ); dtime = FLA_Clock(); Symm_unb_var8( A, B, C ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( C, Cref ); printf( "data_unb_var8( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); #endif #if TEST_BLK_VAR8==TRUE /* Variant 4 blocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Cold, C ); dtime = FLA_Clock(); Symm_blk_var8( A, B, C, nb_alg ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? 
dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( C, Cref ); printf( "data_blk_var8( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); #endif FLA_Obj_free( &A ); FLA_Obj_free( &B ); FLA_Obj_free( &C ); FLA_Obj_free( &Cref ); FLA_Obj_free( &Cold ); printf( "\n" ); i++; } /* Print the MATLAB commands to plot the data */ /* Delete all existing figures */ printf( "close all\n" ); /* Plot the performance of FLAME */ printf( "plot( data_FLAME( :,1 ), data_FLAME( :, 2 ), 'k--' ); \n" ); /* Indicate that you want to add to the existing plot */ printf( "hold on\n" ); /* Plot the performance of the reference implementation */ // printf( "plot( data_REF( :,1 ), data_REF( :, 2 ), 'k-' ); \n" ); /* Plot the performance of your implementations */ #if TEST_UNB_VAR1==TRUE printf( "plot( data_unb_var1( :,1 ), data_unb_var1( :, 2 ), 'r-.' ); \n" ); #endif #if TEST_UNB_VAR2==TRUE printf( "plot( data_unb_var2( :,1 ), data_unb_var2( :, 2 ), 'g-.' ); \n" ); #endif #if TEST_UNB_VAR3==TRUE printf( "plot( data_unb_var3( :,1 ), data_unb_var3( :, 2 ), 'b-.' ); \n" ); #endif #if TEST_UNB_VAR4==TRUE printf( "plot( data_unb_var4( :,1 ), data_unb_var4( :, 2 ), 'm-.' ); \n" ); #endif #if TEST_UNB_VAR5==TRUE printf( "plot( data_unb_var5( :,1 ), data_unb_var5( :, 2 ), 'c-.' ); \n" ); #endif #if TEST_UNB_VAR6==TRUE printf( "plot( data_unb_var6( :,1 ), data_unb_var6( :, 2 ), 'y-.' ); \n" ); #endif #if TEST_UNB_VAR7==TRUE printf( "plot( data_unb_var7( :,1 ), data_unb_var7( :, 2 ), 'k-.' 
); \n" ); #endif #if TEST_UNB_VAR8==TRUE printf( "plot( data_unb_var8( :,1 ), data_unb_var8( :, 2 ), 'm:' ); \n" ); #endif #if TEST_BLK_VAR1==TRUE printf( "plot( data_blk_var1( :,1 ), data_blk_var1( :, 2 ), 'r--' ); \n" ); #endif #if TEST_BLK_VAR2==TRUE printf( "plot( data_blk_var2( :,1 ), data_blk_var2( :, 2 ), 'g--' ); \n" ); #endif #if TEST_BLK_VAR3==TRUE printf( "plot( data_blk_var3( :,1 ), data_blk_var3( :, 2 ), 'b--' ); \n" ); #endif #if TEST_BLK_VAR4==TRUE printf( "plot( data_blk_var4( :,1 ), data_blk_var4( :, 2 ), 'm--' ); \n" ); #endif #if TEST_BLK_VAR5==TRUE printf( "plot( data_blk_var5( :,1 ), data_blk_var5( :, 2 ), 'c--' ); \n" ); #endif #if TEST_BLK_VAR6==TRUE printf( "plot( data_blk_var6( :,1 ), data_blk_var6( :, 2 ), 'y--' ); \n" ); #endif #if TEST_BLK_VAR7==TRUE printf( "plot( data_blk_var7( :,1 ), data_blk_var7( :, 2 ), 'k--' ); \n" ); #endif #if TEST_BLK_VAR8==TRUE printf( "plot( data_blk_var8( :,1 ), data_blk_var8( :, 2 ), 'm-' ); \n" ); #endif printf( "hold on \n"); printf( "xlabel( 'matrix dimension m=n' );\n"); printf( "ylabel( 'GFLOPS/sec.' 
);\n"); // printf( "axis( [ 0 %d 0 %3.1f ] ); \n", nlast, max_gflops ); printf( "legend( 'FLA Trsm', ...\n"); #if TEST_UNB_VAR1==TRUE printf( " 'unb var1', ...\n"); #endif #if TEST_UNB_VAR2==TRUE printf( " 'unb var2', ...\n"); #endif #if TEST_UNB_VAR3==TRUE printf( " 'unb var3', ...\n"); #endif #if TEST_UNB_VAR4==TRUE printf( " 'unb var4', ...\n"); #endif #if TEST_UNB_VAR5==TRUE printf( " 'unb var5', ...\n"); #endif #if TEST_UNB_VAR6==TRUE printf( " 'unb var6', ...\n"); #endif #if TEST_UNB_VAR7==TRUE printf( " 'unb var7', ...\n"); #endif #if TEST_UNB_VAR8==TRUE printf( " 'unb var8', ...\n"); #endif #if TEST_BLK_VAR1==TRUE printf( " 'blk var1', ...\n"); #endif #if TEST_BLK_VAR2==TRUE printf( " 'blk var2', ...\n"); #endif #if TEST_BLK_VAR3==TRUE printf( " 'blk var3', ...\n"); #endif #if TEST_BLK_VAR4==TRUE printf( " 'blk var4', ...\n"); #endif #if TEST_BLK_VAR5==TRUE printf( " 'blk var5', ...\n"); #endif #if TEST_BLK_VAR6==TRUE printf( " 'blk var6', ...\n"); #endif #if TEST_BLK_VAR7==TRUE printf( " 'blk var7', ...\n"); #endif #if TEST_BLK_VAR8==TRUE printf( " 'blk var8', ...\n"); #endif printf( " 2 );\n"); FLA_Finalize( ); }
/*
 * Time the application of Q to the hierarchical matrix B and report the best
 * run.
 *
 *   param_combo : only combination 0 is handled; other values time nothing.
 *   type        : FLA_ALG_REFERENCE runs REF_Apply_Q on flattened copies,
 *                 FLA_ALG_FRONT runs FLASH_Apply_Q_UT on A, T, W, B; any
 *                 other value prints "trouble".
 *   nrepeats    : number of timed repetitions.
 *   m, n        : not used by this routine.
 *   B_ref       : written by the reference run, read by every other run.
 *   dtime       : out, smallest elapsed time over all repeats.
 *   diff        : out, 0.0 for the reference run, otherwise the element-wise
 *                 maximum difference between B and B_ref.
 *   gflops      : out, rate derived from the dimensions of A and B.
 *
 * B is restored to its contents on entry before returning.
 */
void time_Apply_Q( int param_combo, int type, int nrepeats, int m, int n, FLA_Obj A, FLA_Obj B, FLA_Obj B_ref, FLA_Obj t, FLA_Obj T, FLA_Obj W, double *dtime, double *diff, double *gflops )
{
  double  best_time = 1.0e9;
  int     rep;
  FLA_Obj B_backup, A_dense, B_dense;

  /* Scratch objects: a hierarchical backup of B plus flat images of A and B
     for the reference code path. */
  FLASH_Obj_create_conf_to( FLA_NO_TRANSPOSE, B, &B_backup );
  FLASH_Obj_create_flat_conf_to_hier( FLA_NO_TRANSPOSE, A, &A_dense );
  FLASH_Obj_create_flat_conf_to_hier( FLA_NO_TRANSPOSE, B, &B_dense );

  FLASH_Copy( B, B_backup );

  for ( rep = 0; rep < nrepeats; ++rep )
  {
    /* Every repetition starts from the same input. */
    FLASH_Copy( B_backup, B );
    FLASH_Obj_flatten( A, A_dense );
    FLASH_Obj_flatten( B, B_dense );

    *dtime = FLA_Clock();

    if ( param_combo == 0 )
    {
      if ( type == FLA_ALG_REFERENCE )
        REF_Apply_Q( FLA_LEFT, FLA_TRANSPOSE, FLA_COLUMNWISE, A_dense, t, B_dense );
      else if ( type == FLA_ALG_FRONT )
        FLASH_Apply_Q_UT( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_FORWARD, FLA_COLUMNWISE, A, T, W, B );
      else
        printf("trouble\n");
    }

    *dtime = FLA_Clock() - *dtime;
    best_time = min( *dtime, best_time );
  }

  /* Finish with a triangular solve; the reference run stores its result in
     B_ref, every other run is compared against it. */
  if ( type != FLA_ALG_REFERENCE )
  {
    FLASH_Trsm( FLA_LEFT, FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_ONE, A, B );
    *diff = FLASH_Max_elemwise_diff( B, B_ref );
  }
  else
  {
    FLA_Trsm_external( FLA_LEFT, FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_ONE, A_dense, B_dense );
    FLASH_Obj_hierarchify( B_dense, B_ref );
    *diff = 0.0;
  }

  *gflops = 2.0 * FLASH_Obj_scalar_length( A ) * FLASH_Obj_scalar_width( A ) * FLASH_Obj_scalar_width( B ) / best_time / 1.0e9;
  if ( FLA_Obj_is_complex( A ) )
    *gflops *= 4.0;

  *dtime = best_time;

  /* Hand B back unchanged and release the scratch objects. */
  FLASH_Copy( B_backup, B );

  FLASH_Obj_free( &B_backup );
  FLASH_Obj_free( &A_dense );
  FLASH_Obj_free( &B_dense );
}
/*
 * Expands to the `case v:` arm for Her2k_ln variant number v: it dispatches
 * on `type` to the unblocked or blocked implementation of that variant and
 * prints "trouble" for any other type.  Uses time_Her2k_ln()'s locals.
 */
#define HER2K_LN_VARIANT( v )                                                \
  case v:                                                                    \
    switch ( type ) {                                                        \
    case FLA_ALG_UNBLOCKED:                                                  \
      FLA_Her2k_ln_unb_var##v( FLA_ONE, A, B, FLA_ONE, C );                  \
      break;                                                                 \
    case FLA_ALG_BLOCKED:                                                    \
      FLA_Her2k_ln_blk_var##v( FLA_ONE, A, B, FLA_ONE, C, cntl_her2k_var );  \
      break;                                                                 \
    default:                                                                 \
      printf( "trouble\n" );                                                 \
    }                                                                        \
    break

/*
 * Time one Her2k (lower, no-transpose) implementation and report the best
 * run.
 *
 *   variant  : 0 selects REF_Her2k; 1..10 select FLA_Her2k_ln_*_var<variant>.
 *              Other values time nothing.
 *   type     : FLA_ALG_UNBLOCKED or FLA_ALG_BLOCKED (ignored for variant 0).
 *   nrepeats : number of timed repetitions.
 *   n        : not used by this routine.
 *   nb_alg   : block size placed in the control tree for blocked variants.
 *   C_ref    : written when variant == 0, read otherwise.
 *   dtime    : out, smallest elapsed time over all repeats.
 *   diff     : out, 0.0 for variant 0, otherwise the element-wise maximum
 *              difference between C and C_ref.
 *   gflops   : out, rate derived from the dimensions of C and A.
 *
 * C is restored to its contents on entry before returning.
 */
void time_Her2k_ln( int variant, int type, int nrepeats, int n, int nb_alg, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj C_ref, double *dtime, double *diff, double *gflops )
{
  int irep;
  double dtime_old = 1.0e9;
  FLA_Obj C_old;
  fla_blocksize_t* bp;
  fla_gemm_t*  cntl_gemm_blas;
  fla_her2k_t* cntl_her2k_blas;
  fla_her2k_t* cntl_her2k_var;

  /* Control tree for the blocked variants: the requested variant at the
     top, with her2k and gemm subproblem nodes beneath it. */
  bp              = FLA_Blocksize_create( nb_alg, nb_alg, nb_alg, nb_alg );
  cntl_gemm_blas  = FLA_Cntl_gemm_obj_create( FLA_FLAT, FLA_SUBPROBLEM, NULL, NULL );
  cntl_her2k_blas = FLA_Cntl_her2k_obj_create( FLA_FLAT, FLA_SUBPROBLEM, NULL, NULL, NULL, NULL );
  cntl_her2k_var  = FLA_Cntl_her2k_obj_create( FLA_FLAT, variant, bp, cntl_her2k_blas, cntl_gemm_blas, cntl_gemm_blas );

  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &C_old );
  FLA_Copy_external( C, C_old );

  for ( irep = 0 ; irep < nrepeats; irep++ )
  {
    /* Every repetition starts from the original C. */
    FLA_Copy_external( C_old, C );

    *dtime = FLA_Clock();

    switch( variant ){
    case 0:
      // Time reference implementation
      REF_Her2k( FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_ONE, A, B, FLA_ONE, C );
      break;

    // Time variants 1 through 10
    HER2K_LN_VARIANT( 1 );
    HER2K_LN_VARIANT( 2 );
    HER2K_LN_VARIANT( 3 );
    HER2K_LN_VARIANT( 4 );
    HER2K_LN_VARIANT( 5 );
    HER2K_LN_VARIANT( 6 );
    HER2K_LN_VARIANT( 7 );
    HER2K_LN_VARIANT( 8 );
    HER2K_LN_VARIANT( 9 );
    HER2K_LN_VARIANT( 10 );
    }

    *dtime = FLA_Clock() - *dtime;
    dtime_old = min( *dtime, dtime_old );
  }

  FLA_Cntl_obj_free( cntl_her2k_var );
  FLA_Cntl_obj_free( cntl_her2k_blas );
  FLA_Cntl_obj_free( cntl_gemm_blas );
  FLA_Blocksize_free( bp );

  if ( variant == 0 )
  {
    /* The reference result becomes the baseline for later comparisons. */
    FLA_Copy_external( C, C_ref );
    *diff = 0.0;
  }
  else
  {
    *diff = FLA_Max_elemwise_diff( C, C_ref );
  }

  *gflops = 2.0 * FLA_Obj_length( C ) * FLA_Obj_width( C ) * FLA_Obj_width( A ) / dtime_old / 1e9;

  /* Scale for complex data, as the other timing routines in this file do:
     one complex multiply-add costs four times a real one. */
  if ( FLA_Obj_is_complex( A ) )
    *gflops *= 4.0;

  *dtime = dtime_old;

  /* Hand C back unchanged. */
  FLA_Copy_external( C_old, C );

  FLA_Obj_free( &C_old );
}

#undef HER2K_LN_VARIANT
/*
 * Run one Symm experiment: build random operands, time the selected
 * implementation n_repeats times, and report the best performance together
 * with a residual.
 *
 *   params    : supplies the FLASH block size (b_flash) and the flat
 *               algorithmic block size (b_alg_flat).
 *   var       : variant number handed to libfla_test_symm_cntl_create().
 *   sc_str    : per-operand characters passed to libfla_test_obj_create()
 *               for A ([0]), B ([1]) and C ([2]).
 *   datatype  : datatype of all matrices and vectors created here.
 *   p_cur     : current problem size; both m and n are derived from it.
 *   pci       : index into pc_str selecting the side/uplo characters.
 *   n_repeats : number of timed repetitions.
 *   impl      : implementation selector (FLA_TEST_*).
 *   perf      : out, performance of the fastest repetition.
 *   residual  : out, element-wise maximum difference between C*x and
 *               ( beta*C_orig + alpha*op )*x for a random vector x.
 */
void libfla_test_symm_experiment( test_params_t params, unsigned int var, char* sc_str, FLA_Datatype datatype, unsigned int p_cur, unsigned int pci, unsigned int n_repeats, signed int impl, double* perf, double* residual )
{
  dim_t        b_flash    = params.b_flash;
  dim_t        b_alg_flat = params.b_alg_flat;
  double       time_min   = 1e9;
  double       time;
  unsigned int i;
  unsigned int m;
  signed int   m_input    = -1;
  unsigned int n;
  signed int   n_input    = -1;
  FLA_Side     side;
  FLA_Uplo     uplo;
  FLA_Obj      A, B, C, x, y, z, w, norm;
  FLA_Obj      alpha, beta;
  FLA_Obj      C_save;
  FLA_Obj      A_test, B_test, C_test;

  /* The flat variant and external implementations need a control tree. */
  const int    uses_cntl  = ( impl == FLA_TEST_FLAT_UNB_VAR ||
                              impl == FLA_TEST_FLAT_OPT_VAR ||
                              impl == FLA_TEST_FLAT_BLK_VAR ||
                              impl == FLA_TEST_FLAT_UNB_EXT ||
                              impl == FLA_TEST_FLAT_BLK_EXT );

  // Determine the dimensions. A negative *_input scales p_cur down by its
  // magnitude; a non-negative one uses p_cur directly.
  if ( m_input < 0 ) m = p_cur / abs(m_input);
  else               m = p_cur;
  if ( n_input < 0 ) n = p_cur / abs(n_input);
  else               n = p_cur;

  // Translate parameter characters to libflame constants.
  FLA_Param_map_char_to_flame_side( &pc_str[pci][0], &side );
  FLA_Param_map_char_to_flame_uplo( &pc_str[pci][1], &uplo );

  // Create the matrices for the current operation. A is m x m when applied
  // from the left and n x n from the right; w matches the dimension of A.
  if ( side == FLA_LEFT )
  {
    libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[0], m, m, &A );

    // Create vectors for use in test.
    FLA_Obj_create( datatype, n, 1, 0, 0, &x );
    FLA_Obj_create( datatype, m, 1, 0, 0, &y );
    FLA_Obj_create( datatype, m, 1, 0, 0, &z );
    FLA_Obj_create( datatype, m, 1, 0, 0, &w );
  }
  else
  {
    libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[0], n, n, &A );

    // Create vectors for use in test.
    FLA_Obj_create( datatype, n, 1, 0, 0, &x );
    FLA_Obj_create( datatype, m, 1, 0, 0, &y );
    FLA_Obj_create( datatype, m, 1, 0, 0, &z );
    FLA_Obj_create( datatype, n, 1, 0, 0, &w );
  }
  libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], m, n, &B );
  libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[2], m, n, &C );

  // Create a norm scalar. It is only allocated and freed here; the norm-based
  // residual was replaced by the element-wise comparison below.
  FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );

  // Initialize the test matrices.
  FLA_Random_symm_matrix( uplo, A );
  FLA_Random_matrix( B );
  FLA_Random_matrix( C );

  // Initialize the test vectors.
  FLA_Random_matrix( x );
  FLA_Set( FLA_ZERO, y );
  FLA_Set( FLA_ZERO, z );
  FLA_Set( FLA_ZERO, w );

  // Set constants.
  alpha = FLA_TWO;
  beta  = FLA_MINUS_ONE;

  // Save the original object contents in a temporary object.
  FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, C, &C_save );

  // Use hierarchical matrices if we're testing the FLASH front-end.
  if ( impl == FLA_TEST_HIER_FRONT_END )
  {
    FLASH_Obj_create_hier_copy_of_flat( A, 1, &b_flash, &A_test );
    FLASH_Obj_create_hier_copy_of_flat( B, 1, &b_flash, &B_test );
    FLASH_Obj_create_hier_copy_of_flat( C, 1, &b_flash, &C_test );
  }
  else
  {
    A_test = A;
    B_test = B;
    C_test = C;
  }

  // Create a control tree for the individual variants.
  if ( uses_cntl )
    libfla_test_symm_cntl_create( var, b_alg_flat );

  // Repeat the experiment n_repeats times and record results.
  for ( i = 0; i < n_repeats; ++i )
  {
    // Restore the original C before each timed run.
    if ( impl == FLA_TEST_HIER_FRONT_END )
      FLASH_Obj_hierarchify( C_save, C_test );
    else
      FLA_Copy_external( C_save, C_test );

    time = FLA_Clock();

    libfla_test_symm_impl( impl, side, uplo, alpha, A_test, B_test, beta, C_test );

    time = FLA_Clock() - time;
    time_min = min( time_min, time );
  }

  // Copy the solution to flat matrix C.
  if ( impl == FLA_TEST_HIER_FRONT_END )
  {
    FLASH_Obj_flatten( C_test, C );
  }
  else
  {
    // No action needed since C_test and C refer to the same object.
  }

  // Free the hierarchical matrices if we're testing the FLASH front-end.
  if ( impl == FLA_TEST_HIER_FRONT_END )
  {
    FLASH_Obj_free( &A_test );
    FLASH_Obj_free( &B_test );
    FLASH_Obj_free( &C_test );
  }

  // Free the control trees if we're testing the variants.
  if ( uses_cntl )
    libfla_test_symm_cntl_free();

  // Compute the performance of the best experiment repeat. The product is
  // formed in double precision: in unsigned int it wraps modulo 2^32 once
  // m*m*n (or m*n*n) no longer fits.
  // NOTE(review): the leading factor is 1 here, whereas counting a multiply
  // and an add per term would give 2 -- confirm which convention is intended.
  if ( side == FLA_LEFT )
    *perf = ( 1.0 * m * m * n ) / time_min / FLOPS_PER_UNIT_PERF;
  else
    *perf = ( 1.0 * m * n * n ) / time_min / FLOPS_PER_UNIT_PERF;
  if ( FLA_Obj_is_complex( A ) ) *perf *= 4.0;

  // Compute:
  //   y = C * x
  // and compare to
  //   z = ( beta * C_orig + alpha * A * B ) x   (side = left)
  //   z = ( beta * C_orig + alpha * B * A ) x   (side = right)
  FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_ONE, C, x, FLA_ZERO, y );

  if ( side == FLA_LEFT )
  {
    FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_ONE, B, x, FLA_ZERO, w );
    FLA_Symv_external( uplo, alpha, A, w, FLA_ZERO, z );
  }
  else
  {
    FLA_Symv_external( uplo, FLA_ONE, A, x, FLA_ZERO, w );
    FLA_Gemv_external( FLA_NO_TRANSPOSE, alpha, B, w, FLA_ZERO, z );
  }
  FLA_Gemv_external( FLA_NO_TRANSPOSE, beta, C_save, x, FLA_ONE, z );

  // Compare y and z element-wise.
  *residual = FLA_Max_elemwise_diff( y, z );

  // Free the supporting flat objects.
  FLA_Obj_free( &C_save );

  // Free the flat test matrices.
  FLA_Obj_free( &A );
  FLA_Obj_free( &B );
  FLA_Obj_free( &C );
  FLA_Obj_free( &x );
  FLA_Obj_free( &y );
  FLA_Obj_free( &z );
  FLA_Obj_free( &w );
  FLA_Obj_free( &norm );
}
/*
 * Expands to the `case v:` arm for Sylv_nn variant number v: it dispatches
 * on `type` to the optimized-unblocked or blocked implementation of that
 * variant and prints "trouble" for any other type.  Uses time_Sylv_nn()'s
 * locals.
 */
#define SYLV_NN_VARIANT( v )                                                 \
  case v:                                                                    \
    switch ( type ) {                                                        \
    case FLA_ALG_UNB_OPT:                                                    \
      FLA_Sylv_nn_opt_var##v( isgn, A, B, C, scale );                        \
      break;                                                                 \
    case FLA_ALG_BLOCKED:                                                    \
      FLA_Sylv_nn_blk_var##v( isgn, A, B, C, scale, cntl_sylv_var );         \
      break;                                                                 \
    default:                                                                 \
      printf( "trouble\n" );                                                 \
    }                                                                        \
    break

/*
 * Time one Sylv (no-transpose, no-transpose) implementation and report the
 * best run.
 *
 *   variant   : 0 selects REF_Sylv_nn; 1..18 select
 *               FLA_Sylv_nn_*_var<variant>.  Other values time nothing.
 *   type      : FLA_ALG_UNB_OPT or FLA_ALG_BLOCKED (ignored for variant 0).
 *   n_repeats : number of timed repetitions.
 *   m, n      : problem dimensions used for the operation count.
 *   nb_alg    : block size placed in the control tree for blocked variants.
 *   C_ref     : written when variant == 0, read otherwise.
 *   dtime     : out, smallest elapsed time over all repeats.
 *   diff      : out, 0.0 for variant 0, otherwise the element-wise maximum
 *               difference between C and C_ref.
 *   gflops    : out, ( m*m*n + n*n*m ) operations per best time, in units
 *               of 1e9, times 4 for complex C.
 *
 * C is restored to its contents on entry before returning.
 */
void time_Sylv_nn( int variant, int type, int n_repeats, int m, int n, int nb_alg, FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj C_ref, FLA_Obj scale, double *dtime, double *diff, double *gflops )
{
  int irep;
  double dtime_old = 1.0e9;
  FLA_Obj C_old;
  fla_blocksize_t* bp;
  fla_sylv_t* cntl_sylv_var;
  fla_sylv_t* cntl_sylv_unb;
  fla_gemm_t* cntl_gemm_blas;

  /* Control tree for the blocked variants: the requested variant at the
     top, with an unblocked Sylvester node and gemm subproblem nodes
     beneath it. */
  bp             = FLA_Blocksize_create( nb_alg, nb_alg, nb_alg, nb_alg );
  cntl_sylv_unb  = FLA_Cntl_sylv_obj_create( FLA_FLAT, FLA_UNB_OPT_VARIANT1,
                                             NULL, NULL, NULL, NULL, NULL, NULL,
                                             NULL, NULL, NULL, NULL, NULL, NULL );
  cntl_gemm_blas = FLA_Cntl_gemm_obj_create( FLA_FLAT, FLA_SUBPROBLEM, NULL, NULL, NULL );
  cntl_sylv_var  = FLA_Cntl_sylv_obj_create( FLA_FLAT, variant, bp,
                                             cntl_sylv_unb, cntl_sylv_unb, cntl_sylv_unb,
                                             cntl_gemm_blas, cntl_gemm_blas,
                                             cntl_gemm_blas, cntl_gemm_blas,
                                             cntl_gemm_blas, cntl_gemm_blas,
                                             cntl_gemm_blas, cntl_gemm_blas );

  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &C_old );
  FLA_Copy_external( C, C_old );

  for ( irep = 0 ; irep < n_repeats; irep++ ){
    /* Every repetition starts from the original C. */
    FLA_Copy_external( C_old, C );

    *dtime = FLA_Clock();

    switch( variant ){
    case 0:
      /* Time reference implementation */
      REF_Sylv_nn( isgn, A, B, C, scale );
      break;

    /* Time variants 1 through 18 */
    SYLV_NN_VARIANT( 1 );
    SYLV_NN_VARIANT( 2 );
    SYLV_NN_VARIANT( 3 );
    SYLV_NN_VARIANT( 4 );
    SYLV_NN_VARIANT( 5 );
    SYLV_NN_VARIANT( 6 );
    SYLV_NN_VARIANT( 7 );
    SYLV_NN_VARIANT( 8 );
    SYLV_NN_VARIANT( 9 );
    SYLV_NN_VARIANT( 10 );
    SYLV_NN_VARIANT( 11 );
    SYLV_NN_VARIANT( 12 );
    SYLV_NN_VARIANT( 13 );
    SYLV_NN_VARIANT( 14 );
    SYLV_NN_VARIANT( 15 );
    SYLV_NN_VARIANT( 16 );
    SYLV_NN_VARIANT( 17 );
    SYLV_NN_VARIANT( 18 );
    }

    *dtime = FLA_Clock() - *dtime;
    dtime_old = min( *dtime, dtime_old );
  }

  FLA_Cntl_obj_free( cntl_sylv_var );
  FLA_Cntl_obj_free( cntl_sylv_unb );
  FLA_Cntl_obj_free( cntl_gemm_blas );
  FLA_Blocksize_free( bp );

  if ( variant == 0 ){
    /* The reference result becomes the baseline for later comparisons. */
    FLA_Copy_external( C, C_ref );
    *diff = 0.0;
  }
  else{
    *diff = FLA_Max_elemwise_diff( C, C_ref );
  }

  /* Form the operation count in double precision: evaluated in int, the
     cubic terms overflow (undefined behaviour) once they exceed INT_MAX. */
  *gflops = ( 1.0 * m * m * n + 1.0 * n * n * m ) / dtime_old / 1e9;
  if ( FLA_Obj_is_complex( C ) )
    *gflops *= 4.0;

  *dtime = dtime_old;

  /* Hand C back unchanged. */
  FLA_Copy_external( C_old, C );

  FLA_Obj_free( &C_old );
}

#undef SYLV_NN_VARIANT
/*
 * time_QR_UT
 *
 * Times a QR factorization of A and reports a least-squares residual.
 *
 *   variant   only variant 0 performs any work inside the timed region
 *   type      FLA_ALG_REFERENCE -> REF_QR_UT( A, t )
 *             FLA_ALG_FRONT     -> FLA_QR_UT( A, T )
 *             anything else prints "trouble" (when variant == 0)
 *   nrepeats  number of timed repetitions; the minimum time is kept
 *   m, n      dimensions used for the flop count and the work vectors
 *   A_ref, W  not referenced by this routine
 *   b, b_orig right-hand side and scratch vector for the residual
 *   dtime     out: best (minimum) elapsed time
 *   diff      out: 2-norm of A_save^H * ( b - A_save * x )
 *   gflops    out: flop rate derived from the best time
 *
 * A and b are restored to their contents on entry before returning.
 */
void time_QR_UT( int variant, int type, int nrepeats, int m, int n,
                 FLA_Obj A, FLA_Obj A_ref, FLA_Obj t, FLA_Obj T, FLA_Obj W,
                 FLA_Obj b, FLA_Obj b_orig,
                 double *dtime, double *diff, double *gflops )
{
  int     rep;
  double  best_time = 1.0e9;
  FLA_Obj A_backup, b_backup, nrm, y;

  /* Keep pristine copies of the inputs; the factorization overwrites A. */
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_backup );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, b, &b_backup );

  /* 1x1 real scalar that receives the residual norm. */
  FLA_Obj_create( FLA_Obj_is_single_precision( A ) ? FLA_FLOAT : FLA_DOUBLE,
                  1, 1, 0, 0, &nrm );

  FLA_Copy_external( A, A_backup );
  FLA_Copy_external( b, b_backup );

  for ( rep = 0; rep < nrepeats; rep++ )
  {
    double t_start, elapsed;

    FLA_Copy_external( A_backup, A );

    t_start = FLA_Clock();

    if ( variant == 0 )
    {
      if ( type == FLA_ALG_REFERENCE )
      {
        REF_QR_UT( A, t );
      }
      else if ( type == FLA_ALG_FRONT )
      {
        FLA_QR_UT( A, T );
      }
      else
      {
        printf("trouble\n");
      }
    }

    elapsed   = FLA_Clock() - t_start;
    best_time = min( elapsed, best_time );
  }

  /* Work vector for A_save^H * residual; both solve paths need it. */
  FLA_Obj_create( FLA_Obj_datatype( b ), n, 1, 0, 0, &y );

  if ( type == FLA_ALG_REFERENCE )
  {
    FLA_Obj AT, AB;
    FLA_Obj bT, bB;

    FLA_Copy_external( b, b_orig );

    /* Apply Q^T (real) or Q^H (complex) to b. */
    FLA_Apply_Q_blk_external( FLA_LEFT,
                              FLA_Obj_is_real( A ) ? FLA_TRANSPOSE
                                                   : FLA_CONJ_TRANSPOSE,
                              FLA_COLUMNWISE, A, t, b );

    /* Triangular solve against the top width(A) rows of A and b. */
    FLA_Part_2x1( A, &AT,
                     &AB,    FLA_Obj_width( A ), FLA_TOP );
    FLA_Part_2x1( b, &bT,
                     &bB,    FLA_Obj_width( A ), FLA_TOP );

    FLA_Trsm_external( FLA_LEFT, FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE,
                       FLA_NONUNIT_DIAG, FLA_ONE, AT, bT );

    /* b_orig := b_orig - A_save * x, with x held in bT. */
    FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, A_backup, bT,
                       FLA_ONE, b_orig );
  }
  else
  {
    FLA_Obj x;

    FLA_Obj_create( FLA_Obj_datatype( b ), n, 1, 0, 0, &x );

    FLA_Copy_external( b, b_orig );

    FLA_QR_UT_solve( A, T, b, x );

    /* b_orig := b_orig - A_save * x. */
    FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, A_backup, x,
                       FLA_ONE, b_orig );

    FLA_Obj_free( &x );
  }

  /* diff := || A_save^H * b_orig ||_2 */
  FLA_Gemv_external( FLA_CONJ_TRANSPOSE, FLA_ONE, A_backup, b_orig,
                     FLA_ZERO, y );
  FLA_Nrm2_external( y, nrm );
  FLA_Obj_extract_real_scalar( nrm, diff );

  FLA_Obj_free( &y );

  *gflops = ( 2.0 * m * n * n - ( 2.0 / 3.0 ) * n * n * n ) / best_time / 1e9;

  if ( FLA_Obj_is_complex( A ) )
  {
    *gflops *= 4.0;
  }

  *dtime = best_time;

  /* Hand the caller's objects back unchanged. */
  FLA_Copy_external( A_backup, A );
  FLA_Copy_external( b_backup, b );

  FLA_Obj_free( &A_backup );
  FLA_Obj_free( &b_backup );
  FLA_Obj_free( &nrm );
}
/*
 * time_Gemm_hh
 *
 * Times one variant of the conjugate-transpose / conjugate-transpose
 * matrix-matrix multiply and compares the result against C_ref.
 *
 *   variant   0 runs REF_Gemm; 1..6 select FLA_Gemm_hh_{unb,blk}_varN
 *   type      FLA_ALG_UNBLOCKED or FLA_ALG_BLOCKED for variants 1..6;
 *             any other value prints "trouble"
 *   nrepeats  number of timed repetitions; the minimum time is kept
 *   n         not referenced by this routine
 *   nb_alg    block size placed in the control tree for blocked variants
 *   C_ref     written when variant == 0, otherwise read for comparison
 *   dtime     out: best elapsed time
 *   diff      out: 0.0 for the reference run, otherwise the maximum
 *             element-wise difference between C and C_ref
 *   gflops    out: flop rate derived from the best time
 *
 * C is restored to its contents on entry before returning.
 */
void time_Gemm_hh( int variant, int type, int nrepeats, int n, int nb_alg,
                   FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj C_ref,
                   double *dtime, double *diff, double *gflops )
{
  int              rep;
  double           best_time = 1.0e9;
  FLA_Obj          C_backup;
  fla_blocksize_t* blocksize;
  fla_gemm_t*      cntl_leaf;
  fla_gemm_t*      cntl_var;

  /* Control tree: the chosen variant with a subproblem leaf underneath. */
  blocksize = FLA_Blocksize_create( nb_alg, nb_alg, nb_alg, nb_alg );
  cntl_leaf = FLA_Cntl_gemm_obj_create( FLA_FLAT, FLA_SUBPROBLEM, NULL, NULL );
  cntl_var  = FLA_Cntl_gemm_obj_create( FLA_FLAT, variant, blocksize, cntl_leaf );

  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &C_backup );
  FLA_Copy_external( C, C_backup );

  for ( rep = 0; rep < nrepeats; rep++ )
  {
    double t_start, elapsed;

    /* Every repetition starts from the same C. */
    FLA_Copy_external( C_backup, C );

    t_start = FLA_Clock();

    switch ( variant )
    {
      case 0:
        /* Reference implementation. */
        REF_Gemm( FLA_CONJ_TRANSPOSE, FLA_CONJ_TRANSPOSE,
                  FLA_ONE, A, B, FLA_ONE, C );
        break;

      case 1:
        if ( type == FLA_ALG_UNBLOCKED )
        {
          FLA_Gemm_hh_unb_var1( FLA_ONE, A, B, FLA_ONE, C );
        }
        else if ( type == FLA_ALG_BLOCKED )
        {
          FLA_Gemm_hh_blk_var1( FLA_ONE, A, B, FLA_ONE, C, cntl_var );
        }
        else
        {
          printf("trouble\n");
        }
        break;

      case 2:
        if ( type == FLA_ALG_UNBLOCKED )
        {
          FLA_Gemm_hh_unb_var2( FLA_ONE, A, B, FLA_ONE, C );
        }
        else if ( type == FLA_ALG_BLOCKED )
        {
          FLA_Gemm_hh_blk_var2( FLA_ONE, A, B, FLA_ONE, C, cntl_var );
        }
        else
        {
          printf("trouble\n");
        }
        break;

      case 3:
        if ( type == FLA_ALG_UNBLOCKED )
        {
          FLA_Gemm_hh_unb_var3( FLA_ONE, A, B, FLA_ONE, C );
        }
        else if ( type == FLA_ALG_BLOCKED )
        {
          FLA_Gemm_hh_blk_var3( FLA_ONE, A, B, FLA_ONE, C, cntl_var );
        }
        else
        {
          printf("trouble\n");
        }
        break;

      case 4:
        if ( type == FLA_ALG_UNBLOCKED )
        {
          FLA_Gemm_hh_unb_var4( FLA_ONE, A, B, FLA_ONE, C );
        }
        else if ( type == FLA_ALG_BLOCKED )
        {
          FLA_Gemm_hh_blk_var4( FLA_ONE, A, B, FLA_ONE, C, cntl_var );
        }
        else
        {
          printf("trouble\n");
        }
        break;

      case 5:
        if ( type == FLA_ALG_UNBLOCKED )
        {
          FLA_Gemm_hh_unb_var5( FLA_ONE, A, B, FLA_ONE, C );
        }
        else if ( type == FLA_ALG_BLOCKED )
        {
          FLA_Gemm_hh_blk_var5( FLA_ONE, A, B, FLA_ONE, C, cntl_var );
        }
        else
        {
          printf("trouble\n");
        }
        break;

      case 6:
        if ( type == FLA_ALG_UNBLOCKED )
        {
          FLA_Gemm_hh_unb_var6( FLA_ONE, A, B, FLA_ONE, C );
        }
        else if ( type == FLA_ALG_BLOCKED )
        {
          FLA_Gemm_hh_blk_var6( FLA_ONE, A, B, FLA_ONE, C, cntl_var );
        }
        else
        {
          printf("trouble\n");
        }
        break;

      default:
        /* Unknown variants are timed as a no-op. */
        break;
    }

    elapsed   = FLA_Clock() - t_start;
    best_time = min( elapsed, best_time );
  }

  FLA_Cntl_obj_free( cntl_var );
  FLA_Cntl_obj_free( cntl_leaf );
  FLA_Blocksize_free( blocksize );

  if ( variant == 0 )
  {
    /* The reference run defines the expected answer. */
    FLA_Copy_external( C, C_ref );
    *diff = 0.0;
  }
  else
  {
    *diff = FLA_Max_elemwise_diff( C, C_ref );
  }

  *gflops = 2.0 * FLA_Obj_length( C ) * FLA_Obj_width( C ) *
            FLA_Obj_width( A ) / best_time / 1.0e9;

  if ( FLA_Obj_is_complex( C ) )
  {
    *gflops *= 4.0;
  }

  *dtime = best_time;

  FLA_Copy_external( C_backup, C );
  FLA_Obj_free( &C_backup );
}
/*
 * time_Gemm
 *
 * Times the general matrix-matrix multiply front end (or the reference
 * implementation) for one transposition parameter combination.
 *
 *   param_combo  0..8 selects the (transa, transb) pair; currently every
 *                combination except 4 (no-transpose / no-transpose) is
 *                skipped by the guard at the top of the function
 *   type         FLA_ALG_REFERENCE or FLA_ALG_FRONT; any other value
 *                prints "trouble"
 *   nrepeats     number of timed repetitions; the minimum time is kept
 *   m, k, n      problem dimensions used for the flop count
 *   C_ref        not referenced while the comparison below is disabled
 *   dtime        out: best elapsed time (0.0 when the combination is skipped)
 *   diff         out: always 0.0 while the comparison below is disabled
 *   gflops       out: flop rate (0.0 when the combination is skipped)
 *
 * C is restored to its contents on entry before returning.
 */
void time_Gemm( int param_combo, int type, int nrepeats, int m, int k, int n,
                FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj C_ref,
                double *dtime, double *diff, double *gflops )
{
  int     irep;
  double  dtime_old = 1.0e9;
  FLA_Obj C_old;

  /* Only combination 4 is exercised; report zeros for everything else.
     All three outputs are written so callers never read stale values. */
  if ( param_combo != 4 )
  {
    *dtime  = 0.0;
    *gflops = 0.0;
    *diff   = 0.0;
    return;
  }

  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &C_old );
  FLA_Copy_external( C, C_old );

  for ( irep = 0; irep < nrepeats; irep++ )
  {
    /* Every repetition starts from the same C. */
    FLA_Copy_external( C_old, C );

    *dtime = FLA_Clock();

    switch ( param_combo )
    {
      /* Parameter combination 0 */
      case 0:
        switch ( type )
        {
          case FLA_ALG_REFERENCE:
            REF_Gemm( FLA_CONJ_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
            break;
          case FLA_ALG_FRONT:
            FLA_Gemm( FLA_CONJ_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
            break;
          default:
            printf("trouble\n");
        }
        break;

      /* Parameter combination 1 */
      case 1:
        switch ( type )
        {
          case FLA_ALG_REFERENCE:
            REF_Gemm( FLA_CONJ_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
            break;
          case FLA_ALG_FRONT:
            FLA_Gemm( FLA_CONJ_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
            break;
          default:
            printf("trouble\n");
        }
        break;

      /* Parameter combination 2 */
      case 2:
        switch ( type )
        {
          case FLA_ALG_REFERENCE:
            REF_Gemm( FLA_CONJ_TRANSPOSE, FLA_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
            break;
          case FLA_ALG_FRONT:
            FLA_Gemm( FLA_CONJ_TRANSPOSE, FLA_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
            break;
          default:
            printf("trouble\n");
        }
        break;

      /* Parameter combination 3 */
      case 3:
        switch ( type )
        {
          case FLA_ALG_REFERENCE:
            REF_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
            break;
          case FLA_ALG_FRONT:
            FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
            break;
          default:
            printf("trouble\n");
        }
        break;

      /* Parameter combination 4 */
      case 4:
        switch ( type )
        {
          case FLA_ALG_REFERENCE:
            REF_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
            break;
          case FLA_ALG_FRONT:
            /* NOTE(review): this path calls the external wrapper with
               beta = FLA_ONE while the reference path uses FLA_ZERO, so
               the two results are not comparable. It looks like a
               deliberate timing experiment; confirm before re-enabling
               the comparison below. Earlier forms kept for reference:
                 FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
                 FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C ); */
            FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A, B, FLA_ONE, C );
            break;
          default:
            printf("trouble\n");
        }
        break;

      /* Parameter combination 5 */
      case 5:
        switch ( type )
        {
          case FLA_ALG_REFERENCE:
            REF_Gemm( FLA_NO_TRANSPOSE, FLA_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
            break;
          case FLA_ALG_FRONT:
            FLA_Gemm( FLA_NO_TRANSPOSE, FLA_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
            break;
          default:
            printf("trouble\n");
        }
        break;

      /* Parameter combination 6 */
      case 6:
        switch ( type )
        {
          case FLA_ALG_REFERENCE:
            REF_Gemm( FLA_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
            break;
          case FLA_ALG_FRONT:
            FLA_Gemm( FLA_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
            break;
          default:
            printf("trouble\n");
        }
        break;

      /* Parameter combination 7 */
      case 7:
        switch ( type )
        {
          case FLA_ALG_REFERENCE:
            REF_Gemm( FLA_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
            break;
          case FLA_ALG_FRONT:
            FLA_Gemm( FLA_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
            break;
          default:
            printf("trouble\n");
        }
        break;

      /* Parameter combination 8 */
      case 8:
        switch ( type )
        {
          case FLA_ALG_REFERENCE:
            REF_Gemm( FLA_TRANSPOSE, FLA_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
            break;
          case FLA_ALG_FRONT:
            FLA_Gemm( FLA_TRANSPOSE, FLA_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
            break;
          default:
            printf("trouble\n");
        }
        break;

      default:
        break;
    }

    *dtime    = FLA_Clock() - *dtime;
    dtime_old = min( *dtime, dtime_old );
  }

  /* The result comparison is disabled, so report a zero difference rather
     than leaving *diff unwritten. The disabled check was:

       if ( type == FLA_ALG_REFERENCE )
       {
         FLA_Copy_external( C, C_ref );
         *diff = 0.0;
       }
       else
       {
         *diff = FLA_Max_elemwise_diff( C, C_ref );
       }
  */
  *diff = 0.0;

  *gflops = 2.0 * m * k * n / dtime_old / 1.0e9;

  /* Combinations involving a conjugate transpose are scaled by 4. */
  if ( param_combo == 0 ||
       param_combo == 1 ||
       param_combo == 2 ||
       param_combo == 3 ||
       param_combo == 6 )
  {
    *gflops *= 4.0;
  }

  *dtime = dtime_old;

  FLA_Copy_external( C_old, C );
  FLA_Obj_free( &C_old );
}
/*
 * time_Tevd_v
 *
 * Times a tridiagonal eigenvalue decomposition that also produces
 * eigenvectors, then measures two residuals.
 *
 *   variant     0 -> REF_Tevd_v( d, e, A )
 *               1 -> FLA_Tevd_v_opt_var1 (only when type == FLA_ALG_UNB_OPT)
 *               2 -> FLA_Tevd_v_opt_var2 (only when type == FLA_ALG_UNB_OPT)
 *   n_repeats   number of timed repetitions; the minimum time is kept
 *   m           dimension used in the flop estimate
 *   k_accum     not referenced by this routine
 *   b_alg, n_iter_max  forwarded to the optimized variants
 *   A_orig      matrix the reconstructed lower triangle is compared against
 *   d, e, G, R, W, A   operands forwarded to the variants
 *   l           receives a copy of d, then is sorted together with A
 *   dtime       out: best elapsed time
 *   diff1       out: Frobenius norm of tril( V * diag(l) * V^H ) - A_orig
 *   diff2       out: Frobenius norm of V * V^H - I
 *   gflops      out: flop-rate estimate derived from the best time
 *
 * A, G, d and e are restored to their contents on entry before returning.
 */
void time_Tevd_v( int variant, int type, int n_repeats, int m, int k_accum,
                  int b_alg, int n_iter_max,
                  FLA_Obj A_orig, FLA_Obj d, FLA_Obj e, FLA_Obj G, FLA_Obj R,
                  FLA_Obj W, FLA_Obj A, FLA_Obj l,
                  double *dtime, double *diff1, double* diff2, double *gflops )
{
  int     rep;
  double  k;
  double  vec_weight;
  double  best_time = 1.0e9;
  FLA_Obj A_backup, G_backup, d_backup, e_backup;

  /* Placeholder for selectively disabling variant/type pairs; with every
     entry removed the condition is just FALSE. */
  if ( FALSE )
  {
    *dtime  = 0.0;
    *gflops = 0.0;
    *diff1  = 0.0;
    *diff2  = 0.0;
    return;
  }

  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_backup );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, G, &G_backup );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, d, &d_backup );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, e, &e_backup );

  FLA_Copy_external( A, A_backup );
  FLA_Copy_external( G, G_backup );
  FLA_Copy_external( d, d_backup );
  FLA_Copy_external( e, e_backup );

  for ( rep = 0; rep < n_repeats; rep++ )
  {
    double t_start, elapsed;

    /* Reset every operand the algorithms overwrite. */
    FLA_Copy_external( A_backup, A );
    FLA_Copy_external( G_backup, G );
    FLA_Copy_external( d_backup, d );
    FLA_Copy_external( e_backup, e );

    t_start = FLA_Clock();

    if ( variant == 0 )
    {
      REF_Tevd_v( d, e, A );
    }
    else if ( variant == 1 )
    {
      if ( type == FLA_ALG_UNB_OPT )
      {
        FLA_Tevd_v_opt_var1( n_iter_max, d, e, G, A, b_alg );
      }
    }
    else if ( variant == 2 )
    {
      if ( type == FLA_ALG_UNB_OPT )
      {
        FLA_Tevd_v_opt_var2( n_iter_max, d, e, G, R, W, A, b_alg );
      }
    }

    elapsed   = FLA_Clock() - t_start;
    best_time = min( elapsed, best_time );
  }

  /* Residual checks on the result of the last repetition. */
  {
    FLA_Obj V, A_rebuilt, nrm, ident;

    FLA_Copy( d, l );

    FLA_Sort_evd( FLA_FORWARD, l, A );

    FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &V );
    FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_rebuilt );
    FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &ident );
    FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &nrm );

    /* A := A * diag(l); A_rebuilt := tril( A * V^H ). */
    FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, l, A );
    FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
              FLA_ONE, A, V, FLA_ZERO, A_rebuilt );
    FLA_Triangularize( FLA_LOWER_TRIANGULAR, FLA_NONUNIT_DIAG, A_rebuilt );

    /* diff1 := || A_rebuilt - A_orig ||_F */
    FLA_Axpy( FLA_MINUS_ONE, A_orig, A_rebuilt );
    FLA_Norm_frob( A_rebuilt, nrm );
    FLA_Obj_extract_real_scalar( nrm, diff1 );

    /* diff2 := || V * V^H - I ||_F; A_rebuilt is reused as a copy of V. */
    FLA_Set_to_identity( ident );
    FLA_Copy( V, A_rebuilt );
    FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
              FLA_ONE, V, A_rebuilt, FLA_MINUS_ONE, ident );
    FLA_Norm_frob( ident, nrm );
    FLA_Obj_extract_real_scalar( nrm, diff2 );

    FLA_Obj_free( &V );
    FLA_Obj_free( &A_rebuilt );
    FLA_Obj_free( &ident );
    FLA_Obj_free( &nrm );
  }

  /* The cubic term is weighted by 2 for complex A and by 1 otherwise. */
  k          = 2.00;
  vec_weight = FLA_Obj_is_complex( A ) ? 2.0 : 1.0;

  *gflops = ( ( 4.5 * k * m * m ) +
              vec_weight * ( 3.0 * k * m * m * m ) ) / best_time / 1e9;

  *dtime = best_time;

  FLA_Copy_external( A_backup, A );
  FLA_Copy_external( G_backup, G );
  FLA_Copy_external( d_backup, d );
  FLA_Copy_external( e_backup, e );

  FLA_Obj_free( &A_backup );
  FLA_Obj_free( &G_backup );
  FLA_Obj_free( &d_backup );
  FLA_Obj_free( &e_backup );
}
/*
 * time_Sylv
 *
 * Times the Sylvester-equation front end (or the reference implementation)
 * for one transposition parameter combination.
 *
 *   param_combo  0: (no-trans, no-trans)   1: (no-trans, trans)
 *                2: (trans, no-trans)      3: (trans, trans)
 *                other values time a no-op
 *   type         FLA_ALG_REFERENCE or FLA_ALG_FRONT; any other value
 *                prints "trouble"
 *   nrepeats     number of timed repetitions; the minimum time is kept
 *   m, n         dimensions used for the flop count
 *   isgn, A, B, C, scale  operands forwarded to the solver
 *   C_ref        written by the reference run, otherwise read for comparison
 *   dtime        out: best elapsed time
 *   diff         out: 0.0 for the reference run, otherwise the maximum
 *                element-wise difference between C and C_ref
 *   gflops       out: flop rate derived from the best time
 *
 * C is restored to its contents on entry before returning.
 */
void time_Sylv( int param_combo, int type, int nrepeats, int m, int n,
                FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj C_ref,
                FLA_Obj scale,
                double *dtime, double *diff, double *gflops )
{
  int     irep;
  double  dtime_old = 1.0e9;
  FLA_Obj C_old;

  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &C_old );
  FLA_Copy_external( C, C_old );

  for ( irep = 0; irep < nrepeats; irep++ )
  {
    /* Every repetition starts from the same right-hand side. */
    FLA_Copy_external( C_old, C );

    *dtime = FLA_Clock();

    switch ( param_combo )
    {
      case 0:
        switch ( type )
        {
          case FLA_ALG_REFERENCE:
            REF_Sylv( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, isgn, A, B, C, scale );
            break;
          case FLA_ALG_FRONT:
            FLA_Sylv( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, isgn, A, B, C, scale );
            break;
          default:
            printf("trouble\n");
        }
        break;

      case 1:
        switch ( type )
        {
          case FLA_ALG_REFERENCE:
            REF_Sylv( FLA_NO_TRANSPOSE, FLA_TRANSPOSE, isgn, A, B, C, scale );
            break;
          case FLA_ALG_FRONT:
            FLA_Sylv( FLA_NO_TRANSPOSE, FLA_TRANSPOSE, isgn, A, B, C, scale );
            break;
          default:
            printf("trouble\n");
        }
        break;

      case 2:
        switch ( type )
        {
          case FLA_ALG_REFERENCE:
            REF_Sylv( FLA_TRANSPOSE, FLA_NO_TRANSPOSE, isgn, A, B, C, scale );
            break;
          case FLA_ALG_FRONT:
            FLA_Sylv( FLA_TRANSPOSE, FLA_NO_TRANSPOSE, isgn, A, B, C, scale );
            break;
          default:
            printf("trouble\n");
        }
        break;

      case 3:
        switch ( type )
        {
          case FLA_ALG_REFERENCE:
            REF_Sylv( FLA_TRANSPOSE, FLA_TRANSPOSE, isgn, A, B, C, scale );
            break;
          case FLA_ALG_FRONT:
            FLA_Sylv( FLA_TRANSPOSE, FLA_TRANSPOSE, isgn, A, B, C, scale );
            break;
          default:
            printf("trouble\n");
        }
        break;

      default:
        break;
    }

    *dtime    = FLA_Clock() - *dtime;
    dtime_old = min( *dtime, dtime_old );
  }

  if ( type == FLA_ALG_REFERENCE )
  {
    /* The reference run defines the expected answer. */
    FLA_Copy_external( C, C_ref );
    *diff = 0.0;
  }
  else
  {
    *diff = FLA_Max_elemwise_diff( C, C_ref );
  }

  /* Accumulate the flop count in double precision: as an int expression,
     m*m*n + n*n*m overflows once m = n reaches 1024. */
  *gflops = ( ( double ) m * m * n + ( double ) n * n * m ) / dtime_old / 1e9;

  if ( FLA_Obj_is_complex( C ) )
  {
    *gflops *= 4.0;
  }

  *dtime = dtime_old;

  FLA_Copy_external( C_old, C );
  FLA_Obj_free( &C_old );
}
/*
 * time_Gemm_nn
 *
 * Times one OpenMP variant of the no-transpose / no-transpose
 * matrix-matrix multiply and compares the result against Cref.
 *
 *   variant   0 runs REF_Gemm;
 *             1..6 select FLA_Gemm_nn_omp_varN and require
 *             type == FLA_ALG_OPENMP_BVAR;
 *             13, 15, 31, 35, 51, 53 select the combined variants and
 *             require type == FLA_ALG_OPENMP_CVAR;
 *             a mismatching type prints "trouble";
 *             other variants time a no-op
 *   nrepeats  number of timed repetitions; the minimum time is kept
 *   n         not referenced by this routine
 *   nb_alg    block size forwarded to the variants
 *   Cref      written when variant == 0, otherwise read for comparison
 *   dtime     out: best elapsed time
 *   diff      out: 0.0 for the reference run, otherwise the maximum
 *             element-wise difference between C and Cref
 *   gflops    out: flop rate derived from the best time
 *
 * C is restored to its contents on entry before returning.
 */
void time_Gemm_nn( int variant, int type, int nrepeats, int n, int nb_alg,
                   FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj Cref,
                   double *dtime, double *diff, double *gflops )
{
  int     irep;
  /* Start from a large sentinel, as the other timing routines do, so the
     value is defined even when nrepeats <= 0 and the loop never runs. */
  double  dtime_old = 1.0e9;
  FLA_Obj Cold;

  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &Cold );
  FLA_Copy_external( C, Cold );

  for ( irep = 0; irep < nrepeats; irep++ )
  {
    /* Every repetition starts from the same C. */
    FLA_Copy_external( Cold, C );

    *dtime = FLA_Clock();

    switch ( variant )
    {
      case 0:
        /* Reference implementation. */
        REF_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A, B, FLA_ONE, C );
        break;

      case 1:
        switch ( type )
        {
          case FLA_ALG_OPENMP_BVAR:
            FLA_Gemm_nn_omp_var1( FLA_ONE, A, B, C, nb_alg );
            break;
          default:
            printf("trouble\n");
        }
        break;

      case 2:
        switch ( type )
        {
          case FLA_ALG_OPENMP_BVAR:
            FLA_Gemm_nn_omp_var2( FLA_ONE, A, B, C, nb_alg );
            break;
          default:
            printf("trouble\n");
        }
        break;

      case 3:
        switch ( type )
        {
          case FLA_ALG_OPENMP_BVAR:
            FLA_Gemm_nn_omp_var3( FLA_ONE, A, B, C, nb_alg );
            break;
          default:
            printf("trouble\n");
        }
        break;

      case 4:
        switch ( type )
        {
          case FLA_ALG_OPENMP_BVAR:
            FLA_Gemm_nn_omp_var4( FLA_ONE, A, B, C, nb_alg );
            break;
          default:
            printf("trouble\n");
        }
        break;

      case 5:
        switch ( type )
        {
          case FLA_ALG_OPENMP_BVAR:
            FLA_Gemm_nn_omp_var5( FLA_ONE, A, B, C, nb_alg );
            break;
          default:
            printf("trouble\n");
        }
        break;

      case 6:
        switch ( type )
        {
          case FLA_ALG_OPENMP_BVAR:
            FLA_Gemm_nn_omp_var6( FLA_ONE, A, B, C, nb_alg );
            break;
          default:
            printf("trouble\n");
        }
        break;

      case 13:
        /* Variant 1->3 */
        switch ( type )
        {
          case FLA_ALG_OPENMP_CVAR:
            FLA_Gemm_nn_omp_var13( FLA_ONE, A, B, C, nb_alg );
            break;
          default:
            printf("trouble\n");
        }
        break;

      case 15:
        /* Variant 1->5 */
        switch ( type )
        {
          case FLA_ALG_OPENMP_CVAR:
            FLA_Gemm_nn_omp_var15( FLA_ONE, A, B, C, nb_alg );
            break;
          default:
            printf("trouble\n");
        }
        break;

      case 31:
        /* Variant 3->1 */
        switch ( type )
        {
          case FLA_ALG_OPENMP_CVAR:
            FLA_Gemm_nn_omp_var31( FLA_ONE, A, B, C, nb_alg );
            break;
          default:
            printf("trouble\n");
        }
        break;

      case 35:
        /* Variant 3->5 */
        switch ( type )
        {
          case FLA_ALG_OPENMP_CVAR:
            FLA_Gemm_nn_omp_var35( FLA_ONE, A, B, C, nb_alg );
            break;
          default:
            printf("trouble\n");
        }
        break;

      case 51:
        /* Variant 5->1 */
        switch ( type )
        {
          case FLA_ALG_OPENMP_CVAR:
            FLA_Gemm_nn_omp_var51( FLA_ONE, A, B, C, nb_alg );
            break;
          default:
            printf("trouble\n");
        }
        break;

      case 53:
        /* Variant 5->3 */
        switch ( type )
        {
          case FLA_ALG_OPENMP_CVAR:
            FLA_Gemm_nn_omp_var53( FLA_ONE, A, B, C, nb_alg );
            break;
          default:
            printf("trouble\n");
        }
        break;

      default:
        break;
    }

    *dtime    = FLA_Clock() - *dtime;
    dtime_old = min( *dtime, dtime_old );
  }

  if ( variant == 0 )
  {
    /* The reference run defines the expected answer. */
    FLA_Copy_external( C, Cref );
    *diff = 0.0;
  }
  else
  {
    *diff = FLA_Max_elemwise_diff( C, Cref );
  }

  *gflops = 2.0 * FLA_Obj_length( C ) * FLA_Obj_width( C ) *
            FLA_Obj_width( A ) / dtime_old / 1e9;

  *dtime = dtime_old;

  FLA_Copy_external( Cold, C );
  FLA_Obj_free( &Cold );
}
/*
 * time_Syrk_ln
 *
 * Times one OpenMP variant of the lower-triangular, no-transpose
 * symmetric rank-k update and compares the result against C_ref.
 *
 *   variant   0 runs REF_Syrk_ln; 1..5 select FLA_Syrk_ln_omp*_varN;
 *             other variants time a no-op
 *   type      FLA_ALG_OPENMP_1TASK, FLA_ALG_OPENMP_2TASKS,
 *             FLA_ALG_OPENMP_2LOOPS or FLA_ALG_OPENMP_2LOOPSPLUS; not
 *             every variant supports every type, and an unsupported
 *             pairing prints "trouble"
 *   nrepeats  number of timed repetitions; the minimum time is kept
 *   n, nb_alg, B  not referenced by this routine
 *   C_ref     written when variant == 0, otherwise read for comparison
 *   dtime     out: best elapsed time
 *   diff      out: 0.0 for the reference run, otherwise the maximum
 *             element-wise difference between C and C_ref
 *   gflops    out: flop rate derived from the best time
 *
 * C is restored to its contents on entry before returning.
 */
void time_Syrk_ln( int variant, int type, int nrepeats, int n, int nb_alg,
                   FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj C_ref,
                   double *dtime, double *diff, double *gflops )
{
  int     irep;
  /* Start from a large sentinel, as the other timing routines do, so the
     value is defined even when nrepeats <= 0 and the loop never runs. */
  double  dtime_old = 1.0e9;
  FLA_Obj C_old;

  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &C_old );
  FLA_Copy_external( C, C_old );

  for ( irep = 0; irep < nrepeats; irep++ )
  {
    /* Every repetition starts from the same C. */
    FLA_Copy_external( C_old, C );

    *dtime = FLA_Clock();

    switch ( variant )
    {
      case 0:
        /* Reference implementation. */
        REF_Syrk_ln( FLA_ONE, A, FLA_ONE, C );
        break;

      case 1:
        switch ( type )
        {
          case FLA_ALG_OPENMP_1TASK:
            FLA_Syrk_ln_omp1t_var1( A, C );
            break;
          case FLA_ALG_OPENMP_2TASKS:
            FLA_Syrk_ln_omp2t_var1( A, C );
            break;
          case FLA_ALG_OPENMP_2LOOPS:
            FLA_Syrk_ln_omp2l_var1( A, C );
            break;
          default:
            printf("trouble\n");
        }
        break;

      case 2:
        switch ( type )
        {
          case FLA_ALG_OPENMP_1TASK:
            FLA_Syrk_ln_omp1t_var2( A, C );
            break;
          case FLA_ALG_OPENMP_2TASKS:
            FLA_Syrk_ln_omp2t_var2( A, C );
            break;
          case FLA_ALG_OPENMP_2LOOPS:
            FLA_Syrk_ln_omp2l_var2( A, C );
            break;
          case FLA_ALG_OPENMP_2LOOPSPLUS:
            FLA_Syrk_ln_omp2x_var2( A, C );
            break;
          default:
            printf("trouble\n");
        }
        break;

      case 3:
        switch ( type )
        {
          case FLA_ALG_OPENMP_1TASK:
            FLA_Syrk_ln_omp1t_var3( A, C );
            break;
          case FLA_ALG_OPENMP_2TASKS:
            FLA_Syrk_ln_omp2t_var3( A, C );
            break;
          case FLA_ALG_OPENMP_2LOOPS:
            FLA_Syrk_ln_omp2l_var3( A, C );
            break;
          default:
            printf("trouble\n");
        }
        break;

      case 4:
        switch ( type )
        {
          case FLA_ALG_OPENMP_1TASK:
            FLA_Syrk_ln_omp1t_var4( A, C );
            break;
          case FLA_ALG_OPENMP_2TASKS:
            FLA_Syrk_ln_omp2t_var4( A, C );
            break;
          case FLA_ALG_OPENMP_2LOOPS:
            FLA_Syrk_ln_omp2l_var4( A, C );
            break;
          default:
            printf("trouble\n");
        }
        break;

      case 5:
        switch ( type )
        {
          case FLA_ALG_OPENMP_1TASK:
            FLA_Syrk_ln_omp1t_var5( A, C );
            break;
          default:
            printf("trouble\n");
        }
        break;

      default:
        break;
    }

    *dtime    = FLA_Clock() - *dtime;
    dtime_old = min( *dtime, dtime_old );
  }

  if ( variant == 0 )
  {
    /* The reference run defines the expected answer. */
    FLA_Copy_external( C, C_ref );
    *diff = 0.0;
  }
  else
  {
    *diff = FLA_Max_elemwise_diff( C, C_ref );
  }

  *gflops = 1.0 * FLA_Obj_length( A ) * FLA_Obj_length( A ) *
            FLA_Obj_width( A ) / dtime_old / 1e9;

  *dtime = dtime_old;

  FLA_Copy_external( C_old, C );
  FLA_Obj_free( &C_old );
}
/*
 * time_Syrk
 *
 * Times the hierarchical (FLASH) symmetric rank-k update front end, or
 * the reference implementation running on flattened copies.
 *
 *   param_combo  0: (lower, no-trans)   1: (lower, trans)
 *                2: (upper, no-trans)   3: (upper, trans)
 *                other values time a no-op
 *   type         FLA_ALG_REFERENCE -> REF_Syrk on A_flat / C_flat
 *                FLA_ALG_FRONT     -> FLASH_Syrk on A / C
 *                anything else prints "trouble"
 *   nrepeats     number of timed repetitions; the minimum time is kept
 *   m, k         dimensions used for the flop count
 *   B            not referenced by this routine
 *   C_ref        written by the reference run, otherwise read for comparison
 *   dtime        out: best elapsed time
 *   diff         out: 0.0 for the reference run, otherwise the maximum
 *                element-wise difference between C and C_ref
 *   gflops       out: flop rate derived from the best time
 *
 * C is restored to its contents on entry before returning.
 */
void time_Syrk( int param_combo, int type, int nrepeats, int m, int k,
                FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj C_ref,
                double *dtime, double *diff, double *gflops )
{
  int     rep;
  double  best_time = 1.0e9;
  FLA_Obj C_backup, A_flat, C_flat;

  FLASH_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &C_backup );

  /* Flat counterparts of the hierarchical operands for the reference path. */
  FLASH_Obj_create_flat_conf_to_hier( FLA_NO_TRANSPOSE, A, &A_flat );
  FLASH_Obj_create_flat_conf_to_hier( FLA_NO_TRANSPOSE, C, &C_flat );

  FLASH_Copy( C, C_backup );

  for ( rep = 0; rep < nrepeats; rep++ )
  {
    double t_start, elapsed;

    FLASH_Copy( C_backup, C );

    /* Refresh the flat copies outside the timed region. */
    FLASH_Obj_flatten( A, A_flat );
    FLASH_Obj_flatten( C, C_flat );

    t_start = FLA_Clock();

    switch ( param_combo )
    {
      case 0:
        if ( type == FLA_ALG_REFERENCE )
        {
          REF_Syrk( FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_ONE, A_flat, FLA_ZERO, C_flat );
        }
        else if ( type == FLA_ALG_FRONT )
        {
          FLASH_Syrk( FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_ONE, A, FLA_ZERO, C );
        }
        else
        {
          printf("trouble\n");
        }
        break;

      case 1:
        if ( type == FLA_ALG_REFERENCE )
        {
          REF_Syrk( FLA_LOWER_TRIANGULAR, FLA_TRANSPOSE, FLA_ONE, A_flat, FLA_ZERO, C_flat );
        }
        else if ( type == FLA_ALG_FRONT )
        {
          FLASH_Syrk( FLA_LOWER_TRIANGULAR, FLA_TRANSPOSE, FLA_ONE, A, FLA_ZERO, C );
        }
        else
        {
          printf("trouble\n");
        }
        break;

      case 2:
        if ( type == FLA_ALG_REFERENCE )
        {
          REF_Syrk( FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_ONE, A_flat, FLA_ZERO, C_flat );
        }
        else if ( type == FLA_ALG_FRONT )
        {
          FLASH_Syrk( FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_ONE, A, FLA_ZERO, C );
        }
        else
        {
          printf("trouble\n");
        }
        break;

      case 3:
        if ( type == FLA_ALG_REFERENCE )
        {
          REF_Syrk( FLA_UPPER_TRIANGULAR, FLA_TRANSPOSE, FLA_ONE, A_flat, FLA_ZERO, C_flat );
        }
        else if ( type == FLA_ALG_FRONT )
        {
          FLASH_Syrk( FLA_UPPER_TRIANGULAR, FLA_TRANSPOSE, FLA_ONE, A, FLA_ZERO, C );
        }
        else
        {
          printf("trouble\n");
        }
        break;

      default:
        /* Unknown combinations are timed as a no-op. */
        break;
    }

    elapsed   = FLA_Clock() - t_start;
    best_time = min( elapsed, best_time );
  }

  if ( type == FLA_ALG_REFERENCE )
  {
    /* Publish the flat reference result in hierarchical form. */
    FLASH_Obj_hierarchify( C_flat, C_ref );
    *diff = 0.0;
  }
  else
  {
    *diff = FLASH_Max_elemwise_diff( C, C_ref );
  }

  *gflops = 1.0 * m * m * k / best_time / 1.0e9;

  if ( FLA_Obj_is_complex( C ) )
  {
    *gflops *= 4.0;
  }

  *dtime = best_time;

  FLASH_Copy( C_backup, C );

  FLASH_Obj_free( &C_backup );
  FLASH_Obj_free( &A_flat );
  FLASH_Obj_free( &C_flat );
}
/*
 * time_Apply_G_rf
 *
 * Times one variant of FLA_Apply_G_rf (applying the rotations stored in G
 * to A) and compares the result against A_ref.
 *
 *   variant    1, 2, 3, 6, 9 select FLA_Apply_G_rf_{opt,asm,blk}_varN;
 *              0 times a no-op; 4, 5, 7, 8 are skipped for real and
 *              complex A and report zeros
 *   type       FLA_ALG_UNB_OPT, FLA_ALG_UNB_ASM or FLA_ALG_BLOCKED;
 *              other values time a no-op
 *   n_repeats  number of timed repetitions; the minimum time is kept
 *   m, k, n    dimensions used for the flop count
 *   b_alg      block size forwarded to the blocked variants
 *   A_ref      written when variant == 1 and type == FLA_ALG_UNB_OPT,
 *              otherwise read for comparison
 *   P          not referenced by this routine
 *   dtime      out: best elapsed time (0.0 when the variant is skipped)
 *   diff       out: 0.0 for the variant-1/opt run or a skipped variant,
 *              otherwise the Frobenius norm of A - A_ref
 *   gflops     out: flop rate (0.0 when the variant is skipped)
 *
 * A and G are restored to their contents on entry before returning.
 */
void time_Apply_G_rf( int variant, int type, int n_repeats, int m, int k, int n,
                      int b_alg,
                      FLA_Obj A, FLA_Obj A_ref, FLA_Obj G, FLA_Obj P,
                      double *dtime, double *diff, double *gflops )
{
  int     irep;
  double  dtime_old = 1.0e9;
  FLA_Obj A_save, G_save, norm;

  /* Variants 4, 5, 7 and 8 are disabled for both real and complex data.
     Individual ( variant, type ) pairs can be disabled by adding them to
     this condition. All three outputs are written on this path so that
     callers never read stale values. */
  if ( ( FLA_Obj_is_real( A ) || FLA_Obj_is_complex( A ) ) &&
       ( variant == 4 ||
         variant == 5 ||
         variant == 7 ||
         variant == 8 ) )
  {
    *dtime  = 0.0;
    *gflops = 0.0;
    *diff   = 0.0;
    return;
  }

  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_save );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, G, &G_save );
  FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );

  FLA_Copy_external( A, A_save );
  FLA_Copy_external( G, G_save );

  for ( irep = 0; irep < n_repeats; irep++ )
  {
    /* Every repetition starts from the same A and G. */
    FLA_Copy_external( A_save, A );
    FLA_Copy_external( G_save, G );

    *dtime = FLA_Clock();

    switch ( variant )
    {
      case 0:
        break;

      /* Variant 1 */
      case 1:
        switch ( type )
        {
          case FLA_ALG_UNB_OPT:
            FLA_Apply_G_rf_opt_var1( G, A );
            break;
          case FLA_ALG_UNB_ASM:
            FLA_Apply_G_rf_asm_var1( G, A );
            break;
          case FLA_ALG_BLOCKED:
            FLA_Apply_G_rf_blk_var1( G, A, b_alg );
            break;
        }
        break;

      /* Variant 2 */
      case 2:
        switch ( type )
        {
          case FLA_ALG_UNB_OPT:
            FLA_Apply_G_rf_opt_var2( G, A );
            break;
          case FLA_ALG_UNB_ASM:
            FLA_Apply_G_rf_asm_var2( G, A );
            break;
          case FLA_ALG_BLOCKED:
            FLA_Apply_G_rf_blk_var2( G, A, b_alg );
            break;
        }
        break;

      /* Variant 3 */
      case 3:
        switch ( type )
        {
          case FLA_ALG_UNB_OPT:
            FLA_Apply_G_rf_opt_var3( G, A );
            break;
          case FLA_ALG_UNB_ASM:
            FLA_Apply_G_rf_asm_var3( G, A );
            break;
          case FLA_ALG_BLOCKED:
            FLA_Apply_G_rf_blk_var3( G, A, b_alg );
            break;
        }
        break;

      /* Variant 6 */
      case 6:
        switch ( type )
        {
          case FLA_ALG_UNB_OPT:
            FLA_Apply_G_rf_opt_var6( G, A );
            break;
          case FLA_ALG_UNB_ASM:
            FLA_Apply_G_rf_asm_var6( G, A );
            break;
          case FLA_ALG_BLOCKED:
            FLA_Apply_G_rf_blk_var6( G, A, b_alg );
            break;
        }
        break;

      /* Variant 9 */
      case 9:
        switch ( type )
        {
          case FLA_ALG_UNB_OPT:
            FLA_Apply_G_rf_opt_var9( G, A );
            break;
          case FLA_ALG_UNB_ASM:
            FLA_Apply_G_rf_asm_var9( G, A );
            break;
          case FLA_ALG_BLOCKED:
            FLA_Apply_G_rf_blk_var9( G, A, b_alg );
            break;
        }
        break;
    }

    *dtime    = FLA_Clock() - *dtime;
    dtime_old = min( *dtime, dtime_old );
  }

  if ( variant == 1 && type == FLA_ALG_UNB_OPT )
  {
    /* Variant 1 / unblocked-optimized defines the expected answer. */
    FLA_Copy( A, A_ref );
    *diff = 0.0;
  }
  else
  {
    /* diff := || A - A_ref ||_F; A is overwritten here and restored below. */
    FLA_Axpy( FLA_MINUS_ONE, A_ref, A );
    FLA_Norm_frob( A, norm );
    FLA_Obj_extract_real_scalar( norm, diff );
  }

  *gflops = 6.0 * k * m * ( n - 1 ) / dtime_old / 1e9;

  if ( FLA_Obj_is_complex( A ) )
  {
    *gflops *= 2.0;
  }

  *dtime = dtime_old;

  FLA_Copy_external( A_save, A );
  FLA_Copy_external( G_save, G );

  FLA_Obj_free( &A_save );
  FLA_Obj_free( &G_save );
  FLA_Obj_free( &norm );
}
/*
 * libfla_test_qrut_experiment
 *
 * Runs one QR-factorization experiment: builds a random m-by-n matrix
 * and right-hand side, times the selected implementation n_repeats
 * times, solves with the resulting factors, and reports the performance
 * and a residual.
 *
 *   params     supplies b_flash and b_alg_flat
 *   var        variant number; also selects the shape of T
 *   sc_str     storage characters for A (sc_str[0]) and T (sc_str[1])
 *   datatype   datatype of every object created here
 *   p_cur      current problem size; m = 2 * p_cur and n = p_cur with the
 *              fixed m_input / n_input values below
 *   pci        not referenced by this routine
 *   n_repeats  number of timed repetitions; the minimum time is kept
 *   impl       one of the FLA_TEST_* implementation selectors
 *   perf       out: flop rate of the fastest repetition
 *   residual   out: 2-norm of A^H * ( A * x - b )
 */
void libfla_test_qrut_experiment( test_params_t params,
                                  unsigned int  var,
                                  char*         sc_str,
                                  FLA_Datatype  datatype,
                                  unsigned int  p_cur,
                                  unsigned int  pci,
                                  unsigned int  n_repeats,
                                  signed int    impl,
                                  double*       perf,
                                  double*       residual )
{
  dim_t        b_flash    = params.b_flash;
  dim_t        b_alg_flat = params.b_alg_flat;
  double       best_time  = 1e9;
  double       elapsed;
  unsigned int rep;
  unsigned int m, n;
  unsigned int min_m_n;
  signed int   m_input    = -2;
  signed int   n_input    = -1;
  FLA_Obj      A, T, x, b, y, norm;
  FLA_Obj      A_save;
  FLA_Obj      A_test, T_test, x_test, b_test;

  /* impl never changes, so evaluate the recurring selectors once. */
  const int    is_hier    = ( impl == FLA_TEST_HIER_FRONT_END );
  const int    is_variant = ( impl == FLA_TEST_FLAT_UNB_VAR ||
                              impl == FLA_TEST_FLAT_OPT_VAR ||
                              impl == FLA_TEST_FLAT_BLK_VAR );

  /* A negative *_input scales p_cur by its magnitude. */
  m = ( m_input < 0 ) ? p_cur * abs( m_input ) : p_cur;
  n = ( n_input < 0 ) ? p_cur * abs( n_input ) : p_cur;

  min_m_n = min( m, n );

  /* Operand matrices; the shape of T depends on implementation and variant. */
  libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[0], m, n, &A );

  if ( impl == FLA_TEST_FLAT_FRONT_END ||
       ( impl == FLA_TEST_FLAT_BLK_VAR && var == 1 ) )
  {
    libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], b_alg_flat, min_m_n, &T );
  }
  else if ( var == 2 )
  {
    libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], min_m_n, min_m_n, &T );
  }
  else
  {
    libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], 1, min_m_n, &T );
  }

  FLA_Random_matrix( A );

  /* Pristine copy of A for resetting between repetitions and for the residual. */
  FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &A_save );

  /* Vectors of the linear system plus a real scalar for the norm. */
  FLA_Obj_create( datatype, n, 1, 0, 0, &x );
  FLA_Obj_create( datatype, m, 1, 0, 0, &b );
  FLA_Obj_create( datatype, n, 1, 0, 0, &y );

  FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );

  FLA_Random_matrix( b );

  /* The FLASH front end works on hierarchical objects; all other
     implementations operate on the flat objects directly. */
  if ( is_hier )
  {
    FLASH_QR_UT_create_hier_matrices( A, 1, &b_flash, &A_test, &T_test );
    FLASH_Obj_create_hier_copy_of_flat( b, 1, &b_flash, &b_test );
    FLASH_Obj_create_hier_copy_of_flat( x, 1, &b_flash, &x_test );
  }
  else
  {
    A_test = A;
    T_test = T;
  }

  /* The individual variants need a control tree. */
  if ( is_variant )
  {
    libfla_test_qrut_cntl_create( var, b_alg_flat );
  }

  /* Timed repetitions; keep the fastest. */
  for ( rep = 0; rep < n_repeats; ++rep )
  {
    if ( is_hier )
    {
      FLASH_Obj_hierarchify( A_save, A_test );
    }
    else
    {
      FLA_Copy_external( A_save, A_test );
    }

    elapsed = FLA_Clock();

    libfla_test_qrut_impl( impl, A_test, T_test );

    elapsed = FLA_Clock() - elapsed;

    best_time = min( best_time, elapsed );
  }

  /* Solve with the factors left by the final repetition. */
  if ( is_hier )
  {
    FLASH_QR_UT_solve( A_test, T_test, b_test, x_test );
    FLASH_Obj_flatten( x_test, x );
  }
  else
  {
    FLA_QR_UT_solve( A_test, T_test, b, x );
  }

  /* Release the hierarchical objects. */
  if ( is_hier )
  {
    FLASH_Obj_free( &A_test );
    FLASH_Obj_free( &T_test );
    FLASH_Obj_free( &b_test );
    FLASH_Obj_free( &x_test );
  }

  /* Release the control tree. */
  if ( is_variant )
  {
    libfla_test_qrut_cntl_free();
  }

  /* Performance of the fastest repetition. */
  *perf = ( 2.0 * m * n * n - ( 2.0 / 3.0 ) * n * n * n ) /
          best_time / FLOPS_PER_UNIT_PERF;

  if ( FLA_Obj_is_complex( A ) )
  {
    *perf *= 4.0;
  }

  /* residual := || A^H * ( A * x - b ) ||_2 */
  FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_ONE, A_save, x, FLA_MINUS_ONE, b );
  FLA_Gemv_external( FLA_CONJ_TRANSPOSE, FLA_ONE, A_save, b, FLA_ZERO, y );
  FLA_Nrm2_external( y, norm );
  FLA_Obj_extract_real_scalar( norm, residual );

  /* Release the supporting flat objects ... */
  FLA_Obj_free( &x );
  FLA_Obj_free( &b );
  FLA_Obj_free( &y );
  FLA_Obj_free( &norm );
  FLA_Obj_free( &A_save );

  /* ... and the flat test matrices. */
  FLA_Obj_free( &A );
  FLA_Obj_free( &T );
}
/*
 * time_Her2k
 *
 * Runs one parameter combination of the Her2k operation nrepeats times and
 * reports the smallest elapsed time observed.
 *
 *   param_combo  selects the (uplo, trans) pair handed to the operation:
 *                  0: lower, conjugate-transpose
 *                  1: lower, no-transpose
 *                  2: upper, conjugate-transpose
 *                  3: upper, no-transpose
 *                any other value causes no operation to be called.
 *   type         FLA_ALG_REFERENCE -> REF_Her2k, FLA_ALG_FRONT -> FLA_Her2k;
 *                any other value prints "trouble" (for a known combo).
 *   nrepeats     number of timed runs.
 *   m, k         dimensions used only in the flop-count formula.
 *   A, B         operands passed through unchanged.
 *   C            matrix updated by the operation; its initial contents are
 *                saved on entry, reloaded before every run, and copied back
 *                before returning.
 *   C_ref        written with the result for FLA_ALG_REFERENCE; otherwise
 *                read as the comparison target.
 *   dtime        out: best time (also used as scratch for each run).
 *   diff         out: 0.0 for the reference type, otherwise
 *                FLA_Max_elemwise_diff( C, C_ref ).
 *   gflops       out: 4.0 * 2.0 * m * m * k / best time / 1e9.
 */
void time_Her2k( int param_combo, int type, int nrepeats, int m, int k,
                 FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj C_ref,
                 double *dtime, double *diff, double *gflops )
{
  const int combo_is_known = ( 0 <= param_combo && param_combo <= 3 );
  int       uplo           = 0;
  int       trans          = 0;
  double    best_time      = 1.0e9;
  FLA_Obj   C_saved;
  int       rep;

  /* Decode the combination once; the values are only used when it is known. */
  if ( combo_is_known )
  {
    uplo  = ( param_combo < 2      ? FLA_LOWER_TRIANGULAR : FLA_UPPER_TRIANGULAR );
    trans = ( param_combo % 2 == 0 ? FLA_CONJ_TRANSPOSE   : FLA_NO_TRANSPOSE );
  }

  /* Keep a pristine copy of C so that every run starts from the same data. */
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &C_saved );
  FLA_Copy_external( C, C_saved );

  for ( rep = 0; rep < nrepeats; ++rep )
  {
    FLA_Copy_external( C_saved, C );

    *dtime = FLA_Clock();

    if ( combo_is_known )
    {
      if ( type == FLA_ALG_REFERENCE )
        REF_Her2k( uplo, trans, FLA_ONE, A, B, FLA_ZERO, C );
      else if ( type == FLA_ALG_FRONT )
        FLA_Her2k( uplo, trans, FLA_ONE, A, B, FLA_ZERO, C );
      else
        printf("trouble\n");
    }

    *dtime    = FLA_Clock() - *dtime;
    best_time = min( *dtime, best_time );
  }

  /* The reference run defines C_ref; every other run is compared against it. */
  if ( type == FLA_ALG_REFERENCE )
  {
    FLA_Copy_external( C, C_ref );
    *diff = 0.0;
  }
  else
  {
    *diff = FLA_Max_elemwise_diff( C, C_ref );
  }

  *gflops = 4.0 * 2.0 * m * m * k / best_time / 1.0e9;
  *dtime  = best_time;

  /* Hand C back to the caller with its original contents. */
  FLA_Copy_external( C_saved, C );
  FLA_Obj_free( &C_saved );
}
/*
 * REF_Svdd_uv_components
 *
 * Computes an SVD of A using the *_external routines and reports how long
 * each phase took.
 *
 *   A            input matrix; overwritten by the factorization steps.
 *   s            out: singular values (copied from the internal diagonal
 *                vector, then sorted together with U and V).
 *   U, V         out: singular vectors.
 *   dtime_bred   out: time spent reducing to bidiagonal form.
 *   dtime_bsvd   out: time spent in FLA_Bsvdd_external.
 *   dtime_appq   out: time spent forming/applying the orthogonal factors.
 *   dtime_qrfa   out: time spent in the initial QR factorization
 *                (0.0 when that phase is skipped).
 *   dtime_gemm   out: time spent in the final matrix product
 *                (0.0 when that phase is skipped).
 *
 * Three cases are distinguished on the dimensions of A (m_A x n_A):
 *   - m_A <  n_A                 : FLA_NOT_YET_IMPLEMENTED is raised;
 *   - n_A <= m_A < 1.6 * n_A     : direct bidiagonalization of A;
 *   - m_A >= 1.6 * n_A           : QR factorization first, then
 *                                  bidiagonalization of the top n_A rows.
 *
 * Always returns FLA_SUCCESS.
 */
FLA_Error REF_Svdd_uv_components( FLA_Obj A, FLA_Obj s, FLA_Obj U, FLA_Obj V,
                                  double* dtime_bred, double* dtime_bsvd,
                                  double* dtime_appq, double* dtime_qrfa,
                                  double* dtime_gemm )
{
  const double       qr_crossover = 16.0 / 10.0;
  const FLA_Datatype dt           = FLA_Obj_datatype( A );
  const FLA_Datatype dt_re        = FLA_Obj_datatype_proj_to_real( A );
  const dim_t        m_A          = FLA_Obj_length( A );
  const dim_t        n_A          = FLA_Obj_width( A );
  const dim_t        k_min        = FLA_Obj_min_dim( A );
  FLA_Uplo           bd_uplo      = FLA_UPPER_TRIANGULAR;
  FLA_Obj            tau_q, tau_u, tau_v;
  FLA_Obj            diag, offd, offd_top, offd_last;
  FLA_Obj            U_bd, V_bd;
  double             t0;

  /* Workspace: Householder scalars, the two diagonals, and the singular
     vectors of the bidiagonal matrix. */
  FLA_Obj_create( dt,    k_min, 1,   0, 0, &tau_q );
  FLA_Obj_create( dt,    k_min, 1,   0, 0, &tau_u );
  FLA_Obj_create( dt,    k_min, 1,   0, 0, &tau_v );
  FLA_Obj_create( dt_re, k_min, 1,   0, 0, &diag );
  FLA_Obj_create( dt_re, k_min, 1,   0, 0, &offd );
  FLA_Obj_create( dt_re, n_A,   n_A, 0, 0, &U_bd );
  FLA_Obj_create( dt_re, n_A,   n_A, 0, 0, &V_bd );

  /* offd_top is offd without its last element; it receives the extracted
     off-diagonal, while the full offd is what the bidiagonal solver gets. */
  FLA_Part_2x1( offd,   &offd_top,
                        &offd_last,   1, FLA_BOTTOM );

  if ( m_A < n_A )
  {
    FLA_Check_error_code( FLA_NOT_YET_IMPLEMENTED );
  }
  else if ( m_A < qr_crossover * n_A )
  {
    /* Phase: reduce A to bidiagonal form and pull out its diagonals. */
    t0 = FLA_Clock();
    {
      FLA_Bidiag_blk_external( A, tau_u, tau_v );
      FLA_Bidiag_UT_extract_diagonals( A, diag, offd_top );
    }
    *dtime_bred = FLA_Clock() - t0;

    /* Phase: divide-and-conquer SVD of the bidiagonal matrix. */
    t0 = FLA_Clock();
    {
      FLA_Bsvdd_external( bd_uplo, diag, offd, U_bd, V_bd );
    }
    *dtime_bsvd = FLA_Clock() - t0;

    /* Phase: form U and V by applying the bidiagonalization transforms. */
    t0 = FLA_Clock();
    {
      FLA_Copy_external( U_bd, U );
      FLA_Bidiag_apply_U_external( FLA_LEFT, FLA_NO_TRANSPOSE, A, tau_u, U );

      FLA_Copy_external( V_bd, V );
      FLA_Bidiag_apply_V_external( FLA_RIGHT, FLA_CONJ_TRANSPOSE, A, tau_v, V );
    }
    *dtime_appq = FLA_Clock() - t0;

    /* No QR factorization and no final product on this path. */
    *dtime_qrfa = 0.0;
    *dtime_gemm = 0.0;
  }
  else
  {
    FLA_Obj AT, AB;
    FLA_Obj UL, UR;
    FLA_Obj W;

    FLA_Part_2x1( A,   &AT,
                       &AB,        n_A, FLA_TOP );
    FLA_Part_1x2( U,   &UL, &UR,   n_A, FLA_LEFT );

    /* Temporary n_A-by-n_A matrix W. */
    FLA_Obj_create( dt, n_A, n_A, 0, 0, &W );

    /* Phase: QR factorization of A; the lower triangle of A is copied into
       UL and then cleared in A. */
    t0 = FLA_Clock();
    {
      FLA_QR_blk_external( A, tau_q );
      FLA_Copyr_external( FLA_LOWER_TRIANGULAR, A, UL );
      FLA_Setr( FLA_LOWER_TRIANGULAR, FLA_ZERO, A );
    }
    *dtime_qrfa = FLA_Clock() - t0;

    /* Phase: form Q in U. */
    t0 = FLA_Clock();
    {
      FLA_QR_form_Q_external( U, tau_q );
    }
    *dtime_appq = FLA_Clock() - t0;

    /* Phase: reduce the top block to bidiagonal form. */
    t0 = FLA_Clock();
    {
      FLA_Bidiag_blk_external( AT, tau_u, tau_v );
      FLA_Bidiag_UT_extract_diagonals( A, diag, offd_top );
    }
    *dtime_bred = FLA_Clock() - t0;

    /* Phase: divide-and-conquer SVD of the bidiagonal matrix. */
    t0 = FLA_Clock();
    {
      FLA_Bsvdd_external( bd_uplo, diag, offd, U_bd, V_bd );
    }
    *dtime_bsvd = FLA_Clock() - t0;

    /* Phase: form the left vectors in W and the right vectors in V; the
       time is added to the Q-forming time measured above. */
    t0 = FLA_Clock();
    {
      FLA_Copy_external( U_bd, W );
      FLA_Bidiag_apply_U_external( FLA_LEFT, FLA_NO_TRANSPOSE, AT, tau_u, W );

      FLA_Copy_external( V_bd, V );
      FLA_Bidiag_apply_V_external( FLA_RIGHT, FLA_CONJ_TRANSPOSE, AT, tau_v, V );
    }
    *dtime_appq += FLA_Clock() - t0;

    /* Phase: compute UL * W into A, then copy the product back into UL. */
    t0 = FLA_Clock();
    {
      FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
                         FLA_ONE, UL, W, FLA_ZERO, A );
      FLA_Copy( A, UL );
    }
    *dtime_gemm = FLA_Clock() - t0;

    FLA_Obj_free( &W );
  }

  /* Publish the singular values, then sort values and vectors together. */
  FLA_Copy( diag, s );
  FLA_Sort_svd( FLA_BACKWARD, s, U, V );

  FLA_Obj_free( &tau_q );
  FLA_Obj_free( &tau_u );
  FLA_Obj_free( &tau_v );
  FLA_Obj_free( &diag );
  FLA_Obj_free( &offd );
  FLA_Obj_free( &U_bd );
  FLA_Obj_free( &V_bd );

  return FLA_SUCCESS;
}
/*
 * time_Trmm
 *
 * Runs one parameter combination of the triangular matrix multiply nrepeats
 * times and reports the smallest elapsed time observed.  A and C are
 * hierarchical objects; flat copies are made for the reference routine.
 *
 *   param_combo  0..11, decoded as
 *                  side  = combo < 6        ? left  : right
 *                  uplo  = (combo % 6) < 3  ? lower : upper
 *                  trans = combo % 3 : 0 conjugate-transpose,
 *                                      1 no-transpose,
 *                                      2 transpose
 *                any other value causes no operation to be called.
 *   type         FLA_ALG_REFERENCE -> REF_Trmm on the flat copies,
 *                FLA_ALG_FRONT     -> FLASH_Trmm on A and C;
 *                any other value prints "trouble" (for a known combo).
 *   nrepeats     number of timed runs.
 *   m, n         not used by this routine.
 *   C            saved on entry, reloaded before every run, and copied back
 *                before returning.
 *   C_ref        written from the flat result for FLA_ALG_REFERENCE;
 *                otherwise read as the comparison target.
 *   dtime        out: best time (also used as scratch for each run).
 *   diff         out: 0.0 for the reference type, otherwise
 *                FLASH_Max_elemwise_diff( C, C_ref ).
 *   gflops       out: length(C) * width(C) * width(A) / best time / 1e9,
 *                multiplied by 4 for the conjugate-transpose combinations.
 */
void time_Trmm( int param_combo, int type, int nrepeats, int m, int n,
                FLA_Obj A, FLA_Obj C, FLA_Obj C_ref,
                double *dtime, double *diff, double *gflops )
{
  const int combo_is_known = ( 0 <= param_combo && param_combo <= 11 );
  int       side           = 0;
  int       uplo           = 0;
  int       trans          = 0;
  double    best_time      = 1.0e9;
  FLA_Obj   C_saved, A_flat, C_flat;
  int       rep;

  /* Decode the combination once; the values are only used when it is known. */
  if ( combo_is_known )
  {
    side = ( param_combo < 6     ? FLA_LEFT             : FLA_RIGHT );
    uplo = ( param_combo % 6 < 3 ? FLA_LOWER_TRIANGULAR : FLA_UPPER_TRIANGULAR );

    switch ( param_combo % 3 )
    {
      case 0:  trans = FLA_CONJ_TRANSPOSE; break;
      case 1:  trans = FLA_NO_TRANSPOSE;   break;
      default: trans = FLA_TRANSPOSE;      break;
    }
  }

  FLASH_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &C_saved );
  FLASH_Obj_create_flat_conf_to_hier( FLA_NO_TRANSPOSE, A, &A_flat );
  FLASH_Obj_create_flat_conf_to_hier( FLA_NO_TRANSPOSE, C, &C_flat );

  /* Keep a pristine copy of C so that every run starts from the same data. */
  FLASH_Copy( C, C_saved );

  for ( rep = 0; rep < nrepeats; ++rep )
  {
    FLASH_Copy( C_saved, C );

    /* Refresh the flat operands used by the reference routine. */
    FLASH_Obj_flatten( A, A_flat );
    FLASH_Obj_flatten( C, C_flat );

    *dtime = FLA_Clock();

    if ( combo_is_known )
    {
      if ( type == FLA_ALG_REFERENCE )
        REF_Trmm( side, uplo, trans, FLA_NONUNIT_DIAG, FLA_TWO, A_flat, C_flat );
      else if ( type == FLA_ALG_FRONT )
        FLASH_Trmm( side, uplo, trans, FLA_NONUNIT_DIAG, FLA_TWO, A, C );
      else
        printf("trouble\n");
    }

    *dtime    = FLA_Clock() - *dtime;
    best_time = min( *dtime, best_time );
  }

  /* The reference run defines C_ref; every other run is compared against it. */
  if ( type == FLA_ALG_REFERENCE )
  {
    FLASH_Obj_hierarchify( C_flat, C_ref );
    *diff = 0.0;
  }
  else
  {
    *diff = FLASH_Max_elemwise_diff( C, C_ref );
  }

  *gflops = 1.0 *
            FLASH_Obj_scalar_length( C ) *
            FLASH_Obj_scalar_width( C ) *
            FLASH_Obj_scalar_width( A ) /
            best_time /
            1.0e9;

  /* Combinations 0, 3, 6 and 9 (conjugate-transpose) count four times. */
  if ( combo_is_known && param_combo % 3 == 0 )
    *gflops *= 4.0;

  *dtime = best_time;

  /* Hand C back to the caller with its original contents. */
  FLASH_Copy( C_saved, C );

  FLASH_Obj_free( &C_saved );
  FLASH_Obj_free( &A_flat );
  FLASH_Obj_free( &C_flat );
}
/*
 * time_Copyt
 *
 * Runs one parameter combination of the Copyt operation nrepeats times and
 * reports the smallest elapsed time observed.
 *
 *   param_combo  selects the trans argument:
 *                  0: no-transpose            1: transpose
 *                  2: conjugate, no-transpose 3: conjugate-transpose
 *                any other value causes no operation to be called.
 *   type         FLA_ALG_REFERENCE -> REF_Copyt, FLA_ALG_FRONT -> FLA_Copyt;
 *                any other value prints "trouble" (for a known combo).
 *   nrepeats     number of timed runs.
 *   m, n         dimensions used only in the flop-count formula.
 *   A            source operand.
 *   C            destination; saved on entry, reloaded before every run, and
 *                copied back before returning.
 *   C_ref        written with the result for FLA_ALG_REFERENCE; otherwise
 *                read as the comparison target.
 *   dtime        out: best time (also used as scratch for each run).
 *   diff         out: 0.0 for the reference type, otherwise
 *                FLA_Max_elemwise_diff( C, C_ref ).
 *   gflops       out: 2.0 * m * n / best time / 1e9, multiplied by 4 when
 *                FLA_Obj_is_complex( A ) holds.
 */
void time_Copyt( int param_combo, int type, int nrepeats, int m, int n,
                 FLA_Obj A, FLA_Obj C, FLA_Obj C_ref,
                 double *dtime, double *diff, double *gflops )
{
  /* trans argument for each supported param_combo value. */
  const int trans_of_combo[4] = { FLA_NO_TRANSPOSE,
                                  FLA_TRANSPOSE,
                                  FLA_CONJ_NO_TRANSPOSE,
                                  FLA_CONJ_TRANSPOSE };
  const int combo_is_known    = ( 0 <= param_combo && param_combo < 4 );
  double    best_time         = 1.0e9;
  FLA_Obj   C_saved;
  int       rep;

  /* Keep a pristine copy of C so that every run starts from the same data. */
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &C_saved );
  FLA_Copy_external( C, C_saved );

  for ( rep = 0; rep < nrepeats; ++rep )
  {
    FLA_Copy_external( C_saved, C );

    *dtime = FLA_Clock();

    if ( combo_is_known )
    {
      const int trans = trans_of_combo[ param_combo ];

      if ( type == FLA_ALG_REFERENCE )
        REF_Copyt( trans, A, C );
      else if ( type == FLA_ALG_FRONT )
        FLA_Copyt( trans, A, C );
      else
        printf("trouble\n");
    }

    *dtime    = FLA_Clock() - *dtime;
    best_time = min( *dtime, best_time );
  }

  /* The reference run defines C_ref; every other run is compared against it. */
  if ( type == FLA_ALG_REFERENCE )
  {
    FLA_Copy_external( C, C_ref );
    *diff = 0.0;
  }
  else
  {
    *diff = FLA_Max_elemwise_diff( C, C_ref );
  }

  *gflops = 2.0 * m * n / best_time / 1.0e9;

  if ( FLA_Obj_is_complex( A ) )
    *gflops *= 4.0;

  *dtime = best_time;

  /* Hand C back to the caller with its original contents. */
  FLA_Copy_external( C_saved, C );
  FLA_Obj_free( &C_saved );
}
FLA_Error FLA_Svd_uv_var2_components( dim_t n_iter_max, dim_t k_accum, dim_t b_alg, FLA_Obj A, FLA_Obj s, FLA_Obj U, FLA_Obj V, double* dtime_bred, double* dtime_bsvd, double* dtime_appq, double* dtime_qrfa, double* dtime_gemm ) { FLA_Error r_val = FLA_SUCCESS; FLA_Datatype dt; FLA_Datatype dt_real; FLA_Datatype dt_comp; FLA_Obj T, S, rL, rR, d, e, G, H, RG, RH, W; dim_t m_A, n_A; dim_t min_m_n; dim_t n_GH; double crossover_ratio = 17.0 / 9.0; double dtime_temp; n_GH = k_accum; m_A = FLA_Obj_length( A ); n_A = FLA_Obj_width( A ); min_m_n = FLA_Obj_min_dim( A ); dt = FLA_Obj_datatype( A ); dt_real = FLA_Obj_datatype_proj_to_real( A ); dt_comp = FLA_Obj_datatype_proj_to_complex( A ); // If the matrix is a scalar, then the SVD is easy. if ( min_m_n == 1 ) { FLA_Copy( A, s ); FLA_Set_to_identity( U ); FLA_Set_to_identity( V ); return FLA_SUCCESS; } // Create matrices to hold block Householder transformations. FLA_Bidiag_UT_create_T( A, &T, &S ); // Create vectors to hold the realifying scalars. FLA_Obj_create( dt, min_m_n, 1, 0, 0, &rL ); FLA_Obj_create( dt, min_m_n, 1, 0, 0, &rR ); // Create vectors to hold the diagonal and sub-diagonal. FLA_Obj_create( dt_real, min_m_n, 1, 0, 0, &d ); FLA_Obj_create( dt_real, min_m_n-1, 1, 0, 0, &e ); // Create matrices to hold the left and right Givens scalars. FLA_Obj_create( dt_comp, min_m_n-1, n_GH, 0, 0, &G ); FLA_Obj_create( dt_comp, min_m_n-1, n_GH, 0, 0, &H ); // Create matrices to hold the left and right Givens matrices. FLA_Obj_create( dt_real, min_m_n, min_m_n, 0, 0, &RG ); FLA_Obj_create( dt_real, min_m_n, min_m_n, 0, 0, &RH ); FLA_Obj_create( dt, m_A, n_A, 0, 0, &W ); if ( m_A >= n_A ) { if ( m_A < crossover_ratio * n_A ) { dtime_temp = FLA_Clock(); { // Reduce the matrix to bidiagonal form. // Apply scalars to rotate elements on the sub-diagonal to the real domain. // Extract the diagonal and sub-diagonal from A. 
FLA_Bidiag_UT( A, T, S ); FLA_Bidiag_UT_realify( A, rL, rR ); FLA_Bidiag_UT_extract_diagonals( A, d, e ); } *dtime_bred = FLA_Clock() - dtime_temp; dtime_temp = FLA_Clock(); { // Form U and V. FLA_Bidiag_UT_form_U( A, T, U ); FLA_Bidiag_UT_form_V( A, S, V ); } *dtime_appq = FLA_Clock() - dtime_temp; // Apply the realifying scalars in rL and rR to U and V, respectively. { FLA_Obj UL, UR; FLA_Obj VL, VR; FLA_Part_1x2( U, &UL, &UR, min_m_n, FLA_LEFT ); FLA_Part_1x2( V, &VL, &VR, min_m_n, FLA_LEFT ); FLA_Apply_diag_matrix( FLA_RIGHT, FLA_CONJUGATE, rL, UL ); FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, rR, VL ); } dtime_temp = FLA_Clock(); { // Perform a singular value decomposition on the bidiagonal matrix. r_val = FLA_Bsvd_v_opt_var2( n_iter_max, d, e, G, H, RG, RH, W, U, V, b_alg ); } *dtime_bsvd = FLA_Clock() - dtime_temp; } else // if ( crossover_ratio * n_A <= m_A ) { FLA_Obj TQ, R; FLA_Obj AT, AB; FLA_Obj UL, UR; //FLA_QR_UT_create_T( A, &TQ ); FLA_Obj_create( dt, 32, n_A, 0, 0, &TQ ); dtime_temp = FLA_Clock(); { // Perform a QR factorization on A and form Q in U. FLA_QR_UT( A, TQ ); } *dtime_qrfa = FLA_Clock() - dtime_temp; dtime_temp = FLA_Clock(); { FLA_QR_UT_form_Q( A, TQ, U ); } *dtime_appq = FLA_Clock() - dtime_temp; FLA_Obj_free( &TQ ); // Set the lower triangle of R to zero and then copy the upper // triangle of A to R. FLA_Part_2x1( A, &AT, &AB, n_A, FLA_TOP ); FLA_Obj_create( dt, n_A, n_A, 0, 0, &R ); FLA_Setr( FLA_LOWER_TRIANGULAR, FLA_ZERO, R ); FLA_Copyr( FLA_UPPER_TRIANGULAR, AT, R ); dtime_temp = FLA_Clock(); { // Reduce the matrix to bidiagonal form. // Apply scalars to rotate elements on the superdiagonal to the real domain. // Extract the diagonal and superdiagonal from A. FLA_Bidiag_UT( R, T, S ); FLA_Bidiag_UT_realify( R, rL, rR ); FLA_Bidiag_UT_extract_diagonals( R, d, e ); } *dtime_bred = FLA_Clock() - dtime_temp; dtime_temp = FLA_Clock(); { // Form V from right Householder vectors in upper triangle of R. 
FLA_Bidiag_UT_form_V( R, S, V ); // Form U in R. FLA_Bidiag_UT_form_U( R, T, R ); } *dtime_appq += FLA_Clock() - dtime_temp; // Apply the realifying scalars in rL and rR to U and V, respectively. FLA_Apply_diag_matrix( FLA_RIGHT, FLA_CONJUGATE, rL, R ); FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, rR, V ); dtime_temp = FLA_Clock(); { // Perform a singular value decomposition on the bidiagonal matrix. r_val = FLA_Bsvd_v_opt_var2( n_iter_max, d, e, G, H, RG, RH, W, R, V, b_alg ); } *dtime_bsvd = FLA_Clock() - dtime_temp; dtime_temp = FLA_Clock(); { // Multiply R into U, storing the result in A and then copying back // to U. FLA_Part_1x2( U, &UL, &UR, n_A, FLA_LEFT ); FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, UL, R, FLA_ZERO, A ); FLA_Copy( A, UL ); } *dtime_gemm = FLA_Clock() - dtime_temp; FLA_Obj_free( &R ); } } else // if ( m_A < n_A ) { FLA_Check_error_code( FLA_NOT_YET_IMPLEMENTED ); } // Copy the converged eigenvalues to the output vector. FLA_Copy( d, s ); // Sort the singular values and singular vectors in descending order. FLA_Sort_svd( FLA_BACKWARD, s, U, V ); FLA_Obj_free( &T ); FLA_Obj_free( &S ); FLA_Obj_free( &rL ); FLA_Obj_free( &rR ); FLA_Obj_free( &d ); FLA_Obj_free( &e ); FLA_Obj_free( &G ); FLA_Obj_free( &H ); FLA_Obj_free( &RG ); FLA_Obj_free( &RH ); FLA_Obj_free( &W ); return r_val; }
/*
 * time_Syrk_ln
 *
 * Runs the selected Syrk variant nrepeats times and reports the smallest
 * elapsed time observed.
 *
 *   variant      0 runs the reference implementation REF_Syrk_ln; any other
 *                value prints "trouble" and runs nothing.
 *   type, n,
 *   nb_alg, B    not used by this routine.
 *   nrepeats     number of timed runs.
 *   A            input operand.
 *   C            matrix updated by the operation; saved on entry, reloaded
 *                before every run, and copied back before returning.
 *   Cref         written with the result when variant is 0; otherwise read
 *                as the comparison target.
 *   dtime        out: best time (also used as scratch for each run).
 *   diff         out: 0.0 when variant is 0, otherwise
 *                FLA_Max_elemwise_diff( C, Cref ).
 *   gflops       out: length(A) * length(A) * width(A) / best time / 1e9.
 *
 * The best time starts at 1.0e9, the same sentinel the other time_*
 * routines use, so the outputs are well defined even when nrepeats <= 0.
 */
void time_Syrk_ln( int variant, int type, int nrepeats, int n, int nb_alg,
                   FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj Cref,
                   double *dtime, double *diff, double *gflops )
{
  int     irep;
  double  dtime_old = 1.0e9;
  FLA_Obj Cold;

  /* Keep a pristine copy of C so that every run starts from the same data. */
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &Cold );
  FLA_Copy_external( C, Cold );

  for ( irep = 0; irep < nrepeats; irep++ )
  {
    FLA_Copy_external( Cold, C );

    *dtime = FLA_Clock();

    switch ( variant )
    {
      case 0:
        // Time reference implementation
        REF_Syrk_ln( FLA_ONE, A, FLA_ONE, C );
        break;

      default:
        printf("trouble\n");
        break;
    }

    *dtime    = FLA_Clock() - *dtime;
    dtime_old = min( *dtime, dtime_old );
  }

  /* The reference run defines Cref; every other run is compared against it. */
  if ( variant == 0 )
  {
    FLA_Copy_external( C, Cref );
    *diff = 0.0;
  }
  else
  {
    *diff = FLA_Max_elemwise_diff( C, Cref );
  }

  *gflops = 1.0 *
            FLA_Obj_length( A ) *
            FLA_Obj_length( A ) *
            FLA_Obj_width( A ) /
            dtime_old /
            1e9;

  *dtime = dtime_old;

  /* Hand C back to the caller with its original contents. */
  FLA_Copy_external( Cold, C );
  FLA_Obj_free( &Cold );
}
/*
 * FLA_Hevd_lv_var4_components
 *
 * Computes an eigenvalue decomposition of A (lower triangle referenced),
 * leaving the eigenvectors in A and the eigenvalues in l, and reports how
 * long each phase took.
 *
 *   n_iter_max   passed through to FLA_Tevd_v_opt_var4.
 *   A            in/out: input matrix, overwritten with the eigenvectors.
 *   l            out: eigenvalues, sorted together with the columns of A.
 *   k_accum      number of columns of the Givens-scalar matrix G.
 *   b_alg        passed through to FLA_Tevd_v_opt_var4.
 *   dtime_tred   out: time spent reducing to tridiagonal form.
 *   dtime_tevd   out: time spent in FLA_Tevd_v_opt_var4.
 *   dtime_appq   out: time spent forming Q.
 *
 * All three timing outputs are preset to 1 and keep that value when the
 * matrix is a scalar.
 *
 * Returns FLA_SUCCESS for scalar input; otherwise the value returned by
 * FLA_Tevd_v_opt_var4.
 */
FLA_Error FLA_Hevd_lv_var4_components( dim_t n_iter_max, FLA_Obj A, FLA_Obj l,
                                       dim_t k_accum, dim_t b_alg,
                                       double* dtime_tred, double* dtime_tevd,
                                       double* dtime_appq )
{
  FLA_Error    r_val = FLA_SUCCESS;
  FLA_Uplo     uplo  = FLA_LOWER_TRIANGULAR;
  FLA_Datatype dt;
  FLA_Datatype dt_real;
  FLA_Datatype dt_comp;
  FLA_Obj      T, r, d, e, G, R, W;
  FLA_Obj      d0, e0, ls, pu;
  dim_t        mn_A;
  dim_t        n_G = k_accum;
  double       dtime_temp;

  mn_A    = FLA_Obj_length( A );
  dt      = FLA_Obj_datatype( A );
  dt_real = FLA_Obj_datatype_proj_to_real( A );
  dt_comp = FLA_Obj_datatype_proj_to_complex( A );

  *dtime_tred = 1;
  *dtime_tevd = 1;
  *dtime_appq = 1;

  // If the matrix is a scalar, then the EVD is easy.
  if ( mn_A == 1 )
  {
    FLA_Copy( A, l );
    FLA_Set( FLA_ONE, A );

    return FLA_SUCCESS;
  }

  // Create a matrix to hold block Householder transformations.
  FLA_Tridiag_UT_create_T( A, &T );

  // Create a vector to hold the realifying scalars.
  FLA_Obj_create( dt,      mn_A,   1,    0, 0, &r );

  // Create vectors to hold the diagonal and sub-diagonal, plus the scratch
  // copies (d0, e0) consumed by the eigenvalue-only solve below.
  FLA_Obj_create( dt_real, mn_A,   1,    0, 0, &d );
  FLA_Obj_create( dt_real, mn_A-1, 1,    0, 0, &e );
  FLA_Obj_create( dt_real, mn_A,   1,    0, 0, &d0 );
  FLA_Obj_create( dt_real, mn_A-1, 1,    0, 0, &e0 );
  FLA_Obj_create( dt_real, mn_A,   1,    0, 0, &pu );
  FLA_Obj_create( FLA_INT, mn_A,   1,    0, 0, &ls );
  FLA_Obj_create( dt_comp, mn_A-1, n_G,  0, 0, &G );
  FLA_Obj_create( dt_real, mn_A,   mn_A, 0, 0, &R );
  FLA_Obj_create( dt,      mn_A,   mn_A, 0, 0, &W );

  dtime_temp = FLA_Clock();
  {
    // Reduce the matrix to tridiagonal form.
    FLA_Tridiag_UT( uplo, A, T );
  }
  *dtime_tred = FLA_Clock() - dtime_temp;

  // Apply scalars to rotate elements on the sub-diagonal to the real domain.
  FLA_Tridiag_UT_realify( uplo, A, r );

  // Extract the diagonal and sub-diagonal from A.
  FLA_Tridiag_UT_extract_diagonals( uplo, A, d, e );

  dtime_temp = FLA_Clock();
  {
    // Form Q, overwriting A.
    FLA_Tridiag_UT_form_Q( uplo, A, T );
  }
  *dtime_appq = FLA_Clock() - dtime_temp;

  // Apply the scalars in r to Q.
  FLA_Apply_diag_matrix( FLA_RIGHT, FLA_CONJUGATE, r, A );

  // Find the eigenvalues only, working on copies so d and e stay intact.
  FLA_Copy( d, d0 );
  FLA_Copy( e, e0 );
  {
    // LAPACK takes the order as a pointer to int; mn_A is a dim_t, so hand
    // it over through an int of the right type instead of aliasing it.
    int     n_lapack = ( int ) mn_A;
    int     info;
    // NOTE(review): the buffers are accessed as double -- this presumes
    // dt_real is double precision; confirm for single-precision input.
    double* buff_d   = FLA_DOUBLE_PTR( d0 );
    double* buff_e   = FLA_DOUBLE_PTR( e0 );

    // NOTE(review): info is not examined, as in the original code.
    dsterf_( &n_lapack, buff_d, buff_e, &info );
  }
  FLA_Sort( FLA_FORWARD, d0 );

  FLA_Set( FLA_ZERO, ls );
  FLA_Set( FLA_ZERO, pu );

  dtime_temp = FLA_Clock();
  {
    // Perform an eigenvalue decomposition on the tridiagonal matrix.
    r_val = FLA_Tevd_v_opt_var4( n_iter_max, d, e, d0, ls, pu, G, R, W, A, b_alg );
  }
  *dtime_tevd = FLA_Clock() - dtime_temp;

  // Copy the converged eigenvalues to the output vector.
  FLA_Copy( d, l );

  // Sort the eigenvalues and eigenvectors together.
  FLA_Sort_evd( FLA_FORWARD, l, A );

  FLA_Obj_free( &T );
  FLA_Obj_free( &r );
  FLA_Obj_free( &d );
  FLA_Obj_free( &e );
  FLA_Obj_free( &d0 );
  FLA_Obj_free( &pu );
  FLA_Obj_free( &e0 );
  FLA_Obj_free( &ls );
  FLA_Obj_free( &G );
  FLA_Obj_free( &R );
  FLA_Obj_free( &W );

  return r_val;
}
/*
 * time_Gemm_pp_nn
 *
 * Runs the selected Gemm variant nrepeats times and reports the smallest
 * elapsed time observed.
 *
 *   variant      0: reference implementation (REF_Gemm).
 *                1: variant 1, further selected by type.
 *                Any other value runs nothing.
 *   type         for variant 1 only: FLA_ALG_UNBLOCKED calls
 *                FLA_Gemm_pp_nn_var1, FLA_ALG_BLOCKED calls REF_Gemm;
 *                any other value prints "trouble".
 *   nrepeats     number of timed runs.
 *   n            not used by this routine.
 *   nb_alg       passed through to FLA_Gemm_pp_nn_var1.
 *   A, B         input operands.
 *   C            matrix updated by the operation; saved on entry, reloaded
 *                before every run, and copied back before returning.
 *   Cref         written with the result when variant is 0; otherwise read
 *                as the comparison target.
 *   dtime        out: best time (also used as scratch for each run).
 *   diff         out: 0.0 when variant is 0, otherwise
 *                FLA_Max_elemwise_diff( C, Cref ).
 *   mflops       out: 2 * length(C) * width(C) * width(A) / best time / 1e6.
 *
 * The best time starts at 1.0e9, the same sentinel the other time_*
 * routines use, so the outputs are well defined even when nrepeats <= 0.
 */
void time_Gemm_pp_nn( int variant, int type, int nrepeats, int n, int nb_alg,
                      FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj Cref,
                      double *dtime, double *diff, double *mflops )
{
  int     irep;
  double  dtime_old = 1.0e9;
  FLA_Obj Cold;

  /* Keep a pristine copy of C so that every run starts from the same data. */
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &Cold );
  FLA_Copy_external( C, Cold );

  for ( irep = 0; irep < nrepeats; irep++ )
  {
    FLA_Copy_external( Cold, C );

    *dtime = FLA_Clock();

    switch ( variant )
    {
      case 0:
        // Time reference implementation
        // NOTE(review): the first scaling argument is ONE while the second
        // is FLA_ONE -- confirm that ONE is the intended symbol.
        REF_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, ONE, A, B, FLA_ONE, C );
        break;

      case 1:
      {
        // Time variant 1
        switch ( type )
        {
          case FLA_ALG_UNBLOCKED:
            FLA_Gemm_pp_nn_var1( FLA_ONE, A, B, C, nb_alg );
            break;

          case FLA_ALG_BLOCKED:
            REF_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, ONE, A, B, FLA_ONE, C );
            break;

          default:
            printf("trouble\n");
        }
        break;
      }

      default:
        /* Unknown variants run nothing, as before. */
        break;
    }

    *dtime    = FLA_Clock() - *dtime;
    dtime_old = min( *dtime, dtime_old );
  }

  /* The reference run defines Cref; every other run is compared against it. */
  if ( variant == 0 )
  {
    FLA_Copy_external( C, Cref );
    *diff = 0.0;
  }
  else
  {
    *diff = FLA_Max_elemwise_diff( C, Cref );
  }

  *mflops = 2.0 *
            FLA_Obj_length( C ) *
            FLA_Obj_width( C ) *
            FLA_Obj_width( A ) /
            dtime_old /
            1000000;

  *dtime = dtime_old;

  /* Hand C back to the caller with its original contents. */
  FLA_Copy_external( Cold, C );
  FLA_Obj_free( &Cold );
}
int main( int argc, char *argv[] ) { int i, j, n_threads, n_repeats, n_trials, increment, begin, sorting, caching, work_stealing, data_affinity; dim_t size, nb_alg; FLA_Datatype datatype = FLA_DOUBLE; FLA_Obj A, x, b, b_norm, AH, pH, bH; double b_norm_value, dtime, *dtimes, *flops; #ifndef FLA_ENABLE_WINDOWS_BUILD char output_file_m[100]; FILE *fpp; #endif fprintf( stdout, "%c Enter number of repeats: ", '%' ); scanf( "%d", &n_repeats ); fprintf( stdout, "%c %d\n", '%', n_repeats ); fprintf( stdout, "%c Enter blocksize: ", '%' ); scanf( "%u", &nb_alg ); fprintf( stdout, "%c %u\n", '%', nb_alg ); fprintf( stdout, "%c Enter problem size parameters: first, inc, num: ", '%' ); scanf( "%d%d%d", &begin, &increment, &n_trials ); fprintf( stdout, "%c %d %d %d\n", '%', begin, increment, n_trials ); fprintf( stdout, "%c Enter number of threads: ", '%' ); scanf( "%d", &n_threads ); fprintf( stdout, "%c %d\n", '%', n_threads ); fprintf( stdout, "%c Enter SuperMatrix parameters: sorting, caching, work stealing, data affinity: ", '%' ); scanf( "%d%d%d%d", &sorting, &caching, &work_stealing, &data_affinity ); fprintf( stdout, "%c %s %s %s %s\n\n", '%', ( sorting ? "TRUE" : "FALSE" ), ( caching ? "TRUE" : "FALSE" ), ( work_stealing ? "TRUE" : "FALSE" ), ( data_affinity ? ( data_affinity == 1 ? 
"FLASH_QUEUE_AFFINITY_2D_BLOCK_CYCLIC" : "FLASH_QUEUE_AFFINITY_OTHER" ) : "FLASH_QUEUE_AFFINITY_NONE" ) ); #ifdef FLA_ENABLE_WINDOWS_BUILD fprintf( stdout, "%s_%u = [\n", OUTPUT_FILE, nb_alg ); #else sprintf( output_file_m, "%s/%s_output.m", OUTPUT_PATH, OUTPUT_FILE ); fpp = fopen( output_file_m, "a" ); fprintf( fpp, "%%\n" ); fprintf( fpp, "%% | Matrix Size | FLASH |\n" ); fprintf( fpp, "%% | n x n | GFlops |\n" ); fprintf( fpp, "%% -----------------------------\n" ); fprintf( fpp, "%s_%u = [\n", OUTPUT_FILE, nb_alg ); #endif FLA_Init(); dtimes = ( double * ) FLA_malloc( n_repeats * sizeof( double ) ); flops = ( double * ) FLA_malloc( n_trials * sizeof( double ) ); FLASH_Queue_set_num_threads( n_threads ); FLASH_Queue_set_sorting( sorting ); FLASH_Queue_set_caching( caching ); FLASH_Queue_set_work_stealing( work_stealing ); FLASH_Queue_set_data_affinity( data_affinity ); for ( i = 0; i < n_trials; i++ ) { size = begin + i * increment; FLA_Obj_create( datatype, size, size, 0, 0, &A ); FLA_Obj_create( datatype, size, 1, 0, 0, &x ); FLA_Obj_create( datatype, size, 1, 0, 0, &b ); FLA_Obj_create( datatype, 1, 1, 0, 0, &b_norm ); for ( j = 0; j < n_repeats; j++ ) { FLA_Random_matrix( A ); FLA_Random_matrix( b ); FLASH_Obj_create_hier_copy_of_flat( A, 1, &nb_alg, &AH ); FLASH_Obj_create( FLA_INT, size, 1, 1, &nb_alg, &pH ); FLASH_Obj_create_hier_copy_of_flat( b, 1, &nb_alg, &bH ); dtime = FLA_Clock(); FLASH_LU_piv( AH, pH ); dtime = FLA_Clock() - dtime; dtimes[j] = dtime; FLASH_Apply_pivots( FLA_LEFT, FLA_NO_TRANSPOSE, pH, bH ); FLASH_Trsv( FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_UNIT_DIAG, AH, bH ); FLASH_Trsv( FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, AH, bH ); FLASH_Obj_free( &AH ); FLASH_Obj_free( &pH ); FLASH_Obj_flatten( bH, x ); FLASH_Obj_free( &bH ); } dtime = dtimes[0]; for ( j = 1; j < n_repeats; j++ ) dtime = min( dtime, dtimes[j] ); flops[i] = 2.0 / 3.0 * size * size * size / dtime / 1e9; FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_ONE, A, x, 
FLA_MINUS_ONE, b ); FLA_Nrm2_external( b, b_norm ); FLA_Obj_extract_real_scalar( b_norm, &b_norm_value ); #ifdef FLA_ENABLE_WINDOWS_BUILD fprintf( stdout, " %d %6.3f %le\n", size, flops[i], b_norm_value ); #else fprintf( fpp, " %d %6.3f\n", size, flops[i] ); fprintf( stdout, "Time: %e | GFlops: %6.3f\n", dtime, flops[i] ); fprintf( stdout, "Matrix size: %u x %u | nb_alg: %u\n", size, size, nb_alg ); fprintf( stdout, "Norm of difference: %le\n\n", b_norm_value ); #endif FLA_Obj_free( &A ); FLA_Obj_free( &x ); FLA_Obj_free( &b ); FLA_Obj_free( &b_norm ); } #ifdef FLA_ENABLE_WINDOWS_BUILD fprintf( stdout, "];\n\n" ); #else fprintf( fpp, "];\n" ); fflush( fpp ); fclose( fpp ); #endif FLA_free( dtimes ); FLA_free( flops ); FLA_Finalize(); return 0; }
int main(int argc, char *argv[]) { int n, nfirst, nlast, ninc, nlast_unb, i, irep, nrepeats, nb_alg; double dtime, dtime_best, gflops, max_gflops, diff, d_n; FLA_Obj A, Aref, Aold, delta; /* Initialize FLAME */ FLA_Init( ); /* Every time trial is repeated "repeat" times and the fastest run in recorded */ printf( "%% number of repeats:" ); scanf( "%d", &nrepeats ); printf( "%% %d\n", nrepeats ); /* Enter the max GFLOPS attainable This is used to set the y-axis range for the graphs. Here is how you figure out what to enter (on Linux machines): 1) more /proc/cpuinfo (this lists the contents of this file). 2) read through this and figure out the clock rate of the machine (in GHz). 3) Find out (from an expert of from the web) the number of floating point instructions that can be performed per core per clock cycle. 4) Figure out if you are using "multithreaded BLAS" which automatically parallelize calls to the Basic Linear Algebra Subprograms. If so, check how many cores are available. 5) Multiply 2) x 3) x 4) and enter this in response to the below. If you enter a value for max GFLOPS that is lower that the maximum that is observed in the experiments, then the top of the graph is set to the observed maximum. Thus, one possibility is to simply set this to 0.0. */ printf( "%% enter max GFLOPS:" ); scanf( "%lf", &max_gflops ); printf( "%% %lf\n", max_gflops ); /* Enter the algorithmic block size */ printf( "%% enter nb_alg:" ); scanf( "%d", &nb_alg ); printf( "%% %d\n", nb_alg ); /* Timing trials for matrix sizes n=nfirst to nlast in increments of ninc will be performed. 
Unblocked versions are only tested to nlast_unb */ printf( "%% enter nfirst, nlast, ninc, nlast_unb:" ); scanf( "%d%d%d%d", &nfirst, &nlast, &ninc, &nlast_unb ); printf( "%% %d %d %d %d\n", nfirst, nlast, ninc, nlast_unb ); i = 1; for ( n=nfirst; n<= nlast; n+=ninc ){ /* Allocate space for the matrices */ FLA_Obj_create( FLA_DOUBLE, n, n, 1, n, &A ); FLA_Obj_create( FLA_DOUBLE, n, n, 1, n, &Aref ); FLA_Obj_create( FLA_DOUBLE, n, n, 1, n, &Aold ); FLA_Obj_create( FLA_DOUBLE, 1, 1, 1, 1, &delta ); /* Generate random matrix A and save in Aold */ FLA_Random_matrix( Aold ); /* Add something large to the diagonal to make sure it isn't ill-conditionsed */ d_n = ( double ) n; *( ( double * ) FLA_Obj_buffer_at_view( delta ) ) = d_n; FLA_Shift_diag( FLA_NO_CONJUGATE, delta, Aold ); /* Set gflops = billions of floating point operations that will be performed */ gflops = 1.0/3.0 * n * n * n * 1.0e-09; /* Time the reference implementation */ #if TIME_LAPACK == TRUE #else // if ( n <= nlast_unb ) #endif { for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Aold, Aref ); dtime = FLA_Clock(); REF_Chol( TIME_LAPACK, Aref, nb_alg ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } printf( "data_REF( %d, 1:2 ) = [ %d %le ];\n", i, n, gflops / dtime_best ); fflush( stdout ); } /* Time FLA_Chol */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Aold, A ); dtime = FLA_Clock(); FLA_Chol( FLA_LOWER_TRIANGULAR, A ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? 
dtime : dtime_best ); } printf( "data_FLAME( %d, 1:2 ) = [ %d %le ];\n", i, n, gflops / dtime_best ); if ( gflops / dtime_best > max_gflops ) max_gflops = gflops / dtime_best; fflush( stdout ); /* Time the your implementations */ /* Variant 1 unblocked */ if ( n <= nlast_unb ){ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Aold, A ); dtime = FLA_Clock(); #if TIME_UNB_VAR1 == TRUE Chol_unb_var1( A ); #else REF_Chol( TIME_LAPACK, A, nb_alg ); #endif dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( A, Aref ); printf( "data_unb_var1( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); } /* Variant 1 blocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Aold, A ); dtime = FLA_Clock(); #if TIME_BLK_VAR1 == TRUE Chol_blk_var1( A, nb_alg ); #else REF_Chol( TIME_LAPACK, A, nb_alg ); #endif dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( A, Aref ); printf( "data_blk_var1( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); /* Variant 2 unblocked */ if ( n <= nlast_unb ){ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Aold, A ); dtime = FLA_Clock(); #if TIME_UNB_VAR2 == TRUE Chol_unb_var2( A ); #else REF_Chol( TIME_LAPACK, A, nb_alg ); #endif dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? 
dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( A, Aref ); printf( "data_unb_var2( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); } /* Variant 2 blocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Aold, A ); dtime = FLA_Clock(); #if TIME_BLK_VAR2 == TRUE Chol_blk_var2( A, nb_alg ); #else REF_Chol( TIME_LAPACK, A, nb_alg ); #endif dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( A, Aref ); printf( "data_blk_var2( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); /* Variant 3 unblocked */ if ( n <= nlast_unb ){ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Aold, A ); dtime = FLA_Clock(); #if TIME_UNB_VAR3 == TRUE Chol_unb_var3( A ); #else REF_Chol( TIME_LAPACK, A, nb_alg ); #endif dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( A, Aref ); printf( "data_unb_var3( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); } /* Variant 3 blocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Aold, A ); dtime = FLA_Clock(); #if TIME_BLK_VAR3 == TRUE Chol_blk_var3( A, nb_alg ); #else REF_Chol( TIME_LAPACK, A, nb_alg ); #endif dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? 
dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( A, Aref ); printf( "data_blk_var3( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); FLA_Obj_free( &A ); FLA_Obj_free( &Aold ); FLA_Obj_free( &Aref ); FLA_Obj_free( &delta ); printf( "\n" ); i++; } /* Print the MATLAB commands to plot the data */ /* Delete all existing figures */ printf( "close all\n" ); #if OCTAVE == TRUE /* Plot the performance of FLAME */ printf( "plot( data_FLAME( :,1 ), data_FLAME( :, 2 ), '-k;libflame;' ); \n" ); /* Indicate that you want to add to the existing plot */ printf( "hold on\n" ); /* Plot the performance of the reference implementation */ printf( "plot( data_REF( :,1 ), data_REF( :, 2 ), '-m;reference;' ); \n" ); /* Plot the performance of your implementations */ printf( "plot( data_unb_var1( :,1 ), data_unb_var1( :, 2 ), \"-rx;UnbVar1;\" ); \n" ); printf( "plot( data_unb_var2( :,1 ), data_unb_var2( :, 2 ), \"-go;UnbVar2;\" ); \n" ); printf( "plot( data_unb_var3( :,1 ), data_unb_var3( :, 2 ), \"-b*;UnbVar3;\" ); \n" ); printf( "plot( data_blk_var1( :,1 ), data_blk_var1( :, 2 ), \"-rx;BlkVar1;\", \"markersize\", 3 ); \n" ); printf( "plot( data_blk_var2( :,1 ), data_blk_var2( :, 2 ), \"-go;BlkVar2;\", \"markersize\", 3 ); \n" ); printf( "plot( data_blk_var3( :,1 ), data_blk_var3( :, 2 ), \"-b*;BlkVar3;\", \"markersize\", 3 ); \n" ); #else /* Plot the performance of FLAME */ printf( "plot( data_FLAME( :,1 ), data_FLAME( :, 2 ), 'k--' ); \n" ); /* Indicate that you want to add to the existing plot */ printf( "hold on\n" ); /* Plot the performance of the reference implementation */ printf( "plot( data_REF( :,1 ), data_REF( :, 2 ), 'k-' ); \n" ); /* Plot the performance of your implementations */ printf( "plot( data_unb_var1( :,1 ), data_unb_var1( :, 2 ), 'r-.x' ); \n" ); printf( "plot( data_unb_var2( :,1 ), data_unb_var2( :, 2 ), 'g-.o' ); \n" ); printf( "plot( data_unb_var3( :,1 ), data_unb_var3( :, 2 ), 'b-.*' ); \n" ); printf( "plot( 
data_blk_var1( :,1 ), data_blk_var1( :, 2 ), 'r-x'); \n" ); printf( "plot( data_blk_var2( :,1 ), data_blk_var2( :, 2 ), 'g-o'); \n" ); printf( "plot( data_blk_var3( :,1 ), data_blk_var3( :, 2 ), 'b-*'); \n" ); #endif printf( "hold off \n"); printf( "xlabel( 'matrix dimension m=n' );\n"); printf( "ylabel( 'GFLOPS/sec.' );\n"); printf( "axis( [ 0 %d 0 %3.1f ] ); \n", nlast, max_gflops ); #if OCTAVE == TRUE printf( "legend( 2 ); \n" ); printf(" print -landscape -solid -color -deps -F:24 Chol.eps\n" ); #else printf( "legend( 'FLA Chol', ...\n"); printf( " 'Simple loops', ...\n"); printf( " 'unb var1', ...\n"); printf( " 'unb var2', ...\n"); printf( " 'unb var3', ...\n"); printf( " 'blk var1', ...\n"); printf( " 'blk var2', ...\n"); printf( " 'blk var3', 2);\n"); printf( "print -r100 -dpdf Chol.pdf\n"); #endif FLA_Finalize( ); exit( 0 ); }
/*
 * Time FLASH_Lyap for one parameter combination and report the best of
 * nrepeats runs together with a residual check.
 *
 *   param_combo  0: FLA_NO_TRANSPOSE, 1: FLA_CONJ_TRANSPOSE.
 *   type         only FLA_ALG_FRONT is handled; any other value prints
 *                "trouble" on every repetition and times nothing.
 *   nrepeats     number of timed repetitions.
 *   m            problem dimension, used only for the flop count.
 *   isgn, A, C, scale
 *                operands forwarded to FLASH_Lyap.  C is restored to its
 *                input contents before returning.
 *   dtime        out: fastest elapsed time (stays 1.0e9 if nothing ran).
 *   diff         out: 1-norm of  isgn * ( op(A) X + X op(A)^H ) - C_in, where
 *                X is the Hermitianized result left in C by the last run.
 *   gflops       out: (2/3) m^3 / dtime / 1e9, times 4 for complex C.
 *
 * The combination param_combo == 0 with FLA_ALG_FRONT is skipped: *gflops and
 * *diff are set to 0.0 and *dtime is left untouched.
 */
void time_Lyap( int param_combo, int type, int nrepeats, int m,
                FLA_Obj isgn, FLA_Obj A, FLA_Obj C, FLA_Obj scale,
                double *dtime, double *diff, double *gflops )
{
  int irep;
  double dtime_old = 1.0e9;
  FLA_Obj C_save, norm;

  if ( param_combo == 0 && type == FLA_ALG_FRONT )
  {
    *gflops = 0.0;
    *diff   = 0.0;
    return;
  }

  FLASH_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &C_save );
  FLA_Obj_create( FLA_Obj_datatype_proj_to_real( C ), 1, 1, 0, 0, &norm );

  /* Keep the right-hand side so every repetition starts from the same data */
  FLASH_Copy( C, C_save );

  for ( irep = 0 ; irep < nrepeats; irep++ )
  {
    FLASH_Copy( C_save, C );

    *dtime = FLA_Clock();

    switch( param_combo ){

    case 0:{
      switch( type ){
      case FLA_ALG_FRONT:
        FLASH_Lyap( FLA_NO_TRANSPOSE, isgn, A, C, scale );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    case 1:{
      switch( type ){
      case FLA_ALG_FRONT:
        FLASH_Lyap( FLA_CONJ_TRANSPOSE, isgn, A, C, scale );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    default:
      /* Unknown combination: nothing is executed inside the timed region */
      break;
    }

    *dtime = FLA_Clock() - *dtime;
    dtime_old = min( *dtime, dtime_old );
  }

  /* Residual check: plug the computed solution back into the equation and
     compare against the saved right-hand side. */
  {
    FLA_Obj X, W;

    FLASH_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &X );
    FLASH_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &W );

    FLASH_Copy( C, X );
    FLASH_Hermitianize( FLA_UPPER_TRIANGULAR, X );

    if ( param_combo == 0 )
    {
      /* W = A X + X A^H */
      FLASH_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A, X, FLA_ZERO, W );
      FLASH_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, X, A, FLA_ONE, W );
    }
    else if ( param_combo == 1 )
    {
      /* W = A^H X + X A */
      FLASH_Gemm( FLA_CONJ_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A, X, FLA_ZERO, W );
      FLASH_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, X, A, FLA_ONE, W );
    }

    /* W = isgn * W - C_save */
    FLASH_Scal( isgn, W );
    FLASH_Axpy( FLA_MINUS_ONE, C_save, W );

    FLASH_Norm1( W, norm );
    FLA_Obj_extract_real_scalar( norm, diff );

    FLASH_Obj_free( &X );
    FLASH_Obj_free( &W );
  }

  /* Cube m in double precision: the int product m * m * m overflows (undefined
     behavior) once m exceeds 1290. */
  *gflops = ( 2.0 / 3.0 ) * ( ( double ) m * m * m ) /
            dtime_old / 1e9;

  if ( FLA_Obj_is_complex( C ) )
    *gflops *= 4.0;

  *dtime = dtime_old;

  /* Hand C back to the caller unchanged */
  FLASH_Copy( C_save, C );

  FLASH_Obj_free( &C_save );
  FLA_Obj_free( &norm );
}
int main( int argc, char *argv[] ) { int i, j, n_threads, n_repeats, n_trials, increment, begin, sorting, caching, work_stealing, data_affinity; dim_t size, nb_alg; FLA_Datatype datatype = FLA_DOUBLE; FLA_Inv inv = FLA_NO_INVERSE; FLA_Uplo uplo = FLA_LOWER_TRIANGULAR; FLA_Obj A, B, x, b, b_norm, AH, BH; double length, b_norm_value = 0.0, dtime, *dtimes, *flops; #ifndef FLA_ENABLE_WINDOWS_BUILD char output_file_m[100]; FILE *fpp; #endif fprintf( stdout, "%c Enter number of repeats: ", '%' ); scanf( "%d", &n_repeats ); fprintf( stdout, "%c %d\n", '%', n_repeats ); fprintf( stdout, "%c Enter blocksize: ", '%' ); scanf( "%u", &nb_alg ); fprintf( stdout, "%c %u\n", '%', nb_alg ); fprintf( stdout, "%c Enter problem size parameters: first, inc, num: ", '%' ); scanf( "%d%d%d", &begin, &increment, &n_trials ); fprintf( stdout, "%c %d %d %d\n", '%', begin, increment, n_trials ); fprintf( stdout, "%c Enter number of threads: ", '%' ); scanf( "%d", &n_threads ); fprintf( stdout, "%c %d\n", '%', n_threads ); fprintf( stdout, "%c Enter SuperMatrix parameters: sorting, caching, work stealing, data affinity: ", '%' ); scanf( "%d%d%d%d", &sorting, &caching, &work_stealing, &data_affinity ); fprintf( stdout, "%c %s %s %s %s\n\n", '%', ( sorting ? "TRUE" : "FALSE" ), ( caching ? "TRUE" : "FALSE" ), ( work_stealing ? "TRUE" : "FALSE" ), ( data_affinity ? ( data_affinity == 1 ? 
"FLASH_QUEUE_AFFINITY_2D_BLOCK_CYCLIC" : "FLASH_QUEUE_AFFINITY_OTHER" ) : "FLASH_QUEUE_AFFINITY_NONE" ) ); #ifdef FLA_ENABLE_WINDOWS_BUILD fprintf( stdout, "%s_%u = [\n", OUTPUT_FILE, nb_alg ); #else sprintf( output_file_m, "%s/%s_output.m", OUTPUT_PATH, OUTPUT_FILE ); fpp = fopen( output_file_m, "a" ); fprintf( fpp, "%%\n" ); fprintf( fpp, "%% | Matrix Size | FLASH |\n" ); fprintf( fpp, "%% | n x n | GFlops |\n" ); fprintf( fpp, "%% -----------------------------\n" ); fprintf( fpp, "%s_%u = [\n", OUTPUT_FILE, nb_alg ); #endif FLA_Init(); dtimes = ( double * ) FLA_malloc( n_repeats * sizeof( double ) ); flops = ( double * ) FLA_malloc( n_trials * sizeof( double ) ); FLASH_Queue_set_num_threads( n_threads ); FLASH_Queue_set_sorting( sorting ); FLASH_Queue_set_caching( caching ); FLASH_Queue_set_work_stealing( work_stealing ); FLASH_Queue_set_data_affinity( data_affinity ); for ( i = 0; i < n_trials; i++ ) { size = begin + i * increment; FLA_Obj_create( datatype, size, size, 0, 0, &A ); FLA_Obj_create( datatype, size, size, 0, 0, &B ); FLA_Obj_create( datatype, size, 1, 0, 0, &x ); FLA_Obj_create( datatype, size, 1, 0, 0, &b ); FLA_Obj_create( datatype, 1, 1, 0, 0, &b_norm ); for ( j = 0; j < n_repeats; j++ ) { FLA_Random_matrix( A ); FLA_Random_matrix( B ); FLA_Random_matrix( x ); FLA_Random_matrix( b ); FLA_Symmetrize( uplo, A ); FLA_Symmetrize( uplo, B ); length = ( double ) FLA_Obj_length( B ); FLA_Add_to_diag( &length, B ); FLA_Symv_external( uplo, FLA_ONE, B, x, FLA_ZERO, b ); FLASH_Obj_create_hier_copy_of_flat( A, 1, &nb_alg, &AH ); FLASH_Obj_create_hier_copy_of_flat( B, 1, &nb_alg, &BH ); FLASH_Chol( uplo, BH ); dtime = FLA_Clock(); FLASH_Eig_gest( inv, uplo, AH, BH ); dtime = FLA_Clock() - dtime; dtimes[j] = dtime; FLASH_Obj_free( &AH ); FLASH_Obj_free( &BH ); } dtime = dtimes[0]; for ( j = 1; j < n_repeats; j++ ) dtime = min( dtime, dtimes[j] ); flops[i] = 1.0 * size * size * size / dtime / 1e9; #ifdef FLA_ENABLE_WINDOWS_BUILD fprintf( stdout, " %d %6.3f 
%le\n", size, flops[i], b_norm_value ); #else fprintf( fpp, " %d %6.3f\n", size, flops[i] ); fprintf( stdout, "Time: %e | GFlops: %6.3f\n", dtime, flops[i] ); fprintf( stdout, "Matrix size: %u x %u | nb_alg: %u\n", size, size, nb_alg ); fprintf( stdout, "Norm of difference: %le\n\n", b_norm_value ); #endif FLA_Obj_free( &A ); FLA_Obj_free( &B ); FLA_Obj_free( &x ); FLA_Obj_free( &b ); FLA_Obj_free( &b_norm ); } #ifdef FLA_ENABLE_WINDOWS_BUILD fprintf( stdout, "];\n\n" ); #else fprintf( fpp, "];\n" ); fflush( fpp ); fclose( fpp ); #endif FLA_free( dtimes ); FLA_free( flops ); FLA_Finalize(); return 0; }
/*
 * Benchmark one copy implementation and report the fastest of nrepeats runs.
 *
 * Only param_combo == 0 runs anything: type FLA_ALG_REFERENCE calls REF_Copy
 * on flat copies of A and C, type FLA_ALG_FRONT calls FLASH_Copy on A and C
 * themselves, and any other type prints "trouble".
 *
 * Outputs:
 *   *dtime   fastest elapsed time (1.0e9 when no repetition ran)
 *   *diff    0.0 for the reference run, which also stores its flat result into
 *            C_ref; otherwise FLASH_Max_elemwise_diff( C, C_ref )
 *   *gflops  2 * m * n / *dtime / 1e9
 *
 * C holds its original contents again when the function returns.
 */
void time_Copy( int param_combo, int type, int nrepeats, int m, int n,
                FLA_Obj A, FLA_Obj C, FLA_Obj C_ref,
                double *dtime, double *diff, double *gflops )
{
  double best_time = 1.0e9;
  double elapsed;
  int    rep = 0;

  FLA_Obj C_backup, A_flat, C_flat;

  FLASH_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &C_backup );
  FLASH_Obj_create_flat_conf_to_hier( FLA_NO_TRANSPOSE, A, &A_flat );
  FLASH_Obj_create_flat_conf_to_hier( FLA_NO_TRANSPOSE, C, &C_flat );

  /* Snapshot C so that each repetition sees identical input */
  FLASH_Copy( C, C_backup );

  while ( rep < nrepeats )
  {
    FLASH_Copy( C_backup, C );

    /* Refresh the flat operands handed to the reference implementation */
    FLASH_Obj_flatten( A, A_flat );
    FLASH_Obj_flatten( C, C_flat );

    elapsed = FLA_Clock();

    if ( param_combo == 0 )
    {
      if ( type == FLA_ALG_REFERENCE )
        REF_Copy( A_flat, C_flat );
      else if ( type == FLA_ALG_FRONT )
        FLASH_Copy( A, C );
      else
        printf("trouble\n");
    }

    elapsed = FLA_Clock() - elapsed;

    best_time = min( elapsed, best_time );

    rep++;
  }

  if ( type != FLA_ALG_REFERENCE )
  {
    *diff = FLASH_Max_elemwise_diff( C, C_ref );
  }
  else
  {
    /* The reference result becomes the baseline for later comparisons */
    FLASH_Obj_hierarchify( C_flat, C_ref );
    *diff = 0.0;
  }

  *gflops = 2.0 * m * n / best_time / 1.0e9;
  *dtime  = best_time;

  /* Undo the effect of the last timed run on C */
  FLASH_Copy( C_backup, C );

  FLASH_Obj_free( &C_backup );
  FLASH_Obj_free( &A_flat );
  FLASH_Obj_free( &C_flat );
}
/*
 * Time one in-place transpose implementation and report the best of nrepeats
 * runs.
 *
 *   variant   0: FLA_Transpose (front end); 1 or 2: the unblocked or blocked
 *             variant selected by type.  Other values time nothing.
 *   type      FLA_ALG_UNBLOCKED or FLA_ALG_BLOCKED (variants 1 and 2 only);
 *             any other value prints "trouble".
 *   nrepeats  number of timed repetitions.
 *   n         matrix dimension, used only for the operation count.
 *   nb_alg    block size used for all four entries of the blocksize object.
 *   A         matrix to transpose; restored to its input contents on return.
 *   A_ref     variant 0 stores its result here; other variants are compared
 *             against it.
 *   dtime     out: fastest elapsed time (1.0e9 when no repetition ran).
 *   diff      out: 0.0 for variant 0, else FLA_Max_elemwise_diff( A, A_ref ).
 *   gflops    out: 4 * n * n / dtime / 1e9.
 */
void time_Transpose( int variant, int type, int nrepeats, int n, int nb_alg,
                     FLA_Obj A, FLA_Obj A_ref,
                     double *dtime, double *diff, double *gflops )
{
  int irep;
  double dtime_old = 1.0e9;
  FLA_Obj A_old;

  fla_blocksize_t* bp;
  fla_transpose_t* cntl_trans_var_unb;
  fla_transpose_t* cntl_trans_var_blk;
  fla_swap_t*      cntl_swap_var_blk;
  fla_swap_t*      cntl_swap_blas;

  /* Build the control tree consumed by the blocked variants */
  bp                 = FLA_Blocksize_create( nb_alg, nb_alg, nb_alg, nb_alg );
  cntl_swap_blas     = FLA_Cntl_swap_obj_create( FLA_FLAT, FLA_SUBPROBLEM, NULL, NULL );
  cntl_swap_var_blk  = FLA_Cntl_swap_obj_create( FLA_FLAT, FLA_UNBLOCKED_VARIANT1, bp, cntl_swap_blas );
  cntl_trans_var_unb = FLA_Cntl_transpose_obj_create( FLA_FLAT, FLA_UNBLOCKED_VARIANT1, NULL, NULL, NULL );
  cntl_trans_var_blk = FLA_Cntl_transpose_obj_create( FLA_FLAT, variant, bp, cntl_trans_var_unb, cntl_swap_var_blk );

  /* Keep the input so every repetition transposes the same matrix */
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_old );
  FLA_Copy_external( A, A_old );

  for ( irep = 0 ; irep < nrepeats; irep++ ){

    FLA_Copy_external( A_old, A );

    *dtime = FLA_Clock();

    switch( variant ){

    case 0:
      FLA_Transpose( A );
      break;

    case 1:{
      /* Time variant 1 */
      switch( type ){
      case FLA_ALG_UNBLOCKED:
        FLA_Transpose_unb_var1( A );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Transpose_blk_var1( A, cntl_trans_var_blk );
        break;
      default:
        printf("trouble\n");
      }
      break;
    }

    case 2:{
      /* Time variant 2 */
      switch( type ){
      case FLA_ALG_UNBLOCKED:
        FLA_Transpose_unb_var2( A );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Transpose_blk_var2( A, cntl_trans_var_blk );
        break;
      default:
        printf("trouble\n");
      }
      break;
    }

    default:
      /* Unknown variant: nothing is executed inside the timed region */
      break;
    }

    *dtime = FLA_Clock() - *dtime;
    dtime_old = min( *dtime, dtime_old );
  }

  FLA_Cntl_obj_free( cntl_trans_var_blk );
  FLA_Cntl_obj_free( cntl_trans_var_unb );
  FLA_Cntl_obj_free( cntl_swap_var_blk );
  FLA_Cntl_obj_free( cntl_swap_blas );
  FLA_Blocksize_free( bp );

  if ( variant == 0 ){
    /* The front-end result is the baseline the other variants are checked
       against */
    FLA_Copy_external( A, A_ref );
    *diff = 0.0;
  }
  else{
    *diff = FLA_Max_elemwise_diff( A, A_ref );
  }

  /* Use a double constant so the product is formed in floating point: the int
     expression 4 * n * n overflows (undefined behavior) once n exceeds
     23170. */
  *gflops = 4.0 * n * n /
            dtime_old / 1e9;

  *dtime = dtime_old;

  /* Hand A back to the caller unchanged */
  FLA_Copy_external( A_old, A );

  FLA_Obj_free( &A_old );
}