/*
 * FLASH_Random_matrix
 *
 * Fills the hierarchical object H with random values. The work is done on a
 * temporary flat copy of H, which is randomized with FLA_Random_matrix() and
 * then copied back into the hierarchy before being released.
 *
 * H  hierarchical object to randomize; left untouched when it has a zero
 *    dimension.
 *
 * Always returns FLA_SUCCESS.
 */
FLA_Error FLASH_Random_matrix( FLA_Obj H )
{
  FLA_Obj flat;

  // Nothing to randomize when one of the dimensions is zero.
  if ( !FLA_Obj_has_zero_dim( H ) )
  {
    // Flatten, randomize the flat object, then push the values back into H.
    FLASH_Obj_create_flat_copy_of_hier( H, &flat );
    FLA_Random_matrix( flat );
    FLASH_Obj_hierarchify( flat, H );

    // The flat object was only scratch storage.
    FLA_Obj_free( &flat );
  }

  return FLA_SUCCESS;
}
int main( int argc, char** argv ) { FLA_Datatype testtype = TESTTYPE; FLA_Datatype realtype = REALTYPE; dim_t m; FLA_Obj a, b; FLA_Error init_result; if ( argc == 2 ) { m = atoi(argv[1]); } else { fprintf(stderr, " \n"); fprintf(stderr, "Usage: %s m\n", argv[0]); fprintf(stderr, " m : test vector length\n"); fprintf(stderr, " \n"); return -1; } if ( m == 0 ) return 0; FLA_Init_safe( &init_result ); FLA_Obj_create( testtype, m, 1, 0, 0, &a ); FLA_Random_matrix( a ); FLA_Obj_fshow( stdout, "- a -", a, "% 6.4e", "--" ); FLA_Obj_create( realtype, 1, m, 0, 0, &b ); FLA_Obj_extract_real_part( a, b ); FLA_Obj_fshow( stdout, "- a real -", b, "% 6.4e", "--" ); FLA_Obj_extract_imag_part( a, b ); FLA_Obj_fshow( stdout, "- a imag -", b, "% 6.4e", "--" ); FLA_Obj_free( &b ); FLA_Obj_free( &a ); FLA_Finalize_safe( init_result ); }
int main(int argc, char *argv[]) { int datatype, m_input, n_input, m, n, p_first, p_last, p_inc, p, n_repeats, param_combo, i, n_param_combos = N_PARAM_COMBOS; char *colors = "brkgmcbrkgmcbrkgmc"; char *ticks = "o+*xso+*xso+*xso+*xs"; char m_dim_desc[14]; char n_dim_desc[14]; char m_dim_tag[10]; char n_dim_tag[10]; double max_gflops=6.0; double dtime, gflops, diff; FLA_Obj A, B, C, C_ref; FLA_Init( ); fprintf( stdout, "%c number of repeats:", '%' ); scanf( "%d", &n_repeats ); fprintf( stdout, "%c %d\n", '%', n_repeats ); fprintf( stdout, "%c enter problem size first, last, inc:", '%' ); scanf( "%d%d%d", &p_first, &p_last, &p_inc ); fprintf( stdout, "%c %d %d %d\n", '%', p_first, p_last, p_inc ); fprintf( stdout, "%c enter m n (-1 means bind to problem size): ", '%' ); scanf( "%d%d", &m_input, &n_input ); fprintf( stdout, "%c %d %d\n", '%', m_input, n_input ); fprintf( stdout, "\nclear all;\n\n" ); if ( m_input > 0 ) { sprintf( m_dim_desc, "m = %d", m_input ); sprintf( m_dim_tag, "m%dc", m_input); } else if( m_input < -1 ) { sprintf( m_dim_desc, "m = p/%d", -m_input ); sprintf( m_dim_tag, "m%dp", -m_input ); } else if( m_input == -1 ) { sprintf( m_dim_desc, "m = p" ); sprintf( m_dim_tag, "m%dp", 1 ); } if ( n_input > 0 ) { sprintf( n_dim_desc, "n = %d", n_input ); sprintf( n_dim_tag, "n%dc", n_input); } else if( n_input < -1 ) { sprintf( n_dim_desc, "n = p/%d", -n_input ); sprintf( n_dim_tag, "n%dp", -n_input ); } else if( n_input == -1 ) { sprintf( n_dim_desc, "n = p" ); sprintf( n_dim_tag, "n%dp", 1 ); } //datatype = FLA_FLOAT; //datatype = FLA_DOUBLE; //datatype = FLA_COMPLEX; datatype = FLA_DOUBLE_COMPLEX; for ( p = p_first, i = 1; p <= p_last; p += p_inc, i += 1 ) { m = m_input; n = n_input; if( m < 0 ) m = p / abs(m_input); if( n < 0 ) n = p / abs(n_input); for ( param_combo = 0; param_combo < n_param_combos; param_combo++ ){ // If multiplying A on the left, A is m x m; ...on the right, A is n x n. 
if ( pc_str[param_combo][0] == 'l' ) FLA_Obj_create( datatype, m, m, 0, 0, &A ); else FLA_Obj_create( datatype, n, n, 0, 0, &A ); FLA_Obj_create( datatype, m, n, 0, 0, &B ); FLA_Obj_create( datatype, m, n, 0, 0, &C ); FLA_Obj_create( datatype, m, n, 0, 0, &C_ref ); FLA_Random_matrix( A ); FLA_Random_matrix( B ); FLA_Random_matrix( C ); FLA_Copy_external( C, C_ref ); fprintf( stdout, "data_symm_%s( %d, 1:5 ) = [ %d ", pc_str[param_combo], i, p ); fflush( stdout ); time_Symm( param_combo, FLA_ALG_REFERENCE, n_repeats, m, n, A, B, C, C_ref, &dtime, &diff, &gflops ); fprintf( stdout, "%6.3lf %6.2le ", gflops, diff ); fflush( stdout ); time_Symm( param_combo, FLA_ALG_FRONT, n_repeats, m, n, A, B, C, C_ref, &dtime, &diff, &gflops ); fprintf( stdout, "%6.3lf %6.2le ", gflops, diff ); fflush( stdout ); fprintf( stdout, " ]; \n" ); fflush( stdout ); FLA_Obj_free( &A ); FLA_Obj_free( &B ); FLA_Obj_free( &C ); FLA_Obj_free( &C_ref ); } fprintf( stdout, "\n" ); } /* fprintf( stdout, "figure;\n" ); fprintf( stdout, "hold on;\n" ); for ( i = 0; i < n_param_combos; i++ ) { fprintf( stdout, "plot( data_symm_%s( :,1 ), data_symm_%s( :, 2 ), '%c:%c' ); \n", pc_str[i], pc_str[i], colors[ i ], ticks[ i ] ); fprintf( stdout, "plot( data_symm_%s( :,1 ), data_symm_%s( :, 4 ), '%c-.%c' ); \n", pc_str[i], pc_str[i], colors[ i ], ticks[ i ] ); } fprintf( stdout, "legend( ... \n" ); for ( i = 0; i < n_param_combos; i++ ) fprintf( stdout, "'ref\\_symm\\_%s', 'fla\\_symm\\_%s', ... \n", pc_str[i], pc_str[i] ); fprintf( stdout, "'Location', 'SouthEast' ); \n" ); fprintf( stdout, "xlabel( 'problem size p' );\n" ); fprintf( stdout, "ylabel( 'GFLOPS/sec.' );\n" ); fprintf( stdout, "axis( [ 0 %d 0 %.2f ] ); \n", p_last, max_gflops ); fprintf( stdout, "title( 'FLAME symm front-end performance (%s, %s)' );\n", m_dim_desc, n_dim_desc ); fprintf( stdout, "print -depsc symm_front_%s_%s.eps\n", m_dim_tag, n_dim_tag ); fprintf( stdout, "hold off;\n"); fflush( stdout ); */ FLA_Finalize( ); return 0; }
int main(int argc, char *argv[]) { int datatype, m_input, m, p_first, p_last, p_inc, p, nb_alg, variant, n_repeats, i, j, n_variants = N_VARIANTS; char *colors = "brkgmcbrkg"; char *ticks = "o+*xso+*xs"; char m_dim_desc[14]; char m_dim_tag[10]; double max_gflops=6.0; double dtime, gflops, diff; FLA_Obj A, b, b_orig, norm; FLA_Init(); fprintf( stdout, "%c number of repeats:", '%' ); scanf( "%d", &n_repeats ); fprintf( stdout, "%c %d\n", '%', n_repeats ); fprintf( stdout, "%c Enter blocking size:", '%' ); scanf( "%d", &nb_alg ); fprintf( stdout, "%c %d\n", '%', nb_alg ); fprintf( stdout, "%c enter problem size first, last, inc:", '%' ); scanf( "%d%d%d", &p_first, &p_last, &p_inc ); fprintf( stdout, "%c %d %d %d\n", '%', p_first, p_last, p_inc ); fprintf( stdout, "%c enter m (-1 means bind to problem size): ", '%' ); scanf( "%d", &m_input ); fprintf( stdout, "%c %d\n", '%', m_input ); fprintf( stdout, "\nclear all;\n\n" ); if ( m_input > 0 ) { sprintf( m_dim_desc, "m = %d", m_input ); sprintf( m_dim_tag, "m%dc", m_input); } else if( m_input < -1 ) { sprintf( m_dim_desc, "m = p/%d", -m_input ); sprintf( m_dim_tag, "m%dp", -m_input ); } else if( m_input == -1 ) { sprintf( m_dim_desc, "m = p" ); sprintf( m_dim_tag, "m%dp", 1 ); } //datatype = FLA_FLOAT; //datatype = FLA_DOUBLE; //datatype = FLA_COMPLEX; datatype = FLA_DOUBLE_COMPLEX; for ( p = p_first, i = 1; p <= p_last; p += p_inc, i += 1 ) { m = m_input; if( m < 0 ) m = p / f2c_abs(m_input); FLA_Obj_create( datatype, m, m, 0, 0, &A ); FLA_Obj_create( datatype, m, 1, 0, 0, &b ); FLA_Obj_create( datatype, m, 1, 0, 0, &b_orig ); /* FLA_Obj_create( datatype, m, m, m, 1, &A ); FLA_Obj_create( datatype, m, 1, 1, 1, &b ); FLA_Obj_create( datatype, m, 1, 1, 1, &b_orig ); */ if ( FLA_Obj_is_single_precision( A ) ) FLA_Obj_create( FLA_FLOAT, 1, 1, 0, 0, &norm ); else FLA_Obj_create( FLA_DOUBLE, 1, 1, 0, 0, &norm ); FLA_Random_tri_matrix( FLA_UPPER_TRIANGULAR, FLA_NONUNIT_DIAG, A ); FLA_Random_matrix( b ); FLA_Copy_external( b, 
b_orig ); /* time_Trinv_un( 0, FLA_ALG_REFERENCE, n_repeats, m, nb_alg, A, b, b_orig, norm, &dtime, &diff, &gflops ); fprintf( stdout, "data_REF( %d, 1:2 ) = [ %d %6.3lf ]; \n", i, p, gflops ); fflush( stdout ); */ for ( variant = 1; variant <= n_variants; variant++ ){ fprintf( stdout, "data_var%d( %d, 1:7 ) = [ %d ", variant, i, p ); fflush( stdout ); time_Trinv_un( variant, FLA_ALG_UNBLOCKED, n_repeats, m, nb_alg, A, b, b_orig, norm, &dtime, &diff, &gflops ); fprintf( stdout, "%6.3lf %6.2le ", gflops, diff ); fflush( stdout ); time_Trinv_un( variant, FLA_ALG_UNB_OPT, n_repeats, m, nb_alg, A, b, b_orig, norm, &dtime, &diff, &gflops ); fprintf( stdout, "%6.3lf %6.2le ", gflops, diff ); fflush( stdout ); time_Trinv_un( variant, FLA_ALG_BLOCKED, n_repeats, m, nb_alg, A, b, b_orig, norm, &dtime, &diff, &gflops ); fprintf( stdout, "%6.3lf %6.2le ", gflops, diff ); fflush( stdout ); fprintf( stdout, " ]; \n" ); fflush( stdout ); } FLA_Obj_free( &A ); FLA_Obj_free( &b ); FLA_Obj_free( &b_orig ); FLA_Obj_free( &norm ); fprintf( stdout, "\n" ); } /* fprintf( stdout, "figure;\n" ); fprintf( stdout, "hold on;\n" ); fprintf( stdout, "plot( data_REF( :,1 ), data_REF( :, 2 ), '-' ); \n" ); for ( i = 1; i <= n_variants; i++ ){ fprintf( stdout, "plot( data_var%d( :,1 ), data_var%d( :, 2 ), '%c:%c' ); \n", variant, variant, colors[ i ], ticks[ i ] ); } fprintf( stdout, "legend( ... \n" ); fprintf( stdout, "'Reference', ... \n" ); for ( i = 1; i <= n_variants; i++ ) fprintf( stdout, "'FLAME var%d', ... \n", i ); fprintf( stdout, "'Location', 'SouthWest' ); \n" ); fprintf( stdout, "xlabel( 'problem size p' );\n" ); fprintf( stdout, "ylabel( 'GFLOPS/sec.' );\n" ); fprintf( stdout, "axis( [ 0 %d 0 %.2f ] ); \n", p_last, max_gflops ); fprintf( stdout, "title( 'FLAME trinv\\_u performance (%s)' );\n", m_dim_desc ); fprintf( stdout, "print -depsc trinv_l_%s.eps\n", m_dim_tag ); fprintf( stdout, "hold off;\n"); fflush( stdout ); */ FLA_Finalize( ); }
int main(int argc, char *argv[]) { int datatype, m_input, n_input, m, n, min_m_n, p_first, p_last, p_inc, pp, pivot_combo, n_repeats, i, n_pivot_combos = N_PIVOT_COMBOS; char *colors = "brkgmcbrkg"; char *ticks = "o+*xso+*xs"; char m_dim_desc[14]; char n_dim_desc[14]; char m_dim_tag[10]; char n_dim_tag[10]; double max_gflops=6.0; double dtime, gflops, diff; FLA_Obj C, b, b_orig, b_norm; FLA_Init(); fprintf( stdout, "%c number of repeats:", '%' ); scanf( "%d", &n_repeats ); fprintf( stdout, "%c %d\n", '%', n_repeats ); fprintf( stdout, "%c enter problem size first, last, inc:", '%' ); scanf( "%d%d%d", &p_first, &p_last, &p_inc ); fprintf( stdout, "%c %d %d %d\n", '%', p_first, p_last, p_inc ); fprintf( stdout, "%c enter m n (-1 means bind to problem size): ", '%' ); scanf( "%d %d", &m_input, &n_input ); fprintf( stdout, "%c %d %d\n", '%', m_input, n_input ); fprintf( stdout, "\nclear all;\n\n" ); if ( m_input > 0 ) { sprintf( m_dim_desc, "m = %d", m_input ); sprintf( m_dim_tag, "m%dc", m_input); } else if( m_input < -1 ) { sprintf( m_dim_desc, "m = p/%d", -m_input ); sprintf( m_dim_tag, "m%dp", -m_input ); } else if( m_input == -1 ) { sprintf( m_dim_desc, "m = p" ); sprintf( m_dim_tag, "m%dp", 1 ); } if ( n_input > 0 ) { sprintf( n_dim_desc, "n = %d", n_input ); sprintf( n_dim_tag, "n%dc", n_input); } else if( n_input < -1 ) { sprintf( n_dim_desc, "n = p/%d", -n_input ); sprintf( n_dim_tag, "n%dp", -n_input ); } else if( n_input == -1 ) { sprintf( n_dim_desc, "n = p" ); sprintf( n_dim_tag, "n%dp", 1 ); } //datatype = FLA_FLOAT; //datatype = FLA_DOUBLE; //datatype = FLA_COMPLEX; datatype = FLA_DOUBLE_COMPLEX; for ( pp = p_first, i = 1; pp <= p_last; pp += p_inc, i += 1 ) { m = m_input; n = n_input; if( m < 0 ) m = pp / abs(m_input); if( n < 0 ) n = pp / abs(n_input); min_m_n = min( m, n ); for ( pivot_combo = 0; pivot_combo < n_pivot_combos; pivot_combo++ ){ FLA_Obj_create( datatype, m, n, 0, 0, &C ); FLA_Obj_create( datatype, m, 1, 0, 0, &b ); FLA_Obj_create( 
datatype, m, 1, 0, 0, &b_orig ); if ( FLA_Obj_is_single_precision( C ) ) FLA_Obj_create( FLA_FLOAT, 1, 1, 0, 0, &b_norm ); else FLA_Obj_create( FLA_DOUBLE, 1, 1, 0, 0, &b_norm ); FLA_Random_matrix( C ); FLA_Random_matrix( b ); FLA_Copy_external( b, b_orig ); fprintf( stdout, "data_lu_%s( %d, 1:5 ) = [ %d ", pc_str[pivot_combo], i, pp ); fflush( stdout ); //time_LU( pivot_combo, FLA_ALG_REFERENCE, n_repeats, m, n, // C, b, b_orig, b_norm, &dtime, &diff, &gflops ); //fprintf( stdout, "%6.3lf %6.2le ", gflops, diff ); //fflush( stdout ); time_LU( pivot_combo, FLA_ALG_FRONT, n_repeats, m, n, C, b, b_orig, b_norm, &dtime, &diff, &gflops ); fprintf( stdout, "%6.3lf %6.2le ", gflops, diff ); fflush( stdout ); fprintf( stdout, " ]; \n" ); fflush( stdout ); FLA_Obj_free( &C ); FLA_Obj_free( &b ); FLA_Obj_free( &b_orig ); FLA_Obj_free( &b_norm ); } fprintf( stdout, "\n" ); } /* fprintf( stdout, "figure;\n" ); fprintf( stdout, "hold on;\n" ); for ( i = 0; i < n_pivot_combos; i++ ) { fprintf( stdout, "plot( data_lu_%s( :,1 ), data_lu_%s( :, 2 ), '%c:%c' ); \n", pc_str[i], pc_str[i], colors[ i ], ticks[ i ] ); fprintf( stdout, "plot( data_lu_%s( :,1 ), data_lu_%s( :, 4 ), '%c-.%c' ); \n", pc_str[i], pc_str[i], colors[ i ], ticks[ i ] ); } fprintf( stdout, "legend( ... \n" ); for ( i = 0; i < n_pivot_combos; i++ ) fprintf( stdout, "'ref\\_lu\\_%s', 'fla\\_lu\\_%s', ... \n", pc_str[i], pc_str[i] ); fprintf( stdout, "'Location', 'SouthEast' ); \n" ); fprintf( stdout, "xlabel( 'problem size p' );\n" ); fprintf( stdout, "ylabel( 'GFLOPS/sec.' );\n" ); fprintf( stdout, "axis( [ 0 %d 0 %.2f ] ); \n", p_last, max_gflops ); fprintf( stdout, "title( 'FLAME LU front-end performance (%s, %s)' );\n", m_dim_desc, n_dim_desc ); fprintf( stdout, "print -depsc lu_front_%s_%s.eps\n", m_dim_tag, n_dim_tag ); fprintf( stdout, "hold off;\n"); fflush( stdout ); */ FLA_Finalize( ); return 0; }
int main(int argc, char *argv[]) { int datatype, precision, m_input, k_input, n_input, m, k, n, p_first, p_last, p_inc, p, n_repeats, param_combo, i, n_param_combos = N_PARAM_COMBOS; char *colors = "brkgmcbrkgmcbrkgmc"; char *ticks = "o+*xso+*xso+*xso+*xs"; char m_dim_desc[14]; char k_dim_desc[14]; char n_dim_desc[14]; char m_dim_tag[10]; char k_dim_tag[10]; char n_dim_tag[10]; double max_gflops=6.0; double dtime, gflops, diff; FLA_Obj A, Ad, Az, B, Bd, Bz, C, Cd, Cz, C_ref, indexd, indexz; FLA_Obj alpha0d, alpha0z, alpha1d, alpha1z, normd, normz; FLA_Obj alphad, alphaz, betad, betaz, rhod, rhoz; FLA_Obj xd, xz, yd, yz; FLA_Init( ); fprintf( stdout, "%c number of repeats:", '%' ); scanf( "%d", &n_repeats ); fprintf( stdout, "%c %d\n", '%', n_repeats ); fprintf( stdout, "%c enter problem size first, last, inc:", '%' ); scanf( "%d%d%d", &p_first, &p_last, &p_inc ); fprintf( stdout, "%c %d %d %d\n", '%', p_first, p_last, p_inc ); fprintf( stdout, "%c enter m k n (-1 means bind to problem size): ", '%' ); scanf( "%d%d%d", &m_input, &k_input, &n_input ); fprintf( stdout, "%c %d %d %d\n", '%', m_input, k_input, n_input ); fprintf( stdout, "\nclear all;\n\n" ); if ( m_input > 0 ) { sprintf( m_dim_desc, "m = %d", m_input ); sprintf( m_dim_tag, "m%dc", m_input); } else if( m_input < -1 ) { sprintf( m_dim_desc, "m = p/%d", -m_input ); sprintf( m_dim_tag, "m%dp", -m_input ); } else if( m_input == -1 ) { sprintf( m_dim_desc, "m = p" ); sprintf( m_dim_tag, "m%dp", 1 ); } if ( k_input > 0 ) { sprintf( k_dim_desc, "k = %d", k_input ); sprintf( k_dim_tag, "k%dc", k_input); } else if( k_input < -1 ) { sprintf( k_dim_desc, "k = p/%d", -k_input ); sprintf( k_dim_tag, "k%dp", -k_input ); } else if( k_input == -1 ) { sprintf( k_dim_desc, "k = p" ); sprintf( k_dim_tag, "k%dp", 1 ); } if ( n_input > 0 ) { sprintf( n_dim_desc, "n = %d", n_input ); sprintf( n_dim_tag, "n%dc", n_input); } else if( n_input < -1 ) { sprintf( n_dim_desc, "n = p/%d", -n_input ); sprintf( n_dim_tag, "n%dp", 
-n_input ); } else if( n_input == -1 ) { sprintf( n_dim_desc, "n = p" ); sprintf( n_dim_tag, "n%dp", 1 ); } //precision = FLA_SINGLE_PRECISION; precision = FLA_DOUBLE_PRECISION; for ( p = p_first, i = 1; p <= p_last; p += p_inc, i += 1 ) { m = m_input; k = k_input; n = n_input; if( m < 0 ) m = p / f2c_abs(m_input); if( k < 0 ) k = p / f2c_abs(k_input); if( n < 0 ) n = p / f2c_abs(n_input); for ( param_combo = 0; param_combo < n_param_combos; param_combo++ ){ // Determine datatype based on trans argument. if ( pc_str[param_combo][0] == 'c' || pc_str[param_combo][1] == 'c' ) { if ( precision == FLA_SINGLE_PRECISION ) datatype = FLA_COMPLEX; else datatype = FLA_DOUBLE_COMPLEX; } else { if ( precision == FLA_SINGLE_PRECISION ) datatype = FLA_FLOAT; else datatype = FLA_DOUBLE; } // If transposing A, switch dimensions. if ( pc_str[param_combo][0] == 'n' ) FLA_Obj_create( datatype, m, k, 0, 0, &A ); else FLA_Obj_create( datatype, k, m, 0, 0, &A ); // If transposing B, switch dimensions. if ( pc_str[param_combo][1] == 'n' ) FLA_Obj_create( datatype, k, n, 0, 0, &B ); else FLA_Obj_create( datatype, n, k, 0, 0, &B ); FLA_Obj_create( datatype, m, n, 0, 0, &C ); FLA_Obj_create( datatype, m, n, 0, 0, &C_ref ); FLA_Random_matrix( A ); FLA_Random_matrix( B ); FLA_Random_matrix( C ); FLA_Copy_external( C, C_ref ); fprintf( stdout, "data_gemm_%s( %d, 1:5 ) = [ %4d %4d %4d ", pc_str[param_combo], i, m, k, n ); fflush( stdout ); time_Gemm( param_combo, FLA_ALG_REFERENCE, n_repeats, m, k, n, A, B, C, C_ref, &dtime, &diff, &gflops ); fprintf( stdout, "%6.3lf %6.2le ", gflops, diff ); fflush( stdout ); /* time_Gemm( param_combo, FLA_ALG_FRONT, n_repeats, m, k, n, A, B, C, C_ref, &dtime, &diff, &gflops ); fprintf( stdout, "%6.3lf %6.2le ", gflops, diff ); fflush( stdout ); */ fprintf( stdout, " ]; \n" ); fflush( stdout ); FLA_Obj_free( &A ); FLA_Obj_free( &B ); FLA_Obj_free( &C ); FLA_Obj_free( &C_ref ); } fprintf( stdout, "\n" ); } /* fprintf( stdout, "figure;\n" ); fprintf( stdout, 
"hold on;\n" ); for ( i = 0; i < n_param_combos; i++ ) { fprintf( stdout, "plot( data_gemm_%s( :,1 ), data_gemm_%s( :, 2 ), '%c:%c' ); \n", pc_str[i], pc_str[i], colors[ i ], ticks[ i ] ); fprintf( stdout, "plot( data_gemm_%s( :,1 ), data_gemm_%s( :, 4 ), '%c-.%c' ); \n", pc_str[i], pc_str[i], colors[ i ], ticks[ i ] ); } fprintf( stdout, "legend( ... \n" ); for ( i = 0; i < n_param_combos; i++ ) fprintf( stdout, "'ref\\_gemm\\_%s', 'fla\\_gemm\\_%s', ... \n", pc_str[i], pc_str[i] ); fprintf( stdout, "'Location', 'SouthEast' ); \n" ); fprintf( stdout, "xlabel( 'problem size p' );\n" ); fprintf( stdout, "ylabel( 'GFLOPS/sec.' );\n" ); fprintf( stdout, "axis( [ 0 %d 0 %.2f ] ); \n", p_last, max_gflops ); fprintf( stdout, "title( 'FLAME gemm front-end performance (%s, %s, %s)' );\n", m_dim_desc, k_dim_desc, n_dim_desc ); fprintf( stdout, "print -depsc gemm_front_%s_%s_%s.eps\n", m_dim_tag, k_dim_tag, n_dim_tag ); fprintf( stdout, "hold off;\n"); fflush( stdout ); */ FLA_Finalize( ); return 0; }
int main(int argc, char *argv[]) { int datatype, n_input, mB_input, mC_input, mD_input, mB, mC, mD, n, p_first, p_last, p_inc, p, b_alg, variant, n_repeats, i, n_variants = 1; double max_gflops=6.0; double dtime, gflops, diff; FLA_Obj B, C, D, T, R, E; FLA_Init(); fprintf( stdout, "%c number of repeats:", '%' ); scanf( "%d", &n_repeats ); fprintf( stdout, "%c %d\n", '%', n_repeats ); fprintf( stdout, "%c enter algorithmic blocksize:", '%' ); scanf( "%d", &b_alg ); fprintf( stdout, "%c %d\n", '%', b_alg ); fprintf( stdout, "%c enter problem size first, last, inc:", '%' ); scanf( "%d%d%d", &p_first, &p_last, &p_inc ); fprintf( stdout, "%c %d %d %d\n", '%', p_first, p_last, p_inc ); fprintf( stdout, "%c enter n (-1 means bind to problem size): ", '%' ); scanf( "%d", &n_input ); fprintf( stdout, "%c %d\n", '%', n_input ); fprintf( stdout, "%c enter mB mC mD (-1 means bind to problem size): ", '%' ); scanf( "%d %d %d", &mB_input, &mC_input, &mD_input ); fprintf( stdout, "%c %d %d %d\n", '%', mB_input, mC_input, mD_input ); fprintf( stdout, "\nclear all;\n\n" ); //datatype = FLA_FLOAT; //datatype = FLA_DOUBLE; //datatype = FLA_COMPLEX; datatype = FLA_DOUBLE_COMPLEX; for ( p = p_first, i = 1; p <= p_last; p += p_inc, i += 1 ) { mB = mB_input; mC = mC_input; mD = mD_input; n = n_input; if( mB < 0 ) mB = p / abs(mB_input); if( mC < 0 ) mC = p / abs(mC_input); if( mD < 0 ) mD = p / abs(mD_input); if( n < 0 ) n = p / abs(n_input); for ( variant = 0; variant < n_variants; variant++ ){ FLA_Obj_create( datatype, mB, n, 0, 0, &B ); FLA_Obj_create( datatype, mC, n, 0, 0, &C ); FLA_Obj_create( datatype, mD, n, 0, 0, &D ); FLA_Obj_create( datatype, b_alg, n, 0, 0, &T ); FLA_Obj_create( datatype, n, n, 0, 0, &R ); FLA_Obj_create( datatype, n, n, 0, 0, &E ); FLA_Random_matrix( B ); FLA_Random_matrix( C ); FLA_Random_matrix( D ); FLA_Set( FLA_ZERO, R ); FLA_Herk_external( FLA_UPPER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_ONE, B, FLA_ONE, R ); FLA_Herk_external( FLA_UPPER_TRIANGULAR, 
FLA_CONJ_TRANSPOSE, FLA_ONE, D, FLA_ONE, R ); FLA_Chol( FLA_UPPER_TRIANGULAR, R ); FLA_Set( FLA_ZERO, E ); FLA_Herk_external( FLA_UPPER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_ONE, B, FLA_ONE, E ); FLA_Herk_external( FLA_UPPER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_ONE, C, FLA_ONE, E ); FLA_Chol( FLA_UPPER_TRIANGULAR, E ); fprintf( stdout, "data_uddate_ut( %d, 1:5 ) = [ %d ", i, p ); fflush( stdout ); time_UDdate_UT( variant, FLA_ALG_FRONT, n_repeats, mB, mC, mD, n, B, C, D, T, R, E, &dtime, &diff, &gflops ); fprintf( stdout, "%6.3lf %6.2le ", gflops, diff ); fflush( stdout ); fprintf( stdout, " ]; \n" ); fflush( stdout ); FLA_Obj_free( &B ); FLA_Obj_free( &C ); FLA_Obj_free( &D ); FLA_Obj_free( &T ); FLA_Obj_free( &R ); FLA_Obj_free( &E ); } fprintf( stdout, "\n" ); } /* fprintf( stdout, "figure;\n" ); fprintf( stdout, "hold on;\n" ); for ( i = 0; i < n_variants; i++ ) { fprintf( stdout, "plot( data_qr_ut( :,1 ), data_qr_ut( :, 2 ), '%c:%c' ); \n", colors[ i ], ticks[ i ] ); fprintf( stdout, "plot( data_qr_ut( :,1 ), data_qr_ut( :, 4 ), '%c-.%c' ); \n", colors[ i ], ticks[ i ] ); } fprintf( stdout, "legend( ... \n" ); for ( i = 0; i < n_variants; i++ ) fprintf( stdout, "'ref\\_qr\\_ut', 'fla\\_qr\\_ut', ... \n" ); fprintf( stdout, "'Location', 'SouthEast' ); \n" ); fprintf( stdout, "xlabel( 'problem size p' );\n" ); fprintf( stdout, "ylabel( 'GFLOPS/sec.' );\n" ); fprintf( stdout, "axis( [ 0 %d 0 %.2f ] ); \n", p_last, max_gflops ); fprintf( stdout, "title( 'FLAME UDdate_UT front-end performance (%s, %s)' );\n", m_dim_desc, n_dim_desc ); fprintf( stdout, "print -depsc qr_ut_front_%s_%s.eps\n", m_dim_tag, n_dim_tag ); fprintf( stdout, "hold off;\n"); fflush( stdout ); */ FLA_Finalize( ); return 0; }
int main(int argc, char *argv[]) { int m_input, k_input, n_input, m, n, k, p_first, p_last, p_inc, p, nb_alg, n_repeats, variant, n_threads, n_thread_experiments, i, j; int n_threads_exp[64]; char *colors = "brkgmckkk"; char *ticks = "o+*xso+*x"; char m_dim_desc[14]; char k_dim_desc[14]; char n_dim_desc[14]; char m_dim_tag[5]; char k_dim_tag[5]; char n_dim_tag[5]; char nth_str[32]; double max_gflops=6.0; double dtime, gflops, diff, d_n; FLA_Obj A, B, C, C_ref; /* Initialize FLAME */ FLA_Init( ); fprintf( stdout, "%c number of repeats:", '%' ); scanf( "%d", &n_repeats ); fprintf( stdout, "%c %d\n", '%', n_repeats ); fprintf( stdout, "%c Enter blocking size:", '%' ); scanf( "%d", &nb_alg ); fprintf( stdout, "%c %d\n", '%', nb_alg ); fprintf( stdout, "%c enter problem size first, last, inc:", '%' ); scanf( "%d%d%d", &p_first, &p_last, &p_inc ); fprintf( stdout, "%c %d %d %d\n", '%', p_first, p_last, p_inc ); fprintf( stdout, "%c enter m k n (-1 means bind to problem size: ", '%' ); scanf( "%d%d%d", &m_input, &k_input, &n_input ); fprintf( stdout, "%c %d %d %d\n", '%', m_input, k_input, n_input ); fprintf( stdout, "%c enter number of thread experiments: ", '%' ); scanf( "%d", &n_thread_experiments ); fprintf( stdout, "%c %d\n", '%', n_thread_experiments ); fprintf( stdout, "%c enter number of threads for each experiment (separated by spaces): ", '%' ); for( i = 0; i < n_thread_experiments; ++i ) scanf( "%d", &n_threads_exp[i] ); fprintf( stdout, "%c", '%' ); for( i = 0; i < n_thread_experiments; ++i ) fprintf( stdout, " %d", n_threads_exp[i] ); /* Delete all existing data structures */ fprintf( stdout, "\nclear all;\n\n" ); if ( m_input > 0 ) { sprintf( m_dim_desc, "m = %d", m_input ); sprintf( m_dim_tag, "m%dc", m_input); } else if( m_input < -1 ) { sprintf( m_dim_desc, "m = p/%d", -m_input ); sprintf( m_dim_tag, "m%dp", -m_input ); } else if( m_input == -1 ) { sprintf( m_dim_desc, "m = p" ); sprintf( m_dim_tag, "m%dp", 1 ); } if ( k_input > 0 ) { sprintf( k_dim_desc, 
"k = %d", k_input ); sprintf( k_dim_tag, "k%dc", k_input); } else if( k_input < -1 ) { sprintf( k_dim_desc, "k = p/%d", -k_input ); sprintf( k_dim_tag, "k%dp", -k_input ); } else if( k_input == -1 ) { sprintf( k_dim_desc, "k = p" ); sprintf( k_dim_tag, "k%dp", 1 ); } if ( n_input > 0 ) { sprintf( n_dim_desc, "n = %d", n_input ); sprintf( n_dim_tag, "n%dc", n_input); } else if( n_input < -1 ) { sprintf( n_dim_desc, "n = p/%d", -n_input ); sprintf( n_dim_tag, "n%dp", -n_input ); } else if( n_input == -1 ) { sprintf( n_dim_desc, "n = p" ); sprintf( n_dim_tag, "n%dp", 1 ); } m = p_last; k = p_last; n = p_last; sprintf( nth_str, "OMP_NUM_THREADS=%d", n_threads_exp[ n_thread_experiments-1 ] ); putenv( nth_str ); blas_cpu_number = n_threads_exp[ n_thread_experiments-1 ]; blas_thread_init(); for ( p = p_first, i = 1; p <= p_last; p += p_inc, i += 1 ) { m = m_input; k = k_input; n = n_input; if( m < 0 ) m = p / abs(m_input); if( k < 0 ) k = p / abs(k_input); if( n < 0 ) n = p / abs(n_input); FLA_Obj_create( FLA_DOUBLE, m, k, &A ); FLA_Obj_create( FLA_DOUBLE, k, n, &B ); FLA_Obj_create( FLA_DOUBLE, m, n, &C ); FLA_Obj_create( FLA_DOUBLE, m, n, &C_ref ); /* Generate random matrices A, C */ if( p > 4000 ){ FLA_Random_matrix( A ); FLA_Random_matrix( B ); FLA_Random_matrix( C ); FLA_Copy_external( C, C_ref ); } blas_cpu_number = 1; //time_Gemm_nn( 0, FLA_ALG_REFERENCE, n_repeats, p, nb_alg, // A, B, C, C_ref, &dtime, &diff, &gflops ); //fprintf( stdout, "data_REF( %d, 1:2 ) = [ %d %6.3lf ]; \n", i, p, gflops ); //fflush( stdout ); for ( j = 0; j < n_thread_experiments; j++ ){ n_threads = n_threads_exp[j]; blas_cpu_number = n_threads; fprintf( stdout, "data_nth%d( %d, 1:3 ) = [ %d ", n_threads, i, p ); fflush( stdout ); time_Gemm_nn( 0, FLA_ALG_REFERENCE, n_repeats, p, nb_alg, A, B, C, C_ref, &dtime, &diff, &gflops ); fprintf( stdout, "%6.3lf %6.2le ", gflops, diff ); fflush( stdout ); fprintf( stdout, " ]; \n" ); fflush( stdout ); } fprintf( stdout, "\n" ); FLA_Obj_free( &A ); 
FLA_Obj_free( &B ); FLA_Obj_free( &C ); FLA_Obj_free( &C_ref ); } /* Print the MATLAB commands to plot the data */ /* Delete all existing figures */ fprintf( stdout, "figure;\n" ); /* Indicate that you want to add to the existing plot */ fprintf( stdout, "hold on;\n" ); /* Plot the data for the other numbers of threads */ for ( i = 0; i < n_thread_experiments; i++ ){ fprintf( stdout, "plot( data_nth%d( :,1 ), data_nth%d( :, 2 ), '%c:%c' ); \n", n_threads_exp[ i ], n_threads_exp[ i ], colors[ i ], ticks[ i ] ); } fprintf( stdout, "legend( ... \n" ); for ( i = 0; i < n_thread_experiments-1; i++ ) fprintf( stdout, "'%d threads', ... \n", n_threads_exp[ i ] ); fprintf( stdout, "'%d threads', 'Location', 'Best' ); \n", n_threads_exp[ n_thread_experiments-1 ] ); fprintf( stdout, "xlabel( 'problem size p' );\n" ); fprintf( stdout, "ylabel( 'GFLOPS/sec.' );\n" ); fprintf( stdout, "axis( [ 0 %d 0 %.2f ] ); \n", p_last, n_threads_exp[n_thread_experiments-1] * max_gflops ); fprintf( stdout, "title( 'Goto BLAS dgemm performance (%s, %s, %s)' );\n", m_dim_desc, k_dim_desc, n_dim_desc ); fprintf( stdout, "print -depsc gemm_nn_goto_p_%s_%s_%s.eps\n", m_dim_tag, k_dim_tag, n_dim_tag ); fprintf( stdout, "hold off;\n"); fflush( stdout ); FLA_Finalize( ); }
/*
 * libfla_test_apqut_experiment
 *
 * Runs one Apply_Q_UT experiment: factors a random matrix A (QR for
 * column-wise storage, LQ otherwise), applies the resulting Q to an identity
 * matrix n_repeats times while timing each application, and reports the best
 * performance together with the 1-norm of (Q * Q' - I).
 *
 *   params     test parameters; b_flash and b_alg_flat are read from it
 *   var        not referenced by this routine
 *   sc_str     storage characters for A (index 0), T (1) and B (2)
 *   datatype   datatype of the test matrices
 *   p_cur      current problem size
 *   pci        index of the parameter combination in pc_str
 *   n_repeats  number of timed repetitions
 *   impl       FLA_TEST_HIER_FRONT_END selects the FLASH front-end; any other
 *              value selects the flat front-end
 *   perf       (out) performance of the fastest repetition
 *   residual   (out) 1-norm of the deviation from identity
 */
void libfla_test_apqut_experiment( test_params_t params,
                                   unsigned int  var,
                                   char*         sc_str,
                                   FLA_Datatype  datatype,
                                   unsigned int  p_cur,
                                   unsigned int  pci,
                                   unsigned int  n_repeats,
                                   signed int    impl,
                                   double*       perf,
                                   double*       residual )
{
	dim_t        b_flash    = params.b_flash;
	dim_t        b_alg_flat = params.b_alg_flat;
	double       best_time  = 1e9;
	double       elapsed;
	unsigned int rep;
	unsigned int m, n;
	unsigned int min_m_n;
	signed int   m_input;
	signed int   n_input;
	int          use_hier;
	int          columnwise;
	FLA_Side     side;
	FLA_Trans    trans;
	FLA_Direct   direct;
	FLA_Store    storev;
	FLA_Obj      A, T, W, B, eye, norm;
	FLA_Obj      B_save;
	FLA_Obj      A_test, T_test, W_test, B_test;

	// Translate parameter characters to libflame constants.
	FLA_Param_map_char_to_flame_side( &pc_str[pci][0], &side );
	FLA_Param_map_char_to_flame_trans( &pc_str[pci][1], &trans );
	FLA_Param_map_char_to_flame_direct( &pc_str[pci][2], &direct );
	FLA_Param_map_char_to_flame_storev( &pc_str[pci][3], &storev );

	use_hier   = ( impl == FLA_TEST_HIER_FRONT_END );
	columnwise = ( storev == FLA_COLUMNWISE );

	// The stated intent is to exercise rectangular matrices (m > n for
	// column-wise storage via QR, m < n for row-wise storage via LQ), but
	// both storage schemes currently use the same scaling of -1 for each
	// dimension.
	m_input = -1;
	n_input = -1;

	// Determine the dimensions: a negative input scales the problem size.
	m = ( m_input < 0 ) ? p_cur * abs(m_input) : p_cur;
	n = ( n_input < 0 ) ? p_cur * abs(n_input) : p_cur;

	// Compute the minimum dimension.
	min_m_n = min( m, n );

	// Create the matrices for the current operation. B is square, sized by
	// the row dimension for column-wise storage and the column dimension
	// otherwise.
	libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[0], m, n, &A );
	libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], b_alg_flat, min_m_n, &T );
	if ( columnwise )
		libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[2], m, m, &B );
	else
		libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[2], n, n, &B );

	FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, B, &eye );

	FLA_Apply_Q_UT_create_workspace( T, B, &W );

	// Create a real scalar object to hold the norm.
	FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );

	// Initialize the test matrices.
	FLA_Random_matrix( A );
	FLA_Set_to_identity( B );
	FLA_Set_to_identity( eye );

	// Save the original object contents in a temporary object.
	FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, B, &B_save );

	if ( use_hier )
	{
		// Build hierarchical counterparts for the FLASH front-end...
		if ( columnwise )
			FLASH_QR_UT_create_hier_matrices( A, 1, &b_flash, &A_test, &T_test );
		else
			FLASH_LQ_UT_create_hier_matrices( A, 1, &b_flash, &A_test, &T_test );
		FLASH_Obj_create_hier_copy_of_flat( B, 1, &b_flash, &B_test );
		FLASH_Apply_Q_UT_create_workspace( T_test, B_test, &W_test );

		// ...and compute the Householder factorization on them.
		if ( columnwise )
			FLASH_QR_UT( A_test, T_test );
		else
			FLASH_LQ_UT( A_test, T_test );
	}
	else
	{
		// The flat front-end works directly on the flat objects.
		A_test = A;
		T_test = T;
		W_test = W;
		B_test = B;

		// Compute a Householder factorization.
		if ( columnwise )
			FLA_QR_UT( A_test, T_test );
		else
			FLA_LQ_UT( A_test, T_test );
	}

	// Repeat the experiment n_repeats times and record the fastest run.
	for ( rep = 0; rep < n_repeats; ++rep )
	{
		// Restore B before every application of Q.
		if ( use_hier )
			FLASH_Obj_hierarchify( B_save, B_test );
		else
			FLA_Copy_external( B_save, B_test );

		elapsed = FLA_Clock();

		libfla_test_apqut_impl( impl, side, trans, direct, storev, A_test, T_test, W_test, B_test );

		elapsed = FLA_Clock() - elapsed;
		best_time = min( best_time, elapsed );
	}

	// The hierarchical result has to be flattened back into B first.
	if ( use_hier )
		FLASH_Obj_flatten( B_test, B );

	// Multiply by its conjugate-transpose to get what should be (near) identity
	// and then subtract from actual identity to get what should be (near) zero.
	FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
	                   FLA_ONE, B, B, FLA_MINUS_ONE, eye );

	// Free the hierarchical matrices if we're testing the FLASH front-end.
	if ( use_hier )
	{
		FLASH_Obj_free( &A_test );
		FLASH_Obj_free( &T_test );
		FLASH_Obj_free( &W_test );
		FLASH_Obj_free( &B_test );
	}

	// Compute the norm of eye, which now holds the deviation from identity.
	FLA_Norm1( eye, norm );
	FLA_Obj_extract_real_scalar( norm, residual );

	// Compute the performance of the best experiment repeat; the result is
	// scaled by four for complex datatypes.
	*perf = (  4.0 *       m * min_m_n * n -
	           2.0 * min_m_n * min_m_n * n ) / best_time / FLOPS_PER_UNIT_PERF;
	if ( FLA_Obj_is_complex( A ) ) *perf *= 4.0;

	// Free the supporting flat objects.
	FLA_Obj_free( &B_save );

	// Free the flat test matrices.
	FLA_Obj_free( &A );
	FLA_Obj_free( &T );
	FLA_Obj_free( &W );
	FLA_Obj_free( &B );
	FLA_Obj_free( &eye );
	FLA_Obj_free( &norm );
}
int main( int argc, char** argv ) { FLA_Datatype datatype = TESTTYPE; FLA_Obj A, Ak, T, Tk, D, Dk, A_copy, A_recovered, L, Q, Qk, W, x, y, z; dim_t m, n, k; dim_t min_m_n; FLA_Error init_result; double residual_A, residual_Axy; int use_form_q = 1; if ( argc == 4 ) { m = atoi(argv[1]); n = atoi(argv[2]); k = atoi(argv[3]); min_m_n = min(m,n); } else { fprintf(stderr, " \n"); fprintf(stderr, "Usage: %s m n k\n", argv[0]); fprintf(stderr, " m : matrix length\n"); fprintf(stderr, " n : matrix width\n"); fprintf(stderr, " k : number of house holder vectors applied for testing\n"); fprintf(stderr, " \n"); return -1; } if ( m == 0 || n == 0 ) return 0; FLA_Init_safe( &init_result ); // FLAME LQ^H setup FLA_Obj_create( datatype, m, n, 0, 0, &A ); FLA_LQ_UT_create_T( A, &T ); // Rand A and create A_copy. FLA_Random_matrix( A ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_copy ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_recovered ); FLA_Copy( A, A_copy ); // LQ test ( A = L Q^H ) FLA_LQ_UT( A, T ); // Create Q (identity), L (A_copy) FLA_Obj_create( datatype, m, n, 0, 0, &Q ); FLA_Set_to_identity( Q ); FLA_Obj_create( datatype, m, m, 0, 0, &D ); FLA_Obj_create( datatype, k, n, 0, 0, &Qk ); FLA_Set_to_identity( Qk ); FLA_Obj_create( datatype, k, k, 0, 0, &Dk ); FLA_Obj_create( datatype, m, m, 0, 0, &L ); // Q^H := I H_{0}^H ... H_{k-1}^H if ( use_form_q ) { FLA_LQ_UT_form_Q( A, T, Q ); } else { FLA_Apply_Q_UT_create_workspace_side( FLA_RIGHT, T, Q, &W ); FLA_Apply_Q_UT( FLA_RIGHT, FLA_CONJ_TRANSPOSE, FLA_FORWARD, FLA_ROWWISE, A, T, W, Q ); FLA_Obj_free( &W ); } // D := Q^T Q FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, Q, Q, FLA_ZERO, D ); // Qk := I H0 ... 
Hk FLA_Part_1x2( T, &Tk, &W, k, FLA_LEFT ); FLA_Part_2x1( A, &Ak, &W, k, FLA_TOP ); if ( use_form_q ) { // Overwrite the result to test FLAME API FLA_Set( FLA_ZERO, Qk ); FLA_Copy( Ak, Qk ); FLA_LQ_UT_form_Q( Ak, Tk, Qk ); } else { FLA_Apply_Q_UT_create_workspace( Tk, Qk, &W ); FLA_Apply_Q_UT( FLA_LEFT, FLA_NO_TRANSPOSE, FLA_FORWARD, FLA_ROWWISE, Ak, Tk, W, Qk ); FLA_Obj_free( &W ); } // Dk := Qk^T Qk FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, Qk, Qk, FLA_ZERO, Dk ); // L := A (Q^H)^H if ( use_form_q ) { // Note that the formed Q is actually Q^H; transb should be carefully assigned. FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, A_copy, Q, FLA_ZERO, L ); } else { FLA_Apply_Q_UT_create_workspace( T, L, &W ); FLA_Apply_Q_UT( FLA_RIGHT, FLA_NO_TRANSPOSE, FLA_FORWARD, FLA_ROWWISE, A, T, W, L ); FLA_Obj_free( &W ); } FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, L, Q, FLA_ZERO, A_recovered ); // Create vectors for testing FLA_Obj_create( datatype, n, 1, 0, 0, &x ); FLA_Set( FLA_ZERO, x ); FLA_Obj_create( datatype, m, 1, 0, 0, &y ); FLA_Set( FLA_ZERO, y ); FLA_Obj_create( datatype, m, 1, 0, 0, &z ); FLA_Set( FLA_ZERO, z ); // x is given FLA_Set( FLA_ONE, x ); // y := Ax FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_ONE, A_copy, x, FLA_ZERO, y ); // z := L (Q^H) x , libflame FLA_Apply_Q_UT_create_workspace( T, x, &W ); FLA_Apply_Q_UT( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_FORWARD, FLA_ROWWISE, A, T, W, x ); FLA_Obj_free( &W ); if ( m < n ) FLA_Part_2x1( x, &x, &W, m, FLA_TOP ); else FLA_Part_1x2( L, &L, &W, n, FLA_LEFT ); FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_ONE, L, x, FLA_ZERO, z ); // Comapre (A_copy, A_recovered), (y,z) and (y,w) residual_A = FLA_Max_elemwise_diff( A_copy, A_recovered ); residual_Axy = FLA_Max_elemwise_diff( y, z ); if ( 1 || residual_A > EPS || residual_Axy > EPS ) { FLA_Obj_fshow( stdout, " - Given - ", A_copy, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - Factor - ", A, "% 6.4e", "------"); 
FLA_Obj_fshow( stdout, " - T - ", T, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - Q - ", Q, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - D = Q^T Q - ", D, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - Qk - ", Qk, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - Dk = Qk^T Qk - ", Dk, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - L - ", L, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - Recovered A - ", A_recovered, "% 6.4e", "------"); fprintf( stdout, "lapack2flame: %lu x %lu, %lu: ", m, n, k); fprintf( stdout, "| A - A_recovered | = %12.10e, | Ax - y | = %12.10e\n\n", residual_A, residual_Axy ) ; } FLA_Obj_free( &A ); FLA_Obj_free( &T ); FLA_Obj_free( &A_copy ); FLA_Obj_free( &A_recovered ); FLA_Obj_free( &L ); FLA_Obj_free( &Q ); FLA_Obj_free( &Qk ); FLA_Obj_free( &D ); FLA_Obj_free( &Dk ); FLA_Obj_free( &x ); FLA_Obj_free( &y ); FLA_Obj_free( &z ); FLA_Finalize_safe( init_result ); }
int main(int argc, char *argv[]) { int m_input, n_input, m, n, min_m_n, p_first, p_last, p_inc, p, nb_alg, variant, n_repeats, i, datatype, n_variants = 2; char *colors = "brkgmcbrkg"; char *ticks = "o+*xso+*xs"; char m_dim_desc[14]; char n_dim_desc[14]; char m_dim_tag[10]; char n_dim_tag[10]; double max_gflops=6.0; double dtime, gflops, diff; FLA_Obj A, t, T, TT, w, W, WW, b, b_ref; /* Initialize FLAME */ FLA_Init(); fprintf( stdout, "%c number of repeats:", '%' ); scanf( "%d", &n_repeats ); fprintf( stdout, "%c %d\n", '%', n_repeats ); fprintf( stdout, "%c Enter blocking size:", '%' ); scanf( "%d", &nb_alg ); fprintf( stdout, "%c %d\n", '%', nb_alg ); fprintf( stdout, "%c enter problem size first, last, inc:", '%' ); scanf( "%d%d%d", &p_first, &p_last, &p_inc ); fprintf( stdout, "%c %d %d %d\n", '%', p_first, p_last, p_inc ); fprintf( stdout, "%c enter m n (-1 means bind to problem size): ", '%' ); scanf( "%d %d", &m_input, &n_input ); fprintf( stdout, "%c %d %d\n", '%', m_input, n_input ); fprintf( stdout, "\n" ); if ( m_input > 0 ) { sprintf( m_dim_desc, "m = %d", m_input ); sprintf( m_dim_tag, "m%dc", m_input); } else if( m_input < -1 ) { sprintf( m_dim_desc, "m = p/%d", -m_input ); sprintf( m_dim_tag, "m%dp", -m_input ); } else if( m_input == -1 ) { sprintf( m_dim_desc, "m = p" ); sprintf( m_dim_tag, "m%dp", 1 ); } if ( n_input > 0 ) { sprintf( n_dim_desc, "n = %d", n_input ); sprintf( n_dim_tag, "n%dc", n_input); } else if( n_input < -1 ) { sprintf( n_dim_desc, "n = p/%d", -n_input ); sprintf( n_dim_tag, "n%dp", -n_input ); } else if( n_input == -1 ) { sprintf( n_dim_desc, "n = p" ); sprintf( n_dim_tag, "n%dp", 1 ); } for ( p = p_first, i = 1; p <= p_last; p += p_inc, i += 1 ) { m = m_input; n = n_input; if( m < 0 ) m = p / abs(m_input); if( n < 0 ) n = p / abs(n_input); min_m_n = min( m, n ); //datatype = FLA_FLOAT; //datatype = FLA_DOUBLE; //datatype = FLA_COMPLEX; datatype = FLA_DOUBLE_COMPLEX; /* FLA_Obj_create( datatype, m, n, n, 1, &A ); 
FLA_Obj_create( datatype, m, n, n, 1, &A ); FLA_Obj_create( datatype, 1, min_m_n, 0, 0, &t ); FLA_Obj_create( datatype, m, n, n, 1, &T ); FLA_Obj_create( datatype, nb_alg, n, n, 1, &TT ); FLA_Obj_create( datatype, 1, 1, 0, 0, &w ); FLA_Obj_create( datatype, m, 1, 1, 1, &W ); FLA_Obj_create( datatype, nb_alg, 1, 1, 1, &WW ); FLA_Obj_create( datatype, m, 1, 1, 1, &b ); FLA_Obj_create( datatype, m, 1, 1, 1, &b_ref ); */ /* FLA_Obj_create( datatype, m, n, n, 1, &A ); //FLA_Obj_create( datatype, 1, min_m_n, 0, 0, &t ); //FLA_Obj_create( datatype, m, n, n, 1, &T ); //FLA_Obj_create( datatype, nb_alg, n, n, 1, &TT ); //FLA_Obj_create( datatype, 1, 1, 0, 0, &w ); //FLA_Obj_create( datatype, m, 1, 1, 1, &W ); //FLA_Obj_create( datatype, nb_alg, 1, 1, 1, &WW ); //FLA_Obj_create( datatype, m, 1, 1, 1, &b ); //FLA_Obj_create( datatype, m, 1, 1, 1, &b_ref ); */ //FLA_Obj_create( datatype, m, n, 0, 0, &A ); FLA_Obj_create( datatype, m, n, n, 1, &A ); FLA_Obj_create( datatype, 1, min_m_n, 0, 0, &t ); FLA_Obj_create( datatype, m, n, 0, 0, &T ); //FLA_Obj_create( datatype, m, n, n, 1, &T ); FLA_Obj_create( datatype, nb_alg, n, 0, 0, &TT ); FLA_Obj_create( datatype, 1, 1, 0, 0, &w ); FLA_Obj_create( datatype, m, 1, 0, 0, &W ); FLA_Obj_create( datatype, nb_alg, 1, 0, 0, &WW ); FLA_Obj_create( datatype, m, 1, 0, 0, &b ); FLA_Obj_create( datatype, m, 1, 0, 0, &b_ref ); FLA_Random_matrix( A ); FLA_Random_matrix( b ); /* time_LQ( 0, FLA_ALG_REFERENCE, n_repeats, m, n, nb_alg, A, A_ref, t, T, W, b, b_ref, &dtime, &diff, &gflops ); fprintf( stdout, "data_REF( %d, 1:2 ) = [ %d %6.3lf %6.2le ]; \n", i, p, gflops, diff ); fflush( stdout ); */ for ( variant = 1; variant <= n_variants; variant++ ){ fprintf( stdout, "data_var%d( %d, 1:3 ) = [ %d ", variant, i, p ); fflush( stdout ); time_LQ( variant, FLA_ALG_UNBLOCKED, n_repeats, m, n, nb_alg, A, t, T, TT, w, W, WW, b, b_ref, &dtime, &diff, &gflops ); fprintf( stdout, "%6.3lf %6.2le ", gflops, diff ); fflush( stdout ); time_LQ( variant, 
FLA_ALG_UNB_OPT1, n_repeats, m, n, nb_alg, A, t, T, TT, w, W, WW, b, b_ref, &dtime, &diff, &gflops ); fprintf( stdout, "%6.3lf %6.2le ", gflops, diff ); fflush( stdout ); time_LQ( variant, FLA_ALG_BLOCKED, n_repeats, m, n, nb_alg, A, t, T, TT, w, W, WW, b, b_ref, &dtime, &diff, &gflops ); fprintf( stdout, "%6.3lf %6.2le ", gflops, diff ); fflush( stdout ); fprintf( stdout, " ]; \n" ); fflush( stdout ); } fprintf( stdout, "\n" ); FLA_Obj_free( &A ); FLA_Obj_free( &t ); FLA_Obj_free( &T ); FLA_Obj_free( &TT ); FLA_Obj_free( &w ); FLA_Obj_free( &W ); FLA_Obj_free( &WW ); FLA_Obj_free( &b ); FLA_Obj_free( &b_ref ); } /* fprintf( stdout, "figure;\n" ); fprintf( stdout, "plot( data_REF( :,1 ), data_REF( :, 2 ), '-' ); \n" ); fprintf( stdout, "hold on;\n" ); for ( i = 1; i <= n_variants; i++ ) { fprintf( stdout, "plot( data_var%d( :,1 ), data_var%d( :, 2 ), '%c:%c' ); \n", i, i, colors[ i-1 ], ticks[ i-1 ] ); fprintf( stdout, "plot( data_var%d( :,1 ), data_var%d( :, 4 ), '%c-.%c' ); \n", i, i, colors[ i-1 ], ticks[ i-1 ] ); } fprintf( stdout, "legend( ... \n" ); fprintf( stdout, "'Reference', ... \n" ); for ( i = 1; i < n_variants; i++ ) fprintf( stdout, "'unb\\_var%d', 'blk\\_var%d', ... \n", i, i ); fprintf( stdout, "'unb\\_var%d', 'blk\\_var%d' ); \n", i, i ); fprintf( stdout, "xlabel( 'problem size p' );\n" ); fprintf( stdout, "ylabel( 'GFLOPS/sec.' );\n" ); fprintf( stdout, "axis( [ 0 %d 0 %.2f ] ); \n", p_last, max_gflops ); fprintf( stdout, "title( 'FLAME LQ performance (%s, %s)' );\n", m_dim_desc, n_dim_desc ); fprintf( stdout, "print -depsc lq_%s_%s.eps\n", m_dim_tag, n_dim_tag ); fprintf( stdout, "hold off;\n"); fflush( stdout ); */ FLA_Finalize( ); return 0; }
int main(int argc, char *argv[]) { int m_input, m, p_first, p_last, p_inc, p, b_alg, variant, n_repeats, i, datatype, n_variants = 1; char *colors = "brkgmcbrkg"; char *ticks = "o+*xso+*xs"; char m_dim_desc[14]; char m_dim_tag[10]; double max_gflops=6.0; double safemin; double dtime, gflops, diff; FLA_Obj A, l, Q, T, W; FLA_Init(); fprintf( stdout, "%c number of repeats:", '%' ); scanf( "%d", &n_repeats ); fprintf( stdout, "%c %d\n", '%', n_repeats ); fprintf( stdout, "%c Enter blocking size:", '%' ); scanf( "%d", &b_alg ); fprintf( stdout, "%c %d\n", '%', b_alg ); fprintf( stdout, "%c enter problem size first, last, inc:", '%' ); scanf( "%d%d%d", &p_first, &p_last, &p_inc ); fprintf( stdout, "%c %d %d %d\n", '%', p_first, p_last, p_inc ); fprintf( stdout, "%c enter m (-1 means bind to problem size): ", '%' ); scanf( "%d", &m_input ); fprintf( stdout, "%c %d\n", '%', m_input ); fprintf( stdout, "\n" ); if ( m_input > 0 ) { sprintf( m_dim_desc, "m = %d", m_input ); sprintf( m_dim_tag, "m%dc", m_input); } else if( m_input < -1 ) { sprintf( m_dim_desc, "m = p/%d", -m_input ); sprintf( m_dim_tag, "m%dp", -m_input ); } else if( m_input == -1 ) { sprintf( m_dim_desc, "m = p" ); sprintf( m_dim_tag, "m%dp", 1 ); } /* char ch = 's'; safemin = dlamch_( &ch ); printf( "safemin = %23.15e\n", safemin ); ch = 'e'; double eps = dlamch_( &ch ); printf( "eps dla = %23.15e\n", eps ); printf( "eps fla = %23.15e\n", FLA_EPSILON_D ); */ for ( p = p_first, i = 1; p <= p_last; p += p_inc, i += 1 ) { m = m_input; if( m < 0 ) m = p / f2c_abs(m_input); //datatype = FLA_FLOAT; //datatype = FLA_DOUBLE; //datatype = FLA_COMPLEX; datatype = FLA_DOUBLE_COMPLEX; FLA_Obj_create( datatype, m, m, 0, 0, &A ); FLA_Obj_create( datatype, m, m, 0, 0, &Q ); FLA_Obj_create( datatype, 32, m, 0, 0, &T ); FLA_Obj_create( datatype, 32, m, 0, 0, &W ); FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), m, 1, 0, 0, &l ); //FLA_Random_herm_matrix( FLA_LOWER_TRIANGULAR, A ); //FLA_Random_spd_matrix( 
FLA_LOWER_TRIANGULAR, A ); FLA_Random_matrix( A ); FLA_Obj_set_to_identity( Q ); FLA_QR_UT( A, T ); FLA_Apply_Q_UT( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_FORWARD, FLA_COLUMNWISE, A, T, W, Q ); fill_eigenvalues( l ); //FLA_Obj_show( "eig", l, "%9.2e ", "" ); FLA_Apply_diag_matrix( FLA_LEFT, FLA_NO_CONJUGATE, l, Q ); FLA_Apply_Q_UT( FLA_LEFT, FLA_NO_TRANSPOSE, FLA_FORWARD, FLA_COLUMNWISE, A, T, W, Q ); FLA_Triangularize( FLA_LOWER_TRIANGULAR, FLA_NONUNIT_DIAG, Q ); FLA_Copy( Q, A ); time_Hevd_ln( 0, FLA_ALG_REFERENCE, n_repeats, m, b_alg, A, l, &dtime, &diff, &gflops ); fprintf( stdout, "data_REFs( %d, 1:2 ) = [ %d %6.3lf %6.2le ]; \n", i, p, gflops, diff ); fflush( stdout ); time_Hevd_ln( -1, FLA_ALG_REFERENCE, n_repeats, m, b_alg, A, l, &dtime, &diff, &gflops ); fprintf( stdout, "data_REFd( %d, 1:2 ) = [ %d %6.3lf %6.2le ]; \n", i, p, gflops, diff ); fflush( stdout ); for ( variant = 1; variant <= n_variants; variant++ ){ fprintf( stdout, "data_var%d( %d, 1:9 ) = [ %d ", variant, i, p ); fflush( stdout ); time_Hevd_ln( variant, FLA_ALG_UNBLOCKED, n_repeats, m, b_alg, A, l, &dtime, &diff, &gflops ); fprintf( stdout, "%6.3lf %6.2le ", gflops, diff ); fflush( stdout ); //time_Hevd_ln( variant, FLA_ALG_UNB_OPT, n_repeats, m, b_alg, // A, l, &dtime, &diff, &gflops ); //fprintf( stdout, "%6.3lf %6.2le ", gflops, diff ); //fflush( stdout ); fprintf( stdout, "];\n" ); fflush( stdout ); } fprintf( stdout, "\n" ); FLA_Obj_free( &A ); FLA_Obj_free( &T ); FLA_Obj_free( &W ); FLA_Obj_free( &Q ); FLA_Obj_free( &l ); } /* fprintf( stdout, "figure;\n" ); fprintf( stdout, "plot( data_REF( :,1 ), data_REF( :, 2 ), '-' ); \n" ); fprintf( stdout, "hold on;\n" ); for ( i = 1; i <= n_variants; i++ ) { fprintf( stdout, "plot( data_var%d( :,1 ), data_var%d( :, 2 ), '%c:%c' ); \n", i, i, colors[ i-1 ], ticks[ i-1 ] ); fprintf( stdout, "plot( data_var%d( :,1 ), data_var%d( :, 4 ), '%c-.%c' ); \n", i, i, colors[ i-1 ], ticks[ i-1 ] ); } fprintf( stdout, "legend( ... 
\n" ); fprintf( stdout, "'Reference', ... \n" ); for ( i = 1; i < n_variants; i++ ) fprintf( stdout, "'unb\\_var%d', 'blk\\_var%d', ... \n", i, i ); fprintf( stdout, "'unb\\_var%d', 'blk\\_var%d' ); \n", i, i ); fprintf( stdout, "xlabel( 'problem size p' );\n" ); fprintf( stdout, "ylabel( 'GFLOPS/sec.' );\n" ); fprintf( stdout, "axis( [ 0 %d 0 %.2f ] ); \n", p_last, max_gflops ); fprintf( stdout, "title( 'FLAME Hevd_ln performance (%s, %s)' );\n", m_dim_desc, n_dim_desc ); fprintf( stdout, "print -depsc tridiag_%s_%s.eps\n", m_dim_tag, n_dim_tag ); fprintf( stdout, "hold off;\n"); fflush( stdout ); */ FLA_Finalize( ); return 0; }
int main(int argc, char *argv[]) { int datatype, m_input, m, p_first, p_last, p_inc, p, variant, n_repeats, i, j, nb_alg, nfc, nlc, n_variants = 1; char *colors = "brkgmcbrkg"; char *ticks = "o+*xso+*xs"; char m_dim_desc[14]; char m_dim_tag[10]; double max_gflops=6.0; double dtime, gflops, diff; FLA_Obj C, C_ref, t; FLA_Init(); fprintf( stdout, "%c number of repeats:", '%' ); scanf( "%d", &n_repeats ); fprintf( stdout, "%c %d\n", '%', n_repeats ); fprintf( stdout, "%c enter problem size first, last, inc:", '%' ); scanf( "%d%d%d", &p_first, &p_last, &p_inc ); fprintf( stdout, "%c %d %d %d\n", '%', p_first, p_last, p_inc ); fprintf( stdout, "%c enter m (-1 means bind to problem size): ", '%' ); scanf( "%d", &m_input ); fprintf( stdout, "%c %d\n", '%', m_input ); fprintf( stdout, "%c enter nfc, nlc (number of columns, initial and trailing, not processed): ", '%' ); scanf( "%d %d", &nfc, &nlc ); fprintf( stdout, "%c %d %d\n", '%', nfc, nlc ); fprintf( stdout, "\nclear all;\n\n" ); if ( m_input > 0 ) { sprintf( m_dim_desc, "m = %d", m_input ); sprintf( m_dim_tag, "m%dc", m_input); } else if( m_input < -1 ) { sprintf( m_dim_desc, "m = p/%d", -m_input ); sprintf( m_dim_tag, "m%dp", -m_input ); } else if( m_input == -1 ) { sprintf( m_dim_desc, "m = p" ); sprintf( m_dim_tag, "m%dp", 1 ); } //datatype = FLA_FLOAT; datatype = FLA_DOUBLE; //datatype = FLA_COMPLEX; //datatype = FLA_DOUBLE_COMPLEX; for ( p = p_first, i = 1; p <= p_last; p += p_inc, i += 1 ) { m = m_input; if( m < 0 ) m = p / abs(m_input); for ( variant = 0; variant < n_variants; variant++ ){ FLA_Obj_create( datatype, m, m, &C ); FLA_Obj_create( datatype, m, m, &C_ref ); FLA_Obj_create( datatype, m, 1, &t ); FLA_Random_matrix( C ); FLA_Copy_external( C, C_ref ); fprintf( stdout, "data_hess( %d, 1:5 ) = [ %d ", i, p ); fflush( stdout ); time_Hess( variant, FLA_ALG_REFERENCE, n_repeats, m, nfc, nlc, C, C_ref, t, &dtime, &diff, &gflops ); fprintf( stdout, "%6.3lf %6.2le ", gflops, diff ); fflush( stdout ); 
time_Hess( variant, FLA_ALG_FRONT, n_repeats, m, nfc, nlc, C, C_ref, t, &dtime, &diff, &gflops ); fprintf( stdout, "%6.3lf %6.2le ", gflops, diff ); fflush( stdout ); fprintf( stdout, " ]; \n" ); fflush( stdout ); FLA_Obj_free( &C ); FLA_Obj_free( &C_ref ); FLA_Obj_free( &t ); } fprintf( stdout, "\n" ); } fprintf( stdout, "figure;\n" ); fprintf( stdout, "hold on;\n" ); fprintf( stdout, "plot( data_hess( :,1 ), data_hess( :, 2 ), '%c:%c' ); \n", colors[ 0 ], ticks[ 0 ] ); fprintf( stdout, "plot( data_hess( :,1 ), data_hess( :, 4 ), '%c:%c' ); \n", colors[ 1 ], ticks[ 1 ] ); fprintf( stdout, "legend( ... \n" ); fprintf( stdout, "'ref\\_hess', 'fla\\_hess', ... \n" ); fprintf( stdout, "'Location', 'SouthEast' ); \n" ); fprintf( stdout, "xlabel( 'problem size p' );\n" ); fprintf( stdout, "ylabel( 'GFLOPS/sec.' );\n" ); fprintf( stdout, "axis( [ 0 %d 0 %.2f ] ); \n", p_last, max_gflops ); fprintf( stdout, "title( 'FLAME Hessenberg reduction front-end performance (%s)' );\n", m_dim_desc ); fprintf( stdout, "print -depsc hess_front_%s.eps\n", m_dim_tag ); fprintf( stdout, "hold off;\n"); fflush( stdout ); FLA_Finalize( ); }
int main(int argc, char *argv[]) { int datatype, n_threads, m_input, m, n_input, n, p_first, p_last, p_inc, p, n_repeats, param_combo, i, n_param_combos = N_PARAM_COMBOS; dim_t n_panels, nb_flash, nb_alg; double dtime, gflops, diff; FLA_Obj A, ATW, R, RTW, b, x; FLA_Obj A_flat, b_flat, x_flat; FLA_Init( ); fprintf( stdout, "%c number of repeats: ", '%' ); scanf( "%d", &n_repeats ); fprintf( stdout, "%c %d\n", '%', n_repeats ); fprintf( stdout, "%c enter algorithmic blocksize: ", '%' ); scanf( "%u", &nb_alg ); fprintf( stdout, "%c %u\n", '%', nb_alg ); fprintf( stdout, "%c enter FLASH blocksize: ", '%' ); scanf( "%u", &nb_flash ); fprintf( stdout, "%c %u\n", '%', nb_flash ); fprintf( stdout, "%c enter problem size first, last, inc: ", '%' ); scanf( "%d%d%d", &p_first, &p_last, &p_inc ); fprintf( stdout, "%c %d %d %d\n", '%', p_first, p_last, p_inc ); fprintf( stdout, "%c enter m n (-1 means bind to problem size): ", '%' ); scanf( "%d %d", &m_input, &n_input ); fprintf( stdout, "%c %d %d\n", '%', m_input, n_input ); fprintf( stdout, "%c enter the number of QR subproblem panels: ", '%' ); scanf( "%u", &n_panels ); fprintf( stdout, "%c %u\n", '%', n_panels ); fprintf( stdout, "%c enter the number of SuperMatrix threads: ", '%' ); scanf( "%d", &n_threads ); fprintf( stdout, "%c %d\n", '%', n_threads ); //datatype = FLA_FLOAT; datatype = FLA_DOUBLE; //datatype = FLA_COMPLEX; //datatype = FLA_DOUBLE_COMPLEX; //FLASH_Queue_disable(); FLASH_Queue_set_num_threads( n_threads ); //FLASH_Queue_set_verbose_output( TRUE ); // FLA_Check_error_level_set( FLA_NO_ERROR_CHECKING ); for ( p = p_first, i = 1; p <= p_last; p += p_inc, i += 1 ) { m = m_input; n = n_input; if ( m < 0 ) m = p * f2c_abs(m_input); if ( n < 0 ) n = p * f2c_abs(n_input); for ( param_combo = 0; param_combo < n_param_combos; param_combo++ ) { FLA_Obj_create( datatype, m, n, 0, 0, &A_flat ); FLA_Obj_create( datatype, n, 1, 0, 0, &x_flat ); FLA_Obj_create( datatype, m, 1, 0, 0, &b_flat ); FLA_Random_matrix( A_flat 
); FLA_Random_matrix( b_flat ); FLASH_CAQR_UT_inc_create_hier_matrices( n_panels, A_flat, 1, &nb_flash, nb_alg, &A, &ATW, &R, &RTW ); FLASH_Obj_create_hier_copy_of_flat( b_flat, 1, &nb_flash, &b ); FLASH_Obj_create_hier_copy_of_flat( x_flat, 1, &nb_flash, &x ); fprintf( stdout, "data_caqrutinc_%s( %d, 1:3 ) = [ %d ", pc_str[param_combo], i, p ); fflush( stdout ); time_CAQR_UT_inc( param_combo, FLA_ALG_FRONT, n_repeats, m, n, n_panels, A, ATW, R, RTW, b, x, &dtime, &diff, &gflops ); fprintf( stdout, "%6.3lf %6.2le ", gflops, diff ); fflush( stdout ); fprintf( stdout, " ]; \n" ); fflush( stdout ); FLA_Obj_free( &A_flat ); FLA_Obj_free( &b_flat ); FLA_Obj_free( &x_flat ); FLASH_Obj_free( &A ); FLASH_Obj_free( &ATW ); FLASH_Obj_free( &R ); FLASH_Obj_free( &RTW ); FLASH_Obj_free( &b ); FLASH_Obj_free( &x ); } } FLA_Finalize( ); return 0; }
int main( int argc, char *argv[] ) { int i, j, size, n_threads, n_repeats, n_trials, nb_alg, increment, begin; FLA_Datatype datatype = FLA_DOUBLE; FLA_Obj A; double b_norm_value = 0.0, dtime, *dtimes, *flops, *T; char output_file_m[100]; FILE *fpp; fprintf( stdout, "%c Enter number of repeats: ", '%' ); scanf( "%d", &n_repeats ); fprintf( stdout, "%c %d\n", '%', n_repeats ); fprintf( stdout, "%c Enter blocksize: ", '%' ); scanf( "%d", &nb_alg ); fprintf( stdout, "%c %d\n", '%', nb_alg ); fprintf( stdout, "%c Enter problem size parameters: first, inc, num: ", '%' ); scanf( "%d%d%d", &begin, &increment, &n_trials ); fprintf( stdout, "%c %d %d %d\n", '%', begin, increment, n_trials ); fprintf( stdout, "%c Enter number of threads: ", '%' ); scanf( "%d", &n_threads ); fprintf( stdout, "%c %d\n\n", '%', n_threads ); sprintf( output_file_m, "%s/%s_output.m", OUTPUT_PATH, OUTPUT_FILE ); fpp = fopen( output_file_m, "a" ); fprintf( fpp, "%%\n" ); fprintf( fpp, "%% | Matrix Size | PLASMA |\n" ); fprintf( fpp, "%% | n x n | GFlops |\n" ); fprintf( fpp, "%% -----------------------------\n" ); FLA_Init(); PLASMA_Init( n_threads ); PLASMA_Disable( PLASMA_AUTOTUNING ); PLASMA_Set( PLASMA_TILE_SIZE, nb_alg ); PLASMA_Set( PLASMA_INNER_BLOCK_SIZE, nb_alg / 4 ); dtimes = ( double * ) FLA_malloc( n_repeats * sizeof( double ) ); flops = ( double * ) FLA_malloc( n_trials * sizeof( double ) ); fprintf( fpp, "%s = [\n", OUTPUT_FILE ); for ( i = 0; i < n_trials; i++ ) { size = begin + i * increment; FLA_Obj_create( datatype, size, size, 0, 0, &A ); for ( j = 0; j < n_repeats; j++ ) { FLA_Random_matrix( A ); PLASMA_Alloc_Workspace_dgeqrf( size, size, &T ); dtime = FLA_Clock(); PLASMA_dgeqrf( size, size, FLA_Obj_buffer_at_view( A ), size, T ); dtime = FLA_Clock() - dtime; dtimes[j] = dtime; free( T ); } dtime = dtimes[0]; for ( j = 1; j < n_repeats; j++ ) dtime = min( dtime, dtimes[j] ); flops[i] = 4.0 / 3.0 * size * size * size / dtime / 1e9; fprintf( fpp, " %d %6.3f\n", size, flops[i] ); 
printf( "Time: %e | GFlops: %6.3f\n", dtime, flops[i] ); printf( "Matrix size: %d x %d | nb_alg: %d\n", size, size, nb_alg ); printf( "Norm of difference: %le\n\n", b_norm_value ); FLA_Obj_free( &A ); } fprintf( fpp, "];\n" ); fflush( fpp ); fclose( fpp ); FLA_free( dtimes ); FLA_free( flops ); PLASMA_Finalize(); FLA_Finalize(); return 0; }
int main(int argc, char *argv[]) { int datatype, m_input, k_input, m, k, p_first, p_last, p_inc, p, n_repeats, param_combo, i, n_param_combos = N_PARAM_COMBOS; char *colors = "brkgmcbrkgmcbrkgmc"; char *ticks = "o+*xso+*xso+*xso+*xs"; char m_dim_desc[14]; char k_dim_desc[14]; char m_dim_tag[10]; char k_dim_tag[10]; double max_gflops=6.0; double dtime, gflops, diff; FLA_Obj A, B, C, C_ref; FLA_Init( ); fprintf( stdout, "%c number of repeats:", '%' ); scanf( "%d", &n_repeats ); fprintf( stdout, "%c %d\n", '%', n_repeats ); fprintf( stdout, "%c enter problem size first, last, inc:", '%' ); scanf( "%d%d%d", &p_first, &p_last, &p_inc ); fprintf( stdout, "%c %d %d %d\n", '%', p_first, p_last, p_inc ); fprintf( stdout, "%c enter m k (-1 means bind to problem size): ", '%' ); scanf( "%d%d", &m_input, &k_input ); fprintf( stdout, "%c %d %d\n", '%', m_input, k_input ); fprintf( stdout, "\nclear all;\n\n" ); if ( m_input > 0 ) { sprintf( m_dim_desc, "m = %d", m_input ); sprintf( m_dim_tag, "m%dc", m_input); } else if( m_input < -1 ) { sprintf( m_dim_desc, "m = p/%d", -m_input ); sprintf( m_dim_tag, "m%dp", -m_input ); } else if( m_input == -1 ) { sprintf( m_dim_desc, "m = p" ); sprintf( m_dim_tag, "m%dp", 1 ); } if ( k_input > 0 ) { sprintf( k_dim_desc, "k = %d", k_input ); sprintf( k_dim_tag, "k%dc", k_input); } else if( k_input < -1 ) { sprintf( k_dim_desc, "k = p/%d", -k_input ); sprintf( k_dim_tag, "k%dp", -k_input ); } else if( k_input == -1 ) { sprintf( k_dim_desc, "k = p" ); sprintf( k_dim_tag, "k%dp", 1 ); } //datatype = FLA_FLOAT; datatype = FLA_DOUBLE; //datatype = FLA_COMPLEX; //datatype = FLA_DOUBLE_COMPLEX; for ( p = p_first, i = 1; p <= p_last; p += p_inc, i += 1 ) { m = m_input; k = k_input; if( m < 0 ) m = p / f2c_abs(m_input); if( k < 0 ) k = p / f2c_abs(k_input); for ( param_combo = 0; param_combo < n_param_combos; param_combo++ ){ // If transposing A, switch dimensions. 
if ( pc_str[param_combo][1] == 'n' ) { FLA_Obj_create( datatype, m, k, 0, 0, &A ); FLA_Obj_create( datatype, m, k, 0, 0, &B ); } else { FLA_Obj_create( datatype, k, m, 0, 0, &A ); FLA_Obj_create( datatype, k, m, 0, 0, &B ); } FLA_Obj_create( datatype, m, m, 0, 0, &C ); FLA_Obj_create( datatype, m, m, 0, 0, &C_ref ); FLA_Random_matrix( A ); FLA_Random_matrix( B ); FLA_Random_matrix( C ); fprintf( stdout, "data_syr2k_%s( %d, 1:3 ) = [ %d ", pc_str[param_combo], i, p ); fflush( stdout ); time_Syr2k( param_combo, FLA_ALG_REFERENCE, n_repeats, m, k, A, B, C, C_ref, &dtime, &diff, &gflops ); fprintf( stdout, "%6.3lf %6.2le ", gflops, diff ); fflush( stdout ); /* time_Syr2k( param_combo, FLA_ALG_FRONT, n_repeats, m, k, A, B, C, C_ref, &dtime, &diff, &gflops ); fprintf( stdout, "%6.3lf %6.2le ", gflops, diff ); fflush( stdout ); */ fprintf( stdout, " ]; \n" ); fflush( stdout ); FLA_Obj_free( &A ); FLA_Obj_free( &B ); FLA_Obj_free( &C ); FLA_Obj_free( &C_ref ); } fprintf( stdout, "\n" ); } /* fprintf( stdout, "figure;\n" ); fprintf( stdout, "hold on;\n" ); for ( i = 0; i < n_param_combos; i++ ) { fprintf( stdout, "plot( data_syr2k_%s( :,1 ), data_syr2k_%s( :, 2 ), '%c:%c' ); \n", pc_str[i], pc_str[i], colors[ i ], ticks[ i ] ); fprintf( stdout, "plot( data_syr2k_%s( :,1 ), data_syr2k_%s( :, 4 ), '%c-.%c' ); \n", pc_str[i], pc_str[i], colors[ i ], ticks[ i ] ); } fprintf( stdout, "legend( ... \n" ); for ( i = 0; i < n_param_combos; i++ ) fprintf( stdout, "'ref\\_syr2k\\_%s', 'fla\\_syr2k\\_%s', ... \n", pc_str[i], pc_str[i] ); fprintf( stdout, "'Location', 'SouthEast' ); \n" ); fprintf( stdout, "xlabel( 'problem size p' );\n" ); fprintf( stdout, "ylabel( 'GFLOPS/sec.' 
);\n" ); fprintf( stdout, "axis( [ 0 %d 0 %.2f ] ); \n", p_last, max_gflops ); fprintf( stdout, "title( 'FLAME syr2k front-end performance (%s, %s)' );\n", m_dim_desc, k_dim_desc ); fprintf( stdout, "print -depsc syr2k_front_%s_%s.eps\n", m_dim_tag, k_dim_tag ); fprintf( stdout, "hold off;\n"); fflush( stdout ); */ FLA_Finalize( ); return 0; }
int main(int argc, char *argv[]) { int datatype, n_threads, m_input, m, n_input, n, p_first, p_last, p_inc, p, n_repeats, param_combo, i, n_param_combos = N_PARAM_COMBOS; dim_t b_flash, b_alg; char *colors = "brkgmcbrkgmcbrkgmc"; char *ticks = "o+*xso+*xso+*xso+*xs"; char m_dim_desc[14]; char m_dim_tag[10]; double max_gflops=6.0; double dtime, gflops, diff; FLA_Obj A, TW, b, x; FLA_Obj A_flat, b_flat, x_flat; FLA_Init( ); fprintf( stdout, "%c number of repeats: ", '%' ); scanf( "%d", &n_repeats ); fprintf( stdout, "%c %d\n", '%', n_repeats ); fprintf( stdout, "%c enter FLASH blocksize: ", '%' ); scanf( "%u", &b_flash ); fprintf( stdout, "%c %u\n", '%', b_flash ); fprintf( stdout, "%c enter problem size first, last, inc: ", '%' ); scanf( "%d%d%d", &p_first, &p_last, &p_inc ); fprintf( stdout, "%c %d %d %d\n", '%', p_first, p_last, p_inc ); fprintf( stdout, "%c enter m n (-1 means bind to problem size): ", '%' ); scanf( "%d %d", &m_input, &n_input ); fprintf( stdout, "%c %d %d\n", '%', m_input, n_input ); fprintf( stdout, "%c enter the number of SuperMatrix threads: ", '%' ); scanf( "%d", &n_threads ); fprintf( stdout, "%c %d\n", '%', n_threads ); fprintf( stdout, "\n" ); if ( m_input > 0 ) { sprintf( m_dim_desc, "m = %d", m_input ); sprintf( m_dim_tag, "m%dc", m_input); } else if( m_input < -1 ) { sprintf( m_dim_desc, "m = p/%d", -m_input ); sprintf( m_dim_tag, "m%dp", -m_input ); } else if( m_input == -1 ) { sprintf( m_dim_desc, "m = p" ); sprintf( m_dim_tag, "m%dp", 1 ); } //datatype = FLA_FLOAT; datatype = FLA_DOUBLE; //datatype = FLA_COMPLEX; //datatype = FLA_DOUBLE_COMPLEX; FLASH_Queue_set_num_threads( n_threads ); //FLASH_Queue_set_verbose_output( TRUE ); //FLA_Check_error_level_set( FLA_NO_ERROR_CHECKING ); //FLASH_Queue_disable(); for ( p = p_first, i = 1; p <= p_last; p += p_inc, i += 1 ) { m = m_input; n = n_input; if ( m < 0 ) m = p * abs(m_input); if ( n < 0 ) n = p * abs(n_input); for ( param_combo = 0; param_combo < n_param_combos; param_combo++ ) { 
FLA_Obj_create( datatype, m, n, 0, 0, &A_flat ); FLA_Obj_create( datatype, n, 1, 0, 0, &x_flat ); FLA_Obj_create( datatype, m, 1, 0, 0, &b_flat ); FLA_Random_matrix( A_flat ); FLA_Random_matrix( b_flat ); FLASH_QR_UT_create_hier_matrices( A_flat, 1, &b_flash, &A, &TW ); FLASH_Obj_create_hier_copy_of_flat( b_flat, 1, &b_flash, &b ); FLASH_Obj_create_hier_copy_of_flat( x_flat, 1, &b_flash, &x ); fprintf( stdout, "data_qrut_%s( %d, 1:5 ) = [ %d ", pc_str[param_combo], i, p ); fflush( stdout ); time_QR_UT( param_combo, FLA_ALG_FRONT, n_repeats, m, n, A, TW, b, x, &dtime, &diff, &gflops ); fprintf( stdout, "%6.3lf %6.2le ", gflops, diff ); fflush( stdout ); fprintf( stdout, " ]; \n" ); fflush( stdout ); FLA_Obj_free( &A_flat ); FLA_Obj_free( &b_flat ); FLA_Obj_free( &x_flat ); FLASH_Obj_free( &A ); FLASH_Obj_free( &TW ); FLASH_Obj_free( &b ); FLASH_Obj_free( &x ); } } /* fprintf( stdout, "figure;\n" ); fprintf( stdout, "hold on;\n" ); for ( i = 0; i < n_param_combos; i++ ) { fprintf( stdout, "plot( data_qrut_%s( :,1 ), data_qrut_%s( :, 2 ), '%c:%c' ); \n", pc_str[i], pc_str[i], colors[ i ], ticks[ i ] ); fprintf( stdout, "plot( data_qrut_%s( :,1 ), data_qrut_%s( :, 4 ), '%c-.%c' ); \n", pc_str[i], pc_str[i], colors[ i ], ticks[ i ] ); } fprintf( stdout, "legend( ... \n" ); for ( i = 0; i < n_param_combos; i++ ) fprintf( stdout, "'ref\\_qrut\\_%s', 'fla\\_qrut\\_%s', ... \n", pc_str[i], pc_str[i] ); fprintf( stdout, "'Location', 'SouthEast' ); \n" ); fprintf( stdout, "xlabel( 'problem size p' );\n" ); fprintf( stdout, "ylabel( 'GFLOPS/sec.' );\n" ); fprintf( stdout, "axis( [ 0 %d 0 %.2f ] ); \n", p_last, max_gflops ); fprintf( stdout, "title( 'FLAME qrut front-end performance (%s)' );\n", m_dim_desc ); fprintf( stdout, "print -depsc qrut_front_%s.eps\n", m_dim_tag ); fprintf( stdout, "hold off;\n"); fflush( stdout ); */ FLA_Finalize( ); return 0; }
int main( int argc, char** argv ) { FLA_Datatype comptype = COMPTYPE; FLA_Datatype realtype = REALTYPE; dim_t m; FLA_Obj a, aT, aB, a0, a1, a2; FLA_Obj v, vT, vB, v0, v1, v2; FLA_Error init_result; int use_abs = 1; if ( argc == 3 ) { m = atoi(argv[1]); use_abs = atoi(argv[2]); } else { fprintf(stderr, " \n"); fprintf(stderr, "Usage: %s m use_abs\n", argv[0]); fprintf(stderr, " m : test vector length\n"); fprintf(stderr, " use_abs : 0 - norm (realtype), 1 - abs (complex type)\n"); fprintf(stderr, " \n"); return -1; } if ( m == 0 ) return 0; FLA_Init_safe( &init_result ); FLA_Obj_create( comptype, m, 1, 0, 0, &a ); FLA_Obj_create( use_abs ? comptype : realtype, m, 1, 0, 0, &v ); FLA_Random_matrix( a ); FLA_Set( FLA_ZERO, v ); FLA_Obj_fshow( stdout, "- a -", a, "% 6.4e", "--" ); // Normalize a vector FLA_Part_2x1( a, &aT, &aB, 0, FLA_TOP ); FLA_Part_2x1( v, &vT, &vB, 0, FLA_TOP ); while ( FLA_Obj_length( aB ) > 0 ) { FLA_Repart_2x1_to_3x1( aT, &a0, &a1, aB, &a2, 1, FLA_BOTTOM ); FLA_Repart_2x1_to_3x1( vT, &v0, &v1, vB, &v2, 1, FLA_BOTTOM ); // -------------------------------------------- if ( use_abs ) { // a and v are complex datatype FLA_Copy( a1, v1 ); FLA_Absolute_value( v1 ); } else { // v is real datatype FLA_Nrm2( a1, v1 ); } if ( FLA_Obj_equals( v1, FLA_ZERO ) ) printf( " ZERO DETECTED\n" ); else FLA_Inv_scal( v1, a1 ); // Normalize the scalar // -------------------------------------------- FLA_Cont_with_3x1_to_2x1( &aT, a0, a1, &aB, a2, FLA_TOP ); FLA_Cont_with_3x1_to_2x1( &vT, v0, v1, &vB, v2, FLA_TOP ); } FLA_Obj_fshow( stdout, "- a -", a, "% 6.4e", "--" ); FLA_Obj_fshow( stdout, "- v -", v, "% 6.4e", "--" ); // Check whether it is normalized FLA_Part_2x1( a, &aT, &aB, 0, FLA_TOP ); FLA_Part_2x1( v, &vT, &vB, 0, FLA_TOP ); while ( FLA_Obj_length( aB ) > 0 ) { FLA_Repart_2x1_to_3x1( aT, &a0, &a1, aB, &a2, 1, FLA_BOTTOM ); FLA_Repart_2x1_to_3x1( vT, &v0, &v1, vB, &v2, 1, FLA_BOTTOM ); // -------------------------------------------- if ( use_abs ) { // a and v 
are same datatype FLA_Copy( a1, v1 ); FLA_Absolute_value( v1 ); } else { // v is realdatatype FLA_Nrm2( a1, v1 ); } // -------------------------------------------- FLA_Cont_with_3x1_to_2x1( &aT, a0, a1, &aB, a2, FLA_TOP ); FLA_Cont_with_3x1_to_2x1( &vT, v0, v1, &vB, v2, FLA_TOP ); } FLA_Obj_fshow( stdout, " - all should be one - ", v, "% 6.4e", "--"); FLA_Obj_free( &a ); FLA_Obj_free( &v ); FLA_Finalize_safe( init_result ); }
/*
 * Runs one QR_UT experiment and reports its performance and residual.
 *
 * params    : test parameters; b_flash and b_alg_flat are read from it.
 * var       : variant number; selects the shape of T and the control tree.
 * sc_str    : storage characters; sc_str[0] is used for A, sc_str[1] for T.
 * datatype  : datatype of every matrix and vector created here.
 * p_cur     : current problem size; m = 2 * p_cur and n = p_cur with the
 *             dimension inputs fixed in this function.
 * pci       : not used by this function.
 * n_repeats : number of timed repetitions; the fastest one is reported.
 * impl      : implementation under test (FLA_TEST_* constant).
 * perf      : output, (2 m n^2 - 2/3 n^3) / best time / FLOPS_PER_UNIT_PERF,
 *             multiplied by 4 for complex datatypes.
 * residual  : output, norm of A' * ( A * x - b ), where x is the solution
 *             computed from the factorization.
 */
void libfla_test_qrut_experiment( test_params_t params,
                                  unsigned int  var,
                                  char*         sc_str,
                                  FLA_Datatype  datatype,
                                  unsigned int  p_cur,
                                  unsigned int  pci,
                                  unsigned int  n_repeats,
                                  signed int    impl,
                                  double*       perf,
                                  double*       residual )
{
  dim_t        b_flash    = params.b_flash;
  dim_t        b_alg_flat = params.b_alg_flat;
  double       time_min   = 1e9;
  double       time;
  unsigned int rep;
  unsigned int m, n;
  unsigned int min_m_n;
  unsigned int t_rows;
  signed int   m_input    = -2;
  signed int   n_input    = -1;

  /* Which family of implementations is being exercised. */
  const int    is_hier    = ( impl == FLA_TEST_HIER_FRONT_END );
  const int    is_variant = ( impl == FLA_TEST_FLAT_UNB_VAR ||
                              impl == FLA_TEST_FLAT_OPT_VAR ||
                              impl == FLA_TEST_FLAT_BLK_VAR );

  FLA_Obj      A, T, x, b, y, norm;
  FLA_Obj      A_save;
  FLA_Obj      A_test, T_test, x_test, b_test;

  // Negative dimension inputs scale the problem size by their magnitude.
  m = ( m_input < 0 ) ? p_cur * abs(m_input) : p_cur;
  n = ( n_input < 0 ) ? p_cur * abs(n_input) : p_cur;

  min_m_n = min( m, n );

  // A is m x n.
  libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[0], m, n, &A );

  // T always has min_m_n columns; only its row count depends on the
  // implementation and variant being tested.
  if ( impl == FLA_TEST_FLAT_FRONT_END ||
       ( impl == FLA_TEST_FLAT_BLK_VAR && var == 1 ) )
    t_rows = b_alg_flat;
  else if ( var == 2 )
    t_rows = min_m_n;
  else
    t_rows = 1;

  libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], t_rows, min_m_n, &T );

  // Randomize A and keep a pristine copy for the repeats and the residual.
  FLA_Random_matrix( A );
  FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &A_save );

  // Vectors of the linear system, plus y for the residual computation.
  FLA_Obj_create( datatype, n, 1, 0, 0, &x );
  FLA_Obj_create( datatype, m, 1, 0, 0, &b );
  FLA_Obj_create( datatype, n, 1, 0, 0, &y );

  // Real scalar that receives the residual norm.
  FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );

  // Random right-hand side.
  FLA_Random_matrix( b );

  // The FLASH front-end operates on hierarchical copies; everything else
  // works directly on the flat objects.
  if ( is_hier )
  {
    FLASH_QR_UT_create_hier_matrices( A, 1, &b_flash, &A_test, &T_test );
    FLASH_Obj_create_hier_copy_of_flat( b, 1, &b_flash, &b_test );
    FLASH_Obj_create_hier_copy_of_flat( x, 1, &b_flash, &x_test );
  }
  else
  {
    A_test = A;
    T_test = T;
  }

  // The individual variants need a control tree.
  if ( is_variant )
    libfla_test_qrut_cntl_create( var, b_alg_flat );

  // Time the factorization n_repeats times, restoring A before each run.
  for ( rep = 0; rep < n_repeats; ++rep )
  {
    if ( is_hier )
      FLASH_Obj_hierarchify( A_save, A_test );
    else
      FLA_Copy_external( A_save, A_test );

    time = FLA_Clock();

    libfla_test_qrut_impl( impl, A_test, T_test );

    time = FLA_Clock() - time;
    time_min = min( time_min, time );
  }

  // Solve with the factorization, then release any hierarchical objects.
  if ( is_hier )
  {
    FLASH_QR_UT_solve( A_test, T_test, b_test, x_test );
    FLASH_Obj_flatten( x_test, x );

    FLASH_Obj_free( &A_test );
    FLASH_Obj_free( &T_test );
    FLASH_Obj_free( &b_test );
    FLASH_Obj_free( &x_test );
  }
  else
  {
    FLA_QR_UT_solve( A_test, T_test, b, x );
  }

  // Release the control tree created for the variants.
  if ( is_variant )
    libfla_test_qrut_cntl_free();

  // Performance of the fastest repeat.
  *perf = ( 2.0 * m * n * n - ( 2.0 / 3.0 ) * n * n * n ) / time_min / FLOPS_PER_UNIT_PERF;
  if ( FLA_Obj_is_complex( A ) )
    *perf *= 4.0;

  // Residual: b := A * x - b, y := A' * b, residual := || y ||.
  FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_ONE, A_save, x, FLA_MINUS_ONE, b );
  FLA_Gemv_external( FLA_CONJ_TRANSPOSE, FLA_ONE, A_save, b, FLA_ZERO, y );
  FLA_Nrm2_external( y, norm );
  FLA_Obj_extract_real_scalar( norm, residual );

  // Free the supporting flat objects.
  FLA_Obj_free( &x );
  FLA_Obj_free( &b );
  FLA_Obj_free( &y );
  FLA_Obj_free( &norm );
  FLA_Obj_free( &A_save );

  // Free the flat test matrices.
  FLA_Obj_free( &A );
  FLA_Obj_free( &T );
}
int main(int argc, char *argv[]) { int m_input, k_input, n_input, m, n, k, p_first, p_last, p_inc, p, nb_alg, nrepeats, variant, n_threads, n_thread_experiments, i, j, nvariants = N_VARIANTS; int n_threads_exp[64]; int n_threads_exp_m[64]; int n_threads_exp_k[64]; int n_threads_exp_n[64]; char *colors = "brkgmcbrkg"; char *ticks = "o+*xso+*xs"; char m_dim_desc[14]; char k_dim_desc[14]; char n_dim_desc[14]; char m_dim_tag[5]; char k_dim_tag[5]; char n_dim_tag[5]; double max_gflops=6.0; double dtime, gflops, diff, d_n; FLA_Obj A, B, C, Cref; /* Initialize FLAME */ FLA_Init( ); FLA_Task_partitioning_init(); fprintf( stdout, "%c number of repeats:", '%' ); scanf( "%d", &nrepeats ); fprintf( stdout, "%c %d\n", '%', nrepeats ); fprintf( stdout, "%c Enter blocking size:", '%' ); scanf( "%d", &nb_alg ); fprintf( stdout, "%c %d\n", '%', nb_alg ); fprintf( stdout, "%c enter problem size first, last, inc:", '%' ); scanf( "%d%d%d", &p_first, &p_last, &p_inc ); fprintf( stdout, "%c %d %d %d\n", '%', p_first, p_last, p_inc ); fprintf( stdout, "%c enter m k n (-1 means bind to problem size): ", '%' ); scanf( "%d%d%d", &m_input, &k_input, &n_input ); fprintf( stdout, "%c %d %d %d\n", '%', m_input, k_input, n_input ); fprintf( stdout, "%c enter variant or variant-permutation (1..6,13,31,15,35): ", '%' ); scanf( "%d", &variant ); fprintf( stdout, "%c %d\n", '%', variant ); fprintf( stdout, "%c enter number of thread experiments: ", '%' ); scanf( "%d", &n_thread_experiments ); fprintf( stdout, "%c %d\n", '%', n_thread_experiments ); fprintf( stdout, "%c enter t, t_m, t_k, and t_n for each experiment: ", '%' ); for( i = 0; i < n_thread_experiments; ++i ) scanf( "%d %d %d %d", &n_threads_exp[i], &n_threads_exp_m[i], &n_threads_exp_k[i], &n_threads_exp_n[i] ); fprintf( stdout, "\n" ); for( i = 0; i < n_thread_experiments; ++i ) fprintf( stdout, "%c %2d = %2d x %2d x %2d\n", '%', n_threads_exp[i], n_threads_exp_m[i], n_threads_exp_k[i], n_threads_exp_n[i] ); /* Delete all existing data 
structures */ fprintf( stdout, "\nclear all;\n\n" ); if ( m_input > 0 ) { sprintf( m_dim_desc, "m = %d", m_input ); sprintf( m_dim_tag, "m%dc", m_input); } else if( m_input < -1 ) { sprintf( m_dim_desc, "m = p/%d", -m_input ); sprintf( m_dim_tag, "m%dp", -m_input ); } else if( m_input == -1 ) { sprintf( m_dim_desc, "m = p" ); sprintf( m_dim_tag, "m%dp", 1 ); } if ( k_input > 0 ) { sprintf( k_dim_desc, "k = %d", k_input ); sprintf( k_dim_tag, "k%dc", k_input); } else if( k_input < -1 ) { sprintf( k_dim_desc, "k = p/%d", -k_input ); sprintf( k_dim_tag, "k%dp", -k_input ); } else if( k_input == -1 ) { sprintf( k_dim_desc, "k = p" ); sprintf( k_dim_tag, "k%dp", 1 ); } if ( n_input > 0 ) { sprintf( n_dim_desc, "n = %d", n_input ); sprintf( n_dim_tag, "n%dc", n_input); } else if( n_input < -1 ) { sprintf( n_dim_desc, "n = p/%d", -n_input ); sprintf( n_dim_tag, "n%dp", -n_input ); } else if( n_input == -1 ) { sprintf( n_dim_desc, "n = p" ); sprintf( n_dim_tag, "n%dp", 1 ); } for ( p = p_first, i = 1; p <= p_last; p += p_inc, i += 1 ) { m = m_input; k = k_input; n = n_input; if( m < 0 ) m = p / abs(m_input); if( k < 0 ) k = p / abs(k_input); if( n < 0 ) n = p / abs(n_input); /* Allocate space for the matrices */ FLA_Obj_create( FLA_DOUBLE, m, k, &A ); FLA_Obj_create( FLA_DOUBLE, k, n, &B ); FLA_Obj_create( FLA_DOUBLE, m, n, &C ); FLA_Obj_create( FLA_DOUBLE, m, n, &Cref ); /* Generate random matrices A, C */ FLA_Random_matrix( A ); FLA_Random_matrix( B ); FLA_Random_matrix( C ); FLA_Copy_external( C, Cref ); /* Time the reference implementation */ time_Gemm_nn( 0, FLA_ALG_REFERENCE, nrepeats, n, nb_alg, A, B, C, Cref, &dtime, &diff, &gflops ); fprintf( stdout, "data_REF( %d, 1:2 ) = [ %d %6.3lf ]; \n", i, p, gflops ); fflush( stdout ); for ( j = 0; j < n_thread_experiments; j++ ){ n_threads = n_threads_exp[j]; FLA_Task_partitioning_set( n_threads_exp[j], n_threads_exp_m[j], n_threads_exp_k[j], n_threads_exp_n[j] ); FLA_omp_set_num_threads( n_threads_exp[j] ); 
FLA_omp_set_num_stages( n_threads_exp_k[j] ); fprintf( stdout, "data_nth%d_%dx%dx%d( %d, 1:3 ) = [ %d ", n_threads, n_threads_exp_m[j], n_threads_exp_k[j], n_threads_exp_n[j], i, p ); fflush( stdout ); //time_Gemm_nn( variant, FLA_ALG_OPENMP_BVAR, nrepeats, n, nb_alg, time_Gemm_nn( variant, FLA_ALG_OPENMP_CVAR, nrepeats, p, nb_alg, A, B, C, Cref, &dtime, &diff, &gflops ); fprintf( stdout, "%6.3lf %6.2le ", gflops, diff ); fflush( stdout ); fprintf( stdout, " ]; \n" ); fflush( stdout ); } FLA_Obj_free( &A ); FLA_Obj_free( &B ); FLA_Obj_free( &C ); FLA_Obj_free( &Cref ); fprintf( stdout, "\n" ); } /* Print the MATLAB commands to plot the data */ /* Delete all existing figures */ fprintf( stdout, "figure;\n" ); /* Plot the performance of the reference implementation */ //fprintf( stdout, "plot( data_REF( :,1 ), data_REF( :, 2 ), '-' ); \n" ); /* Indicate that you want to add to the existing plot */ fprintf( stdout, "hold on;\n" ); /* Plot the data for the other numbers of threads */ for ( i = 0; i < n_thread_experiments; i++ ){ fprintf( stdout, "plot( data_nth%d_%dx%dx%d( :,1 ), data_nth%d_%dx%dx%d( :, 2 ), '%c:%c' ); \n", n_threads_exp[ i ], n_threads_exp_m[i], n_threads_exp_k[i], n_threads_exp_n[i], n_threads_exp[ i ], n_threads_exp_m[i], n_threads_exp_k[i], n_threads_exp_n[i], colors[ i ], ticks[ i ] ); } fprintf( stdout, "legend( ... \n" ); for ( i = 0; i < n_thread_experiments-1; i++ ) fprintf( stdout, "'n\\_threads %d=%dx%dx%d', ... \n", n_threads_exp[ i ], n_threads_exp_m[i], n_threads_exp_k[i], n_threads_exp_n[i] ); i = n_thread_experiments-1; fprintf( stdout, "'n\\_threads %d=%dx%dx%d', 2 ); \n", n_threads_exp[ i ], n_threads_exp_m[i], n_threads_exp_k[i], n_threads_exp_n[i] ); fprintf( stdout, "xlabel( 'problem size p' );\n" ); fprintf( stdout, "ylabel( 'GFLOPS/sec.' 
);\n" ); fprintf( stdout, "axis( [ 0 %d 0 %.2f ] ); \n", p_last, n_threads_exp[n_thread_experiments-1] * max_gflops ); fprintf( stdout, "title( 'OpenFLAME gemm\\_nn\\_var%d performance (%s, %s, %s)' );\n", variant, m_dim_desc, k_dim_desc, n_dim_desc ); fprintf( stdout, "print -depsc gemm_nn_ompfac_var%d_%s_%s_%s.eps\n", variant, m_dim_tag, k_dim_tag, n_dim_tag ); fprintf( stdout, "hold off;\n"); fflush( stdout ); FLA_Finalize( ); }
/*
 * Timing driver for Cholesky factorization variants.
 *
 * Reads from stdin: the number of repeats, the maximum attainable GFLOPS
 * (used for the y-axis of the plot), the algorithmic block size, and the
 * matrix-size sweep nfirst, nlast, ninc together with nlast_unb, the
 * largest size for which the unblocked variants are timed.
 *
 * For every size n it times the reference implementation, FLA_Chol, and
 * three unblocked plus three blocked variants, printing one MATLAB/Octave
 * assignment per measurement, and finally prints the commands that plot
 * the collected data.  Each TIME_*_VAR* macro selects whether the
 * corresponding variant or REF_Chol is run in that slot.
 *
 * The program terminates through exit( 0 ).
 */
int main(int argc, char *argv[])
{
  int n, nfirst, nlast, ninc, nlast_unb, i, irep, nrepeats, nb_alg;

  double dtime, dtime_best, gflops, max_gflops, diff, d_n;

  FLA_Obj A, Aref, Aold, delta;

  /* Initialize FLAME */
  FLA_Init( );

  /* Every time trial is repeated "repeat" times and the fastest run is
     recorded */
  printf( "%% number of repeats:" );
  scanf( "%d", &nrepeats );
  printf( "%% %d\n", nrepeats );

  /* Enter the max GFLOPS attainable.
     This is used to set the y-axis range for the graphs. Here is how
     you figure out what to enter (on Linux machines):
     1) more /proc/cpuinfo (this lists the contents of this file).
     2) read through this and figure out the clock rate of the machine
        (in GHz).
     3) Find out (from an expert or from the web) the number of floating
        point instructions that can be performed per core per clock cycle.
     4) Figure out if you are using "multithreaded BLAS" which
        automatically parallelize calls to the Basic Linear Algebra
        Subprograms. If so, check how many cores are available.
     5) Multiply 2) x 3) x 4) and enter this in response to the below.

     If you enter a value for max GFLOPS that is lower than the maximum
     that is observed in the experiments, then the top of the graph is set
     to the observed maximum. Thus, one possibility is to simply set this
     to 0.0. */
  printf( "%% enter max GFLOPS:" );
  scanf( "%lf", &max_gflops );
  printf( "%% %lf\n", max_gflops );

  /* Enter the algorithmic block size */
  printf( "%% enter nb_alg:" );
  scanf( "%d", &nb_alg );
  printf( "%% %d\n", nb_alg );

  /* Timing trials for matrix sizes n=nfirst to nlast in increments
     of ninc will be performed. Unblocked versions are only tested to
     nlast_unb */
  printf( "%% enter nfirst, nlast, ninc, nlast_unb:" );
  scanf( "%d%d%d%d", &nfirst, &nlast, &ninc, &nlast_unb );
  printf( "%% %d %d %d %d\n", nfirst, nlast, ninc, nlast_unb );

  /* i is the 1-based row index of the emitted data_* arrays */
  i = 1;
  for ( n=nfirst; n<= nlast; n+=ninc ){

    /* Allocate space for the matrices */
    FLA_Obj_create( FLA_DOUBLE, n, n, 1, n, &A );
    FLA_Obj_create( FLA_DOUBLE, n, n, 1, n, &Aref );
    FLA_Obj_create( FLA_DOUBLE, n, n, 1, n, &Aold );
    FLA_Obj_create( FLA_DOUBLE, 1, 1, 1, 1, &delta );

    /* Generate random matrix A and save in Aold */
    FLA_Random_matrix( Aold );

    /* Add something large to the diagonal to make sure it isn't
       ill-conditioned */
    d_n = ( double ) n;
    *( ( double * ) FLA_Obj_buffer_at_view( delta ) ) = d_n;
    FLA_Shift_diag( FLA_NO_CONJUGATE, delta, Aold );

    /* Set gflops = billions of floating point operations that will be
       performed */
    gflops = 1.0/3.0 * n * n * n * 1.0e-09;

    /* Time the reference implementation.  The braces below always execute;
       the size guard between the preprocessor lines is commented out.
       Aref keeps the reference result that every variant is compared
       against further down. */
#if TIME_LAPACK == TRUE
#else
    //    if ( n <= nlast_unb )
#endif
    {
      for ( irep=0; irep<nrepeats; irep++ ){
        FLA_Copy( Aold, Aref );

        dtime = FLA_Clock();

        REF_Chol( TIME_LAPACK, Aref, nb_alg );

        dtime = FLA_Clock() - dtime;

        /* Keep the fastest of the repeats */
        if ( irep == 0 )
          dtime_best = dtime;
        else
          dtime_best = ( dtime < dtime_best ? dtime : dtime_best );
      }

      printf( "data_REF( %d, 1:2 ) = [ %d %le ];\n", i, n, gflops / dtime_best );
      fflush( stdout );
    }

    /* Time FLA_Chol */
    for ( irep=0; irep<nrepeats; irep++ ){
      FLA_Copy( Aold, A );

      dtime = FLA_Clock();

      FLA_Chol( FLA_LOWER_TRIANGULAR, A );

      dtime = FLA_Clock() - dtime;

      if ( irep == 0 )
        dtime_best = dtime;
      else
        dtime_best = ( dtime < dtime_best ? dtime : dtime_best );
    }

    printf( "data_FLAME( %d, 1:2 ) = [ %d %le ];\n", i, n, gflops / dtime_best );

    /* Raise the plot ceiling if FLA_Chol exceeded the entered maximum */
    if ( gflops / dtime_best > max_gflops )
      max_gflops = gflops / dtime_best;

    fflush( stdout );

    /* Time your implementations.  Each section below restores A from Aold,
       times one variant, and reports its rate together with the maximum
       element-wise difference from Aref. */

    /* Variant 1 unblocked */
    if ( n <= nlast_unb ){
      for ( irep=0; irep<nrepeats; irep++ ){
        FLA_Copy( Aold, A );

        dtime = FLA_Clock();

#if TIME_UNB_VAR1 == TRUE
        Chol_unb_var1( A );
#else
        REF_Chol( TIME_LAPACK, A, nb_alg );
#endif

        dtime = FLA_Clock() - dtime;

        if ( irep == 0 )
          dtime_best = dtime;
        else
          dtime_best = ( dtime < dtime_best ? dtime : dtime_best );
      }

      diff = FLA_Max_elemwise_diff( A, Aref );

      printf( "data_unb_var1( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff );
      fflush( stdout );
    }

    /* Variant 1 blocked */
    for ( irep=0; irep<nrepeats; irep++ ){
      FLA_Copy( Aold, A );

      dtime = FLA_Clock();

#if TIME_BLK_VAR1 == TRUE
      Chol_blk_var1( A, nb_alg );
#else
      REF_Chol( TIME_LAPACK, A, nb_alg );
#endif

      dtime = FLA_Clock() - dtime;

      if ( irep == 0 )
        dtime_best = dtime;
      else
        dtime_best = ( dtime < dtime_best ? dtime : dtime_best );
    }

    diff = FLA_Max_elemwise_diff( A, Aref );

    printf( "data_blk_var1( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff );
    fflush( stdout );

    /* Variant 2 unblocked */
    if ( n <= nlast_unb ){
      for ( irep=0; irep<nrepeats; irep++ ){
        FLA_Copy( Aold, A );

        dtime = FLA_Clock();

#if TIME_UNB_VAR2 == TRUE
        Chol_unb_var2( A );
#else
        REF_Chol( TIME_LAPACK, A, nb_alg );
#endif

        dtime = FLA_Clock() - dtime;

        if ( irep == 0 )
          dtime_best = dtime;
        else
          dtime_best = ( dtime < dtime_best ? dtime : dtime_best );
      }

      diff = FLA_Max_elemwise_diff( A, Aref );

      printf( "data_unb_var2( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff );
      fflush( stdout );
    }

    /* Variant 2 blocked */
    for ( irep=0; irep<nrepeats; irep++ ){
      FLA_Copy( Aold, A );

      dtime = FLA_Clock();

#if TIME_BLK_VAR2 == TRUE
      Chol_blk_var2( A, nb_alg );
#else
      REF_Chol( TIME_LAPACK, A, nb_alg );
#endif

      dtime = FLA_Clock() - dtime;

      if ( irep == 0 )
        dtime_best = dtime;
      else
        dtime_best = ( dtime < dtime_best ? dtime : dtime_best );
    }

    diff = FLA_Max_elemwise_diff( A, Aref );

    printf( "data_blk_var2( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff );
    fflush( stdout );

    /* Variant 3 unblocked */
    if ( n <= nlast_unb ){
      for ( irep=0; irep<nrepeats; irep++ ){
        FLA_Copy( Aold, A );

        dtime = FLA_Clock();

#if TIME_UNB_VAR3 == TRUE
        Chol_unb_var3( A );
#else
        REF_Chol( TIME_LAPACK, A, nb_alg );
#endif

        dtime = FLA_Clock() - dtime;

        if ( irep == 0 )
          dtime_best = dtime;
        else
          dtime_best = ( dtime < dtime_best ? dtime : dtime_best );
      }

      diff = FLA_Max_elemwise_diff( A, Aref );

      printf( "data_unb_var3( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff );
      fflush( stdout );
    }

    /* Variant 3 blocked */
    for ( irep=0; irep<nrepeats; irep++ ){
      FLA_Copy( Aold, A );

      dtime = FLA_Clock();

#if TIME_BLK_VAR3 == TRUE
      Chol_blk_var3( A, nb_alg );
#else
      REF_Chol( TIME_LAPACK, A, nb_alg );
#endif

      dtime = FLA_Clock() - dtime;

      if ( irep == 0 )
        dtime_best = dtime;
      else
        dtime_best = ( dtime < dtime_best ? dtime : dtime_best );
    }

    diff = FLA_Max_elemwise_diff( A, Aref );

    printf( "data_blk_var3( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff );
    fflush( stdout );

    FLA_Obj_free( &A );
    FLA_Obj_free( &Aold );
    FLA_Obj_free( &Aref );
    FLA_Obj_free( &delta );
    printf( "\n" );

    i++;
  }

  /* Print the MATLAB commands to plot the data */

  /* Delete all existing figures */
  printf( "close all\n" );

#if OCTAVE == TRUE
  /* Plot the performance of FLAME */
  printf( "plot( data_FLAME( :,1 ), data_FLAME( :, 2 ), '-k;libflame;' ); \n" );

  /* Indicate that you want to add to the existing plot */
  printf( "hold on\n" );

  /* Plot the performance of the reference implementation */
  printf( "plot( data_REF( :,1 ), data_REF( :, 2 ), '-m;reference;' ); \n" );

  /* Plot the performance of your implementations */
  printf( "plot( data_unb_var1( :,1 ), data_unb_var1( :, 2 ), \"-rx;UnbVar1;\" ); \n" );
  printf( "plot( data_unb_var2( :,1 ), data_unb_var2( :, 2 ), \"-go;UnbVar2;\" ); \n" );
  printf( "plot( data_unb_var3( :,1 ), data_unb_var3( :, 2 ), \"-b*;UnbVar3;\" ); \n" );
  printf( "plot( data_blk_var1( :,1 ), data_blk_var1( :, 2 ), \"-rx;BlkVar1;\", \"markersize\", 3 ); \n" );
  printf( "plot( data_blk_var2( :,1 ), data_blk_var2( :, 2 ), \"-go;BlkVar2;\", \"markersize\", 3 ); \n" );
  printf( "plot( data_blk_var3( :,1 ), data_blk_var3( :, 2 ), \"-b*;BlkVar3;\", \"markersize\", 3 ); \n" );
#else
  /* Plot the performance of FLAME */
  printf( "plot( data_FLAME( :,1 ), data_FLAME( :, 2 ), 'k--' ); \n" );

  /* Indicate that you want to add to the existing plot */
  printf( "hold on\n" );

  /* Plot the performance of the reference implementation */
  printf( "plot( data_REF( :,1 ), data_REF( :, 2 ), 'k-' ); \n" );

  /* Plot the performance of your implementations */
  printf( "plot( data_unb_var1( :,1 ), data_unb_var1( :, 2 ), 'r-.x' ); \n" );
  printf( "plot( data_unb_var2( :,1 ), data_unb_var2( :, 2 ), 'g-.o' ); \n" );
  printf( "plot( data_unb_var3( :,1 ), data_unb_var3( :, 2 ), 'b-.*' ); \n" );
  printf( "plot( data_blk_var1( :,1 ), data_blk_var1( :, 2 ), 'r-x'); \n" );
  printf( "plot( data_blk_var2( :,1 ), data_blk_var2( :, 2 ), 'g-o'); \n" );
  printf( "plot( data_blk_var3( :,1 ), data_blk_var3( :, 2 ), 'b-*'); \n" );
#endif

  printf( "hold off \n");

  printf( "xlabel( 'matrix dimension m=n' );\n");
  printf( "ylabel( 'GFLOPS/sec.' );\n");
  printf( "axis( [ 0 %d 0 %3.1f ] ); \n", nlast, max_gflops );

#if OCTAVE == TRUE
  printf( "legend( 2 ); \n" );
  printf(" print -landscape -solid -color -deps -F:24 Chol.eps\n" );
#else
  printf( "legend( 'FLA Chol', ...\n");
  printf( " 'Simple loops', ...\n");
  printf( " 'unb var1', ...\n");
  printf( " 'unb var2', ...\n");
  printf( " 'unb var3', ...\n");
  printf( " 'blk var1', ...\n");
  printf( " 'blk var2', ...\n");
  printf( " 'blk var3', 2);\n");
  printf( "print -r100 -dpdf Chol.pdf\n");
#endif

  FLA_Finalize( );

  exit( 0 );
}
/*
 * Compares a bidiagonal reduction done by libflame with one done by LAPACK
 * and then runs the LAPACK bidiagonal QR iteration on the result.
 *
 * Usage: <prog> m is_flame
 *   m        : order of the square test matrix
 *   is_flame : non-zero - reduce with FLA_Bidiag_UT,
 *              0        - reduce with sgebrd_/sorgbr_
 *
 * Whichever path ran, its d, e, U and V buffers are handed to sbdsqr_ and
 * the resulting objects are printed.
 *
 * NOTE(review): every LAPACK routine called here is the single-precision
 * real one (s-prefix), so testtype/TESTTYPE presumably have to be
 * float/FLA_FLOAT - confirm against the definitions outside this block.
 *
 * Returns -1 after printing the usage text when the argument count is
 * wrong, and 0 immediately when m is 0.
 */
int main( int argc, char** argv )
{
  FLA_Datatype datatype = TESTTYPE;
  FLA_Obj A, A_flame, A_lapack, C;
  int m;
  FLA_Error init_result;

  // Objects used by the libflame path.
  FLA_Obj TU, TV, U_flame, V_flame, d_flame, e_flame, B_flame;

  // Objects used by the LAPACK path.
  FLA_Obj tauq, taup, d_lapack, e_lapack, U_lapack, V_lapack, W, B_lapack;

  // Raw buffers passed to the Fortran routines. When is_flame is set, the
  // d/e/U/V pointers are redirected to the libflame objects before sbdsqr_.
  testtype *buff_tauq, *buff_taup, *buff_d_lapack, *buff_e_lapack, *buff_W,
           *buff_A_lapack, *buff_U_lapack, *buff_V_lapack;
  int lwork, info, is_flame;

  if ( argc == 3 ) {
    m = atoi(argv[1]);
    is_flame = atoi(argv[2]);
  } else {
    fprintf(stderr, " \n");
    fprintf(stderr, "Usage: %s m is_flame\n", argv[0]);
    fprintf(stderr, " m : matrix length\n");
    fprintf(stderr, " is_flame : 1 yes, 0 no\n");
    fprintf(stderr, " \n");
    return -1;
  }
  if ( m == 0 )
    return 0;

  FLA_Init_safe( &init_result );

  fprintf( stdout, "lapack2flame: %d x %d: \n", m, m);

  // Random m x m input plus one working copy per path.
  FLA_Obj_create( datatype, m, m, 0, 0, &A );
  FLA_Random_matrix( A );
  FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &A_flame );
  FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &A_lapack );

  // C is created and randomized here and only freed at the end.
  FLA_Obj_create( datatype, m, m, 0, 0, &C );
  FLA_Random_matrix( C );

  if ( is_flame ) {
    fprintf( stdout, " flame executed\n");

    // Reduce A_flame in place, then form U and V from copies of the result.
    FLA_Bidiag_UT_create_T( A_flame, &TU, &TV );
    FLA_Bidiag_UT( A_flame, TU, TV );

    FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A_flame, &U_flame );
    FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A_flame, &V_flame );

    FLA_Bidiag_UT_form_U( U_flame, TU, U_flame );
    FLA_Bidiag_UT_form_V( V_flame, TV, V_flame );

    // Diagonal (length m) and off-diagonal (length m - 1) of the result.
    FLA_Obj_create( datatype, m, 1, 0, 0, &d_flame );
    FLA_Obj_create( datatype, m - 1, 1, 0, 0, &e_flame );
    FLA_Bidiag_UT_extract_diagonals( A_flame, d_flame, e_flame );

    // Assemble B from d on the diagonal of B and e on the diagonal of the
    // BTR partition.
    FLA_Obj_create( datatype, m, m, 0, 0, &B_flame );
    FLA_Set( FLA_ZERO, B_flame );
    {
      FLA_Obj BTL, BTR, BBL, BBR;
      FLA_Part_2x2( B_flame, &BTL, &BTR, &BBL, &BBR, 1,1, FLA_BL );
      FLA_Set_diagonal_matrix( d_flame, B_flame );
      FLA_Set_diagonal_matrix( e_flame, BTR );
    }

    if (1) {
      fprintf( stdout, " - FLAME ----------\n");
      FLA_Obj_fshow( stdout, " - Given A - ", A, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - A - ", A_flame, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - U - ", U_flame, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - V - ", V_flame, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - d - ", d_flame, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - e - ", e_flame, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - B - ", B_flame, "% 6.4e", "------");
    }
  } else {
    fprintf( stdout, " lapack executed\n");

    FLA_Obj_create( datatype, m, 1, 0, 0, &tauq );
    FLA_Obj_create( datatype, m, 1, 0, 0, &taup );
    FLA_Obj_create( datatype, m, 1, 0, 0, &d_lapack );
    FLA_Obj_create( datatype, m - 1, 1, 0, 0, &e_lapack );

    buff_A_lapack = (testtype*)FLA_Obj_buffer_at_view( A_lapack );
    buff_tauq = (testtype*)FLA_Obj_buffer_at_view( tauq );
    buff_taup = (testtype*)FLA_Obj_buffer_at_view( taup );
    buff_d_lapack = (testtype*)FLA_Obj_buffer_at_view( d_lapack );
    buff_e_lapack = (testtype*)FLA_Obj_buffer_at_view( e_lapack );

    // Workspace of 32*m elements, shared by sgebrd_ and both sorgbr_ calls.
    lwork = 32*m;

    FLA_Obj_create( datatype, lwork, 1, 0, 0, &W );
    buff_W = (testtype*)FLA_Obj_buffer_at_view( W );

    // Bidiagonal reduction; operates on the A_lapack buffer in place.
    sgebrd_( &m, &m, buff_A_lapack, &m, buff_d_lapack, buff_e_lapack,
             buff_tauq, buff_taup, buff_W, &lwork, &info );

    // U and V start as copies of the reduced A_lapack and are then
    // overwritten by sorgbr_ ("Q" for U, "P" for V).
    FLA_Obj_create( datatype, m, m, 0, 0, &U_lapack );
    FLA_Obj_create( datatype, m, m, 0, 0, &V_lapack );

    FLA_Copy( A_lapack, U_lapack );
    FLA_Copy( A_lapack, V_lapack );

    buff_U_lapack = (testtype*)FLA_Obj_buffer_at_view( U_lapack );
    buff_V_lapack = (testtype*)FLA_Obj_buffer_at_view( V_lapack );

    sorgbr_( "Q", &m, &m, &m, buff_U_lapack, &m, buff_tauq, buff_W, &lwork, &info );
    sorgbr_( "P", &m, &m, &m, buff_V_lapack, &m, buff_taup, buff_W, &lwork, &info );

    // Assemble B the same way as in the libflame path.
    FLA_Obj_create( datatype, m, m, 0, 0, &B_lapack );
    FLA_Set( FLA_ZERO, B_lapack );
    {
      FLA_Obj BTL, BTR, BBL, BBR;
      FLA_Part_2x2( B_lapack, &BTL, &BTR, &BBL, &BBR, 1,1, FLA_BL );
      FLA_Set_diagonal_matrix( d_lapack, B_lapack );
      FLA_Set_diagonal_matrix( e_lapack, BTR );
    }

    FLA_Obj_free( &W );

    if (1) {
      fprintf( stdout, " - LAPACK ----------\n");
      FLA_Obj_fshow( stdout, " - Given A - ", A, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - A - ", A_lapack, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - U - ", U_lapack, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - V - ", V_lapack, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - d - ", d_lapack, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - e - ", e_lapack, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - B - ", B_lapack, "% 6.4e", "------");
    }
  }

  // Bidiagonal QR iteration on whichever set of d/e/U/V was produced above.
  {
    testtype dummy;
    int zero = 0, one = 1;
    FLA_Obj D_lapack;

    FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &D_lapack );
    FLA_Set( FLA_ZERO, D_lapack );

    // On the libflame path the raw pointers were never set; point them at
    // the libflame objects so the sbdsqr_ call below is path-independent.
    if ( is_flame ) {
      buff_d_lapack = (testtype*)FLA_Obj_buffer_at_view( d_flame );
      buff_e_lapack = (testtype*)FLA_Obj_buffer_at_view( e_flame );
      buff_U_lapack = (testtype*)FLA_Obj_buffer_at_view( U_flame );
      buff_V_lapack = (testtype*)FLA_Obj_buffer_at_view( V_flame );
    }

    // Fresh workspace of 4*m elements for sbdsqr_.
    FLA_Obj_create( datatype, 4*m, 1, 0, 0, &W );
    buff_W = (testtype*)FLA_Obj_buffer_at_view( W );

    // The zero count and the dummy/one pair are passed in the C-matrix
    // argument positions; d, e, V and U are updated through their buffers.
    sbdsqr_( "U", &m, &m, &m, &zero, buff_d_lapack, buff_e_lapack,
             buff_V_lapack, &m, buff_U_lapack, &m, &dummy, &one,
             buff_W, &info );

    FLA_Obj_free( &W );

    if (info != 0)
      printf( " Error info = %d\n", info );

    // Place the updated d on the diagonal of D for display.
    if ( is_flame )
      FLA_Set_diagonal_matrix( d_flame, D_lapack );
    else
      FLA_Set_diagonal_matrix( d_lapack, D_lapack );

    if ( is_flame ) {
      fprintf( stdout, " - FLAME ----------\n");
      FLA_Obj_fshow( stdout, " - U - ", U_flame, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - V - ", V_flame, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - d - ", d_flame, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - e - ", e_flame, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - D - ", D_lapack, "% 6.4e", "------");
    } else {
      fprintf( stdout, " - LAPACK ----------\n");
      FLA_Obj_fshow( stdout, " - U - ", U_lapack, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - V - ", V_lapack, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - d - ", d_lapack, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - e - ", e_lapack, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - D - ", D_lapack, "% 6.4e", "------");
    }
    FLA_Obj_free( &D_lapack );
  }

  // Free only the objects that the chosen path created.
  if ( is_flame ) {
    FLA_Obj_free( &TU );
    FLA_Obj_free( &TV );
    FLA_Obj_free( &U_flame );
    FLA_Obj_free( &V_flame );
    FLA_Obj_free( &d_flame );
    FLA_Obj_free( &e_flame );
    FLA_Obj_free( &B_flame );
  } else {
    FLA_Obj_free( &tauq );
    FLA_Obj_free( &taup );
    FLA_Obj_free( &d_lapack );
    FLA_Obj_free( &e_lapack );
    FLA_Obj_free( &U_lapack );
    FLA_Obj_free( &V_lapack );
    FLA_Obj_free( &B_lapack );
  }

  FLA_Obj_free( &A );
  FLA_Obj_free( &A_flame );
  FLA_Obj_free( &A_lapack );
  FLA_Obj_free( &C );

  FLA_Finalize_safe( init_result );
}
int main( int argc, char *argv[] ) { int i, j, n_threads, n_repeats, n_trials, increment, begin, sorting, caching, work_stealing, data_affinity; dim_t size, nb_alg; FLA_Datatype datatype = FLA_DOUBLE; FLA_Inv inv = FLA_NO_INVERSE; FLA_Uplo uplo = FLA_LOWER_TRIANGULAR; FLA_Obj A, B, x, b, b_norm, AH, BH; double length, b_norm_value = 0.0, dtime, *dtimes, *flops; #ifndef FLA_ENABLE_WINDOWS_BUILD char output_file_m[100]; FILE *fpp; #endif fprintf( stdout, "%c Enter number of repeats: ", '%' ); scanf( "%d", &n_repeats ); fprintf( stdout, "%c %d\n", '%', n_repeats ); fprintf( stdout, "%c Enter blocksize: ", '%' ); scanf( "%u", &nb_alg ); fprintf( stdout, "%c %u\n", '%', nb_alg ); fprintf( stdout, "%c Enter problem size parameters: first, inc, num: ", '%' ); scanf( "%d%d%d", &begin, &increment, &n_trials ); fprintf( stdout, "%c %d %d %d\n", '%', begin, increment, n_trials ); fprintf( stdout, "%c Enter number of threads: ", '%' ); scanf( "%d", &n_threads ); fprintf( stdout, "%c %d\n", '%', n_threads ); fprintf( stdout, "%c Enter SuperMatrix parameters: sorting, caching, work stealing, data affinity: ", '%' ); scanf( "%d%d%d%d", &sorting, &caching, &work_stealing, &data_affinity ); fprintf( stdout, "%c %s %s %s %s\n\n", '%', ( sorting ? "TRUE" : "FALSE" ), ( caching ? "TRUE" : "FALSE" ), ( work_stealing ? "TRUE" : "FALSE" ), ( data_affinity ? ( data_affinity == 1 ? 
"FLASH_QUEUE_AFFINITY_2D_BLOCK_CYCLIC" : "FLASH_QUEUE_AFFINITY_OTHER" ) : "FLASH_QUEUE_AFFINITY_NONE" ) ); #ifdef FLA_ENABLE_WINDOWS_BUILD fprintf( stdout, "%s_%u = [\n", OUTPUT_FILE, nb_alg ); #else sprintf( output_file_m, "%s/%s_output.m", OUTPUT_PATH, OUTPUT_FILE ); fpp = fopen( output_file_m, "a" ); fprintf( fpp, "%%\n" ); fprintf( fpp, "%% | Matrix Size | FLASH |\n" ); fprintf( fpp, "%% | n x n | GFlops |\n" ); fprintf( fpp, "%% -----------------------------\n" ); fprintf( fpp, "%s_%u = [\n", OUTPUT_FILE, nb_alg ); #endif FLA_Init(); dtimes = ( double * ) FLA_malloc( n_repeats * sizeof( double ) ); flops = ( double * ) FLA_malloc( n_trials * sizeof( double ) ); FLASH_Queue_set_num_threads( n_threads ); FLASH_Queue_set_sorting( sorting ); FLASH_Queue_set_caching( caching ); FLASH_Queue_set_work_stealing( work_stealing ); FLASH_Queue_set_data_affinity( data_affinity ); for ( i = 0; i < n_trials; i++ ) { size = begin + i * increment; FLA_Obj_create( datatype, size, size, 0, 0, &A ); FLA_Obj_create( datatype, size, size, 0, 0, &B ); FLA_Obj_create( datatype, size, 1, 0, 0, &x ); FLA_Obj_create( datatype, size, 1, 0, 0, &b ); FLA_Obj_create( datatype, 1, 1, 0, 0, &b_norm ); for ( j = 0; j < n_repeats; j++ ) { FLA_Random_matrix( A ); FLA_Random_matrix( B ); FLA_Random_matrix( x ); FLA_Random_matrix( b ); FLA_Symmetrize( uplo, A ); FLA_Symmetrize( uplo, B ); length = ( double ) FLA_Obj_length( B ); FLA_Add_to_diag( &length, B ); FLA_Symv_external( uplo, FLA_ONE, B, x, FLA_ZERO, b ); FLASH_Obj_create_hier_copy_of_flat( A, 1, &nb_alg, &AH ); FLASH_Obj_create_hier_copy_of_flat( B, 1, &nb_alg, &BH ); FLASH_Chol( uplo, BH ); dtime = FLA_Clock(); FLASH_Eig_gest( inv, uplo, AH, BH ); dtime = FLA_Clock() - dtime; dtimes[j] = dtime; FLASH_Obj_free( &AH ); FLASH_Obj_free( &BH ); } dtime = dtimes[0]; for ( j = 1; j < n_repeats; j++ ) dtime = min( dtime, dtimes[j] ); flops[i] = 1.0 * size * size * size / dtime / 1e9; #ifdef FLA_ENABLE_WINDOWS_BUILD fprintf( stdout, " %d %6.3f 
%le\n", size, flops[i], b_norm_value ); #else fprintf( fpp, " %d %6.3f\n", size, flops[i] ); fprintf( stdout, "Time: %e | GFlops: %6.3f\n", dtime, flops[i] ); fprintf( stdout, "Matrix size: %u x %u | nb_alg: %u\n", size, size, nb_alg ); fprintf( stdout, "Norm of difference: %le\n\n", b_norm_value ); #endif FLA_Obj_free( &A ); FLA_Obj_free( &B ); FLA_Obj_free( &x ); FLA_Obj_free( &b ); FLA_Obj_free( &b_norm ); } #ifdef FLA_ENABLE_WINDOWS_BUILD fprintf( stdout, "];\n\n" ); #else fprintf( fpp, "];\n" ); fflush( fpp ); fclose( fpp ); #endif FLA_free( dtimes ); FLA_free( flops ); FLA_Finalize(); return 0; }
int main(int argc, char *argv[]) { int datatype, m_input, m, p_first, p_last, p_inc, p, n_repeats, param_combo, i, n_param_combos = N_PARAM_COMBOS; FLA_Uplo uplo; FLA_Diag diag; char *colors = "brkgmcbrkg"; char *ticks = "o+*xso+*xs"; char m_dim_desc[14]; char m_dim_tag[10]; double max_gflops=6.0; double dtime, gflops, diff; FLA_Obj A, b, b_orig, norm; FLA_Init(); fprintf( stdout, "%c number of repeats:", '%' ); scanf( "%d", &n_repeats ); fprintf( stdout, "%c %d\n", '%', n_repeats ); fprintf( stdout, "%c enter problem size first, last, inc:", '%' ); scanf( "%d%d%d", &p_first, &p_last, &p_inc ); fprintf( stdout, "%c %d %d %d\n", '%', p_first, p_last, p_inc ); fprintf( stdout, "%c enter m (-1 means bind to problem size): ", '%' ); scanf( "%d", &m_input ); fprintf( stdout, "%c %d\n", '%', m_input ); fprintf( stdout, "\nclear all;\n\n" ); if ( m_input > 0 ) { sprintf( m_dim_desc, "m = %d", m_input ); sprintf( m_dim_tag, "m%dc", m_input); } else if( m_input < -1 ) { sprintf( m_dim_desc, "m = p/%d", -m_input ); sprintf( m_dim_tag, "m%dp", -m_input ); } else if( m_input == -1 ) { sprintf( m_dim_desc, "m = p" ); sprintf( m_dim_tag, "m%dp", 1 ); } //datatype = FLA_FLOAT; //datatype = FLA_DOUBLE; //datatype = FLA_COMPLEX; datatype = FLA_DOUBLE_COMPLEX; for ( p = p_first, i = 1; p <= p_last; p += p_inc, i += 1 ) { m = m_input; if( m < 0 ) m = p / abs(m_input); for ( param_combo = 0; param_combo < n_param_combos; param_combo++ ){ //FLA_Obj_create( datatype, m, m, 0, 0, &A ); FLA_Obj_create( datatype, m, m, m, 1, &A ); FLA_Obj_create( datatype, m, 1, 0, 0, &b ); FLA_Obj_create( datatype, m, 1, 0, 0, &b_orig ); if ( FLA_Obj_is_single_precision( A ) ) FLA_Obj_create( FLA_FLOAT, 1, 1, 0, 0, &norm ); else FLA_Obj_create( FLA_DOUBLE, 1, 1, 0, 0, &norm ); FLA_Param_map_netlib_to_flame_uplo( &pc_str[param_combo][0], &uplo ); FLA_Param_map_netlib_to_flame_diag( &pc_str[param_combo][1], &diag ); FLA_Random_tri_matrix( uplo, diag, A ); FLA_Random_matrix( b ); FLA_Copy_external( b, b_orig 
); fprintf( stdout, "data_trinv_%s( %d, 1:5 ) = [ %d ", pc_str[param_combo], i, p ); fflush( stdout ); /* time_Trinv( param_combo, FLA_ALG_REFERENCE, n_repeats, m, uplo, diag, A, b, b_orig, norm, &dtime, &diff, &gflops ); fprintf( stdout, "%6.3lf %6.2le ", gflops, diff ); fflush( stdout ); */ time_Trinv( param_combo, FLA_ALG_FRONT, n_repeats, m, uplo, diag, A, b, b_orig, norm, &dtime, &diff, &gflops ); fprintf( stdout, "%6.3lf %6.2le ", gflops, diff ); fflush( stdout ); fprintf( stdout, " ]; \n" ); fflush( stdout ); FLA_Obj_free( &A ); FLA_Obj_free( &b ); FLA_Obj_free( &b_orig ); FLA_Obj_free( &norm ); } fprintf( stdout, "\n" ); } /* fprintf( stdout, "figure;\n" ); fprintf( stdout, "hold on;\n" ); for ( i = 0; i < n_param_combos; i++ ) { fprintf( stdout, "plot( data_trinv_%s( :,1 ), data_trinv_%s( :, 2 ), '%c:%c' ); \n", pc_str[i], pc_str[i], colors[ i ], ticks[ i ] ); fprintf( stdout, "plot( data_trinv_%s( :,1 ), data_trinv_%s( :, 4 ), '%c-.%c' ); \n", pc_str[i], pc_str[i], colors[ i ], ticks[ i ] ); } fprintf( stdout, "legend( ... \n" ); for ( i = 0; i < n_param_combos; i++ ) fprintf( stdout, "'ref\\_trinv\\_%s', 'fla\\_trinv\\_%s', ... \n", pc_str[i], pc_str[i] ); fprintf( stdout, "'Location', 'SouthWest' ); \n" ); fprintf( stdout, "xlabel( 'problem size p' );\n" ); fprintf( stdout, "ylabel( 'GFLOPS/sec.' );\n" ); fprintf( stdout, "axis( [ 0 %d 0 %.2f ] ); \n", p_last, max_gflops ); fprintf( stdout, "title( 'FLAME trinv front-end performance (%s)' );\n", m_dim_desc ); fprintf( stdout, "print -depsc trinv_front_%s.eps\n", m_dim_tag ); fprintf( stdout, "hold off;\n"); fflush( stdout ); */ FLA_Finalize( ); return 0; }
/*
 * Runs one Symm experiment and reports its performance and residual.
 *
 * params     supplies b_flash (block size for the hierarchical copies) and
 *            b_alg_flat (passed to libfla_test_symm_cntl_create).
 * var        variant number handed to libfla_test_symm_cntl_create.
 * sc_str     storage characters; sc_str[0], sc_str[1] and sc_str[2] are
 *            passed to libfla_test_obj_create for A, B and C respectively.
 * datatype   datatype of all matrices and vectors created here.
 * p_cur      current problem size; both m and n are derived from it.
 * pci        index into pc_str; characters 0 and 1 of that entry select
 *            the side and uplo parameters.
 * n_repeats  number of timed repetitions; the minimum time is kept.
 * impl       implementation selector (FLA_TEST_* constant).
 * perf       out: flop count / best time / FLOPS_PER_UNIT_PERF.
 * residual   out: FLA_Max_elemwise_diff between y = C x and
 *            z = ( beta * C_orig + alpha * A * B ) x (side = left) or
 *            z = ( beta * C_orig + alpha * B * A ) x (side = right).
 */
void libfla_test_symm_experiment( test_params_t params,
                                  unsigned int  var,
                                  char*         sc_str,
                                  FLA_Datatype  datatype,
                                  unsigned int  p_cur,
                                  unsigned int  pci,
                                  unsigned int  n_repeats,
                                  signed int    impl,
                                  double*       perf,
                                  double*       residual )
{
  dim_t        b_flash    = params.b_flash;
  dim_t        b_alg_flat = params.b_alg_flat;
  double       time_min   = 1e9;
  double       time;
  double       flop_count;
  unsigned int i;
  unsigned int m;
  signed int   m_input    = -1;
  unsigned int n;
  signed int   n_input    = -1;
  int          is_hier;
  int          uses_cntl;
  FLA_Side     side;
  FLA_Uplo     uplo;
  FLA_Obj      A, B, C, x, y, z, w, norm;
  FLA_Obj      alpha, beta;
  FLA_Obj      C_save;
  FLA_Obj      A_test, B_test, C_test;

  // The hierarchical front-end works on hierarchical copies of the
  // operands; the flat variants need a control tree instead.
  is_hier   = ( impl == FLA_TEST_HIER_FRONT_END );
  uses_cntl = ( impl == FLA_TEST_FLAT_UNB_VAR ||
                impl == FLA_TEST_FLAT_OPT_VAR ||
                impl == FLA_TEST_FLAT_BLK_VAR ||
                impl == FLA_TEST_FLAT_UNB_EXT ||
                impl == FLA_TEST_FLAT_BLK_EXT );

  // Determine the dimensions.
  if ( m_input < 0 ) m = p_cur / abs(m_input);
  else               m = p_cur;
  if ( n_input < 0 ) n = p_cur / abs(n_input);
  else               n = p_cur;

  // Translate parameter characters to libflame constants.
  FLA_Param_map_char_to_flame_side( &pc_str[pci][0], &side );
  FLA_Param_map_char_to_flame_uplo( &pc_str[pci][1], &uplo );

  // Create the matrices for the current operation. A is m x m when it is
  // applied from the left and n x n when applied from the right; the
  // work vector w is sized to match.
  if ( side == FLA_LEFT )
  {
    libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[0], m, m, &A );

    // Create vectors for use in test.
    FLA_Obj_create( datatype, n, 1, 0, 0, &x );
    FLA_Obj_create( datatype, m, 1, 0, 0, &y );
    FLA_Obj_create( datatype, m, 1, 0, 0, &z );
    FLA_Obj_create( datatype, m, 1, 0, 0, &w );
  }
  else
  {
    libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[0], n, n, &A );

    // Create vectors for use in test.
    FLA_Obj_create( datatype, n, 1, 0, 0, &x );
    FLA_Obj_create( datatype, m, 1, 0, 0, &y );
    FLA_Obj_create( datatype, m, 1, 0, 0, &z );
    FLA_Obj_create( datatype, n, 1, 0, 0, &w );
  }
  libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], m, n, &B );
  libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[2], m, n, &C );

  // Create a norm scalar. It is only needed by the alternative 2-norm
  // residual below, which is currently disabled.
  FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );

  // Initialize the test matrices.
  FLA_Random_symm_matrix( uplo, A );
  FLA_Random_matrix( B );
  FLA_Random_matrix( C );

  // Initialize the test vectors.
  FLA_Random_matrix( x );
  FLA_Set( FLA_ZERO, y );
  FLA_Set( FLA_ZERO, z );
  FLA_Set( FLA_ZERO, w );

  // Set constants.
  alpha = FLA_TWO;
  beta  = FLA_MINUS_ONE;

  // Save the original object contents in a temporary object.
  FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, C, &C_save );

  // Use hierarchical matrices if we're testing the FLASH front-end.
  if ( is_hier )
  {
    FLASH_Obj_create_hier_copy_of_flat( A, 1, &b_flash, &A_test );
    FLASH_Obj_create_hier_copy_of_flat( B, 1, &b_flash, &B_test );
    FLASH_Obj_create_hier_copy_of_flat( C, 1, &b_flash, &C_test );
  }
  else
  {
    A_test = A;
    B_test = B;
    C_test = C;
  }

  // Create a control tree for the individual variants.
  if ( uses_cntl )
    libfla_test_symm_cntl_create( var, b_alg_flat );

  // Repeat the experiment n_repeats times and record results.
  for ( i = 0; i < n_repeats; ++i )
  {
    // Restore C before every repetition so each run sees the same input.
    if ( is_hier )
      FLASH_Obj_hierarchify( C_save, C_test );
    else
      FLA_Copy_external( C_save, C_test );

    time = FLA_Clock();

    libfla_test_symm_impl( impl, side, uplo, alpha, A_test, B_test, beta, C_test );

    time = FLA_Clock() - time;
    time_min = min( time_min, time );
  }

  // Copy the solution back to the flat matrix C. In the non-hierarchical
  // case C_test and C refer to the same object, so nothing needs doing.
  if ( is_hier )
  {
    FLASH_Obj_flatten( C_test, C );
  }

  // Free the hierarchical matrices if we're testing the FLASH front-end.
  if ( is_hier )
  {
    FLASH_Obj_free( &A_test );
    FLASH_Obj_free( &B_test );
    FLASH_Obj_free( &C_test );
  }

  // Free the control trees if we're testing the variants.
  if ( uses_cntl )
    libfla_test_symm_cntl_free();

  // Compute the performance of the best experiment repeat. The product is
  // formed in double precision: m and n are unsigned int, so an integer
  // product would wrap around once m * m * n exceeds UINT_MAX.
  if ( side == FLA_LEFT )
    flop_count = 1.0 * m * m * n;
  else
    flop_count = 1.0 * m * n * n;

  *perf = flop_count / time_min / FLOPS_PER_UNIT_PERF;
  if ( FLA_Obj_is_complex( A ) ) *perf *= 4.0;

  // Compute:
  //   y = C * x
  // and compare to
  //   z = ( beta * C_orig + alpha * A * B ) x  (side = left)
  //   z = ( beta * C_orig + alpha * B * A ) x  (side = right)
  FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_ONE, C, x, FLA_ZERO, y );

  if ( side == FLA_LEFT )
  {
    FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_ONE, B, x, FLA_ZERO, w );
    FLA_Symv_external( uplo, alpha, A, w, FLA_ZERO, z );
  }
  else
  {
    FLA_Symv_external( uplo, FLA_ONE, A, x, FLA_ZERO, w );
    FLA_Gemv_external( FLA_NO_TRANSPOSE, alpha, B, w, FLA_ZERO, z );
  }
  FLA_Gemv_external( FLA_NO_TRANSPOSE, beta, C_save, x, FLA_ONE, z );

  // Compute || y - z ||.
  //FLA_Axpy_external( FLA_MINUS_ONE, y, z );
  //FLA_Nrm2_external( z, norm );
  //FLA_Obj_extract_real_scalar( norm, residual );
  *residual = FLA_Max_elemwise_diff( y, z );

  // Free the supporting flat objects.
  FLA_Obj_free( &C_save );

  // Free the flat test matrices.
  FLA_Obj_free( &A );
  FLA_Obj_free( &B );
  FLA_Obj_free( &C );
  FLA_Obj_free( &x );
  FLA_Obj_free( &y );
  FLA_Obj_free( &z );
  FLA_Obj_free( &w );
  FLA_Obj_free( &norm );
}
int main(int argc, char *argv[]) { int m_input, n_input, m, n, p_first, p_last, p_inc, p, nb_alg, variant, n_repeats, i, j, datatype, n_variants = 18; int sign; int blocksize[16]; char *colors = "brkgmcbrkg"; char *ticks = "o+*xso+*xs"; char m_dim_desc[14]; char n_dim_desc[14]; char m_dim_tag[10]; char n_dim_tag[10]; double max_gflops=6.0; double dtime, gflops, diff; FLA_Obj A, B, C, C_ref, scale, isgn, norm; /* Initialize FLAME */ FLA_Init(); fprintf( stdout, "%c number of repeats:", '%' ); scanf( "%d", &n_repeats ); fprintf( stdout, "%c %d\n", '%', n_repeats ); fprintf( stdout, "%c Enter blocking size:", '%' ); scanf( "%d", &nb_alg ); fprintf( stdout, "%c %d\n", '%', nb_alg ); fprintf( stdout, "%c enter problem size first, last, inc:", '%' ); scanf( "%d%d%d", &p_first, &p_last, &p_inc ); fprintf( stdout, "%c %d %d %d\n", '%', p_first, p_last, p_inc ); fprintf( stdout, "%c Enter sign (-1 or 1):", '%' ); scanf( "%d", &sign ); fprintf( stdout, "%c %d\n", '%', sign ); fprintf( stdout, "%c enter m n (-1 means bind to problem size): ", '%' ); scanf( "%d %d", &m_input, &n_input ); fprintf( stdout, "%c %d %d\n", '%', m_input, n_input ); /* Delete all existing data structures */ fprintf( stdout, "\nclear all;\n\n" ); if ( m_input > 0 ) { sprintf( m_dim_desc, "m = %d", m_input ); sprintf( m_dim_tag, "m%dc", m_input); } else if( m_input < -1 ) { sprintf( m_dim_desc, "m = p/%d", -m_input ); sprintf( m_dim_tag, "m%dp", -m_input ); } else if( m_input == -1 ) { sprintf( m_dim_desc, "m = p" ); sprintf( m_dim_tag, "m%dp", 1 ); } if ( n_input > 0 ) { sprintf( n_dim_desc, "n = %d", n_input ); sprintf( n_dim_tag, "n%dc", n_input); } else if( n_input < -1 ) { sprintf( n_dim_desc, "n = p/%d", -n_input ); sprintf( n_dim_tag, "n%dp", -n_input ); } else if( n_input == -1 ) { sprintf( n_dim_desc, "n = p" ); sprintf( n_dim_tag, "n%dp", 1 ); } if ( 0 < sign ) isgn = FLA_ONE; else isgn = FLA_MINUS_ONE; for ( p = p_first, i = 1; p <= p_last; p += p_inc, i += 1 ) { m = m_input; n = n_input; 
if( m < 0 ) m = p / abs(m_input); if( n < 0 ) n = p / abs(n_input); //datatype = FLA_FLOAT; //datatype = FLA_DOUBLE; //datatype = FLA_COMPLEX; datatype = FLA_DOUBLE_COMPLEX; FLA_Obj_create( datatype, m, m, 0, 0, &A ); FLA_Obj_create( datatype, n, n, 0, 0, &B ); FLA_Obj_create( datatype, m, n, 0, 0, &C ); FLA_Obj_create( datatype, m, n, 0, 0, &C_ref ); if ( datatype == FLA_DOUBLE || datatype == FLA_DOUBLE_COMPLEX ) { FLA_Obj_create( FLA_DOUBLE, 1, 1, 0, 0, &scale ); FLA_Obj_create( FLA_DOUBLE, 1, 1, 0, 0, &norm ); } else if ( datatype == FLA_FLOAT || datatype == FLA_COMPLEX ) { FLA_Obj_create( FLA_FLOAT, 1, 1, 0, 0, &scale ); FLA_Obj_create( FLA_FLOAT, 1, 1, 0, 0, &norm ); } FLA_Random_tri_matrix( FLA_UPPER_TRIANGULAR, FLA_NONUNIT_DIAG, A ); FLA_Random_tri_matrix( FLA_UPPER_TRIANGULAR, FLA_NONUNIT_DIAG, B ); FLA_Random_matrix( C ); FLA_Norm1( A, norm ); FLA_Shift_diag( FLA_NO_CONJUGATE, norm, A ); FLA_Norm1( B, norm ); if ( FLA_Obj_is( isgn, FLA_MINUS_ONE ) ) FLA_Negate( norm ); FLA_Shift_diag( FLA_NO_CONJUGATE, norm, B ); time_Sylv_nn( 0, FLA_ALG_REFERENCE, n_repeats, m, n, nb_alg, isgn, A, B, C, C_ref, scale, &dtime, &diff, &gflops ); fprintf( stdout, "data_REF( %d, 1:2 ) = [ %d %6.3lf ]; \n", i, p, gflops ); fflush( stdout ); for ( variant = 1; variant <= n_variants; variant++ ){ fprintf( stdout, "data_var%d( %d, 1:3 ) = [ %d ", variant, i, p ); fflush( stdout ); time_Sylv_nn( variant, FLA_ALG_UNB_OPT, n_repeats, m, n, nb_alg, isgn, A, B, C, C_ref, scale, &dtime, &diff, &gflops ); fprintf( stdout, "%6.3lf %6.2le ", gflops, diff ); fflush( stdout ); time_Sylv_nn( variant, FLA_ALG_BLOCKED, n_repeats, m, n, nb_alg, isgn, A, B, C, C_ref, scale, &dtime, &diff, &gflops ); fprintf( stdout, "%6.3lf %6.2le ", gflops, diff ); fflush( stdout ); fprintf( stdout, " ]; \n" ); fflush( stdout ); } FLA_Obj_free( &A ); FLA_Obj_free( &B ); FLA_Obj_free( &C ); FLA_Obj_free( &C_ref ); FLA_Obj_free( &scale ); FLA_Obj_free( &norm ); fprintf( stdout, "\n" ); } /* Print the MATLAB 
commands to plot the data */ /* Delete all existing figures */ fprintf( stdout, "figure;\n" ); /* Plot the performance of the reference implementation */ fprintf( stdout, "plot( data_REF( :,1 ), data_REF( :, 2 ), '-' ); \n" ); /* Indicate that you want to add to the existing plot */ fprintf( stdout, "hold on;\n" ); /* Plot the data for the other numbers of threads */ for ( i = 1; i <= n_variants; i++ ){ fprintf( stdout, "plot( data_var%d( :,1 ), data_var%d( :, 2 ), '%c:%c' ); \n", i, i, colors[ i-1 ], ticks[ i-1 ] ); } fprintf( stdout, "legend( ... \n" ); fprintf( stdout, "'Reference', ... \n" ); for ( i = 1; i <= n_variants; i++ ) fprintf( stdout, "'FLAME var%d', ... \n", i ); fprintf( stdout, "'Location', 'SouthEast' ); \n" ); fprintf( stdout, "xlabel( 'problem size p' );\n" ); fprintf( stdout, "ylabel( 'GFLOPS/sec.' );\n" ); fprintf( stdout, "axis( [ 0 %d 0 %.2f ] ); \n", p_last, max_gflops ); fprintf( stdout, "title( 'FLAME sylv\\_nn performance (%s)' );\n", m_dim_desc ); fprintf( stdout, "print -depsc sylv_nn_%s.eps\n", m_dim_tag ); fprintf( stdout, "hold off;\n"); fflush( stdout ); FLA_Finalize( ); }
int main( int argc, char *argv[] ) { int i, j, n_threads, n_repeats, n_trials, increment, begin, sorting, caching, work_stealing, data_affinity; dim_t size, nb_alg; FLA_Datatype datatype = FLA_DOUBLE; FLA_Obj A, x, b, b_norm, AH, pH, bH; double b_norm_value, dtime, *dtimes, *flops; #ifndef FLA_ENABLE_WINDOWS_BUILD char output_file_m[100]; FILE *fpp; #endif fprintf( stdout, "%c Enter number of repeats: ", '%' ); scanf( "%d", &n_repeats ); fprintf( stdout, "%c %d\n", '%', n_repeats ); fprintf( stdout, "%c Enter blocksize: ", '%' ); scanf( "%u", &nb_alg ); fprintf( stdout, "%c %u\n", '%', nb_alg ); fprintf( stdout, "%c Enter problem size parameters: first, inc, num: ", '%' ); scanf( "%d%d%d", &begin, &increment, &n_trials ); fprintf( stdout, "%c %d %d %d\n", '%', begin, increment, n_trials ); fprintf( stdout, "%c Enter number of threads: ", '%' ); scanf( "%d", &n_threads ); fprintf( stdout, "%c %d\n", '%', n_threads ); fprintf( stdout, "%c Enter SuperMatrix parameters: sorting, caching, work stealing, data affinity: ", '%' ); scanf( "%d%d%d%d", &sorting, &caching, &work_stealing, &data_affinity ); fprintf( stdout, "%c %s %s %s %s\n\n", '%', ( sorting ? "TRUE" : "FALSE" ), ( caching ? "TRUE" : "FALSE" ), ( work_stealing ? "TRUE" : "FALSE" ), ( data_affinity ? ( data_affinity == 1 ? 
"FLASH_QUEUE_AFFINITY_2D_BLOCK_CYCLIC" : "FLASH_QUEUE_AFFINITY_OTHER" ) : "FLASH_QUEUE_AFFINITY_NONE" ) ); #ifdef FLA_ENABLE_WINDOWS_BUILD fprintf( stdout, "%s_%u = [\n", OUTPUT_FILE, nb_alg ); #else sprintf( output_file_m, "%s/%s_output.m", OUTPUT_PATH, OUTPUT_FILE ); fpp = fopen( output_file_m, "a" ); fprintf( fpp, "%%\n" ); fprintf( fpp, "%% | Matrix Size | FLASH |\n" ); fprintf( fpp, "%% | n x n | GFlops |\n" ); fprintf( fpp, "%% -----------------------------\n" ); fprintf( fpp, "%s_%u = [\n", OUTPUT_FILE, nb_alg ); #endif FLA_Init(); dtimes = ( double * ) FLA_malloc( n_repeats * sizeof( double ) ); flops = ( double * ) FLA_malloc( n_trials * sizeof( double ) ); FLASH_Queue_set_num_threads( n_threads ); FLASH_Queue_set_sorting( sorting ); FLASH_Queue_set_caching( caching ); FLASH_Queue_set_work_stealing( work_stealing ); FLASH_Queue_set_data_affinity( data_affinity ); for ( i = 0; i < n_trials; i++ ) { size = begin + i * increment; FLA_Obj_create( datatype, size, size, 0, 0, &A ); FLA_Obj_create( datatype, size, 1, 0, 0, &x ); FLA_Obj_create( datatype, size, 1, 0, 0, &b ); FLA_Obj_create( datatype, 1, 1, 0, 0, &b_norm ); for ( j = 0; j < n_repeats; j++ ) { FLA_Random_matrix( A ); FLA_Random_matrix( b ); FLASH_Obj_create_hier_copy_of_flat( A, 1, &nb_alg, &AH ); FLASH_Obj_create( FLA_INT, size, 1, 1, &nb_alg, &pH ); FLASH_Obj_create_hier_copy_of_flat( b, 1, &nb_alg, &bH ); dtime = FLA_Clock(); FLASH_LU_piv( AH, pH ); dtime = FLA_Clock() - dtime; dtimes[j] = dtime; FLASH_Apply_pivots( FLA_LEFT, FLA_NO_TRANSPOSE, pH, bH ); FLASH_Trsv( FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_UNIT_DIAG, AH, bH ); FLASH_Trsv( FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, AH, bH ); FLASH_Obj_free( &AH ); FLASH_Obj_free( &pH ); FLASH_Obj_flatten( bH, x ); FLASH_Obj_free( &bH ); } dtime = dtimes[0]; for ( j = 1; j < n_repeats; j++ ) dtime = min( dtime, dtimes[j] ); flops[i] = 2.0 / 3.0 * size * size * size / dtime / 1e9; FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_ONE, A, x, 
FLA_MINUS_ONE, b ); FLA_Nrm2_external( b, b_norm ); FLA_Obj_extract_real_scalar( b_norm, &b_norm_value ); #ifdef FLA_ENABLE_WINDOWS_BUILD fprintf( stdout, " %d %6.3f %le\n", size, flops[i], b_norm_value ); #else fprintf( fpp, " %d %6.3f\n", size, flops[i] ); fprintf( stdout, "Time: %e | GFlops: %6.3f\n", dtime, flops[i] ); fprintf( stdout, "Matrix size: %u x %u | nb_alg: %u\n", size, size, nb_alg ); fprintf( stdout, "Norm of difference: %le\n\n", b_norm_value ); #endif FLA_Obj_free( &A ); FLA_Obj_free( &x ); FLA_Obj_free( &b ); FLA_Obj_free( &b_norm ); } #ifdef FLA_ENABLE_WINDOWS_BUILD fprintf( stdout, "];\n\n" ); #else fprintf( fpp, "];\n" ); fflush( fpp ); fclose( fpp ); #endif FLA_free( dtimes ); FLA_free( flops ); FLA_Finalize(); return 0; }
int main(int argc, char *argv[]) { int datatype, m_input, m, p_first, p_last, p_inc, p, n_repeats, param_combo, i, j, n_param_combos = N_PARAM_COMBOS; int sign; char *colors = "brkgmcbrkg"; char *ticks = "o+*xso+*xs"; char m_dim_desc[14]; char n_dim_desc[14]; char m_dim_tag[10]; char n_dim_tag[10]; double max_gflops=6.0; double dtime, gflops, diff; FLA_Obj A, C, C_ref, scale, isgn, norm; FLA_Init(); fprintf( stdout, "%c number of repeats:", '%' ); scanf( "%d", &n_repeats ); fprintf( stdout, "%c %d\n", '%', n_repeats ); fprintf( stdout, "%c Enter sign (-1 or 1):", '%' ); scanf( "%d", &sign ); fprintf( stdout, "%c %d\n", '%', sign ); fprintf( stdout, "%c enter problem size first, last, inc:", '%' ); scanf( "%d%d%d", &p_first, &p_last, &p_inc ); fprintf( stdout, "%c %d %d %d\n", '%', p_first, p_last, p_inc ); fprintf( stdout, "%c enter m (-1 means bind to problem size): ", '%' ); scanf( "%d", &m_input ); fprintf( stdout, "%c %d\n", '%', m_input ); fprintf( stdout, "\n" ); if ( m_input > 0 ) { sprintf( m_dim_desc, "m = %d", m_input ); sprintf( m_dim_tag, "m%dc", m_input); } else if( m_input < -1 ) { sprintf( m_dim_desc, "m = p/%d", -m_input ); sprintf( m_dim_tag, "m%dp", -m_input ); } else if( m_input == -1 ) { sprintf( m_dim_desc, "m = p" ); sprintf( m_dim_tag, "m%dp", 1 ); } if ( 0 < sign ) isgn = FLA_ONE; else isgn = FLA_MINUS_ONE; //datatype = FLA_FLOAT; datatype = FLA_DOUBLE; //datatype = FLA_COMPLEX; //datatype = FLA_DOUBLE_COMPLEX; for ( p = p_first, i = 1; p <= p_last; p += p_inc, i += 1 ) { m = m_input; if( m < 0 ) m = p / abs(m_input); for ( param_combo = 0; param_combo < n_param_combos; param_combo++ ){ FLA_Obj_create( datatype, m, m, 0, 0, &A ); FLA_Obj_create( datatype, m, m, 0, 0, &C ); FLA_Obj_create( datatype, m, m, 0, 0, &C_ref ); FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &scale ); FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm ); FLA_Random_tri_matrix( FLA_UPPER_TRIANGULAR, FLA_NONUNIT_DIAG, A ); 
FLA_Triangularize( FLA_UPPER_TRIANGULAR, FLA_NONUNIT_DIAG, A ); FLA_Norm1( A, norm ); FLA_Shift_diag( FLA_NO_CONJUGATE, norm, A ); FLA_Random_matrix( C ); FLA_Hermitianize( FLA_UPPER_TRIANGULAR, C ); fprintf( stdout, "data_lyap_%s( %d, 1:5 ) = [ %d ", pc_str[param_combo], i, p ); fflush( stdout ); time_Lyap( param_combo, FLA_ALG_REFERENCE, n_repeats, m, isgn, A, C, C_ref, scale, &dtime, &diff, &gflops ); fprintf( stdout, "%6.3lf %6.2le ", gflops, diff ); fflush( stdout ); time_Lyap( param_combo, FLA_ALG_FRONT, n_repeats, m, isgn, A, C, C_ref, scale, &dtime, &diff, &gflops ); fprintf( stdout, "%6.3lf %6.2le ", gflops, diff ); fflush( stdout ); fprintf( stdout, " ]; \n" ); fflush( stdout ); FLA_Obj_free( &A ); FLA_Obj_free( &C ); FLA_Obj_free( &C_ref ); FLA_Obj_free( &scale ); FLA_Obj_free( &norm ); } fprintf( stdout, "\n" ); } /* fprintf( stdout, "figure;\n" ); fprintf( stdout, "hold on;\n" ); for ( i = 0; i < n_param_combos; i++ ) { fprintf( stdout, "plot( data_lyap_%s( :,1 ), data_lyap_%s( :, 2 ), '%c:%c' ); \n", pc_str[i], pc_str[i], colors[ i ], ticks[ i ] ); fprintf( stdout, "plot( data_lyap_%s( :,1 ), data_lyap_%s( :, 4 ), '%c-.%c' ); \n", pc_str[i], pc_str[i], colors[ i ], ticks[ i ] ); } fprintf( stdout, "legend( ... \n" ); for ( i = 0; i < n_param_combos; i++ ) fprintf( stdout, "'ref\\_lyap\\_%s', 'fla\\_lyap\\_%s', ... \n", pc_str[i], pc_str[i] ); fprintf( stdout, "'Location', 'SouthEast' ); \n" ); fprintf( stdout, "xlabel( 'problem size p' );\n" ); fprintf( stdout, "ylabel( 'GFLOPS/sec.' );\n" ); fprintf( stdout, "axis( [ 0 %d 0 %.2f ] ); \n", p_last, max_gflops ); fprintf( stdout, "title( 'FLAME lyap front-end performance (%s)' );\n", m_dim_desc ); fprintf( stdout, "print -depsc lyap_front_%s.eps\n", m_dim_tag ); fprintf( stdout, "hold off;\n"); fflush( stdout ); */ FLA_Finalize( ); return 0; }
int main(int argc, char *argv[]) { int m_input, n_input, m, n, p_first, p_last, p_inc, p, nb_alg, n_repeats, variant, i, j, datatype, n_variants = N_VARIANTS; char *colors = "brkgmcbrkg"; char *ticks = "o+*xso+*xs"; char m_dim_desc[14]; char n_dim_desc[14]; char m_dim_tag[10]; char n_dim_tag[10]; double max_gflops=6.0; double dtime, gflops, diff; FLA_Obj A, B, C, C_ref; /* Initialize FLAME */ FLA_Init( ); fprintf( stdout, "%c number of repeats: ", '%' ); scanf( "%d", &n_repeats ); fprintf( stdout, "%c %d\n", '%', n_repeats ); fprintf( stdout, "%c Enter blocking size: ", '%' ); scanf( "%d", &nb_alg ); fprintf( stdout, "%c %d\n", '%', nb_alg ); fprintf( stdout, "%c enter problem size first, last, inc: ", '%' ); scanf( "%d%d%d", &p_first, &p_last, &p_inc ); fprintf( stdout, "%c %d %d %d\n", '%', p_first, p_last, p_inc ); fprintf( stdout, "%c enter m n (-1 means bind to problem size): ", '%' ); scanf( "%d%d", &m_input, &n_input ); fprintf( stdout, "%c %d %d\n", '%', m_input, n_input ); /* Delete all existing data structures */ fprintf( stdout, "\nclear all;\n\n" ); if ( m_input > 0 ) { sprintf( m_dim_desc, "m = %d", m_input ); sprintf( m_dim_tag, "m%dc", m_input); } else if( m_input < -1 ) { sprintf( m_dim_desc, "m = p/%d", -m_input ); sprintf( m_dim_tag, "m%dp", -m_input ); } else if( m_input == -1 ) { sprintf( m_dim_desc, "m = p" ); sprintf( m_dim_tag, "m%dp", 1 ); } if ( n_input > 0 ) { sprintf( n_dim_desc, "n = %d", n_input ); sprintf( n_dim_tag, "n%dc", n_input); } else if( n_input < -1 ) { sprintf( n_dim_desc, "n = p/%d", -n_input ); sprintf( n_dim_tag, "n%dp", -n_input ); } else if( n_input == -1 ) { sprintf( n_dim_desc, "n = p" ); sprintf( n_dim_tag, "n%dp", 1 ); } for ( p = p_first, i = 1; p <= p_last; p += p_inc, i += 1 ) { m = m_input; n = n_input; if( m < 0 ) m = p / abs(m_input); if( n < 0 ) n = p / abs(n_input); //datatype = FLA_COMPLEX; datatype = FLA_DOUBLE_COMPLEX; /* Allocate space for the matrices */ FLA_Obj_create( datatype, m, m, &A ); 
FLA_Obj_create( datatype, m, n, &C ); FLA_Obj_create( datatype, m, n, &C_ref ); /* Generate random matrices A, C */ FLA_Random_tri_matrix( FLA_LOWER_TRIANGULAR, FLA_UNIT_DIAG, A ); FLA_Random_matrix( C ); FLA_Copy_external( C, C_ref ); /* Time the reference implementation */ time_Trmm_luh( 0, FLA_ALG_REFERENCE, n_repeats, p, nb_alg, A, B, C, C_ref, &dtime, &diff, &gflops ); fprintf( stdout, "data_REF( %d, 1:2 ) = [ %d %6.3lf ]; \n", i, p, gflops ); fflush( stdout ); for ( variant = 1; variant <= n_variants; variant++ ){ //fprintf( stdout, "data_var%d( %d, 1:7 ) = [ %d ", variant, i, p ); fprintf( stdout, "data_var%d( %d, 1:5 ) = [ %d ", variant, i, p ); fflush( stdout ); time_Trmm_luh( variant, FLA_ALG_UNBLOCKED, n_repeats, p, nb_alg, A, B, C, C_ref, &dtime, &diff, &gflops ); fprintf( stdout, "%6.3lf %6.2le ", gflops, diff ); fflush( stdout ); time_Trmm_luh( variant, FLA_ALG_BLOCKED, n_repeats, p, nb_alg, A, B, C, C_ref, &dtime, &diff, &gflops ); fprintf( stdout, "%6.3lf %6.2le ", gflops, diff ); fflush( stdout ); //time_Trmm_luh( variant, FLA_ALG_OPTIMIZED, n_repeats, p, nb_alg, // A, B, C, C_ref, &dtime, &diff, &gflops ); //fprintf( stdout, "%6.3lf %6.2le ", gflops, diff ); //fflush( stdout ); fprintf( stdout, " ]; \n" ); fflush( stdout ); } fprintf( stdout, "\n" ); FLA_Obj_free( &A ); FLA_Obj_free( &C ); FLA_Obj_free( &C_ref ); } /* Print the MATLAB commands to plot the data */ /* Delete all existing figures */ fprintf( stdout, "figure;\n" ); /* Plot the performance of the reference implementation */ fprintf( stdout, "plot( data_REF( :,1 ), data_REF( :, 2 ), '-' ); \n" ); /* Indicate that you want to add to the existing plot */ fprintf( stdout, "hold on;\n" ); /* Plot the data for the other numbers of threads */ for ( i = 1; i <= n_variants; i++ ) { fprintf( stdout, "plot( data_var%d( :,1 ), data_var%d( :, 2 ), '%c:%c' ); \n", i, i, colors[ i-1 ], ticks[ i-1 ] ); fprintf( stdout, "plot( data_var%d( :,1 ), data_var%d( :, 4 ), '%c-.%c' ); \n", i, i, colors[ i-1 ], 
ticks[ i-1 ] ); //fprintf( stdout, "plot( data_var%d( :,1 ), data_var%d( :, 6 ), '%c--%c' ); \n", // i, i, colors[ i-1 ], ticks[ i-1 ] ); } fprintf( stdout, "legend( ... \n" ); fprintf( stdout, "'Reference', ... \n" ); for ( i = 1; i < n_variants; i++ ) //fprintf( stdout, "'unb\\_var%d', 'blk\\_var%d', 'opt\\_var%d', ... \n", i, i, i ); fprintf( stdout, "'unb\\_var%d', 'blk\\_var%d', ... \n", i, i ); i = n_variants; fprintf( stdout, "'unb\\_var%d', 'blk\\_var%d' ); \n", i, i ); fprintf( stdout, "xlabel( 'problem size p' );\n" ); fprintf( stdout, "ylabel( 'GFLOPS/sec.' );\n" ); fprintf( stdout, "axis( [ 0 %d 0 %.2f ] ); \n", p_last, max_gflops ); fprintf( stdout, "title( 'FLAME trmm\\_luc performance (%s, %s)' );\n", m_dim_desc, n_dim_desc ); fprintf( stdout, "print -depsc trmm_luc_%s_%s.eps\n", m_dim_tag, n_dim_tag ); fprintf( stdout, "hold off;\n"); fflush( stdout ); FLA_Finalize( ); }
int main(int argc, char *argv[]) { int m, n, k, nfirst, nlast, ninc, i, irep, nrepeats, nb_alg, check;; double dtime, dtime_best, gflops, max_gflops, diff, d_n; FLA_Obj A, B, C, Cref, Cold; /* Initialize FLAME */ FLA_Init( ); /* Every time trial is repeated "repeat" times */ printf( "%% number of repeats:" ); scanf( "%d", &nrepeats ); printf( "%% %d\n", nrepeats ); /* Enter the max GFLOPS attainable */ printf( "%% enter max GFLOPS:" ); scanf( "%lf", &max_gflops ); printf( "%% %lf\n", max_gflops ); /* Enter the algorithmic block size */ printf( "%% enter nb_alg:" ); scanf( "%d", &nb_alg ); printf( "%% %d\n", nb_alg ); /* Timing trials for matrix sizes n=nfirst to nlast in increments of ninc will be performed */ printf( "%% enter nfirst, nlast, ninc:" ); scanf( "%d%d%d", &nfirst, &nlast, &ninc ); printf( "%% %d %d %d\n", nfirst, nlast, ninc ); i = 1; for ( n=nfirst; n<= nlast; n+=ninc ){ /* Allocate space for the matrices */ FLA_Obj_create( FLA_DOUBLE, n, n, 1, n, &A ); FLA_Obj_create( FLA_DOUBLE, n, n, 1, n, &B ); FLA_Obj_create( FLA_DOUBLE, n, n, 1, n, &C ); FLA_Obj_create( FLA_DOUBLE, n, n, 1, n, &Cref ); FLA_Obj_create( FLA_DOUBLE, n, n, 1, n, &Cold ); /* Generate random matrices L and B */ FLA_Random_matrix( A ); FLA_Random_matrix( B ); FLA_Random_matrix( Cold ); gflops = 2.0 * n * n * n * 1.0e-09; /* Time FLA_Symm */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Cold, Cref ); dtime = FLA_Clock(); FLA_Symm( FLA_LEFT, FLA_LOWER_TRIANGULAR, FLA_ONE, A, B, FLA_ONE, Cref ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? 
dtime : dtime_best ); } printf( "data_FLAME( %d, 1:2 ) = [ %d %le ];\n", i, n, gflops / dtime_best ); fflush( stdout ); /* Time the your implementations */ #if TEST_UNB_VAR1==TRUE /* Variant 1 unblocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Cold, C ); dtime = FLA_Clock(); Symm_unb_var1( A, B, C ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( C, Cref ); printf( "data_unb_var1( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); #endif #if TEST_BLK_VAR1==TRUE /* Variant 1 blocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Cold, C ); dtime = FLA_Clock(); Symm_blk_var1( A, B, C, nb_alg ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( C, Cref ); printf( "data_blk_var1( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); #endif #if TEST_UNB_VAR2==TRUE /* Variant 2 unblocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Cold, C ); dtime = FLA_Clock(); Symm_unb_var2( A, B, C ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( C, Cref ); printf( "data_unb_var2( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); #endif #if TEST_BLK_VAR2==TRUE /* Variant 2 blocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Cold, C ); dtime = FLA_Clock(); Symm_blk_var2( A, B, C, nb_alg ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? 
dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( C, Cref ); printf( "data_blk_var2( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); #endif #if TEST_UNB_VAR3==TRUE /* Variant 3 unblocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Cold, C ); dtime = FLA_Clock(); Symm_unb_var3( A, B, C ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( C, Cref ); printf( "data_unb_var3( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); #endif #if TEST_BLK_VAR3==TRUE /* Variant 3 blocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Cold, C ); dtime = FLA_Clock(); Symm_blk_var3( A, B, C, nb_alg ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( C, Cref ); printf( "data_blk_var3( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); #endif #if TEST_UNB_VAR4==TRUE /* Variant 4 unblocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Cold, C ); dtime = FLA_Clock(); Symm_unb_var4( A, B, C ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( C, Cref ); printf( "data_unb_var4( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); #endif #if TEST_BLK_VAR4==TRUE /* Variant 4 blocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Cold, C ); dtime = FLA_Clock(); Symm_blk_var4( A, B, C, nb_alg ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? 
dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( C, Cref ); printf( "data_blk_var4( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); #endif #if TEST_UNB_VAR5==TRUE /* Variant 5 unblocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Cold, C ); dtime = FLA_Clock(); Symm_unb_var5( A, B, C ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( C, Cref ); printf( "data_unb_var5( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); #endif #if TEST_BLK_VAR5==TRUE /* Variant 5 blocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Cold, C ); dtime = FLA_Clock(); Symm_blk_var5( A, B, C, nb_alg ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( C, Cref ); printf( "data_blk_var5( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); #endif #if TEST_UNB_VAR6==TRUE /* Variant 6 unblocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Cold, C ); dtime = FLA_Clock(); Symm_unb_var6( A, B, C ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( C, Cref ); printf( "data_unb_var6( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); #endif #if TEST_BLK_VAR6==TRUE /* Variant 6 blocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Cold, C ); dtime = FLA_Clock(); Symm_blk_var6( A, B, C, nb_alg ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? 
dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( C, Cref ); printf( "data_blk_var6( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); #endif #if TEST_UNB_VAR7==TRUE /* Variant 7 unblocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Cold, C ); dtime = FLA_Clock(); Symm_unb_var7( A, B, C ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( C, Cref ); printf( "data_unb_var7( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); #endif #if TEST_BLK_VAR7==TRUE /* Variant 4 blocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Cold, C ); dtime = FLA_Clock(); Symm_blk_var7( A, B, C, nb_alg ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( C, Cref ); printf( "data_blk_var7( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); #endif #if TEST_UNB_VAR8==TRUE /* Variant 8 unblocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Cold, C ); dtime = FLA_Clock(); Symm_unb_var8( A, B, C ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( C, Cref ); printf( "data_unb_var8( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); #endif #if TEST_BLK_VAR8==TRUE /* Variant 4 blocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Cold, C ); dtime = FLA_Clock(); Symm_blk_var8( A, B, C, nb_alg ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? 
dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( C, Cref ); printf( "data_blk_var8( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); #endif FLA_Obj_free( &A ); FLA_Obj_free( &B ); FLA_Obj_free( &C ); FLA_Obj_free( &Cref ); FLA_Obj_free( &Cold ); printf( "\n" ); i++; } /* Print the MATLAB commands to plot the data */ /* Delete all existing figures */ printf( "close all\n" ); /* Plot the performance of FLAME */ printf( "plot( data_FLAME( :,1 ), data_FLAME( :, 2 ), 'k--' ); \n" ); /* Indicate that you want to add to the existing plot */ printf( "hold on\n" ); /* Plot the performance of the reference implementation */ // printf( "plot( data_REF( :,1 ), data_REF( :, 2 ), 'k-' ); \n" ); /* Plot the performance of your implementations */ #if TEST_UNB_VAR1==TRUE printf( "plot( data_unb_var1( :,1 ), data_unb_var1( :, 2 ), 'r-.' ); \n" ); #endif #if TEST_UNB_VAR2==TRUE printf( "plot( data_unb_var2( :,1 ), data_unb_var2( :, 2 ), 'g-.' ); \n" ); #endif #if TEST_UNB_VAR3==TRUE printf( "plot( data_unb_var3( :,1 ), data_unb_var3( :, 2 ), 'b-.' ); \n" ); #endif #if TEST_UNB_VAR4==TRUE printf( "plot( data_unb_var4( :,1 ), data_unb_var4( :, 2 ), 'm-.' ); \n" ); #endif #if TEST_UNB_VAR5==TRUE printf( "plot( data_unb_var5( :,1 ), data_unb_var5( :, 2 ), 'c-.' ); \n" ); #endif #if TEST_UNB_VAR6==TRUE printf( "plot( data_unb_var6( :,1 ), data_unb_var6( :, 2 ), 'y-.' ); \n" ); #endif #if TEST_UNB_VAR7==TRUE printf( "plot( data_unb_var7( :,1 ), data_unb_var7( :, 2 ), 'k-.' 
); \n" ); #endif #if TEST_UNB_VAR8==TRUE printf( "plot( data_unb_var8( :,1 ), data_unb_var8( :, 2 ), 'm:' ); \n" ); #endif #if TEST_BLK_VAR1==TRUE printf( "plot( data_blk_var1( :,1 ), data_blk_var1( :, 2 ), 'r--' ); \n" ); #endif #if TEST_BLK_VAR2==TRUE printf( "plot( data_blk_var2( :,1 ), data_blk_var2( :, 2 ), 'g--' ); \n" ); #endif #if TEST_BLK_VAR3==TRUE printf( "plot( data_blk_var3( :,1 ), data_blk_var3( :, 2 ), 'b--' ); \n" ); #endif #if TEST_BLK_VAR4==TRUE printf( "plot( data_blk_var4( :,1 ), data_blk_var4( :, 2 ), 'm--' ); \n" ); #endif #if TEST_BLK_VAR5==TRUE printf( "plot( data_blk_var5( :,1 ), data_blk_var5( :, 2 ), 'c--' ); \n" ); #endif #if TEST_BLK_VAR6==TRUE printf( "plot( data_blk_var6( :,1 ), data_blk_var6( :, 2 ), 'y--' ); \n" ); #endif #if TEST_BLK_VAR7==TRUE printf( "plot( data_blk_var7( :,1 ), data_blk_var7( :, 2 ), 'k--' ); \n" ); #endif #if TEST_BLK_VAR8==TRUE printf( "plot( data_blk_var8( :,1 ), data_blk_var8( :, 2 ), 'm-' ); \n" ); #endif printf( "hold on \n"); printf( "xlabel( 'matrix dimension m=n' );\n"); printf( "ylabel( 'GFLOPS/sec.' 
);\n"); // printf( "axis( [ 0 %d 0 %3.1f ] ); \n", nlast, max_gflops ); printf( "legend( 'FLA Trsm', ...\n"); #if TEST_UNB_VAR1==TRUE printf( " 'unb var1', ...\n"); #endif #if TEST_UNB_VAR2==TRUE printf( " 'unb var2', ...\n"); #endif #if TEST_UNB_VAR3==TRUE printf( " 'unb var3', ...\n"); #endif #if TEST_UNB_VAR4==TRUE printf( " 'unb var4', ...\n"); #endif #if TEST_UNB_VAR5==TRUE printf( " 'unb var5', ...\n"); #endif #if TEST_UNB_VAR6==TRUE printf( " 'unb var6', ...\n"); #endif #if TEST_UNB_VAR7==TRUE printf( " 'unb var7', ...\n"); #endif #if TEST_UNB_VAR8==TRUE printf( " 'unb var8', ...\n"); #endif #if TEST_BLK_VAR1==TRUE printf( " 'blk var1', ...\n"); #endif #if TEST_BLK_VAR2==TRUE printf( " 'blk var2', ...\n"); #endif #if TEST_BLK_VAR3==TRUE printf( " 'blk var3', ...\n"); #endif #if TEST_BLK_VAR4==TRUE printf( " 'blk var4', ...\n"); #endif #if TEST_BLK_VAR5==TRUE printf( " 'blk var5', ...\n"); #endif #if TEST_BLK_VAR6==TRUE printf( " 'blk var6', ...\n"); #endif #if TEST_BLK_VAR7==TRUE printf( " 'blk var7', ...\n"); #endif #if TEST_BLK_VAR8==TRUE printf( " 'blk var8', ...\n"); #endif printf( " 2 );\n"); FLA_Finalize( ); }