/*
 * time_SPDinv
 *
 * Runs one SPD-inversion implementation nrepeats times on C and reports the
 * fastest run.
 *
 *   param_combo  0 selects FLA_LOWER_TRIANGULAR, 1 selects
 *                FLA_UPPER_TRIANGULAR; any other value runs no kernel.
 *   type         FLA_ALG_REFERENCE runs REF_SPDinv on a flat copy of C,
 *                FLA_ALG_FRONT runs FLASH_SPDinv on C itself; any other
 *                value prints "trouble" (for param_combo 0 or 1).
 *   nrepeats     number of timed repetitions.
 *   m            not used by this routine.
 *   C            hierarchical operand; copied back from a backup before
 *                returning.
 *   C_ref        receives the reference result when type is
 *                FLA_ALG_REFERENCE, otherwise it is compared against C.
 *   dtime        out: smallest elapsed time (1.0e9 if no repetition ran).
 *   diff         out: 0.0 for the reference run, otherwise
 *                FLASH_Max_elemwise_diff( C, C_ref ).
 *   gflops       out: FLASH_Obj_scalar_length( C )^3 / dtime / 1e9.
 */
void time_SPDinv( int param_combo, int type, int nrepeats, int m,
                  FLA_Obj C, FLA_Obj C_ref,
                  double *dtime, double *diff, double *gflops )
{
  FLA_Obj C_backup, C_flat;
  double  best_time = 1.0e9;
  double  elapsed;
  int     rep;

  FLASH_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &C_backup );
  FLASH_Obj_create_flat_conf_to_hier( FLA_NO_TRANSPOSE, C, &C_flat );

  FLASH_Copy( C, C_backup );

  for ( rep = 0; rep < nrepeats; rep++ )
  {
    /* Start every repetition from the same input. */
    FLASH_Copy( C_backup, C );
    FLASH_Obj_flatten( C, C_flat );

    elapsed = FLA_Clock();

    switch ( type )
    {
      case FLA_ALG_REFERENCE:
        if      ( param_combo == 0 ) REF_SPDinv( FLA_LOWER_TRIANGULAR, C_flat );
        else if ( param_combo == 1 ) REF_SPDinv( FLA_UPPER_TRIANGULAR, C_flat );
        break;

      case FLA_ALG_FRONT:
        if      ( param_combo == 0 ) FLASH_SPDinv( FLA_LOWER_TRIANGULAR, C );
        else if ( param_combo == 1 ) FLASH_SPDinv( FLA_UPPER_TRIANGULAR, C );
        break;

      default:
        /* Unknown type: only complain for a recognised combination. */
        if ( param_combo == 0 || param_combo == 1 )
          printf("trouble\n");
        break;
    }

    elapsed   = FLA_Clock() - elapsed;
    best_time = min( elapsed, best_time );
  }

  if ( type != FLA_ALG_REFERENCE )
  {
    *diff = FLASH_Max_elemwise_diff( C, C_ref );
  }
  else
  {
    FLASH_Obj_hierarchify( C_flat, C_ref );
    *diff = 0.0;
  }

  *gflops = 1.0 *
            FLASH_Obj_scalar_length( C ) *
            FLASH_Obj_scalar_length( C ) *
            FLASH_Obj_scalar_length( C ) /
            best_time /
            1e9;

  *dtime = best_time;

  FLASH_Copy( C_backup, C );

  FLASH_Obj_free( &C_backup );
  FLASH_Obj_free( &C_flat );
}
/*
 * libfla_test_eig_gest_experiment
 *
 * Runs one eig_gest experiment: builds the operands, times the selected
 * implementation n_repeats times, undoes the operation on the result and
 * reports how far the outcome is from the original A.
 *
 *   params     supplies b_flash (hierarchical blocksize) and b_alg_flat
 *              (blocksize handed to the variant control tree).
 *   var        variant number passed to the control-tree constructor.
 *   sc_str     storage characters for A, Y and B (indices 0, 1, 2).
 *   datatype   datatype of the created matrices.
 *   p_cur      current problem size; m is derived from it.
 *   pci        index into pc_str selecting the inv/uplo characters.
 *   n_repeats  number of timed repetitions.
 *   impl       which implementation to test (FLA_TEST_*).
 *   perf       out: m^3 / best time / FLOPS_PER_UNIT_PERF, times 4 for
 *              complex A.
 *   residual   out: FLA_Norm1 of (recovered A - original A).
 *
 * For the flat variants with var == 3 and FLA_NO_INVERSE the experiment is
 * skipped and both outputs are set to 0.0.
 */
void libfla_test_eig_gest_experiment( test_params_t params,
                                      unsigned int  var,
                                      char*         sc_str,
                                      FLA_Datatype  datatype,
                                      unsigned int  p_cur,
                                      unsigned int  pci,
                                      unsigned int  n_repeats,
                                      signed int    impl,
                                      double*       perf,
                                      double*       residual )
{
	dim_t        b_flash    = params.b_flash;
	dim_t        b_alg_flat = params.b_alg_flat;
	signed int   m_input    = -1;
	double       best_time  = 1e9;
	double       elapsed;
	unsigned int rep;
	unsigned int m;
	int          is_hier;
	int          is_flat_variant;
	FLA_Uplo     inv;
	FLA_Uplo     uplo;
	FLA_Trans    trans_left, trans_right;
	FLA_Obj      A, B, Y, norm;
	FLA_Obj      A_save, B_save;
	FLA_Obj      A_test, B_test, Y_test;

	is_hier         = ( impl == FLA_TEST_HIER_FRONT_END );
	is_flat_variant = ( impl == FLA_TEST_FLAT_UNB_VAR ||
	                    impl == FLA_TEST_FLAT_OPT_VAR ||
	                    impl == FLA_TEST_FLAT_BLK_VAR );

	// A negative m_input means "problem size divided by |m_input|".
	m = ( m_input < 0 ) ? p_cur / abs(m_input) : p_cur;

	// Map the parameter characters onto libflame constants.
	FLA_Param_map_char_to_flame_inv( &pc_str[pci][0], &inv );
	FLA_Param_map_char_to_flame_uplo( &pc_str[pci][1], &uplo );

	// Variant 3 of the flat implementations is skipped for the
	// no-inverse case.
	if ( inv == FLA_NO_INVERSE && var == 3 && is_flat_variant )
	{
		*perf     = 0.0;
		*residual = 0.0;
		return;
	}

	// Operands of the current experiment.
	libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[0], m, m, &A );
	libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], m, m, &Y );
	libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[2], m, m, &B );

	// A: random SPD, scaled and Hermitianized.
	FLA_Random_spd_matrix( uplo, A );
	FLA_Scalr( uplo, FLA_TWO, A );
	FLA_Hermitianize( uplo, A );

	// B: random SPD, scaled, then replaced by its Cholesky factor.
	FLA_Random_spd_matrix( uplo, B );
	FLA_Scalr( uplo, FLA_TWO, B );
	FLA_Chol( uplo, B );

	// Keep pristine copies so every repetition starts from the same data.
	FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &A_save );
	FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, B, &B_save );

	// Real 1x1 object that will hold the norm used for the residual.
	FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );

	// The FLASH front-end works on hierarchical copies; everything else
	// operates directly on the flat objects.
	if ( is_hier )
	{
		FLASH_Obj_create_hier_copy_of_flat( A, 1, &b_flash, &A_test );
		FLASH_Obj_create_hier_copy_of_flat( Y, 1, &b_flash, &Y_test );
		FLASH_Obj_create_hier_copy_of_flat( B, 1, &b_flash, &B_test );
	}
	else
	{
		A_test = A;
		Y_test = Y;
		B_test = B;
	}

	// The individual variants need a control tree.
	if ( is_flat_variant )
		libfla_test_eig_gest_cntl_create( var, b_alg_flat );

	// Timed repetitions; the fastest one is kept.
	for ( rep = 0; rep < n_repeats; ++rep )
	{
		if ( is_hier )
		{
			FLASH_Obj_hierarchify( A_save, A_test );
			FLASH_Obj_hierarchify( B_save, B_test );
		}
		else
		{
			FLA_Copy_external( A_save, A_test );
			FLA_Copy_external( B_save, B_test );
		}

		elapsed = FLA_Clock();

		libfla_test_eig_gest_impl( impl, inv, uplo, A_test, Y_test, B_test );

		elapsed = FLA_Clock() - elapsed;

		best_time = min( best_time, elapsed );
	}

	// Choose on which side of A_test the conjugate-transposed factor is
	// applied when the operation is undone below.
	if ( ( inv == FLA_NO_INVERSE && uplo == FLA_LOWER_TRIANGULAR ) ||
	     ( inv == FLA_INVERSE    && uplo == FLA_UPPER_TRIANGULAR ) )
	{
		trans_left  = FLA_CONJ_TRANSPOSE;
		trans_right = FLA_NO_TRANSPOSE;
	}
	else
	{
		trans_left  = FLA_NO_TRANSPOSE;
		trans_right = FLA_CONJ_TRANSPOSE;
	}

	// Check the solution: undo the operation with triangular solves
	// (no-inverse case) or triangular multiplies (inverse case).
	if ( is_hier )
	{
		FLASH_Hermitianize( uplo, A_test );

		if ( inv == FLA_NO_INVERSE )
		{
			FLASH_Trsm( FLA_LEFT, uplo, trans_left, FLA_NONUNIT_DIAG,
			            FLA_ONE, B_test, A_test );
			FLASH_Trsm( FLA_RIGHT, uplo, trans_right, FLA_NONUNIT_DIAG,
			            FLA_ONE, B_test, A_test );
		}
		else
		{
			FLASH_Trmm( FLA_LEFT, uplo, trans_left, FLA_NONUNIT_DIAG,
			            FLA_ONE, B_test, A_test );
			FLASH_Trmm( FLA_RIGHT, uplo, trans_right, FLA_NONUNIT_DIAG,
			            FLA_ONE, B_test, A_test );
		}

		// Bring the hierarchical result back into the flat A.
		FLASH_Obj_flatten( A_test, A );
	}
	else
	{
		FLA_Hermitianize( uplo, A_test );

		if ( inv == FLA_NO_INVERSE )
		{
			FLA_Trsm( FLA_LEFT, uplo, trans_left, FLA_NONUNIT_DIAG,
			          FLA_ONE, B_test, A_test );
			FLA_Trsm( FLA_RIGHT, uplo, trans_right, FLA_NONUNIT_DIAG,
			          FLA_ONE, B_test, A_test );
		}
		else
		{
			FLA_Trmm( FLA_LEFT, uplo, trans_left, FLA_NONUNIT_DIAG,
			          FLA_ONE, B_test, A_test );
			FLA_Trmm( FLA_RIGHT, uplo, trans_right, FLA_NONUNIT_DIAG,
			          FLA_ONE, B_test, A_test );
		}
	}

	// Release the hierarchical copies used by the FLASH front-end.
	if ( is_hier )
	{
		FLASH_Obj_free( &A_test );
		FLASH_Obj_free( &Y_test );
		FLASH_Obj_free( &B_test );
	}

	// Release the control tree used by the variants.
	if ( is_flat_variant )
		libfla_test_eig_gest_cntl_free();

	// Performance of the fastest repetition.
	*perf = 1.0 * m * m * m / best_time / FLOPS_PER_UNIT_PERF;
	if ( FLA_Obj_is_complex( A ) ) *perf *= 4.0;

	// Residual: 1-norm of (recovered A - original A).
	FLA_Axpy_external( FLA_MINUS_ONE, A_save, A );
	FLA_Norm1( A, norm );
	FLA_Obj_extract_real_scalar( norm, residual );

	// Supporting flat objects.
	FLA_Obj_free( &norm );
	FLA_Obj_free( &A_save );
	FLA_Obj_free( &B_save );

	// Flat test matrices.
	FLA_Obj_free( &A );
	FLA_Obj_free( &Y );
	FLA_Obj_free( &B );
}
/*
 * time_Trinv
 *
 * Runs one triangular-inversion implementation nrepeats times on A and
 * reports the fastest run.
 *
 *   param_combo  0 selects FLA_LOWER_TRIANGULAR, 1 selects
 *                FLA_UPPER_TRIANGULAR; any other value runs no kernel.
 *   type         FLA_ALG_REFERENCE runs REF_Trinv on a flat copy of A,
 *                FLA_ALG_FRONT runs FLASH_Trinv on A itself; any other
 *                value prints "trouble" (for param_combo 0 or 1).
 *   nrepeats     number of timed repetitions.
 *   m            problem size used in the gflops formula.
 *   diag         diagonal flag forwarded to the kernel.
 *   A            hierarchical operand; copied back from a backup before
 *                returning.
 *   A_ref        receives the reference result when type is
 *                FLA_ALG_REFERENCE, otherwise it is compared against A.
 *   dtime        out: smallest elapsed time (1.0e9 if no repetition ran).
 *   diff         out: 0.0 for the reference run, otherwise
 *                FLASH_Max_elemwise_diff( A, A_ref ).
 *   gflops       out: m^3 / 4 / dtime / 1e9.
 */
void time_Trinv( int param_combo, int type, int nrepeats, int m,
                 FLA_Diag diag, FLA_Obj A, FLA_Obj A_ref,
                 double *dtime, double *diff, double *gflops )
{
  FLA_Obj A_backup, A_flat;
  double  best_time = 1.0e9;
  double  elapsed;
  int     rep;

  FLASH_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_backup );
  FLASH_Obj_create_flat_conf_to_hier( FLA_NO_TRANSPOSE, A, &A_flat );

  FLASH_Copy( A, A_backup );

  for ( rep = 0; rep < nrepeats; rep++ )
  {
    /* Start every repetition from the same input. */
    FLASH_Copy( A_backup, A );
    FLASH_Obj_flatten( A, A_flat );

    elapsed = FLA_Clock();

    switch ( type )
    {
      case FLA_ALG_REFERENCE:
        if      ( param_combo == 0 ) REF_Trinv( FLA_LOWER_TRIANGULAR, diag, A_flat );
        else if ( param_combo == 1 ) REF_Trinv( FLA_UPPER_TRIANGULAR, diag, A_flat );
        break;

      case FLA_ALG_FRONT:
        if      ( param_combo == 0 ) FLASH_Trinv( FLA_LOWER_TRIANGULAR, diag, A );
        else if ( param_combo == 1 ) FLASH_Trinv( FLA_UPPER_TRIANGULAR, diag, A );
        break;

      default:
        /* Unknown type: only complain for a recognised combination. */
        if ( param_combo == 0 || param_combo == 1 )
          printf("trouble\n");
        break;
    }

    elapsed   = FLA_Clock() - elapsed;
    best_time = min( elapsed, best_time );
  }

  if ( type != FLA_ALG_REFERENCE )
  {
    *diff = FLASH_Max_elemwise_diff( A, A_ref );
  }
  else
  {
    FLASH_Obj_hierarchify( A_flat, A_ref );
    *diff = 0.0;
  }

  /* NOTE(review): triangular inversion is commonly counted as m^3/3 flops;
     confirm that the 1/4 factor below is intended. */
  *gflops = 1.0 / 4.0 * m * m * m / best_time / 1e9;

  *dtime = best_time;

  FLASH_Copy( A_backup, A );

  FLASH_Obj_free( &A_backup );
  FLASH_Obj_free( &A_flat );
}
/*
 * time_LU
 *
 * Times the incremental-pivoting LU front-end on a hierarchical copy of A
 * and checks the factorization by solving A x = b.
 *
 *   pivot_combo  only 0 is handled; any other value runs no kernel.
 *   type         FLA_ALG_FRONT_OPT0 runs FLASH_LU_incpiv_noopt,
 *                FLA_ALG_FRONT_OPT1 runs FLASH_LU_incpiv_opt1; any other
 *                value prints "trouble" (for pivot_combo 0).
 *   nrepeats     number of timed repetitions.
 *   m, n         dimensions used in the gflops formula.
 *   nb_alg       algorithmic blocksize for the hierarchical matrices.
 *   nb_flash     hierarchical blocksize.
 *   A            flat matrix to factor (not modified here).
 *   p            not used by this routine.
 *   x            receives the flattened solution vector.
 *   b            right-hand side; overwritten by the residual and then
 *                copied back from a backup.
 *   norm         scratch object receiving the 2-norm of the residual.
 *   dtime        out: smallest elapsed time (1.0e9 if no repetition ran).
 *   diff         out: real scalar extracted from norm.
 *   gflops       out: 2/3 * m * m * n / dtime / 1e9, times 4 for complex A.
 */
void time_LU( int pivot_combo, int type, int nrepeats, int m, int n,
              dim_t nb_alg, dim_t nb_flash,
              FLA_Obj A, FLA_Obj p, FLA_Obj x, FLA_Obj b, FLA_Obj norm,
              double *dtime, double *diff, double *gflops )
{
  FLA_Obj AH, pH, bH, LH;
  FLA_Obj AH_backup, b_backup;
  double  best_time = 1.0e9;
  double  elapsed;
  int     rep;

  FLASH_LU_incpiv_create_hier_matrices( A, 1, &nb_flash, nb_alg, &AH, &pH, &LH );
  FLASH_Obj_create_hier_copy_of_flat( b, 1, &nb_flash, &bH );

  FLASH_Obj_create_copy_of( FLA_NO_TRANSPOSE, AH, &AH_backup );
  FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, b, &b_backup );

  for ( rep = 0; rep < nrepeats; rep++ )
  {
    /* Every repetition factors the same unfactored matrix. */
    FLASH_Copy( AH_backup, AH );

    elapsed = FLA_Clock();

    if ( pivot_combo == 0 )
    {
      if      ( type == FLA_ALG_FRONT_OPT0 ) FLASH_LU_incpiv_noopt( AH, pH, LH );
      else if ( type == FLA_ALG_FRONT_OPT1 ) FLASH_LU_incpiv_opt1( AH, pH, LH );
      else                                   printf("trouble\n");
    }

    elapsed   = FLA_Clock() - elapsed;
    best_time = min( elapsed, best_time );
  }

  /* Solve with the factors, then form the residual A*x - b in b. */
  FLASH_FS_incpiv( AH, pH, LH, bH );
  FLASH_Trsv( FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, AH, bH );

  FLASH_Obj_flatten( bH, x );

  FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_ONE, A, x, FLA_MINUS_ONE, b );
  FLA_Nrm2_external( b, norm );
  FLA_Obj_extract_real_scalar( norm, diff );

  *gflops = 2.0 / 3.0 * m * m * n / best_time / 1e9;
  if ( FLA_Obj_is_complex( A ) )
    *gflops *= 4.0;

  *dtime = best_time;

  /* b was overwritten by the residual; put the original back. */
  FLA_Copy( b_backup, b );

  FLASH_Obj_free( &AH );
  FLASH_Obj_free( &pH );
  FLASH_Obj_free( &bH );
  FLASH_Obj_free( &LH );

  FLA_Obj_free( &b_backup );
  FLASH_Obj_free( &AH_backup );
}
/*
 * time_Syr2k
 *
 * Runs one Syr2k implementation nrepeats times and reports the fastest run.
 *
 *   param_combo  selects the uplo/trans pair:
 *                  0: lower, no transpose     1: lower, transpose
 *                  2: upper, no transpose     3: upper, transpose
 *                any other value runs no kernel.
 *   type         FLA_ALG_REFERENCE runs REF_Syr2k on flat copies,
 *                FLA_ALG_FRONT runs FLASH_Syr2k on the hierarchical
 *                objects; any other value prints "trouble" (for
 *                param_combo 0..3).
 *   nrepeats     number of timed repetitions.
 *   m, k         dimensions used in the gflops formula.
 *   A, B         hierarchical input operands.
 *   C            hierarchical output operand; copied back from a backup
 *                before returning.
 *   C_ref        receives the reference result when type is
 *                FLA_ALG_REFERENCE, otherwise it is compared against C.
 *   dtime        out: smallest elapsed time (1.0e9 if no repetition ran).
 *   diff         out: 0.0 for the reference run, otherwise
 *                FLASH_Max_elemwise_diff( C, C_ref ).
 *   gflops       out: 2 * m * m * k / dtime / 1e9, times 4 for complex C.
 */
void time_Syr2k( int param_combo, int type, int nrepeats, int m, int k,
                 FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj C_ref,
                 double *dtime, double *diff, double *gflops )
{
  FLA_Obj C_backup, A_flat, B_flat, C_flat;
  double  best_time = 1.0e9;
  double  elapsed;
  int     rep;

  FLASH_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &C_backup );

  FLASH_Obj_create_flat_conf_to_hier( FLA_NO_TRANSPOSE, A, &A_flat );
  FLASH_Obj_create_flat_conf_to_hier( FLA_NO_TRANSPOSE, B, &B_flat );
  FLASH_Obj_create_flat_conf_to_hier( FLA_NO_TRANSPOSE, C, &C_flat );

  FLASH_Copy( C, C_backup );

  for ( rep = 0; rep < nrepeats; rep++ )
  {
    /* Start every repetition from the same C and fresh flat copies. */
    FLASH_Copy( C_backup, C );

    FLASH_Obj_flatten( A, A_flat );
    FLASH_Obj_flatten( B, B_flat );
    FLASH_Obj_flatten( C, C_flat );

    elapsed = FLA_Clock();

    switch ( type )
    {
      case FLA_ALG_REFERENCE:
        switch ( param_combo )
        {
          case 0:
            REF_Syr2k( FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE,
                       FLA_ONE, A_flat, B_flat, FLA_ZERO, C_flat );
            break;
          case 1:
            REF_Syr2k( FLA_LOWER_TRIANGULAR, FLA_TRANSPOSE,
                       FLA_ONE, A_flat, B_flat, FLA_ZERO, C_flat );
            break;
          case 2:
            REF_Syr2k( FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE,
                       FLA_ONE, A_flat, B_flat, FLA_ZERO, C_flat );
            break;
          case 3:
            REF_Syr2k( FLA_UPPER_TRIANGULAR, FLA_TRANSPOSE,
                       FLA_ONE, A_flat, B_flat, FLA_ZERO, C_flat );
            break;
        }
        break;

      case FLA_ALG_FRONT:
        switch ( param_combo )
        {
          case 0:
            FLASH_Syr2k( FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE,
                         FLA_ONE, A, B, FLA_ZERO, C );
            break;
          case 1:
            FLASH_Syr2k( FLA_LOWER_TRIANGULAR, FLA_TRANSPOSE,
                         FLA_ONE, A, B, FLA_ZERO, C );
            break;
          case 2:
            FLASH_Syr2k( FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE,
                         FLA_ONE, A, B, FLA_ZERO, C );
            break;
          case 3:
            FLASH_Syr2k( FLA_UPPER_TRIANGULAR, FLA_TRANSPOSE,
                         FLA_ONE, A, B, FLA_ZERO, C );
            break;
        }
        break;

      default:
        /* Unknown type: only complain for a recognised combination. */
        if ( param_combo >= 0 && param_combo <= 3 )
          printf("trouble\n");
        break;
    }

    elapsed   = FLA_Clock() - elapsed;
    best_time = min( elapsed, best_time );
  }

  if ( type != FLA_ALG_REFERENCE )
  {
    *diff = FLASH_Max_elemwise_diff( C, C_ref );
  }
  else
  {
    FLASH_Obj_hierarchify( C_flat, C_ref );
    *diff = 0.0;
  }

  *gflops = 2.0 * m * m * k / best_time / 1.0e9;
  if ( FLA_Obj_is_complex( C ) )
    *gflops *= 4.0;

  *dtime = best_time;

  FLASH_Copy( C_backup, C );

  FLASH_Obj_free( &C_backup );
  FLASH_Obj_free( &A_flat );
  FLASH_Obj_free( &B_flat );
  FLASH_Obj_free( &C_flat );
}
/*
 * libfla_test_lu_piv_experiment
 *
 * Runs one LU-with-pivoting experiment: builds a random matrix A and a random
 * right-hand side b, times the selected implementation n_repeats times and,
 * unless AMD_ONLY_PERFORMANCE is defined, solves A x = b with the factors to
 * compute a residual.
 *
 *   params     supplies b_flash (hierarchical blocksize) and b_alg_flat
 *              (blocksize handed to the variant control tree).
 *   var        variant number passed to the control-tree constructor.
 *   sc_str     storage characters; index 0 is used for A.
 *   datatype   datatype of the created objects.
 *   p_cur      current problem size; m and n are derived from it.
 *   pci        not used by this routine.
 *   n_repeats  number of timed repetitions.
 *   impl       which implementation to test (FLA_TEST_*).
 *   perf       out: 2/3 * m^3 / best time / FLOPS_PER_UNIT_PERF, times 4
 *              for complex A.
 *   residual   out: 2-norm of (A*x - b), or 0.0 when AMD_ONLY_PERFORMANCE
 *              is defined.
 *
 * The hierarchical objects created for the FLASH front-end are released
 * regardless of AMD_ONLY_PERFORMANCE; only the solve is conditional.
 */
void libfla_test_lu_piv_experiment( test_params_t params,
                                    unsigned int  var,
                                    char*         sc_str,
                                    FLA_Datatype  datatype,
                                    unsigned int  p_cur,
                                    unsigned int  pci,
                                    unsigned int  n_repeats,
                                    signed int    impl,
                                    double*       perf,
                                    double*       residual )
{
	dim_t        b_flash    = params.b_flash;
	dim_t        b_alg_flat = params.b_alg_flat;
	double       time_min   = 1e9;
	double       time;
	unsigned int i;
	unsigned int m, n;
	signed int   m_input    = -1;
	signed int   n_input    = -1;
	FLA_Obj      A, p, x, b, norm;
	FLA_Obj      A_save;
	FLA_Obj      A_test, p_test, x_test, b_test;

	// Determine the dimensions. A negative input means "problem size
	// divided by its absolute value".
	if ( m_input < 0 ) m = p_cur / abs(m_input);
	else               m = p_cur;
	if ( n_input < 0 ) n = p_cur / abs(n_input);
	else               n = p_cur;

	// Create the matrices for the current operation.
	libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[0], m, n, &A );
	FLA_Obj_create( FLA_INT, min( m, n ), 1, 0, 0, &p );

	// Initialize the test matrices.
	FLA_Random_matrix( A );

	// Save the original object contents in a temporary object.
	FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &A_save );

	// Create vectors to form a linear system.
	FLA_Obj_create( datatype, m, 1, 0, 0, &x );
	FLA_Obj_create( datatype, m, 1, 0, 0, &b );

	// Create a real scalar object to hold the norm of the residual.
	FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );

	// Create a random right-hand side vector.
	FLA_Random_matrix( b );

	// Use hierarchical matrices if we're testing the FLASH front-end.
	// In the flat case b_test and x_test are never touched.
	if ( impl == FLA_TEST_HIER_FRONT_END )
	{
		FLASH_Obj_create_hier_copy_of_flat( A, 1, &b_flash, &A_test );
		FLASH_Obj_create_hier_copy_of_flat( p, 1, &b_flash, &p_test );
		FLASH_Obj_create_hier_copy_of_flat( b, 1, &b_flash, &b_test );
		FLASH_Obj_create_hier_copy_of_flat( x, 1, &b_flash, &x_test );
	}
	else
	{
		A_test = A;
		p_test = p;
	}

	// Create a control tree for the individual variants.
	if ( impl == FLA_TEST_FLAT_UNB_VAR ||
	     impl == FLA_TEST_FLAT_OPT_VAR ||
	     impl == FLA_TEST_FLAT_BLK_VAR )
		libfla_test_lu_piv_cntl_create( var, b_alg_flat );

	// Repeat the experiment n_repeats times and record results.
	for ( i = 0; i < n_repeats; ++i )
	{
		// Restore the unfactored matrix before each timed run.
		if ( impl == FLA_TEST_HIER_FRONT_END )
			FLASH_Obj_hierarchify( A_save, A_test );
		else
			FLA_Copy_external( A_save, A_test );

		time = FLA_Clock();

		libfla_test_lu_piv_impl( impl, A_test, p_test );

		time = FLA_Clock() - time;
		time_min = min( time_min, time );
	}

#ifndef AMD_ONLY_PERFORMANCE
	// Perform a linear solve with the result.
	if ( impl == FLA_TEST_HIER_FRONT_END )
	{
		FLASH_LU_piv_solve( A_test, p_test, b_test, x_test );
		FLASH_Obj_flatten( x_test, x );
	}
	else
	{
		FLA_LU_piv_solve( A_test, p_test, b, x );
	}
#endif

	// Free the hierarchical matrices if we're testing the FLASH front-end.
	// This must happen in every build configuration: the objects are
	// created unconditionally above, so guarding the frees with
	// AMD_ONLY_PERFORMANCE would leak them in performance-only builds.
	if ( impl == FLA_TEST_HIER_FRONT_END )
	{
		FLASH_Obj_free( &A_test );
		FLASH_Obj_free( &p_test );
		FLASH_Obj_free( &b_test );
		FLASH_Obj_free( &x_test );
	}

	// Free the control trees if we're testing the variants.
	if ( impl == FLA_TEST_FLAT_UNB_VAR ||
	     impl == FLA_TEST_FLAT_OPT_VAR ||
	     impl == FLA_TEST_FLAT_BLK_VAR )
		libfla_test_lu_piv_cntl_free();

	// Compute the performance of the best experiment repeat.
	*perf = 2.0 / 3.0 * m * m * m / time_min / FLOPS_PER_UNIT_PERF;
	if ( FLA_Obj_is_complex( A ) ) *perf *= 4.0;

#ifndef AMD_ONLY_PERFORMANCE
	// Compute the residual: b := A_save * x - b, then take its 2-norm.
	FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_ONE, A_save, x, FLA_MINUS_ONE, b );
	FLA_Nrm2_external( b, norm );
	FLA_Obj_extract_real_scalar( norm, residual );
#else
	*residual = 0.0;
#endif

	// Free the supporting flat objects.
	FLA_Obj_free( &p );
	FLA_Obj_free( &x );
	FLA_Obj_free( &b );
	FLA_Obj_free( &norm );
	FLA_Obj_free( &A_save );

	// Free the flat test matrices.
	FLA_Obj_free( &A );
}
/*
 * time_Copy
 *
 * Runs one matrix-copy implementation nrepeats times and reports the
 * fastest run.
 *
 *   param_combo  only 0 is handled; any other value runs no kernel.
 *   type         FLA_ALG_REFERENCE runs REF_Copy on flat copies,
 *                FLA_ALG_FRONT runs FLASH_Copy on the hierarchical
 *                objects; any other value prints "trouble" (for
 *                param_combo 0).
 *   nrepeats     number of timed repetitions.
 *   m, n         dimensions used in the gflops formula.
 *   A            hierarchical source operand.
 *   C            hierarchical destination operand; copied back from a
 *                backup before returning.
 *   C_ref        receives the reference result when type is
 *                FLA_ALG_REFERENCE, otherwise it is compared against C.
 *   dtime        out: smallest elapsed time (1.0e9 if no repetition ran).
 *   diff         out: 0.0 for the reference run, otherwise
 *                FLASH_Max_elemwise_diff( C, C_ref ).
 *   gflops       out: m * n / dtime / 1e9.
 */
void time_Copy( int param_combo, int type, int nrepeats, int m, int n,
                FLA_Obj A, FLA_Obj C, FLA_Obj C_ref,
                double *dtime, double *diff, double *gflops )
{
  FLA_Obj C_backup, A_flat, C_flat;
  double  best_time = 1.0e9;
  double  elapsed;
  int     rep;

  FLASH_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &C_backup );

  FLASH_Obj_create_flat_conf_to_hier( FLA_NO_TRANSPOSE, A, &A_flat );
  FLASH_Obj_create_flat_conf_to_hier( FLA_NO_TRANSPOSE, C, &C_flat );

  FLASH_Copy( C, C_backup );

  for ( rep = 0; rep < nrepeats; rep++ )
  {
    /* Start every repetition from the same C and fresh flat copies. */
    FLASH_Copy( C_backup, C );

    FLASH_Obj_flatten( A, A_flat );
    FLASH_Obj_flatten( C, C_flat );

    elapsed = FLA_Clock();

    if ( param_combo == 0 )
    {
      if      ( type == FLA_ALG_REFERENCE ) REF_Copy( A_flat, C_flat );
      else if ( type == FLA_ALG_FRONT )     FLASH_Copy( A, C );
      else                                  printf("trouble\n");
    }

    elapsed   = FLA_Clock() - elapsed;
    best_time = min( elapsed, best_time );
  }

  if ( type != FLA_ALG_REFERENCE )
  {
    *diff = FLASH_Max_elemwise_diff( C, C_ref );
  }
  else
  {
    FLASH_Obj_hierarchify( C_flat, C_ref );
    *diff = 0.0;
  }

  *gflops = 1.0 * m * n / best_time / 1.0e9;

  *dtime = best_time;

  FLASH_Copy( C_backup, C );

  FLASH_Obj_free( &C_backup );
  FLASH_Obj_free( &A_flat );
  FLASH_Obj_free( &C_flat );
}
int main(int argc, char *argv[]) { int datatype, n_blocks_m, n_threads, m_input, n_input, m, n, p_first, p_last, p_inc, p, n_repeats, param_combo, i, n_param_combos = N_PARAM_COMBOS; dim_t nb_flash, nb_alg; char *colors = "brkgmcbrkgmcbrkgmc"; char *ticks = "o+*xso+*xso+*xso+*xs"; char m_dim_desc[14]; char n_dim_desc[14]; char m_dim_tag[10]; char n_dim_tag[10]; double max_gflops=6.0; double dtime, gflops, diff; FLA_Obj A, A_flat_ref, A_flat, B, B_flat, D, D_flat, t, T, T_flat; FLA_Init( ); fprintf( stdout, "%c number of repeats: ", '%' ); scanf( "%d", &n_repeats ); fprintf( stdout, "%c %d\n", '%', n_repeats ); fprintf( stdout, "%c enter algorithmic blocksize: ", '%' ); scanf( "%u", &nb_alg ); fprintf( stdout, "%c %u\n", '%', nb_alg ); fprintf( stdout, "%c enter problem size first, last, inc: ", '%' ); scanf( "%d%d%d", &p_first, &p_last, &p_inc ); fprintf( stdout, "%c %d %d %d\n", '%', p_first, p_last, p_inc ); fprintf( stdout, "%c enter m n (-1 means bind to problem size): ", '%' ); scanf( "%d%d", &m_input, &n_input ); fprintf( stdout, "%c %d %d\n", '%', m_input, n_input ); fprintf( stdout, "%c enter the number of SuperMatrix threads: ", '%' ); scanf( "%d", &n_threads ); fprintf( stdout, "%c %d\n", '%', n_threads ); fprintf( stdout, "\nclear all;\n\n" ); if ( m_input > 0 ) { sprintf( m_dim_desc, "m = %d", m_input ); sprintf( m_dim_tag, "m%dc", m_input); } else if( m_input < -1 ) { sprintf( m_dim_desc, "m = p/%d", -m_input ); sprintf( m_dim_tag, "m%dp", -m_input ); } else if( m_input == -1 ) { sprintf( m_dim_desc, "m = p" ); sprintf( m_dim_tag, "m%dp", 1 ); } if ( n_input > 0 ) { sprintf( n_dim_desc, "n = %d", n_input ); sprintf( n_dim_tag, "n%dc", n_input); } else if( n_input < -1 ) { sprintf( n_dim_desc, "n = p/%d", -n_input ); sprintf( n_dim_tag, "n%dp", -n_input ); } else if( n_input == -1 ) { sprintf( n_dim_desc, "n = p" ); sprintf( n_dim_tag, "n%dp", 1 ); } //datatype = FLA_FLOAT; //datatype = FLA_DOUBLE; //datatype = FLA_COMPLEX; datatype = 
FLA_DOUBLE_COMPLEX; FLASH_Queue_set_num_threads( n_threads ); for ( p = p_first, i = 1; p <= p_last; p += p_inc, i += 1 ) { m = m_input; n = n_input; if( m < 0 ) m = p / abs(m_input); if( n < 0 ) n = p / abs(n_input); nb_flash = n; for ( param_combo = 0; param_combo < n_param_combos; param_combo++ ) { FLA_Obj_create( datatype, m, nb_flash, &A_flat ); FLA_Obj_create( datatype, m, nb_flash, &A_flat_ref ); FLA_Obj_create( datatype, m, nb_flash, &T_flat ); FLA_Obj_create( datatype, nb_flash, 1, &t ); FLASH_Obj_create( datatype, m, nb_flash, 1, &nb_flash, &A ); n_blocks_m = FLA_Obj_length( A ); FLASH_Obj_create_ext( datatype, nb_alg * n_blocks_m, nb_flash, 1, &nb_alg, &nb_flash, &T ); FLA_Set( FLA_ZERO, T_flat ); FLASH_Set( FLA_ZERO, T ); FLASH_Random_matrix( A ); FLASH_Obj_flatten( A, A_flat ); FLA_Part_2x1( A, &B, &D, 1, FLA_TOP ); FLA_Part_2x1( A_flat, &B_flat, &D_flat, FLA_Obj_width( A_flat ), FLA_TOP ); FLA_Triangularize( FLA_UPPER_TRIANGULAR, FLA_NONUNIT_DIAG, *(FLASH_OBJ_PTR_AT(B)) ); FLA_Triangularize( FLA_UPPER_TRIANGULAR, FLA_NONUNIT_DIAG, B_flat ); fprintf( stdout, "data_qr2ut_%s( %d, 1:5 ) = [ %d ", pc_str[param_combo], i, p ); fflush( stdout ); time_QR2_UT( param_combo, FLA_ALG_REFERENCE, n_repeats, m, n, A, A_flat_ref, B, B_flat, D, D_flat, A_flat, t, T, T_flat, &dtime, &diff, &gflops ); fprintf( stdout, "%6.3lf %6.2le ", gflops, diff ); fflush( stdout ); time_QR2_UT( param_combo, FLA_ALG_FRONT, n_repeats, m, n, A, A_flat_ref, B, B_flat, D, D_flat, A_flat, t, T, T_flat, &dtime, &diff, &gflops ); fprintf( stdout, "%6.3lf %6.2le ", gflops, diff ); fflush( stdout ); fprintf( stdout, " ]; \n" ); fflush( stdout ); FLA_Obj_free( &A_flat ); FLA_Obj_free( &A_flat_ref ); FLA_Obj_free( &T_flat ); FLA_Obj_free( &t ); FLASH_Obj_free( &A ); FLASH_Obj_free( &T ); } fprintf( stdout, "\n" ); } fprintf( stdout, "figure;\n" ); fprintf( stdout, "hold on;\n" ); for ( i = 0; i < n_param_combos; i++ ) { fprintf( stdout, "plot( data_qr2ut_%s( :,1 ), data_qr2ut_%s( :, 2 ), 
'%c:%c' ); \n", pc_str[i], pc_str[i], colors[ i ], ticks[ i ] ); fprintf( stdout, "plot( data_qr2ut_%s( :,1 ), data_qr2ut_%s( :, 4 ), '%c-.%c' ); \n", pc_str[i], pc_str[i], colors[ i ], ticks[ i ] ); } fprintf( stdout, "legend( ... \n" ); for ( i = 0; i < n_param_combos; i++ ) fprintf( stdout, "'ref\\_qr2ut\\_%s', 'fla\\_qr2ut\\_%s', ... \n", pc_str[i], pc_str[i] ); fprintf( stdout, "'Location', 'SouthEast' ); \n" ); fprintf( stdout, "xlabel( 'problem size p' );\n" ); fprintf( stdout, "ylabel( 'GFLOPS/sec.' );\n" ); fprintf( stdout, "axis( [ 0 %d 0 %.2f ] ); \n", p_last, max_gflops ); fprintf( stdout, "title( 'FLAME qr2ut front-end performance (%s, %s)' );\n", m_dim_desc, n_dim_desc ); fprintf( stdout, "print -depsc qr2ut_front_%s_%s.eps\n", m_dim_tag, n_dim_tag ); fprintf( stdout, "hold off;\n"); fflush( stdout ); FLA_Finalize( ); return 0; }