int main(int argc, char *argv[]) { int m_input, m, p_first, p_last, p_inc, p, nb_alg, variant, n_repeats, i, j, datatype, n_variants = 4; int sign; int blocksize[16]; char *colors = "brkgmcbrkg"; char *ticks = "o+*xso+*xs"; char m_dim_desc[14]; char n_dim_desc[14]; char m_dim_tag[10]; char n_dim_tag[10]; double max_gflops=6.0; double dtime, gflops, diff; FLA_Obj A, C, C_ref, scale, isgn, norm; FLA_Init(); fprintf( stdout, "%c number of repeats:", '%' ); scanf( "%d", &n_repeats ); fprintf( stdout, "%c %d\n", '%', n_repeats ); fprintf( stdout, "%c Enter sign (-1 or 1):", '%' ); scanf( "%d", &sign ); fprintf( stdout, "%c %d\n", '%', sign ); fprintf( stdout, "%c Enter blocking size:", '%' ); scanf( "%d", &nb_alg ); fprintf( stdout, "%c %d\n", '%', nb_alg ); fprintf( stdout, "%c enter problem size first, last, inc:", '%' ); scanf( "%d%d%d", &p_first, &p_last, &p_inc ); fprintf( stdout, "%c %d %d %d\n", '%', p_first, p_last, p_inc ); fprintf( stdout, "%c enter m (-1 means bind to problem size): ", '%' ); scanf( "%d", &m_input ); fprintf( stdout, "%c %d\n", '%', m_input ); fprintf( stdout, "\n" ); if ( m_input > 0 ) { sprintf( m_dim_desc, "m = %d", m_input ); sprintf( m_dim_tag, "m%dc", m_input); } else if( m_input < -1 ) { sprintf( m_dim_desc, "m = p/%d", -m_input ); sprintf( m_dim_tag, "m%dp", -m_input ); } else if( m_input == -1 ) { sprintf( m_dim_desc, "m = p" ); sprintf( m_dim_tag, "m%dp", 1 ); } if ( 0 < sign ) isgn = FLA_ONE; else isgn = FLA_MINUS_ONE; for ( p = p_first, i = 1; p <= p_last; p += p_inc, i += 1 ) { m = m_input; if( m < 0 ) m = p / abs(m_input); //datatype = FLA_FLOAT; //datatype = FLA_DOUBLE; //datatype = FLA_COMPLEX; datatype = FLA_DOUBLE_COMPLEX; FLA_Obj_create( datatype, m, m, 0, 0, &A ); FLA_Obj_create( datatype, m, m, 0, 0, &C ); FLA_Obj_create( datatype, m, m, 0, 0, &C_ref ); FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &scale ); FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm ); FLA_Random_tri_matrix( FLA_UPPER_TRIANGULAR, FLA_NONUNIT_DIAG, A ); FLA_Triangularize( FLA_UPPER_TRIANGULAR, FLA_NONUNIT_DIAG, A ); FLA_Norm1( A, norm ); FLA_Shift_diag( FLA_NO_CONJUGATE, norm, A ); FLA_Random_matrix( C ); FLA_Hermitianize( FLA_UPPER_TRIANGULAR, C ); /* time_Lyap_h( 0, FLA_ALG_REFERENCE, n_repeats, m, nb_alg, isgn, A, C, C_ref, scale, &dtime, &diff, &gflops ); fprintf( stdout, "data_REF( %d, 1:2 ) = [ %d %6.3lf ]; \n", i, p, gflops ); fflush( stdout ); */ for ( variant = 1; variant <= n_variants; variant++ ){ fprintf( stdout, "data_var%d( %d, 1:7 ) = [ %d ", variant, i, p ); fflush( stdout ); time_Lyap_h( variant, FLA_ALG_UNBLOCKED, n_repeats, m, nb_alg, isgn, A, C, C_ref, scale, &dtime, &diff, &gflops ); fprintf( stdout, "%6.3lf %6.2le ", gflops, diff ); fflush( stdout ); time_Lyap_h( variant, FLA_ALG_UNB_OPT, n_repeats, m, nb_alg, isgn, A, C, C_ref, scale, &dtime, &diff, &gflops ); fprintf( stdout, "%6.3lf %6.2le ", gflops, diff ); fflush( stdout ); time_Lyap_h( variant, FLA_ALG_BLOCKED, n_repeats, m, nb_alg, isgn, A, C, C_ref, scale, &dtime, &diff, &gflops ); fprintf( stdout, "%6.3lf %6.2le ", gflops, diff ); fflush( stdout ); fprintf( stdout, " ]; \n" ); fflush( stdout ); } FLA_Obj_free( &A ); FLA_Obj_free( &C ); FLA_Obj_free( &C_ref ); FLA_Obj_free( &scale ); FLA_Obj_free( &norm ); fprintf( stdout, "\n" ); } /* fprintf( stdout, "figure;\n" ); fprintf( stdout, "plot( data_REF( :,1 ), data_REF( :, 2 ), '-' ); \n" ); fprintf( stdout, "hold on;\n" ); for ( i = 1; i <= n_variants; i++ ){ fprintf( stdout, "plot( data_var%d( :,1 ), data_var%d( :, 2 ), '%c:%c' ); \n", i, i, colors[ i-1 ], ticks[ i-1 ] ); } fprintf( stdout, "legend( ... \n" ); fprintf( stdout, "'Reference', ... \n" ); for ( i = 1; i <= n_variants; i++ ) fprintf( stdout, "'FLAME var%d', ... \n", i ); fprintf( stdout, "'Location', 'SouthEast' ); \n" ); fprintf( stdout, "xlabel( 'problem size p' );\n" ); fprintf( stdout, "ylabel( 'GFLOPS/sec.' );\n" ); fprintf( stdout, "axis( [ 0 %d 0 %.2f ] ); \n", p_last, max_gflops ); fprintf( stdout, "title( 'FLAME sylv\\_nn performance (%s)' );\n", m_dim_desc ); fprintf( stdout, "print -depsc sylv_nn_%s.eps\n", m_dim_tag ); fprintf( stdout, "hold off;\n"); fflush( stdout ); */ FLA_Finalize( ); return 0; }
int main(int argc, char *argv[]) { int m_input, m, p_first, p_last, p_inc, p, k_accum, b_alg, n_iter_max, variant, n_repeats, i, n_variants = 2; char *colors = "brkgmcbrkg"; char *ticks = "o+*xso+*xs"; char m_dim_desc[14]; char m_dim_tag[10]; double max_gflops=6.0; double dtime, gflops, diff1, diff2; FLA_Datatype datatype, dt_real; FLA_Obj A, l, Q, Ql, TT, r, d, e, A_orig, G, R, W2, de, alpha; FLA_Init(); fprintf( stdout, "%c number of repeats:", '%' ); scanf( "%d", &n_repeats ); fprintf( stdout, "%c %d\n", '%', n_repeats ); fprintf( stdout, "%c enter n_iter_max (per eigenvalue): ", '%' ); scanf( "%d", &n_iter_max ); fprintf( stdout, "%c %d\n", '%', n_iter_max ); fprintf( stdout, "%c enter number of sets of Givens rotations to accumulate:", '%' ); scanf( "%d", &k_accum ); fprintf( stdout, "%c %d\n", '%', k_accum ); fprintf( stdout, "%c enter blocking size for application of G:", '%' ); scanf( "%d", &b_alg ); fprintf( stdout, "%c %d\n", '%', b_alg ); fprintf( stdout, "%c enter problem size first, last, inc:", '%' ); scanf( "%d%d%d", &p_first, &p_last, &p_inc ); fprintf( stdout, "%c %d %d %d\n", '%', p_first, p_last, p_inc ); fprintf( stdout, "%c enter m (-1 means bind to problem size): ", '%' ); scanf( "%d", &m_input ); fprintf( stdout, "%c %d\n", '%', m_input ); fprintf( stdout, "\n" ); if ( m_input > 0 ) { sprintf( m_dim_desc, "m = %d", m_input ); sprintf( m_dim_tag, "m%dc", m_input); } else if( m_input < -1 ) { sprintf( m_dim_desc, "m = p/%d", -m_input ); sprintf( m_dim_tag, "m%dp", -m_input ); } else if( m_input == -1 ) { sprintf( m_dim_desc, "m = p" ); sprintf( m_dim_tag, "m%dp", 1 ); } for ( p = p_first, i = 1; p <= p_last; p += p_inc, i += 1 ) { m = m_input; if( m < 0 ) m = p / abs(m_input); //datatype = FLA_FLOAT; //datatype = FLA_DOUBLE; //datatype = FLA_COMPLEX; datatype = FLA_DOUBLE_COMPLEX; FLA_Obj_create( datatype, m, m, 0, 0, &A ); FLA_Obj_create( datatype, m, m, 0, 0, &A_orig ); FLA_Obj_create( datatype, m, m, 0, 0, &Q ); FLA_Obj_create( datatype, m, m, 0, 0, &Ql ); FLA_Obj_create( datatype, m, 1, 0, 0, &r ); FLA_Obj_create( datatype, m, m, 0, 0, &W2 ); FLA_Obj_create( datatype, m-1, k_accum, 0, 0, &G ); dt_real = FLA_Obj_datatype_proj_to_real( A ); FLA_Obj_create( dt_real, m, 1, 0, 0, &l ); FLA_Obj_create( dt_real, m, 1, 0, 0, &d ); FLA_Obj_create( dt_real, m-1, 1, 0, 0, &e ); FLA_Obj_create( dt_real, m, m, 0, 0, &R ); FLA_Obj_create( dt_real, 1, 1, 0, 0, &alpha ); *FLA_DOUBLE_PTR( alpha ) = 1.0 / ( sqrt( sqrt( (double) m ) ) ); FLA_Random_unitary_matrix( Q ); //FLA_Fill_with_uniform_dist( FLA_ONE, l ); //FLA_Fill_with_inverse_dist( FLA_ONE, l ); FLA_Fill_with_geometric_dist( alpha, l ); { FLA_Copy( Q, Ql ); FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, l, Ql ); FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, Ql, Q, FLA_ZERO, A ); FLA_Triangularize( FLA_LOWER_TRIANGULAR, FLA_NONUNIT_DIAG, A ); FLA_Copy( A, A_orig ); } FLA_Set( FLA_ZERO, l ); FLA_Set( FLA_ZERO, Q ); FLA_Tridiag_UT_create_T( A, &TT ); FLA_Tridiag_UT( FLA_LOWER_TRIANGULAR, A, TT ); FLA_Tridiag_UT_realify( FLA_LOWER_TRIANGULAR, A, r ); FLA_Tridiag_UT_extract_diagonals( FLA_LOWER_TRIANGULAR, A, d, e ); FLA_Tridiag_UT_form_Q( FLA_LOWER_TRIANGULAR, A, TT ); FLA_Apply_diag_matrix( FLA_RIGHT, FLA_CONJUGATE, r, A ); FLA_Obj_free( &TT ); time_Tevd_v( 0, FLA_ALG_REFERENCE, n_repeats, m, k_accum, b_alg, n_iter_max, A_orig, d, e, G, R, W2, A, l, &dtime, &diff1, &diff2, &gflops ); fprintf( stdout, "data_REFq( %d, 1:3 ) = [ %d %6.3lf %9.2e %6.2le %6.2le ]; \n", i, p, gflops, dtime, diff1, diff2 ); fflush( stdout ); for ( variant = 1; variant <= n_variants; variant++ ){ fprintf( stdout, "data_var%d( %d, 1:3 ) = [ %d ", variant, i, p ); fflush( stdout ); time_Tevd_v( variant, FLA_ALG_UNB_OPT, n_repeats, m, k_accum, b_alg, n_iter_max, A_orig, d, e, G, R, W2, A, l, &dtime, &diff1, &diff2, &gflops ); fprintf( stdout, "%6.3lf %9.2e %6.2le %6.2le ", gflops, dtime, diff1, diff2 ); fflush( stdout ); fprintf( stdout, "];\n" ); fflush( stdout ); } fprintf( stdout, "\n" ); FLA_Obj_free( &A ); FLA_Obj_free( &A_orig ); FLA_Obj_free( &Q ); FLA_Obj_free( &Ql ); FLA_Obj_free( &G ); FLA_Obj_free( &W2 ); FLA_Obj_free( &r ); FLA_Obj_free( &l ); FLA_Obj_free( &d ); FLA_Obj_free( &e ); FLA_Obj_free( &R ); FLA_Obj_free( &alpha ); } /* fprintf( stdout, "figure;\n" ); fprintf( stdout, "plot( data_REF( :,1 ), data_REF( :, 2 ), '-' ); \n" ); fprintf( stdout, "hold on;\n" ); for ( i = 1; i <= n_variants; i++ ) { fprintf( stdout, "plot( data_var%d( :,1 ), data_var%d( :, 2 ), '%c:%c' ); \n", i, i, colors[ i-1 ], ticks[ i-1 ] ); fprintf( stdout, "plot( data_var%d( :,1 ), data_var%d( :, 4 ), '%c-.%c' ); \n", i, i, colors[ i-1 ], ticks[ i-1 ] ); } fprintf( stdout, "legend( ... \n" ); fprintf( stdout, "'Reference', ... \n" ); for ( i = 1; i < n_variants; i++ ) fprintf( stdout, "'unb\\_var%d', 'blk\\_var%d', ... \n", i, i ); fprintf( stdout, "'unb\\_var%d', 'blk\\_var%d' ); \n", i, i ); fprintf( stdout, "xlabel( 'problem size p' );\n" ); fprintf( stdout, "ylabel( 'GFLOPS/sec.' );\n" ); fprintf( stdout, "axis( [ 0 %d 0 %.2f ] ); \n", p_last, max_gflops ); fprintf( stdout, "title( 'FLAME Hevd_lv performance (%s, %s)' );\n", m_dim_desc, n_dim_desc ); fprintf( stdout, "print -depsc tridiag_%s_%s.eps\n", m_dim_tag, n_dim_tag ); fprintf( stdout, "hold off;\n"); fflush( stdout ); */ FLA_Finalize( ); return 0; }
void libfla_test_symm_experiment( test_params_t params, unsigned int var, char* sc_str, FLA_Datatype datatype, unsigned int p_cur, unsigned int pci, unsigned int n_repeats, signed int impl, double* perf, double* residual ) { dim_t b_flash = params.b_flash; dim_t b_alg_flat = params.b_alg_flat; double time_min = 1e9; double time; unsigned int i; unsigned int m; signed int m_input = -1; unsigned int n; signed int n_input = -1; FLA_Side side; FLA_Uplo uplo; FLA_Obj A, B, C, x, y, z, w, norm; FLA_Obj alpha, beta; FLA_Obj C_save; FLA_Obj A_test, B_test, C_test; // Determine the dimensions. if ( m_input < 0 ) m = p_cur / abs(m_input); else m = p_cur; if ( n_input < 0 ) n = p_cur / abs(n_input); else n = p_cur; // Translate parameter characters to libflame constants. FLA_Param_map_char_to_flame_side( &pc_str[pci][0], &side ); FLA_Param_map_char_to_flame_uplo( &pc_str[pci][1], &uplo ); // Create the matrices for the current operation. if ( side == FLA_LEFT ) { libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[0], m, m, &A ); // Create vectors for use in test. FLA_Obj_create( datatype, n, 1, 0, 0, &x ); FLA_Obj_create( datatype, m, 1, 0, 0, &y ); FLA_Obj_create( datatype, m, 1, 0, 0, &z ); FLA_Obj_create( datatype, m, 1, 0, 0, &w ); } else { libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[0], n, n, &A ); // Create vectors for use in test. FLA_Obj_create( datatype, n, 1, 0, 0, &x ); FLA_Obj_create( datatype, m, 1, 0, 0, &y ); FLA_Obj_create( datatype, m, 1, 0, 0, &z ); FLA_Obj_create( datatype, n, 1, 0, 0, &w ); } libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], m, n, &B ); libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[2], m, n, &C ); // Create a norm scalar. FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm ); // Initialize the test matrices. FLA_Random_symm_matrix( uplo, A ); FLA_Random_matrix( B ); FLA_Random_matrix( C ); // Initialize the test vectors. FLA_Random_matrix( x ); FLA_Set( FLA_ZERO, y ); FLA_Set( FLA_ZERO, z ); FLA_Set( FLA_ZERO, w ); // Set constants. alpha = FLA_TWO; beta = FLA_MINUS_ONE; // Save the original object contents in a temporary object. FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, C, &C_save ); // Use hierarchical matrices if we're testing the FLASH front-end. if ( impl == FLA_TEST_HIER_FRONT_END ) { FLASH_Obj_create_hier_copy_of_flat( A, 1, &b_flash, &A_test ); FLASH_Obj_create_hier_copy_of_flat( B, 1, &b_flash, &B_test ); FLASH_Obj_create_hier_copy_of_flat( C, 1, &b_flash, &C_test ); } else { A_test = A; B_test = B; C_test = C; } // Create a control tree for the individual variants. if ( impl == FLA_TEST_FLAT_UNB_VAR || impl == FLA_TEST_FLAT_OPT_VAR || impl == FLA_TEST_FLAT_BLK_VAR || impl == FLA_TEST_FLAT_UNB_EXT || impl == FLA_TEST_FLAT_BLK_EXT ) libfla_test_symm_cntl_create( var, b_alg_flat ); // Repeat the experiment n_repeats times and record results. for ( i = 0; i < n_repeats; ++i ) { if ( impl == FLA_TEST_HIER_FRONT_END ) FLASH_Obj_hierarchify( C_save, C_test ); else FLA_Copy_external( C_save, C_test ); time = FLA_Clock(); libfla_test_symm_impl( impl, side, uplo, alpha, A_test, B_test, beta, C_test ); time = FLA_Clock() - time; time_min = min( time_min, time ); } // Copy the solution to flat matrix X. if ( impl == FLA_TEST_HIER_FRONT_END ) { FLASH_Obj_flatten( C_test, C ); } else { // No action needed since C_test and C refer to the same object. } // Free the hierarchical matrices if we're testing the FLASH front-end. if ( impl == FLA_TEST_HIER_FRONT_END ) { FLASH_Obj_free( &A_test ); FLASH_Obj_free( &B_test ); FLASH_Obj_free( &C_test ); } // Free the control trees if we're testing the variants. if ( impl == FLA_TEST_FLAT_UNB_VAR || impl == FLA_TEST_FLAT_OPT_VAR || impl == FLA_TEST_FLAT_BLK_VAR || impl == FLA_TEST_FLAT_UNB_EXT || impl == FLA_TEST_FLAT_BLK_EXT ) libfla_test_symm_cntl_free(); // Compute the performance of the best experiment repeat. if ( side == FLA_LEFT ) *perf = ( 1 * m * m * n ) / time_min / FLOPS_PER_UNIT_PERF; else *perf = ( 1 * m * n * n ) / time_min / FLOPS_PER_UNIT_PERF; if ( FLA_Obj_is_complex( A ) ) *perf *= 4.0; // Compute: // y = C * x // and compare to // z = ( beta * C_orig + alpha * A * B ) x (side = left) // z = ( beta * C_orig + alpha * B * A ) x (side = right) FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_ONE, C, x, FLA_ZERO, y ); if ( side == FLA_LEFT ) { FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_ONE, B, x, FLA_ZERO, w ); FLA_Symv_external( uplo, alpha, A, w, FLA_ZERO, z ); } else { FLA_Symv_external( uplo, FLA_ONE, A, x, FLA_ZERO, w ); FLA_Gemv_external( FLA_NO_TRANSPOSE, alpha, B, w, FLA_ZERO, z ); } FLA_Gemv_external( FLA_NO_TRANSPOSE, beta, C_save, x, FLA_ONE, z ); // Compute || y - z ||. //FLA_Axpy_external( FLA_MINUS_ONE, y, z ); //FLA_Nrm2_external( z, norm ); //FLA_Obj_extract_real_scalar( norm, residual ); *residual = FLA_Max_elemwise_diff( y, z ); // Free the supporting flat objects. FLA_Obj_free( &C_save ); // Free the flat test matrices. FLA_Obj_free( &A ); FLA_Obj_free( &B ); FLA_Obj_free( &C ); FLA_Obj_free( &x ); FLA_Obj_free( &y ); FLA_Obj_free( &z ); FLA_Obj_free( &w ); FLA_Obj_free( &norm ); }
FLA_Error FLA_Bidiag_blk_external( FLA_Obj A, FLA_Obj tu, FLA_Obj tv ) { int info = 0; #ifdef FLA_ENABLE_EXTERNAL_LAPACK_INTERFACES FLA_Datatype datatype; int m_A, n_A, cs_A; int min_m_n, max_m_n; int lwork; FLA_Obj d, e, work_obj; if ( FLA_Check_error_level() == FLA_FULL_ERROR_CHECKING ) FLA_Bidiag_check( A, tu, tv ); if ( FLA_Obj_has_zero_dim( A ) ) return FLA_SUCCESS; datatype = FLA_Obj_datatype( A ); m_A = FLA_Obj_length( A ); n_A = FLA_Obj_width( A ); min_m_n = FLA_Obj_min_dim( A ); max_m_n = FLA_Obj_max_dim( A ); cs_A = FLA_Obj_col_stride( A ); FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), min_m_n, 1, 0, 0, &d ); FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), min_m_n - 1, 1, 0, 0, &e ); lwork = (m_A + n_A) * FLA_Query_blocksize( datatype, FLA_DIMENSION_MIN ); FLA_Obj_create( datatype, lwork, 1, 0, 0, &work_obj ); switch( datatype ){ case FLA_FLOAT: { float* buff_A = ( float * ) FLA_FLOAT_PTR( A ); float* buff_d = ( float * ) FLA_FLOAT_PTR( d ); float* buff_e = ( float * ) FLA_FLOAT_PTR( e ); float* buff_tu = ( float * ) FLA_FLOAT_PTR( tu ); float* buff_tv = ( float * ) FLA_FLOAT_PTR( tv ); float* buff_work = ( float * ) FLA_FLOAT_PTR( work_obj ); F77_sgebrd( &m_A, &n_A, buff_A, &cs_A, buff_d, buff_e, buff_tu, buff_tv, buff_work, &lwork, &info ); break; } case FLA_DOUBLE: { double* buff_A = ( double * ) FLA_DOUBLE_PTR( A ); double* buff_d = ( double * ) FLA_DOUBLE_PTR( d ); double* buff_e = ( double * ) FLA_DOUBLE_PTR( e ); double* buff_tu = ( double * ) FLA_DOUBLE_PTR( tu ); double* buff_tv = ( double * ) FLA_DOUBLE_PTR( tv ); double* buff_work = ( double * ) FLA_DOUBLE_PTR( work_obj ); F77_dgebrd( &m_A, &n_A, buff_A, &cs_A, buff_d, buff_e, buff_tu, buff_tv, buff_work, &lwork, &info ); break; } case FLA_COMPLEX: { scomplex* buff_A = ( scomplex * ) FLA_COMPLEX_PTR( A ); float* buff_d = ( float * ) FLA_FLOAT_PTR( d ); float* buff_e = ( float * ) FLA_FLOAT_PTR( e ); scomplex* buff_tu = ( scomplex * ) FLA_COMPLEX_PTR( tu ); scomplex* buff_tv = ( scomplex * ) FLA_COMPLEX_PTR( tv ); scomplex* buff_work = ( scomplex * ) FLA_COMPLEX_PTR( work_obj ); F77_cgebrd( &m_A, &n_A, buff_A, &cs_A, buff_d, buff_e, buff_tu, buff_tv, buff_work, &lwork, &info ); break; } case FLA_DOUBLE_COMPLEX: { dcomplex* buff_A = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( A ); double* buff_d = ( double * ) FLA_DOUBLE_PTR( d ); double* buff_e = ( double * ) FLA_DOUBLE_PTR( e ); dcomplex* buff_tu = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( tu ); dcomplex* buff_tv = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( tv ); dcomplex* buff_work = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( work_obj ); F77_zgebrd( &m_A, &n_A, buff_A, &cs_A, buff_d, buff_e, buff_tu, buff_tv, buff_work, &lwork, &info ); break; } } FLA_Obj_free( &d ); FLA_Obj_free( &e ); FLA_Obj_free( &work_obj ); #else FLA_Check_error_code( FLA_EXTERNAL_LAPACK_NOT_IMPLEMENTED ); #endif return info; }
FLA_Error FLA_Svd_uv_unb_var1( dim_t n_iter_max, FLA_Obj A, FLA_Obj s, FLA_Obj U, FLA_Obj V, dim_t k_accum, dim_t b_alg ) { FLA_Error r_val = FLA_SUCCESS; FLA_Datatype dt; FLA_Datatype dt_real; FLA_Datatype dt_comp; FLA_Obj scale, T, S, rL, rR, d, e, G, H; dim_t m_A, n_A; dim_t min_m_n; dim_t n_GH; double crossover_ratio = 17.0 / 9.0; n_GH = k_accum; m_A = FLA_Obj_length( A ); n_A = FLA_Obj_width( A ); min_m_n = FLA_Obj_min_dim( A ); dt = FLA_Obj_datatype( A ); dt_real = FLA_Obj_datatype_proj_to_real( A ); dt_comp = FLA_Obj_datatype_proj_to_complex( A ); // Create matrices to hold block Householder transformations. FLA_Bidiag_UT_create_T( A, &T, &S ); // Create vectors to hold the realifying scalars. FLA_Obj_create( dt, min_m_n, 1, 0, 0, &rL ); FLA_Obj_create( dt, min_m_n, 1, 0, 0, &rR ); // Create vectors to hold the diagonal and sub-diagonal. FLA_Obj_create( dt_real, min_m_n, 1, 0, 0, &d ); FLA_Obj_create( dt_real, min_m_n-1, 1, 0, 0, &e ); // Create matrices to hold the left and right Givens scalars. FLA_Obj_create( dt_comp, min_m_n-1, n_GH, 0, 0, &G ); FLA_Obj_create( dt_comp, min_m_n-1, n_GH, 0, 0, &H ); // Create a real scaling factor. FLA_Obj_create( dt_real, 1, 1, 0, 0, &scale ); // Compute a scaling factor; If none is needed, sigma will be set to one. FLA_Svd_compute_scaling( A, scale ); // Scale the matrix if scale is non-unit. if ( !FLA_Obj_equals( scale, FLA_ONE ) ) FLA_Scal( scale, A ); if ( m_A < crossover_ratio * n_A ) { // Reduce the matrix to bidiagonal form. // Apply scalars to rotate elements on the superdiagonal to the real domain. // Extract the diagonal and superdiagonal from A. FLA_Bidiag_UT( A, T, S ); FLA_Bidiag_UT_realify( A, rL, rR ); FLA_Bidiag_UT_extract_real_diagonals( A, d, e ); // Form U and V. FLA_Bidiag_UT_form_U( A, T, U ); FLA_Bidiag_UT_form_V( A, S, V ); // Apply the realifying scalars in rL and rR to U and V, respectively. { FLA_Obj UL, UR; FLA_Obj VL, VR; FLA_Part_1x2( U, &UL, &UR, min_m_n, FLA_LEFT ); FLA_Part_1x2( V, &VL, &VR, min_m_n, FLA_LEFT ); FLA_Apply_diag_matrix( FLA_RIGHT, FLA_CONJUGATE, rL, UL ); FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, rR, VL ); } // Perform a singular value decomposition on the bidiagonal matrix. r_val = FLA_Bsvd_v_opt_var1( n_iter_max, d, e, G, H, U, V, b_alg ); } else // if ( crossover_ratio * n_A <= m_A ) { FLA_Obj TQ, R; FLA_Obj AT, AB; FLA_Obj UL, UR; // Perform a QR factorization on A and form Q in U. FLA_QR_UT_create_T( A, &TQ ); FLA_QR_UT( A, TQ ); FLA_QR_UT_form_Q( A, TQ, U ); FLA_Obj_free( &TQ ); // Set the lower triangle of R to zero and then copy the upper // triangle of A to R. FLA_Part_2x1( A, &AT, &AB, n_A, FLA_TOP ); FLA_Obj_create( dt, n_A, n_A, 0, 0, &R ); FLA_Setr( FLA_LOWER_TRIANGULAR, FLA_ZERO, R ); FLA_Copyr( FLA_UPPER_TRIANGULAR, AT, R ); // Reduce the matrix to bidiagonal form. // Apply scalars to rotate elements on the superdiagonal to the real domain. // Extract the diagonal and superdiagonal from A. FLA_Bidiag_UT( R, T, S ); FLA_Bidiag_UT_realify( R, rL, rR ); FLA_Bidiag_UT_extract_real_diagonals( R, d, e ); // Form V from right Householder vectors in upper triangle of R. FLA_Bidiag_UT_form_V( R, S, V ); // Form U in R. FLA_Bidiag_UT_form_U( R, T, R ); // Apply the realifying scalars in rL and rR to U and V, respectively. FLA_Apply_diag_matrix( FLA_RIGHT, FLA_CONJUGATE, rL, R ); FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, rR, V ); // Perform a singular value decomposition on the bidiagonal matrix. r_val = FLA_Bsvd_v_opt_var1( n_iter_max, d, e, G, H, R, V, b_alg ); // Multiply R into U, storing the result in A and then copying back // to U. FLA_Part_1x2( U, &UL, &UR, n_A, FLA_LEFT ); FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, UL, R, FLA_ZERO, A ); FLA_Copy( A, UL ); FLA_Obj_free( &R ); } // Copy the converged eigenvalues to the output vector. FLA_Copy( d, s ); // Sort the singular values and singular vectors in descending order. FLA_Sort_svd( FLA_BACKWARD, s, U, V ); // If the matrix was scaled, rescale the singular values. if ( !FLA_Obj_equals( scale, FLA_ONE ) ) FLA_Inv_scal( scale, s ); FLA_Obj_free( &scale ); FLA_Obj_free( &T ); FLA_Obj_free( &S ); FLA_Obj_free( &rL ); FLA_Obj_free( &rR ); FLA_Obj_free( &d ); FLA_Obj_free( &e ); FLA_Obj_free( &G ); FLA_Obj_free( &H ); return r_val; }
void libfla_test_qrut_experiment( test_params_t params, unsigned int var, char* sc_str, FLA_Datatype datatype, unsigned int p_cur, unsigned int pci, unsigned int n_repeats, signed int impl, double* perf, double* residual ) { dim_t b_flash = params.b_flash; dim_t b_alg_flat = params.b_alg_flat; double time_min = 1e9; double time; unsigned int i; unsigned int m, n; unsigned int min_m_n; signed int m_input = -2; signed int n_input = -1; FLA_Obj A, T, x, b, y, norm; FLA_Obj A_save; FLA_Obj A_test, T_test, x_test, b_test; // Determine the dimensions. if ( m_input < 0 ) m = p_cur * abs(m_input); else m = p_cur; if ( n_input < 0 ) n = p_cur * abs(n_input); else n = p_cur; // Compute the minimum dimension. min_m_n = min( m, n ); // Create the matrices for the current operation. libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[0], m, n, &A ); if ( impl == FLA_TEST_FLAT_FRONT_END || ( impl == FLA_TEST_FLAT_BLK_VAR && var == 1 ) ) libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], b_alg_flat, min_m_n, &T ); else if ( var == 2 ) libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], min_m_n, min_m_n, &T ); else libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], 1, min_m_n, &T ); // Initialize the test matrices. FLA_Random_matrix( A ); // Save the original object contents in a temporary object. FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &A_save ); // Create vectors to form a linear system. FLA_Obj_create( datatype, n, 1, 0, 0, &x ); FLA_Obj_create( datatype, m, 1, 0, 0, &b ); FLA_Obj_create( datatype, n, 1, 0, 0, &y ); // Create a real scalar object to hold the norm of A. FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm ); // Create a random right-hand side vector. FLA_Random_matrix( b ); // Use hierarchical matrices if we're testing the FLASH front-end. if ( impl == FLA_TEST_HIER_FRONT_END ) { FLASH_QR_UT_create_hier_matrices( A, 1, &b_flash, &A_test, &T_test ); FLASH_Obj_create_hier_copy_of_flat( b, 1, &b_flash, &b_test ); FLASH_Obj_create_hier_copy_of_flat( x, 1, &b_flash, &x_test ); } else { A_test = A; T_test = T; } // Create a control tree for the individual variants. if ( impl == FLA_TEST_FLAT_UNB_VAR || impl == FLA_TEST_FLAT_OPT_VAR || impl == FLA_TEST_FLAT_BLK_VAR ) libfla_test_qrut_cntl_create( var, b_alg_flat ); // Repeat the experiment n_repeats times and record results. for ( i = 0; i < n_repeats; ++i ) { if ( impl == FLA_TEST_HIER_FRONT_END ) FLASH_Obj_hierarchify( A_save, A_test ); else FLA_Copy_external( A_save, A_test ); time = FLA_Clock(); libfla_test_qrut_impl( impl, A_test, T_test ); time = FLA_Clock() - time; time_min = min( time_min, time ); } // Perform a linear solve with the result. if ( impl == FLA_TEST_HIER_FRONT_END ) { FLASH_QR_UT_solve( A_test, T_test, b_test, x_test ); FLASH_Obj_flatten( x_test, x ); } else { FLA_QR_UT_solve( A_test, T_test, b, x ); } // Free the hierarchical matrices if we're testing the FLASH front-end. if ( impl == FLA_TEST_HIER_FRONT_END ) { FLASH_Obj_free( &A_test ); FLASH_Obj_free( &T_test ); FLASH_Obj_free( &b_test ); FLASH_Obj_free( &x_test ); } // Free the control trees if we're testing the variants. if ( impl == FLA_TEST_FLAT_UNB_VAR || impl == FLA_TEST_FLAT_OPT_VAR || impl == FLA_TEST_FLAT_BLK_VAR ) libfla_test_qrut_cntl_free(); // Compute the performance of the best experiment repeat. *perf = ( 2.0 * m * n * n - ( 2.0 / 3.0 ) * n * n * n ) / time_min / FLOPS_PER_UNIT_PERF; if ( FLA_Obj_is_complex( A ) ) *perf *= 4.0; // Compute the residual. FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_ONE, A_save, x, FLA_MINUS_ONE, b ); FLA_Gemv_external( FLA_CONJ_TRANSPOSE, FLA_ONE, A_save, b, FLA_ZERO, y ); FLA_Nrm2_external( y, norm ); FLA_Obj_extract_real_scalar( norm, residual ); // Free the supporting flat objects. FLA_Obj_free( &x ); FLA_Obj_free( &b ); FLA_Obj_free( &y ); FLA_Obj_free( &norm ); FLA_Obj_free( &A_save ); // Free the flat test matrices. FLA_Obj_free( &A ); FLA_Obj_free( &T ); }
void time_Apply_G_rf( int variant, int type, int n_repeats, int m, int k, int n, int b_alg, FLA_Obj A, FLA_Obj A_ref, FLA_Obj G, FLA_Obj P, double *dtime, double *diff, double *gflops ) { int irep; double dtime_old = 1.0e9; FLA_Obj A_save, G_save, norm; if ( FLA_Obj_is_real( A ) ) { if ( //( variant == 1 && type == FLA_ALG_UNB_OPT ) || //( variant == 1 && type == FLA_ALG_UNB_ASM ) || //( variant == 1 && type == FLA_ALG_BLOCKED ) || //( variant == 2 && type == FLA_ALG_UNB_OPT ) || //( variant == 2 && type == FLA_ALG_UNB_ASM ) || //( variant == 2 && type == FLA_ALG_BLOCKED ) || //( variant == 3 && type == FLA_ALG_UNB_OPT ) || //( variant == 3 && type == FLA_ALG_UNB_ASM ) || //( variant == 3 && type == FLA_ALG_BLOCKED ) || //( variant == 6 && type == FLA_ALG_UNB_OPT ) || //( variant == 6 && type == FLA_ALG_UNB_ASM ) || //( variant == 6 && type == FLA_ALG_BLOCKED ) || //( variant == 9 && type == FLA_ALG_UNB_OPT ) || //( variant == 9 && type == FLA_ALG_UNB_ASM ) || //( variant == 9 && type == FLA_ALG_BLOCKED ) || ( variant == 4 ) || ( variant == 5 ) || ( variant == 7 ) || ( variant == 8 ) || FALSE ) { *gflops = 0.0; *diff = 0.0; return; } } else if ( FLA_Obj_is_complex( A ) ) { if ( //( variant == 1 && type == FLA_ALG_UNB_OPT ) || //( variant == 1 && type == FLA_ALG_UNB_ASM ) || //( variant == 1 && type == FLA_ALG_BLOCKED ) || //( variant == 2 && type == FLA_ALG_UNB_OPT ) || //( variant == 2 && type == FLA_ALG_UNB_ASM ) || //( variant == 2 && type == FLA_ALG_BLOCKED ) || //( variant == 3 && type == FLA_ALG_UNB_OPT ) || //( variant == 3 && type == FLA_ALG_UNB_ASM ) || //( variant == 3 && type == FLA_ALG_BLOCKED ) || //( variant == 6 && type == FLA_ALG_UNB_OPT ) || //( variant == 6 && type == FLA_ALG_UNB_ASM ) || //( variant == 6 && type == FLA_ALG_BLOCKED ) || //( variant == 9 && type == FLA_ALG_UNB_OPT ) || //( variant == 9 && type == FLA_ALG_UNB_ASM ) || //( variant == 9 && type == FLA_ALG_BLOCKED ) || ( variant == 4 ) || ( variant == 5 ) || ( variant == 7 ) || ( variant == 8 ) || FALSE ) { *gflops = 0.0; *diff = 0.0; return; } } FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_save ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, G, &G_save ); FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm ); //dim_t b_flash_m = b_alg; //dim_t b_flash_n = n; //FLASH_Obj_create_hier_copy_of_flat_ext( A, 1, &b_flash_m, &b_flash_n, &AH ); //printf ( "flash dims: %d x %d\n", FLA_Obj_length( AH ), FLA_Obj_width( AH ) ); FLA_Copy_external( A, A_save ); FLA_Copy_external( G, G_save ); for ( irep = 0 ; irep < n_repeats; irep++ ){ FLA_Copy_external( A_save, A ); FLA_Copy_external( G_save, G ); //FLASH_Obj_hierarchify( A_save, AH ); *dtime = FLA_Clock(); switch( variant ){ case 0: break; // Time variant 1 case 1: { switch( type ){ case FLA_ALG_UNB_OPT: FLA_Apply_G_rf_opt_var1( G, A ); break; case FLA_ALG_UNB_ASM: FLA_Apply_G_rf_asm_var1( G, A ); break; case FLA_ALG_BLOCKED: FLA_Apply_G_rf_blk_var1( G, A, b_alg ); break; } break; } // Time variant 2 case 2: { switch( type ){ case FLA_ALG_UNB_OPT: FLA_Apply_G_rf_opt_var2( G, A ); break; case FLA_ALG_UNB_ASM: FLA_Apply_G_rf_asm_var2( G, A ); break; case FLA_ALG_BLOCKED: FLA_Apply_G_rf_blk_var2( G, A, b_alg ); break; } break; } // Time variant 3 case 3: { switch( type ){ case FLA_ALG_UNB_OPT: FLA_Apply_G_rf_opt_var3( G, A ); break; case FLA_ALG_UNB_ASM: FLA_Apply_G_rf_asm_var3( G, A ); break; case FLA_ALG_BLOCKED: FLA_Apply_G_rf_blk_var3( G, A, b_alg ); break; } break; } // Time variant 6 case 6: { switch( type ){ case FLA_ALG_UNB_OPT: FLA_Apply_G_rf_opt_var6( G, A ); break; case FLA_ALG_UNB_ASM: FLA_Apply_G_rf_asm_var6( G, A ); break; case FLA_ALG_BLOCKED: FLA_Apply_G_rf_blk_var6( G, A, b_alg ); break; } break; } // Time variant 9 case 9: { switch( type ){ case FLA_ALG_UNB_OPT: FLA_Apply_G_rf_opt_var9( G, A ); break; case FLA_ALG_UNB_ASM: FLA_Apply_G_rf_asm_var9( G, A ); break; case FLA_ALG_BLOCKED: FLA_Apply_G_rf_blk_var9( G, A, b_alg ); break; } break; } } *dtime = FLA_Clock() - *dtime; dtime_old = min( *dtime, dtime_old ); } if ( variant == 1 && type == FLA_ALG_UNB_OPT ) { //FLA_Obj_show( "A_ref", A, "%9.2e + %9.2e ", "" ); //FLA_Obj_show( "A", A, "%9.2e ", "" ); FLA_Copy( A, A_ref ); *diff = 0.0; } else { //FLA_Obj_show( "A", A, "%9.2e + %9.2e ", "" ); //if ( variant == 7 && type == FLA_ALG_UNB_ASM ) //FLA_Obj_show( "A", A, "%9.2e", "" ); //if ( variant == 9 ) FLASH_Obj_flatten( AH, A ); FLA_Axpy( FLA_MINUS_ONE, A_ref, A ); FLA_Norm_frob( A, norm ); FLA_Obj_extract_real_scalar( norm, diff ); //*diff = FLA_Max_elemwise_diff( A_ref, A ); } *gflops = 6.0 * k * m * ( n - 1 ) / dtime_old / 1e9; if ( FLA_Obj_is_complex( A ) ) *gflops *= 2.0; *dtime = dtime_old; FLA_Copy_external( A_save, A ); FLA_Copy_external( G_save, G ); //FLASH_Obj_free( &AH ); FLA_Obj_free( &A_save ); FLA_Obj_free( &G_save ); FLA_Obj_free( &norm ); }
void time_Eig_gest_nu( int variant, int type, int n_repeats, int n, int b_alg, FLA_Inv inv, FLA_Uplo uplo, FLA_Obj A, FLA_Obj Y, FLA_Obj B, double *dtime, double *diff, double *gflops ) { int irep; double dtime_save = 1.0e9; FLA_Obj A_save, B_save, norm; fla_blocksize_t* bp; fla_eig_gest_t* cntl_eig_gest_var; fla_eig_gest_t* cntl_eig_gest_unb; if ( ( type == FLA_ALG_UNBLOCKED || type == FLA_ALG_UNB_OPT ) && n > 300 ) { *gflops = 0.0; *diff = 0.0; return; } if ( variant == 3 ) { *gflops = 0.0; *diff = 0.0; return; } bp = FLA_Blocksize_create( b_alg, b_alg, b_alg, b_alg ); cntl_eig_gest_unb = FLA_Cntl_eig_gest_obj_create( FLA_FLAT, //FLA_UNBLOCKED_VARIANT1, FLA_UNB_OPT_VARIANT1, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL ); cntl_eig_gest_var = FLA_Cntl_eig_gest_obj_create( FLA_FLAT, variant, bp, cntl_eig_gest_unb, fla_axpy_cntl_blas, fla_axpy_cntl_blas, fla_gemm_cntl_blas, fla_gemm_cntl_blas, fla_gemm_cntl_blas, fla_hemm_cntl_blas, fla_her2k_cntl_blas, fla_trmm_cntl_blas, fla_trmm_cntl_blas, fla_trsm_cntl_blas, fla_trsm_cntl_blas ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_save ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, B, &B_save ); FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm ); FLA_Copy_external( A, A_save ); FLA_Copy_external( B, B_save ); for ( irep = 0 ; irep < n_repeats; irep++ ){ FLA_Copy_external( A_save, A ); FLA_Copy_external( B_save, B ); *dtime = FLA_Clock(); switch( variant ){ case 0: REF_Eig_gest_nu( A, B ); break; case 1: { // Time variant 1 switch( type ) { case FLA_ALG_UNBLOCKED: FLA_Eig_gest_nu_unb_var1( A, Y, B ); break; case FLA_ALG_UNB_OPT: FLA_Eig_gest_nu_opt_var1( A, Y, B ); break; case FLA_ALG_BLOCKED: FLA_Eig_gest_nu_blk_var1( A, Y, B, cntl_eig_gest_var ); break; default: printf("trouble\n"); } break; } case 2: { // Time variant 2 switch( type ) { case FLA_ALG_UNBLOCKED: FLA_Eig_gest_nu_unb_var2( A, Y, B ); break; case FLA_ALG_UNB_OPT: FLA_Eig_gest_nu_opt_var2( A, Y, B ); break; case FLA_ALG_BLOCKED: FLA_Eig_gest_nu_blk_var2( A, Y, B, cntl_eig_gest_var ); break; default: printf("trouble\n"); } break; } case 3: { // Time variant 3 switch( type ) { case FLA_ALG_UNBLOCKED: //FLA_Eig_gest_nu_unb_var3( A, Y, B ); break; case FLA_ALG_UNB_OPT: //FLA_Eig_gest_nu_opt_var3( A, Y, B ); break; case FLA_ALG_BLOCKED: //FLA_Eig_gest_nu_blk_var3( A, Y, B, cntl_eig_gest_var ); break; default: printf("trouble\n"); } break; } case 4: { // Time variant 4 switch( type ) { case FLA_ALG_UNBLOCKED: FLA_Eig_gest_nu_unb_var4( A, Y, B ); break; case FLA_ALG_UNB_OPT: FLA_Eig_gest_nu_opt_var4( A, Y, B ); break; case FLA_ALG_BLOCKED: FLA_Eig_gest_nu_blk_var4( A, Y, B, cntl_eig_gest_var ); break; default: printf("trouble\n"); } break; } case 5: { // Time variant 5 switch( type ) { case FLA_ALG_UNBLOCKED: FLA_Eig_gest_nu_unb_var5( A, Y, B ); break; case FLA_ALG_UNB_OPT: FLA_Eig_gest_nu_opt_var5( A, Y, B ); break; case FLA_ALG_BLOCKED: FLA_Eig_gest_nu_blk_var5( A, Y, B, cntl_eig_gest_var ); break; default: printf("trouble\n"); } break; } } *dtime = FLA_Clock() - *dtime; dtime_save = min( *dtime, dtime_save ); } FLA_Cntl_obj_free( cntl_eig_gest_var ); FLA_Cntl_obj_free( cntl_eig_gest_unb ); FLA_Blocksize_free( bp ); // Recover A. if ( inv == FLA_NO_INVERSE ) { if ( uplo == FLA_LOWER_TRIANGULAR ) { // A = L' * A_orig * L // A_orig = inv(L') * A * inv(L) FLA_Hermitianize( FLA_LOWER_TRIANGULAR, A ); FLA_Trsm_external( FLA_LEFT, FLA_LOWER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_ONE, B, A ); FLA_Trsm_external( FLA_RIGHT, FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_ONE, B, A ); } else // if ( uplo == FLA_UPPER_TRIANGULAR ) { // A = U * A_orig * U' // A_orig = inv(U) * A * inv(U') FLA_Hermitianize( FLA_UPPER_TRIANGULAR, A ); FLA_Trsm_external( FLA_LEFT, FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_ONE, B, A ); FLA_Trsm_external( FLA_RIGHT, FLA_UPPER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_ONE, B, A ); } } else // if ( inv == FLA_INVERSE ) { if ( uplo == FLA_LOWER_TRIANGULAR ) { // A = inv(L) * A_orig * inv(L') // A_orig = L * A * L' FLA_Hermitianize( FLA_LOWER_TRIANGULAR, A ); FLA_Trmm_external( FLA_LEFT, FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_ONE, B, A ); FLA_Trmm_external( FLA_RIGHT, FLA_LOWER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_ONE, B, A ); } else // if ( uplo == FLA_UPPER_TRIANGULAR ) { // A = inv(U') * A_orig * inv(U) // A_orig = U' * A * U FLA_Hermitianize( FLA_UPPER_TRIANGULAR, A ); FLA_Trmm_external( FLA_LEFT, FLA_UPPER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_ONE, B, A ); FLA_Trmm_external( FLA_RIGHT, FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_ONE, B, A ); } } *diff = FLA_Max_elemwise_diff( A, A_save ); /* if ( type == FLA_ALG_UNBLOCKED ) { FLA_Obj_show( "A", A, "%10.3e", "" ); FLA_Obj_show( "A_orig", A_save, "%10.3e", "" ); } */ *gflops = 1.0 * FLA_Obj_length( A ) * FLA_Obj_length( A ) * FLA_Obj_length( A ) / dtime_save / 1e9; if ( FLA_Obj_is_complex( A ) ) *gflops *= 4.0; *dtime = dtime_save; FLA_Copy_external( A_save, A ); FLA_Copy_external( B_save, B ); FLA_Obj_free( &A_save ); FLA_Obj_free( &B_save ); FLA_Obj_free( &norm ); }
void time_LQ_UT( int param_combo, int type, int nrepeats, int m, int n, int b_flash, FLA_Obj A, FLA_Obj TW, FLA_Obj b, FLA_Obj x, double *dtime, double *diff, double *gflops ) { int irep; double dtime_old = 1.0e9; FLA_Obj A_save; FLASH_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &A_save ); for ( irep = 0 ; irep < nrepeats; irep++ ) { FLASH_Copy( A_save, A ); *dtime = FLA_Clock(); switch( param_combo ){ // Time parameter combination 0 case 0:{ switch( type ){ case FLA_ALG_FRONT: FLASH_LQ_UT( A, TW ); break; default: printf("trouble\n"); } break; } } *dtime = FLA_Clock() - *dtime; dtime_old = min( *dtime, dtime_old ); } { FLA_Obj A_save_flat, x_flat, b_flat, y_flat; FLA_Obj norm; FLASH_Obj_create_flat_copy_of_hier( A_save, &A_save_flat ); FLASH_Obj_create_flat_copy_of_hier( b, &b_flat ); FLASH_Obj_create_flat_conf_to_hier( FLA_NO_TRANSPOSE, x, &x_flat ); FLASH_Obj_create_flat_conf_to_hier( FLA_NO_TRANSPOSE, x, &y_flat ); FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm ); /* { FLA_Obj At, Tt; FLASH_Obj_create_flat_copy_of_hier( A_save, &At ); FLA_Obj_create( FLA_Obj_datatype( A_save ), b_flash, FLA_Obj_min_dim( At ), 0, 0, &Tt ); FLA_LQ_UT( At, Tt ); FLASH_Obj_show( "A_save", A_save, "%9.1e", "" ); FLASH_Obj_show( "A", A, "%9.1e", "" ); FLA_Obj_show( "At", At, "%9.1e", "" ); FLA_Obj_free( &At ); FLA_Obj_free( &Tt ); } */ FLASH_LQ_UT_solve( A, TW, b, x ); FLASH_Obj_flatten( x, x_flat ); FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_ONE, A_save_flat, x_flat, FLA_MINUS_ONE, b_flat ); FLA_Gemv_external( FLA_CONJ_TRANSPOSE, FLA_ONE, A_save_flat, b_flat, FLA_ZERO, y_flat ); FLA_Nrm2_external( y_flat, norm ); FLA_Obj_extract_real_scalar( norm, diff ); FLA_Obj_free( &A_save_flat ); FLA_Obj_free( &b_flat ); FLA_Obj_free( &x_flat ); FLA_Obj_free( &y_flat ); FLA_Obj_free( &norm ); } *gflops = ( 2.0 * m * n * n - ( 2.0 / 3.0 ) * n * n * n ) / dtime_old / 1.0e9; if ( FLA_Obj_is_complex( A ) ) *gflops *= 4.0; *dtime = dtime_old; FLASH_Obj_free( &A_save ); }
FLA_Error FLA_Tridiag_unb_external( FLA_Uplo uplo, FLA_Obj A, FLA_Obj t ) { int info = 0; #ifdef FLA_ENABLE_EXTERNAL_LAPACK_INTERFACES FLA_Datatype datatype; int n_A, cs_A; FLA_Obj d, e; char blas_uplo; if ( FLA_Check_error_level() == FLA_FULL_ERROR_CHECKING ) FLA_Tridiag_check( uplo, A, t ); if ( FLA_Obj_has_zero_dim( A ) ) return FLA_SUCCESS; datatype = FLA_Obj_datatype( A ); n_A = FLA_Obj_width( A ); cs_A = FLA_Obj_col_stride( A ); FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), n_A, 1, 0, 0, &d ); FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), n_A - 1, 1, 0, 0, &e ); FLA_Param_map_flame_to_netlib_uplo( uplo, &blas_uplo ); switch( datatype ){ case FLA_FLOAT: { float* buff_A = ( float * ) FLA_FLOAT_PTR( A ); float* buff_d = ( float * ) FLA_FLOAT_PTR( d ); float* buff_e = ( float * ) FLA_FLOAT_PTR( e ); float* buff_t = ( float * ) FLA_FLOAT_PTR( t ); F77_ssytd2( &blas_uplo, &n_A, buff_A, &cs_A, buff_d, buff_e, buff_t, &info ); break; } case FLA_DOUBLE: { double* buff_A = ( double * ) FLA_DOUBLE_PTR( A ); double* buff_d = ( double * ) FLA_DOUBLE_PTR( d ); double* buff_e = ( double * ) FLA_DOUBLE_PTR( e ); double* buff_t = ( double * ) FLA_DOUBLE_PTR( t ); F77_dsytd2( &blas_uplo, &n_A, buff_A, &cs_A, buff_d, buff_e, buff_t, &info ); break; } case FLA_COMPLEX: { scomplex* buff_A = ( scomplex * ) FLA_COMPLEX_PTR( A ); float* buff_d = ( float * ) FLA_FLOAT_PTR( d ); float* buff_e = ( float * ) FLA_FLOAT_PTR( e ); scomplex* buff_t = ( scomplex * ) FLA_COMPLEX_PTR( t ); F77_chetd2( &blas_uplo, &n_A, buff_A, &cs_A, buff_d, buff_e, buff_t, &info ); break; } case FLA_DOUBLE_COMPLEX: { dcomplex* buff_A = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( A ); double* buff_d = ( double * ) FLA_DOUBLE_PTR( d ); double* buff_e = ( double * ) FLA_DOUBLE_PTR( e ); dcomplex* buff_t = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( t ); F77_zhetd2( &blas_uplo, &n_A, buff_A, &cs_A, buff_d, buff_e, buff_t, &info ); break; } } FLA_Obj_free( &d ); FLA_Obj_free( &e ); #else FLA_Check_error_code( FLA_EXTERNAL_LAPACK_NOT_IMPLEMENTED ); #endif return info; }
FLA_Error FLA_Tevdr_external( FLA_Evd_type jobz, FLA_Obj d, FLA_Obj e, FLA_Obj l, FLA_Obj A ) { int info = 0; #ifdef FLA_ENABLE_EXTERNAL_LAPACK_INTERFACES FLA_Datatype datatype; FLA_Datatype dt_real; int n_A, cs_A; int lisuppz, lwork, liwork; FLA_Obj isuppz, work, iwork; char blas_jobz; char blas_range; int i; int vl, vu; int il, iu; int nzc; int try_rac; int n_eig_found; //if ( FLA_Check_error_level() == FLA_FULL_ERROR_CHECKING ) // FLA_Tevdd_check( jobz, d, e, A ); if ( FLA_Obj_has_zero_dim( A ) ) return FLA_SUCCESS; datatype = FLA_Obj_datatype( A ); dt_real = FLA_Obj_datatype_proj_to_real( A ); n_A = FLA_Obj_width( A ); cs_A = FLA_Obj_col_stride( A ); FLA_Param_map_flame_to_netlib_evd_type( jobz, &blas_jobz ); // Hard-code some parameters. blas_range = 'A'; nzc = n_A; try_rac = TRUE; // Allocate space for the isuppz array. lisuppz = 2 * n_A; FLA_Obj_create( FLA_INT, lisuppz, 1, 0, 0, &isuppz ); // Make a workspace query the first time through. This will provide us with // and ideal workspace size. lwork = -1; liwork = -1; FLA_Obj_create( dt_real, 1, 1, 0, 0, &work ); FLA_Obj_create( FLA_INT, 1, 1, 0, 0, &iwork ); for ( i = 0; i < 2; ++i ) { if ( i == 1 ) { // Grab the queried ideal workspace size from the work arrays, free the // work object, and then re-allocate the workspace with the ideal size. if ( datatype == FLA_FLOAT || datatype == FLA_COMPLEX ) { lwork = ( int ) *FLA_FLOAT_PTR( work ); liwork = ( int ) *FLA_INT_PTR( iwork ); } else if ( datatype == FLA_DOUBLE || datatype == FLA_DOUBLE_COMPLEX ) { lwork = ( int ) *FLA_DOUBLE_PTR( work ); liwork = ( int ) *FLA_INT_PTR( iwork ); } //printf( "ideal workspace for n = %d\n", n_A ); //printf( " lwork = %d\n", lwork ); //printf( " liwork = %d\n", liwork ); FLA_Obj_free( &work ); FLA_Obj_free( &iwork ); FLA_Obj_create( dt_real, lwork, 1, 0, 0, &work ); FLA_Obj_create( FLA_INT, liwork, 1, 0, 0, &iwork ); } switch( datatype ) { case FLA_FLOAT: { float* buff_d = ( float* ) FLA_FLOAT_PTR( d ); float* buff_e = ( float* ) FLA_FLOAT_PTR( e ); float* buff_l = ( float* ) FLA_FLOAT_PTR( l ); float* buff_A = ( float* ) FLA_FLOAT_PTR( A ); int* buff_isuppz = ( int* ) FLA_INT_PTR( isuppz ); float* buff_work = ( float* ) FLA_FLOAT_PTR( work ); int* buff_iwork = ( int* ) FLA_INT_PTR( iwork ); F77_sstemr( &blas_jobz, &blas_range, &n_A, buff_d, buff_e, &vl, &vu, &il, &iu, &n_eig_found, buff_l, buff_A, &cs_A, &nzc, buff_isuppz, &try_rac, buff_work, &lwork, buff_iwork, &liwork, &info ); break; } case FLA_DOUBLE: { double* buff_d = ( double* ) FLA_DOUBLE_PTR( d ); double* buff_e = ( double* ) FLA_DOUBLE_PTR( e ); double* buff_l = ( double* ) FLA_DOUBLE_PTR( l ); double* buff_A = ( double* ) FLA_DOUBLE_PTR( A ); int* buff_isuppz = ( int* ) FLA_INT_PTR( isuppz ); double* buff_work = ( double* ) FLA_DOUBLE_PTR( work ); int* buff_iwork = ( int* ) FLA_INT_PTR( iwork ); F77_dstemr( &blas_jobz, &blas_range, &n_A, buff_d, buff_e, &vl, &vu, &il, &iu, &n_eig_found, buff_l, buff_A, &cs_A, &nzc, buff_isuppz, &try_rac, buff_work, &lwork, buff_iwork, &liwork, &info ); break; } case FLA_COMPLEX: { float* buff_d = ( float* ) FLA_FLOAT_PTR( d ); float* buff_e = ( float* ) FLA_FLOAT_PTR( e ); float* buff_l = ( float* ) FLA_FLOAT_PTR( l ); scomplex* buff_A = ( scomplex* ) FLA_COMPLEX_PTR( A ); int* buff_isuppz = ( int* ) FLA_INT_PTR( isuppz ); float* buff_work = ( float* ) FLA_FLOAT_PTR( work ); int* buff_iwork = ( int* ) FLA_INT_PTR( iwork ); F77_cstemr( &blas_jobz, &blas_range, &n_A, buff_d, buff_e, &vl, &vu, &il, &iu, &n_eig_found, buff_l, buff_A, &cs_A, &nzc, buff_isuppz, &try_rac, buff_work, &lwork, buff_iwork, &liwork, &info ); break; } case FLA_DOUBLE_COMPLEX: { double* buff_d = ( double* ) FLA_DOUBLE_PTR( d ); double* buff_e = ( double* ) FLA_DOUBLE_PTR( e ); double* buff_l = ( double* ) FLA_DOUBLE_PTR( l ); dcomplex* buff_A = ( dcomplex* ) FLA_DOUBLE_COMPLEX_PTR( A ); int* buff_isuppz = ( int* ) FLA_INT_PTR( isuppz ); double* buff_work = ( double* ) FLA_DOUBLE_PTR( work ); int* buff_iwork = ( int* ) FLA_INT_PTR( iwork ); F77_zstemr( &blas_jobz, &blas_range, &n_A, buff_d, buff_e, &vl, &vu, &il, &iu, &n_eig_found, buff_l, buff_A, &cs_A, &nzc, buff_isuppz, &try_rac, buff_work, &lwork, buff_iwork, &liwork, &info ); break; } } } FLA_Obj_free( &isuppz ); FLA_Obj_free( &work ); FLA_Obj_free( &iwork ); #else FLA_Check_error_code( FLA_EXTERNAL_LAPACK_NOT_IMPLEMENTED ); #endif return info; }
FLA_Error FLA_Svdd_external( FLA_Svd_type jobz, FLA_Obj A, FLA_Obj s, FLA_Obj U, FLA_Obj V ) { int info = 0; #ifdef FLA_ENABLE_EXTERNAL_LAPACK_INTERFACES FLA_Datatype datatype; FLA_Datatype dt_real; FLA_Datatype dt_int; int m_A, n_A, cs_A; int cs_U; int cs_V; int min_m_n; int lwork, lrwork, liwork; FLA_Obj work, rwork, iwork; char blas_jobz; int i; if ( FLA_Check_error_level() == FLA_FULL_ERROR_CHECKING ) FLA_Svdd_check( jobz, A, s, U, V ); if ( FLA_Obj_has_zero_dim( A ) ) return FLA_SUCCESS; datatype = FLA_Obj_datatype( A ); dt_real = FLA_Obj_datatype_proj_to_real( A ); dt_int = FLA_INT; m_A = FLA_Obj_length( A ); n_A = FLA_Obj_width( A ); cs_A = FLA_Obj_col_stride( A ); cs_U = FLA_Obj_col_stride( U ); cs_V = FLA_Obj_col_stride( V ); min_m_n = min( m_A, n_A ); // Allocate the rwork and iwork arrays up front. if ( jobz == FLA_SVD_VECTORS_NONE ) lrwork = 5 * min_m_n; else lrwork = 5 * min_m_n * min_m_n + 7 * min_m_n; liwork = 8 * min_m_n; FLA_Obj_create( dt_int, liwork, 1, 0, 0, &iwork ); if ( FLA_Obj_is_complex( A ) ) FLA_Obj_create( dt_real, lrwork, 1, 0, 0, &rwork ); FLA_Param_map_flame_to_netlib_svd_type( jobz, &blas_jobz ); // Make a workspace query the first time through. This will provide us with // and ideal workspace size based on an internal block size. lwork = -1; FLA_Obj_create( datatype, 1, 1, 0, 0, &work ); for ( i = 0; i < 2; ++i ) { if ( i == 1 ) { // Grab the queried ideal workspace size from the work array, free the // work object, and then re-allocate the workspace with the ideal size. if ( datatype == FLA_FLOAT || datatype == FLA_COMPLEX ) lwork = ( int ) *FLA_FLOAT_PTR( work ); else if ( datatype == FLA_DOUBLE || datatype == FLA_DOUBLE_COMPLEX ) lwork = ( int ) *FLA_DOUBLE_PTR( work ); FLA_Obj_free( &work ); FLA_Obj_create( datatype, lwork, 1, 0, 0, &work ); } switch( datatype ) { case FLA_FLOAT: { float* buff_A = ( float* ) FLA_FLOAT_PTR( A ); float* buff_s = ( float* ) FLA_FLOAT_PTR( s ); float* buff_U = ( float* ) FLA_FLOAT_PTR( U ); float* buff_V = ( float* ) FLA_FLOAT_PTR( V ); float* buff_work = ( float* ) FLA_FLOAT_PTR( work ); int* buff_iwork = ( int* ) FLA_INT_PTR( iwork ); F77_sgesdd( &blas_jobz, &m_A, &n_A, buff_A, &cs_A, buff_s, buff_U, &cs_U, buff_V, &cs_V, buff_work, &lwork, buff_iwork, &info ); break; } case FLA_DOUBLE: { double* buff_A = ( double* ) FLA_DOUBLE_PTR( A ); double* buff_s = ( double* ) FLA_DOUBLE_PTR( s ); double* buff_U = ( double* ) FLA_DOUBLE_PTR( U ); double* buff_V = ( double* ) FLA_DOUBLE_PTR( V ); double* buff_work = ( double* ) FLA_DOUBLE_PTR( work ); int* buff_iwork = ( int* ) FLA_INT_PTR( iwork ); F77_dgesdd( &blas_jobz, &m_A, &n_A, buff_A, &cs_A, buff_s, buff_U, &cs_U, buff_V, &cs_V, buff_work, &lwork, buff_iwork, &info ); break; } case FLA_COMPLEX: { scomplex* buff_A = ( scomplex* ) FLA_COMPLEX_PTR( A ); float* buff_s = ( float* ) FLA_FLOAT_PTR( s ); scomplex* buff_U = ( scomplex* ) FLA_COMPLEX_PTR( U ); scomplex* buff_V = ( scomplex* ) FLA_COMPLEX_PTR( V ); scomplex* buff_work = ( scomplex* ) FLA_COMPLEX_PTR( work ); float* buff_rwork = ( float* ) FLA_FLOAT_PTR( rwork ); int* buff_iwork = ( int* ) FLA_INT_PTR( iwork ); F77_cgesdd( &blas_jobz, &m_A, &n_A, buff_A, &cs_A, buff_s, buff_U, &cs_U, buff_V, &cs_V, buff_work, &lwork, buff_rwork, buff_iwork, &info ); break; } case FLA_DOUBLE_COMPLEX: { dcomplex* buff_A = ( dcomplex* ) FLA_DOUBLE_COMPLEX_PTR( A ); double* buff_s = ( double* ) FLA_DOUBLE_PTR( s ); dcomplex* buff_U = ( dcomplex* ) FLA_DOUBLE_COMPLEX_PTR( U ); dcomplex* buff_V = ( dcomplex* ) FLA_DOUBLE_COMPLEX_PTR( V ); dcomplex* buff_work = ( dcomplex* ) FLA_DOUBLE_COMPLEX_PTR( work ); double* buff_rwork = ( double* ) FLA_DOUBLE_PTR( rwork ); int* buff_iwork = ( int* ) FLA_INT_PTR( iwork ); F77_zgesdd( &blas_jobz, &m_A, &n_A, buff_A, &cs_A, buff_s, buff_U, &cs_U, buff_V, &cs_V, buff_work, &lwork, buff_rwork, buff_iwork, &info ); break; } } } FLA_Obj_free( &work ); FLA_Obj_free( &iwork ); if ( FLA_Obj_is_complex( A ) ) FLA_Obj_free( &rwork ); #else FLA_Check_error_code( FLA_EXTERNAL_LAPACK_NOT_IMPLEMENTED ); #endif return info; }
void libfla_test_eig_gest_experiment( test_params_t params, unsigned int var, char* sc_str, FLA_Datatype datatype, unsigned int p_cur, unsigned int pci, unsigned int n_repeats, signed int impl, double* perf, double* residual ) { dim_t b_flash = params.b_flash; dim_t b_alg_flat = params.b_alg_flat; double time_min = 1e9; double time; unsigned int i; unsigned int m; signed int m_input = -1; FLA_Uplo inv; FLA_Uplo uplo; FLA_Obj A, B, Y, norm; FLA_Obj A_save, B_save; FLA_Obj A_test, B_test, Y_test; // Determine the dimensions. if ( m_input < 0 ) m = p_cur / abs(m_input); else m = p_cur; // Translate parameter characters to libflame constants. FLA_Param_map_char_to_flame_inv( &pc_str[pci][0], &inv ); FLA_Param_map_char_to_flame_uplo( &pc_str[pci][1], &uplo ); if ( inv == FLA_NO_INVERSE && ( ( impl == FLA_TEST_FLAT_UNB_VAR && var == 3 ) || ( impl == FLA_TEST_FLAT_OPT_VAR && var == 3 ) || ( impl == FLA_TEST_FLAT_BLK_VAR && var == 3 ) ) ) { *perf = 0.0; *residual = 0.0; return; } // Create the matrices for the current operation. libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[0], m, m, &A ); libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], m, m, &Y ); libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[2], m, m, &B ); // Initialize the test matrices. FLA_Random_spd_matrix( uplo, A ); FLA_Scalr( uplo, FLA_TWO, A ); FLA_Hermitianize( uplo, A ); FLA_Random_spd_matrix( uplo, B ); FLA_Scalr( uplo, FLA_TWO, B ); FLA_Chol( uplo, B ); // Save the original object contents in a temporary object. FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &A_save ); FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, B, &B_save ); // Create a real scalar object to hold the norm of A. FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm ); // Use hierarchical matrices if we're testing the FLASH front-end. if ( impl == FLA_TEST_HIER_FRONT_END ) { FLASH_Obj_create_hier_copy_of_flat( A, 1, &b_flash, &A_test ); FLASH_Obj_create_hier_copy_of_flat( Y, 1, &b_flash, &Y_test ); FLASH_Obj_create_hier_copy_of_flat( B, 1, &b_flash, &B_test ); } else { A_test = A; Y_test = Y; B_test = B; } // Create a control tree for the individual variants. if ( impl == FLA_TEST_FLAT_UNB_VAR || impl == FLA_TEST_FLAT_OPT_VAR || impl == FLA_TEST_FLAT_BLK_VAR ) libfla_test_eig_gest_cntl_create( var, b_alg_flat ); // Repeat the experiment n_repeats times and record results. for ( i = 0; i < n_repeats; ++i ) { if ( impl == FLA_TEST_HIER_FRONT_END ) { FLASH_Obj_hierarchify( A_save, A_test ); FLASH_Obj_hierarchify( B_save, B_test ); } else { FLA_Copy_external( A_save, A_test ); FLA_Copy_external( B_save, B_test ); } time = FLA_Clock(); libfla_test_eig_gest_impl( impl, inv, uplo, A_test, Y_test, B_test ); time = FLA_Clock() - time; time_min = min( time_min, time ); } // Check our solution. if ( impl == FLA_TEST_HIER_FRONT_END ) { FLA_Trans trans_left, trans_right; FLASH_Hermitianize( uplo, A_test ); if ( ( inv == FLA_NO_INVERSE && uplo == FLA_LOWER_TRIANGULAR ) || ( inv == FLA_INVERSE && uplo == FLA_UPPER_TRIANGULAR ) ) { trans_left = FLA_CONJ_TRANSPOSE; trans_right = FLA_NO_TRANSPOSE; } else { trans_left = FLA_NO_TRANSPOSE; trans_right = FLA_CONJ_TRANSPOSE; } if ( inv == FLA_NO_INVERSE ) { FLASH_Trsm( FLA_LEFT, uplo, trans_left, FLA_NONUNIT_DIAG, FLA_ONE, B_test, A_test ); FLASH_Trsm( FLA_RIGHT, uplo, trans_right, FLA_NONUNIT_DIAG, FLA_ONE, B_test, A_test ); } else // if ( inv == FLA_INVERSE ) { FLASH_Trmm( FLA_LEFT, uplo, trans_left, FLA_NONUNIT_DIAG, FLA_ONE, B_test, A_test ); FLASH_Trmm( FLA_RIGHT, uplo, trans_right, FLA_NONUNIT_DIAG, FLA_ONE, B_test, A_test ); } FLASH_Obj_flatten( A_test, A ); } else { FLA_Trans trans_left, trans_right; FLA_Hermitianize( uplo, A_test ); if ( ( inv == FLA_NO_INVERSE && uplo == FLA_LOWER_TRIANGULAR ) || ( inv == FLA_INVERSE && uplo == FLA_UPPER_TRIANGULAR ) ) { trans_left = FLA_CONJ_TRANSPOSE; trans_right = FLA_NO_TRANSPOSE; } else { trans_left = FLA_NO_TRANSPOSE; trans_right = FLA_CONJ_TRANSPOSE; } if ( inv == FLA_NO_INVERSE ) { FLA_Trsm( FLA_LEFT, uplo, trans_left, FLA_NONUNIT_DIAG, FLA_ONE, B_test, A_test ); FLA_Trsm( FLA_RIGHT, uplo, trans_right, FLA_NONUNIT_DIAG, FLA_ONE, B_test, A_test ); } else // if ( inv == FLA_INVERSE ) { FLA_Trmm( FLA_LEFT, uplo, trans_left, FLA_NONUNIT_DIAG, FLA_ONE, B_test, A_test ); FLA_Trmm( FLA_RIGHT, uplo, trans_right, FLA_NONUNIT_DIAG, FLA_ONE, B_test, A_test ); } } // Free the hierarchical matrices if we're testing the FLASH front-end. if ( impl == FLA_TEST_HIER_FRONT_END ) { FLASH_Obj_free( &A_test ); FLASH_Obj_free( &Y_test ); FLASH_Obj_free( &B_test ); } // Free the control trees if we're testing the variants. if ( impl == FLA_TEST_FLAT_UNB_VAR || impl == FLA_TEST_FLAT_OPT_VAR || impl == FLA_TEST_FLAT_BLK_VAR ) libfla_test_eig_gest_cntl_free(); // Compute the performance of the best experiment repeat. *perf = 1.0 * m * m * m / time_min / FLOPS_PER_UNIT_PERF; if ( FLA_Obj_is_complex( A ) ) *perf *= 4.0; // Compute the residual. FLA_Axpy_external( FLA_MINUS_ONE, A_save, A ); FLA_Norm1( A, norm ); FLA_Obj_extract_real_scalar( norm, residual ); // Free the supporting flat objects. FLA_Obj_free( &norm ); FLA_Obj_free( &A_save ); FLA_Obj_free( &B_save ); // Free the flat test matrices. FLA_Obj_free( &A ); FLA_Obj_free( &Y ); FLA_Obj_free( &B ); }
void time_Apply_Q_UT_lnfc( int variant, int type, int n_repeats, int m, int n, int nb_alg, FLA_Obj A, FLA_Obj A_orig, FLA_Obj t, FLA_Obj T, FLA_Obj s, FLA_Obj S, FLA_Obj B, double *dtime, double *diff, double *gflops ) { int irep; double dtime_old = 1.0e9; FLA_Obj A_save, A_orig_save, B_save, norm; FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_save ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_orig_save ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, B, &B_save ); FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm ); FLA_Copy_external( A, A_save ); FLA_Copy_external( A, A_orig_save ); FLA_Copy_external( B, B_save ); for ( irep = 0 ; irep < n_repeats; irep++ ){ FLA_Copy_external( A_save, A ); FLA_Copy_external( A_orig_save, A_orig ); FLA_Copy_external( B_save, B ); *dtime = FLA_Clock(); switch( variant ) { case 0: REF_Apply_Q_UT_lnfc( A, t, B ); //REF_Bidiag_form_U_blk_external( FLA_LEFT, FLA_NO_TRANSPOSE, A, t, B ); //FLA_Bidiag_blk_external( A_orig, t, s ); //REF_Bidiag_form_U_blk_external( FLA_LEFT, FLA_NO_TRANSPOSE, A_orig, t, B ); break; case 1: { // Time variant 1 switch( type ){ case FLA_ALG_BLOCKED: //FLA_Apply_Q_UT( FLA_LEFT, FLA_NO_TRANSPOSE, FLA_FORWARD, FLA_COLUMNWISE, A, T, W, B ); FLA_QR_UT_form_Q( A, T, B ); //FLA_Bidiag_UT_form_U( A, T, B ); //FLA_Bidiag_UT( A_orig, T, S ); //FLA_Bidiag_UT_form_U( A_orig, T, B ); break; } break; } } *dtime = FLA_Clock() - *dtime; dtime_old = min( *dtime, dtime_old ); } /* if ( variant == 0 ) { FLA_Copy_external( b, b_ref ); if ( FLA_Obj_is_real( A ) ) FLA_Apply_Q_blk_external( FLA_LEFT, FLA_TRANSPOSE, FLA_COLUMNWISE, A, t, b ); else FLA_Apply_Q_blk_external( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_COLUMNWISE, A, t, b ); FLA_Trsm_external( FLA_LEFT, FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_ONE, A, b ); FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, A_save, b, FLA_ONE, b_ref ); FLA_Nrm2_external( b_ref, norm ); if ( FLA_Obj_is_single_precision( A ) ) *diff = *(FLA_FLOAT_PTR(norm)); else *diff = *(FLA_DOUBLE_PTR(norm)); } else */ { FLA_Obj_set_to_identity( A ); //FLA_Obj_show( "B", B, "%8.1e %8.1e ", "" ); FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, B, B, FLA_MINUS_ONE, A ); FLA_Norm_frob( A, norm ); FLA_Obj_extract_real_scalar( norm, diff ); } /* *gflops = 2.0 * n * n * ( m - n / 3.0 ) / dtime_old / 1e9; if ( FLA_Obj_is_complex( A ) ) *gflops *= 4.0; */ *gflops = ( 4.0 * ( 2.0 * m * n * n - 2.0 / 3.0 * n * n * n ) + 4.0 * ( 4.0 / 3.0 * m * m * m ) + 4.0 * ( 4.0 / 3.0 * n * n * n ) + ( 13.0 * 2 * m * m ) + 2.0 * ( 3.0 * 2 * m * m * m ) + 2.0 * ( 3.0 * 2 * n * n * n ) ) / dtime_old / 1e9; *dtime = dtime_old; FLA_Copy_external( A_save, A ); FLA_Copy_external( A_orig_save, A_orig ); FLA_Copy_external( B_save, B ); FLA_Obj_free( &A_save ); FLA_Obj_free( &A_orig_save ); FLA_Obj_free( &B_save ); FLA_Obj_free( &norm ); }
void time_Lyap_h( int variant, int type, int n_repeats, int m, int nb_alg, FLA_Obj isgn, FLA_Obj A, FLA_Obj C, FLA_Obj C_ref, FLA_Obj scale, double *dtime, double *diff, double *gflops ) { int irep; double dtime_old = 1.0e9; FLA_Obj C_save, norm; fla_blocksize_t* bp; fla_lyap_t* cntl_lyap_unb; fla_lyap_t* cntl_lyap_opt; fla_lyap_t* cntl_lyap_blk; if ( type == FLA_ALG_UNB_OPT && variant > 4 ) { *gflops = 0.0; *diff = 0.0; return; } bp = FLA_Blocksize_create( nb_alg, nb_alg, nb_alg, nb_alg ); cntl_lyap_unb = FLA_Cntl_lyap_obj_create( FLA_FLAT, FLA_UNB_VAR_OFFSET + variant, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL ); cntl_lyap_opt = FLA_Cntl_lyap_obj_create( FLA_FLAT, FLA_OPT_VAR_OFFSET + variant, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL ); cntl_lyap_blk = FLA_Cntl_lyap_obj_create( FLA_FLAT, FLA_BLK_VAR_OFFSET + variant, bp, fla_scal_cntl_blas, fla_lyap_cntl_leaf, fla_sylv_cntl, fla_gemm_cntl_blas, fla_gemm_cntl_blas, fla_hemm_cntl_blas, fla_her2k_cntl_blas ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &C_save ); FLA_Obj_create( FLA_Obj_datatype_proj_to_real( C ), 1, 1, 0, 0, &norm ); FLA_Copy_external( C, C_save ); for ( irep = 0 ; irep < n_repeats; irep++ ) { FLA_Copy_external( C_save, C ); *dtime = FLA_Clock(); switch( variant ) { case 0: REF_Lyap_h( isgn, A, C, scale ); break; case 1: { switch( type ) { case FLA_ALG_UNBLOCKED: FLA_Lyap_h_unb_var1( isgn, A, C ); break; case FLA_ALG_UNB_OPT: FLA_Lyap_h_opt_var1( isgn, A, C ); break; case FLA_ALG_BLOCKED: FLA_Lyap_h_blk_var1( isgn, A, C, scale, cntl_lyap_blk ); break; } break; } case 2: { switch( type ) { case FLA_ALG_UNBLOCKED: FLA_Lyap_h_unb_var2( isgn, A, C ); break; case FLA_ALG_UNB_OPT: FLA_Lyap_h_opt_var2( isgn, A, C ); break; case FLA_ALG_BLOCKED: FLA_Lyap_h_blk_var2( isgn, A, C, scale, cntl_lyap_blk ); break; } break; } case 3: { switch( type ) { case FLA_ALG_UNBLOCKED: FLA_Lyap_h_unb_var3( isgn, A, C ); break; case FLA_ALG_UNB_OPT: FLA_Lyap_h_opt_var3( isgn, A, C ); break; case FLA_ALG_BLOCKED: FLA_Lyap_h_blk_var3( isgn, A, C, scale, cntl_lyap_blk ); break; } break; } case 4: { switch( type ) { case FLA_ALG_UNBLOCKED: FLA_Lyap_h_unb_var4( isgn, A, C ); break; case FLA_ALG_UNB_OPT: FLA_Lyap_h_opt_var4( isgn, A, C ); break; case FLA_ALG_BLOCKED: FLA_Lyap_h_blk_var4( isgn, A, C, scale, cntl_lyap_blk ); break; } break; } } *dtime = FLA_Clock() - *dtime; dtime_old = min( *dtime, dtime_old ); } FLA_Blocksize_free( bp ); FLA_Cntl_obj_free( cntl_lyap_unb ); FLA_Cntl_obj_free( cntl_lyap_opt ); FLA_Cntl_obj_free( cntl_lyap_blk ); /* if ( variant == 0 ) { FLA_Copy_external( C, C_ref ); *diff = 0.0; } else { FLA_Hermitianize( FLA_UPPER_TRIANGULAR, C ); *diff = FLA_Max_elemwise_diff( C, C_ref ); } */ { FLA_Obj X, W; FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &X ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &W ); FLA_Copy( C, X ); FLA_Hermitianize( FLA_UPPER_TRIANGULAR, X ); FLA_Gemm( FLA_CONJ_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A, X, FLA_ZERO, W ); FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, X, A, FLA_ONE, W ); FLA_Scal( isgn, W ); /* if ( variant == 3 && type == FLA_ALG_UNBLOCKED ) { FLA_Obj_show( "W", W, "%10.3e + %10.3e ", "" ); FLA_Obj_show( "C_save", C_save, "%10.3e + %10.3e ", "" ); } */ FLA_Axpy( FLA_MINUS_ONE, C_save, W ); FLA_Norm1( W, norm ); FLA_Obj_extract_real_scalar( norm, diff ); FLA_Obj_free( &X ); FLA_Obj_free( &W ); } *gflops = ( 2.0 / 3.0 ) * ( m * m * m ) / dtime_old / 1e9; if ( FLA_Obj_is_complex( C ) ) *gflops *= 4.0; *dtime = dtime_old; FLA_Copy_external( C_save, C ); FLA_Obj_free( &C_save ); FLA_Obj_free( &norm ); }
void time_Bidiag_UT( int param_combo, int type, int nrepeats, int m, int n, FLA_Obj A, FLA_Obj tu, FLA_Obj tv, FLA_Obj TU, FLA_Obj TV, double *dtime, double *diff, double *gflops ) { int irep; double dtime_old = 1.0e9; FLA_Obj A_save, norm; FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_save ); FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm ); FLA_Copy_external( A, A_save ); for ( irep = 0 ; irep < nrepeats; irep++ ){ FLA_Copy_external( A_save, A ); *dtime = FLA_Clock(); switch( param_combo ){ case 0: { switch( type ) { case FLA_ALG_REFERENCE: REF_Bidiag_UT( A, tu, tv ); break; case FLA_ALG_FRONT: FLA_Bidiag_UT( A, TU, TV ); break; } break; } } *dtime = FLA_Clock() - *dtime; dtime_old = min( *dtime, dtime_old ); } { FLA_Obj AL, AR; FLA_Obj ATL, ATR, ABL, ABR; FLA_Obj QU; FLA_Obj QV, QVL, QVR; FLA_Obj E, EL, ER; FLA_Obj F; FLA_Obj WU, WV, eye; FLA_Obj tvT, tvB; dim_t m_A, n_A, m_TU; //FLA_Obj_show( "A_save", A_save, "%10.3e", "" ); m_A = FLA_Obj_length( A ); n_A = FLA_Obj_width( A ); m_TU = FLA_Obj_length( TU ); FLA_Obj_create( FLA_Obj_datatype( A ), m_A, m_A, 0, 0, &QU ); FLA_Obj_create( FLA_Obj_datatype( A ), n_A, n_A, 0, 0, &QV ); FLA_Obj_create( FLA_Obj_datatype( A ), m_TU, m_A, 0, 0, &WU ); FLA_Obj_create( FLA_Obj_datatype( A ), m_TU, n_A, 0, 0, &WV ); FLA_Set_to_identity( QU ); FLA_Set_to_identity( QV ); FLA_Part_1x2( QV, &QVL, &QVR, 1, FLA_LEFT ); FLA_Part_1x2( A, &AL, &AR, 1, FLA_LEFT ); FLA_Part_2x2( A, &ATL, &ATR, &ABL, &ABR, 1, 1, FLA_BL ); FLA_Part_2x1( tv, &tvT, &tvB, 1, FLA_BOTTOM ); if ( type == FLA_ALG_REFERENCE ) { if ( FLA_Obj_is_real( A ) ) FLA_Apply_Q_blk_external( FLA_LEFT, FLA_TRANSPOSE, FLA_COLUMNWISE, A, tu, QU ); else FLA_Apply_Q_blk_external( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_COLUMNWISE, A, tu, QU ); //FLA_Apply_Q_blk_external( FLA_RIGHT, FLA_NO_TRANSPOSE, FLA_ROWWISE, AR, tv, QVR ); // // Need to apply backwards transformation, since vectors are stored columnwise. // QL? RQ? // //FLA_Apply_Q_blk_external( FLA_RIGHT, FLA_NO_TRANSPOSE, FLA_ROWWISE, ATR, tvT, QVR ); //FLA_Apply_Q_blk_external( FLA_RIGHT, FLA_CONJ_TRANSPOSE, FLA_ROWWISE, ATR, tvT, QVR ); //FLA_Apply_Q_blk_external( FLA_RIGHT, FLA_CONJ_TRANSPOSE, FLA_ROWWISE, AR, tvT, QVR ); } else { FLA_Apply_Q_UT( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_FORWARD, FLA_COLUMNWISE, A, TU, WU, QU ); FLA_Apply_Q_UT( FLA_RIGHT, FLA_NO_TRANSPOSE, FLA_FORWARD, FLA_ROWWISE, AR, TV, WV, QVR ); } /* FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &eye ); FLA_Set_to_identity( eye ); //FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, // FLA_ONE, QV, QV, FLA_MINUS_ONE, eye ); FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, QU, QU, FLA_MINUS_ONE, eye ); FLA_Obj_show( "eye", eye, "%10.3e", "" ); FLA_Norm_frob( eye, norm ); FLA_Obj_extract_real_scalar( norm, diff ); FLA_Obj_free( &eye ); */ FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &E ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &F ); FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A_save, QV, FLA_ZERO, E ); FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, QU, E, FLA_ZERO, F ); //FLA_Obj_show( "A_save", A_save, "%10.3e", "" ); FLA_Copy( A, E ); FLA_Triangularize( FLA_UPPER_TRIANGULAR, FLA_NONUNIT_DIAG, E ); FLA_Part_1x2( E, &EL, &ER, 1, FLA_LEFT ); FLA_Triangularize( FLA_LOWER_TRIANGULAR, FLA_NONUNIT_DIAG, ER ); //FLA_Obj_show( "B", E, "%10.3e", "" ); //FLA_Obj_show( "Q'AV", F, "%10.3e", "" ); //FLA_Obj_show( "B", E, "%10.3e + %10.3e ", "" ); //FLA_Obj_show( "Q'AV", F, "%10.3e + %10.3e ", "" ); *diff = FLA_Max_elemwise_diff( E, F ); FLA_Obj_free( &E ); FLA_Obj_free( &F ); FLA_Obj_free( &QU ); FLA_Obj_free( &QV ); FLA_Obj_free( &WU ); FLA_Obj_free( &WV ); } *gflops = 4.0 * n * n * ( m - n / 3.0 ) / dtime_old / 1e9; if ( FLA_Obj_is_complex( A ) ) *gflops *= 4.0; *dtime = dtime_old; FLA_Copy_external( A_save, A ); FLA_Obj_free( &A_save ); FLA_Obj_free( &norm ); }
void time_Tevd_v( int variant, int type, int n_repeats, int m, int k_accum, int b_alg, int n_iter_max, FLA_Obj A_orig, FLA_Obj d, FLA_Obj e, FLA_Obj G, FLA_Obj R, FLA_Obj W, FLA_Obj A, FLA_Obj l, double *dtime, double *diff1, double* diff2, double *gflops ) { int irep; double k, dtime_old = 1.0e9; FLA_Obj A_save, G_save, d_save, e_save; if ( //( variant == 0 ) || //( variant == 1 && type == FLA_ALG_UNB_OPT ) || //( variant == 2 && type == FLA_ALG_UNB_OPT ) || FALSE ) { *dtime = 0.0; *gflops = 0.0; *diff1 = 0.0; *diff2 = 0.0; return; } FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_save ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, G, &G_save ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, d, &d_save ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, e, &e_save ); FLA_Copy_external( A, A_save ); FLA_Copy_external( G, G_save ); FLA_Copy_external( d, d_save ); FLA_Copy_external( e, e_save ); for ( irep = 0 ; irep < n_repeats; irep++ ){ FLA_Copy_external( A_save, A ); FLA_Copy_external( G_save, G ); FLA_Copy_external( d_save, d ); FLA_Copy_external( e_save, e ); *dtime = FLA_Clock(); switch( variant ){ case 0: REF_Tevd_v( d, e, A ); break; // Time variant 1 case 1: { switch( type ){ case FLA_ALG_UNB_OPT: FLA_Tevd_v_opt_var1( n_iter_max, d, e, G, A, b_alg ); break; } break; } // Time variant 2 case 2: { switch( type ){ case FLA_ALG_UNB_OPT: FLA_Tevd_v_opt_var2( n_iter_max, d, e, G, R, W, A, b_alg ); break; } break; } } *dtime = FLA_Clock() - *dtime; dtime_old = min( *dtime, dtime_old ); } { FLA_Obj V, A_rev_evd, norm, eye; FLA_Copy( d, l ); //FLA_Obj_show( "A_save", A_save, "%9.2e + %9.2e ", "" ); //FLA_Obj_show( "A_evd", A, "%9.2e + %9.2e ", "" ); FLA_Sort_evd( FLA_FORWARD, l, A ); FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &V ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_rev_evd ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &eye ); FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm ); FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, l, A ); FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, A, V, FLA_ZERO, A_rev_evd ); FLA_Triangularize( FLA_LOWER_TRIANGULAR, FLA_NONUNIT_DIAG, A_rev_evd ); /* FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A, D, FLA_ZERO, A_rev_evd ); FLA_Copy( A_rev_evd, D ); FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, D, V, FLA_ZERO, A_rev_evd ); FLA_Triangularize( FLA_LOWER_TRIANGULAR, FLA_NONUNIT_DIAG, A_rev_evd ); */ //FLA_Obj_show( "A_rev_evd", A_rev_evd, "%9.2e + %9.2e ", "" ); FLA_Axpy( FLA_MINUS_ONE, A_orig, A_rev_evd ); FLA_Norm_frob( A_rev_evd, norm ); FLA_Obj_extract_real_scalar( norm, diff1 ); //*diff = FLA_Max_elemwise_diff( A_orig, A_rev_evd ); FLA_Set_to_identity( eye ); FLA_Copy( V, A_rev_evd ); FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, V, A_rev_evd, FLA_MINUS_ONE, eye ); FLA_Norm_frob( eye, norm ); FLA_Obj_extract_real_scalar( norm, diff2 ); /* FLA_Obj_free( &EL ); FLA_Obj_free( &EU ); FLA_Obj_free( &D ); FLA_Obj_free( &dc ); FLA_Obj_free( &ec ); */ FLA_Obj_free( &V ); FLA_Obj_free( &A_rev_evd ); FLA_Obj_free( &eye ); FLA_Obj_free( &norm ); } k = 2.00; if ( FLA_Obj_is_complex( A ) ) { *gflops = ( ( 4.5 * k * m * m ) + 2.0 * ( 3.0 * k * m * m * m ) ) / dtime_old / 1e9; } else { *gflops = ( ( 4.5 * k * m * m ) + 1.0 * ( 3.0 * k * m * m * m ) ) / dtime_old / 1e9; } *dtime = dtime_old; FLA_Copy_external( A_save, A ); FLA_Copy_external( G_save, G ); FLA_Copy_external( d_save, d ); FLA_Copy_external( e_save, e ); FLA_Obj_free( &A_save ); FLA_Obj_free( &G_save ); FLA_Obj_free( &d_save ); FLA_Obj_free( &e_save ); }
int main(int argc, char *argv[]) { int m_input, m, p_first, p_last, p_inc, p, b_alg, variant, n_repeats, i, datatype, n_variants = 1; char *colors = "brkgmcbrkg"; char *ticks = "o+*xso+*xs"; char m_dim_desc[14]; char m_dim_tag[10]; double max_gflops=6.0; double safemin; double dtime, gflops, diff; FLA_Obj A, l, Q, T, W; FLA_Init(); fprintf( stdout, "%c number of repeats:", '%' ); scanf( "%d", &n_repeats ); fprintf( stdout, "%c %d\n", '%', n_repeats ); fprintf( stdout, "%c Enter blocking size:", '%' ); scanf( "%d", &b_alg ); fprintf( stdout, "%c %d\n", '%', b_alg ); fprintf( stdout, "%c enter problem size first, last, inc:", '%' ); scanf( "%d%d%d", &p_first, &p_last, &p_inc ); fprintf( stdout, "%c %d %d %d\n", '%', p_first, p_last, p_inc ); fprintf( stdout, "%c enter m (-1 means bind to problem size): ", '%' ); scanf( "%d", &m_input ); fprintf( stdout, "%c %d\n", '%', m_input ); fprintf( stdout, "\n" ); if ( m_input > 0 ) { sprintf( m_dim_desc, "m = %d", m_input ); sprintf( m_dim_tag, "m%dc", m_input); } else if( m_input < -1 ) { sprintf( m_dim_desc, "m = p/%d", -m_input ); sprintf( m_dim_tag, "m%dp", -m_input ); } else if( m_input == -1 ) { sprintf( m_dim_desc, "m = p" ); sprintf( m_dim_tag, "m%dp", 1 ); } /* char ch = 's'; safemin = dlamch_( &ch ); printf( "safemin = %23.15e\n", safemin ); ch = 'e'; double eps = dlamch_( &ch ); printf( "eps dla = %23.15e\n", eps ); printf( "eps fla = %23.15e\n", FLA_EPSILON_D ); */ for ( p = p_first, i = 1; p <= p_last; p += p_inc, i += 1 ) { m = m_input; if( m < 0 ) m = p / f2c_abs(m_input); //datatype = FLA_FLOAT; //datatype = FLA_DOUBLE; //datatype = FLA_COMPLEX; datatype = FLA_DOUBLE_COMPLEX; FLA_Obj_create( datatype, m, m, 0, 0, &A ); FLA_Obj_create( datatype, m, m, 0, 0, &Q ); FLA_Obj_create( datatype, 32, m, 0, 0, &T ); FLA_Obj_create( datatype, 32, m, 0, 0, &W ); FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), m, 1, 0, 0, &l ); //FLA_Random_herm_matrix( FLA_LOWER_TRIANGULAR, A ); //FLA_Random_spd_matrix( FLA_LOWER_TRIANGULAR, A ); FLA_Random_matrix( A ); FLA_Obj_set_to_identity( Q ); FLA_QR_UT( A, T ); FLA_Apply_Q_UT( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_FORWARD, FLA_COLUMNWISE, A, T, W, Q ); fill_eigenvalues( l ); //FLA_Obj_show( "eig", l, "%9.2e ", "" ); FLA_Apply_diag_matrix( FLA_LEFT, FLA_NO_CONJUGATE, l, Q ); FLA_Apply_Q_UT( FLA_LEFT, FLA_NO_TRANSPOSE, FLA_FORWARD, FLA_COLUMNWISE, A, T, W, Q ); FLA_Triangularize( FLA_LOWER_TRIANGULAR, FLA_NONUNIT_DIAG, Q ); FLA_Copy( Q, A ); time_Hevd_ln( 0, FLA_ALG_REFERENCE, n_repeats, m, b_alg, A, l, &dtime, &diff, &gflops ); fprintf( stdout, "data_REFs( %d, 1:2 ) = [ %d %6.3lf %6.2le ]; \n", i, p, gflops, diff ); fflush( stdout ); time_Hevd_ln( -1, FLA_ALG_REFERENCE, n_repeats, m, b_alg, A, l, &dtime, &diff, &gflops ); fprintf( stdout, "data_REFd( %d, 1:2 ) = [ %d %6.3lf %6.2le ]; \n", i, p, gflops, diff ); fflush( stdout ); for ( variant = 1; variant <= n_variants; variant++ ){ fprintf( stdout, "data_var%d( %d, 1:9 ) = [ %d ", variant, i, p ); fflush( stdout ); time_Hevd_ln( variant, FLA_ALG_UNBLOCKED, n_repeats, m, b_alg, A, l, &dtime, &diff, &gflops ); fprintf( stdout, "%6.3lf %6.2le ", gflops, diff ); fflush( stdout ); //time_Hevd_ln( variant, FLA_ALG_UNB_OPT, n_repeats, m, b_alg, // A, l, &dtime, &diff, &gflops ); //fprintf( stdout, "%6.3lf %6.2le ", gflops, diff ); //fflush( stdout ); fprintf( stdout, "];\n" ); fflush( stdout ); } fprintf( stdout, "\n" ); FLA_Obj_free( &A ); FLA_Obj_free( &T ); FLA_Obj_free( &W ); FLA_Obj_free( &Q ); FLA_Obj_free( &l ); } /* fprintf( stdout, "figure;\n" ); fprintf( stdout, "plot( data_REF( :,1 ), data_REF( :, 2 ), '-' ); \n" ); fprintf( stdout, "hold on;\n" ); for ( i = 1; i <= n_variants; i++ ) { fprintf( stdout, "plot( data_var%d( :,1 ), data_var%d( :, 2 ), '%c:%c' ); \n", i, i, colors[ i-1 ], ticks[ i-1 ] ); fprintf( stdout, "plot( data_var%d( :,1 ), data_var%d( :, 4 ), '%c-.%c' ); \n", i, i, colors[ i-1 ], ticks[ i-1 ] ); } fprintf( stdout, "legend( ... \n" ); fprintf( stdout, "'Reference', ... \n" ); for ( i = 1; i < n_variants; i++ ) fprintf( stdout, "'unb\\_var%d', 'blk\\_var%d', ... \n", i, i ); fprintf( stdout, "'unb\\_var%d', 'blk\\_var%d' ); \n", i, i ); fprintf( stdout, "xlabel( 'problem size p' );\n" ); fprintf( stdout, "ylabel( 'GFLOPS/sec.' );\n" ); fprintf( stdout, "axis( [ 0 %d 0 %.2f ] ); \n", p_last, max_gflops ); fprintf( stdout, "title( 'FLAME Hevd_ln performance (%s, %s)' );\n", m_dim_desc, n_dim_desc ); fprintf( stdout, "print -depsc tridiag_%s_%s.eps\n", m_dim_tag, n_dim_tag ); fprintf( stdout, "hold off;\n"); fflush( stdout ); */ FLA_Finalize( ); return 0; }
FLA_Error REF_Svdd_uv_components( FLA_Obj A, FLA_Obj s, FLA_Obj U, FLA_Obj V, double* dtime_bred, double* dtime_bsvd, double* dtime_appq, double* dtime_qrfa, double* dtime_gemm ) /* { *dtime_bred = 1; *dtime_bsvd = 1; *dtime_appq = 1; *dtime_qrfa = 1; *dtime_gemm = 1; return FLA_Svdd_external( FLA_SVD_VECTORS_ALL, A, s, U, V ); } */ { FLA_Datatype dt_A; FLA_Datatype dt_A_real; dim_t m_A, n_A; dim_t min_m_n; FLA_Obj tq, tu, tv, d, e, Ur, Vr, W; FLA_Obj eT, epsilonB; FLA_Uplo uplo = FLA_UPPER_TRIANGULAR; double crossover_ratio = 16.0 / 10.0; double dtime_temp; dt_A = FLA_Obj_datatype( A ); dt_A_real = FLA_Obj_datatype_proj_to_real( A ); m_A = FLA_Obj_length( A ); n_A = FLA_Obj_width( A ); min_m_n = FLA_Obj_min_dim( A ); FLA_Obj_create( dt_A, min_m_n, 1, 0, 0, &tq ); FLA_Obj_create( dt_A, min_m_n, 1, 0, 0, &tu ); FLA_Obj_create( dt_A, min_m_n, 1, 0, 0, &tv ); FLA_Obj_create( dt_A_real, min_m_n, 1, 0, 0, &d ); FLA_Obj_create( dt_A_real, min_m_n, 1, 0, 0, &e ); FLA_Obj_create( dt_A_real, n_A, n_A, 0, 0, &Ur ); FLA_Obj_create( dt_A_real, n_A, n_A, 0, 0, &Vr ); FLA_Part_2x1( e, &eT, &epsilonB, 1, FLA_BOTTOM ); if ( m_A >= n_A ) { if ( m_A < crossover_ratio * n_A ) { dtime_temp = FLA_Clock(); { // Reduce to bidiagonal form. FLA_Bidiag_blk_external( A, tu, tv ); FLA_Bidiag_UT_extract_diagonals( A, d, eT ); } *dtime_bred = FLA_Clock() - dtime_temp; dtime_temp = FLA_Clock(); { // Divide-and-conquor algorithm. FLA_Bsvdd_external( uplo, d, e, Ur, Vr ); } *dtime_bsvd = FLA_Clock() - dtime_temp; dtime_temp = FLA_Clock(); { // Form U. FLA_Copy_external( Ur, U ); FLA_Bidiag_apply_U_external( FLA_LEFT, FLA_NO_TRANSPOSE, A, tu, U ); // Form V. FLA_Copy_external( Vr, V ); FLA_Bidiag_apply_V_external( FLA_RIGHT, FLA_CONJ_TRANSPOSE, A, tv, V ); } *dtime_appq = FLA_Clock() - dtime_temp; *dtime_qrfa = 0.0; *dtime_gemm = 0.0; } else { FLA_Obj AT, AB; FLA_Obj UL, UR; FLA_Part_2x1( A, &AT, &AB, n_A, FLA_TOP ); FLA_Part_1x2( U, &UL, &UR, n_A, FLA_LEFT ); // Create a temporary n-by-n matrix R. FLA_Obj_create( dt_A, n_A, n_A, 0, 0, &W ); dtime_temp = FLA_Clock(); { // Perform a QR factorization. FLA_QR_blk_external( A, tq ); FLA_Copyr_external( FLA_LOWER_TRIANGULAR, A, UL ); FLA_Setr( FLA_LOWER_TRIANGULAR, FLA_ZERO, A ); } *dtime_qrfa = FLA_Clock() - dtime_temp; dtime_temp = FLA_Clock(); { // Form Q. FLA_QR_form_Q_external( U, tq ); } *dtime_appq = FLA_Clock() - dtime_temp; dtime_temp = FLA_Clock(); { // Reduce R to bidiagonal form. FLA_Bidiag_blk_external( AT, tu, tv ); FLA_Bidiag_UT_extract_diagonals( A, d, eT ); } *dtime_bred = FLA_Clock() - dtime_temp; dtime_temp = FLA_Clock(); { // Divide-and-conquor algorithm. FLA_Bsvdd_external( uplo, d, e, Ur, Vr ); } *dtime_bsvd = FLA_Clock() - dtime_temp; dtime_temp = FLA_Clock(); { // Form U in W. FLA_Copy_external( Ur, W ); FLA_Bidiag_apply_U_external( FLA_LEFT, FLA_NO_TRANSPOSE, AT, tu, W ); // Form V. FLA_Copy_external( Vr, V ); FLA_Bidiag_apply_V_external( FLA_RIGHT, FLA_CONJ_TRANSPOSE, AT, tv, V ); } *dtime_appq += FLA_Clock() - dtime_temp; dtime_temp = FLA_Clock(); { // Multiply R into U, storing the result in A and then copying // back to U. FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, UL, W, FLA_ZERO, A ); FLA_Copy( A, UL ); } *dtime_gemm = FLA_Clock() - dtime_temp; // Free R. FLA_Obj_free( &W ); } } else { FLA_Check_error_code( FLA_NOT_YET_IMPLEMENTED ); } // Copy singular values to output vector. FLA_Copy( d, s ); // Sort singular values and vectors. FLA_Sort_svd( FLA_BACKWARD, s, U, V ); FLA_Obj_free( &tq ); FLA_Obj_free( &tu ); FLA_Obj_free( &tv ); FLA_Obj_free( &d ); FLA_Obj_free( &e ); FLA_Obj_free( &Ur ); FLA_Obj_free( &Vr ); return FLA_SUCCESS; }
void time_Hess_UT( int variant, int type, int nrepeats, int m, FLA_Obj A, FLA_Obj A_ref, FLA_Obj t, FLA_Obj T, FLA_Obj W, double *dtime, double *diff, double *gflops ) { int irep; double dtime_old = 1.0e9; FLA_Obj A_save, norm; FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_save ); FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm ); FLA_Copy_external( A, A_save ); for ( irep = 0 ; irep < nrepeats; irep++ ){ FLA_Copy_external( A_save, A ); *dtime = FLA_Clock(); switch( variant ){ case 0:{ switch( type ){ case FLA_ALG_REFERENCE: REF_Hess_UT( A, t ); break; case FLA_ALG_FRONT: FLA_Hess_UT( A, T ); break; default: printf("trouble\n"); } break; } } *dtime = FLA_Clock() - *dtime; dtime_old = min( *dtime, dtime_old ); } //if ( type == FLA_ALG_REFERENCE ) //{ // ; //} //else { FLA_Obj AT, AB; FLA_Obj Q, QT, QB; FLA_Obj E, ET, EB; FLA_Obj F; dim_t m_A, m_T; m_A = FLA_Obj_length( A ); m_T = FLA_Obj_length( T ); FLA_Obj_create( FLA_Obj_datatype( A ), m_A, m_A, 0, 0, &Q ); FLA_Set_to_identity( Q ); FLA_Part_2x1( Q, &QT, &QB, 1, FLA_TOP ); FLA_Part_2x1( A, &AT, &AB, 1, FLA_TOP ); if ( type == FLA_ALG_REFERENCE ) { if ( FLA_Obj_is_real( A ) ) FLA_Apply_Q_blk_external( FLA_LEFT, FLA_TRANSPOSE, FLA_COLUMNWISE, AB, t, QB ); else FLA_Apply_Q_blk_external( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_COLUMNWISE, AB, t, QB ); } else FLA_Apply_Q_UT( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_FORWARD, FLA_COLUMNWISE, AB, T, W, QB ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &E ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &F ); FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, A_save, Q, FLA_ZERO, E ); FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, Q, E, FLA_ZERO, F ); FLA_Copy( A, E ); FLA_Part_2x1( E, &ET, &EB, 1, FLA_TOP ); FLA_Triangularize( FLA_UPPER_TRIANGULAR, FLA_NONUNIT_DIAG, EB ); *diff = FLA_Max_elemwise_diff( E, F ); FLA_Obj_free( &Q ); FLA_Obj_free( &E ); FLA_Obj_free( &F ); } *gflops = ( 10.0 / 3.0 * m * m * m ) / dtime_old / 1e9; if ( FLA_Obj_is_complex( A ) ) *gflops *= 4.0; *dtime = dtime_old; FLA_Copy_external( A_save, A ); FLA_Obj_free( &A_save ); FLA_Obj_free( &norm ); }
FLA_Error FLA_Svd_compute_scaling( FLA_Obj A, FLA_Obj sigma ) { FLA_Datatype dt_real; FLA_Obj norm; FLA_Obj safmin; FLA_Obj prec; FLA_Obj rmin; FLA_Obj rmax; if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING ) FLA_Svd_compute_scaling_check( A, sigma ); dt_real = FLA_Obj_datatype_proj_to_real( A ); FLA_Obj_create( dt_real, 1, 1, 0, 0, &norm ); FLA_Obj_create( dt_real, 1, 1, 0, 0, &prec ); FLA_Obj_create( dt_real, 1, 1, 0, 0, &safmin ); FLA_Obj_create( dt_real, 1, 1, 0, 0, &rmin ); FLA_Obj_create( dt_real, 1, 1, 0, 0, &rmax ); // Query safmin, precision. FLA_Mach_params( FLA_MACH_PREC, prec ); FLA_Mach_params( FLA_MACH_SFMIN, safmin ); //FLA_Obj_show( "safmin", safmin, "%20.12e", "" ); //FLA_Obj_show( "prec", prec, "%20.12e", "" ); // rmin = sqrt( safmin ) / prec; FLA_Copy( safmin, rmin ); FLA_Sqrt( rmin ); FLA_Inv_scal( prec, rmin ); // rmax = 1 / rmin; FLA_Copy( rmin, rmax ); FLA_Invert( FLA_NO_CONJUGATE, rmax ); //FLA_Obj_show( "rmin", rmin, "%20.12e", "" ); //FLA_Obj_show( "rmax", rmax, "%20.12e", "" ); // Find the maximum absolute value of A. FLA_Max_abs_value( A, norm ); if ( FLA_Obj_gt( norm, FLA_ZERO ) && FLA_Obj_lt( norm, rmin ) ) { // sigma = rmin / norm; FLA_Copy( rmin, sigma ); FLA_Inv_scal( norm, sigma ); } else if ( FLA_Obj_gt( norm, rmax ) ) { // sigma = rmax / norm; FLA_Copy( rmax, sigma ); FLA_Inv_scal( norm, sigma ); } else { // sigma = 1.0; FLA_Copy( FLA_ONE, sigma ); } FLA_Obj_free( &norm ); FLA_Obj_free( &prec ); FLA_Obj_free( &safmin ); FLA_Obj_free( &rmin ); FLA_Obj_free( &rmax ); return FLA_SUCCESS; }
FLA_Error FLA_Fill_with_logarithmic_dist( FLA_Obj alpha, FLA_Obj x ) { FLA_Obj lT, l0, lB, lambda1, l2; FLA_Obj l, k, alpha2; FLA_Datatype dt_real; dim_t n_x; if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING ) FLA_Fill_with_logarithmic_dist_check( alpha, x ); dt_real = FLA_Obj_datatype_proj_to_real( x ); n_x = FLA_Obj_vector_dim( x ); // Create a local counter to increment as we create the distribution. FLA_Obj_create( dt_real, 1, 1, 0, 0, &k ); // Create a local vector l. We will work with this vector, which is // the same length as x, so that we can use vertical partitioning. FLA_Obj_create( dt_real, n_x, 1, 0, 0, &l ); // Create a local real scalar alpha2 of the same precision as // alpha. Then copy alpha to alpha2, which will convert the // complex value to real, if necessary (ie: if alpha is complex). FLA_Obj_create( dt_real, 1, 1, 0, 0, &alpha2 ); FLA_Copy( alpha, alpha2 ); // Initialize k to 0. FLA_Set( FLA_ZERO, k ); FLA_Part_2x1( l, &lT, &lB, 0, FLA_TOP ); while ( FLA_Obj_length( lB ) > 0 ) { FLA_Repart_2x1_to_3x1( lT, &l0, /* ** */ /* ******* */ &lambda1, lB, &l2, 1, FLA_BOTTOM ); /*------------------------------------------------------------*/ // lambda1 = alpha^k; FLA_Pow( alpha2, k, lambda1 ); // k = k + 1; FLA_Mult_add( FLA_ONE, FLA_ONE, k ); /*------------------------------------------------------------*/ FLA_Cont_with_3x1_to_2x1( &lT, l0, lambda1, /* ** */ /* ******* */ &lB, l2, FLA_TOP ); } // Normalize by last element. FLA_Part_2x1( l, &lT, &lB, 1, FLA_BOTTOM ); FLA_Inv_scal( lB, l ); // Overwrite x with the distribution we created in l. FLA_Copy( l, x ); FLA_Obj_free( &l ); FLA_Obj_free( &k ); FLA_Obj_free( &alpha2 ); return FLA_SUCCESS; }
FLA_Error FLA_Svd_uv_var2_components( dim_t n_iter_max, dim_t k_accum, dim_t b_alg, FLA_Obj A, FLA_Obj s, FLA_Obj U, FLA_Obj V, double* dtime_bred, double* dtime_bsvd, double* dtime_appq, double* dtime_qrfa, double* dtime_gemm ) { FLA_Error r_val = FLA_SUCCESS; FLA_Datatype dt; FLA_Datatype dt_real; FLA_Datatype dt_comp; FLA_Obj T, S, rL, rR, d, e, G, H, RG, RH, W; dim_t m_A, n_A; dim_t min_m_n; dim_t n_GH; double crossover_ratio = 17.0 / 9.0; double dtime_temp; n_GH = k_accum; m_A = FLA_Obj_length( A ); n_A = FLA_Obj_width( A ); min_m_n = FLA_Obj_min_dim( A ); dt = FLA_Obj_datatype( A ); dt_real = FLA_Obj_datatype_proj_to_real( A ); dt_comp = FLA_Obj_datatype_proj_to_complex( A ); // If the matrix is a scalar, then the SVD is easy. if ( min_m_n == 1 ) { FLA_Copy( A, s ); FLA_Set_to_identity( U ); FLA_Set_to_identity( V ); return FLA_SUCCESS; } // Create matrices to hold block Householder transformations. FLA_Bidiag_UT_create_T( A, &T, &S ); // Create vectors to hold the realifying scalars. FLA_Obj_create( dt, min_m_n, 1, 0, 0, &rL ); FLA_Obj_create( dt, min_m_n, 1, 0, 0, &rR ); // Create vectors to hold the diagonal and sub-diagonal. FLA_Obj_create( dt_real, min_m_n, 1, 0, 0, &d ); FLA_Obj_create( dt_real, min_m_n-1, 1, 0, 0, &e ); // Create matrices to hold the left and right Givens scalars. FLA_Obj_create( dt_comp, min_m_n-1, n_GH, 0, 0, &G ); FLA_Obj_create( dt_comp, min_m_n-1, n_GH, 0, 0, &H ); // Create matrices to hold the left and right Givens matrices. FLA_Obj_create( dt_real, min_m_n, min_m_n, 0, 0, &RG ); FLA_Obj_create( dt_real, min_m_n, min_m_n, 0, 0, &RH ); FLA_Obj_create( dt, m_A, n_A, 0, 0, &W ); if ( m_A >= n_A ) { if ( m_A < crossover_ratio * n_A ) { dtime_temp = FLA_Clock(); { // Reduce the matrix to bidiagonal form. // Apply scalars to rotate elements on the sub-diagonal to the real domain. // Extract the diagonal and sub-diagonal from A. FLA_Bidiag_UT( A, T, S ); FLA_Bidiag_UT_realify( A, rL, rR ); FLA_Bidiag_UT_extract_diagonals( A, d, e ); } *dtime_bred = FLA_Clock() - dtime_temp; dtime_temp = FLA_Clock(); { // Form U and V. FLA_Bidiag_UT_form_U( A, T, U ); FLA_Bidiag_UT_form_V( A, S, V ); } *dtime_appq = FLA_Clock() - dtime_temp; // Apply the realifying scalars in rL and rR to U and V, respectively. { FLA_Obj UL, UR; FLA_Obj VL, VR; FLA_Part_1x2( U, &UL, &UR, min_m_n, FLA_LEFT ); FLA_Part_1x2( V, &VL, &VR, min_m_n, FLA_LEFT ); FLA_Apply_diag_matrix( FLA_RIGHT, FLA_CONJUGATE, rL, UL ); FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, rR, VL ); } dtime_temp = FLA_Clock(); { // Perform a singular value decomposition on the bidiagonal matrix. r_val = FLA_Bsvd_v_opt_var2( n_iter_max, d, e, G, H, RG, RH, W, U, V, b_alg ); } *dtime_bsvd = FLA_Clock() - dtime_temp; } else // if ( crossover_ratio * n_A <= m_A ) { FLA_Obj TQ, R; FLA_Obj AT, AB; FLA_Obj UL, UR; //FLA_QR_UT_create_T( A, &TQ ); FLA_Obj_create( dt, 32, n_A, 0, 0, &TQ ); dtime_temp = FLA_Clock(); { // Perform a QR factorization on A and form Q in U. FLA_QR_UT( A, TQ ); } *dtime_qrfa = FLA_Clock() - dtime_temp; dtime_temp = FLA_Clock(); { FLA_QR_UT_form_Q( A, TQ, U ); } *dtime_appq = FLA_Clock() - dtime_temp; FLA_Obj_free( &TQ ); // Set the lower triangle of R to zero and then copy the upper // triangle of A to R. FLA_Part_2x1( A, &AT, &AB, n_A, FLA_TOP ); FLA_Obj_create( dt, n_A, n_A, 0, 0, &R ); FLA_Setr( FLA_LOWER_TRIANGULAR, FLA_ZERO, R ); FLA_Copyr( FLA_UPPER_TRIANGULAR, AT, R ); dtime_temp = FLA_Clock(); { // Reduce the matrix to bidiagonal form. // Apply scalars to rotate elements on the superdiagonal to the real domain. // Extract the diagonal and superdiagonal from A. FLA_Bidiag_UT( R, T, S ); FLA_Bidiag_UT_realify( R, rL, rR ); FLA_Bidiag_UT_extract_diagonals( R, d, e ); } *dtime_bred = FLA_Clock() - dtime_temp; dtime_temp = FLA_Clock(); { // Form V from right Householder vectors in upper triangle of R. FLA_Bidiag_UT_form_V( R, S, V ); // Form U in R. FLA_Bidiag_UT_form_U( R, T, R ); } *dtime_appq += FLA_Clock() - dtime_temp; // Apply the realifying scalars in rL and rR to U and V, respectively. FLA_Apply_diag_matrix( FLA_RIGHT, FLA_CONJUGATE, rL, R ); FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, rR, V ); dtime_temp = FLA_Clock(); { // Perform a singular value decomposition on the bidiagonal matrix. r_val = FLA_Bsvd_v_opt_var2( n_iter_max, d, e, G, H, RG, RH, W, R, V, b_alg ); } *dtime_bsvd = FLA_Clock() - dtime_temp; dtime_temp = FLA_Clock(); { // Multiply R into U, storing the result in A and then copying back // to U. FLA_Part_1x2( U, &UL, &UR, n_A, FLA_LEFT ); FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, UL, R, FLA_ZERO, A ); FLA_Copy( A, UL ); } *dtime_gemm = FLA_Clock() - dtime_temp; FLA_Obj_free( &R ); } } else // if ( m_A < n_A ) { FLA_Check_error_code( FLA_NOT_YET_IMPLEMENTED ); } // Copy the converged eigenvalues to the output vector. FLA_Copy( d, s ); // Sort the singular values and singular vectors in descending order. FLA_Sort_svd( FLA_BACKWARD, s, U, V ); FLA_Obj_free( &T ); FLA_Obj_free( &S ); FLA_Obj_free( &rL ); FLA_Obj_free( &rR ); FLA_Obj_free( &d ); FLA_Obj_free( &e ); FLA_Obj_free( &G ); FLA_Obj_free( &H ); FLA_Obj_free( &RG ); FLA_Obj_free( &RH ); FLA_Obj_free( &W ); return r_val; }
void libfla_test_apqut_experiment( test_params_t params, unsigned int var, char* sc_str, FLA_Datatype datatype, unsigned int p_cur, unsigned int pci, unsigned int n_repeats, signed int impl, double* perf, double* residual ) { dim_t b_flash = params.b_flash; dim_t b_alg_flat = params.b_alg_flat; double time_min = 1e9; double time; unsigned int i; unsigned int m, n; unsigned int min_m_n; signed int m_input; signed int n_input; FLA_Side side; FLA_Trans trans; FLA_Direct direct; FLA_Store storev; FLA_Obj A, T, W, B, eye, norm; FLA_Obj B_save; FLA_Obj A_test, T_test, W_test, B_test; // Translate parameter characters to libflame constants. FLA_Param_map_char_to_flame_side( &pc_str[pci][0], &side ); FLA_Param_map_char_to_flame_trans( &pc_str[pci][1], &trans ); FLA_Param_map_char_to_flame_direct( &pc_str[pci][2], &direct ); FLA_Param_map_char_to_flame_storev( &pc_str[pci][3], &storev ); // We want to make sure the Apply_Q_UT routines work with rectangular // matrices. So we use m > n when testing with column-wise storage (via // QR factorization) and m < n when testing with row-wise storage (via // LQ factorization). if ( storev == FLA_COLUMNWISE ) { m_input = -1; n_input = -1; //m_input = -1; //n_input = -1; } else // if ( storev == FLA_ROWWISE ) { m_input = -1; n_input = -1; //m_input = -1; //n_input = -1; } // Determine the dimensions. if ( m_input < 0 ) m = p_cur * abs(m_input); else m = p_cur; if ( n_input < 0 ) n = p_cur * abs(n_input); else n = p_cur; // Compute the minimum dimension. min_m_n = min( m, n ); // Create the matrices for the current operation. libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[0], m, n, &A ); libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], b_alg_flat, min_m_n, &T ); if ( storev == FLA_COLUMNWISE ) libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[2], m, m, &B ); else libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[2], n, n, &B ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, B, &eye ); FLA_Apply_Q_UT_create_workspace( T, B, &W ); // Create a real scalar object to hold the norm of A. FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm ); // Initialize the test matrices. FLA_Random_matrix( A ); FLA_Set_to_identity( B ); FLA_Set_to_identity( eye ); // Save the original object contents in a temporary object. FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, B, &B_save ); // Use hierarchical matrices if we're testing the FLASH front-end. if ( impl == FLA_TEST_HIER_FRONT_END ) { if ( storev == FLA_COLUMNWISE ) FLASH_QR_UT_create_hier_matrices( A, 1, &b_flash, &A_test, &T_test ); else // if ( storev == FLA_ROWWISE ) FLASH_LQ_UT_create_hier_matrices( A, 1, &b_flash, &A_test, &T_test ); FLASH_Obj_create_hier_copy_of_flat( B, 1, &b_flash, &B_test ); FLASH_Apply_Q_UT_create_workspace( T_test, B_test, &W_test ); } else // if ( impl == FLA_TEST_FLAT_FRONT_END ) { A_test = A; T_test = T; W_test = W; B_test = B; } // Compute a Householder factorization. if ( impl == FLA_TEST_HIER_FRONT_END ) { if ( storev == FLA_COLUMNWISE ) FLASH_QR_UT( A_test, T_test ); else FLASH_LQ_UT( A_test, T_test ); } else // if ( impl == FLA_TEST_FLAT_FRONT_END ) { if ( storev == FLA_COLUMNWISE ) FLA_QR_UT( A_test, T_test ); else FLA_LQ_UT( A_test, T_test ); } // Repeat the experiment n_repeats times and record results. for ( i = 0; i < n_repeats; ++i ) { if ( impl == FLA_TEST_HIER_FRONT_END ) FLASH_Obj_hierarchify( B_save, B_test ); else FLA_Copy_external( B_save, B_test ); time = FLA_Clock(); libfla_test_apqut_impl( impl, side, trans, direct, storev, A_test, T_test, W_test, B_test ); time = FLA_Clock() - time; time_min = min( time_min, time ); } // Multiply by its conjugate-transpose to get what should be (near) identity // and then subtract from actual identity to get what should be (near) zero. if ( impl == FLA_TEST_HIER_FRONT_END ) { FLASH_Obj_flatten( B_test, B ); FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, B, B, FLA_MINUS_ONE, eye ); } else // if ( impl == FLA_TEST_FLAT_FRONT_END ) { FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, B, B, FLA_MINUS_ONE, eye ); } // Free the hierarchical matrices if we're testing the FLASH front-end. if ( impl == FLA_TEST_HIER_FRONT_END ) { FLASH_Obj_free( &A_test ); FLASH_Obj_free( &T_test ); FLASH_Obj_free( &W_test ); FLASH_Obj_free( &B_test ); } // Compute the norm of eye, which contains I - Q * Q'. FLA_Norm1( eye, norm ); FLA_Obj_extract_real_scalar( norm, residual ); // Compute the performance of the best experiment repeat. *perf = ( 4.0 * m * min_m_n * n - 2.0 * min_m_n * min_m_n * n ) / time_min / FLOPS_PER_UNIT_PERF; if ( FLA_Obj_is_complex( A ) ) *perf *= 4.0; // Free the supporting flat objects. FLA_Obj_free( &B_save ); // Free the flat test matrices. FLA_Obj_free( &A ); FLA_Obj_free( &T ); FLA_Obj_free( &W ); FLA_Obj_free( &B ); FLA_Obj_free( &eye ); FLA_Obj_free( &norm ); }
int main(int argc, char *argv[]) { int m_input, m, p_first, p_last, p_inc, p, n_repeats, param_combo, i, n_param_combos = N_PARAM_COMBOS; dim_t b_flash; dim_t n_threads; FLA_Datatype datatype; FLA_Uplo uplo; FLA_Inv inv; char *colors = "brkgmcbrkg"; char *ticks = "o+*xso+*xs"; char m_dim_desc[14]; char m_dim_tag[10]; double max_gflops=6.0; double dtime, gflops, diff; FLA_Obj A, B, norm; FLA_Init(); fprintf( stdout, "%c number of repeats: ", '%' ); scanf( "%d", &n_repeats ); fprintf( stdout, "%c %d\n", '%', n_repeats ); fprintf( stdout, "%c enter FLASH blocksize: ", '%' ); scanf( "%u", &b_flash ); fprintf( stdout, "%c %u\n", '%', b_flash ); fprintf( stdout, "%c enter problem size first, last, inc: ", '%' ); scanf( "%d%d%d", &p_first, &p_last, &p_inc ); fprintf( stdout, "%c %d %d %d\n", '%', p_first, p_last, p_inc ); fprintf( stdout, "%c enter m (-1 means bind to problem size): ", '%' ); scanf( "%d", &m_input ); fprintf( stdout, "%c %d\n", '%', m_input ); fprintf( stdout, "%c enter the number of SuperMatrix threads: ", '%' ); scanf( "%d", &n_threads ); fprintf( stdout, "%c %d\n", '%', n_threads ); fprintf( stdout, "\n" ); if ( m_input > 0 ) { sprintf( m_dim_desc, "m = %d", m_input ); sprintf( m_dim_tag, "m%dc", m_input); } else if( m_input < -1 ) { sprintf( m_dim_desc, "m = p/%d", -m_input ); sprintf( m_dim_tag, "m%dp", -m_input ); } else if( m_input == -1 ) { sprintf( m_dim_desc, "m = p" ); sprintf( m_dim_tag, "m%dp", 1 ); } //datatype = FLA_FLOAT; //datatype = FLA_DOUBLE; //datatype = FLA_COMPLEX; datatype = FLA_DOUBLE_COMPLEX; FLASH_Queue_set_num_threads( n_threads ); for ( p = p_first, i = 1; p <= p_last; p += p_inc, i += 1 ) { m = m_input; if( m < 0 ) m = p / abs(m_input); for ( param_combo = 0; param_combo < n_param_combos; param_combo++ ){ if ( pc_str[param_combo][0] == 'i' ) inv = FLA_INVERSE; else inv = FLA_NO_INVERSE; if ( pc_str[param_combo][1] == 'l' ) uplo = FLA_LOWER_TRIANGULAR; else uplo = FLA_UPPER_TRIANGULAR; FLASH_Obj_create( datatype, m, m, 1, &b_flash, &A ); FLASH_Obj_create( datatype, m, m, 1, &b_flash, &B ); FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm ); FLASH_Random_spd_matrix( uplo, A ); FLASH_Hermitianize( uplo, A ); FLASH_Random_spd_matrix( uplo, B ); FLASH_Chol( uplo, B ); fprintf( stdout, "data_eig_gest_%s( %d, 1:3 ) = [ %d ", pc_str[param_combo], i, p ); fflush( stdout ); time_Eig_gest( param_combo, FLA_ALG_FRONT, n_repeats, m, inv, uplo, A, B, norm, &dtime, &diff, &gflops ); fprintf( stdout, "%6.3lf %6.2le ", gflops, diff ); fflush( stdout ); fprintf( stdout, " ]; \n" ); fflush( stdout ); FLASH_Obj_free( &A ); FLASH_Obj_free( &B ); FLA_Obj_free( &norm ); } fprintf( stdout, "\n" ); } /* fprintf( stdout, "figure;\n" ); fprintf( stdout, "hold on;\n" ); for ( i = 0; i < n_param_combos; i++ ) { fprintf( stdout, "plot( data_eig_gest_%s( :,1 ), data_eig_gest_%s( :, 2 ), '%c:%c' ); \n", pc_str[i], pc_str[i], colors[ i ], ticks[ i ] ); fprintf( stdout, "plot( data_eig_gest_%s( :,1 ), data_eig_gest_%s( :, 4 ), '%c-.%c' ); \n", pc_str[i], pc_str[i], colors[ i ], ticks[ i ] ); } fprintf( stdout, "legend( ... \n" ); for ( i = 0; i < n_param_combos; i++ ) fprintf( stdout, "'ref\\_eig_gest\\_%s', 'fla\\_eig_gest\\_%s', ... \n", pc_str[i], pc_str[i] ); fprintf( stdout, "'Location', 'SouthEast' ); \n" ); fprintf( stdout, "xlabel( 'problem size p' );\n" ); fprintf( stdout, "ylabel( 'GFLOPS/sec.' );\n" ); fprintf( stdout, "axis( [ 0 %d 0 %.2f ] ); \n", p_last, max_gflops ); fprintf( stdout, "title( 'FLAME eig_gest front-end performance (%s)' );\n", m_dim_desc ); fprintf( stdout, "print -depsc eig_gest_front_%s.eps\n", m_dim_tag ); fprintf( stdout, "hold off;\n"); fflush( stdout ); */ FLA_Finalize(); return 0; }
void time_Hevd_lv_components( int variant, int type, int n_repeats, int m, int n_iter_max, int k_accum, int b_alg, FLA_Obj A, FLA_Obj l, double* dtime, double* diff1, double* diff2, double* gflops, double* dtime_tred, double* gflops_tred, double* dtime_tevd, double* gflops_tevd, double* dtime_appq, double* gflops_appq, int* k_perf ) { int i; double k; double dtime_save = 1.0e9; double dtime_tred_save = 1.0e9; double dtime_tevd_save = 1.0e9; double dtime_appq_save = 1.0e9; double flops_tred; double flops_tevd; double flops_appq; double mult_tred; double mult_tevd; double mult_appq; FLA_Obj A_save, Z; if ( ( variant == -3 ) || ( variant == -4 ) || ( variant == -5 ) || //( variant == 0 ) || //( variant == -1 ) || //( variant == -2 ) || //( variant == 1 ) || //( variant == 2 ) || //( variant == 3 ) || //( variant == 4 ) || FALSE ) { *gflops = 0.0; *dtime = 0.0; *diff1 = 0.0; *diff2 = 0.0; *dtime_tred = 0.0; *dtime_tevd = 0.0; *dtime_appq = 0.0; *gflops_tred = 0.0; *gflops_tevd = 0.0; *gflops_appq = 0.0; *k_perf = 0; return; } FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_save ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &Z ); FLA_Copy_external( A, A_save ); for ( i = 0 ; i < n_repeats; i++ ){ FLA_Copy_external( A_save, A ); *dtime = FLA_Clock(); switch( variant ){ case -3: { *k_perf = 0; REF_Hevd_lv( A, l, dtime_tred, dtime_tevd, dtime_appq ); break; } case -4: { *k_perf = 0; REF_Hevdd_lv( A, l, dtime_tred, dtime_tevd, dtime_appq ); break; } case -5: { *k_perf = 0; REF_Hevdr_lv( A, l, Z, dtime_tred, dtime_tevd, dtime_appq ); break; } case 0: { *k_perf = 0; REF_Hevd_lv_components( A, l, dtime_tred, dtime_tevd, dtime_appq ); break; } case -1: { *k_perf = 0; REF_Hevdd_lv_components( A, l, dtime_tred, dtime_tevd, dtime_appq ); break; } case -2: { *k_perf = 0; REF_Hevdr_lv_components( A, l, Z, dtime_tred, dtime_tevd, dtime_appq ); break; } // Time variant 1 case 1: { *k_perf = FLA_Hevd_lv_var1_components( n_iter_max, A, l, k_accum, b_alg, dtime_tred, dtime_tevd, dtime_appq ); break; } // Time variant 2 case 2: { *k_perf = FLA_Hevd_lv_var2_components( n_iter_max, A, l, k_accum, b_alg, dtime_tred, dtime_tevd, dtime_appq ); break; } } *dtime = FLA_Clock() - *dtime; if ( *dtime < dtime_save ) { dtime_save = *dtime; dtime_tred_save = *dtime_tred; dtime_tevd_save = *dtime_tevd; dtime_appq_save = *dtime_appq; } } *dtime = dtime_save; *dtime_tred = dtime_tred_save; *dtime_tevd = dtime_tevd_save; *dtime_appq = dtime_appq_save; //if ( variant == -3 || variant == 0 ) //printf( "\ndtime is %9.3e\n", *dtime ); { FLA_Obj V, A_rev_evd, norm, eye; if ( variant == -2 || variant == -5 ) FLA_Copy( Z, A ); FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &V ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_rev_evd ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &eye ); FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm ); FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, l, A ); FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, A, V, FLA_ZERO, A_rev_evd ); FLA_Triangularize( FLA_LOWER_TRIANGULAR, FLA_NONUNIT_DIAG, A_rev_evd ); //FLA_Obj_show( "A_rev_evd", A_rev_evd, "%9.2e + %9.2e ", "" ); FLA_Axpy( FLA_MINUS_ONE, A_save, A_rev_evd ); FLA_Norm_frob( A_rev_evd, norm ); FLA_Obj_extract_real_scalar( norm, diff1 ); FLA_Set_to_identity( eye ); FLA_Copy( V, A_rev_evd ); FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, V, A_rev_evd, FLA_MINUS_ONE, eye ); FLA_Norm_frob( eye, norm ); FLA_Obj_extract_real_scalar( norm, diff2 ); FLA_Obj_free( &V ); FLA_Obj_free( &A_rev_evd ); FLA_Obj_free( &eye ); FLA_Obj_free( &norm ); } k = 2.00; flops_tred = ( ( 4.0 / 3.0 ) * m * m * m ); flops_tevd = ( 4.5 * k * m * m + 3.0 * k * m * m * m ); if ( variant == -1 || variant == -2 || variant == -4 || variant == -5 ) flops_appq = ( 2.0 * m * m * m ); else flops_appq = ( 4.0 / 3.0 * m * m * m ); /* if ( FLA_Obj_is_complex( A ) ) { *gflops = ( 4.0 * flops_tred + 2.0 * flops_tevd + 4.0 * flops_appq ) / *dtime / 1e9; *gflops_tred = ( 4.0 * flops_tred ) / *dtime_tred / 1e9; *gflops_tevd = ( 2.0 * flops_tevd ) / *dtime_tevd / 1e9; *gflops_appq = ( 4.0 * flops_appq ) / *dtime_appq / 1e9; } else { *gflops = ( 1.0 * flops_tred + 1.0 * flops_tevd + 1.0 * flops_appq ) / *dtime / 1e9; *gflops_tred = ( 1.0 * flops_tred ) / *dtime_tred / 1e9; *gflops_tevd = ( 1.0 * flops_tevd ) / *dtime_tevd / 1e9; *gflops_appq = ( 1.0 * flops_appq ) / *dtime_appq / 1e9; } */ if ( FLA_Obj_is_complex( A ) ) { mult_tred = 4.0; mult_tevd = 2.0; mult_appq = 4.0; } else { mult_tred = 1.0; mult_tevd = 1.0; mult_appq = 1.0; } *gflops = ( mult_tred * flops_tred + mult_tevd * flops_tevd + mult_appq * flops_appq ) / *dtime / 1e9; *gflops_tred = ( mult_tred * flops_tred ) / *dtime_tred / 1e9; *gflops_tevd = ( mult_tevd * flops_tevd ) / *dtime_tevd / 1e9; *gflops_appq = ( mult_appq * flops_appq ) / *dtime_appq / 1e9; FLA_Copy_external( A_save, A ); FLA_Obj_free( &A_save ); FLA_Obj_free( &Z ); }
FLA_Error FLA_Hevd_lv_var4_components( dim_t n_iter_max, FLA_Obj A, FLA_Obj l, dim_t k_accum, dim_t b_alg, double* dtime_tred, double* dtime_tevd, double* dtime_appq ) { FLA_Error r_val = FLA_SUCCESS; FLA_Uplo uplo = FLA_LOWER_TRIANGULAR; FLA_Datatype dt; FLA_Datatype dt_real; FLA_Datatype dt_comp; FLA_Obj T, r, d, e, G, R, W; FLA_Obj d0, e0, ls, pu; dim_t mn_A; dim_t n_G = k_accum; double dtime_temp; mn_A = FLA_Obj_length( A ); dt = FLA_Obj_datatype( A ); dt_real = FLA_Obj_datatype_proj_to_real( A ); dt_comp = FLA_Obj_datatype_proj_to_complex( A ); *dtime_tred = 1; *dtime_tevd = 1; *dtime_appq = 1; // If the matrix is a scalar, then the EVD is easy. if ( mn_A == 1 ) { FLA_Copy( A, l ); FLA_Set( FLA_ONE, A ); return FLA_SUCCESS; } // Create a matrix to hold block Householder transformations. FLA_Tridiag_UT_create_T( A, &T ); // Create a vector to hold the realifying scalars. FLA_Obj_create( dt, mn_A, 1, 0, 0, &r ); // Create vectors to hold the diagonal and sub-diagonal. FLA_Obj_create( dt_real, mn_A, 1, 0, 0, &d ); FLA_Obj_create( dt_real, mn_A-1, 1, 0, 0, &e ); FLA_Obj_create( dt_real, mn_A, 1, 0, 0, &d0 ); FLA_Obj_create( dt_real, mn_A-1, 1, 0, 0, &e0 ); FLA_Obj_create( dt_real, mn_A, 1, 0, 0, &pu ); FLA_Obj_create( FLA_INT, mn_A, 1, 0, 0, &ls ); FLA_Obj_create( dt_comp, mn_A-1, n_G, 0, 0, &G ); FLA_Obj_create( dt_real, mn_A, mn_A, 0, 0, &R ); FLA_Obj_create( dt, mn_A, mn_A, 0, 0, &W ); dtime_temp = FLA_Clock(); { // Reduce the matrix to tridiagonal form. FLA_Tridiag_UT( uplo, A, T ); } *dtime_tred = FLA_Clock() - dtime_temp; // Apply scalars to rotate elements on the sub-diagonal to the real domain. FLA_Tridiag_UT_realify( uplo, A, r ); // Extract the diagonal and sub-diagonal from A. FLA_Tridiag_UT_extract_diagonals( uplo, A, d, e ); dtime_temp = FLA_Clock(); { // Form Q, overwriting A. FLA_Tridiag_UT_form_Q( uplo, A, T ); } *dtime_appq = FLA_Clock() - dtime_temp; // Apply the scalars in r to Q. FLA_Apply_diag_matrix( FLA_RIGHT, FLA_CONJUGATE, r, A ); // Find the eigenvalues only. FLA_Copy( d, d0 ); FLA_Copy( e, e0 ); //r_val = FLA_Tevd_n_opt_var1( n_iter_max, d0, e0, G, A ); { int info; double* buff_d = FLA_DOUBLE_PTR( d0 ); double* buff_e = FLA_DOUBLE_PTR( e0 ); dsterf_( &mn_A, buff_d, buff_e, &info ); } FLA_Sort( FLA_FORWARD, d0 ); FLA_Set( FLA_ZERO, ls ); FLA_Set( FLA_ZERO, pu ); dtime_temp = FLA_Clock(); { // Perform an eigenvalue decomposition on the tridiagonal matrix. r_val = FLA_Tevd_v_opt_var4( n_iter_max, d, e, d0, ls, pu, G, R, W, A, b_alg ); } *dtime_tevd = FLA_Clock() - dtime_temp; // Copy the converged eigenvalues to the output vector. FLA_Copy( d, l ); // Sort the eigenvalues and eigenvectors in ascending order. FLA_Sort_evd( FLA_FORWARD, l, A ); FLA_Obj_free( &T ); FLA_Obj_free( &r ); FLA_Obj_free( &d ); FLA_Obj_free( &e ); FLA_Obj_free( &d0 ); FLA_Obj_free( &pu ); FLA_Obj_free( &e0 ); FLA_Obj_free( &ls ); FLA_Obj_free( &G ); FLA_Obj_free( &R ); FLA_Obj_free( &W ); return r_val; }
int main(int argc, char *argv[]) { int datatype, m_input, m, p_first, p_last, p_inc, p, n_repeats, param_combo, i, j, n_param_combos = N_PARAM_COMBOS; int sign; char *colors = "brkgmcbrkg"; char *ticks = "o+*xso+*xs"; char m_dim_desc[14]; char n_dim_desc[14]; char m_dim_tag[10]; char n_dim_tag[10]; double max_gflops=6.0; double dtime, gflops, diff; FLA_Obj A, C, C_ref, scale, isgn, norm; FLA_Init(); fprintf( stdout, "%c number of repeats:", '%' ); scanf( "%d", &n_repeats ); fprintf( stdout, "%c %d\n", '%', n_repeats ); fprintf( stdout, "%c Enter sign (-1 or 1):", '%' ); scanf( "%d", &sign ); fprintf( stdout, "%c %d\n", '%', sign ); fprintf( stdout, "%c enter problem size first, last, inc:", '%' ); scanf( "%d%d%d", &p_first, &p_last, &p_inc ); fprintf( stdout, "%c %d %d %d\n", '%', p_first, p_last, p_inc ); fprintf( stdout, "%c enter m (-1 means bind to problem size): ", '%' ); scanf( "%d", &m_input ); fprintf( stdout, "%c %d\n", '%', m_input ); fprintf( stdout, "\n" ); if ( m_input > 0 ) { sprintf( m_dim_desc, "m = %d", m_input ); sprintf( m_dim_tag, "m%dc", m_input); } else if( m_input < -1 ) { sprintf( m_dim_desc, "m = p/%d", -m_input ); sprintf( m_dim_tag, "m%dp", -m_input ); } else if( m_input == -1 ) { sprintf( m_dim_desc, "m = p" ); sprintf( m_dim_tag, "m%dp", 1 ); } if ( 0 < sign ) isgn = FLA_ONE; else isgn = FLA_MINUS_ONE; //datatype = FLA_FLOAT; datatype = FLA_DOUBLE; //datatype = FLA_COMPLEX; //datatype = FLA_DOUBLE_COMPLEX; for ( p = p_first, i = 1; p <= p_last; p += p_inc, i += 1 ) { m = m_input; if( m < 0 ) m = p / abs(m_input); for ( param_combo = 0; param_combo < n_param_combos; param_combo++ ){ FLA_Obj_create( datatype, m, m, 0, 0, &A ); FLA_Obj_create( datatype, m, m, 0, 0, &C ); FLA_Obj_create( datatype, m, m, 0, 0, &C_ref ); FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &scale ); FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm ); FLA_Random_tri_matrix( FLA_UPPER_TRIANGULAR, FLA_NONUNIT_DIAG, A ); FLA_Triangularize( FLA_UPPER_TRIANGULAR, FLA_NONUNIT_DIAG, A ); FLA_Norm1( A, norm ); FLA_Shift_diag( FLA_NO_CONJUGATE, norm, A ); FLA_Random_matrix( C ); FLA_Hermitianize( FLA_UPPER_TRIANGULAR, C ); fprintf( stdout, "data_lyap_%s( %d, 1:5 ) = [ %d ", pc_str[param_combo], i, p ); fflush( stdout ); time_Lyap( param_combo, FLA_ALG_REFERENCE, n_repeats, m, isgn, A, C, C_ref, scale, &dtime, &diff, &gflops ); fprintf( stdout, "%6.3lf %6.2le ", gflops, diff ); fflush( stdout ); time_Lyap( param_combo, FLA_ALG_FRONT, n_repeats, m, isgn, A, C, C_ref, scale, &dtime, &diff, &gflops ); fprintf( stdout, "%6.3lf %6.2le ", gflops, diff ); fflush( stdout ); fprintf( stdout, " ]; \n" ); fflush( stdout ); FLA_Obj_free( &A ); FLA_Obj_free( &C ); FLA_Obj_free( &C_ref ); FLA_Obj_free( &scale ); FLA_Obj_free( &norm ); } fprintf( stdout, "\n" ); } /* fprintf( stdout, "figure;\n" ); fprintf( stdout, "hold on;\n" ); for ( i = 0; i < n_param_combos; i++ ) { fprintf( stdout, "plot( data_lyap_%s( :,1 ), data_lyap_%s( :, 2 ), '%c:%c' ); \n", pc_str[i], pc_str[i], colors[ i ], ticks[ i ] ); fprintf( stdout, "plot( data_lyap_%s( :,1 ), data_lyap_%s( :, 4 ), '%c-.%c' ); \n", pc_str[i], pc_str[i], colors[ i ], ticks[ i ] ); } fprintf( stdout, "legend( ... \n" ); for ( i = 0; i < n_param_combos; i++ ) fprintf( stdout, "'ref\\_lyap\\_%s', 'fla\\_lyap\\_%s', ... \n", pc_str[i], pc_str[i] ); fprintf( stdout, "'Location', 'SouthEast' ); \n" ); fprintf( stdout, "xlabel( 'problem size p' );\n" ); fprintf( stdout, "ylabel( 'GFLOPS/sec.' );\n" ); fprintf( stdout, "axis( [ 0 %d 0 %.2f ] ); \n", p_last, max_gflops ); fprintf( stdout, "title( 'FLAME lyap front-end performance (%s)' );\n", m_dim_desc ); fprintf( stdout, "print -depsc lyap_front_%s.eps\n", m_dim_tag ); fprintf( stdout, "hold off;\n"); fflush( stdout ); */ FLA_Finalize( ); return 0; }
void time_Lyap( int param_combo, int type, int nrepeats, int m, FLA_Obj isgn, FLA_Obj A, FLA_Obj C, FLA_Obj scale, double *dtime, double *diff, double *gflops ) { int irep; double dtime_old = 1.0e9; FLA_Obj C_save, norm; if ( param_combo == 0 && type == FLA_ALG_FRONT ) { *gflops = 0.0; *diff = 0.0; return; } FLASH_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &C_save ); FLA_Obj_create( FLA_Obj_datatype_proj_to_real( C ), 1, 1, 0, 0, &norm ); FLASH_Copy( C, C_save ); for ( irep = 0 ; irep < nrepeats; irep++ ) { FLASH_Copy( C_save, C ); *dtime = FLA_Clock(); switch( param_combo ){ case 0:{ switch( type ){ //case FLA_ALG_REFERENCE: // REF_Lyap( FLA_NO_TRANSPOSE, isgn, A_flat, C_flat, scale ); // break; case FLA_ALG_FRONT: FLASH_Lyap( FLA_NO_TRANSPOSE, isgn, A, C, scale ); break; default: printf("trouble\n"); } break; } case 1:{ switch( type ){ //case FLA_ALG_REFERENCE: // REF_Lyap( FLA_CONJ_TRANSPOSE, isgn, A_flat, C_flat, scale ); // break; case FLA_ALG_FRONT: FLASH_Lyap( FLA_CONJ_TRANSPOSE, isgn, A, C, scale ); break; default: printf("trouble\n"); } break; } } *dtime = FLA_Clock() - *dtime; dtime_old = min( *dtime, dtime_old ); } /* if ( type == FLA_ALG_REFERENCE ) { FLASH_Obj_hierarchify( C_flat, C_ref ); *diff = 0.0; } else { *diff = FLASH_Max_elemwise_diff( C, C_ref ); } */ { FLA_Obj X, W; FLASH_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &X ); FLASH_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &W ); FLASH_Copy( C, X ); FLASH_Hermitianize( FLA_UPPER_TRIANGULAR, X ); if ( param_combo == 0 ) { FLASH_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A, X, FLA_ZERO, W ); FLASH_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, X, A, FLA_ONE, W ); } else if ( param_combo == 1 ) { FLASH_Gemm( FLA_CONJ_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A, X, FLA_ZERO, W ); FLASH_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, X, A, FLA_ONE, W ); } FLASH_Scal( isgn, W ); FLASH_Axpy( FLA_MINUS_ONE, C_save, W ); FLASH_Norm1( W, norm ); FLA_Obj_extract_real_scalar( norm, diff ); FLASH_Obj_free( &X ); FLASH_Obj_free( &W ); } *gflops = ( 2.0 / 3.0 ) * ( m * m * m ) / dtime_old / 1e9; if ( FLA_Obj_is_complex( C ) ) *gflops *= 4.0; *dtime = dtime_old; FLASH_Copy( C_save, C ); FLASH_Obj_free( &C_save ); FLA_Obj_free( &norm ); }
void libfla_test_hessut_experiment( test_params_t params, unsigned int var, char* sc_str, FLA_Datatype datatype, unsigned int p_cur, unsigned int pci, unsigned int n_repeats, signed int impl, double* perf, double* residual ) { dim_t b_alg_flat = params.b_alg_flat; double time_min = 1e9; double time; unsigned int i; unsigned int m; signed int m_input = -1; FLA_Obj A, T, W, Qh, AQ, QhAQ, norm; FLA_Obj AT, AB; FLA_Obj QhT, QhB; FLA_Obj A_save; // Determine the dimensions. if ( m_input < 0 ) m = p_cur * abs(m_input); else m = p_cur; // Create the matrices for the current operation. libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[0], m, m, &A ); if ( impl == FLA_TEST_FLAT_FRONT_END || impl == FLA_TEST_FLAT_BLK_VAR ) { libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], b_alg_flat, m, &T ); libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], b_alg_flat, m, &W ); } else { libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], m, m, &T ); libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], m, m, &W ); } // Initialize the test matrices. FLA_Random_matrix( A ); // Save the original object contents in a temporary object. FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &A_save ); // Create auxiliary matrices to be used when checking the result. FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &Qh ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &AQ ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &QhAQ ); // Create a real scalar object to hold the norm of A. FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm ); // Create a control tree for the individual variants. if ( impl == FLA_TEST_FLAT_UNB_VAR || impl == FLA_TEST_FLAT_OPT_VAR || impl == FLA_TEST_FLAT_BLK_VAR ) libfla_test_hessut_cntl_create( var, b_alg_flat ); // Repeat the experiment n_repeats times and record results. for ( i = 0; i < n_repeats; ++i ) { FLA_Copy_external( A_save, A ); time = FLA_Clock(); libfla_test_hessut_impl( impl, A, T ); time = FLA_Clock() - time; time_min = min( time_min, time ); } // Free the control trees if we're testing the variants. if ( impl == FLA_TEST_FLAT_UNB_VAR || impl == FLA_TEST_FLAT_OPT_VAR || impl == FLA_TEST_FLAT_BLK_VAR ) libfla_test_hessut_cntl_free(); // Compute the performance of the best experiment repeat. *perf = ( 10.0 / 3.0 * m * m * m ) / time_min / FLOPS_PER_UNIT_PERF; if ( FLA_Obj_is_complex( A ) ) *perf *= 4.0; // Check the result by computing R - Q' A_orig Q. FLA_Set_to_identity( Qh ); FLA_Part_2x1( Qh, &QhT, &QhB, 1, FLA_TOP ); FLA_Part_2x1( A, &AT, &AB, 1, FLA_TOP ); FLA_Apply_Q_UT( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_FORWARD, FLA_COLUMNWISE, AB, T, W, QhB ); FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, A_save, Qh, FLA_ZERO, AQ ); FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, Qh, AQ, FLA_ZERO, QhAQ ); FLA_Triangularize( FLA_UPPER_TRIANGULAR, FLA_NONUNIT_DIAG, AB ); *residual = FLA_Max_elemwise_diff( A, QhAQ ); // Free the supporting flat objects. FLA_Obj_free( &W ); FLA_Obj_free( &Qh ); FLA_Obj_free( &AQ ); FLA_Obj_free( &QhAQ ); FLA_Obj_free( &norm ); FLA_Obj_free( &A_save ); // Free the flat test matrices. FLA_Obj_free( &A ); FLA_Obj_free( &T ); }