FLA_Error FLA_Obj_create_buffer_task( dim_t rs, dim_t cs, FLA_Obj obj, void* cntl ) { FLA_Error r_val; r_val = FLA_Obj_create_buffer( rs, cs, &obj ); FLA_Set( FLA_ZERO, obj ); return r_val; }
FLA_Error FLA_Set_to_identity( FLA_Obj A ) { if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING ) FLA_Set_to_identity_check( A ); FLA_Set( FLA_ZERO, A ); FLA_Set_diag( FLA_ONE, A ); return FLA_SUCCESS; }
FLA_Error FLA_Hess_UT_blk_var4( FLA_Obj A, FLA_Obj T ) { FLA_Obj ATL, ATR, A00, A01, A02, ABL, ABR, A10, A11, A12, A20, A21, A22; FLA_Obj UT, U0, UB, U1, U2; FLA_Obj YT, Y0, YB, Y1, Y2; FLA_Obj ZT, Z0, ZB, Z1, Z2; FLA_Obj TL, TR, T0, T1, T2; FLA_Obj U, Y, Z; FLA_Obj ABR_l; FLA_Obj UB_l, U2_l; FLA_Obj YB_l, Y2_l; FLA_Obj ZB_l, Z2_l; FLA_Obj WT_l; FLA_Obj T1_tl; FLA_Obj none, none2, none3; FLA_Obj UB_tl, UB_bl; FLA_Datatype datatype_A; dim_t m_A; dim_t b_alg, b, bb; b_alg = FLA_Obj_length( T ); datatype_A = FLA_Obj_datatype( A ); m_A = FLA_Obj_length( A ); FLA_Obj_create( datatype_A, m_A, b_alg, 0, 0, &U ); FLA_Obj_create( datatype_A, m_A, b_alg, 0, 0, &Y ); FLA_Obj_create( datatype_A, m_A, b_alg, 0, 0, &Z ); FLA_Part_2x2( A, &ATL, &ATR, &ABL, &ABR, 0, 0, FLA_TL ); FLA_Part_2x1( U, &UT, &UB, 0, FLA_TOP ); FLA_Part_2x1( Y, &YT, &YB, 0, FLA_TOP ); FLA_Part_2x1( Z, &ZT, &ZB, 0, FLA_TOP ); FLA_Part_1x2( T, &TL, &TR, 0, FLA_LEFT ); while ( FLA_Obj_length( ATL ) < FLA_Obj_length( A ) ) { b = min( FLA_Obj_length( ABR ), b_alg ); FLA_Repart_2x2_to_3x3( ATL, /**/ ATR, &A00, /**/ &A01, &A02, /* ************* */ /* ******************** */ &A10, /**/ &A11, &A12, ABL, /**/ ABR, &A20, /**/ &A21, &A22, b, b, FLA_BR ); FLA_Repart_2x1_to_3x1( UT, &U0, /* ** */ /* ** */ &U1, UB, &U2, b, FLA_BOTTOM ); FLA_Repart_2x1_to_3x1( YT, &Y0, /* ** */ /* ** */ &Y1, YB, &Y2, b, FLA_BOTTOM ); FLA_Repart_2x1_to_3x1( ZT, &Z0, /* ** */ /* ** */ &Z1, ZB, &Z2, b, FLA_BOTTOM ); FLA_Repart_1x2_to_1x3( TL, /**/ TR, &T0, /**/ &T1, &T2, b, FLA_RIGHT ); /*------------------------------------------------------------*/ FLA_Part_2x2( T1, &T1_tl, &none, &none2, &none3, b, b, FLA_TL ); bb = min( FLA_Obj_length( ABR ) - 1, b_alg ); FLA_Part_1x2( ABR, &ABR_l, &none, bb, FLA_LEFT ); FLA_Part_1x2( UB, &UB_l, &none, bb, FLA_LEFT ); FLA_Part_1x2( YB, &YB_l, &none, bb, FLA_LEFT ); FLA_Part_1x2( ZB, &ZB_l, &none, bb, FLA_LEFT ); FLA_Part_2x1( UB_l, &none, &U2_l, b, FLA_TOP ); FLA_Part_2x1( YB_l, &none, &Y2_l, b, FLA_TOP ); FLA_Part_2x1( ZB_l, &none, &Z2_l, b, FLA_TOP ); // [ ABR, YB, ZB, T1 ] = FLA_Hess_UT_step_unb_var4( ABR, YB, ZB, T1, b ); //FLA_Hess_UT_step_unb_var4( ABR, YB, ZB, T1_tl ); //FLA_Hess_UT_step_ofu_var4( ABR, YB, ZB, T1_tl ); FLA_Hess_UT_step_opt_var4( ABR, YB, ZB, T1_tl ); // Build UB from ABR, with explicit unit subdiagonal and zeros. FLA_Copy_external( ABR_l, UB_l ); FLA_Part_2x1( UB_l, &UB_tl, &UB_bl, 1, FLA_TOP ); FLA_Triangularize( FLA_LOWER_TRIANGULAR, FLA_UNIT_DIAG, UB_bl ); FLA_Set( FLA_ZERO, UB_tl ); // ATR = ATR - ATR * UB * inv( triu( T ) ) * UB' ); if ( FLA_Obj_length( ATR ) > 0 ) { // NOTE: We use ZT as temporary workspace. FLA_Part_1x2( ZT, &WT_l, &none, bb, FLA_LEFT ); FLA_Part_2x2( T1, &T1_tl, &none, &none2, &none3, bb, bb, FLA_TL ); // WT_l = ATR * UB_l * inv( triu( T ) ). FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, ATR, UB_l, FLA_ZERO, WT_l ); FLA_Trsm_external( FLA_RIGHT, FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_ONE, T1_tl, WT_l ); // ATR = ATR - WT_l * UB_l' FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_MINUS_ONE, WT_l, UB_l, FLA_ONE, ATR ); } // A22 = A22 - U2 * Y2' - Z2 * U2'; FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_MINUS_ONE, U2_l, Y2_l, FLA_ONE, A22 ); FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_MINUS_ONE, Z2_l, U2_l, FLA_ONE, A22 ); /*------------------------------------------------------------*/ FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR, A00, A01, /**/ A02, A10, A11, /**/ A12, /* ************** */ /* ****************** */ &ABL, /**/ &ABR, A20, A21, /**/ A22, FLA_TL ); FLA_Cont_with_3x1_to_2x1( &UT, U0, U1, /* ** */ /* ** */ &UB, U2, FLA_TOP ); FLA_Cont_with_3x1_to_2x1( &YT, Y0, Y1, /* ** */ /* ** */ &YB, Y2, FLA_TOP ); FLA_Cont_with_3x1_to_2x1( &ZT, Z0, Z1, /* ** */ /* ** */ &ZB, Z2, FLA_TOP ); FLA_Cont_with_1x3_to_1x2( &TL, /**/ &TR, T0, T1, /**/ T2, FLA_LEFT ); } FLA_Obj_free( &U ); FLA_Obj_free( &Y ); FLA_Obj_free( &Z ); return FLA_SUCCESS; }
void libfla_test_symm_experiment( test_params_t params, unsigned int var, char* sc_str, FLA_Datatype datatype, unsigned int p_cur, unsigned int pci, unsigned int n_repeats, signed int impl, double* perf, double* residual ) { dim_t b_flash = params.b_flash; dim_t b_alg_flat = params.b_alg_flat; double time_min = 1e9; double time; unsigned int i; unsigned int m; signed int m_input = -1; unsigned int n; signed int n_input = -1; FLA_Side side; FLA_Uplo uplo; FLA_Obj A, B, C, x, y, z, w, norm; FLA_Obj alpha, beta; FLA_Obj C_save; FLA_Obj A_test, B_test, C_test; // Determine the dimensions. if ( m_input < 0 ) m = p_cur / abs(m_input); else m = p_cur; if ( n_input < 0 ) n = p_cur / abs(n_input); else n = p_cur; // Translate parameter characters to libflame constants. FLA_Param_map_char_to_flame_side( &pc_str[pci][0], &side ); FLA_Param_map_char_to_flame_uplo( &pc_str[pci][1], &uplo ); // Create the matrices for the current operation. if ( side == FLA_LEFT ) { libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[0], m, m, &A ); // Create vectors for use in test. FLA_Obj_create( datatype, n, 1, 0, 0, &x ); FLA_Obj_create( datatype, m, 1, 0, 0, &y ); FLA_Obj_create( datatype, m, 1, 0, 0, &z ); FLA_Obj_create( datatype, m, 1, 0, 0, &w ); } else { libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[0], n, n, &A ); // Create vectors for use in test. FLA_Obj_create( datatype, n, 1, 0, 0, &x ); FLA_Obj_create( datatype, m, 1, 0, 0, &y ); FLA_Obj_create( datatype, m, 1, 0, 0, &z ); FLA_Obj_create( datatype, n, 1, 0, 0, &w ); } libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], m, n, &B ); libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[2], m, n, &C ); // Create a norm scalar. FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm ); // Initialize the test matrices. FLA_Random_symm_matrix( uplo, A ); FLA_Random_matrix( B ); FLA_Random_matrix( C ); // Initialize the test vectors. FLA_Random_matrix( x ); FLA_Set( FLA_ZERO, y ); FLA_Set( FLA_ZERO, z ); FLA_Set( FLA_ZERO, w ); // Set constants. alpha = FLA_TWO; beta = FLA_MINUS_ONE; // Save the original object contents in a temporary object. FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, C, &C_save ); // Use hierarchical matrices if we're testing the FLASH front-end. if ( impl == FLA_TEST_HIER_FRONT_END ) { FLASH_Obj_create_hier_copy_of_flat( A, 1, &b_flash, &A_test ); FLASH_Obj_create_hier_copy_of_flat( B, 1, &b_flash, &B_test ); FLASH_Obj_create_hier_copy_of_flat( C, 1, &b_flash, &C_test ); } else { A_test = A; B_test = B; C_test = C; } // Create a control tree for the individual variants. if ( impl == FLA_TEST_FLAT_UNB_VAR || impl == FLA_TEST_FLAT_OPT_VAR || impl == FLA_TEST_FLAT_BLK_VAR || impl == FLA_TEST_FLAT_UNB_EXT || impl == FLA_TEST_FLAT_BLK_EXT ) libfla_test_symm_cntl_create( var, b_alg_flat ); // Repeat the experiment n_repeats times and record results. for ( i = 0; i < n_repeats; ++i ) { if ( impl == FLA_TEST_HIER_FRONT_END ) FLASH_Obj_hierarchify( C_save, C_test ); else FLA_Copy_external( C_save, C_test ); time = FLA_Clock(); libfla_test_symm_impl( impl, side, uplo, alpha, A_test, B_test, beta, C_test ); time = FLA_Clock() - time; time_min = min( time_min, time ); } // Copy the solution to flat matrix X. if ( impl == FLA_TEST_HIER_FRONT_END ) { FLASH_Obj_flatten( C_test, C ); } else { // No action needed since C_test and C refer to the same object. } // Free the hierarchical matrices if we're testing the FLASH front-end. if ( impl == FLA_TEST_HIER_FRONT_END ) { FLASH_Obj_free( &A_test ); FLASH_Obj_free( &B_test ); FLASH_Obj_free( &C_test ); } // Free the control trees if we're testing the variants. if ( impl == FLA_TEST_FLAT_UNB_VAR || impl == FLA_TEST_FLAT_OPT_VAR || impl == FLA_TEST_FLAT_BLK_VAR || impl == FLA_TEST_FLAT_UNB_EXT || impl == FLA_TEST_FLAT_BLK_EXT ) libfla_test_symm_cntl_free(); // Compute the performance of the best experiment repeat. if ( side == FLA_LEFT ) *perf = ( 1 * m * m * n ) / time_min / FLOPS_PER_UNIT_PERF; else *perf = ( 1 * m * n * n ) / time_min / FLOPS_PER_UNIT_PERF; if ( FLA_Obj_is_complex( A ) ) *perf *= 4.0; // Compute: // y = C * x // and compare to // z = ( beta * C_orig + alpha * A * B ) x (side = left) // z = ( beta * C_orig + alpha * B * A ) x (side = right) FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_ONE, C, x, FLA_ZERO, y ); if ( side == FLA_LEFT ) { FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_ONE, B, x, FLA_ZERO, w ); FLA_Symv_external( uplo, alpha, A, w, FLA_ZERO, z ); } else { FLA_Symv_external( uplo, FLA_ONE, A, x, FLA_ZERO, w ); FLA_Gemv_external( FLA_NO_TRANSPOSE, alpha, B, w, FLA_ZERO, z ); } FLA_Gemv_external( FLA_NO_TRANSPOSE, beta, C_save, x, FLA_ONE, z ); // Compute || y - z ||. //FLA_Axpy_external( FLA_MINUS_ONE, y, z ); //FLA_Nrm2_external( z, norm ); //FLA_Obj_extract_real_scalar( norm, residual ); *residual = FLA_Max_elemwise_diff( y, z ); // Free the supporting flat objects. FLA_Obj_free( &C_save ); // Free the flat test matrices. FLA_Obj_free( &A ); FLA_Obj_free( &B ); FLA_Obj_free( &C ); FLA_Obj_free( &x ); FLA_Obj_free( &y ); FLA_Obj_free( &z ); FLA_Obj_free( &w ); FLA_Obj_free( &norm ); }
FLA_Error FLA_Hevd_lv_var4_components( dim_t n_iter_max, FLA_Obj A, FLA_Obj l, dim_t k_accum, dim_t b_alg, double* dtime_tred, double* dtime_tevd, double* dtime_appq ) { FLA_Error r_val = FLA_SUCCESS; FLA_Uplo uplo = FLA_LOWER_TRIANGULAR; FLA_Datatype dt; FLA_Datatype dt_real; FLA_Datatype dt_comp; FLA_Obj T, r, d, e, G, R, W; FLA_Obj d0, e0, ls, pu; dim_t mn_A; dim_t n_G = k_accum; double dtime_temp; mn_A = FLA_Obj_length( A ); dt = FLA_Obj_datatype( A ); dt_real = FLA_Obj_datatype_proj_to_real( A ); dt_comp = FLA_Obj_datatype_proj_to_complex( A ); *dtime_tred = 1; *dtime_tevd = 1; *dtime_appq = 1; // If the matrix is a scalar, then the EVD is easy. if ( mn_A == 1 ) { FLA_Copy( A, l ); FLA_Set( FLA_ONE, A ); return FLA_SUCCESS; } // Create a matrix to hold block Householder transformations. FLA_Tridiag_UT_create_T( A, &T ); // Create a vector to hold the realifying scalars. FLA_Obj_create( dt, mn_A, 1, 0, 0, &r ); // Create vectors to hold the diagonal and sub-diagonal. FLA_Obj_create( dt_real, mn_A, 1, 0, 0, &d ); FLA_Obj_create( dt_real, mn_A-1, 1, 0, 0, &e ); FLA_Obj_create( dt_real, mn_A, 1, 0, 0, &d0 ); FLA_Obj_create( dt_real, mn_A-1, 1, 0, 0, &e0 ); FLA_Obj_create( dt_real, mn_A, 1, 0, 0, &pu ); FLA_Obj_create( FLA_INT, mn_A, 1, 0, 0, &ls ); FLA_Obj_create( dt_comp, mn_A-1, n_G, 0, 0, &G ); FLA_Obj_create( dt_real, mn_A, mn_A, 0, 0, &R ); FLA_Obj_create( dt, mn_A, mn_A, 0, 0, &W ); dtime_temp = FLA_Clock(); { // Reduce the matrix to tridiagonal form. FLA_Tridiag_UT( uplo, A, T ); } *dtime_tred = FLA_Clock() - dtime_temp; // Apply scalars to rotate elements on the sub-diagonal to the real domain. FLA_Tridiag_UT_realify( uplo, A, r ); // Extract the diagonal and sub-diagonal from A. FLA_Tridiag_UT_extract_diagonals( uplo, A, d, e ); dtime_temp = FLA_Clock(); { // Form Q, overwriting A. FLA_Tridiag_UT_form_Q( uplo, A, T ); } *dtime_appq = FLA_Clock() - dtime_temp; // Apply the scalars in r to Q. FLA_Apply_diag_matrix( FLA_RIGHT, FLA_CONJUGATE, r, A ); // Find the eigenvalues only. FLA_Copy( d, d0 ); FLA_Copy( e, e0 ); //r_val = FLA_Tevd_n_opt_var1( n_iter_max, d0, e0, G, A ); { int info; double* buff_d = FLA_DOUBLE_PTR( d0 ); double* buff_e = FLA_DOUBLE_PTR( e0 ); dsterf_( &mn_A, buff_d, buff_e, &info ); } FLA_Sort( FLA_FORWARD, d0 ); FLA_Set( FLA_ZERO, ls ); FLA_Set( FLA_ZERO, pu ); dtime_temp = FLA_Clock(); { // Perform an eigenvalue decomposition on the tridiagonal matrix. r_val = FLA_Tevd_v_opt_var4( n_iter_max, d, e, d0, ls, pu, G, R, W, A, b_alg ); } *dtime_tevd = FLA_Clock() - dtime_temp; // Copy the converged eigenvalues to the output vector. FLA_Copy( d, l ); // Sort the eigenvalues and eigenvectors in ascending order. FLA_Sort_evd( FLA_FORWARD, l, A ); FLA_Obj_free( &T ); FLA_Obj_free( &r ); FLA_Obj_free( &d ); FLA_Obj_free( &e ); FLA_Obj_free( &d0 ); FLA_Obj_free( &pu ); FLA_Obj_free( &e0 ); FLA_Obj_free( &ls ); FLA_Obj_free( &G ); FLA_Obj_free( &R ); FLA_Obj_free( &W ); return r_val; }
int main(int argc, char *argv[]) { int m_input, m, p_first, p_last, p_inc, p, k_accum, b_alg, n_iter_max, variant, n_repeats, i, n_variants = 2; char *colors = "brkgmcbrkg"; char *ticks = "o+*xso+*xs"; char m_dim_desc[14]; char m_dim_tag[10]; double max_gflops=6.0; double dtime, gflops, diff1, diff2; FLA_Datatype datatype, dt_real; FLA_Obj A, l, Q, Ql, TT, r, d, e, A_orig, G, R, W2, de, alpha; FLA_Init(); fprintf( stdout, "%c number of repeats:", '%' ); scanf( "%d", &n_repeats ); fprintf( stdout, "%c %d\n", '%', n_repeats ); fprintf( stdout, "%c enter n_iter_max (per eigenvalue): ", '%' ); scanf( "%d", &n_iter_max ); fprintf( stdout, "%c %d\n", '%', n_iter_max ); fprintf( stdout, "%c enter number of sets of Givens rotations to accumulate:", '%' ); scanf( "%d", &k_accum ); fprintf( stdout, "%c %d\n", '%', k_accum ); fprintf( stdout, "%c enter blocking size for application of G:", '%' ); scanf( "%d", &b_alg ); fprintf( stdout, "%c %d\n", '%', b_alg ); fprintf( stdout, "%c enter problem size first, last, inc:", '%' ); scanf( "%d%d%d", &p_first, &p_last, &p_inc ); fprintf( stdout, "%c %d %d %d\n", '%', p_first, p_last, p_inc ); fprintf( stdout, "%c enter m (-1 means bind to problem size): ", '%' ); scanf( "%d", &m_input ); fprintf( stdout, "%c %d\n", '%', m_input ); fprintf( stdout, "\n" ); if ( m_input > 0 ) { sprintf( m_dim_desc, "m = %d", m_input ); sprintf( m_dim_tag, "m%dc", m_input); } else if( m_input < -1 ) { sprintf( m_dim_desc, "m = p/%d", -m_input ); sprintf( m_dim_tag, "m%dp", -m_input ); } else if( m_input == -1 ) { sprintf( m_dim_desc, "m = p" ); sprintf( m_dim_tag, "m%dp", 1 ); } for ( p = p_first, i = 1; p <= p_last; p += p_inc, i += 1 ) { m = m_input; if( m < 0 ) m = p / abs(m_input); //datatype = FLA_FLOAT; //datatype = FLA_DOUBLE; //datatype = FLA_COMPLEX; datatype = FLA_DOUBLE_COMPLEX; FLA_Obj_create( datatype, m, m, 0, 0, &A ); FLA_Obj_create( datatype, m, m, 0, 0, &A_orig ); FLA_Obj_create( datatype, m, m, 0, 0, &Q ); FLA_Obj_create( datatype, m, m, 0, 0, &Ql ); FLA_Obj_create( datatype, m, 1, 0, 0, &r ); FLA_Obj_create( datatype, m, m, 0, 0, &W2 ); FLA_Obj_create( datatype, m-1, k_accum, 0, 0, &G ); dt_real = FLA_Obj_datatype_proj_to_real( A ); FLA_Obj_create( dt_real, m, 1, 0, 0, &l ); FLA_Obj_create( dt_real, m, 1, 0, 0, &d ); FLA_Obj_create( dt_real, m-1, 1, 0, 0, &e ); FLA_Obj_create( dt_real, m, m, 0, 0, &R ); FLA_Obj_create( dt_real, 1, 1, 0, 0, &alpha ); *FLA_DOUBLE_PTR( alpha ) = 1.0 / ( sqrt( sqrt( (double) m ) ) ); FLA_Random_unitary_matrix( Q ); //FLA_Fill_with_uniform_dist( FLA_ONE, l ); //FLA_Fill_with_inverse_dist( FLA_ONE, l ); FLA_Fill_with_geometric_dist( alpha, l ); { FLA_Copy( Q, Ql ); FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, l, Ql ); FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, Ql, Q, FLA_ZERO, A ); FLA_Triangularize( FLA_LOWER_TRIANGULAR, FLA_NONUNIT_DIAG, A ); FLA_Copy( A, A_orig ); } FLA_Set( FLA_ZERO, l ); FLA_Set( FLA_ZERO, Q ); FLA_Tridiag_UT_create_T( A, &TT ); FLA_Tridiag_UT( FLA_LOWER_TRIANGULAR, A, TT ); FLA_Tridiag_UT_realify( FLA_LOWER_TRIANGULAR, A, r ); FLA_Tridiag_UT_extract_diagonals( FLA_LOWER_TRIANGULAR, A, d, e ); FLA_Tridiag_UT_form_Q( FLA_LOWER_TRIANGULAR, A, TT ); FLA_Apply_diag_matrix( FLA_RIGHT, FLA_CONJUGATE, r, A ); FLA_Obj_free( &TT ); time_Tevd_v( 0, FLA_ALG_REFERENCE, n_repeats, m, k_accum, b_alg, n_iter_max, A_orig, d, e, G, R, W2, A, l, &dtime, &diff1, &diff2, &gflops ); fprintf( stdout, "data_REFq( %d, 1:3 ) = [ %d %6.3lf %9.2e %6.2le %6.2le ]; \n", i, p, gflops, dtime, diff1, diff2 ); fflush( stdout ); for ( variant = 1; variant <= n_variants; variant++ ){ fprintf( stdout, "data_var%d( %d, 1:3 ) = [ %d ", variant, i, p ); fflush( stdout ); time_Tevd_v( variant, FLA_ALG_UNB_OPT, n_repeats, m, k_accum, b_alg, n_iter_max, A_orig, d, e, G, R, W2, A, l, &dtime, &diff1, &diff2, &gflops ); fprintf( stdout, "%6.3lf %9.2e %6.2le %6.2le ", gflops, dtime, diff1, diff2 ); fflush( stdout ); fprintf( stdout, "];\n" ); fflush( stdout ); } fprintf( stdout, "\n" ); FLA_Obj_free( &A ); FLA_Obj_free( &A_orig ); FLA_Obj_free( &Q ); FLA_Obj_free( &Ql ); FLA_Obj_free( &G ); FLA_Obj_free( &W2 ); FLA_Obj_free( &r ); FLA_Obj_free( &l ); FLA_Obj_free( &d ); FLA_Obj_free( &e ); FLA_Obj_free( &R ); FLA_Obj_free( &alpha ); } /* fprintf( stdout, "figure;\n" ); fprintf( stdout, "plot( data_REF( :,1 ), data_REF( :, 2 ), '-' ); \n" ); fprintf( stdout, "hold on;\n" ); for ( i = 1; i <= n_variants; i++ ) { fprintf( stdout, "plot( data_var%d( :,1 ), data_var%d( :, 2 ), '%c:%c' ); \n", i, i, colors[ i-1 ], ticks[ i-1 ] ); fprintf( stdout, "plot( data_var%d( :,1 ), data_var%d( :, 4 ), '%c-.%c' ); \n", i, i, colors[ i-1 ], ticks[ i-1 ] ); } fprintf( stdout, "legend( ... \n" ); fprintf( stdout, "'Reference', ... \n" ); for ( i = 1; i < n_variants; i++ ) fprintf( stdout, "'unb\\_var%d', 'blk\\_var%d', ... \n", i, i ); fprintf( stdout, "'unb\\_var%d', 'blk\\_var%d' ); \n", i, i ); fprintf( stdout, "xlabel( 'problem size p' );\n" ); fprintf( stdout, "ylabel( 'GFLOPS/sec.' );\n" ); fprintf( stdout, "axis( [ 0 %d 0 %.2f ] ); \n", p_last, max_gflops ); fprintf( stdout, "title( 'FLAME Hevd_lv performance (%s, %s)' );\n", m_dim_desc, n_dim_desc ); fprintf( stdout, "print -depsc tridiag_%s_%s.eps\n", m_dim_tag, n_dim_tag ); fprintf( stdout, "hold off;\n"); fflush( stdout ); */ FLA_Finalize( ); return 0; }
FLA_Error FLA_Hess_UT_step_unb_var2( FLA_Obj A, FLA_Obj T ) { FLA_Obj ATL, ATR, A00, a01, A02, ABL, ABR, a10t, alpha11, a12t, A20, a21, A22; FLA_Obj TTL, TTR, T00, t01, T02, TBL, TBR, t10t, tau11, t12t, T20, t21, T22; FLA_Obj yT, y0, yB, psi1, y2; FLA_Obj zT, z0, zB, zeta1, z2; FLA_Obj y, z; FLA_Obj inv_tau11; FLA_Obj minus_inv_tau11; FLA_Obj first_elem; FLA_Obj beta; FLA_Obj conj_beta; FLA_Obj dot_product; FLA_Obj a21_t, a21_b; FLA_Datatype datatype_A; dim_t m_A; dim_t b_alg; b_alg = FLA_Obj_length( T ); datatype_A = FLA_Obj_datatype( A ); m_A = FLA_Obj_length( A ); FLA_Obj_create( datatype_A, 1, 1, 0, 0, &inv_tau11 ); FLA_Obj_create( datatype_A, 1, 1, 0, 0, &minus_inv_tau11 ); FLA_Obj_create( datatype_A, 1, 1, 0, 0, &first_elem ); FLA_Obj_create( datatype_A, 1, 1, 0, 0, &beta ); FLA_Obj_create( datatype_A, 1, 1, 0, 0, &conj_beta ); FLA_Obj_create( datatype_A, 1, 1, 0, 0, &dot_product ); FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &y ); FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &z ); FLA_Part_2x2( A, &ATL, &ATR, &ABL, &ABR, 0, 0, FLA_TL ); FLA_Part_2x2( T, &TTL, &TTR, &TBL, &TBR, 0, 0, FLA_TL ); FLA_Part_2x1( y, &yT, &yB, 0, FLA_TOP ); FLA_Part_2x1( z, &zT, &zB, 0, FLA_TOP ); while ( FLA_Obj_length( ATL ) < b_alg ) { FLA_Repart_2x2_to_3x3( ATL, /**/ ATR, &A00, /**/ &a01, &A02, /* ************* */ /* ************************** */ &a10t, /**/ &alpha11, &a12t, ABL, /**/ ABR, &A20, /**/ &a21, &A22, 1, 1, FLA_BR ); FLA_Repart_2x2_to_3x3( TTL, /**/ TTR, &T00, /**/ &t01, &T02, /* ************* */ /* ************************** */ &t10t, /**/ &tau11, &t12t, TBL, /**/ TBR, &T20, /**/ &t21, &T22, 1, 1, FLA_BR ); FLA_Repart_2x1_to_3x1( yT, &y0, /* ** */ /* **** */ &psi1, yB, &y2, 1, FLA_BOTTOM ); FLA_Repart_2x1_to_3x1( zT, &z0, /* ** */ /* ***** */ &zeta1, zB, &z2, 1, FLA_BOTTOM ); /*------------------------------------------------------------*/ if ( FLA_Obj_length( A22 ) > 0 ) { FLA_Part_2x1( a21, &a21_t, &a21_b, 1, FLA_TOP ); // [ u21, tau11, a21 ] = House( a21 ); FLA_Househ2_UT( FLA_LEFT, a21_t, a21_b, tau11 ); // inv_tau11 = 1 / tau11; // minus_inv_tau11 = -1 / tau11; FLA_Set( FLA_ONE, inv_tau11 ); FLA_Inv_scalc( FLA_NO_CONJUGATE, tau11, inv_tau11 ); FLA_Copy( inv_tau11, minus_inv_tau11 ); FLA_Scal( FLA_MINUS_ONE, minus_inv_tau11 ); // Save first element of a21_t and set it to one so we can use a21 as // u21 in subsequent computations. We will restore a21_t later on. FLA_Copy( a21_t, first_elem ); FLA_Set( FLA_ONE, a21_t ); // y21 = A22' * u21; FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, y2 ); // z21 = A22 * u21; FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, z2 ); // beta = u21' * z21 / 2; // conj_beta = conj(beta); FLA_Dotc( FLA_CONJUGATE, a21, z2, beta ); FLA_Inv_scal( FLA_TWO, beta ); FLA_Copyt( FLA_CONJ_NO_TRANSPOSE, beta, conj_beta ); // y21' = ( y21' - beta / tau * u21' ) / tau; // y21 = ( y21 - conj(beta) / tau * u21 ) / tau; FLA_Scal( minus_inv_tau11, conj_beta ); FLA_Axpy( conj_beta, a21, y2 ); FLA_Scal( inv_tau11, y2 ); // z21 = ( z21 - beta / tau * u21 ) / tau; FLA_Scal( minus_inv_tau11, beta ); FLA_Axpy( beta, a21, z2 ); FLA_Scal( inv_tau11, z2 ); // a12t = a12t * ( I - u21 * u21' / tau ); // = a12t - ( a12t * u21 ) * u21' / tau; FLA_Dot( a12t, a21, dot_product ); FLA_Scal( minus_inv_tau11, dot_product ); FLA_Axpyt( FLA_CONJ_TRANSPOSE, dot_product, a21, a12t ); // A02 = A02 * ( I - u21 * u21' / tau ); // = A02 - ( A02 * u21 ) * u21' / tau; FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A02, a21, FLA_ZERO, y0 ); FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, minus_inv_tau11, y0, a21, A02 ); // A22 = A22 - u21 * y21' - z21 * u21'; FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, a21, y2, A22 ); FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, z2, a21, A22 ); // t01 = U20' * u21; FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A20, a21, FLA_ZERO, t01 ); // Restore first element of a21. FLA_Copy( first_elem, a21_t ); } /*------------------------------------------------------------*/ FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR, A00, a01, /**/ A02, a10t, alpha11, /**/ a12t, /* ************** */ /* ************************ */ &ABL, /**/ &ABR, A20, a21, /**/ A22, FLA_TL ); FLA_Cont_with_3x3_to_2x2( &TTL, /**/ &TTR, T00, t01, /**/ T02, t10t, tau11, /**/ t12t, /* ************** */ /* ************************ */ &TBL, /**/ &TBR, T20, t21, /**/ T22, FLA_TL ); FLA_Cont_with_3x1_to_2x1( &yT, y0, psi1, /* ** */ /* **** */ &yB, y2, FLA_TOP ); FLA_Cont_with_3x1_to_2x1( &zT, z0, zeta1, /* ** */ /* ***** */ &zB, z2, FLA_TOP ); } FLA_Obj_free( &inv_tau11 ); FLA_Obj_free( &minus_inv_tau11 ); FLA_Obj_free( &first_elem ); FLA_Obj_free( &beta ); FLA_Obj_free( &conj_beta ); FLA_Obj_free( &dot_product ); FLA_Obj_free( &y ); FLA_Obj_free( &z ); return FLA_SUCCESS; }
int main( int argc, char** argv ) { FLA_Datatype datatype = TESTTYPE; FLA_Obj A, A_flame, A_lapack, C; int m; FLA_Error init_result; FLA_Obj TU, TV, U_flame, V_flame, d_flame, e_flame, B_flame; FLA_Obj tauq, taup, d_lapack, e_lapack, U_lapack, V_lapack, W, B_lapack; testtype *buff_tauq, *buff_taup, *buff_d_lapack, *buff_e_lapack, *buff_W, *buff_A_lapack, *buff_U_lapack, *buff_V_lapack; int lwork, info, is_flame; if ( argc == 3 ) { m = atoi(argv[1]); is_flame = atoi(argv[2]); } else { fprintf(stderr, " \n"); fprintf(stderr, "Usage: %s m is_flame\n", argv[0]); fprintf(stderr, " m : matrix length\n"); fprintf(stderr, " is_flame : 1 yes, 0 no\n"); fprintf(stderr, " \n"); return -1; } if ( m == 0 ) return 0; FLA_Init_safe( &init_result ); fprintf( stdout, "lapack2flame: %d x %d: \n", m, m); FLA_Obj_create( datatype, m, m, 0, 0, &A ); FLA_Random_matrix( A ); FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &A_flame ); FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &A_lapack ); FLA_Obj_create( datatype, m, m, 0, 0, &C ); FLA_Random_matrix( C ); if ( is_flame ) { fprintf( stdout, " flame executed\n"); FLA_Bidiag_UT_create_T( A_flame, &TU, &TV ); FLA_Bidiag_UT( A_flame, TU, TV ); FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A_flame, &U_flame ); FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A_flame, &V_flame ); FLA_Bidiag_UT_form_U( U_flame, TU, U_flame ); FLA_Bidiag_UT_form_V( V_flame, TV, V_flame ); FLA_Obj_create( datatype, m, 1, 0, 0, &d_flame ); FLA_Obj_create( datatype, m - 1, 1, 0, 0, &e_flame ); FLA_Bidiag_UT_extract_diagonals( A_flame, d_flame, e_flame ); FLA_Obj_create( datatype, m, m, 0, 0, &B_flame ); FLA_Set( FLA_ZERO, B_flame ); { FLA_Obj BTL, BTR, BBL, BBR; FLA_Part_2x2( B_flame, &BTL, &BTR, &BBL, &BBR, 1,1, FLA_BL ); FLA_Set_diagonal_matrix( d_flame, B_flame ); FLA_Set_diagonal_matrix( e_flame, BTR ); } if (1) { fprintf( stdout, " - FLAME ----------\n"); FLA_Obj_fshow( stdout, " - Given A - ", A, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - A - ", A_flame, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - U - ", U_flame, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - V - ", V_flame, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - d - ", d_flame, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - e - ", e_flame, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - B - ", B_flame, "% 6.4e", "------"); } } else { fprintf( stdout, " lapack executed\n"); FLA_Obj_create( datatype, m, 1, 0, 0, &tauq ); FLA_Obj_create( datatype, m, 1, 0, 0, &taup ); FLA_Obj_create( datatype, m, 1, 0, 0, &d_lapack ); FLA_Obj_create( datatype, m - 1, 1, 0, 0, &e_lapack ); buff_A_lapack = (testtype*)FLA_Obj_buffer_at_view( A_lapack ); buff_tauq = (testtype*)FLA_Obj_buffer_at_view( tauq ); buff_taup = (testtype*)FLA_Obj_buffer_at_view( taup ); buff_d_lapack = (testtype*)FLA_Obj_buffer_at_view( d_lapack ); buff_e_lapack = (testtype*)FLA_Obj_buffer_at_view( e_lapack ); lwork = 32*m; FLA_Obj_create( datatype, lwork, 1, 0, 0, &W ); buff_W = (testtype*)FLA_Obj_buffer_at_view( W ); sgebrd_( &m, &m, buff_A_lapack, &m, buff_d_lapack, buff_e_lapack, buff_tauq, buff_taup, buff_W, &lwork, &info ); FLA_Obj_create( datatype, m, m, 0, 0, &U_lapack ); FLA_Obj_create( datatype, m, m, 0, 0, &V_lapack ); FLA_Copy( A_lapack, U_lapack ); FLA_Copy( A_lapack, V_lapack ); buff_U_lapack = (testtype*)FLA_Obj_buffer_at_view( U_lapack ); buff_V_lapack = (testtype*)FLA_Obj_buffer_at_view( V_lapack ); sorgbr_( "Q", &m, &m, &m, buff_U_lapack, &m, buff_tauq, buff_W, &lwork, &info ); sorgbr_( "P", &m, &m, &m, buff_V_lapack, &m, buff_taup, buff_W, &lwork, &info ); FLA_Obj_create( datatype, m, m, 0, 0, &B_lapack ); FLA_Set( FLA_ZERO, B_lapack ); { FLA_Obj BTL, BTR, BBL, BBR; FLA_Part_2x2( B_lapack, &BTL, &BTR, &BBL, &BBR, 1,1, FLA_BL ); FLA_Set_diagonal_matrix( d_lapack, B_lapack ); FLA_Set_diagonal_matrix( e_lapack, BTR ); } FLA_Obj_free( &W ); if (1) { fprintf( stdout, " - LAPACK ----------\n"); FLA_Obj_fshow( stdout, " - Given A - ", A, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - A - ", A_lapack, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - U - ", U_lapack, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - V - ", V_lapack, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - d - ", d_lapack, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - e - ", e_lapack, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - B - ", B_lapack, "% 6.4e", "------"); } } { testtype dummy; int zero = 0, one = 1; FLA_Obj D_lapack; FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &D_lapack ); FLA_Set( FLA_ZERO, D_lapack ); if ( is_flame ) { buff_d_lapack = (testtype*)FLA_Obj_buffer_at_view( d_flame ); buff_e_lapack = (testtype*)FLA_Obj_buffer_at_view( e_flame ); buff_U_lapack = (testtype*)FLA_Obj_buffer_at_view( U_flame ); buff_V_lapack = (testtype*)FLA_Obj_buffer_at_view( V_flame ); } FLA_Obj_create( datatype, 4*m, 1, 0, 0, &W ); buff_W = (testtype*)FLA_Obj_buffer_at_view( W ); sbdsqr_( "U", &m, &m, &m, &zero, buff_d_lapack, buff_e_lapack, buff_V_lapack, &m, buff_U_lapack, &m, &dummy, &one, buff_W, &info ); FLA_Obj_free( &W ); if (info != 0) printf( " Error info = %d\n", info ); if ( is_flame ) FLA_Set_diagonal_matrix( d_flame, D_lapack ); else FLA_Set_diagonal_matrix( d_lapack, D_lapack ); if ( is_flame ) { fprintf( stdout, " - FLAME ----------\n"); FLA_Obj_fshow( stdout, " - U - ", U_flame, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - V - ", V_flame, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - d - ", d_flame, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - e - ", e_flame, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - D - ", D_lapack, "% 6.4e", "------"); } else { fprintf( stdout, " - LAPACK ----------\n"); FLA_Obj_fshow( stdout, " - U - ", U_lapack, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - V - ", V_lapack, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - d - ", d_lapack, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - e - ", e_lapack, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - D - ", D_lapack, "% 6.4e", "------"); } FLA_Obj_free( &D_lapack ); } if ( is_flame ) { FLA_Obj_free( &TU ); FLA_Obj_free( &TV ); FLA_Obj_free( &U_flame ); FLA_Obj_free( &V_flame ); FLA_Obj_free( &d_flame ); FLA_Obj_free( &e_flame ); FLA_Obj_free( &B_flame ); } else { FLA_Obj_free( &tauq ); FLA_Obj_free( &taup ); FLA_Obj_free( &d_lapack ); FLA_Obj_free( &e_lapack ); FLA_Obj_free( &U_lapack ); FLA_Obj_free( &V_lapack ); FLA_Obj_free( &B_lapack ); } FLA_Obj_free( &A ); FLA_Obj_free( &A_flame ); FLA_Obj_free( &A_lapack ); FLA_Obj_free( &C ); FLA_Finalize_safe( init_result ); }
int main(int argc, char *argv[]) { int datatype, n_input, mB_input, mC_input, mD_input, mB, mC, mD, n, p_first, p_last, p_inc, p, b_alg, variant, n_repeats, i, n_variants = 1; double max_gflops=6.0; double dtime, gflops, diff; FLA_Obj B, C, D, T, R, E; FLA_Init(); fprintf( stdout, "%c number of repeats:", '%' ); scanf( "%d", &n_repeats ); fprintf( stdout, "%c %d\n", '%', n_repeats ); fprintf( stdout, "%c enter algorithmic blocksize:", '%' ); scanf( "%d", &b_alg ); fprintf( stdout, "%c %d\n", '%', b_alg ); fprintf( stdout, "%c enter problem size first, last, inc:", '%' ); scanf( "%d%d%d", &p_first, &p_last, &p_inc ); fprintf( stdout, "%c %d %d %d\n", '%', p_first, p_last, p_inc ); fprintf( stdout, "%c enter n (-1 means bind to problem size): ", '%' ); scanf( "%d", &n_input ); fprintf( stdout, "%c %d\n", '%', n_input ); fprintf( stdout, "%c enter mB mC mD (-1 means bind to problem size): ", '%' ); scanf( "%d %d %d", &mB_input, &mC_input, &mD_input ); fprintf( stdout, "%c %d %d %d\n", '%', mB_input, mC_input, mD_input ); fprintf( stdout, "\nclear all;\n\n" ); //datatype = FLA_FLOAT; //datatype = FLA_DOUBLE; //datatype = FLA_COMPLEX; datatype = FLA_DOUBLE_COMPLEX; for ( p = p_first, i = 1; p <= p_last; p += p_inc, i += 1 ) { mB = mB_input; mC = mC_input; mD = mD_input; n = n_input; if( mB < 0 ) mB = p / abs(mB_input); if( mC < 0 ) mC = p / abs(mC_input); if( mD < 0 ) mD = p / abs(mD_input); if( n < 0 ) n = p / abs(n_input); for ( variant = 0; variant < n_variants; variant++ ){ FLA_Obj_create( datatype, mB, n, 0, 0, &B ); FLA_Obj_create( datatype, mC, n, 0, 0, &C ); FLA_Obj_create( datatype, mD, n, 0, 0, &D ); FLA_Obj_create( datatype, b_alg, n, 0, 0, &T ); FLA_Obj_create( datatype, n, n, 0, 0, &R ); FLA_Obj_create( datatype, n, n, 0, 0, &E ); FLA_Random_matrix( B ); FLA_Random_matrix( C ); FLA_Random_matrix( D ); FLA_Set( FLA_ZERO, R ); FLA_Herk_external( FLA_UPPER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_ONE, B, FLA_ONE, R ); FLA_Herk_external( FLA_UPPER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_ONE, D, FLA_ONE, R ); FLA_Chol( FLA_UPPER_TRIANGULAR, R ); FLA_Set( FLA_ZERO, E ); FLA_Herk_external( FLA_UPPER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_ONE, B, FLA_ONE, E ); FLA_Herk_external( FLA_UPPER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_ONE, C, FLA_ONE, E ); FLA_Chol( FLA_UPPER_TRIANGULAR, E ); fprintf( stdout, "data_uddate_ut( %d, 1:5 ) = [ %d ", i, p ); fflush( stdout ); time_UDdate_UT( variant, FLA_ALG_FRONT, n_repeats, mB, mC, mD, n, B, C, D, T, R, E, &dtime, &diff, &gflops ); fprintf( stdout, "%6.3lf %6.2le ", gflops, diff ); fflush( stdout ); fprintf( stdout, " ]; \n" ); fflush( stdout ); FLA_Obj_free( &B ); FLA_Obj_free( &C ); FLA_Obj_free( &D ); FLA_Obj_free( &T ); FLA_Obj_free( &R ); FLA_Obj_free( &E ); } fprintf( stdout, "\n" ); } /* fprintf( stdout, "figure;\n" ); fprintf( stdout, "hold on;\n" ); for ( i = 0; i < n_variants; i++ ) { fprintf( stdout, "plot( data_qr_ut( :,1 ), data_qr_ut( :, 2 ), '%c:%c' ); \n", colors[ i ], ticks[ i ] ); fprintf( stdout, "plot( data_qr_ut( :,1 ), data_qr_ut( :, 4 ), '%c-.%c' ); \n", colors[ i ], ticks[ i ] ); } fprintf( stdout, "legend( ... \n" ); for ( i = 0; i < n_variants; i++ ) fprintf( stdout, "'ref\\_qr\\_ut', 'fla\\_qr\\_ut', ... \n" ); fprintf( stdout, "'Location', 'SouthEast' ); \n" ); fprintf( stdout, "xlabel( 'problem size p' );\n" ); fprintf( stdout, "ylabel( 'GFLOPS/sec.' );\n" ); fprintf( stdout, "axis( [ 0 %d 0 %.2f ] ); \n", p_last, max_gflops ); fprintf( stdout, "title( 'FLAME UDdate_UT front-end performance (%s, %s)' );\n", m_dim_desc, n_dim_desc ); fprintf( stdout, "print -depsc qr_ut_front_%s_%s.eps\n", m_dim_tag, n_dim_tag ); fprintf( stdout, "hold off;\n"); fflush( stdout ); */ FLA_Finalize( ); return 0; }
int main( int argc, char** argv ) { FLA_Datatype datatype = TESTTYPE; FLA_Obj A, Ak, T, Tk, D, Dk, A_copy, A_recovered, L, Q, Qk, W, x, y, z; dim_t m, n, k; dim_t min_m_n; FLA_Error init_result; double residual_A, residual_Axy; int use_form_q = 1; if ( argc == 4 ) { m = atoi(argv[1]); n = atoi(argv[2]); k = atoi(argv[3]); min_m_n = min(m,n); } else { fprintf(stderr, " \n"); fprintf(stderr, "Usage: %s m n k\n", argv[0]); fprintf(stderr, " m : matrix length\n"); fprintf(stderr, " n : matrix width\n"); fprintf(stderr, " k : number of house holder vectors applied for testing\n"); fprintf(stderr, " \n"); return -1; } if ( m == 0 || n == 0 ) return 0; FLA_Init_safe( &init_result ); // FLAME LQ^H setup FLA_Obj_create( datatype, m, n, 0, 0, &A ); FLA_LQ_UT_create_T( A, &T ); // Rand A and create A_copy. FLA_Random_matrix( A ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_copy ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_recovered ); FLA_Copy( A, A_copy ); // LQ test ( A = L Q^H ) FLA_LQ_UT( A, T ); // Create Q (identity), L (A_copy) FLA_Obj_create( datatype, m, n, 0, 0, &Q ); FLA_Set_to_identity( Q ); FLA_Obj_create( datatype, m, m, 0, 0, &D ); FLA_Obj_create( datatype, k, n, 0, 0, &Qk ); FLA_Set_to_identity( Qk ); FLA_Obj_create( datatype, k, k, 0, 0, &Dk ); FLA_Obj_create( datatype, m, m, 0, 0, &L ); // Q^H := I H_{0}^H ... H_{k-1}^H if ( use_form_q ) { FLA_LQ_UT_form_Q( A, T, Q ); } else { FLA_Apply_Q_UT_create_workspace_side( FLA_RIGHT, T, Q, &W ); FLA_Apply_Q_UT( FLA_RIGHT, FLA_CONJ_TRANSPOSE, FLA_FORWARD, FLA_ROWWISE, A, T, W, Q ); FLA_Obj_free( &W ); } // D := Q^T Q FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, Q, Q, FLA_ZERO, D ); // Qk := I H0 ... Hk FLA_Part_1x2( T, &Tk, &W, k, FLA_LEFT ); FLA_Part_2x1( A, &Ak, &W, k, FLA_TOP ); if ( use_form_q ) { // Overwrite the result to test FLAME API FLA_Set( FLA_ZERO, Qk ); FLA_Copy( Ak, Qk ); FLA_LQ_UT_form_Q( Ak, Tk, Qk ); } else { FLA_Apply_Q_UT_create_workspace( Tk, Qk, &W ); FLA_Apply_Q_UT( FLA_LEFT, FLA_NO_TRANSPOSE, FLA_FORWARD, FLA_ROWWISE, Ak, Tk, W, Qk ); FLA_Obj_free( &W ); } // Dk := Qk^T Qk FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, Qk, Qk, FLA_ZERO, Dk ); // L := A (Q^H)^H if ( use_form_q ) { // Note that the formed Q is actually Q^H; transb should be carefully assigned. FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, A_copy, Q, FLA_ZERO, L ); } else { FLA_Apply_Q_UT_create_workspace( T, L, &W ); FLA_Apply_Q_UT( FLA_RIGHT, FLA_NO_TRANSPOSE, FLA_FORWARD, FLA_ROWWISE, A, T, W, L ); FLA_Obj_free( &W ); } FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, L, Q, FLA_ZERO, A_recovered ); // Create vectors for testing FLA_Obj_create( datatype, n, 1, 0, 0, &x ); FLA_Set( FLA_ZERO, x ); FLA_Obj_create( datatype, m, 1, 0, 0, &y ); FLA_Set( FLA_ZERO, y ); FLA_Obj_create( datatype, m, 1, 0, 0, &z ); FLA_Set( FLA_ZERO, z ); // x is given FLA_Set( FLA_ONE, x ); // y := Ax FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_ONE, A_copy, x, FLA_ZERO, y ); // z := L (Q^H) x , libflame FLA_Apply_Q_UT_create_workspace( T, x, &W ); FLA_Apply_Q_UT( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_FORWARD, FLA_ROWWISE, A, T, W, x ); FLA_Obj_free( &W ); if ( m < n ) FLA_Part_2x1( x, &x, &W, m, FLA_TOP ); else FLA_Part_1x2( L, &L, &W, n, FLA_LEFT ); FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_ONE, L, x, FLA_ZERO, z ); // Comapre (A_copy, A_recovered), (y,z) and (y,w) residual_A = FLA_Max_elemwise_diff( A_copy, A_recovered ); residual_Axy = FLA_Max_elemwise_diff( y, z ); if ( 1 || residual_A > EPS || residual_Axy > EPS ) { FLA_Obj_fshow( stdout, " - Given - ", A_copy, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - Factor - ", A, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - T - ", T, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - Q - ", Q, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - D = Q^T Q - ", D, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - Qk - ", Qk, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - Dk = Qk^T Qk - ", Dk, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - L - ", L, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - Recovered A - ", A_recovered, "% 6.4e", "------"); fprintf( stdout, "lapack2flame: %lu x %lu, %lu: ", m, n, k); fprintf( stdout, "| A - A_recovered | = %12.10e, | Ax - y | = %12.10e\n\n", residual_A, residual_Axy ) ; } FLA_Obj_free( &A ); FLA_Obj_free( &T ); FLA_Obj_free( &A_copy ); FLA_Obj_free( &A_recovered ); FLA_Obj_free( &L ); FLA_Obj_free( &Q ); FLA_Obj_free( &Qk ); FLA_Obj_free( &D ); FLA_Obj_free( &Dk ); FLA_Obj_free( &x ); FLA_Obj_free( &y ); FLA_Obj_free( &z ); FLA_Finalize_safe( init_result ); }
int main( int argc, char** argv ) { FLA_Datatype comptype = COMPTYPE; FLA_Datatype realtype = REALTYPE; dim_t m; FLA_Obj a, aT, aB, a0, a1, a2; FLA_Obj v, vT, vB, v0, v1, v2; FLA_Error init_result; int use_abs = 1; if ( argc == 3 ) { m = atoi(argv[1]); use_abs = atoi(argv[2]); } else { fprintf(stderr, " \n"); fprintf(stderr, "Usage: %s m use_abs\n", argv[0]); fprintf(stderr, " m : test vector length\n"); fprintf(stderr, " use_abs : 0 - norm (realtype), 1 - abs (complex type)\n"); fprintf(stderr, " \n"); return -1; } if ( m == 0 ) return 0; FLA_Init_safe( &init_result ); FLA_Obj_create( comptype, m, 1, 0, 0, &a ); FLA_Obj_create( use_abs ? comptype : realtype, m, 1, 0, 0, &v ); FLA_Random_matrix( a ); FLA_Set( FLA_ZERO, v ); FLA_Obj_fshow( stdout, "- a -", a, "% 6.4e", "--" ); // Normalize a vector FLA_Part_2x1( a, &aT, &aB, 0, FLA_TOP ); FLA_Part_2x1( v, &vT, &vB, 0, FLA_TOP ); while ( FLA_Obj_length( aB ) > 0 ) { FLA_Repart_2x1_to_3x1( aT, &a0, &a1, aB, &a2, 1, FLA_BOTTOM ); FLA_Repart_2x1_to_3x1( vT, &v0, &v1, vB, &v2, 1, FLA_BOTTOM ); // -------------------------------------------- if ( use_abs ) { // a and v are complex datatype FLA_Copy( a1, v1 ); FLA_Absolute_value( v1 ); } else { // v is real datatype FLA_Nrm2( a1, v1 ); } if ( FLA_Obj_equals( v1, FLA_ZERO ) ) printf( " ZERO DETECTED\n" ); else FLA_Inv_scal( v1, a1 ); // Normalize the scalar // -------------------------------------------- FLA_Cont_with_3x1_to_2x1( &aT, a0, a1, &aB, a2, FLA_TOP ); FLA_Cont_with_3x1_to_2x1( &vT, v0, v1, &vB, v2, FLA_TOP ); } FLA_Obj_fshow( stdout, "- a -", a, "% 6.4e", "--" ); FLA_Obj_fshow( stdout, "- v -", v, "% 6.4e", "--" ); // Check whether it is normalized FLA_Part_2x1( a, &aT, &aB, 0, FLA_TOP ); FLA_Part_2x1( v, &vT, &vB, 0, FLA_TOP ); while ( FLA_Obj_length( aB ) > 0 ) { FLA_Repart_2x1_to_3x1( aT, &a0, &a1, aB, &a2, 1, FLA_BOTTOM ); FLA_Repart_2x1_to_3x1( vT, &v0, &v1, vB, &v2, 1, FLA_BOTTOM ); // -------------------------------------------- if ( use_abs ) { // a and v are same datatype FLA_Copy( a1, v1 ); FLA_Absolute_value( v1 ); } else { // v is realdatatype FLA_Nrm2( a1, v1 ); } // -------------------------------------------- FLA_Cont_with_3x1_to_2x1( &aT, a0, a1, &aB, a2, FLA_TOP ); FLA_Cont_with_3x1_to_2x1( &vT, v0, v1, &vB, v2, FLA_TOP ); } FLA_Obj_fshow( stdout, " - all should be one - ", v, "% 6.4e", "--"); FLA_Obj_free( &a ); FLA_Obj_free( &v ); FLA_Finalize_safe( init_result ); }
FLA_Error FLA_Fill_with_logarithmic_dist( FLA_Obj alpha, FLA_Obj x ) { FLA_Obj lT, l0, lB, lambda1, l2; FLA_Obj l, k, alpha2; FLA_Datatype dt_real; dim_t n_x; if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING ) FLA_Fill_with_logarithmic_dist_check( alpha, x ); dt_real = FLA_Obj_datatype_proj_to_real( x ); n_x = FLA_Obj_vector_dim( x ); // Create a local counter to increment as we create the distribution. FLA_Obj_create( dt_real, 1, 1, 0, 0, &k ); // Create a local vector l. We will work with this vector, which is // the same length as x, so that we can use vertical partitioning. FLA_Obj_create( dt_real, n_x, 1, 0, 0, &l ); // Create a local real scalar alpha2 of the same precision as // alpha. Then copy alpha to alpha2, which will convert the // complex value to real, if necessary (ie: if alpha is complex). FLA_Obj_create( dt_real, 1, 1, 0, 0, &alpha2 ); FLA_Copy( alpha, alpha2 ); // Initialize k to 0. FLA_Set( FLA_ZERO, k ); FLA_Part_2x1( l, &lT, &lB, 0, FLA_TOP ); while ( FLA_Obj_length( lB ) > 0 ) { FLA_Repart_2x1_to_3x1( lT, &l0, /* ** */ /* ******* */ &lambda1, lB, &l2, 1, FLA_BOTTOM ); /*------------------------------------------------------------*/ // lambda1 = alpha^k; FLA_Pow( alpha2, k, lambda1 ); // k = k + 1; FLA_Mult_add( FLA_ONE, FLA_ONE, k ); /*------------------------------------------------------------*/ FLA_Cont_with_3x1_to_2x1( &lT, l0, lambda1, /* ** */ /* ******* */ &lB, l2, FLA_TOP ); } // Normalize by last element. FLA_Part_2x1( l, &lT, &lB, 1, FLA_BOTTOM ); FLA_Inv_scal( lB, l ); // Overwrite x with the distribution we created in l. FLA_Copy( l, x ); FLA_Obj_free( &l ); FLA_Obj_free( &k ); FLA_Obj_free( &alpha2 ); return FLA_SUCCESS; }
FLA_Error FLA_Nrm2_external( FLA_Obj x, FLA_Obj norm_x ) { FLA_Datatype datatype; int num_elem; int inc_x; if ( FLA_Check_error_level() == FLA_FULL_ERROR_CHECKING ) FLA_Nrm2_check( x, norm_x ); if ( FLA_Obj_has_zero_dim( x ) ) { FLA_Set( FLA_ZERO, norm_x ); return FLA_SUCCESS; } datatype = FLA_Obj_datatype( x ); inc_x = FLA_Obj_vector_inc( x ); num_elem = FLA_Obj_vector_dim( x ); switch ( datatype ){ case FLA_FLOAT: { float *buff_x = ( float * ) FLA_FLOAT_PTR( x ); float *buff_norm_x = ( float * ) FLA_FLOAT_PTR( norm_x ); bli_snrm2( num_elem, buff_x, inc_x, buff_norm_x ); break; } case FLA_DOUBLE: { double *buff_x = ( double * ) FLA_DOUBLE_PTR( x ); double *buff_norm_x = ( double * ) FLA_DOUBLE_PTR( norm_x ); bli_dnrm2( num_elem, buff_x, inc_x, buff_norm_x ); break; } case FLA_COMPLEX: { scomplex *buff_x = ( scomplex * ) FLA_COMPLEX_PTR( x ); float *buff_norm_x = ( float * ) FLA_COMPLEX_PTR( norm_x ); bli_cnrm2( num_elem, buff_x, inc_x, buff_norm_x ); break; } case FLA_DOUBLE_COMPLEX: { dcomplex *buff_x = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( x ); double *buff_norm_x = ( double * ) FLA_DOUBLE_COMPLEX_PTR( norm_x ); bli_znrm2( num_elem, buff_x, inc_x, buff_norm_x ); break; } } return FLA_SUCCESS; }
int main( int argc, char** argv ) { FLA_Datatype datatype = TESTTYPE; FLA_Datatype realtype = REALTYPE; FLA_Obj A, TU, TV, A_copy, A_recovered, U, V, Vb, B, Be, d, e, DU, DV; FLA_Obj ATL, ATR, ABL, ABR, Ae; FLA_Uplo uplo; dim_t m, n, min_m_n; FLA_Error init_result; double residual_A = 0.0; if ( argc == 3 ) { m = atoi(argv[1]); n = atoi(argv[2]); min_m_n = min(m,n); } else { fprintf(stderr, " \n"); fprintf(stderr, "Usage: %s m n\n", argv[0]); fprintf(stderr, " m : matrix length\n"); fprintf(stderr, " n : matrix width\n"); fprintf(stderr, " \n"); return -1; } if ( m == 0 || n == 0 ) return 0; FLA_Init_safe( &init_result ); // FLAME Bidiag setup FLA_Obj_create( datatype, m, n, 0, 0, &A ); FLA_Bidiag_UT_create_T( A, &TU, &TV ); // Rand A and create A_copy. FLA_Random_matrix( A ); { scomplex *buff_A = FLA_Obj_buffer_at_view( A ); buff_A[0].real = 4.4011e-01; buff_A[0].imag = -4.0150e-09; buff_A[2].real = -2.2385e-01; buff_A[2].imag = -1.5546e-01; buff_A[4].real = -6.3461e-02; buff_A[4].imag = 2.7892e-01; buff_A[6].real = -1.3197e-01; buff_A[6].imag = 5.0888e-01; buff_A[1].real = 3.3352e-01; buff_A[1].imag = -6.6346e-02; buff_A[3].real = -1.9307e-01; buff_A[3].imag = -8.4066e-02; buff_A[5].real = -6.0446e-03; buff_A[5].imag = 2.2094e-01; buff_A[7].real = -2.3299e-02; buff_A[7].imag = 4.0553e-01; } //FLA_Set_to_identity( A ); //FLA_Scal( FLA_MINUS_ONE, A ); if ( m >= n ) { uplo = FLA_UPPER_TRIANGULAR; FLA_Part_2x2( A, &ATL, &ATR, &ABL, &ABR, min_m_n - 1, 1, FLA_TL ); Ae = ATR; } else { uplo = FLA_LOWER_TRIANGULAR; FLA_Part_2x2( A, &ATL, &ATR, &ABL, &ABR, 1, min_m_n - 1, FLA_TL ); Ae = ABL; } FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &A_copy ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_recovered ); // Bidiag test { FLA_Obj norm; FLA_Bool apply_scale; FLA_Obj_create( realtype, 1,1, 0,0, &norm ); FLA_Max_abs_value( A, norm ); apply_scale = FLA_Obj_gt( norm, FLA_OVERFLOW_SQUARE_THRES ); if ( apply_scale ) FLA_Scal( FLA_SAFE_MIN, A ); FLA_Bidiag_UT( A, TU, TV ); if ( apply_scale ) FLA_Bidiag_UT_scale_diagonals( FLA_SAFE_INV_MIN, A ); FLA_Obj_free( &norm ); } // Orthonomal basis U, V. FLA_Obj_create( datatype, m, min_m_n, 0, 0, &U ); FLA_Set( FLA_ZERO, U ); FLA_Obj_create( datatype, min_m_n, n, 0, 0, &V ); FLA_Set( FLA_ZERO, V ); FLA_Bidiag_UT_form_U_ext( uplo, A, TU, FLA_NO_TRANSPOSE, U ); FLA_Bidiag_UT_form_V_ext( uplo, A, TV, FLA_CONJ_TRANSPOSE, V ); if ( FLA_Obj_is_complex( A ) ){ FLA_Obj rL, rR; FLA_Obj_create( datatype, min_m_n, 1, 0, 0, &rL ); FLA_Obj_create( datatype, min_m_n, 1, 0, 0, &rR ); FLA_Obj_fshow( stdout, " - Factor no realified - ", A, "% 6.4e", "------"); FLA_Bidiag_UT_realify( A, rL, rR ); FLA_Obj_fshow( stdout, " - Factor realified - ", A, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - rL - ", rL, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - rR - ", rR, "% 6.4e", "------"); FLA_Apply_diag_matrix( FLA_RIGHT, FLA_CONJUGATE, rL, U ); FLA_Apply_diag_matrix( FLA_LEFT, FLA_CONJUGATE, rR, V ); FLA_Obj_free( &rL ); FLA_Obj_free( &rR ); } // U^H U FLA_Obj_create( datatype, min_m_n, min_m_n, 0, 0, &DU ); FLA_Gemm_external( FLA_CONJ_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, U, U, FLA_ZERO, DU ); // V^H V FLA_Obj_create( datatype, min_m_n, min_m_n, 0, 0, &DV ); FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, V, V, FLA_ZERO, DV ); // Recover the matrix FLA_Obj_create( datatype, min_m_n, min_m_n, 0, 0, &B ); FLA_Set( FLA_ZERO, B ); // Set B FLA_Obj_create( datatype, min_m_n, 1, 0, 0, &d ); FLA_Set_diagonal_vector( A, d ); FLA_Set_diagonal_matrix( d, B ); FLA_Obj_free( &d ); if ( min_m_n > 1 ) { FLA_Obj_create( datatype, min_m_n - 1 , 1, 0, 0, &e ); FLA_Set_diagonal_vector( Ae, e ); if ( uplo == FLA_UPPER_TRIANGULAR ) { FLA_Part_2x2( B, &ATL, &ATR, &ABL, &ABR, min_m_n - 1, 1, FLA_TL ); Be = ATR; } else { FLA_Part_2x2( B, &ATL, &ATR, &ABL, &ABR, 1, min_m_n - 1, FLA_TL ); Be = ABL; } FLA_Set_diagonal_matrix( e, Be ); FLA_Obj_free( &e ); } // Vb := B (V^H) FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, V, &Vb ); FLA_Trmm_external( FLA_LEFT, uplo, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_ONE, B, Vb ); // A := U Vb FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, U, Vb, FLA_ZERO, A_recovered ); residual_A = FLA_Max_elemwise_diff( A_copy, A_recovered ); if (1) { FLA_Obj_fshow( stdout, " - Given - ", A_copy, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - Factor - ", A, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - TU - ", TU, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - TV - ", TV, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - B - ", B, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - U - ", U, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - V - ", V, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - Vb - ", Vb, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - U'U - ", DU, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - VV' - ", DV, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - Recovered A - ", A_recovered, "% 6.4e", "------"); fprintf( stdout, "lapack2flame: %lu x %lu: ", m, n); fprintf( stdout, "recovery A = %12.10e\n\n", residual_A ) ; } FLA_Obj_free( &A ); FLA_Obj_free( &TU ); FLA_Obj_free( &TV ); FLA_Obj_free( &B ); FLA_Obj_free( &U ); FLA_Obj_free( &V ); FLA_Obj_free( &Vb ); FLA_Obj_free( &DU ); FLA_Obj_free( &DV ); FLA_Obj_free( &A_copy ); FLA_Obj_free( &A_recovered ); FLA_Finalize_safe( init_result ); }
int test_gemm( FILE* stream, param_t param, result_t *result) { FLA_Datatype datatype = param.datatype; FLA_Trans transa = param.trans[0], transb = param.trans[1]; FLA_Obj A, B, C, x, y, z, w; FLA_Obj alpha, beta; double time, time_min = MAX_TIME_VALUE; unsigned int i, m = param.dims[0], n = param.dims[1], k = param.dims[2], repeat = param.repeat; int is_trans, is_complex; // Create matrices. is_trans = (transa == FLA_NO_TRANSPOSE); FLA_Obj_create( datatype, (is_trans ? m:k), (is_trans ? k:m), 0,0, &A ); is_trans = (transb == FLA_NO_TRANSPOSE); FLA_Obj_create( datatype, (is_trans ? k:n), (is_trans ? n:k), 0,0, &B ); FLA_Obj_create( datatype, m, n, 0,0, &C ); FLA_Obj_create( datatype, n, 1, 0, 0, &x ); FLA_Obj_create( datatype, m, 1, 0, 0, &y ); FLA_Obj_create( datatype, m, 1, 0, 0, &z ); FLA_Obj_create( datatype, k, 1, 0, 0, &w ); // Initialize the test matrices. FLA_Random_matrix( A ); FLA_Random_matrix( B ); FLA_Random_matrix( C ); FLA_Random_matrix( x ); FLA_Set( FLA_ZERO, y ); FLA_Set( FLA_ZERO, w ); FLA_Set( FLA_ZERO, z ); // Constants. alpha = FLA_MINUS_ONE; beta = FLA_ZERO; // Repeat the experiment repeat times and record results. for ( i = 0; i < repeat; ++i ) { time = FLA_Clock(); FLA_Gemm_external( transa, transb, alpha, A, B, beta, C ); time = FLA_Clock() - time; time_min = min( time_min, time ); } is_complex = FLA_Obj_is_complex( C ); result->performance = ( FMULS * FP_PER_MUL(is_complex) + FADDS * FP_PER_ADD(is_complex) )/time_min/FLOPS_PER_UNIT_PERF; FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_ONE, C, x, FLA_ZERO, y ); FLA_Gemv_external( transb, FLA_ONE, B, x, FLA_ZERO, w ); FLA_Gemv_external( transa, alpha, A, w, FLA_ZERO, z ); result->residual = FLA_Max_elemwise_diff( y, z ); FLA_Obj_free( &A ); FLA_Obj_free( &B ); FLA_Obj_free( &C ); FLA_Obj_free( &x ); FLA_Obj_free( &y ); FLA_Obj_free( &z ); FLA_Obj_free( &w ); return 0; }
int main(int argc, char *argv[]) { int datatype, n_blocks_m, n_threads, m_input, n_input, m, n, p_first, p_last, p_inc, p, n_repeats, param_combo, i, n_param_combos = N_PARAM_COMBOS; dim_t nb_flash, nb_alg; char *colors = "brkgmcbrkgmcbrkgmc"; char *ticks = "o+*xso+*xso+*xso+*xs"; char m_dim_desc[14]; char n_dim_desc[14]; char m_dim_tag[10]; char n_dim_tag[10]; double max_gflops=6.0; double dtime, gflops, diff; FLA_Obj A, A_flat_ref, A_flat, B, B_flat, D, D_flat, t, T, T_flat; FLA_Init( ); fprintf( stdout, "%c number of repeats: ", '%' ); scanf( "%d", &n_repeats ); fprintf( stdout, "%c %d\n", '%', n_repeats ); fprintf( stdout, "%c enter algorithmic blocksize: ", '%' ); scanf( "%u", &nb_alg ); fprintf( stdout, "%c %u\n", '%', nb_alg ); fprintf( stdout, "%c enter problem size first, last, inc: ", '%' ); scanf( "%d%d%d", &p_first, &p_last, &p_inc ); fprintf( stdout, "%c %d %d %d\n", '%', p_first, p_last, p_inc ); fprintf( stdout, "%c enter m n (-1 means bind to problem size): ", '%' ); scanf( "%d%d", &m_input, &n_input ); fprintf( stdout, "%c %d %d\n", '%', m_input, n_input ); fprintf( stdout, "%c enter the number of SuperMatrix threads: ", '%' ); scanf( "%d", &n_threads ); fprintf( stdout, "%c %d\n", '%', n_threads ); fprintf( stdout, "\nclear all;\n\n" ); if ( m_input > 0 ) { sprintf( m_dim_desc, "m = %d", m_input ); sprintf( m_dim_tag, "m%dc", m_input); } else if( m_input < -1 ) { sprintf( m_dim_desc, "m = p/%d", -m_input ); sprintf( m_dim_tag, "m%dp", -m_input ); } else if( m_input == -1 ) { sprintf( m_dim_desc, "m = p" ); sprintf( m_dim_tag, "m%dp", 1 ); } if ( n_input > 0 ) { sprintf( n_dim_desc, "n = %d", n_input ); sprintf( n_dim_tag, "n%dc", n_input); } else if( n_input < -1 ) { sprintf( n_dim_desc, "n = p/%d", -n_input ); sprintf( n_dim_tag, "n%dp", -n_input ); } else if( n_input == -1 ) { sprintf( n_dim_desc, "n = p" ); sprintf( n_dim_tag, "n%dp", 1 ); } //datatype = FLA_FLOAT; //datatype = FLA_DOUBLE; //datatype = FLA_COMPLEX; datatype = FLA_DOUBLE_COMPLEX; FLASH_Queue_set_num_threads( n_threads ); for ( p = p_first, i = 1; p <= p_last; p += p_inc, i += 1 ) { m = m_input; n = n_input; if( m < 0 ) m = p / abs(m_input); if( n < 0 ) n = p / abs(n_input); nb_flash = n; for ( param_combo = 0; param_combo < n_param_combos; param_combo++ ) { FLA_Obj_create( datatype, m, nb_flash, &A_flat ); FLA_Obj_create( datatype, m, nb_flash, &A_flat_ref ); FLA_Obj_create( datatype, m, nb_flash, &T_flat ); FLA_Obj_create( datatype, nb_flash, 1, &t ); FLASH_Obj_create( datatype, m, nb_flash, 1, &nb_flash, &A ); n_blocks_m = FLA_Obj_length( A ); FLASH_Obj_create_ext( datatype, nb_alg * n_blocks_m, nb_flash, 1, &nb_alg, &nb_flash, &T ); FLA_Set( FLA_ZERO, T_flat ); FLASH_Set( FLA_ZERO, T ); FLASH_Random_matrix( A ); FLASH_Obj_flatten( A, A_flat ); FLA_Part_2x1( A, &B, &D, 1, FLA_TOP ); FLA_Part_2x1( A_flat, &B_flat, &D_flat, FLA_Obj_width( A_flat ), FLA_TOP ); FLA_Triangularize( FLA_UPPER_TRIANGULAR, FLA_NONUNIT_DIAG, *(FLASH_OBJ_PTR_AT(B)) ); FLA_Triangularize( FLA_UPPER_TRIANGULAR, FLA_NONUNIT_DIAG, B_flat ); fprintf( stdout, "data_qr2ut_%s( %d, 1:5 ) = [ %d ", pc_str[param_combo], i, p ); fflush( stdout ); time_QR2_UT( param_combo, FLA_ALG_REFERENCE, n_repeats, m, n, A, A_flat_ref, B, B_flat, D, D_flat, A_flat, t, T, T_flat, &dtime, &diff, &gflops ); fprintf( stdout, "%6.3lf %6.2le ", gflops, diff ); fflush( stdout ); time_QR2_UT( param_combo, FLA_ALG_FRONT, n_repeats, m, n, A, A_flat_ref, B, B_flat, D, D_flat, A_flat, t, T, T_flat, &dtime, &diff, &gflops ); fprintf( stdout, "%6.3lf %6.2le ", gflops, diff ); fflush( stdout ); fprintf( stdout, " ]; \n" ); fflush( stdout ); FLA_Obj_free( &A_flat ); FLA_Obj_free( &A_flat_ref ); FLA_Obj_free( &T_flat ); FLA_Obj_free( &t ); FLASH_Obj_free( &A ); FLASH_Obj_free( &T ); } fprintf( stdout, "\n" ); } fprintf( stdout, "figure;\n" ); fprintf( stdout, "hold on;\n" ); for ( i = 0; i < n_param_combos; i++ ) { fprintf( stdout, "plot( data_qr2ut_%s( :,1 ), data_qr2ut_%s( :, 2 ), '%c:%c' ); \n", pc_str[i], pc_str[i], colors[ i ], ticks[ i ] ); fprintf( stdout, "plot( data_qr2ut_%s( :,1 ), data_qr2ut_%s( :, 4 ), '%c-.%c' ); \n", pc_str[i], pc_str[i], colors[ i ], ticks[ i ] ); } fprintf( stdout, "legend( ... \n" ); for ( i = 0; i < n_param_combos; i++ ) fprintf( stdout, "'ref\\_qr2ut\\_%s', 'fla\\_qr2ut\\_%s', ... \n", pc_str[i], pc_str[i] ); fprintf( stdout, "'Location', 'SouthEast' ); \n" ); fprintf( stdout, "xlabel( 'problem size p' );\n" ); fprintf( stdout, "ylabel( 'GFLOPS/sec.' );\n" ); fprintf( stdout, "axis( [ 0 %d 0 %.2f ] ); \n", p_last, max_gflops ); fprintf( stdout, "title( 'FLAME qr2ut front-end performance (%s, %s)' );\n", m_dim_desc, n_dim_desc ); fprintf( stdout, "print -depsc qr2ut_front_%s_%s.eps\n", m_dim_tag, n_dim_tag ); fprintf( stdout, "hold off;\n"); fflush( stdout ); FLA_Finalize( ); return 0; }