/*
 * FLA_QR_UT_solve
 *
 * Computes X from A, T and B in three steps:
 *   1. a private copy of B is overwritten with the result of applying Q'
 *      (FLA_CONJ_TRANSPOSE, from the left) via FLA_Apply_Q_UT using A and T;
 *   2. the top width(A) rows of A are used as an upper-triangular,
 *      non-unit-diagonal matrix in a triangular solve against the top
 *      width(A) rows of that copy;
 *   3. the solved top part is copied into X.
 *
 * A and T are presumably the output of a prior QR_UT factorization --
 * this routine does not verify that beyond FLA_QR_UT_solve_check().
 * B is not modified.  Always returns FLA_SUCCESS.
 */
FLA_Error FLA_QR_UT_solve( FLA_Obj A, FLA_Obj T, FLA_Obj B, FLA_Obj X )
{
    FLA_Obj work;               /* workspace for FLA_Apply_Q_UT          */
    FLA_Obj rhs;                /* scratch copy of B                     */
    FLA_Obj R_top, A_rest;      /* row partition of A                    */
    FLA_Obj rhs_top, rhs_rest;  /* matching row partition of rhs         */
    dim_t   n_A;

    /* Optional parameter validation. */
    if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING )
        FLA_QR_UT_solve_check( A, T, B, X );

    /* Temporaries: workspace sized from T and B, plus a copy of B. */
    FLA_Apply_Q_UT_create_workspace( T, B, &work );
    FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, B, &rhs );

    /* rhs := Q' * rhs */
    FLA_Apply_Q_UT( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_FORWARD, FLA_COLUMNWISE,
                    A, T, work, rhs );

    /* Split off the leading width(A) rows of both A and rhs. */
    n_A = FLA_Obj_width( A );
    FLA_Part_2x1( A,   &R_top,
                       &A_rest,     n_A, FLA_TOP );
    FLA_Part_2x1( rhs, &rhs_top,
                       &rhs_rest,   n_A, FLA_TOP );

    /* rhs_top := inv( triu( R_top ) ) * rhs_top */
    FLA_Trsm_external( FLA_LEFT, FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE,
                       FLA_NONUNIT_DIAG, FLA_ONE, R_top, rhs_top );

    /* Hand the solution back to the caller. */
    FLA_Copy_external( rhs_top, X );

    FLA_Obj_free( &rhs );
    FLA_Obj_free( &work );

    return FLA_SUCCESS;
}
int phonopy_pinv_libflame(double *matrix, double *eigvals, const int size, const double cutoff) { FLA_Obj A, B, l; /* FLA_Obj C; */ double *inv_eigvals; int i; inv_eigvals = (double*)malloc(sizeof(double) * size); FLA_Init(); FLA_Obj_create_without_buffer(FLA_DOUBLE, size, size, &A); FLA_Obj_attach_buffer(matrix, 0, 0, &A); FLA_Obj_create_without_buffer(FLA_DOUBLE, size, 1, &l); FLA_Obj_attach_buffer(eigvals, 0, 0, &l); /* Eigensolver */ FLA_Obj_create_copy_of(FLA_NO_TRANSPOSE, A, &B); FLA_Hevd(FLA_EVD_WITH_VECTORS, FLA_LOWER_TRIANGULAR, B, l); /* SVD */ /* FLA_Obj_create(FLA_DOUBLE, size, size, 0, 0, &B); */ /* use U */ /* FLA_Svd(FLA_SVD_VECTORS_ALL, FLA_SVD_VECTORS_NONE, A, l, B, C); */ /* use V */ /* FLA_Svd(FLA_SVD_VECTORS_NONE, FLA_SVD_VECTORS_ALL, A, l, C, B); */ FLA_Obj_free_without_buffer(&l); for (i = 0; i < size; i++) { if (eigvals[i] < cutoff) { inv_eigvals[i] = 0; } else { inv_eigvals[i] = 1.0 / sqrt(eigvals[i]); } } FLA_Obj_create_without_buffer(FLA_DOUBLE, size, 1, &l); FLA_Obj_attach_buffer(inv_eigvals, 0, 0, &l); FLA_Apply_diag_matrix(FLA_RIGHT, FLA_NO_CONJUGATE, l, B); FLA_Syrk(FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_ONE, B, FLA_ZERO, A); FLA_Symmetrize(FLA_LOWER_TRIANGULAR, A); FLA_Obj_free_without_buffer(&A); FLA_Obj_free_without_buffer(&l); FLA_Obj_free(&B); FLA_Finalize(); free(inv_eigvals); return 0; }
/*
 * FLA_UDdate_UT_update_rhs
 *
 * Applies the up-and-downdating transformation Q' (FLA_CONJ_TRANSPOSE, from
 * the left) to the right-hand sides through FLA_Apply_QUD_UT.  bR is passed
 * directly to that routine, whereas bC and bD are passed as temporary
 * copies, so the caller's bC and bD are left unmodified.
 *
 * Always returns FLA_SUCCESS.
 */
FLA_Error FLA_UDdate_UT_update_rhs( FLA_Obj T,
                                    FLA_Obj bR,
                                    FLA_Obj C, FLA_Obj bC,
                                    FLA_Obj D, FLA_Obj bD )
{
    FLA_Obj workspace;
    FLA_Obj bC_scratch;
    FLA_Obj bD_scratch;

    /* Optional parameter validation. */
    if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING )
        FLA_UDdate_UT_update_rhs_check( T, bR, C, bC, D, bD );

    /* Workspace is sized from T (the algorithmic blocksize) and bR (the
       number of right-hand sides). */
    FLA_Apply_QUD_UT_create_workspace( T, bR, &workspace );

    /* Work on duplicates of bC and bD so their original contents are not
       destroyed. */
    FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, bC, &bC_scratch );
    FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, bD, &bD_scratch );

    /* Apply Q' to the stacked right-hand sides. */
    FLA_Apply_QUD_UT( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_FORWARD, FLA_COLUMNWISE,
                      T, workspace,
                         bR,
                      C, bC_scratch,
                      D, bD_scratch );

    /* Release the duplicates, then the workspace. */
    FLA_Obj_free( &bC_scratch );
    FLA_Obj_free( &bD_scratch );
    FLA_Obj_free( &workspace );

    return FLA_SUCCESS;
}
/*
 * libfla_test_symm_experiment
 *
 * Runs one timing/accuracy experiment for the symm operation
 *   C := beta * C + alpha * A * B   (side == FLA_LEFT)
 *   C := beta * C + alpha * B * A   (side == FLA_RIGHT)
 * with alpha = FLA_TWO and beta = FLA_MINUS_ONE.
 *
 * params    : supplies the FLASH and flat algorithmic blocksizes.
 * var       : variant number, forwarded to the control-tree constructor.
 * sc_str    : storage characters for A, B and C (indices 0, 1, 2).
 * datatype  : element datatype of all matrices and vectors.
 * p_cur     : current problem size; m and n are derived from it.
 * pci       : index into the global pc_str table selecting side/uplo.
 * n_repeats : number of timed repetitions; the minimum time is kept.
 * impl      : which implementation (front-end / variant) to exercise.
 * perf      : out, performance of the fastest repetition.
 * residual  : out, max elementwise difference between C*x and the
 *             reference ( beta*C_orig + alpha*op(A,B) )*x for a random x.
 */
void libfla_test_symm_experiment( test_params_t params,
                                  unsigned int  var,
                                  char*         sc_str,
                                  FLA_Datatype  datatype,
                                  unsigned int  p_cur,
                                  unsigned int  pci,
                                  unsigned int  n_repeats,
                                  signed int    impl,
                                  double*       perf,
                                  double*       residual )
{
    dim_t        b_flash    = params.b_flash;
    dim_t        b_alg_flat = params.b_alg_flat;
    double       time_min   = 1e9;
    double       time;
    unsigned int i;
    unsigned int m;
    signed int   m_input    = -1;
    unsigned int n;
    signed int   n_input    = -1;
    FLA_Side     side;
    FLA_Uplo     uplo;
    FLA_Obj      A, B, C, x, y, z, w, norm;
    FLA_Obj      alpha, beta;
    FLA_Obj      C_save;
    FLA_Obj      A_test, B_test, C_test;

    // Determine the dimensions. A negative *_input divides p_cur by its
    // magnitude; a non-negative one uses p_cur directly.
    if ( m_input < 0 ) m = p_cur / abs(m_input);
    else               m = p_cur;
    if ( n_input < 0 ) n = p_cur / abs(n_input);
    else               n = p_cur;

    // Translate parameter characters to libflame constants.
    FLA_Param_map_char_to_flame_side( &pc_str[pci][0], &side );
    FLA_Param_map_char_to_flame_uplo( &pc_str[pci][1], &uplo );

    // Create the matrices for the current operation. A is m x m when applied
    // from the left and n x n when applied from the right; the intermediate
    // vector w is sized to match.
    if ( side == FLA_LEFT )
    {
        libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[0], m, m, &A );

        // Create vectors for use in test.
        FLA_Obj_create( datatype, n, 1, 0, 0, &x );
        FLA_Obj_create( datatype, m, 1, 0, 0, &y );
        FLA_Obj_create( datatype, m, 1, 0, 0, &z );
        FLA_Obj_create( datatype, m, 1, 0, 0, &w );
    }
    else
    {
        libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[0], n, n, &A );

        // Create vectors for use in test.
        FLA_Obj_create( datatype, n, 1, 0, 0, &x );
        FLA_Obj_create( datatype, m, 1, 0, 0, &y );
        FLA_Obj_create( datatype, m, 1, 0, 0, &z );
        FLA_Obj_create( datatype, n, 1, 0, 0, &w );
    }
    libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], m, n, &B );
    libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[2], m, n, &C );

    // Create a norm scalar.
    FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );

    // Initialize the test matrices.
    FLA_Random_symm_matrix( uplo, A );
    FLA_Random_matrix( B );
    FLA_Random_matrix( C );

    // Initialize the test vectors.
    FLA_Random_matrix( x );
    FLA_Set( FLA_ZERO, y );
    FLA_Set( FLA_ZERO, z );
    FLA_Set( FLA_ZERO, w );

    // Set constants.
    alpha = FLA_TWO;
    beta  = FLA_MINUS_ONE;

    // Save the original object contents in a temporary object.
    FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, C, &C_save );

    // Use hierarchical matrices if we're testing the FLASH front-end.
    if ( impl == FLA_TEST_HIER_FRONT_END )
    {
        FLASH_Obj_create_hier_copy_of_flat( A, 1, &b_flash, &A_test );
        FLASH_Obj_create_hier_copy_of_flat( B, 1, &b_flash, &B_test );
        FLASH_Obj_create_hier_copy_of_flat( C, 1, &b_flash, &C_test );
    }
    else
    {
        A_test = A;
        B_test = B;
        C_test = C;
    }

    // Create a control tree for the individual variants.
    if ( impl == FLA_TEST_FLAT_UNB_VAR ||
         impl == FLA_TEST_FLAT_OPT_VAR ||
         impl == FLA_TEST_FLAT_BLK_VAR ||
         impl == FLA_TEST_FLAT_UNB_EXT ||
         impl == FLA_TEST_FLAT_BLK_EXT )
        libfla_test_symm_cntl_create( var, b_alg_flat );

    // Repeat the experiment n_repeats times and record results. C is
    // restored from C_save before every repetition so each run sees the
    // same input.
    for ( i = 0; i < n_repeats; ++i )
    {
        if ( impl == FLA_TEST_HIER_FRONT_END )
            FLASH_Obj_hierarchify( C_save, C_test );
        else
            FLA_Copy_external( C_save, C_test );

        time = FLA_Clock();

        libfla_test_symm_impl( impl, side, uplo, alpha, A_test, B_test, beta, C_test );

        time = FLA_Clock() - time;
        time_min = min( time_min, time );
    }

    // Copy the result back to the flat matrix C.
    if ( impl == FLA_TEST_HIER_FRONT_END )
    {
        FLASH_Obj_flatten( C_test, C );
    }
    else
    {
        // No action needed since C_test and C refer to the same object.
    }

    // Free the hierarchical matrices if we're testing the FLASH front-end.
    if ( impl == FLA_TEST_HIER_FRONT_END )
    {
        FLASH_Obj_free( &A_test );
        FLASH_Obj_free( &B_test );
        FLASH_Obj_free( &C_test );
    }

    // Free the control trees if we're testing the variants.
    if ( impl == FLA_TEST_FLAT_UNB_VAR ||
         impl == FLA_TEST_FLAT_OPT_VAR ||
         impl == FLA_TEST_FLAT_BLK_VAR ||
         impl == FLA_TEST_FLAT_UNB_EXT ||
         impl == FLA_TEST_FLAT_BLK_EXT )
        libfla_test_symm_cntl_free();

    // Compute the performance of the best experiment repeat. A symmetric
    // matrix-matrix multiply costs 2*m*m*n flops from the left and 2*m*n*n
    // from the right. The product is formed in double precision so it
    // cannot wrap around in unsigned int arithmetic for large sizes.
    if ( side == FLA_LEFT )
        *perf = ( 2.0 * m * m * n ) / time_min / FLOPS_PER_UNIT_PERF;
    else
        *perf = ( 2.0 * m * n * n ) / time_min / FLOPS_PER_UNIT_PERF;
    if ( FLA_Obj_is_complex( A ) ) *perf *= 4.0;

    // Compute:
    //   y = C * x
    // and compare to
    //   z = ( beta * C_orig + alpha * A * B ) x     (side = left)
    //   z = ( beta * C_orig + alpha * B * A ) x     (side = right)
    FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_ONE, C, x, FLA_ZERO, y );

    if ( side == FLA_LEFT )
    {
        FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_ONE, B, x, FLA_ZERO, w );
        FLA_Symv_external( uplo, alpha, A, w, FLA_ZERO, z );
    }
    else
    {
        FLA_Symv_external( uplo, FLA_ONE, A, x, FLA_ZERO, w );
        FLA_Gemv_external( FLA_NO_TRANSPOSE, alpha, B, w, FLA_ZERO, z );
    }
    FLA_Gemv_external( FLA_NO_TRANSPOSE, beta, C_save, x, FLA_ONE, z );

    // Compute the maximum elementwise difference between y and z.
    *residual = FLA_Max_elemwise_diff( y, z );

    // Free the supporting flat objects.
    FLA_Obj_free( &C_save );

    // Free the flat test matrices.
    FLA_Obj_free( &A );
    FLA_Obj_free( &B );
    FLA_Obj_free( &C );
    FLA_Obj_free( &x );
    FLA_Obj_free( &y );
    FLA_Obj_free( &z );
    FLA_Obj_free( &w );
    FLA_Obj_free( &norm );
}
/*
 * FLA_Sort
 *
 * Sorts the elements of vector x in place.
 *
 * direct : FLA_FORWARD selects the FLA_Sort_f_* kernels; any other value
 *          selects the FLA_Sort_b_* kernels.
 * x      : vector object to sort. Only FLA_FLOAT and FLA_DOUBLE are handled
 *          here; other datatypes fall through the switch untouched.
 *
 * If x does not have unit stride, the sort is carried out on a temporary
 * copy, which is then copied back into x and released.
 *
 * Always returns FLA_SUCCESS.
 */
FLA_Error FLA_Sort( FLA_Direct direct, FLA_Obj x )
{
    FLA_Datatype datatype;
    FLA_Obj      x_use;
    dim_t        m_x;
    dim_t        inc_x;
    int          using_copy;

    if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING )
        FLA_Sort_check( direct, x );

    datatype = FLA_Obj_datatype( x );

    m_x      = FLA_Obj_vector_dim( x );
    inc_x    = FLA_Obj_vector_inc( x );

    // Remember whether a temporary is in play. inc_x is overwritten below
    // with the temporary's own increment, so it cannot be used afterwards
    // to decide whether the copy-back and free are needed.
    using_copy = ( inc_x != 1 );

    // If the vector does not have unit stride, copy it to a temporary vector
    // that does have unit stride.
    if ( using_copy )
    {
        FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, x, &x_use );
        inc_x = FLA_Obj_vector_inc( x_use );
    }
    else
    {
        x_use = x;
    }

    switch ( datatype )
    {
        case FLA_FLOAT:
        {
            float* x_p = ( float* ) FLA_FLOAT_PTR( x_use );

            if ( direct == FLA_FORWARD )
                FLA_Sort_f_ops( m_x, x_p, inc_x );
            else // if ( direct == FLA_BACKWARD )
                FLA_Sort_b_ops( m_x, x_p, inc_x );

            break;
        }

        case FLA_DOUBLE:
        {
            double* x_p = ( double* ) FLA_DOUBLE_PTR( x_use );

            if ( direct == FLA_FORWARD )
                FLA_Sort_f_opd( m_x, x_p, inc_x );
            else // if ( direct == FLA_BACKWARD )
                FLA_Sort_b_opd( m_x, x_p, inc_x );

            break;
        }
    }

    // Write the sorted data back to the caller's vector and release the
    // temporary, if one was created.
    if ( using_copy )
    {
        FLA_Copy( x_use, x );
        FLA_Obj_free( &x_use );
    }

    return FLA_SUCCESS;
}
/*
 * libfla_test_qrut_experiment
 *
 * Times the QR_UT factorization selected by `impl`/`var` on a random
 * m x n matrix (m = 2 * p_cur, n = p_cur with the current *_input values),
 * then validates it by solving a linear system with the factors.
 *
 * perf     : out, performance of the fastest of n_repeats runs, based on
 *            2*m*n^2 - (2/3)*n^3 flops (times 4 for complex datatypes).
 * residual : out, 2-norm of A' * ( A * x - b ), where x is the computed
 *            solution and b is a random right-hand side.
 */
void libfla_test_qrut_experiment( test_params_t params,
                                  unsigned int  var,
                                  char*         sc_str,
                                  FLA_Datatype  datatype,
                                  unsigned int  p_cur,
                                  unsigned int  pci,
                                  unsigned int  n_repeats,
                                  signed int    impl,
                                  double*       perf,
                                  double*       residual )
{
    /* Which code paths apply to this implementation. */
    const int    is_hier  = ( impl == FLA_TEST_HIER_FRONT_END );
    const int    has_cntl = ( impl == FLA_TEST_FLAT_UNB_VAR ||
                              impl == FLA_TEST_FLAT_OPT_VAR ||
                              impl == FLA_TEST_FLAT_BLK_VAR );

    dim_t        blk_hier = params.b_flash;
    dim_t        blk_flat = params.b_alg_flat;
    dim_t        t_rows;
    double       best     = 1e9;
    double       elapsed;
    unsigned int rep;
    unsigned int m, n;
    unsigned int k;
    signed int   m_input  = -2;
    signed int   n_input  = -1;
    FLA_Obj      A, T, x, b, y, norm;
    FLA_Obj      A_save;
    FLA_Obj      A_test, T_test, x_test, b_test;

    /* Problem dimensions: a negative *_input scales p_cur by its magnitude. */
    m = ( m_input < 0 ) ? p_cur * abs(m_input) : p_cur;
    n = ( n_input < 0 ) ? p_cur * abs(n_input) : p_cur;
    k = min( m, n );

    /* The matrix to factor. */
    libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[0], m, n, &A );

    /* T always has min(m,n) columns; only its row count depends on the
       implementation and variant under test. */
    if ( impl == FLA_TEST_FLAT_FRONT_END ||
         ( impl == FLA_TEST_FLAT_BLK_VAR && var == 1 ) )
        t_rows = blk_flat;
    else if ( var == 2 )
        t_rows = k;
    else
        t_rows = 1;
    libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], t_rows, k, &T );

    /* Random input, with a pristine copy kept for resets and the residual. */
    FLA_Random_matrix( A );
    FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &A_save );

    /* Vectors of the linear system and of the residual check. */
    FLA_Obj_create( datatype, n, 1, 0, 0, &x );
    FLA_Obj_create( datatype, m, 1, 0, 0, &b );
    FLA_Obj_create( datatype, n, 1, 0, 0, &y );

    /* Real scalar that receives the residual norm. */
    FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );

    /* Random right-hand side. */
    FLA_Random_matrix( b );

    /* The FLASH front-end works on hierarchical objects; everything else
       operates directly on the flat ones. */
    if ( is_hier )
    {
        FLASH_QR_UT_create_hier_matrices( A, 1, &blk_hier, &A_test, &T_test );
        FLASH_Obj_create_hier_copy_of_flat( b, 1, &blk_hier, &b_test );
        FLASH_Obj_create_hier_copy_of_flat( x, 1, &blk_hier, &x_test );
    }
    else
    {
        A_test = A;
        T_test = T;
    }

    /* Individual variants need a control tree. */
    if ( has_cntl )
        libfla_test_qrut_cntl_create( var, blk_flat );

    /* Timed repetitions; the input is restored before each one. */
    for ( rep = 0; rep < n_repeats; ++rep )
    {
        if ( is_hier )
            FLASH_Obj_hierarchify( A_save, A_test );
        else
            FLA_Copy_external( A_save, A_test );

        elapsed = FLA_Clock();

        libfla_test_qrut_impl( impl, A_test, T_test );

        elapsed = FLA_Clock() - elapsed;
        best    = min( best, elapsed );
    }

    /* Solve a linear system with the factorization just computed. */
    if ( is_hier )
    {
        FLASH_QR_UT_solve( A_test, T_test, b_test, x_test );
        FLASH_Obj_flatten( x_test, x );
    }
    else
    {
        FLA_QR_UT_solve( A_test, T_test, b, x );
    }

    /* Hierarchical objects are no longer needed. */
    if ( is_hier )
    {
        FLASH_Obj_free( &A_test );
        FLASH_Obj_free( &T_test );
        FLASH_Obj_free( &b_test );
        FLASH_Obj_free( &x_test );
    }

    /* Neither is the control tree. */
    if ( has_cntl )
        libfla_test_qrut_cntl_free();

    /* Performance of the fastest repetition. */
    *perf = ( 2.0 * m * n * n - ( 2.0 / 3.0 ) * n * n * n ) /
            best / FLOPS_PER_UNIT_PERF;
    if ( FLA_Obj_is_complex( A ) ) *perf *= 4.0;

    /* Residual: b := A*x - b, y := A'*b, residual := || y ||_2. */
    FLA_Gemv_external( FLA_NO_TRANSPOSE,   FLA_ONE, A_save, x, FLA_MINUS_ONE, b );
    FLA_Gemv_external( FLA_CONJ_TRANSPOSE, FLA_ONE, A_save, b, FLA_ZERO,      y );
    FLA_Nrm2_external( y, norm );
    FLA_Obj_extract_real_scalar( norm, residual );

    /* Supporting flat objects. */
    FLA_Obj_free( &x );
    FLA_Obj_free( &b );
    FLA_Obj_free( &y );
    FLA_Obj_free( &norm );
    FLA_Obj_free( &A_save );

    /* Flat test matrices. */
    FLA_Obj_free( &A );
    FLA_Obj_free( &T );
}
/*
 * time_Tevd_v
 *
 * Times a tridiagonal eigensolver selected by `variant`/`type`, n_repeats
 * times, restoring A, G, d and e from saved copies before every run.
 *
 * Outputs:
 *   dtime  : fastest observed run time.
 *   diff1  : Frobenius norm of tril( V * diag(l) * V' ) - A_orig, where V is
 *            the computed (sorted) contents of A and l the sorted copy of d.
 *   diff2  : Frobenius norm of V * V' - I.
 *   gflops : rate derived from dtime and a flop model in m.
 *   l      : receives the sorted contents of d from the last run.
 *
 * A, G, d and e are restored to their entry contents before returning.
 */
void time_Tevd_v( int variant, int type, int n_repeats, int m, int k_accum, int b_alg, int n_iter_max,
                  FLA_Obj A_orig, FLA_Obj d, FLA_Obj e, FLA_Obj G, FLA_Obj R, FLA_Obj W, FLA_Obj A, FLA_Obj l,
                  double *dtime, double *diff1, double* diff2, double *gflops )
{
    int     rep;
    double  k;
    double  cplx_mult;
    double  best_time = 1.0e9;
    FLA_Obj A_backup, G_backup, d_backup, e_backup;

    /* Developer switch: re-enable individual conditions to skip a
       variant/type combination and report zeros instead. */
    if (
         //( variant == 0 ) ||
         //( variant == 1 && type == FLA_ALG_UNB_OPT ) ||
         //( variant == 2 && type == FLA_ALG_UNB_OPT ) ||
         FALSE
       )
    {
        *dtime  = 0.0;
        *gflops = 0.0;
        *diff1  = 0.0;
        *diff2  = 0.0;
        return;
    }

    /* Snapshot every object the solvers overwrite. */
    FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_backup );
    FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, G, &G_backup );
    FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, d, &d_backup );
    FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, e, &e_backup );

    FLA_Copy_external( A, A_backup );
    FLA_Copy_external( G, G_backup );
    FLA_Copy_external( d, d_backup );
    FLA_Copy_external( e, e_backup );

    for ( rep = 0; rep < n_repeats; rep++ )
    {
        /* Start every repetition from identical inputs. */
        FLA_Copy_external( A_backup, A );
        FLA_Copy_external( G_backup, G );
        FLA_Copy_external( d_backup, d );
        FLA_Copy_external( e_backup, e );

        *dtime = FLA_Clock();

        if ( variant == 0 )
        {
            /* Reference implementation. */
            REF_Tevd_v( d, e, A );
        }
        else if ( variant == 1 )
        {
            if ( type == FLA_ALG_UNB_OPT )
                FLA_Tevd_v_opt_var1( n_iter_max, d, e, G, A, b_alg );
        }
        else if ( variant == 2 )
        {
            if ( type == FLA_ALG_UNB_OPT )
                FLA_Tevd_v_opt_var2( n_iter_max, d, e, G, R, W, A, b_alg );
        }

        *dtime    = FLA_Clock() - *dtime;
        best_time = min( *dtime, best_time );
    }

    /* Accuracy checks on the result of the last repetition. */
    {
        FLA_Obj V, A_rev_evd, norm, eye;

        /* Sort the eigenvalues (copied into l) together with A. */
        FLA_Copy( d, l );
        FLA_Sort_evd( FLA_FORWARD, l, A );

        FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &V );
        FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_rev_evd );
        FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &eye );
        FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );

        /* A_rev_evd := tril( ( A * diag(l) ) * V' ) */
        FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, l, A );
        FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, A, V, FLA_ZERO, A_rev_evd );
        FLA_Triangularize( FLA_LOWER_TRIANGULAR, FLA_NONUNIT_DIAG, A_rev_evd );

        /* diff1 := || A_rev_evd - A_orig ||_F */
        FLA_Axpy( FLA_MINUS_ONE, A_orig, A_rev_evd );
        FLA_Norm_frob( A_rev_evd, norm );
        FLA_Obj_extract_real_scalar( norm, diff1 );

        /* diff2 := || V * V' - I ||_F  (A_rev_evd is reused as a copy of V) */
        FLA_Set_to_identity( eye );
        FLA_Copy( V, A_rev_evd );
        FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, V, A_rev_evd, FLA_MINUS_ONE, eye );
        FLA_Norm_frob( eye, norm );
        FLA_Obj_extract_real_scalar( norm, diff2 );

        FLA_Obj_free( &V );
        FLA_Obj_free( &A_rev_evd );
        FLA_Obj_free( &eye );
        FLA_Obj_free( &norm );
    }

    /* Flop model: the cubic term counts double for complex datatypes. */
    k = 2.00;
    cplx_mult = FLA_Obj_is_complex( A ) ? 2.0 : 1.0;

    *gflops = ( ( 4.5 * k * m * m ) +
                cplx_mult * ( 3.0 * k * m * m * m ) ) /
              best_time / 1e9;

    *dtime = best_time;

    /* Leave the caller's objects as they were on entry. */
    FLA_Copy_external( A_backup, A );
    FLA_Copy_external( G_backup, G );
    FLA_Copy_external( d_backup, d );
    FLA_Copy_external( e_backup, e );

    FLA_Obj_free( &A_backup );
    FLA_Obj_free( &G_backup );
    FLA_Obj_free( &d_backup );
    FLA_Obj_free( &e_backup );
}
/*
 * time_Hevd_lv_components
 *
 * Times a Hermitian eigensolver selected by `variant`, n_repeats times,
 * and reports the overall time together with the three component times
 * (tred, tevd, appq) that the solver routines write through their output
 * pointers.  The component times reported are those of the fastest overall
 * repetition.
 *
 * Outputs:
 *   dtime, dtime_tred, dtime_tevd, dtime_appq : times of the fastest run.
 *   gflops, gflops_tred, gflops_tevd, gflops_appq : rates derived from those
 *       times and a flop model in m.
 *   diff1  : Frobenius norm of tril( V * diag(l) * V' ) - A_entry.
 *   diff2  : Frobenius norm of V * V' - I.
 *   k_perf : value returned by the FLA variants (0 for REF variants).
 *
 * A is restored to its entry contents before returning.
 */
void time_Hevd_lv_components( int variant, int type, int n_repeats, int m, int n_iter_max, int k_accum, int b_alg,
                              FLA_Obj A, FLA_Obj l,
                              double* dtime, double* diff1, double* diff2, double* gflops,
                              double* dtime_tred, double* gflops_tred,
                              double* dtime_tevd, double* gflops_tevd,
                              double* dtime_appq, double* gflops_appq, int* k_perf )
{
    int     rep;
    double  k;
    double  best_total = 1.0e9;
    double  best_tred  = 1.0e9;
    double  best_tevd  = 1.0e9;
    double  best_appq  = 1.0e9;
    double  flops_tred, flops_tevd, flops_appq;
    double  mult_tred,  mult_tevd,  mult_appq;
    int     is_cplx;
    FLA_Obj A_backup, Z;

    /* Developer switch: variants listed here are skipped and report zeros.
       Commented-out entries can be re-enabled to skip further variants. */
    if (
         ( variant == -3 ) ||
         ( variant == -4 ) ||
         ( variant == -5 ) ||
         //( variant == 0 ) ||
         //( variant == -1 ) ||
         //( variant == -2 ) ||
         //( variant == 1 ) ||
         //( variant == 2 ) ||
         //( variant == 3 ) ||
         //( variant == 4 ) ||
         FALSE
       )
    {
        *gflops      = 0.0;
        *dtime       = 0.0;
        *diff1       = 0.0;
        *diff2       = 0.0;
        *dtime_tred  = 0.0;
        *dtime_tevd  = 0.0;
        *dtime_appq  = 0.0;
        *gflops_tred = 0.0;
        *gflops_tevd = 0.0;
        *gflops_appq = 0.0;
        *k_perf      = 0;
        return;
    }

    /* Snapshot A; Z is a same-shaped output used by the Hevdr variants. */
    FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_backup );
    FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &Z );

    FLA_Copy_external( A, A_backup );

    for ( rep = 0; rep < n_repeats; rep++ )
    {
        FLA_Copy_external( A_backup, A );

        *dtime = FLA_Clock();

        switch ( variant )
        {
            /* Reference implementations (report k_perf = 0). */
            case -3:
                *k_perf = 0;
                REF_Hevd_lv( A, l, dtime_tred, dtime_tevd, dtime_appq );
                break;

            case -4:
                *k_perf = 0;
                REF_Hevdd_lv( A, l, dtime_tred, dtime_tevd, dtime_appq );
                break;

            case -5:
                *k_perf = 0;
                REF_Hevdr_lv( A, l, Z, dtime_tred, dtime_tevd, dtime_appq );
                break;

            case 0:
                *k_perf = 0;
                REF_Hevd_lv_components( A, l, dtime_tred, dtime_tevd, dtime_appq );
                break;

            case -1:
                *k_perf = 0;
                REF_Hevdd_lv_components( A, l, dtime_tred, dtime_tevd, dtime_appq );
                break;

            case -2:
                *k_perf = 0;
                REF_Hevdr_lv_components( A, l, Z, dtime_tred, dtime_tevd, dtime_appq );
                break;

            /* libflame variant 1. */
            case 1:
                *k_perf = FLA_Hevd_lv_var1_components( n_iter_max, A, l, k_accum, b_alg,
                                                       dtime_tred, dtime_tevd, dtime_appq );
                break;

            /* libflame variant 2. */
            case 2:
                *k_perf = FLA_Hevd_lv_var2_components( n_iter_max, A, l, k_accum, b_alg,
                                                       dtime_tred, dtime_tevd, dtime_appq );
                break;
        }

        *dtime = FLA_Clock() - *dtime;

        /* Keep the component times that belong to the fastest total. */
        if ( *dtime < best_total )
        {
            best_total = *dtime;
            best_tred  = *dtime_tred;
            best_tevd  = *dtime_tevd;
            best_appq  = *dtime_appq;
        }
    }

    *dtime      = best_total;
    *dtime_tred = best_tred;
    *dtime_tevd = best_tevd;
    *dtime_appq = best_appq;

    /* Accuracy checks on the result of the last repetition. */
    {
        FLA_Obj V, A_rev_evd, norm, eye;

        /* The Hevdr variants leave their output in Z rather than A. */
        if ( variant == -2 || variant == -5 ) FLA_Copy( Z, A );

        FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &V );
        FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_rev_evd );
        FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &eye );
        FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );

        /* A_rev_evd := tril( ( A * diag(l) ) * V' ) */
        FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, l, A );
        FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, A, V, FLA_ZERO, A_rev_evd );
        FLA_Triangularize( FLA_LOWER_TRIANGULAR, FLA_NONUNIT_DIAG, A_rev_evd );

        /* diff1 := || A_rev_evd - A_entry ||_F */
        FLA_Axpy( FLA_MINUS_ONE, A_backup, A_rev_evd );
        FLA_Norm_frob( A_rev_evd, norm );
        FLA_Obj_extract_real_scalar( norm, diff1 );

        /* diff2 := || V * V' - I ||_F  (A_rev_evd is reused as a copy of V) */
        FLA_Set_to_identity( eye );
        FLA_Copy( V, A_rev_evd );
        FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, V, A_rev_evd, FLA_MINUS_ONE, eye );
        FLA_Norm_frob( eye, norm );
        FLA_Obj_extract_real_scalar( norm, diff2 );

        FLA_Obj_free( &V );
        FLA_Obj_free( &A_rev_evd );
        FLA_Obj_free( &eye );
        FLA_Obj_free( &norm );
    }

    /* Flop model for the three stages. */
    k = 2.00;

    flops_tred = ( ( 4.0 / 3.0 ) * m * m * m );
    flops_tevd = ( 4.5 * k * m * m +
                   3.0 * k * m * m * m );

    if ( variant == -1 || variant == -2 ||
         variant == -4 || variant == -5 )
        flops_appq = ( 2.0 * m * m * m );
    else
        flops_appq = ( 4.0 / 3.0 * m * m * m );

    /* Complex datatypes weight the stages differently. */
    is_cplx   = FLA_Obj_is_complex( A );
    mult_tred = is_cplx ? 4.0 : 1.0;
    mult_tevd = is_cplx ? 2.0 : 1.0;
    mult_appq = is_cplx ? 4.0 : 1.0;

    *gflops = ( mult_tred * flops_tred +
                mult_tevd * flops_tevd +
                mult_appq * flops_appq ) / *dtime / 1e9;

    *gflops_tred = ( mult_tred * flops_tred ) / *dtime_tred / 1e9;
    *gflops_tevd = ( mult_tevd * flops_tevd ) / *dtime_tevd / 1e9;
    *gflops_appq = ( mult_appq * flops_appq ) / *dtime_appq / 1e9;

    /* Leave the caller's matrix as it was on entry. */
    FLA_Copy_external( A_backup, A );

    FLA_Obj_free( &A_backup );
    FLA_Obj_free( &Z );
}
int main( int argc, char** argv ) { FLA_Datatype datatype = TESTTYPE; FLA_Obj A, A_flame, A_lapack, C; int m; FLA_Error init_result; FLA_Obj TU, TV, U_flame, V_flame, d_flame, e_flame, B_flame; FLA_Obj tauq, taup, d_lapack, e_lapack, U_lapack, V_lapack, W, B_lapack; testtype *buff_tauq, *buff_taup, *buff_d_lapack, *buff_e_lapack, *buff_W, *buff_A_lapack, *buff_U_lapack, *buff_V_lapack; int lwork, info, is_flame; if ( argc == 3 ) { m = atoi(argv[1]); is_flame = atoi(argv[2]); } else { fprintf(stderr, " \n"); fprintf(stderr, "Usage: %s m is_flame\n", argv[0]); fprintf(stderr, " m : matrix length\n"); fprintf(stderr, " is_flame : 1 yes, 0 no\n"); fprintf(stderr, " \n"); return -1; } if ( m == 0 ) return 0; FLA_Init_safe( &init_result ); fprintf( stdout, "lapack2flame: %d x %d: \n", m, m); FLA_Obj_create( datatype, m, m, 0, 0, &A ); FLA_Random_matrix( A ); FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &A_flame ); FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &A_lapack ); FLA_Obj_create( datatype, m, m, 0, 0, &C ); FLA_Random_matrix( C ); if ( is_flame ) { fprintf( stdout, " flame executed\n"); FLA_Bidiag_UT_create_T( A_flame, &TU, &TV ); FLA_Bidiag_UT( A_flame, TU, TV ); FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A_flame, &U_flame ); FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A_flame, &V_flame ); FLA_Bidiag_UT_form_U( U_flame, TU, U_flame ); FLA_Bidiag_UT_form_V( V_flame, TV, V_flame ); FLA_Obj_create( datatype, m, 1, 0, 0, &d_flame ); FLA_Obj_create( datatype, m - 1, 1, 0, 0, &e_flame ); FLA_Bidiag_UT_extract_diagonals( A_flame, d_flame, e_flame ); FLA_Obj_create( datatype, m, m, 0, 0, &B_flame ); FLA_Set( FLA_ZERO, B_flame ); { FLA_Obj BTL, BTR, BBL, BBR; FLA_Part_2x2( B_flame, &BTL, &BTR, &BBL, &BBR, 1,1, FLA_BL ); FLA_Set_diagonal_matrix( d_flame, B_flame ); FLA_Set_diagonal_matrix( e_flame, BTR ); } if (1) { fprintf( stdout, " - FLAME ----------\n"); FLA_Obj_fshow( stdout, " - Given A - ", A, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - A - ", A_flame, "% 6.4e", 
"------"); FLA_Obj_fshow( stdout, " - U - ", U_flame, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - V - ", V_flame, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - d - ", d_flame, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - e - ", e_flame, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - B - ", B_flame, "% 6.4e", "------"); } } else { fprintf( stdout, " lapack executed\n"); FLA_Obj_create( datatype, m, 1, 0, 0, &tauq ); FLA_Obj_create( datatype, m, 1, 0, 0, &taup ); FLA_Obj_create( datatype, m, 1, 0, 0, &d_lapack ); FLA_Obj_create( datatype, m - 1, 1, 0, 0, &e_lapack ); buff_A_lapack = (testtype*)FLA_Obj_buffer_at_view( A_lapack ); buff_tauq = (testtype*)FLA_Obj_buffer_at_view( tauq ); buff_taup = (testtype*)FLA_Obj_buffer_at_view( taup ); buff_d_lapack = (testtype*)FLA_Obj_buffer_at_view( d_lapack ); buff_e_lapack = (testtype*)FLA_Obj_buffer_at_view( e_lapack ); lwork = 32*m; FLA_Obj_create( datatype, lwork, 1, 0, 0, &W ); buff_W = (testtype*)FLA_Obj_buffer_at_view( W ); sgebrd_( &m, &m, buff_A_lapack, &m, buff_d_lapack, buff_e_lapack, buff_tauq, buff_taup, buff_W, &lwork, &info ); FLA_Obj_create( datatype, m, m, 0, 0, &U_lapack ); FLA_Obj_create( datatype, m, m, 0, 0, &V_lapack ); FLA_Copy( A_lapack, U_lapack ); FLA_Copy( A_lapack, V_lapack ); buff_U_lapack = (testtype*)FLA_Obj_buffer_at_view( U_lapack ); buff_V_lapack = (testtype*)FLA_Obj_buffer_at_view( V_lapack ); sorgbr_( "Q", &m, &m, &m, buff_U_lapack, &m, buff_tauq, buff_W, &lwork, &info ); sorgbr_( "P", &m, &m, &m, buff_V_lapack, &m, buff_taup, buff_W, &lwork, &info ); FLA_Obj_create( datatype, m, m, 0, 0, &B_lapack ); FLA_Set( FLA_ZERO, B_lapack ); { FLA_Obj BTL, BTR, BBL, BBR; FLA_Part_2x2( B_lapack, &BTL, &BTR, &BBL, &BBR, 1,1, FLA_BL ); FLA_Set_diagonal_matrix( d_lapack, B_lapack ); FLA_Set_diagonal_matrix( e_lapack, BTR ); } FLA_Obj_free( &W ); if (1) { fprintf( stdout, " - LAPACK ----------\n"); FLA_Obj_fshow( stdout, " - Given A - ", A, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - A - ", 
A_lapack, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - U - ", U_lapack, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - V - ", V_lapack, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - d - ", d_lapack, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - e - ", e_lapack, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - B - ", B_lapack, "% 6.4e", "------"); } } { testtype dummy; int zero = 0, one = 1; FLA_Obj D_lapack; FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &D_lapack ); FLA_Set( FLA_ZERO, D_lapack ); if ( is_flame ) { buff_d_lapack = (testtype*)FLA_Obj_buffer_at_view( d_flame ); buff_e_lapack = (testtype*)FLA_Obj_buffer_at_view( e_flame ); buff_U_lapack = (testtype*)FLA_Obj_buffer_at_view( U_flame ); buff_V_lapack = (testtype*)FLA_Obj_buffer_at_view( V_flame ); } FLA_Obj_create( datatype, 4*m, 1, 0, 0, &W ); buff_W = (testtype*)FLA_Obj_buffer_at_view( W ); sbdsqr_( "U", &m, &m, &m, &zero, buff_d_lapack, buff_e_lapack, buff_V_lapack, &m, buff_U_lapack, &m, &dummy, &one, buff_W, &info ); FLA_Obj_free( &W ); if (info != 0) printf( " Error info = %d\n", info ); if ( is_flame ) FLA_Set_diagonal_matrix( d_flame, D_lapack ); else FLA_Set_diagonal_matrix( d_lapack, D_lapack ); if ( is_flame ) { fprintf( stdout, " - FLAME ----------\n"); FLA_Obj_fshow( stdout, " - U - ", U_flame, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - V - ", V_flame, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - d - ", d_flame, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - e - ", e_flame, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - D - ", D_lapack, "% 6.4e", "------"); } else { fprintf( stdout, " - LAPACK ----------\n"); FLA_Obj_fshow( stdout, " - U - ", U_lapack, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - V - ", V_lapack, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - d - ", d_lapack, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - e - ", e_lapack, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - D - ", D_lapack, "% 6.4e", "------"); } FLA_Obj_free( &D_lapack ); } if ( is_flame ) { 
FLA_Obj_free( &TU ); FLA_Obj_free( &TV ); FLA_Obj_free( &U_flame ); FLA_Obj_free( &V_flame ); FLA_Obj_free( &d_flame ); FLA_Obj_free( &e_flame ); FLA_Obj_free( &B_flame ); } else { FLA_Obj_free( &tauq ); FLA_Obj_free( &taup ); FLA_Obj_free( &d_lapack ); FLA_Obj_free( &e_lapack ); FLA_Obj_free( &U_lapack ); FLA_Obj_free( &V_lapack ); FLA_Obj_free( &B_lapack ); } FLA_Obj_free( &A ); FLA_Obj_free( &A_flame ); FLA_Obj_free( &A_lapack ); FLA_Obj_free( &C ); FLA_Finalize_safe( init_result ); }
/*
 * libfla_test_apqut_experiment
 *
 * Times the Apply_Q_UT operation selected by `impl` and the parameter
 * combination pc_str[pci] (side, trans, direct, storev).
 *
 * A random m x n matrix is factored once (QR_UT for column-wise storage,
 * LQ_UT otherwise); the timed operation then applies the resulting Q to an
 * identity matrix B, n_repeats times, with B reset before each run.
 *
 * perf     : out, performance of the fastest repetition.
 * residual : out, 1-norm of B * B' - I computed from the final B.
 */
void libfla_test_apqut_experiment( test_params_t params,
                                   unsigned int  var,
                                   char*         sc_str,
                                   FLA_Datatype  datatype,
                                   unsigned int  p_cur,
                                   unsigned int  pci,
                                   unsigned int  n_repeats,
                                   signed int    impl,
                                   double*       perf,
                                   double*       residual )
{
    const int    is_hier  = ( impl == FLA_TEST_HIER_FRONT_END );
    dim_t        blk_hier = params.b_flash;
    dim_t        blk_flat = params.b_alg_flat;
    double       best     = 1e9;
    double       elapsed;
    unsigned int rep;
    unsigned int m, n;
    unsigned int k;
    signed int   m_input;
    signed int   n_input;
    FLA_Side     side;
    FLA_Trans    trans;
    FLA_Direct   direct;
    FLA_Store    storev;
    FLA_Obj      A, T, W, B, eye, norm;
    FLA_Obj      B_save;
    FLA_Obj      A_test, T_test, W_test, B_test;

    /* Decode the four parameter characters of this combination. */
    FLA_Param_map_char_to_flame_side(   &pc_str[pci][0], &side );
    FLA_Param_map_char_to_flame_trans(  &pc_str[pci][1], &trans );
    FLA_Param_map_char_to_flame_direct( &pc_str[pci][2], &direct );
    FLA_Param_map_char_to_flame_storev( &pc_str[pci][3], &storev );

    /* Shape selection per storage direction.  The intent is to exercise
       rectangular matrices (m > n for column-wise storage via QR, m < n for
       row-wise storage via LQ).  NOTE: both branches currently assign
       -1/-1, which yields m == n == p_cur. */
    if ( storev == FLA_COLUMNWISE )
    {
        m_input = -1;
        n_input = -1;
    }
    else /* storev == FLA_ROWWISE */
    {
        m_input = -1;
        n_input = -1;
    }

    /* A negative *_input scales p_cur by its magnitude. */
    m = ( m_input < 0 ) ? p_cur * abs(m_input) : p_cur;
    n = ( n_input < 0 ) ? p_cur * abs(n_input) : p_cur;
    k = min( m, n );

    /* Matrix to factor and its T factor. */
    libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[0], m, n, &A );
    libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], blk_flat, k, &T );

    /* B is square: m x m for column-wise storage, n x n otherwise. */
    if ( storev == FLA_COLUMNWISE )
        libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[2], m, m, &B );
    else
        libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[2], n, n, &B );

    FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, B, &eye );

    FLA_Apply_Q_UT_create_workspace( T, B, &W );

    /* Real scalar that receives the residual norm. */
    FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );

    /* Random A; B and eye both start out as the identity. */
    FLA_Random_matrix( A );
    FLA_Set_to_identity( B );
    FLA_Set_to_identity( eye );

    /* Pristine copy of B used to reset it before each repetition. */
    FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, B, &B_save );

    /* The FLASH front-end works on hierarchical objects; everything else
       operates directly on the flat ones. */
    if ( is_hier )
    {
        if ( storev == FLA_COLUMNWISE )
            FLASH_QR_UT_create_hier_matrices( A, 1, &blk_hier, &A_test, &T_test );
        else /* storev == FLA_ROWWISE */
            FLASH_LQ_UT_create_hier_matrices( A, 1, &blk_hier, &A_test, &T_test );
        FLASH_Obj_create_hier_copy_of_flat( B, 1, &blk_hier, &B_test );
        FLASH_Apply_Q_UT_create_workspace( T_test, B_test, &W_test );
    }
    else /* flat front-end */
    {
        A_test = A;
        T_test = T;
        W_test = W;
        B_test = B;
    }

    /* Householder factorization that defines the Q to be applied. */
    if ( is_hier )
    {
        if ( storev == FLA_COLUMNWISE ) FLASH_QR_UT( A_test, T_test );
        else                            FLASH_LQ_UT( A_test, T_test );
    }
    else /* flat front-end */
    {
        if ( storev == FLA_COLUMNWISE ) FLA_QR_UT( A_test, T_test );
        else                            FLA_LQ_UT( A_test, T_test );
    }

    /* Timed repetitions; B is reset before each one. */
    for ( rep = 0; rep < n_repeats; ++rep )
    {
        if ( is_hier )
            FLASH_Obj_hierarchify( B_save, B_test );
        else
            FLA_Copy_external( B_save, B_test );

        elapsed = FLA_Clock();

        libfla_test_apqut_impl( impl, side, trans, direct, storev,
                                A_test, T_test, W_test, B_test );

        elapsed = FLA_Clock() - elapsed;
        best    = min( best, elapsed );
    }

    /* eye := B * B' - I, which should be (near) zero.  The hierarchical
       result has to be flattened into B first. */
    if ( is_hier )
        FLASH_Obj_flatten( B_test, B );

    FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
                       FLA_ONE, B, B, FLA_MINUS_ONE, eye );

    /* Hierarchical objects are no longer needed. */
    if ( is_hier )
    {
        FLASH_Obj_free( &A_test );
        FLASH_Obj_free( &T_test );
        FLASH_Obj_free( &W_test );
        FLASH_Obj_free( &B_test );
    }

    /* Residual: 1-norm of eye. */
    FLA_Norm1( eye, norm );
    FLA_Obj_extract_real_scalar( norm, residual );

    /* Performance of the fastest repetition. */
    *perf = (  4.0 * m * k * n -
               2.0 * k * k * n ) /
            best / FLOPS_PER_UNIT_PERF;
    if ( FLA_Obj_is_complex( A ) ) *perf *= 4.0;

    /* Supporting flat objects. */
    FLA_Obj_free( &B_save );

    /* Flat test matrices. */
    FLA_Obj_free( &A );
    FLA_Obj_free( &T );
    FLA_Obj_free( &W );
    FLA_Obj_free( &B );
    FLA_Obj_free( &eye );
    FLA_Obj_free( &norm );
}
/*
 * Run one timed experiment for the Eig_gest test operation.
 *
 * The routine builds m-by-m test matrices A, Y and B, times
 * libfla_test_eig_gest_impl() n_repeats times (restoring A and B from saved
 * copies before each run), and keeps the smallest elapsed time. Afterwards
 * it applies triangular solves (inv == FLA_NO_INVERSE) or triangular
 * multiplies (otherwise) with B to the result and reports the 1-norm of the
 * difference from the saved A.
 *
 * params    - test parameters; only b_flash and b_alg_flat are read here.
 * var       - variant number, forwarded to the control-tree creation.
 * sc_str    - storage characters for A (index 0), Y (index 1), B (index 2).
 * datatype  - datatype of the matrices that are created.
 * p_cur     - current problem size.
 * pci       - index into pc_str selecting the inv/uplo parameter combo.
 * n_repeats - number of timed repetitions.
 * impl      - which implementation/front-end is being tested.
 * perf      - output: performance figure of the fastest repetition.
 * residual  - output: 1-norm of (recovered A - original A).
 */
void libfla_test_eig_gest_experiment( test_params_t params,
                                      unsigned int  var,
                                      char*         sc_str,
                                      FLA_Datatype  datatype,
                                      unsigned int  p_cur,
                                      unsigned int  pci,
                                      unsigned int  n_repeats,
                                      signed int    impl,
                                      double*       perf,
                                      double*       residual )
{
	const signed int m_input    = -1;
	const int        is_hier    = ( impl == FLA_TEST_HIER_FRONT_END );
	const int        is_variant = ( impl == FLA_TEST_FLAT_UNB_VAR ||
	                                impl == FLA_TEST_FLAT_OPT_VAR ||
	                                impl == FLA_TEST_FLAT_BLK_VAR );

	dim_t        b_flash    = params.b_flash;
	dim_t        b_alg_flat = params.b_alg_flat;
	double       best_time  = 1e9;
	double       elapsed;
	unsigned int rep;
	unsigned int m;

	FLA_Uplo     inv;
	FLA_Uplo     uplo;
	FLA_Trans    trans_left, trans_right;

	FLA_Obj      A, B, Y, norm;
	FLA_Obj      A_save, B_save;
	FLA_Obj      A_test, B_test, Y_test;

	// A negative m_input scales the problem size; otherwise use it as-is.
	m = ( m_input < 0 ) ? p_cur / abs( m_input ) : p_cur;

	// Map the parameter characters of this combo to libflame constants.
	FLA_Param_map_char_to_flame_inv( &pc_str[pci][0], &inv );
	FLA_Param_map_char_to_flame_uplo( &pc_str[pci][1], &uplo );

	// Variant 3 of the flat variants is not run for the no-inverse case;
	// report zeros and bail out before anything is allocated.
	if ( inv == FLA_NO_INVERSE && var == 3 && is_variant )
	{
		*perf     = 0.0;
		*residual = 0.0;
		return;
	}

	// Allocate the operands.
	libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[0], m, m, &A );
	libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], m, m, &Y );
	libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[2], m, m, &B );

	// Fill A with a scaled, Hermitianized random SPD matrix.
	FLA_Random_spd_matrix( uplo, A );
	FLA_Scalr( uplo, FLA_TWO, A );
	FLA_Hermitianize( uplo, A );

	// Fill B with the Cholesky factor of a scaled random SPD matrix.
	FLA_Random_spd_matrix( uplo, B );
	FLA_Scalr( uplo, FLA_TWO, B );
	FLA_Chol( uplo, B );

	// Keep pristine copies so every repetition starts from the same data.
	FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &A_save );
	FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, B, &B_save );

	// Real 1x1 object that will receive the norm.
	FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );

	// The FLASH front-end works on hierarchical copies; everything else
	// operates directly on the flat objects.
	if ( is_hier )
	{
		FLASH_Obj_create_hier_copy_of_flat( A, 1, &b_flash, &A_test );
		FLASH_Obj_create_hier_copy_of_flat( Y, 1, &b_flash, &Y_test );
		FLASH_Obj_create_hier_copy_of_flat( B, 1, &b_flash, &B_test );
	}
	else
	{
		A_test = A;
		Y_test = Y;
		B_test = B;
	}

	// The individual variants need a control tree.
	if ( is_variant )
		libfla_test_eig_gest_cntl_create( var, b_alg_flat );

	// Timed repetitions; remember the fastest one.
	for ( rep = 0; rep < n_repeats; ++rep )
	{
		if ( is_hier )
		{
			FLASH_Obj_hierarchify( A_save, A_test );
			FLASH_Obj_hierarchify( B_save, B_test );
		}
		else
		{
			FLA_Copy_external( A_save, A_test );
			FLA_Copy_external( B_save, B_test );
		}

		elapsed = FLA_Clock();

		libfla_test_eig_gest_impl( impl, inv, uplo, A_test, Y_test, B_test );

		elapsed = FLA_Clock() - elapsed;
		best_time = min( best_time, elapsed );
	}

	// Choose on which side B is applied conjugate-transposed when the
	// operation is undone below.
	if ( ( inv == FLA_NO_INVERSE && uplo == FLA_LOWER_TRIANGULAR ) ||
	     ( inv == FLA_INVERSE    && uplo == FLA_UPPER_TRIANGULAR ) )
	{
		trans_left  = FLA_CONJ_TRANSPOSE;
		trans_right = FLA_NO_TRANSPOSE;
	}
	else
	{
		trans_left  = FLA_NO_TRANSPOSE;
		trans_right = FLA_CONJ_TRANSPOSE;
	}

	// Undo the operation so the result can be compared against A_save.
	if ( is_hier )
	{
		FLASH_Hermitianize( uplo, A_test );

		if ( inv == FLA_NO_INVERSE )
		{
			FLASH_Trsm( FLA_LEFT, uplo, trans_left, FLA_NONUNIT_DIAG,
			            FLA_ONE, B_test, A_test );
			FLASH_Trsm( FLA_RIGHT, uplo, trans_right, FLA_NONUNIT_DIAG,
			            FLA_ONE, B_test, A_test );
		}
		else // inv == FLA_INVERSE
		{
			FLASH_Trmm( FLA_LEFT, uplo, trans_left, FLA_NONUNIT_DIAG,
			            FLA_ONE, B_test, A_test );
			FLASH_Trmm( FLA_RIGHT, uplo, trans_right, FLA_NONUNIT_DIAG,
			            FLA_ONE, B_test, A_test );
		}

		// Bring the hierarchical result back into the flat A.
		FLASH_Obj_flatten( A_test, A );
	}
	else
	{
		FLA_Hermitianize( uplo, A_test );

		if ( inv == FLA_NO_INVERSE )
		{
			FLA_Trsm( FLA_LEFT, uplo, trans_left, FLA_NONUNIT_DIAG,
			          FLA_ONE, B_test, A_test );
			FLA_Trsm( FLA_RIGHT, uplo, trans_right, FLA_NONUNIT_DIAG,
			          FLA_ONE, B_test, A_test );
		}
		else // inv == FLA_INVERSE
		{
			FLA_Trmm( FLA_LEFT, uplo, trans_left, FLA_NONUNIT_DIAG,
			          FLA_ONE, B_test, A_test );
			FLA_Trmm( FLA_RIGHT, uplo, trans_right, FLA_NONUNIT_DIAG,
			          FLA_ONE, B_test, A_test );
		}
	}

	// Release the hierarchical copies, if any.
	if ( is_hier )
	{
		FLASH_Obj_free( &A_test );
		FLASH_Obj_free( &Y_test );
		FLASH_Obj_free( &B_test );
	}

	// Release the control tree, if any.
	if ( is_variant )
		libfla_test_eig_gest_cntl_free();

	// Performance of the fastest repetition (complex counts four-fold).
	*perf = 1.0 * m * m * m / best_time / FLOPS_PER_UNIT_PERF;
	if ( FLA_Obj_is_complex( A ) )
		*perf *= 4.0;

	// Residual: 1-norm of (recovered A - original A).
	FLA_Axpy_external( FLA_MINUS_ONE, A_save, A );
	FLA_Norm1( A, norm );
	FLA_Obj_extract_real_scalar( norm, residual );

	// Release supporting objects.
	FLA_Obj_free( &norm );
	FLA_Obj_free( &A_save );
	FLA_Obj_free( &B_save );

	// Release the flat test matrices.
	FLA_Obj_free( &A );
	FLA_Obj_free( &Y );
	FLA_Obj_free( &B );
}
/*
 * Time the hierarchical incremental-pivoting LU factorization and check it
 * by solving a linear system.
 *
 * pivot_combo - only the value 0 triggers a factorization.
 * type        - FLA_ALG_FRONT_OPT0 or FLA_ALG_FRONT_OPT1; any other value
 *               prints "trouble".
 * nrepeats    - number of timed repetitions.
 * m, n        - dimensions used in the flop-count formula.
 * nb_alg      - algorithmic block size passed to the hierarchy creation.
 * nb_flash    - FLASH block size used for the hierarchical objects.
 * A           - flat matrix to factor (read when building the hierarchy
 *               and again in the residual computation).
 * p           - not referenced by this routine.
 * x           - receives the flattened solution.
 * b           - right-hand side; overwritten while computing the residual
 *               and restored from a saved copy before returning.
 * norm        - scalar object that receives the 2-norm of A*x - b.
 * dtime       - output: smallest elapsed time over all repetitions.
 * diff        - output: real value extracted from norm.
 * gflops      - output: performance figure derived from the best time.
 */
void time_LU( int pivot_combo, int type, int nrepeats, int m, int n,
              dim_t nb_alg, dim_t nb_flash,
              FLA_Obj A, FLA_Obj p, FLA_Obj x, FLA_Obj b, FLA_Obj norm,
              double *dtime, double *diff, double *gflops )
{
  double  best = 1.0e9;
  double  started, elapsed;
  int     rep;

  FLA_Obj AH, pH, bH, LH;
  FLA_Obj AH_save, b_save;

  /* Build the hierarchical operands and keep copies of the inputs. */
  FLASH_LU_incpiv_create_hier_matrices( A, 1, &nb_flash, nb_alg,
                                        &AH, &pH, &LH );
  FLASH_Obj_create_hier_copy_of_flat( b, 1, &nb_flash, &bH );

  FLASH_Obj_create_copy_of( FLA_NO_TRANSPOSE, AH, &AH_save );
  FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, b, &b_save );

  for ( rep = 0; rep < nrepeats; ++rep )
  {
    /* Every repetition starts from the unfactored matrix. */
    FLASH_Copy( AH_save, AH );

    started = FLA_Clock();

    if ( pivot_combo == 0 )
    {
      if ( type == FLA_ALG_FRONT_OPT0 )
        FLASH_LU_incpiv_noopt( AH, pH, LH );
      else if ( type == FLA_ALG_FRONT_OPT1 )
        FLASH_LU_incpiv_opt1( AH, pH, LH );
      else
        printf("trouble\n");
    }

    elapsed = FLA_Clock() - started;

    best = min( elapsed, best );
  }

  /* Solve with the factors and measure || A x - b ||_2. */
  FLASH_FS_incpiv( AH, pH, LH, bH );
  FLASH_Trsv( FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG,
              AH, bH );

  FLASH_Obj_flatten( bH, x );

  FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_ONE, A, x, FLA_MINUS_ONE, b );
  FLA_Nrm2_external( b, norm );
  FLA_Obj_extract_real_scalar( norm, diff );

  /* Performance of the fastest repetition (complex counts four-fold). */
  *gflops = 2.0 / 3.0 * m * m * n / best / 1e9;
  if ( FLA_Obj_is_complex( A ) )
    *gflops *= 4.0;

  *dtime = best;

  /* The residual computation clobbered b; put the original back. */
  FLA_Copy( b_save, b );

  FLASH_Obj_free( &AH );
  FLASH_Obj_free( &pH );
  FLASH_Obj_free( &bH );
  FLASH_Obj_free( &LH );

  FLA_Obj_free( &b_save );
  FLASH_Obj_free( &AH_save );
}
/*
 * Run one timed experiment for the Hess_UT test operation.
 *
 * An m-by-m random matrix A is reduced n_repeats times by
 * libfla_test_hessut_impl() (A is restored from a saved copy before each
 * run) and the smallest elapsed time is kept. The result is then checked
 * by forming Qh * A_orig * Qh' from the stored Householder data and
 * comparing it element-wise against the reduced A.
 *
 * params    - test parameters; only b_alg_flat is read here.
 * var       - variant number, forwarded to the control-tree creation.
 * sc_str    - storage characters: index 0 for A, index 1 for T and W.
 * datatype  - datatype of the matrices that are created.
 * p_cur     - current problem size.
 * pci       - not referenced by this routine.
 * n_repeats - number of timed repetitions.
 * impl      - which implementation is being tested.
 * perf      - output: performance figure of the fastest repetition.
 * residual  - output: value returned by FLA_Max_elemwise_diff().
 */
void libfla_test_hessut_experiment( test_params_t params,
                                    unsigned int  var,
                                    char*         sc_str,
                                    FLA_Datatype  datatype,
                                    unsigned int  p_cur,
                                    unsigned int  pci,
                                    unsigned int  n_repeats,
                                    signed int    impl,
                                    double*       perf,
                                    double*       residual )
{
	const signed int m_input    = -1;
	const int        is_variant = ( impl == FLA_TEST_FLAT_UNB_VAR ||
	                                impl == FLA_TEST_FLAT_OPT_VAR ||
	                                impl == FLA_TEST_FLAT_BLK_VAR );

	dim_t        b_alg_flat = params.b_alg_flat;
	dim_t        t_rows;
	double       best_time  = 1e9;
	double       elapsed;
	unsigned int rep;
	unsigned int m;

	FLA_Obj      A, T, W, Qh, AQ, QhAQ, norm;
	FLA_Obj      AT, AB;
	FLA_Obj      QhT, QhB;
	FLA_Obj      A_save;

	// A negative m_input scales the problem size; otherwise use it as-is.
	m = ( m_input < 0 ) ? p_cur * abs( m_input ) : p_cur;

	// Allocate A.
	libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[0], m, m, &A );

	// T and W have b_alg_flat rows for the front-end and the blocked
	// variant, and m rows for everything else.
	if ( impl == FLA_TEST_FLAT_FRONT_END || impl == FLA_TEST_FLAT_BLK_VAR )
		t_rows = b_alg_flat;
	else
		t_rows = m;

	libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], t_rows, m, &T );
	libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], t_rows, m, &W );

	// Random input, plus a pristine copy for the repetitions and the check.
	FLA_Random_matrix( A );
	FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &A_save );

	// Scratch matrices for the result check, all shaped like A.
	FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &Qh );
	FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &AQ );
	FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &QhAQ );

	// Real 1x1 scalar object.
	FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );

	// The individual variants need a control tree.
	if ( is_variant )
		libfla_test_hessut_cntl_create( var, b_alg_flat );

	// Timed repetitions; remember the fastest one.
	for ( rep = 0; rep < n_repeats; ++rep )
	{
		FLA_Copy_external( A_save, A );

		elapsed = FLA_Clock();

		libfla_test_hessut_impl( impl, A, T );

		elapsed = FLA_Clock() - elapsed;
		best_time = min( best_time, elapsed );
	}

	// Release the control tree, if any.
	if ( is_variant )
		libfla_test_hessut_cntl_free();

	// Performance of the fastest repetition (complex counts four-fold).
	*perf = ( 10.0 / 3.0 * m * m * m ) / best_time / FLOPS_PER_UNIT_PERF;
	if ( FLA_Obj_is_complex( A ) )
		*perf *= 4.0;

	// Check the result by computing R - Q' A_orig Q.
	// Start from the identity and apply the transformations to all rows
	// of Qh below the first one.
	FLA_Set_to_identity( Qh );
	FLA_Part_2x1( Qh,   &QhT,
	                    &QhB,   1, FLA_TOP );
	FLA_Part_2x1( A,    &AT,
	                    &AB,    1, FLA_TOP );
	FLA_Apply_Q_UT( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_FORWARD, FLA_COLUMNWISE,
	                AB, T, W, QhB );

	// AQ := A_orig * Qh', then QhAQ := Qh * AQ.
	FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
	          FLA_ONE, A_save, Qh, FLA_ZERO, AQ );
	FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
	          FLA_ONE, Qh, AQ, FLA_ZERO, QhAQ );

	// Clear the part of AB outside its upper triangle before comparing.
	FLA_Triangularize( FLA_UPPER_TRIANGULAR, FLA_NONUNIT_DIAG, AB );

	*residual = FLA_Max_elemwise_diff( A, QhAQ );

	// Release supporting objects.
	FLA_Obj_free( &W );
	FLA_Obj_free( &Qh );
	FLA_Obj_free( &AQ );
	FLA_Obj_free( &QhAQ );
	FLA_Obj_free( &norm );
	FLA_Obj_free( &A_save );

	// Release the flat test matrices.
	FLA_Obj_free( &A );
	FLA_Obj_free( &T );
}
int main( int argc, char** argv ) { FLA_Datatype datatype = TESTTYPE; FLA_Datatype realtype = REALTYPE; FLA_Obj A, TU, TV, A_copy, A_recovered, U, V, Vb, B, Be, d, e, DU, DV; FLA_Obj ATL, ATR, ABL, ABR, Ae; FLA_Uplo uplo; dim_t m, n, min_m_n; FLA_Error init_result; double residual_A = 0.0; if ( argc == 3 ) { m = atoi(argv[1]); n = atoi(argv[2]); min_m_n = min(m,n); } else { fprintf(stderr, " \n"); fprintf(stderr, "Usage: %s m n\n", argv[0]); fprintf(stderr, " m : matrix length\n"); fprintf(stderr, " n : matrix width\n"); fprintf(stderr, " \n"); return -1; } if ( m == 0 || n == 0 ) return 0; FLA_Init_safe( &init_result ); // FLAME Bidiag setup FLA_Obj_create( datatype, m, n, 0, 0, &A ); FLA_Bidiag_UT_create_T( A, &TU, &TV ); // Rand A and create A_copy. FLA_Random_matrix( A ); { scomplex *buff_A = FLA_Obj_buffer_at_view( A ); buff_A[0].real = 4.4011e-01; buff_A[0].imag = -4.0150e-09; buff_A[2].real = -2.2385e-01; buff_A[2].imag = -1.5546e-01; buff_A[4].real = -6.3461e-02; buff_A[4].imag = 2.7892e-01; buff_A[6].real = -1.3197e-01; buff_A[6].imag = 5.0888e-01; buff_A[1].real = 3.3352e-01; buff_A[1].imag = -6.6346e-02; buff_A[3].real = -1.9307e-01; buff_A[3].imag = -8.4066e-02; buff_A[5].real = -6.0446e-03; buff_A[5].imag = 2.2094e-01; buff_A[7].real = -2.3299e-02; buff_A[7].imag = 4.0553e-01; } //FLA_Set_to_identity( A ); //FLA_Scal( FLA_MINUS_ONE, A ); if ( m >= n ) { uplo = FLA_UPPER_TRIANGULAR; FLA_Part_2x2( A, &ATL, &ATR, &ABL, &ABR, min_m_n - 1, 1, FLA_TL ); Ae = ATR; } else { uplo = FLA_LOWER_TRIANGULAR; FLA_Part_2x2( A, &ATL, &ATR, &ABL, &ABR, 1, min_m_n - 1, FLA_TL ); Ae = ABL; } FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &A_copy ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_recovered ); // Bidiag test { FLA_Obj norm; FLA_Bool apply_scale; FLA_Obj_create( realtype, 1,1, 0,0, &norm ); FLA_Max_abs_value( A, norm ); apply_scale = FLA_Obj_gt( norm, FLA_OVERFLOW_SQUARE_THRES ); if ( apply_scale ) FLA_Scal( FLA_SAFE_MIN, A ); FLA_Bidiag_UT( A, TU, TV ); if ( 
apply_scale ) FLA_Bidiag_UT_scale_diagonals( FLA_SAFE_INV_MIN, A ); FLA_Obj_free( &norm ); } // Orthonomal basis U, V. FLA_Obj_create( datatype, m, min_m_n, 0, 0, &U ); FLA_Set( FLA_ZERO, U ); FLA_Obj_create( datatype, min_m_n, n, 0, 0, &V ); FLA_Set( FLA_ZERO, V ); FLA_Bidiag_UT_form_U_ext( uplo, A, TU, FLA_NO_TRANSPOSE, U ); FLA_Bidiag_UT_form_V_ext( uplo, A, TV, FLA_CONJ_TRANSPOSE, V ); if ( FLA_Obj_is_complex( A ) ){ FLA_Obj rL, rR; FLA_Obj_create( datatype, min_m_n, 1, 0, 0, &rL ); FLA_Obj_create( datatype, min_m_n, 1, 0, 0, &rR ); FLA_Obj_fshow( stdout, " - Factor no realified - ", A, "% 6.4e", "------"); FLA_Bidiag_UT_realify( A, rL, rR ); FLA_Obj_fshow( stdout, " - Factor realified - ", A, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - rL - ", rL, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - rR - ", rR, "% 6.4e", "------"); FLA_Apply_diag_matrix( FLA_RIGHT, FLA_CONJUGATE, rL, U ); FLA_Apply_diag_matrix( FLA_LEFT, FLA_CONJUGATE, rR, V ); FLA_Obj_free( &rL ); FLA_Obj_free( &rR ); } // U^H U FLA_Obj_create( datatype, min_m_n, min_m_n, 0, 0, &DU ); FLA_Gemm_external( FLA_CONJ_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, U, U, FLA_ZERO, DU ); // V^H V FLA_Obj_create( datatype, min_m_n, min_m_n, 0, 0, &DV ); FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, V, V, FLA_ZERO, DV ); // Recover the matrix FLA_Obj_create( datatype, min_m_n, min_m_n, 0, 0, &B ); FLA_Set( FLA_ZERO, B ); // Set B FLA_Obj_create( datatype, min_m_n, 1, 0, 0, &d ); FLA_Set_diagonal_vector( A, d ); FLA_Set_diagonal_matrix( d, B ); FLA_Obj_free( &d ); if ( min_m_n > 1 ) { FLA_Obj_create( datatype, min_m_n - 1 , 1, 0, 0, &e ); FLA_Set_diagonal_vector( Ae, e ); if ( uplo == FLA_UPPER_TRIANGULAR ) { FLA_Part_2x2( B, &ATL, &ATR, &ABL, &ABR, min_m_n - 1, 1, FLA_TL ); Be = ATR; } else { FLA_Part_2x2( B, &ATL, &ATR, &ABL, &ABR, 1, min_m_n - 1, FLA_TL ); Be = ABL; } FLA_Set_diagonal_matrix( e, Be ); FLA_Obj_free( &e ); } // Vb := B (V^H) FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, V, &Vb ); 
FLA_Trmm_external( FLA_LEFT, uplo, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_ONE, B, Vb ); // A := U Vb FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, U, Vb, FLA_ZERO, A_recovered ); residual_A = FLA_Max_elemwise_diff( A_copy, A_recovered ); if (1) { FLA_Obj_fshow( stdout, " - Given - ", A_copy, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - Factor - ", A, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - TU - ", TU, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - TV - ", TV, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - B - ", B, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - U - ", U, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - V - ", V, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - Vb - ", Vb, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - U'U - ", DU, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - VV' - ", DV, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - Recovered A - ", A_recovered, "% 6.4e", "------"); fprintf( stdout, "lapack2flame: %lu x %lu: ", m, n); fprintf( stdout, "recovery A = %12.10e\n\n", residual_A ) ; } FLA_Obj_free( &A ); FLA_Obj_free( &TU ); FLA_Obj_free( &TV ); FLA_Obj_free( &B ); FLA_Obj_free( &U ); FLA_Obj_free( &V ); FLA_Obj_free( &Vb ); FLA_Obj_free( &DU ); FLA_Obj_free( &DV ); FLA_Obj_free( &A_copy ); FLA_Obj_free( &A_recovered ); FLA_Finalize_safe( init_result ); }