FLA_Error FLA_Svd_uv_var2_components( dim_t n_iter_max, dim_t k_accum, dim_t b_alg, FLA_Obj A, FLA_Obj s, FLA_Obj U, FLA_Obj V, double* dtime_bred, double* dtime_bsvd, double* dtime_appq, double* dtime_qrfa, double* dtime_gemm ) { FLA_Error r_val = FLA_SUCCESS; FLA_Datatype dt; FLA_Datatype dt_real; FLA_Datatype dt_comp; FLA_Obj T, S, rL, rR, d, e, G, H, RG, RH, W; dim_t m_A, n_A; dim_t min_m_n; dim_t n_GH; double crossover_ratio = 17.0 / 9.0; double dtime_temp; n_GH = k_accum; m_A = FLA_Obj_length( A ); n_A = FLA_Obj_width( A ); min_m_n = FLA_Obj_min_dim( A ); dt = FLA_Obj_datatype( A ); dt_real = FLA_Obj_datatype_proj_to_real( A ); dt_comp = FLA_Obj_datatype_proj_to_complex( A ); // If the matrix is a scalar, then the SVD is easy. if ( min_m_n == 1 ) { FLA_Copy( A, s ); FLA_Set_to_identity( U ); FLA_Set_to_identity( V ); return FLA_SUCCESS; } // Create matrices to hold block Householder transformations. FLA_Bidiag_UT_create_T( A, &T, &S ); // Create vectors to hold the realifying scalars. FLA_Obj_create( dt, min_m_n, 1, 0, 0, &rL ); FLA_Obj_create( dt, min_m_n, 1, 0, 0, &rR ); // Create vectors to hold the diagonal and sub-diagonal. FLA_Obj_create( dt_real, min_m_n, 1, 0, 0, &d ); FLA_Obj_create( dt_real, min_m_n-1, 1, 0, 0, &e ); // Create matrices to hold the left and right Givens scalars. FLA_Obj_create( dt_comp, min_m_n-1, n_GH, 0, 0, &G ); FLA_Obj_create( dt_comp, min_m_n-1, n_GH, 0, 0, &H ); // Create matrices to hold the left and right Givens matrices. FLA_Obj_create( dt_real, min_m_n, min_m_n, 0, 0, &RG ); FLA_Obj_create( dt_real, min_m_n, min_m_n, 0, 0, &RH ); FLA_Obj_create( dt, m_A, n_A, 0, 0, &W ); if ( m_A >= n_A ) { if ( m_A < crossover_ratio * n_A ) { dtime_temp = FLA_Clock(); { // Reduce the matrix to bidiagonal form. // Apply scalars to rotate elements on the sub-diagonal to the real domain. // Extract the diagonal and sub-diagonal from A. FLA_Bidiag_UT( A, T, S ); FLA_Bidiag_UT_realify( A, rL, rR ); FLA_Bidiag_UT_extract_diagonals( A, d, e ); } *dtime_bred = FLA_Clock() - dtime_temp; dtime_temp = FLA_Clock(); { // Form U and V. FLA_Bidiag_UT_form_U( A, T, U ); FLA_Bidiag_UT_form_V( A, S, V ); } *dtime_appq = FLA_Clock() - dtime_temp; // Apply the realifying scalars in rL and rR to U and V, respectively. { FLA_Obj UL, UR; FLA_Obj VL, VR; FLA_Part_1x2( U, &UL, &UR, min_m_n, FLA_LEFT ); FLA_Part_1x2( V, &VL, &VR, min_m_n, FLA_LEFT ); FLA_Apply_diag_matrix( FLA_RIGHT, FLA_CONJUGATE, rL, UL ); FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, rR, VL ); } dtime_temp = FLA_Clock(); { // Perform a singular value decomposition on the bidiagonal matrix. r_val = FLA_Bsvd_v_opt_var2( n_iter_max, d, e, G, H, RG, RH, W, U, V, b_alg ); } *dtime_bsvd = FLA_Clock() - dtime_temp; } else // if ( crossover_ratio * n_A <= m_A ) { FLA_Obj TQ, R; FLA_Obj AT, AB; FLA_Obj UL, UR; //FLA_QR_UT_create_T( A, &TQ ); FLA_Obj_create( dt, 32, n_A, 0, 0, &TQ ); dtime_temp = FLA_Clock(); { // Perform a QR factorization on A and form Q in U. FLA_QR_UT( A, TQ ); } *dtime_qrfa = FLA_Clock() - dtime_temp; dtime_temp = FLA_Clock(); { FLA_QR_UT_form_Q( A, TQ, U ); } *dtime_appq = FLA_Clock() - dtime_temp; FLA_Obj_free( &TQ ); // Set the lower triangle of R to zero and then copy the upper // triangle of A to R. FLA_Part_2x1( A, &AT, &AB, n_A, FLA_TOP ); FLA_Obj_create( dt, n_A, n_A, 0, 0, &R ); FLA_Setr( FLA_LOWER_TRIANGULAR, FLA_ZERO, R ); FLA_Copyr( FLA_UPPER_TRIANGULAR, AT, R ); dtime_temp = FLA_Clock(); { // Reduce the matrix to bidiagonal form. // Apply scalars to rotate elements on the superdiagonal to the real domain. // Extract the diagonal and superdiagonal from A. FLA_Bidiag_UT( R, T, S ); FLA_Bidiag_UT_realify( R, rL, rR ); FLA_Bidiag_UT_extract_diagonals( R, d, e ); } *dtime_bred = FLA_Clock() - dtime_temp; dtime_temp = FLA_Clock(); { // Form V from right Householder vectors in upper triangle of R. FLA_Bidiag_UT_form_V( R, S, V ); // Form U in R. FLA_Bidiag_UT_form_U( R, T, R ); } *dtime_appq += FLA_Clock() - dtime_temp; // Apply the realifying scalars in rL and rR to U and V, respectively. FLA_Apply_diag_matrix( FLA_RIGHT, FLA_CONJUGATE, rL, R ); FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, rR, V ); dtime_temp = FLA_Clock(); { // Perform a singular value decomposition on the bidiagonal matrix. r_val = FLA_Bsvd_v_opt_var2( n_iter_max, d, e, G, H, RG, RH, W, R, V, b_alg ); } *dtime_bsvd = FLA_Clock() - dtime_temp; dtime_temp = FLA_Clock(); { // Multiply R into U, storing the result in A and then copying back // to U. FLA_Part_1x2( U, &UL, &UR, n_A, FLA_LEFT ); FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, UL, R, FLA_ZERO, A ); FLA_Copy( A, UL ); } *dtime_gemm = FLA_Clock() - dtime_temp; FLA_Obj_free( &R ); } } else // if ( m_A < n_A ) { FLA_Check_error_code( FLA_NOT_YET_IMPLEMENTED ); } // Copy the converged eigenvalues to the output vector. FLA_Copy( d, s ); // Sort the singular values and singular vectors in descending order. FLA_Sort_svd( FLA_BACKWARD, s, U, V ); FLA_Obj_free( &T ); FLA_Obj_free( &S ); FLA_Obj_free( &rL ); FLA_Obj_free( &rR ); FLA_Obj_free( &d ); FLA_Obj_free( &e ); FLA_Obj_free( &G ); FLA_Obj_free( &H ); FLA_Obj_free( &RG ); FLA_Obj_free( &RH ); FLA_Obj_free( &W ); return r_val; }
void time_Hevd_lv_components( int variant, int type, int n_repeats, int m, int n_iter_max, int k_accum, int b_alg, FLA_Obj A, FLA_Obj l, double* dtime, double* diff1, double* diff2, double* gflops, double* dtime_tred, double* gflops_tred, double* dtime_tevd, double* gflops_tevd, double* dtime_appq, double* gflops_appq, int* k_perf ) { int i; double k; double dtime_save = 1.0e9; double dtime_tred_save = 1.0e9; double dtime_tevd_save = 1.0e9; double dtime_appq_save = 1.0e9; double flops_tred; double flops_tevd; double flops_appq; double mult_tred; double mult_tevd; double mult_appq; FLA_Obj A_save, Z; if ( ( variant == -3 ) || ( variant == -4 ) || ( variant == -5 ) || //( variant == 0 ) || //( variant == -1 ) || //( variant == -2 ) || //( variant == 1 ) || //( variant == 2 ) || //( variant == 3 ) || //( variant == 4 ) || FALSE ) { *gflops = 0.0; *dtime = 0.0; *diff1 = 0.0; *diff2 = 0.0; *dtime_tred = 0.0; *dtime_tevd = 0.0; *dtime_appq = 0.0; *gflops_tred = 0.0; *gflops_tevd = 0.0; *gflops_appq = 0.0; *k_perf = 0; return; } FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_save ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &Z ); FLA_Copy_external( A, A_save ); for ( i = 0 ; i < n_repeats; i++ ){ FLA_Copy_external( A_save, A ); *dtime = FLA_Clock(); switch( variant ){ case -3: { *k_perf = 0; REF_Hevd_lv( A, l, dtime_tred, dtime_tevd, dtime_appq ); break; } case -4: { *k_perf = 0; REF_Hevdd_lv( A, l, dtime_tred, dtime_tevd, dtime_appq ); break; } case -5: { *k_perf = 0; REF_Hevdr_lv( A, l, Z, dtime_tred, dtime_tevd, dtime_appq ); break; } case 0: { *k_perf = 0; REF_Hevd_lv_components( A, l, dtime_tred, dtime_tevd, dtime_appq ); break; } case -1: { *k_perf = 0; REF_Hevdd_lv_components( A, l, dtime_tred, dtime_tevd, dtime_appq ); break; } case -2: { *k_perf = 0; REF_Hevdr_lv_components( A, l, Z, dtime_tred, dtime_tevd, dtime_appq ); break; } // Time variant 1 case 1: { *k_perf = FLA_Hevd_lv_var1_components( n_iter_max, A, l, k_accum, b_alg, dtime_tred, dtime_tevd, dtime_appq ); break; } // Time variant 2 case 2: { *k_perf = FLA_Hevd_lv_var2_components( n_iter_max, A, l, k_accum, b_alg, dtime_tred, dtime_tevd, dtime_appq ); break; } } *dtime = FLA_Clock() - *dtime; if ( *dtime < dtime_save ) { dtime_save = *dtime; dtime_tred_save = *dtime_tred; dtime_tevd_save = *dtime_tevd; dtime_appq_save = *dtime_appq; } } *dtime = dtime_save; *dtime_tred = dtime_tred_save; *dtime_tevd = dtime_tevd_save; *dtime_appq = dtime_appq_save; //if ( variant == -3 || variant == 0 ) //printf( "\ndtime is %9.3e\n", *dtime ); { FLA_Obj V, A_rev_evd, norm, eye; if ( variant == -2 || variant == -5 ) FLA_Copy( Z, A ); FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &V ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_rev_evd ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &eye ); FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm ); FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, l, A ); FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, A, V, FLA_ZERO, A_rev_evd ); FLA_Triangularize( FLA_LOWER_TRIANGULAR, FLA_NONUNIT_DIAG, A_rev_evd ); //FLA_Obj_show( "A_rev_evd", A_rev_evd, "%9.2e + %9.2e ", "" ); FLA_Axpy( FLA_MINUS_ONE, A_save, A_rev_evd ); FLA_Norm_frob( A_rev_evd, norm ); FLA_Obj_extract_real_scalar( norm, diff1 ); FLA_Set_to_identity( eye ); FLA_Copy( V, A_rev_evd ); FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, V, A_rev_evd, FLA_MINUS_ONE, eye ); FLA_Norm_frob( eye, norm ); FLA_Obj_extract_real_scalar( norm, diff2 ); FLA_Obj_free( &V ); FLA_Obj_free( &A_rev_evd ); FLA_Obj_free( &eye ); FLA_Obj_free( &norm ); } k = 2.00; flops_tred = ( ( 4.0 / 3.0 ) * m * m * m ); flops_tevd = ( 4.5 * k * m * m + 3.0 * k * m * m * m ); if ( variant == -1 || variant == -2 || variant == -4 || variant == -5 ) flops_appq = ( 2.0 * m * m * m ); else flops_appq = ( 4.0 / 3.0 * m * m * m ); /* if ( FLA_Obj_is_complex( A ) ) { *gflops = ( 4.0 * flops_tred + 2.0 * flops_tevd + 4.0 * flops_appq ) / *dtime / 1e9; *gflops_tred = ( 4.0 * flops_tred ) / *dtime_tred / 1e9; *gflops_tevd = ( 2.0 * flops_tevd ) / *dtime_tevd / 1e9; *gflops_appq = ( 4.0 * flops_appq ) / *dtime_appq / 1e9; } else { *gflops = ( 1.0 * flops_tred + 1.0 * flops_tevd + 1.0 * flops_appq ) / *dtime / 1e9; *gflops_tred = ( 1.0 * flops_tred ) / *dtime_tred / 1e9; *gflops_tevd = ( 1.0 * flops_tevd ) / *dtime_tevd / 1e9; *gflops_appq = ( 1.0 * flops_appq ) / *dtime_appq / 1e9; } */ if ( FLA_Obj_is_complex( A ) ) { mult_tred = 4.0; mult_tevd = 2.0; mult_appq = 4.0; } else { mult_tred = 1.0; mult_tevd = 1.0; mult_appq = 1.0; } *gflops = ( mult_tred * flops_tred + mult_tevd * flops_tevd + mult_appq * flops_appq ) / *dtime / 1e9; *gflops_tred = ( mult_tred * flops_tred ) / *dtime_tred / 1e9; *gflops_tevd = ( mult_tevd * flops_tevd ) / *dtime_tevd / 1e9; *gflops_appq = ( mult_appq * flops_appq ) / *dtime_appq / 1e9; FLA_Copy_external( A_save, A ); FLA_Obj_free( &A_save ); FLA_Obj_free( &Z ); }
void time_Tevd_v( int variant, int type, int n_repeats, int m, int k_accum, int b_alg, int n_iter_max, FLA_Obj A_orig, FLA_Obj d, FLA_Obj e, FLA_Obj G, FLA_Obj R, FLA_Obj W, FLA_Obj A, FLA_Obj l, double *dtime, double *diff1, double* diff2, double *gflops ) { int irep; double k, dtime_old = 1.0e9; FLA_Obj A_save, G_save, d_save, e_save; if ( //( variant == 0 ) || //( variant == 1 && type == FLA_ALG_UNB_OPT ) || //( variant == 2 && type == FLA_ALG_UNB_OPT ) || FALSE ) { *dtime = 0.0; *gflops = 0.0; *diff1 = 0.0; *diff2 = 0.0; return; } FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_save ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, G, &G_save ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, d, &d_save ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, e, &e_save ); FLA_Copy_external( A, A_save ); FLA_Copy_external( G, G_save ); FLA_Copy_external( d, d_save ); FLA_Copy_external( e, e_save ); for ( irep = 0 ; irep < n_repeats; irep++ ){ FLA_Copy_external( A_save, A ); FLA_Copy_external( G_save, G ); FLA_Copy_external( d_save, d ); FLA_Copy_external( e_save, e ); *dtime = FLA_Clock(); switch( variant ){ case 0: REF_Tevd_v( d, e, A ); break; // Time variant 1 case 1: { switch( type ){ case FLA_ALG_UNB_OPT: FLA_Tevd_v_opt_var1( n_iter_max, d, e, G, A, b_alg ); break; } break; } // Time variant 2 case 2: { switch( type ){ case FLA_ALG_UNB_OPT: FLA_Tevd_v_opt_var2( n_iter_max, d, e, G, R, W, A, b_alg ); break; } break; } } *dtime = FLA_Clock() - *dtime; dtime_old = min( *dtime, dtime_old ); } { FLA_Obj V, A_rev_evd, norm, eye; FLA_Copy( d, l ); //FLA_Obj_show( "A_save", A_save, "%9.2e + %9.2e ", "" ); //FLA_Obj_show( "A_evd", A, "%9.2e + %9.2e ", "" ); FLA_Sort_evd( FLA_FORWARD, l, A ); FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &V ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_rev_evd ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &eye ); FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm ); FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, l, A ); FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, A, V, FLA_ZERO, A_rev_evd ); FLA_Triangularize( FLA_LOWER_TRIANGULAR, FLA_NONUNIT_DIAG, A_rev_evd ); /* FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A, D, FLA_ZERO, A_rev_evd ); FLA_Copy( A_rev_evd, D ); FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, D, V, FLA_ZERO, A_rev_evd ); FLA_Triangularize( FLA_LOWER_TRIANGULAR, FLA_NONUNIT_DIAG, A_rev_evd ); */ //FLA_Obj_show( "A_rev_evd", A_rev_evd, "%9.2e + %9.2e ", "" ); FLA_Axpy( FLA_MINUS_ONE, A_orig, A_rev_evd ); FLA_Norm_frob( A_rev_evd, norm ); FLA_Obj_extract_real_scalar( norm, diff1 ); //*diff = FLA_Max_elemwise_diff( A_orig, A_rev_evd ); FLA_Set_to_identity( eye ); FLA_Copy( V, A_rev_evd ); FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, V, A_rev_evd, FLA_MINUS_ONE, eye ); FLA_Norm_frob( eye, norm ); FLA_Obj_extract_real_scalar( norm, diff2 ); /* FLA_Obj_free( &EL ); FLA_Obj_free( &EU ); FLA_Obj_free( &D ); FLA_Obj_free( &dc ); FLA_Obj_free( &ec ); */ FLA_Obj_free( &V ); FLA_Obj_free( &A_rev_evd ); FLA_Obj_free( &eye ); FLA_Obj_free( &norm ); } k = 2.00; if ( FLA_Obj_is_complex( A ) ) { *gflops = ( ( 4.5 * k * m * m ) + 2.0 * ( 3.0 * k * m * m * m ) ) / dtime_old / 1e9; } else { *gflops = ( ( 4.5 * k * m * m ) + 1.0 * ( 3.0 * k * m * m * m ) ) / dtime_old / 1e9; } *dtime = dtime_old; FLA_Copy_external( A_save, A ); FLA_Copy_external( G_save, G ); FLA_Copy_external( d_save, d ); FLA_Copy_external( e_save, e ); FLA_Obj_free( &A_save ); FLA_Obj_free( &G_save ); FLA_Obj_free( &d_save ); FLA_Obj_free( &e_save ); }
void libfla_test_apqut_experiment( test_params_t params, unsigned int var, char* sc_str, FLA_Datatype datatype, unsigned int p_cur, unsigned int pci, unsigned int n_repeats, signed int impl, double* perf, double* residual ) { dim_t b_flash = params.b_flash; dim_t b_alg_flat = params.b_alg_flat; double time_min = 1e9; double time; unsigned int i; unsigned int m, n; unsigned int min_m_n; signed int m_input; signed int n_input; FLA_Side side; FLA_Trans trans; FLA_Direct direct; FLA_Store storev; FLA_Obj A, T, W, B, eye, norm; FLA_Obj B_save; FLA_Obj A_test, T_test, W_test, B_test; // Translate parameter characters to libflame constants. FLA_Param_map_char_to_flame_side( &pc_str[pci][0], &side ); FLA_Param_map_char_to_flame_trans( &pc_str[pci][1], &trans ); FLA_Param_map_char_to_flame_direct( &pc_str[pci][2], &direct ); FLA_Param_map_char_to_flame_storev( &pc_str[pci][3], &storev ); // We want to make sure the Apply_Q_UT routines work with rectangular // matrices. So we use m > n when testing with column-wise storage (via // QR factorization) and m < n when testing with row-wise storage (via // LQ factorization). if ( storev == FLA_COLUMNWISE ) { m_input = -1; n_input = -1; //m_input = -1; //n_input = -1; } else // if ( storev == FLA_ROWWISE ) { m_input = -1; n_input = -1; //m_input = -1; //n_input = -1; } // Determine the dimensions. if ( m_input < 0 ) m = p_cur * abs(m_input); else m = p_cur; if ( n_input < 0 ) n = p_cur * abs(n_input); else n = p_cur; // Compute the minimum dimension. min_m_n = min( m, n ); // Create the matrices for the current operation. libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[0], m, n, &A ); libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], b_alg_flat, min_m_n, &T ); if ( storev == FLA_COLUMNWISE ) libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[2], m, m, &B ); else libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[2], n, n, &B ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, B, &eye ); FLA_Apply_Q_UT_create_workspace( T, B, &W ); // Create a real scalar object to hold the norm of A. FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm ); // Initialize the test matrices. FLA_Random_matrix( A ); FLA_Set_to_identity( B ); FLA_Set_to_identity( eye ); // Save the original object contents in a temporary object. FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, B, &B_save ); // Use hierarchical matrices if we're testing the FLASH front-end. if ( impl == FLA_TEST_HIER_FRONT_END ) { if ( storev == FLA_COLUMNWISE ) FLASH_QR_UT_create_hier_matrices( A, 1, &b_flash, &A_test, &T_test ); else // if ( storev == FLA_ROWWISE ) FLASH_LQ_UT_create_hier_matrices( A, 1, &b_flash, &A_test, &T_test ); FLASH_Obj_create_hier_copy_of_flat( B, 1, &b_flash, &B_test ); FLASH_Apply_Q_UT_create_workspace( T_test, B_test, &W_test ); } else // if ( impl == FLA_TEST_FLAT_FRONT_END ) { A_test = A; T_test = T; W_test = W; B_test = B; } // Compute a Householder factorization. if ( impl == FLA_TEST_HIER_FRONT_END ) { if ( storev == FLA_COLUMNWISE ) FLASH_QR_UT( A_test, T_test ); else FLASH_LQ_UT( A_test, T_test ); } else // if ( impl == FLA_TEST_FLAT_FRONT_END ) { if ( storev == FLA_COLUMNWISE ) FLA_QR_UT( A_test, T_test ); else FLA_LQ_UT( A_test, T_test ); } // Repeat the experiment n_repeats times and record results. for ( i = 0; i < n_repeats; ++i ) { if ( impl == FLA_TEST_HIER_FRONT_END ) FLASH_Obj_hierarchify( B_save, B_test ); else FLA_Copy_external( B_save, B_test ); time = FLA_Clock(); libfla_test_apqut_impl( impl, side, trans, direct, storev, A_test, T_test, W_test, B_test ); time = FLA_Clock() - time; time_min = min( time_min, time ); } // Multiply by its conjugate-transpose to get what should be (near) identity // and then subtract from actual identity to get what should be (near) zero. if ( impl == FLA_TEST_HIER_FRONT_END ) { FLASH_Obj_flatten( B_test, B ); FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, B, B, FLA_MINUS_ONE, eye ); } else // if ( impl == FLA_TEST_FLAT_FRONT_END ) { FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, B, B, FLA_MINUS_ONE, eye ); } // Free the hierarchical matrices if we're testing the FLASH front-end. if ( impl == FLA_TEST_HIER_FRONT_END ) { FLASH_Obj_free( &A_test ); FLASH_Obj_free( &T_test ); FLASH_Obj_free( &W_test ); FLASH_Obj_free( &B_test ); } // Compute the norm of eye, which contains I - Q * Q'. FLA_Norm1( eye, norm ); FLA_Obj_extract_real_scalar( norm, residual ); // Compute the performance of the best experiment repeat. *perf = ( 4.0 * m * min_m_n * n - 2.0 * min_m_n * min_m_n * n ) / time_min / FLOPS_PER_UNIT_PERF; if ( FLA_Obj_is_complex( A ) ) *perf *= 4.0; // Free the supporting flat objects. FLA_Obj_free( &B_save ); // Free the flat test matrices. FLA_Obj_free( &A ); FLA_Obj_free( &T ); FLA_Obj_free( &W ); FLA_Obj_free( &B ); FLA_Obj_free( &eye ); FLA_Obj_free( &norm ); }
int main( int argc, char** argv ) { FLA_Datatype datatype = TESTTYPE; FLA_Obj A, Ak, T, Tk, D, Dk, A_copy, A_recovered, L, Q, Qk, W, x, y, z; dim_t m, n, k; dim_t min_m_n; FLA_Error init_result; double residual_A, residual_Axy; int use_form_q = 1; if ( argc == 4 ) { m = atoi(argv[1]); n = atoi(argv[2]); k = atoi(argv[3]); min_m_n = min(m,n); } else { fprintf(stderr, " \n"); fprintf(stderr, "Usage: %s m n k\n", argv[0]); fprintf(stderr, " m : matrix length\n"); fprintf(stderr, " n : matrix width\n"); fprintf(stderr, " k : number of house holder vectors applied for testing\n"); fprintf(stderr, " \n"); return -1; } if ( m == 0 || n == 0 ) return 0; FLA_Init_safe( &init_result ); // FLAME LQ^H setup FLA_Obj_create( datatype, m, n, 0, 0, &A ); FLA_LQ_UT_create_T( A, &T ); // Rand A and create A_copy. FLA_Random_matrix( A ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_copy ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_recovered ); FLA_Copy( A, A_copy ); // LQ test ( A = L Q^H ) FLA_LQ_UT( A, T ); // Create Q (identity), L (A_copy) FLA_Obj_create( datatype, m, n, 0, 0, &Q ); FLA_Set_to_identity( Q ); FLA_Obj_create( datatype, m, m, 0, 0, &D ); FLA_Obj_create( datatype, k, n, 0, 0, &Qk ); FLA_Set_to_identity( Qk ); FLA_Obj_create( datatype, k, k, 0, 0, &Dk ); FLA_Obj_create( datatype, m, m, 0, 0, &L ); // Q^H := I H_{0}^H ... H_{k-1}^H if ( use_form_q ) { FLA_LQ_UT_form_Q( A, T, Q ); } else { FLA_Apply_Q_UT_create_workspace_side( FLA_RIGHT, T, Q, &W ); FLA_Apply_Q_UT( FLA_RIGHT, FLA_CONJ_TRANSPOSE, FLA_FORWARD, FLA_ROWWISE, A, T, W, Q ); FLA_Obj_free( &W ); } // D := Q^T Q FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, Q, Q, FLA_ZERO, D ); // Qk := I H0 ... Hk FLA_Part_1x2( T, &Tk, &W, k, FLA_LEFT ); FLA_Part_2x1( A, &Ak, &W, k, FLA_TOP ); if ( use_form_q ) { // Overwrite the result to test FLAME API FLA_Set( FLA_ZERO, Qk ); FLA_Copy( Ak, Qk ); FLA_LQ_UT_form_Q( Ak, Tk, Qk ); } else { FLA_Apply_Q_UT_create_workspace( Tk, Qk, &W ); FLA_Apply_Q_UT( FLA_LEFT, FLA_NO_TRANSPOSE, FLA_FORWARD, FLA_ROWWISE, Ak, Tk, W, Qk ); FLA_Obj_free( &W ); } // Dk := Qk^T Qk FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, Qk, Qk, FLA_ZERO, Dk ); // L := A (Q^H)^H if ( use_form_q ) { // Note that the formed Q is actually Q^H; transb should be carefully assigned. FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, A_copy, Q, FLA_ZERO, L ); } else { FLA_Apply_Q_UT_create_workspace( T, L, &W ); FLA_Apply_Q_UT( FLA_RIGHT, FLA_NO_TRANSPOSE, FLA_FORWARD, FLA_ROWWISE, A, T, W, L ); FLA_Obj_free( &W ); } FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, L, Q, FLA_ZERO, A_recovered ); // Create vectors for testing FLA_Obj_create( datatype, n, 1, 0, 0, &x ); FLA_Set( FLA_ZERO, x ); FLA_Obj_create( datatype, m, 1, 0, 0, &y ); FLA_Set( FLA_ZERO, y ); FLA_Obj_create( datatype, m, 1, 0, 0, &z ); FLA_Set( FLA_ZERO, z ); // x is given FLA_Set( FLA_ONE, x ); // y := Ax FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_ONE, A_copy, x, FLA_ZERO, y ); // z := L (Q^H) x , libflame FLA_Apply_Q_UT_create_workspace( T, x, &W ); FLA_Apply_Q_UT( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_FORWARD, FLA_ROWWISE, A, T, W, x ); FLA_Obj_free( &W ); if ( m < n ) FLA_Part_2x1( x, &x, &W, m, FLA_TOP ); else FLA_Part_1x2( L, &L, &W, n, FLA_LEFT ); FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_ONE, L, x, FLA_ZERO, z ); // Comapre (A_copy, A_recovered), (y,z) and (y,w) residual_A = FLA_Max_elemwise_diff( A_copy, A_recovered ); residual_Axy = FLA_Max_elemwise_diff( y, z ); if ( 1 || residual_A > EPS || residual_Axy > EPS ) { FLA_Obj_fshow( stdout, " - Given - ", A_copy, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - Factor - ", A, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - T - ", T, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - Q - ", Q, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - D = Q^T Q - ", D, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - Qk - ", Qk, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - Dk = Qk^T Qk - ", Dk, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - L - ", L, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - Recovered A - ", A_recovered, "% 6.4e", "------"); fprintf( stdout, "lapack2flame: %lu x %lu, %lu: ", m, n, k); fprintf( stdout, "| A - A_recovered | = %12.10e, | Ax - y | = %12.10e\n\n", residual_A, residual_Axy ) ; } FLA_Obj_free( &A ); FLA_Obj_free( &T ); FLA_Obj_free( &A_copy ); FLA_Obj_free( &A_recovered ); FLA_Obj_free( &L ); FLA_Obj_free( &Q ); FLA_Obj_free( &Qk ); FLA_Obj_free( &D ); FLA_Obj_free( &Dk ); FLA_Obj_free( &x ); FLA_Obj_free( &y ); FLA_Obj_free( &z ); FLA_Finalize_safe( init_result ); }
void time_Hess_UT( int variant, int type, int nrepeats, int m, FLA_Obj A, FLA_Obj A_ref, FLA_Obj t, FLA_Obj T, FLA_Obj W, double *dtime, double *diff, double *gflops ) { int irep; double dtime_old = 1.0e9; FLA_Obj A_save, norm; FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_save ); FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm ); FLA_Copy_external( A, A_save ); for ( irep = 0 ; irep < nrepeats; irep++ ){ FLA_Copy_external( A_save, A ); *dtime = FLA_Clock(); switch( variant ){ case 0:{ switch( type ){ case FLA_ALG_REFERENCE: REF_Hess_UT( A, t ); break; case FLA_ALG_FRONT: FLA_Hess_UT( A, T ); break; default: printf("trouble\n"); } break; } } *dtime = FLA_Clock() - *dtime; dtime_old = min( *dtime, dtime_old ); } //if ( type == FLA_ALG_REFERENCE ) //{ // ; //} //else { FLA_Obj AT, AB; FLA_Obj Q, QT, QB; FLA_Obj E, ET, EB; FLA_Obj F; dim_t m_A, m_T; m_A = FLA_Obj_length( A ); m_T = FLA_Obj_length( T ); FLA_Obj_create( FLA_Obj_datatype( A ), m_A, m_A, 0, 0, &Q ); FLA_Set_to_identity( Q ); FLA_Part_2x1( Q, &QT, &QB, 1, FLA_TOP ); FLA_Part_2x1( A, &AT, &AB, 1, FLA_TOP ); if ( type == FLA_ALG_REFERENCE ) { if ( FLA_Obj_is_real( A ) ) FLA_Apply_Q_blk_external( FLA_LEFT, FLA_TRANSPOSE, FLA_COLUMNWISE, AB, t, QB ); else FLA_Apply_Q_blk_external( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_COLUMNWISE, AB, t, QB ); } else FLA_Apply_Q_UT( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_FORWARD, FLA_COLUMNWISE, AB, T, W, QB ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &E ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &F ); FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, A_save, Q, FLA_ZERO, E ); FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, Q, E, FLA_ZERO, F ); FLA_Copy( A, E ); FLA_Part_2x1( E, &ET, &EB, 1, FLA_TOP ); FLA_Triangularize( FLA_UPPER_TRIANGULAR, FLA_NONUNIT_DIAG, EB ); *diff = FLA_Max_elemwise_diff( E, F ); FLA_Obj_free( &Q ); FLA_Obj_free( &E ); FLA_Obj_free( &F ); } *gflops = ( 10.0 / 3.0 * m * m * m ) / dtime_old / 1e9; if ( FLA_Obj_is_complex( A ) ) *gflops *= 4.0; *dtime = dtime_old; FLA_Copy_external( A_save, A ); FLA_Obj_free( &A_save ); FLA_Obj_free( &norm ); }
void time_Bidiag_UT( int param_combo, int type, int nrepeats, int m, int n, FLA_Obj A, FLA_Obj tu, FLA_Obj tv, FLA_Obj TU, FLA_Obj TV, double *dtime, double *diff, double *gflops ) { int irep; double dtime_old = 1.0e9; FLA_Obj A_save, norm; FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_save ); FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm ); FLA_Copy_external( A, A_save ); for ( irep = 0 ; irep < nrepeats; irep++ ){ FLA_Copy_external( A_save, A ); *dtime = FLA_Clock(); switch( param_combo ){ case 0: { switch( type ) { case FLA_ALG_REFERENCE: REF_Bidiag_UT( A, tu, tv ); break; case FLA_ALG_FRONT: FLA_Bidiag_UT( A, TU, TV ); break; } break; } } *dtime = FLA_Clock() - *dtime; dtime_old = min( *dtime, dtime_old ); } { FLA_Obj AL, AR; FLA_Obj ATL, ATR, ABL, ABR; FLA_Obj QU; FLA_Obj QV, QVL, QVR; FLA_Obj E, EL, ER; FLA_Obj F; FLA_Obj WU, WV, eye; FLA_Obj tvT, tvB; dim_t m_A, n_A, m_TU; //FLA_Obj_show( "A_save", A_save, "%10.3e", "" ); m_A = FLA_Obj_length( A ); n_A = FLA_Obj_width( A ); m_TU = FLA_Obj_length( TU ); FLA_Obj_create( FLA_Obj_datatype( A ), m_A, m_A, 0, 0, &QU ); FLA_Obj_create( FLA_Obj_datatype( A ), n_A, n_A, 0, 0, &QV ); FLA_Obj_create( FLA_Obj_datatype( A ), m_TU, m_A, 0, 0, &WU ); FLA_Obj_create( FLA_Obj_datatype( A ), m_TU, n_A, 0, 0, &WV ); FLA_Set_to_identity( QU ); FLA_Set_to_identity( QV ); FLA_Part_1x2( QV, &QVL, &QVR, 1, FLA_LEFT ); FLA_Part_1x2( A, &AL, &AR, 1, FLA_LEFT ); FLA_Part_2x2( A, &ATL, &ATR, &ABL, &ABR, 1, 1, FLA_BL ); FLA_Part_2x1( tv, &tvT, &tvB, 1, FLA_BOTTOM ); if ( type == FLA_ALG_REFERENCE ) { if ( FLA_Obj_is_real( A ) ) FLA_Apply_Q_blk_external( FLA_LEFT, FLA_TRANSPOSE, FLA_COLUMNWISE, A, tu, QU ); else FLA_Apply_Q_blk_external( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_COLUMNWISE, A, tu, QU ); //FLA_Apply_Q_blk_external( FLA_RIGHT, FLA_NO_TRANSPOSE, FLA_ROWWISE, AR, tv, QVR ); // // Need to apply backwards transformation, since vectors are stored columnwise. // QL? RQ? // //FLA_Apply_Q_blk_external( FLA_RIGHT, FLA_NO_TRANSPOSE, FLA_ROWWISE, ATR, tvT, QVR ); //FLA_Apply_Q_blk_external( FLA_RIGHT, FLA_CONJ_TRANSPOSE, FLA_ROWWISE, ATR, tvT, QVR ); //FLA_Apply_Q_blk_external( FLA_RIGHT, FLA_CONJ_TRANSPOSE, FLA_ROWWISE, AR, tvT, QVR ); } else { FLA_Apply_Q_UT( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_FORWARD, FLA_COLUMNWISE, A, TU, WU, QU ); FLA_Apply_Q_UT( FLA_RIGHT, FLA_NO_TRANSPOSE, FLA_FORWARD, FLA_ROWWISE, AR, TV, WV, QVR ); } /* FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &eye ); FLA_Set_to_identity( eye ); //FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, // FLA_ONE, QV, QV, FLA_MINUS_ONE, eye ); FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, QU, QU, FLA_MINUS_ONE, eye ); FLA_Obj_show( "eye", eye, "%10.3e", "" ); FLA_Norm_frob( eye, norm ); FLA_Obj_extract_real_scalar( norm, diff ); FLA_Obj_free( &eye ); */ FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &E ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &F ); FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A_save, QV, FLA_ZERO, E ); FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, QU, E, FLA_ZERO, F ); //FLA_Obj_show( "A_save", A_save, "%10.3e", "" ); FLA_Copy( A, E ); FLA_Triangularize( FLA_UPPER_TRIANGULAR, FLA_NONUNIT_DIAG, E ); FLA_Part_1x2( E, &EL, &ER, 1, FLA_LEFT ); FLA_Triangularize( FLA_LOWER_TRIANGULAR, FLA_NONUNIT_DIAG, ER ); //FLA_Obj_show( "B", E, "%10.3e", "" ); //FLA_Obj_show( "Q'AV", F, "%10.3e", "" ); //FLA_Obj_show( "B", E, "%10.3e + %10.3e ", "" ); //FLA_Obj_show( "Q'AV", F, "%10.3e + %10.3e ", "" ); *diff = FLA_Max_elemwise_diff( E, F ); FLA_Obj_free( &E ); FLA_Obj_free( &F ); FLA_Obj_free( &QU ); FLA_Obj_free( &QV ); FLA_Obj_free( &WU ); FLA_Obj_free( &WV ); } *gflops = 4.0 * n * n * ( m - n / 3.0 ) / dtime_old / 1e9; if ( FLA_Obj_is_complex( A ) ) *gflops *= 4.0; *dtime = dtime_old; FLA_Copy_external( A_save, A ); FLA_Obj_free( &A_save ); FLA_Obj_free( &norm ); }
void libfla_test_hessut_experiment( test_params_t params, unsigned int var, char* sc_str, FLA_Datatype datatype, unsigned int p_cur, unsigned int pci, unsigned int n_repeats, signed int impl, double* perf, double* residual ) { dim_t b_alg_flat = params.b_alg_flat; double time_min = 1e9; double time; unsigned int i; unsigned int m; signed int m_input = -1; FLA_Obj A, T, W, Qh, AQ, QhAQ, norm; FLA_Obj AT, AB; FLA_Obj QhT, QhB; FLA_Obj A_save; // Determine the dimensions. if ( m_input < 0 ) m = p_cur * abs(m_input); else m = p_cur; // Create the matrices for the current operation. libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[0], m, m, &A ); if ( impl == FLA_TEST_FLAT_FRONT_END || impl == FLA_TEST_FLAT_BLK_VAR ) { libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], b_alg_flat, m, &T ); libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], b_alg_flat, m, &W ); } else { libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], m, m, &T ); libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], m, m, &W ); } // Initialize the test matrices. FLA_Random_matrix( A ); // Save the original object contents in a temporary object. FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &A_save ); // Create auxiliary matrices to be used when checking the result. FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &Qh ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &AQ ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &QhAQ ); // Create a real scalar object to hold the norm of A. FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm ); // Create a control tree for the individual variants. if ( impl == FLA_TEST_FLAT_UNB_VAR || impl == FLA_TEST_FLAT_OPT_VAR || impl == FLA_TEST_FLAT_BLK_VAR ) libfla_test_hessut_cntl_create( var, b_alg_flat ); // Repeat the experiment n_repeats times and record results. for ( i = 0; i < n_repeats; ++i ) { FLA_Copy_external( A_save, A ); time = FLA_Clock(); libfla_test_hessut_impl( impl, A, T ); time = FLA_Clock() - time; time_min = min( time_min, time ); } // Free the control trees if we're testing the variants. if ( impl == FLA_TEST_FLAT_UNB_VAR || impl == FLA_TEST_FLAT_OPT_VAR || impl == FLA_TEST_FLAT_BLK_VAR ) libfla_test_hessut_cntl_free(); // Compute the performance of the best experiment repeat. *perf = ( 10.0 / 3.0 * m * m * m ) / time_min / FLOPS_PER_UNIT_PERF; if ( FLA_Obj_is_complex( A ) ) *perf *= 4.0; // Check the result by computing R - Q' A_orig Q. FLA_Set_to_identity( Qh ); FLA_Part_2x1( Qh, &QhT, &QhB, 1, FLA_TOP ); FLA_Part_2x1( A, &AT, &AB, 1, FLA_TOP ); FLA_Apply_Q_UT( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_FORWARD, FLA_COLUMNWISE, AB, T, W, QhB ); FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, A_save, Qh, FLA_ZERO, AQ ); FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, Qh, AQ, FLA_ZERO, QhAQ ); FLA_Triangularize( FLA_UPPER_TRIANGULAR, FLA_NONUNIT_DIAG, AB ); *residual = FLA_Max_elemwise_diff( A, QhAQ ); // Free the supporting flat objects. FLA_Obj_free( &W ); FLA_Obj_free( &Qh ); FLA_Obj_free( &AQ ); FLA_Obj_free( &QhAQ ); FLA_Obj_free( &norm ); FLA_Obj_free( &A_save ); // Free the flat test matrices. FLA_Obj_free( &A ); FLA_Obj_free( &T ); }