FLA_Error FLA_Chol_solve( FLA_Uplo uplo, FLA_Obj A, FLA_Obj B, FLA_Obj X ) { // Check parameters. if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING ) FLA_Chol_solve_check( uplo, A, B, X ); if ( FLA_Obj_is_identical( B, X ) == FALSE ) FLA_Copy_external( B, X ); if ( uplo == FLA_LOWER_TRIANGULAR ) { FLA_Trsm_external( FLA_LEFT, FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_ONE, A, X ); FLA_Trsm_external( FLA_LEFT, FLA_LOWER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_ONE, A, X ); } else // if ( uplo == FLA_UPPER_TRIANGULAR ) { FLA_Trsm_external( FLA_LEFT, FLA_UPPER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_ONE, A, X ); FLA_Trsm_external( FLA_LEFT, FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_ONE, A, X ); } return FLA_SUCCESS; }
FLA_Error FLA_QR_UT_solve( FLA_Obj A, FLA_Obj T, FLA_Obj B, FLA_Obj X ) { FLA_Obj W, Y; FLA_Obj AT, AB; FLA_Obj YT, YB; // Check parameters. if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING ) FLA_QR_UT_solve_check( A, T, B, X ); FLA_Apply_Q_UT_create_workspace( T, B, &W ); FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, B, &Y ); FLA_Apply_Q_UT( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_FORWARD, FLA_COLUMNWISE, A, T, W, Y ); FLA_Part_2x1( A, &AT, &AB, FLA_Obj_width( A ), FLA_TOP ); FLA_Part_2x1( Y, &YT, &YB, FLA_Obj_width( A ), FLA_TOP ); FLA_Trsm_external( FLA_LEFT, FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_ONE, AT, YT ); FLA_Copy_external( YT, X ); FLA_Obj_free( &Y ); FLA_Obj_free( &W ); return FLA_SUCCESS; }
FLA_Error FLA_LU_nopiv_unb_var3( FLA_Obj A ) { FLA_Obj ATL, ATR, A00, a01, A02, ABL, ABR, a10t, alpha11, a12t, A20, a21, A22; FLA_Part_2x2( A, &ATL, &ATR, &ABL, &ABR, 0, 0, FLA_TL ); while ( FLA_Obj_length( ATL ) < FLA_Obj_length( A ) && FLA_Obj_width( ATL ) < FLA_Obj_width( A )){ FLA_Repart_2x2_to_3x3( ATL, /**/ ATR, &A00, /**/ &a01, &A02, /* ************* */ /* ************************** */ &a10t, /**/ &alpha11, &a12t, ABL, /**/ ABR, &A20, /**/ &a21, &A22, 1, 1, FLA_BR ); /*------------------------------------------------------------*/ // a01 = trilu( A00 ) \ a10 FLA_Trsv_external( FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_UNIT_DIAG, A00, a01 ); // alpha11 = alpha11 - a10t * a01 FLA_Dots_external( FLA_MINUS_ONE, a10t, a01, FLA_ONE, alpha11 ); // a21 = a21 - A20 * a01 FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, A20, a01, FLA_ONE, a21 ); // a21 = a21 / alpha11 FLA_Inv_scal_external( alpha11, a21 ); /*------------------------------------------------------------*/ FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR, A00, a01, /**/ A02, a10t, alpha11, /**/ a12t, /* ************** */ /* ************************ */ &ABL, /**/ &ABR, A20, a21, /**/ A22, FLA_TL ); } if ( FLA_Obj_width( ATR ) > 0 ) // ATR = trilu( ATL ) \ ATR FLA_Trsm_external( FLA_LEFT, FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_UNIT_DIAG, FLA_ONE, ATL, ATR ); return FLA_SUCCESS; }
FLA_Error FLA_Trsm( FLA_Side side, FLA_Uplo uplo, FLA_Trans trans, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B ) { FLA_Error r_val = FLA_SUCCESS; // Check parameters. if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING ) FLA_Trsm_check( side, uplo, trans, diag, alpha, A, B ); #ifdef FLA_ENABLE_BLAS3_FRONT_END_CNTL_TREES r_val = FLA_Trsm_internal( side, uplo, trans, diag, alpha, A, B, fla_trsm_cntl_mm ); #else r_val = FLA_Trsm_external( side, uplo, trans, diag, alpha, A, B ); #endif return r_val; }
FLA_Error FLA_UDdate_UT_solve( FLA_Obj R, FLA_Obj bR, FLA_Obj x ) { // Check parameters. if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING ) FLA_UDdate_UT_solve_check( R, bR, x ); // Copy the contents of bR to x so that after the triangular solve, the // solution resides in x (and bR is preserved). FLA_Copy_external( bR, x ); // Perform a triangular solve with R the right-hand side. FLA_Trsm_external( FLA_LEFT, FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_ONE, R, x ); return FLA_SUCCESS; }
FLA_Error FLA_Trsm_rlt_blk_var3_ht( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Trsm_t* cntl ) { FLA_Obj BT, B0, BB, B1, B2; int b; FLA_Part_2x1( B, &BT, &BB, 0, FLA_TOP ); while ( FLA_Obj_length( BT ) < FLA_Obj_length( B ) ){ b = 1; FLA_Repart_2x1_to_3x1( BT, &B0, /* ** */ /* ** */ &B1, BB, &B2, b, FLA_BOTTOM ); /*------------------------------------------------------------*/ /* B1 = B1 / tril( A'); */ FLA_Trsm_external( FLA_RIGHT, FLA_LOWER_TRIANGULAR, FLA_TRANSPOSE, diagA, alpha, *FLASH_OBJ_PTR_AT( A ), *FLASH_OBJ_PTR_AT( B1 ), NULL ); /*------------------------------------------------------------*/ FLA_Cont_with_3x1_to_2x1( &BT, B0, B1, /* ** */ /* ** */ &BB, B2, FLA_TOP ); } return FLA_SUCCESS; }
FLA_Error FLA_Hess_UT_blk_var4( FLA_Obj A, FLA_Obj T ) { FLA_Obj ATL, ATR, A00, A01, A02, ABL, ABR, A10, A11, A12, A20, A21, A22; FLA_Obj UT, U0, UB, U1, U2; FLA_Obj YT, Y0, YB, Y1, Y2; FLA_Obj ZT, Z0, ZB, Z1, Z2; FLA_Obj TL, TR, T0, T1, T2; FLA_Obj U, Y, Z; FLA_Obj ABR_l; FLA_Obj UB_l, U2_l; FLA_Obj YB_l, Y2_l; FLA_Obj ZB_l, Z2_l; FLA_Obj WT_l; FLA_Obj T1_tl; FLA_Obj none, none2, none3; FLA_Obj UB_tl, UB_bl; FLA_Datatype datatype_A; dim_t m_A; dim_t b_alg, b, bb; b_alg = FLA_Obj_length( T ); datatype_A = FLA_Obj_datatype( A ); m_A = FLA_Obj_length( A ); FLA_Obj_create( datatype_A, m_A, b_alg, 0, 0, &U ); FLA_Obj_create( datatype_A, m_A, b_alg, 0, 0, &Y ); FLA_Obj_create( datatype_A, m_A, b_alg, 0, 0, &Z ); FLA_Part_2x2( A, &ATL, &ATR, &ABL, &ABR, 0, 0, FLA_TL ); FLA_Part_2x1( U, &UT, &UB, 0, FLA_TOP ); FLA_Part_2x1( Y, &YT, &YB, 0, FLA_TOP ); FLA_Part_2x1( Z, &ZT, &ZB, 0, FLA_TOP ); FLA_Part_1x2( T, &TL, &TR, 0, FLA_LEFT ); while ( FLA_Obj_length( ATL ) < FLA_Obj_length( A ) ) { b = min( FLA_Obj_length( ABR ), b_alg ); FLA_Repart_2x2_to_3x3( ATL, /**/ ATR, &A00, /**/ &A01, &A02, /* ************* */ /* ******************** */ &A10, /**/ &A11, &A12, ABL, /**/ ABR, &A20, /**/ &A21, &A22, b, b, FLA_BR ); FLA_Repart_2x1_to_3x1( UT, &U0, /* ** */ /* ** */ &U1, UB, &U2, b, FLA_BOTTOM ); FLA_Repart_2x1_to_3x1( YT, &Y0, /* ** */ /* ** */ &Y1, YB, &Y2, b, FLA_BOTTOM ); FLA_Repart_2x1_to_3x1( ZT, &Z0, /* ** */ /* ** */ &Z1, ZB, &Z2, b, FLA_BOTTOM ); FLA_Repart_1x2_to_1x3( TL, /**/ TR, &T0, /**/ &T1, &T2, b, FLA_RIGHT ); /*------------------------------------------------------------*/ FLA_Part_2x2( T1, &T1_tl, &none, &none2, &none3, b, b, FLA_TL ); bb = min( FLA_Obj_length( ABR ) - 1, b_alg ); FLA_Part_1x2( ABR, &ABR_l, &none, bb, FLA_LEFT ); FLA_Part_1x2( UB, &UB_l, &none, bb, FLA_LEFT ); FLA_Part_1x2( YB, &YB_l, &none, bb, FLA_LEFT ); FLA_Part_1x2( ZB, &ZB_l, &none, bb, FLA_LEFT ); FLA_Part_2x1( UB_l, &none, &U2_l, b, FLA_TOP ); FLA_Part_2x1( YB_l, &none, &Y2_l, b, FLA_TOP ); FLA_Part_2x1( ZB_l, &none, &Z2_l, b, FLA_TOP ); // [ ABR, YB, ZB, T1 ] = FLA_Hess_UT_step_unb_var4( ABR, YB, ZB, T1, b ); //FLA_Hess_UT_step_unb_var4( ABR, YB, ZB, T1_tl ); //FLA_Hess_UT_step_ofu_var4( ABR, YB, ZB, T1_tl ); FLA_Hess_UT_step_opt_var4( ABR, YB, ZB, T1_tl ); // Build UB from ABR, with explicit unit subdiagonal and zeros. FLA_Copy_external( ABR_l, UB_l ); FLA_Part_2x1( UB_l, &UB_tl, &UB_bl, 1, FLA_TOP ); FLA_Triangularize( FLA_LOWER_TRIANGULAR, FLA_UNIT_DIAG, UB_bl ); FLA_Set( FLA_ZERO, UB_tl ); // ATR = ATR - ATR * UB * inv( triu( T ) ) * UB' ); if ( FLA_Obj_length( ATR ) > 0 ) { // NOTE: We use ZT as temporary workspace. FLA_Part_1x2( ZT, &WT_l, &none, bb, FLA_LEFT ); FLA_Part_2x2( T1, &T1_tl, &none, &none2, &none3, bb, bb, FLA_TL ); // WT_l = ATR * UB_l * inv( triu( T ) ). FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, ATR, UB_l, FLA_ZERO, WT_l ); FLA_Trsm_external( FLA_RIGHT, FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_ONE, T1_tl, WT_l ); // ATR = ATR - WT_l * UB_l' FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_MINUS_ONE, WT_l, UB_l, FLA_ONE, ATR ); } // A22 = A22 - U2 * Y2' - Z2 * U2'; FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_MINUS_ONE, U2_l, Y2_l, FLA_ONE, A22 ); FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_MINUS_ONE, Z2_l, U2_l, FLA_ONE, A22 ); /*------------------------------------------------------------*/ FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR, A00, A01, /**/ A02, A10, A11, /**/ A12, /* ************** */ /* ****************** */ &ABL, /**/ &ABR, A20, A21, /**/ A22, FLA_TL ); FLA_Cont_with_3x1_to_2x1( &UT, U0, U1, /* ** */ /* ** */ &UB, U2, FLA_TOP ); FLA_Cont_with_3x1_to_2x1( &YT, Y0, Y1, /* ** */ /* ** */ &YB, Y2, FLA_TOP ); FLA_Cont_with_3x1_to_2x1( &ZT, Z0, Z1, /* ** */ /* ** */ &ZB, Z2, FLA_TOP ); FLA_Cont_with_1x3_to_1x2( &TL, /**/ &TR, T0, T1, /**/ T2, FLA_LEFT ); } FLA_Obj_free( &U ); FLA_Obj_free( &Y ); FLA_Obj_free( &Z ); return FLA_SUCCESS; }
void time_Apply_Q( int param_combo, int type, int nrepeats, int m, int n, FLA_Obj A, FLA_Obj B, FLA_Obj B_ref, FLA_Obj t, FLA_Obj T, FLA_Obj W, double *dtime, double *diff, double *gflops ) { int irep; double dtime_old = 1.0e9; FLA_Obj B_save, A_flat, B_flat; FLASH_Obj_create_conf_to( FLA_NO_TRANSPOSE, B, &B_save ); FLASH_Obj_create_flat_conf_to_hier( FLA_NO_TRANSPOSE, A, &A_flat ); FLASH_Obj_create_flat_conf_to_hier( FLA_NO_TRANSPOSE, B, &B_flat ); FLASH_Copy( B, B_save ); for ( irep = 0 ; irep < nrepeats; irep++ ) { FLASH_Copy( B_save, B ); FLASH_Obj_flatten( A, A_flat ); FLASH_Obj_flatten( B, B_flat ); *dtime = FLA_Clock(); switch( param_combo ){ // Time parameter combination 0 case 0:{ switch( type ){ case FLA_ALG_REFERENCE: REF_Apply_Q( FLA_LEFT, FLA_TRANSPOSE, FLA_COLUMNWISE, A_flat, t, B_flat ); break; case FLA_ALG_FRONT: //printf("\n"); FLASH_Apply_Q_UT( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_FORWARD, FLA_COLUMNWISE, A, T, W, B ); break; default: printf("trouble\n"); } break; } } *dtime = FLA_Clock() - *dtime; dtime_old = min( *dtime, dtime_old ); } if ( type == FLA_ALG_REFERENCE ) { FLA_Trsm_external( FLA_LEFT, FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_ONE, A_flat, B_flat ); FLASH_Obj_hierarchify( B_flat, B_ref ); *diff = 0.0; } else { FLASH_Trsm( FLA_LEFT, FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_ONE, A, B ); *diff = FLASH_Max_elemwise_diff( B, B_ref ); } *gflops = 2.0 * FLASH_Obj_scalar_length( A ) * FLASH_Obj_scalar_width( A ) * FLASH_Obj_scalar_width( B ) / dtime_old / 1.0e9; if ( FLA_Obj_is_complex( A ) ) *gflops *= 4.0; *dtime = dtime_old; FLASH_Copy( B_save, B ); FLASH_Obj_free( &B_save ); FLASH_Obj_free( &A_flat ); FLASH_Obj_free( &B_flat ); }
void time_QR_UT( int variant, int type, int nrepeats, int m, int n, FLA_Obj A, FLA_Obj A_ref, FLA_Obj t, FLA_Obj T, FLA_Obj W, FLA_Obj b, FLA_Obj b_orig, double *dtime, double *diff, double *gflops ) { int irep; double dtime_old = 1.0e9; FLA_Obj A_save, b_save, norm; FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_save ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, b, &b_save ); if ( FLA_Obj_is_single_precision( A ) ) FLA_Obj_create( FLA_FLOAT, 1, 1, 0, 0, &norm ); else FLA_Obj_create( FLA_DOUBLE, 1, 1, 0, 0, &norm ); FLA_Copy_external( A, A_save ); FLA_Copy_external( b, b_save ); for ( irep = 0 ; irep < nrepeats; irep++ ){ FLA_Copy_external( A_save, A ); *dtime = FLA_Clock(); switch( variant ){ case 0:{ switch( type ){ case FLA_ALG_REFERENCE: REF_QR_UT( A, t ); break; case FLA_ALG_FRONT: FLA_QR_UT( A, T ); break; default: printf("trouble\n"); } break; } } *dtime = FLA_Clock() - *dtime; dtime_old = min( *dtime, dtime_old ); } if ( type == FLA_ALG_REFERENCE ) { FLA_Obj AT, AB; FLA_Obj bT, bB; FLA_Obj y; FLA_Obj_create( FLA_Obj_datatype( b ), n, 1, 0, 0, &y ); FLA_Copy_external( b, b_orig ); if ( FLA_Obj_is_real( A ) ) FLA_Apply_Q_blk_external( FLA_LEFT, FLA_TRANSPOSE, FLA_COLUMNWISE, A, t, b ); else FLA_Apply_Q_blk_external( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_COLUMNWISE, A, t, b ); FLA_Part_2x1( A, &AT, &AB, FLA_Obj_width( A ), FLA_TOP ); FLA_Part_2x1( b, &bT, &bB, FLA_Obj_width( A ), FLA_TOP ); FLA_Trsm_external( FLA_LEFT, FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_ONE, AT, bT ); FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, A_save, bT, FLA_ONE, b_orig ); FLA_Gemv_external( FLA_CONJ_TRANSPOSE, FLA_ONE, A_save, b_orig, FLA_ZERO, y ); FLA_Nrm2_external( y, norm ); FLA_Obj_extract_real_scalar( norm, diff ); FLA_Obj_free( &y ); } else { FLA_Obj x, y; FLA_Obj_create( FLA_Obj_datatype( b ), n, 1, 0, 0, &y ); FLA_Obj_create( FLA_Obj_datatype( b ), n, 1, 0, 0, &x ); FLA_Copy_external( b, b_orig ); FLA_QR_UT_solve( A, T, b, x ); FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, A_save, x, FLA_ONE, b_orig ); FLA_Gemv_external( FLA_CONJ_TRANSPOSE, FLA_ONE, A_save, b_orig, FLA_ZERO, y ); FLA_Nrm2_external( y, norm ); FLA_Obj_extract_real_scalar( norm, diff ); FLA_Obj_free( &x ); FLA_Obj_free( &y ); } *gflops = ( 2.0 * m * n * n - ( 2.0 / 3.0 ) * n * n * n ) / dtime_old / 1e9; if ( FLA_Obj_is_complex( A ) ) *gflops *= 4.0; *dtime = dtime_old; FLA_Copy_external( A_save, A ); FLA_Copy_external( b_save, b ); FLA_Obj_free( &A_save ); FLA_Obj_free( &b_save ); FLA_Obj_free( &norm ); }
FLA_Error REF_Trsm( FLA_Side side, FLA_Uplo uplo, FLA_Trans transA, FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B ) { FLA_Trsm_external( side, uplo, transA, diagA, alpha, A, B ); return 0; }
FLA_Error FLA_LU_piv_unb_var3( FLA_Obj A, FLA_Obj p ) { FLA_Obj ATL, ATR, A00, a01, A02, ABL, ABR, a10t, alpha11, a12t, A20, a21, A22; FLA_Obj AL, AR, A0, a1, A2; FLA_Obj pT, p0, pB, pi1, p2; FLA_Obj AB0, aB1; FLA_Part_2x2( A, &ATL, &ATR, &ABL, &ABR, 0, 0, FLA_TL ); FLA_Part_1x2( A, &AL, &AR, 0, FLA_LEFT ); FLA_Part_2x1( p, &pT, &pB, 0, FLA_TOP ); while ( FLA_Obj_length( ATL ) < FLA_Obj_length( A ) && FLA_Obj_width( ATL ) < FLA_Obj_width( A )){ FLA_Repart_2x2_to_3x3( ATL, /**/ ATR, &A00, /**/ &a01, &A02, /* ************* */ /* ************************** */ &a10t, /**/ &alpha11, &a12t, ABL, /**/ ABR, &A20, /**/ &a21, &A22, 1, 1, FLA_BR ); FLA_Repart_1x2_to_1x3( AL, /**/ AR, &A0, /**/ &a1, &A2, 1, FLA_RIGHT ); FLA_Repart_2x1_to_3x1( pT, &p0, /* ** */ /* *** */ &pi1, pB, &p2, 1, FLA_BOTTOM ); /*------------------------------------------------------------*/ // Apply previously computed pivots FLA_Apply_pivots( FLA_LEFT, FLA_NO_TRANSPOSE, p0, a1 ); // a01 = trilu( A00 ) \ a01 FLA_Trsv_external( FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_UNIT_DIAG, A00, a01 ); // alpha11 = alpha11 - a10t * a01 FLA_Dots_external( FLA_MINUS_ONE, a10t, a01, FLA_ONE, alpha11 ); // a21 = a21 - A20 * a01 FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, A20, a01, FLA_ONE, a21 ); // aB1 = / alpha11 \ // \ a21 / FLA_Merge_2x1( alpha11, a21, &aB1 ); // Determine pivot index FLA_Amax_external( aB1, pi1 ); // Apply pivots to current column FLA_Apply_pivots( FLA_LEFT, FLA_NO_TRANSPOSE, pi1, aB1 ); // a21 = a21 / alpha11 FLA_Inv_scal_external( alpha11, a21 ); // AB0 = / a10t \ // \ A20 / FLA_Merge_2x1( a10t, A20, &AB0 ); // Apply pivots to previous columns FLA_Apply_pivots( FLA_LEFT, FLA_NO_TRANSPOSE, pi1, AB0 ); /*------------------------------------------------------------*/ FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR, A00, a01, /**/ A02, a10t, alpha11, /**/ a12t, /* ************** */ /* ************************ */ &ABL, /**/ &ABR, A20, a21, /**/ A22, FLA_TL ); FLA_Cont_with_1x3_to_1x2( &AL, /**/ &AR, A0, a1, /**/ A2, FLA_LEFT ); FLA_Cont_with_3x1_to_2x1( &pT, p0, pi1, /* ** */ /* *** */ &pB, p2, FLA_TOP ); } if ( FLA_Obj_width( ATR ) > 0 ) { /* Apply pivots to untouched columns */ FLA_Apply_pivots( FLA_LEFT, FLA_NO_TRANSPOSE, p, ATR ); /* ATR = trilu( ATL ) \ ATR */ FLA_Trsm_external( FLA_LEFT, FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_UNIT_DIAG, FLA_ONE, ATL, ATR ); } return FLA_SUCCESS; }
void time_Eig_gest_nu( int variant, int type, int n_repeats, int n, int b_alg, FLA_Inv inv, FLA_Uplo uplo, FLA_Obj A, FLA_Obj Y, FLA_Obj B, double *dtime, double *diff, double *gflops ) { int irep; double dtime_save = 1.0e9; FLA_Obj A_save, B_save, norm; fla_blocksize_t* bp; fla_eig_gest_t* cntl_eig_gest_var; fla_eig_gest_t* cntl_eig_gest_unb; if ( ( type == FLA_ALG_UNBLOCKED || type == FLA_ALG_UNB_OPT ) && n > 300 ) { *gflops = 0.0; *diff = 0.0; return; } if ( variant == 3 ) { *gflops = 0.0; *diff = 0.0; return; } bp = FLA_Blocksize_create( b_alg, b_alg, b_alg, b_alg ); cntl_eig_gest_unb = FLA_Cntl_eig_gest_obj_create( FLA_FLAT, //FLA_UNBLOCKED_VARIANT1, FLA_UNB_OPT_VARIANT1, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL ); cntl_eig_gest_var = FLA_Cntl_eig_gest_obj_create( FLA_FLAT, variant, bp, cntl_eig_gest_unb, fla_axpy_cntl_blas, fla_axpy_cntl_blas, fla_gemm_cntl_blas, fla_gemm_cntl_blas, fla_gemm_cntl_blas, fla_hemm_cntl_blas, fla_her2k_cntl_blas, fla_trmm_cntl_blas, fla_trmm_cntl_blas, fla_trsm_cntl_blas, fla_trsm_cntl_blas ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_save ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, B, &B_save ); FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm ); FLA_Copy_external( A, A_save ); FLA_Copy_external( B, B_save ); for ( irep = 0 ; irep < n_repeats; irep++ ){ FLA_Copy_external( A_save, A ); FLA_Copy_external( B_save, B ); *dtime = FLA_Clock(); switch( variant ){ case 0: REF_Eig_gest_nu( A, B ); break; case 1: { // Time variant 1 switch( type ) { case FLA_ALG_UNBLOCKED: FLA_Eig_gest_nu_unb_var1( A, Y, B ); break; case FLA_ALG_UNB_OPT: FLA_Eig_gest_nu_opt_var1( A, Y, B ); break; case FLA_ALG_BLOCKED: FLA_Eig_gest_nu_blk_var1( A, Y, B, cntl_eig_gest_var ); break; default: printf("trouble\n"); } break; } case 2: { // Time variant 2 switch( type ) { case FLA_ALG_UNBLOCKED: FLA_Eig_gest_nu_unb_var2( A, Y, B ); break; case FLA_ALG_UNB_OPT: FLA_Eig_gest_nu_opt_var2( A, Y, B ); break; case FLA_ALG_BLOCKED: FLA_Eig_gest_nu_blk_var2( A, Y, B, cntl_eig_gest_var ); break; default: printf("trouble\n"); } break; } case 3: { // Time variant 3 switch( type ) { case FLA_ALG_UNBLOCKED: //FLA_Eig_gest_nu_unb_var3( A, Y, B ); break; case FLA_ALG_UNB_OPT: //FLA_Eig_gest_nu_opt_var3( A, Y, B ); break; case FLA_ALG_BLOCKED: //FLA_Eig_gest_nu_blk_var3( A, Y, B, cntl_eig_gest_var ); break; default: printf("trouble\n"); } break; } case 4: { // Time variant 4 switch( type ) { case FLA_ALG_UNBLOCKED: FLA_Eig_gest_nu_unb_var4( A, Y, B ); break; case FLA_ALG_UNB_OPT: FLA_Eig_gest_nu_opt_var4( A, Y, B ); break; case FLA_ALG_BLOCKED: FLA_Eig_gest_nu_blk_var4( A, Y, B, cntl_eig_gest_var ); break; default: printf("trouble\n"); } break; } case 5: { // Time variant 5 switch( type ) { case FLA_ALG_UNBLOCKED: FLA_Eig_gest_nu_unb_var5( A, Y, B ); break; case FLA_ALG_UNB_OPT: FLA_Eig_gest_nu_opt_var5( A, Y, B ); break; case FLA_ALG_BLOCKED: FLA_Eig_gest_nu_blk_var5( A, Y, B, cntl_eig_gest_var ); break; default: printf("trouble\n"); } break; } } *dtime = FLA_Clock() - *dtime; dtime_save = min( *dtime, dtime_save ); } FLA_Cntl_obj_free( cntl_eig_gest_var ); FLA_Cntl_obj_free( cntl_eig_gest_unb ); FLA_Blocksize_free( bp ); // Recover A. if ( inv == FLA_NO_INVERSE ) { if ( uplo == FLA_LOWER_TRIANGULAR ) { // A = L' * A_orig * L // A_orig = inv(L') * A * inv(L) FLA_Hermitianize( FLA_LOWER_TRIANGULAR, A ); FLA_Trsm_external( FLA_LEFT, FLA_LOWER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_ONE, B, A ); FLA_Trsm_external( FLA_RIGHT, FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_ONE, B, A ); } else // if ( uplo == FLA_UPPER_TRIANGULAR ) { // A = U * A_orig * U' // A_orig = inv(U) * A * inv(U') FLA_Hermitianize( FLA_UPPER_TRIANGULAR, A ); FLA_Trsm_external( FLA_LEFT, FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_ONE, B, A ); FLA_Trsm_external( FLA_RIGHT, FLA_UPPER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_ONE, B, A ); } } else // if ( inv == FLA_INVERSE ) { if ( uplo == FLA_LOWER_TRIANGULAR ) { // A = inv(L) * A_orig * inv(L') // A_orig = L * A * L' FLA_Hermitianize( FLA_LOWER_TRIANGULAR, A ); FLA_Trmm_external( FLA_LEFT, FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_ONE, B, A ); FLA_Trmm_external( FLA_RIGHT, FLA_LOWER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_ONE, B, A ); } else // if ( uplo == FLA_UPPER_TRIANGULAR ) { // A = inv(U') * A_orig * inv(U) // A_orig = U' * A * U FLA_Hermitianize( FLA_UPPER_TRIANGULAR, A ); FLA_Trmm_external( FLA_LEFT, FLA_UPPER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_ONE, B, A ); FLA_Trmm_external( FLA_RIGHT, FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_ONE, B, A ); } } *diff = FLA_Max_elemwise_diff( A, A_save ); /* if ( type == FLA_ALG_UNBLOCKED ) { FLA_Obj_show( "A", A, "%10.3e", "" ); FLA_Obj_show( "A_orig", A_save, "%10.3e", "" ); } */ *gflops = 1.0 * FLA_Obj_length( A ) * FLA_Obj_length( A ) * FLA_Obj_length( A ) / dtime_save / 1e9; if ( FLA_Obj_is_complex( A ) ) *gflops *= 4.0; *dtime = dtime_save; FLA_Copy_external( A_save, A ); FLA_Copy_external( B_save, B ); FLA_Obj_free( &A_save ); FLA_Obj_free( &B_save ); FLA_Obj_free( &norm ); }
void time_Chol( int param_combo, int type, int nrepeats, int m, int n, FLA_Obj A, FLA_Obj A_ref, FLA_Obj T, FLA_Obj t_ref, FLA_Obj B, FLA_Obj B_ref, FLA_Obj X, FLA_Obj X_ref, FLA_Obj W, double *dtime, double *diff, double *gflops ) { int irep; double dtime_old = 1.0e9; FLA_Obj B_save; FLA_Obj normx; FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, B, &B_save ); if ( FLA_Obj_is_single_precision( A ) ) FLA_Obj_create( FLA_FLOAT, 1, 1, 0, 0, &normx ); else FLA_Obj_create( FLA_DOUBLE, 1, 1, 0, 0, &normx ); FLA_Copy_external( B, B_save ); for ( irep = 0 ; irep < nrepeats; irep++ ) { FLA_Copy_external( B_save, B ); FLA_Copy_external( B_save, B_ref ); *dtime = FLA_Clock(); switch( param_combo ){ case 0:{ switch( type ){ case FLA_ALG_REFERENCE: FLA_Copy_external( B_ref, X_ref ); //REF_Apply_Q( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_COLUMNWISE, A_ref, t_ref, X_ref ); REF_Apply_Q( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_COLUMNWISE, A_ref, t_ref, X_ref ); FLA_Trsm_external( FLA_LEFT, FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_ONE, A_ref, X_ref ); break; case FLA_ALG_FRONT: FLA_Copy_external( B, X ); FLA_Apply_Q_UT( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_FORWARD, FLA_COLUMNWISE, A, T, W, X ); FLA_Trsm_external( FLA_LEFT, FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_ONE, A, X ); break; default: printf("trouble\n"); } break; } case 1:{ switch( type ){ case FLA_ALG_REFERENCE: FLA_Copy_external( B_ref, X_ref ); FLA_Trsm_external( FLA_LEFT, FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_ONE, A_ref, X_ref ); //REF_Apply_Q( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_ROWWISE, A_ref, t_ref, X_ref ); REF_Apply_Q( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_ROWWISE, A_ref, t_ref, X_ref ); break; case FLA_ALG_FRONT: FLA_Copy_external( B, X ); FLA_Trsm_external( FLA_LEFT, FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_ONE, A, X ); FLA_Apply_Q_UT( FLA_LEFT, FLA_NO_TRANSPOSE, FLA_FORWARD, FLA_ROWWISE, A, T, W, X ); break; default: printf("trouble\n"); } break; } } *dtime = FLA_Clock() - *dtime; dtime_old = min( *dtime, dtime_old ); } if ( type == FLA_ALG_REFERENCE ) { //FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, A_ref, X_ref, FLA_ONE, B_ref ); //FLA_Nrm2_external( B_ref, normx ); //FLA_Copy_object_to_buffer( FLA_NO_TRANSPOSE, 0, 0, normx, 1, 1, diff, 1, 1 ); //FLA_Obj_show( "X_ref:", X_ref, "%12.4e", "" ); *diff = 0.0; } else { //FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, A, X, FLA_ONE, B ); //FLA_Nrm2_external( B, normx ); //FLA_Copy_object_to_buffer( FLA_NO_TRANSPOSE, 0, 0, normx, 1, 1, diff, 1, 1 ); //FLA_Obj_show( "X_fla:", X, "%12.4e", "" ); *diff = FLA_Max_elemwise_diff( X, X_ref ); } *gflops = 1.0 / 3.0 * FLA_Obj_length( A ) * FLA_Obj_length( B ) * FLA_Obj_width( B ) / dtime_old / 1e9; if ( FLA_Obj_is_complex( A ) ) *gflops *= 4.0; *dtime = dtime_old; FLA_Copy_external( B_save, B ); FLA_Copy_external( B_save, B_ref ); FLA_Obj_free( &B_save ); FLA_Obj_free( &normx ); }
FLA_Error FLA_Trsm_ruc_task( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl ) { return FLA_Trsm_external( FLA_RIGHT, FLA_UPPER_TRIANGULAR, FLA_CONJ_NO_TRANSPOSE, diag, alpha, A, B ); }
FLA_Error FLA_Trsm_lut_task( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl ) { return FLA_Trsm_external( FLA_LEFT, FLA_UPPER_TRIANGULAR, FLA_TRANSPOSE, diag, alpha, A, B ); }
FLA_Error FLA_Trsm_task( FLA_Side side, FLA_Uplo uplo, FLA_Trans trans, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl ) { return FLA_Trsm_external( side, uplo, trans, diag, alpha, A, B ); }