FLA_Error FLA_Syrk_ut_blk_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_syrk_t* cntl ) { FLA_Obj AL, AR, A0, A1, A2; FLA_Obj CTL, CTR, C00, C01, C02, CBL, CBR, C10, C11, C12, C20, C21, C22; dim_t b; FLA_Part_1x2( A, &AL, &AR, 0, FLA_RIGHT ); FLA_Part_2x2( C, &CTL, &CTR, &CBL, &CBR, 0, 0, FLA_BR ); while ( FLA_Obj_width( AR ) < FLA_Obj_width( A ) ){ b = FLA_Determine_blocksize( AL, FLA_LEFT, FLA_Cntl_blocksize( cntl ) ); FLA_Repart_1x2_to_1x3( AL, /**/ AR, &A0, &A1, /**/ &A2, b, FLA_LEFT ); FLA_Repart_2x2_to_3x3( CTL, /**/ CTR, &C00, &C01, /**/ &C02, &C10, &C11, /**/ &C12, /* ************* */ /* ******************** */ CBL, /**/ CBR, &C20, &C21, /**/ &C22, b, b, FLA_TL ); /*------------------------------------------------------------*/ /* C12 = C12 + A1' * A2 */ FLA_Gemm_internal( FLA_TRANSPOSE, FLA_NO_TRANSPOSE, alpha, A1, A2, beta, C12, FLA_Cntl_sub_gemm( cntl ) ); /* C11 = C11 + A1' * A1 */ FLA_Syrk_internal( FLA_UPPER_TRIANGULAR, FLA_TRANSPOSE, alpha, A1, beta, C11, FLA_Cntl_sub_syrk( cntl ) ); /*------------------------------------------------------------*/ FLA_Cont_with_1x3_to_1x2( &AL, /**/ &AR, A0, /**/ A1, A2, FLA_RIGHT ); FLA_Cont_with_3x3_to_2x2( &CTL, /**/ &CTR, C00, /**/ C01, C02, /* ************** */ /* ****************** */ C10, /**/ C11, C12, &CBL, /**/ &CBR, C20, /**/ C21, C22, FLA_BR ); } return FLA_SUCCESS; }
FLA_Error FLA_Hemm_ru_blk_var10( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl ) { FLA_Obj BT, B0, BB, B1, B2; FLA_Obj CT, C0, CB, C1, C2; dim_t b; FLA_Part_2x1( B, &BT, &BB, 0, FLA_BOTTOM ); FLA_Part_2x1( C, &CT, &CB, 0, FLA_BOTTOM ); while ( FLA_Obj_length( BB ) < FLA_Obj_length( B ) ){ b = FLA_Determine_blocksize( BT, FLA_TOP, FLA_Cntl_blocksize( cntl ) ); FLA_Repart_2x1_to_3x1( BT, &B0, &B1, /* ** */ /* ** */ BB, &B2, b, FLA_TOP ); FLA_Repart_2x1_to_3x1( CT, &C0, &C1, /* ** */ /* ** */ CB, &C2, b, FLA_TOP ); /*------------------------------------------------------------*/ /* C1 = C1 + B1 * A */ FLA_Hemm_internal( FLA_RIGHT, FLA_UPPER_TRIANGULAR, alpha, A, B1, beta, C1, FLA_Cntl_sub_hemm( cntl ) ); /*------------------------------------------------------------*/ FLA_Cont_with_3x1_to_2x1( &BT, B0, /* ** */ /* ** */ B1, &BB, B2, FLA_BOTTOM ); FLA_Cont_with_3x1_to_2x1( &CT, C0, /* ** */ /* ** */ C1, &CB, C2, FLA_BOTTOM ); } return FLA_SUCCESS; }
FLA_Error FLA_Gemm_ch_blk_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl ) { FLA_Obj AT, A0, AB, A1, A2; FLA_Obj CT, C0, CB, C1, C2; dim_t b; FLA_Part_2x1( A, &AT, &AB, 0, FLA_BOTTOM ); FLA_Part_2x1( C, &CT, &CB, 0, FLA_BOTTOM ); while ( FLA_Obj_length( AB ) < FLA_Obj_length( A ) ){ b = FLA_Determine_blocksize( AT, FLA_TOP, FLA_Cntl_blocksize( cntl ) ); FLA_Repart_2x1_to_3x1( AT, &A0, &A1, /* ** */ /* ** */ AB, &A2, b, FLA_TOP ); FLA_Repart_2x1_to_3x1( CT, &C0, &C1, /* ** */ /* ** */ CB, &C2, b, FLA_TOP ); /*------------------------------------------------------------*/ /* C1 = alpha * A1 * B' + C1; */ FLA_Gemm_internal( FLA_CONJ_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, alpha, A1, B, beta, C1, FLA_Cntl_sub_gemm( cntl ) ); /*------------------------------------------------------------*/ FLA_Cont_with_3x1_to_2x1( &AT, A0, /* ** */ /* ** */ A1, &AB, A2, FLA_BOTTOM ); FLA_Cont_with_3x1_to_2x1( &CT, C0, /* ** */ /* ** */ C1, &CB, C2, FLA_BOTTOM ); } return FLA_SUCCESS; }
FLA_Error FLA_Trmm_rut_blk_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl ) { FLA_Obj ATL, ATR, A00, A01, A02, ABL, ABR, A10, A11, A12, A20, A21, A22; FLA_Obj BL, BR, B0, B1, B2; dim_t b; FLA_Part_2x2( A, &ATL, &ATR, &ABL, &ABR, 0, 0, FLA_TL ); FLA_Part_1x2( B, &BL, &BR, 0, FLA_LEFT ); while ( FLA_Obj_length( ATL ) < FLA_Obj_length( A ) ){ b = FLA_Determine_blocksize( ABR, FLA_BR, FLA_Cntl_blocksize( cntl ) ); FLA_Repart_2x2_to_3x3( ATL, /**/ ATR, &A00, /**/ &A01, &A02, /* ************* */ /* ******************** */ &A10, /**/ &A11, &A12, ABL, /**/ ABR, &A20, /**/ &A21, &A22, b, b, FLA_BR ); FLA_Repart_1x2_to_1x3( BL, /**/ BR, &B0, /**/ &B1, &B2, b, FLA_RIGHT ); /*------------------------------------------------------------*/ /* B1 = B1 * triu( A11 )'; */ FLA_Trmm_internal( FLA_RIGHT, FLA_UPPER_TRIANGULAR, FLA_TRANSPOSE, diagA, alpha, A11, B1, FLA_Cntl_sub_trmm( cntl ) ); /* B1 = B1 + B2 * A12'; */ FLA_Gemm_internal( FLA_NO_TRANSPOSE, FLA_TRANSPOSE, alpha, B2, A12, FLA_ONE, B1, FLA_Cntl_sub_gemm( cntl ) ); /*------------------------------------------------------------*/ FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR, A00, A01, /**/ A02, A10, A11, /**/ A12, /* ************** */ /* ****************** */ &ABL, /**/ &ABR, A20, A21, /**/ A22, FLA_TL ); FLA_Cont_with_1x3_to_1x2( &BL, /**/ &BR, B0, B1, /**/ B2, FLA_LEFT ); } return FLA_SUCCESS; }
FLA_Error FLA_Trinv_uu_blk_var3( FLA_Obj A, fla_trinv_t* cntl ) { FLA_Obj ATL, ATR, A00, A01, A02, ABL, ABR, A10, A11, A12, A20, A21, A22; dim_t b; FLA_Part_2x2( A, &ATL, &ATR, &ABL, &ABR, 0, 0, FLA_TL ); while ( FLA_Obj_length( ATL ) < FLA_Obj_length( A ) ) { b = FLA_Determine_blocksize( ABR, FLA_BR, FLA_Cntl_blocksize( cntl ) ); FLA_Repart_2x2_to_3x3( ATL, /**/ ATR, &A00, /**/ &A01, &A02, /* ************* */ /* ******************** */ &A10, /**/ &A11, &A12, ABL, /**/ ABR, &A20, /**/ &A21, &A22, b, b, FLA_BR ); /*------------------------------------------------------------*/ // A12 = -triuu( A11 ) \ A12; FLA_Trsm_internal( FLA_LEFT, FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_MINUS_ONE, A11, A12, FLA_Cntl_sub_trsm1( cntl ) ); // A02 = A01 * A12 + A02; FLA_Gemm_internal( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A01, A12, FLA_ONE, A02, FLA_Cntl_sub_gemm( cntl ) ); // A01 = A01 / triuu( A11 ); FLA_Trsm_internal( FLA_RIGHT, FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_ONE, A11, A01, FLA_Cntl_sub_trsm2( cntl ) ); // A11 = inv( A11 ); FLA_Trinv_internal( FLA_UPPER_TRIANGULAR, FLA_NONUNIT_DIAG, A11, FLA_Cntl_sub_trinv( cntl ) ); /*------------------------------------------------------------*/ FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR, A00, A01, /**/ A02, A10, A11, /**/ A12, /* ************** */ /* ****************** */ &ABL, /**/ &ABR, A20, A21, /**/ A22, FLA_TL ); } return FLA_SUCCESS; }
FLA_Error FLA_Chol_u_blk_var1( FLA_Obj A, fla_chol_t* cntl ) { FLA_Obj ATL, ATR, A00, A01, A02, ABL, ABR, A10, A11, A12, A20, A21, A22; dim_t b; int r_val = FLA_SUCCESS; FLA_Part_2x2( A, &ATL, &ATR, &ABL, &ABR, 0, 0, FLA_TL ); while ( FLA_Obj_length( ATL ) < FLA_Obj_length( A ) ){ b = FLA_Determine_blocksize( ABR, FLA_BR, FLA_Cntl_blocksize( cntl ) ); FLA_Repart_2x2_to_3x3( ATL, /**/ ATR, &A00, /**/ &A01, &A02, /* ************* */ /* ******************** */ &A10, /**/ &A11, &A12, ABL, /**/ ABR, &A20, /**/ &A21, &A22, b, b, FLA_BR ); /*------------------------------------------------------------*/ // A01 = inv( triu( A00 )' ) * A01 FLA_Trsm_internal( FLA_LEFT, FLA_UPPER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_ONE, A00, A01, FLA_Cntl_sub_trsm( cntl ) ); // A11 = A11 - A01' * A01 FLA_Herk_internal( FLA_UPPER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_MINUS_ONE, A01, FLA_ONE, A11, FLA_Cntl_sub_herk( cntl ) ); // A11 = chol( A11 ) r_val = FLA_Chol_internal( FLA_UPPER_TRIANGULAR, A11, FLA_Cntl_sub_chol( cntl ) ); if ( r_val != FLA_SUCCESS ) return ( FLA_Obj_length( A00 ) + r_val ); /*------------------------------------------------------------*/ FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR, A00, A01, /**/ A02, A10, A11, /**/ A12, /* ************** */ /* ****************** */ &ABL, /**/ &ABR, A20, A21, /**/ A22, FLA_TL ); } return r_val; }
FLA_Error FLASH_Apply_Q_UT( FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B ) { FLA_Error r_val; dim_t b_alg; // Check parameters. if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING ) FLA_Apply_Q_UT_check( side, trans, direct, storev, A, T, W, B ); // Inspect the length of TTL to get the blocksize used by the QR/LQ // factorization, which will be our inner blocksize for Apply_Q_UT. b_alg = FLASH_Obj_scalar_length_tl( T ); // The traditional (non-incremental) Apply_Q_UT algorithm-by-blocks // requires that the algorithmic blocksize be equal to the storage // blocksize. if ( b_alg != FLASH_Obj_scalar_width_tl( T ) ) { FLA_Print_message( "FLASH_Apply_Q_UT() requires that b_alg == b_store", __FILE__, __LINE__ ); FLA_Abort(); } // Adjust the blocksize of the control tree node for the flat subproblem. if ( FLA_Cntl_blocksize( fla_apqut_cntl_leaf ) != NULL ) FLA_Blocksize_set( FLA_Cntl_blocksize( fla_apqut_cntl_leaf ), b_alg, b_alg, b_alg, b_alg ); // Begin a parallel region. FLASH_Queue_begin(); // Invoke FLA_Apply_Q_UT_internal() with the standard control tree. r_val = FLA_Apply_Q_UT_internal( side, trans, direct, storev, A, T, W, B, flash_apqut_cntl_blas ); // End the parallel region. FLASH_Queue_end(); return r_val; }
FLA_Error FLA_Axpyt_c_blk_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_axpyt_t* cntl ) { FLA_Obj AT, A0, AB, A1, A2; FLA_Obj BT, B0, BB, B1, B2; dim_t b; FLA_Part_2x1( A, &AT, &AB, 0, FLA_BOTTOM ); FLA_Part_2x1( B, &BT, &BB, 0, FLA_BOTTOM ); while ( FLA_Obj_length( AB ) < FLA_Obj_length( A ) ){ b = FLA_Determine_blocksize( AT, FLA_TOP, FLA_Cntl_blocksize( cntl ) ); FLA_Repart_2x1_to_3x1( AT, &A0, &A1, /* ** */ /* ** */ AB, &A2, b, FLA_TOP ); FLA_Repart_2x1_to_3x1( BT, &B0, &B1, /* ** */ /* ** */ BB, &B2, b, FLA_TOP ); /*------------------------------------------------------------*/ FLA_Axpyt_internal( FLA_CONJ_NO_TRANSPOSE, alpha, A1, B1, FLA_Cntl_sub_axpyt( cntl ) ); /*------------------------------------------------------------*/ FLA_Cont_with_3x1_to_2x1( &AT, A0, /* ** */ /* ** */ A1, &AB, A2, FLA_BOTTOM ); FLA_Cont_with_3x1_to_2x1( &BT, B0, /* ** */ /* ** */ B1, &BB, B2, FLA_BOTTOM ); } return FLA_SUCCESS; }
FLA_Error FLA_Copy_blk_var1( FLA_Obj A, FLA_Obj B, fla_copy_t* cntl ) { FLA_Obj AT, A0, AB, A1, A2; FLA_Obj BT, B0, BB, B1, B2; dim_t b; FLA_Part_2x1( A, &AT, &AB, 0, FLA_TOP ); FLA_Part_2x1( B, &BT, &BB, 0, FLA_TOP ); while ( FLA_Obj_length( AT ) < FLA_Obj_length( A ) ){ b = FLA_Determine_blocksize( AB, FLA_BOTTOM, FLA_Cntl_blocksize( cntl ) ); FLA_Repart_2x1_to_3x1( AT, &A0, /* ** */ /* ** */ &A1, AB, &A2, b, FLA_BOTTOM ); FLA_Repart_2x1_to_3x1( BT, &B0, /* ** */ /* ** */ &B1, BB, &B2, b, FLA_BOTTOM ); /*------------------------------------------------------------*/ FLA_Copy_internal( A1, B1, FLA_Cntl_sub_copy( cntl ) ); /*------------------------------------------------------------*/ FLA_Cont_with_3x1_to_2x1( &AT, A0, A1, /* ** */ /* ** */ &AB, A2, FLA_TOP ); FLA_Cont_with_3x1_to_2x1( &BT, B0, B1, /* ** */ /* ** */ &BB, B2, FLA_TOP ); } return FLA_SUCCESS; }
FLA_Error FLA_Gemm_cn_blk_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl ) { FLA_Obj AL, AR, A0, A1, A2; FLA_Obj BT, B0, BB, B1, B2; dim_t b; FLA_Scal_internal( beta, C, FLA_Cntl_sub_scal( cntl ) ); FLA_Part_1x2( A, &AL, &AR, 0, FLA_LEFT ); FLA_Part_2x1( B, &BT, &BB, 0, FLA_TOP ); while ( FLA_Obj_width( AL ) < FLA_Obj_width( A ) ){ b = FLA_Determine_blocksize( AR, FLA_RIGHT, FLA_Cntl_blocksize( cntl ) ); FLA_Repart_1x2_to_1x3( AL, /**/ AR, &A0, /**/ &A1, &A2, b, FLA_RIGHT ); FLA_Repart_2x1_to_3x1( BT, &B0, /* ** */ /* ** */ &B1, BB, &B2, b, FLA_BOTTOM ); /*------------------------------------------------------------*/ /* C = alpha * A1 * B1 + C; */ FLA_Gemm_internal( FLA_CONJ_NO_TRANSPOSE, FLA_NO_TRANSPOSE, alpha, A1, B1, FLA_ONE, C, FLA_Cntl_sub_gemm( cntl ) ); /*------------------------------------------------------------*/ FLA_Cont_with_1x3_to_1x2( &AL, /**/ &AR, A0, A1, /**/ A2, FLA_LEFT ); FLA_Cont_with_3x1_to_2x1( &BT, B0, B1, /* ** */ /* ** */ &BB, B2, FLA_TOP ); } return FLA_SUCCESS; }
FLA_Error FLA_Gemm_hh_blk_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl ) { FLA_Obj AT, A0, AB, A1, A2; FLA_Obj BL, BR, B0, B1, B2; dim_t b; FLA_Scal_internal( beta, C, FLA_Cntl_sub_scal( cntl ) ); FLA_Part_2x1( A, &AT, &AB, 0, FLA_BOTTOM ); FLA_Part_1x2( B, &BL, &BR, 0, FLA_RIGHT ); while ( FLA_Obj_length( AB ) < FLA_Obj_length( A ) ){ b = FLA_Determine_blocksize( AT, FLA_TOP, FLA_Cntl_blocksize( cntl ) ); FLA_Repart_2x1_to_3x1( AT, &A0, &A1, /* ** */ /* ** */ AB, &A2, b, FLA_TOP ); FLA_Repart_1x2_to_1x3( BL, /**/ BR, &B0, &B1, /**/ &B2, b, FLA_LEFT ); /*------------------------------------------------------------*/ /* C = alpha * A1' * B1' + C; */ FLA_Gemm_internal( FLA_CONJ_TRANSPOSE, FLA_CONJ_TRANSPOSE, alpha, A1, B1, FLA_ONE, C, FLA_Cntl_sub_gemm( cntl ) ); /*------------------------------------------------------------*/ FLA_Cont_with_3x1_to_2x1( &AT, A0, /* ** */ /* ** */ A1, &AB, A2, FLA_BOTTOM ); FLA_Cont_with_1x3_to_1x2( &BL, /**/ &BR, B0, /**/ B1, B2, FLA_RIGHT ); } return FLA_SUCCESS; }
FLA_Error FLA_Ttmm_u_blk_var1( FLA_Obj A, fla_ttmm_t* cntl ) { FLA_Obj ATL, ATR, A00, A01, A02, ABL, ABR, A10, A11, A12, A20, A21, A22; dim_t b; FLA_Part_2x2( A, &ATL, &ATR, &ABL, &ABR, 0, 0, FLA_TL ); while ( FLA_Obj_length( ATL ) < FLA_Obj_length( A ) ){ b = FLA_Determine_blocksize( ABR, FLA_BR, FLA_Cntl_blocksize( cntl ) ); FLA_Repart_2x2_to_3x3( ATL, /**/ ATR, &A00, /**/ &A01, &A02, /* ************* */ /* ******************** */ &A10, /**/ &A11, &A12, ABL, /**/ ABR, &A20, /**/ &A21, &A22, b, b, FLA_BR ); /*------------------------------------------------------------*/ // A00 = A00 + A01 * A01' FLA_Herk_internal( FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_ONE, A01, FLA_ONE, A00, FLA_Cntl_sub_herk( cntl ) ); // A01 = A01 * triu( A11 )' FLA_Trmm_internal( FLA_RIGHT, FLA_UPPER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_ONE, A11, A01, FLA_Cntl_sub_trmm( cntl ) ); // A11 = triu( A11 ) * triu( A11 )' FLA_Ttmm_internal( FLA_UPPER_TRIANGULAR, A11, FLA_Cntl_sub_ttmm( cntl ) ); /*------------------------------------------------------------*/ FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR, A00, A01, /**/ A02, A10, A11, /**/ A12, /* ************** */ /* ****************** */ &ABL, /**/ &ABR, A20, A21, /**/ A22, FLA_TL ); } return FLA_SUCCESS; }
FLA_Error FLA_Gemm_hn_blk_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl ) { FLA_Obj AL, AR, A0, A1, A2; FLA_Obj CT, C0, CB, C1, C2; dim_t b; FLA_Part_1x2( A, &AL, &AR, 0, FLA_RIGHT ); FLA_Part_2x1( C, &CT, &CB, 0, FLA_BOTTOM ); while ( FLA_Obj_width( AR ) < FLA_Obj_width( A ) ){ b = FLA_Determine_blocksize( AL, FLA_LEFT, FLA_Cntl_blocksize( cntl ) ); FLA_Repart_1x2_to_1x3( AL, /**/ AR, &A0, &A1, /**/ &A2, b, FLA_LEFT ); FLA_Repart_2x1_to_3x1( CT, &C0, &C1, /* ** */ /* ** */ CB, &C2, b, FLA_TOP ); /*------------------------------------------------------------*/ /* C1 = alpha * A1' * B + C1; */ FLA_Gemm_internal( FLA_CONJ_TRANSPOSE, FLA_NO_TRANSPOSE, alpha, A1, B, beta, C1, FLA_Cntl_sub_gemm( cntl ) ); /*------------------------------------------------------------*/ FLA_Cont_with_1x3_to_1x2( &AL, /**/ &AR, A0, /**/ A1, A2, FLA_RIGHT ); FLA_Cont_with_3x1_to_2x1( &CT, C0, /* ** */ /* ** */ C1, &CB, C2, FLA_BOTTOM ); } return FLA_SUCCESS; }
FLA_Error FLA_Gemm_hh_blk_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl ) { FLA_Obj BT, B0, BB, B1, B2; FLA_Obj CL, CR, C0, C1, C2; dim_t b; FLA_Part_2x1( B, &BT, &BB, 0, FLA_TOP ); FLA_Part_1x2( C, &CL, &CR, 0, FLA_LEFT ); while ( FLA_Obj_length( BT ) < FLA_Obj_length( B ) ){ b = FLA_Determine_blocksize( BB, FLA_BOTTOM, FLA_Cntl_blocksize( cntl ) ); FLA_Repart_2x1_to_3x1( BT, &B0, /* ** */ /* ** */ &B1, BB, &B2, b, FLA_BOTTOM ); FLA_Repart_1x2_to_1x3( CL, /**/ CR, &C0, /**/ &C1, &C2, b, FLA_RIGHT ); /*------------------------------------------------------------*/ /* C1 = alpha * A' * B1' + C1; */ FLA_Gemm_internal( FLA_CONJ_TRANSPOSE, FLA_CONJ_TRANSPOSE, alpha, A, B1, beta, C1, FLA_Cntl_sub_gemm( cntl ) ); /*------------------------------------------------------------*/ FLA_Cont_with_3x1_to_2x1( &BT, B0, B1, /* ** */ /* ** */ &BB, B2, FLA_TOP ); FLA_Cont_with_1x3_to_1x2( &CL, /**/ &CR, C0, C1, /**/ C2, FLA_LEFT ); } return FLA_SUCCESS; }
FLA_Error FLA_Gemv_t_blk_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj x, FLA_Obj beta, FLA_Obj y, fla_gemv_t* cntl ) { FLA_Obj AL, AR, A0, A1, A2; FLA_Obj yT, y0, yB, y1, y2; dim_t b; FLA_Part_1x2( A, &AL, &AR, 0, FLA_RIGHT ); FLA_Part_2x1( y, &yT, &yB, 0, FLA_BOTTOM ); while ( FLA_Obj_width( AR ) < FLA_Obj_width( A ) ){ b = FLA_Determine_blocksize( AL, FLA_LEFT, FLA_Cntl_blocksize( cntl ) ); FLA_Repart_1x2_to_1x3( AL, /**/ AR, &A0, &A1, /**/ &A2, b, FLA_LEFT ); FLA_Repart_2x1_to_3x1( yT, &y0, &y1, /* ** */ /* ** */ yB, &y2, b, FLA_TOP ); /*------------------------------------------------------------*/ /* y1 = alpha * A1' * x + y1 */ FLA_Gemv_internal( FLA_TRANSPOSE, alpha, A1, x, beta, y1, FLA_Cntl_sub_gemv( cntl ) ); /*------------------------------------------------------------*/ FLA_Cont_with_1x3_to_1x2( &AL, /**/ &AR, A0, /**/ A1, A2, FLA_RIGHT ); FLA_Cont_with_3x1_to_2x1( &yT, y0, /* ** */ /* ** */ y1, &yB, y2, FLA_BOTTOM ); } return FLA_SUCCESS; }
FLA_Error FLA_Axpyt_h_blk_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_axpyt_t* cntl ) { FLA_Obj AL, AR, A0, A1, A2; FLA_Obj BT, B0, BB, B1, B2; dim_t b; FLA_Part_1x2( A, &AL, &AR, 0, FLA_LEFT ); FLA_Part_2x1( B, &BT, &BB, 0, FLA_TOP ); while ( FLA_Obj_width( AL ) < FLA_Obj_width( A ) ){ b = FLA_Determine_blocksize( AR, FLA_RIGHT, FLA_Cntl_blocksize( cntl ) ); FLA_Repart_1x2_to_1x3( AL, /**/ AR, &A0, /**/ &A1, &A2, b, FLA_RIGHT ); FLA_Repart_2x1_to_3x1( BT, &B0, /* ** */ /* ** */ &B1, BB, &B2, b, FLA_BOTTOM ); /*------------------------------------------------------------*/ FLA_Axpyt_internal( FLA_CONJ_TRANSPOSE, alpha, A1, B1, FLA_Cntl_sub_axpyt( cntl ) ); /*------------------------------------------------------------*/ FLA_Cont_with_1x3_to_1x2( &AL, /**/ &AR, A0, A1, /**/ A2, FLA_LEFT ); FLA_Cont_with_3x1_to_2x1( &BT, B0, B1, /* ** */ /* ** */ &BB, B2, FLA_TOP ); } return FLA_SUCCESS; }
FLA_Error FLA_Swap_t_blk_var2( FLA_Obj A, FLA_Obj B, fla_swap_t* cntl ) { FLA_Obj AT, A0, AB, A1, A2; FLA_Obj BL, BR, B0, B1, B2; dim_t b; FLA_Part_2x1( A, &AT, &AB, 0, FLA_TOP ); FLA_Part_1x2( B, &BL, &BR, 0, FLA_LEFT ); while ( FLA_Obj_length( AT ) < FLA_Obj_length( A ) ){ b = FLA_Determine_blocksize( AB, FLA_BOTTOM, FLA_Cntl_blocksize( cntl ) ); FLA_Repart_2x1_to_3x1( AT, &A0, /* ** */ /* ** */ &A1, AB, &A2, b, FLA_BOTTOM ); FLA_Repart_1x2_to_1x3( BL, /**/ BR, &B0, /**/ &B1, &B2, b, FLA_RIGHT ); /*------------------------------------------------------------*/ FLA_Swapt_external( FLA_TRANSPOSE, A1, B1 ); /*------------------------------------------------------------*/ FLA_Cont_with_3x1_to_2x1( &AT, A0, A1, /* ** */ /* ** */ &AB, A2, FLA_TOP ); FLA_Cont_with_1x3_to_1x2( &BL, /**/ &BR, B0, B1, /**/ B2, FLA_LEFT ); } return FLA_SUCCESS; }
FLA_Error FLA_Gemm_nt_blk_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl ) { FLA_Obj AL, AR, A0, A1, A2; FLA_Obj BL, BR, B0, B1, B2; dim_t b; FLA_Scal_internal( beta, C, FLA_Cntl_sub_scal( cntl ) ); FLA_Part_1x2( A, &AL, &AR, 0, FLA_RIGHT ); FLA_Part_1x2( B, &BL, &BR, 0, FLA_RIGHT ); while ( FLA_Obj_width( AR ) < FLA_Obj_width( A ) ){ b = FLA_Determine_blocksize( AL, FLA_LEFT, FLA_Cntl_blocksize( cntl ) ); FLA_Repart_1x2_to_1x3( AL, /**/ AR, &A0, &A1, /**/ &A2, b, FLA_LEFT ); FLA_Repart_1x2_to_1x3( BL, /**/ BR, &B0, &B1, /**/ &B2, b, FLA_LEFT ); /*------------------------------------------------------------*/ /* C = alpha * A1 * B1' + C; */ FLA_Gemm_internal( FLA_NO_TRANSPOSE, FLA_TRANSPOSE, alpha, A1, B1, FLA_ONE, C, FLA_Cntl_sub_gemm( cntl ) ); /*------------------------------------------------------------*/ FLA_Cont_with_1x3_to_1x2( &AL, /**/ &AR, A0, /**/ A1, A2, FLA_RIGHT ); FLA_Cont_with_1x3_to_1x2( &BL, /**/ &BR, B0, /**/ B1, B2, FLA_RIGHT ); } return FLA_SUCCESS; }
FLA_Error FLA_Her2k_ln_blk_var9( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl ) { FLA_Obj AL, AR, A0, A1, A2; FLA_Obj BL, BR, B0, B1, B2; dim_t b; FLA_Scalr_internal( FLA_LOWER_TRIANGULAR, beta, C, FLA_Cntl_sub_scalr( cntl ) ); FLA_Part_1x2( A, &AL, &AR, 0, FLA_LEFT ); FLA_Part_1x2( B, &BL, &BR, 0, FLA_LEFT ); while ( FLA_Obj_width( AL ) < FLA_Obj_width( A ) ){ b = FLA_Determine_blocksize( AR, FLA_RIGHT, FLA_Cntl_blocksize( cntl ) ); FLA_Repart_1x2_to_1x3( AL, /**/ AR, &A0, /**/ &A1, &A2, b, FLA_RIGHT ); FLA_Repart_1x2_to_1x3( BL, /**/ BR, &B0, /**/ &B1, &B2, b, FLA_RIGHT ); /*------------------------------------------------------------*/ /* C = C + A1 * B1' + B1 * A1' */ FLA_Her2k_internal( FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, alpha, A1, B1, FLA_ONE, C, FLA_Cntl_sub_her2k( cntl ) ); /*------------------------------------------------------------*/ FLA_Cont_with_1x3_to_1x2( &AL, /**/ &AR, A0, A1, /**/ A2, FLA_LEFT ); FLA_Cont_with_1x3_to_1x2( &BL, /**/ &BR, B0, B1, /**/ B2, FLA_LEFT ); } return FLA_SUCCESS; }
FLA_Error FLA_Scalr_u_blk_var4( FLA_Obj alpha, FLA_Obj A, fla_scalr_t* cntl ) { FLA_Obj ATL, ATR, A00, A01, A02, ABL, ABR, A10, A11, A12, A20, A21, A22; dim_t b; FLA_Part_2x2( A, &ATL, &ATR, &ABL, &ABR, 0, 0, FLA_BR ); while ( FLA_Obj_min_dim( ATL ) > 0 ){ b = FLA_Determine_blocksize( ATL, FLA_TL, FLA_Cntl_blocksize( cntl ) ); FLA_Repart_2x2_to_3x3( ATL, /**/ ATR, &A00, &A01, /**/ &A02, &A10, &A11, /**/ &A12, /* ************* */ /* ******************** */ ABL, /**/ ABR, &A20, &A21, /**/ &A22, b, b, FLA_TL ); /*------------------------------------------------------------*/ // A11 = alpha * triu( A11 ); FLA_Scalr_internal( FLA_UPPER_TRIANGULAR, alpha, A11, FLA_Cntl_sub_scalr( cntl ) ); // A01 = alpha * A01; FLA_Scal_internal( alpha, A01, FLA_Cntl_sub_scal( cntl ) ); /*------------------------------------------------------------*/ FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR, A00, /**/ A01, A02, /* ************** */ /* ****************** */ A10, /**/ A11, A12, &ABL, /**/ &ABR, A20, /**/ A21, A22, FLA_BR ); } return FLA_SUCCESS; }
FLA_Error FLA_Gemm_tc_blk_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl ) { FLA_Obj BL, BR, B0, B1, B2; FLA_Obj CL, CR, C0, C1, C2; dim_t b; FLA_Part_1x2( B, &BL, &BR, 0, FLA_RIGHT ); FLA_Part_1x2( C, &CL, &CR, 0, FLA_RIGHT ); while ( FLA_Obj_width( BR ) < FLA_Obj_width( B ) ){ b = FLA_Determine_blocksize( BL, FLA_LEFT, FLA_Cntl_blocksize( cntl ) ); FLA_Repart_1x2_to_1x3( BL, /**/ BR, &B0, &B1, /**/ &B2, b, FLA_LEFT ); FLA_Repart_1x2_to_1x3( CL, /**/ CR, &C0, &C1, /**/ &C2, b, FLA_LEFT ); /*------------------------------------------------------------*/ /* C1 = alpha * A' * B1 + C1; */ FLA_Gemm_internal( FLA_TRANSPOSE, FLA_CONJ_NO_TRANSPOSE, alpha, A, B1, beta, C1, FLA_Cntl_sub_gemm( cntl ) ); /*------------------------------------------------------------*/ FLA_Cont_with_1x3_to_1x2( &BL, /**/ &BR, B0, /**/ B1, B2, FLA_RIGHT ); FLA_Cont_with_1x3_to_1x2( &CL, /**/ &CR, C0, /**/ C1, C2, FLA_RIGHT ); } return FLA_SUCCESS; }
FLA_Error FLA_Hemm_lu_blk_var9( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl ) { FLA_Obj BL, BR, B0, B1, B2; FLA_Obj CL, CR, C0, C1, C2; dim_t b; FLA_Part_1x2( B, &BL, &BR, 0, FLA_LEFT ); FLA_Part_1x2( C, &CL, &CR, 0, FLA_LEFT ); while ( FLA_Obj_width( BL ) < FLA_Obj_width( B ) ){ b = FLA_Determine_blocksize( BR, FLA_RIGHT, FLA_Cntl_blocksize( cntl ) ); FLA_Repart_1x2_to_1x3( BL, /**/ BR, &B0, /**/ &B1, &B2, b, FLA_RIGHT ); FLA_Repart_1x2_to_1x3( CL, /**/ CR, &C0, /**/ &C1, &C2, b, FLA_RIGHT ); /*------------------------------------------------------------*/ /* C1 = C1 + A * B1 */ FLA_Hemm_internal( FLA_LEFT, FLA_UPPER_TRIANGULAR, alpha, A, B1, beta, C1, FLA_Cntl_sub_hemm( cntl ) ); /*------------------------------------------------------------*/ FLA_Cont_with_1x3_to_1x2( &BL, /**/ &BR, B0, B1, /**/ B2, FLA_LEFT ); FLA_Cont_with_1x3_to_1x2( &CL, /**/ &CR, C0, C1, /**/ C2, FLA_LEFT ); } return FLA_SUCCESS; }
FLA_Error FLA_Syrk_ut_blk_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_syrk_t* cntl ) { FLA_Obj AT, A0, AB, A1, A2; dim_t b; FLA_Scalr_internal( FLA_UPPER_TRIANGULAR, beta, C, FLA_Cntl_sub_scalr( cntl ) ); FLA_Part_2x1( A, &AT, &AB, 0, FLA_TOP ); while ( FLA_Obj_length( AT ) < FLA_Obj_length( A ) ){ b = FLA_Determine_blocksize( AB, FLA_BOTTOM, FLA_Cntl_blocksize( cntl ) ); FLA_Repart_2x1_to_3x1( AT, &A0, /* ** */ /* ** */ &A1, AB, &A2, b, FLA_BOTTOM ); /*------------------------------------------------------------*/ /* C = C + A1' * A1 */ FLA_Syrk_internal( FLA_UPPER_TRIANGULAR, FLA_TRANSPOSE, alpha, A1, FLA_ONE, C, FLA_Cntl_sub_syrk( cntl ) ); /*------------------------------------------------------------*/ FLA_Cont_with_3x1_to_2x1( &AT, A0, A1, /* ** */ /* ** */ &AB, A2, FLA_TOP ); } return FLA_SUCCESS; }
FLA_Error FLA_Transpose_blk_var2( FLA_Obj A, fla_tpose_t* cntl ) { FLA_Obj ATL, ATR, A00, A01, A02, ABL, ABR, A10, A11, A12, A20, A21, A22; dim_t b; FLA_Part_2x2( A, &ATL, &ATR, &ABL, &ABR, 0, 0, FLA_TL ); while ( FLA_Obj_length( ATL ) < FLA_Obj_length( A ) ){ b = FLA_Determine_blocksize( ABR, FLA_BR, FLA_Cntl_blocksize( cntl ) ); FLA_Repart_2x2_to_3x3( ATL, /**/ ATR, &A00, /**/ &A01, &A02, /* ************* */ /* ******************** */ &A10, /**/ &A11, &A12, ABL, /**/ ABR, &A20, /**/ &A21, &A22, b, b, FLA_BR ); /*------------------------------------------------------------*/ FLA_Transpose_unb_var2( A11 ); FLA_Swap_t_blk_var2( A21, A12, FLA_Cntl_sub_swap( cntl ) ); /*------------------------------------------------------------*/ FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR, A00, A01, /**/ A02, A10, A11, /**/ A12, /* ************** */ /* ****************** */ &ABL, /**/ &ABR, A20, A21, /**/ A22, FLA_TL ); } return FLA_SUCCESS; }
FLA_Error FLA_Apply_Q_UT_lhfr_blk_var2( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl ) { FLA_Obj BL, BR, B0, B1, B2; FLA_Obj WL, WR, W0, W1, W2; dim_t b; FLA_Part_1x2( B, &BL, &BR, 0, FLA_LEFT ); FLA_Part_1x2( W, &WL, &WR, 0, FLA_LEFT ); while ( FLA_Obj_width( BL ) < FLA_Obj_width( B ) ){ b = FLA_Determine_blocksize( BR, FLA_RIGHT, FLA_Cntl_blocksize( cntl ) ); FLA_Repart_1x2_to_1x3( BL, /**/ BR, &B0, /**/ &B1, &B2, b, FLA_RIGHT ); FLA_Repart_1x2_to_1x3( WL, /**/ WR, &W0, /**/ &W1, &W2, b, FLA_RIGHT ); /*------------------------------------------------------------*/ // B1 = Q' * B1; FLA_Apply_Q_UT_internal( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_FORWARD, FLA_ROWWISE, A, T, W1, B1, FLA_Cntl_sub_apqut( cntl ) ); /*------------------------------------------------------------*/ FLA_Cont_with_1x3_to_1x2( &BL, /**/ &BR, B0, B1, /**/ B2, FLA_LEFT ); FLA_Cont_with_1x3_to_1x2( &WL, /**/ &WR, W0, W1, /**/ W2, FLA_LEFT ); } return FLA_SUCCESS; }
FLA_Error FLA_Copyt_c_blk_var3( FLA_Obj A, FLA_Obj B, fla_copyt_t* cntl ) { FLA_Obj AL, AR, A0, A1, A2; FLA_Obj BL, BR, B0, B1, B2; dim_t b; FLA_Part_1x2( A, &AL, &AR, 0, FLA_LEFT ); FLA_Part_1x2( B, &BL, &BR, 0, FLA_LEFT ); while ( FLA_Obj_width( AL ) < FLA_Obj_width( A ) ){ b = FLA_Determine_blocksize( AR, FLA_RIGHT, FLA_Cntl_blocksize( cntl ) ); FLA_Repart_1x2_to_1x3( AL, /**/ AR, &A0, /**/ &A1, &A2, b, FLA_RIGHT ); FLA_Repart_1x2_to_1x3( BL, /**/ BR, &B0, /**/ &B1, &B2, b, FLA_RIGHT ); /*------------------------------------------------------------*/ FLA_Copyt_internal( FLA_CONJ_NO_TRANSPOSE, A1, B1, FLA_Cntl_sub_copyt( cntl ) ); /*------------------------------------------------------------*/ FLA_Cont_with_1x3_to_1x2( &AL, /**/ &AR, A0, A1, /**/ A2, FLA_LEFT ); FLA_Cont_with_1x3_to_1x2( &BL, /**/ &BR, B0, B1, /**/ B2, FLA_LEFT ); } return FLA_SUCCESS; }
FLA_Error FLA_Trsm_ruc_blk_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl ) { FLA_Obj BT, B0, BB, B1, B2; dim_t b; FLA_Part_2x1( B, &BT, &BB, 0, FLA_TOP ); while ( FLA_Obj_length( BT ) < FLA_Obj_length( B ) ) { b = FLA_Determine_blocksize( BB, FLA_BOTTOM, FLA_Cntl_blocksize( cntl ) ); FLA_Repart_2x1_to_3x1( BT, &B0, /* ** */ /* ** */ &B1, BB, &B2, b, FLA_BOTTOM ); /*------------------------------------------------------------*/ /* B1 = B1 * triu( A ); */ FLA_Trsm_internal( FLA_RIGHT, FLA_UPPER_TRIANGULAR, FLA_CONJ_NO_TRANSPOSE, diagA, alpha, A, B1, FLA_Cntl_sub_trsm( cntl ) ); /*------------------------------------------------------------*/ FLA_Cont_with_3x1_to_2x1( &BT, B0, B1, /* ** */ /* ** */ &BB, B2, FLA_TOP ); } return FLA_SUCCESS; }
FLA_Error FLA_Syrk_ln_blk_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_syrk_t* cntl ) { FLA_Obj AT, A0, AB, A1, A2; FLA_Obj CTL, CTR, C00, C01, C02, CBL, CBR, C10, C11, C12, C20, C21, C22; dim_t b; FLA_Part_2x1( A, &AT, &AB, 0, FLA_TOP ); FLA_Part_2x2( C, &CTL, &CTR, &CBL, &CBR, 0, 0, FLA_TL ); while ( FLA_Obj_length( AT ) < FLA_Obj_length( A ) ){ b = FLA_Determine_blocksize( AB, FLA_BOTTOM, FLA_Cntl_blocksize( cntl ) ); FLA_Repart_2x1_to_3x1( AT, &A0, /* ** */ /* ** */ &A1, AB, &A2, b, FLA_BOTTOM ); FLA_Repart_2x2_to_3x3( CTL, /**/ CTR, &C00, /**/ &C01, &C02, /* ************* */ /* ******************** */ &C10, /**/ &C11, &C12, CBL, /**/ CBR, &C20, /**/ &C21, &C22, b, b, FLA_BR ); /*------------------------------------------------------------*/ /* C10 = C10 + A1 * A0' */ FLA_Gemm_internal( FLA_NO_TRANSPOSE, FLA_TRANSPOSE, alpha, A1, A0, beta, C10, FLA_Cntl_sub_gemm( cntl ) ); /* C11 = C11 + A1 * A1' */ FLA_Syrk_internal( FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, alpha, A1, beta, C11, FLA_Cntl_sub_syrk( cntl ) ); /*------------------------------------------------------------*/ FLA_Cont_with_3x1_to_2x1( &AT, A0, A1, /* ** */ /* ** */ &AB, A2, FLA_TOP ); FLA_Cont_with_3x3_to_2x2( &CTL, /**/ &CTR, C00, C01, /**/ C02, C10, C11, /**/ C12, /* ************** */ /* ****************** */ &CBL, /**/ &CBR, C20, C21, /**/ C22, FLA_TL ); } return FLA_SUCCESS; }
FLA_Error FLA_Eig_gest_nl_blk_var1( FLA_Obj A, FLA_Obj Y, FLA_Obj B, fla_eig_gest_t* cntl ) { FLA_Obj ATL, ATR, A00, A01, A02, ABL, ABR, A10, A11, A12, A20, A21, A22; FLA_Obj BTL, BTR, B00, B01, B02, BBL, BBR, B10, B11, B12, B20, B21, B22; FLA_Obj YT, Y01, YB, Y11, Y21; FLA_Obj Y21_l, Y21_r; dim_t b; FLA_Part_2x2( A, &ATL, &ATR, &ABL, &ABR, 0, 0, FLA_TL ); FLA_Part_2x2( B, &BTL, &BTR, &BBL, &BBR, 0, 0, FLA_TL ); FLA_Part_2x1( Y, &YT, &YB, 0, FLA_TOP ); while ( FLA_Obj_length( ATL ) < FLA_Obj_length( A ) ){ b = FLA_Determine_blocksize( ABR, FLA_BR, FLA_Cntl_blocksize( cntl ) ); FLA_Repart_2x2_to_3x3( ATL, /**/ ATR, &A00, /**/ &A01, &A02, /* ************* */ /* ******************** */ &A10, /**/ &A11, &A12, ABL, /**/ ABR, &A20, /**/ &A21, &A22, b, b, FLA_BR ); FLA_Repart_2x2_to_3x3( BTL, /**/ BTR, &B00, /**/ &B01, &B02, /* ************* */ /* ******************** */ &B10, /**/ &B11, &B12, BBL, /**/ BBR, &B20, /**/ &B21, &B22, b, b, FLA_BR ); FLA_Repart_2x1_to_3x1( YT, &Y01, /* ** */ /* *** */ &Y11, YB, &Y21, b, FLA_BOTTOM ); /*------------------------------------------------------------*/ FLA_Part_1x2( Y21, &Y21_l, &Y21_r, b, FLA_LEFT ); // Y21 = A22 * B21; FLA_Hemm_internal( FLA_LEFT, FLA_LOWER_TRIANGULAR, FLA_ONE, A22, B21, FLA_ZERO, Y21_l, FLA_Cntl_sub_hemm( cntl ) ); // A21 = A21 * tril( B11 ); FLA_Trmm_internal( FLA_RIGHT, FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_ONE, B11, A21, FLA_Cntl_sub_trmm1( cntl ) ); // A21 = A21 + 1/2 * Y21; FLA_Axpy_internal( FLA_ONE_HALF, Y21_l, A21, FLA_Cntl_sub_axpy1( cntl ) ); // A11 = tril( B11 )' * A11 * tril( B11 ); FLA_Eig_gest_internal( FLA_NO_INVERSE, FLA_LOWER_TRIANGULAR, A11, Y11, B11, FLA_Cntl_sub_eig_gest( cntl ) ); // A11 = A11 + A21' * B21 + B21' * A21; FLA_Her2k_internal( FLA_LOWER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_ONE, A21, B21, FLA_ONE, A11, FLA_Cntl_sub_her2k( cntl ) ); // A21 = A21 + 1/2 * Y21; FLA_Axpy_internal( FLA_ONE_HALF, Y21_l, A21, FLA_Cntl_sub_axpy2( cntl ) ); // A21 = tril( B22 )' * A21; FLA_Trmm_internal( FLA_LEFT, FLA_LOWER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_ONE, B22, A21, FLA_Cntl_sub_trmm2( cntl ) ); /*------------------------------------------------------------*/ FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR, A00, A01, /**/ A02, A10, A11, /**/ A12, /* ************** */ /* ****************** */ &ABL, /**/ &ABR, A20, A21, /**/ A22, FLA_TL ); FLA_Cont_with_3x3_to_2x2( &BTL, /**/ &BTR, B00, B01, /**/ B02, B10, B11, /**/ B12, /* ************** */ /* ****************** */ &BBL, /**/ &BBR, B20, B21, /**/ B22, FLA_TL ); FLA_Cont_with_3x1_to_2x1( &YT, Y01, Y11, /* ** */ /* *** */ &YB, Y21, FLA_TOP ); } return FLA_SUCCESS; }
FLA_Error FLA_Her2k_uh_blk_var7( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl ) { FLA_Obj AL, AR, A0, A1, A2; FLA_Obj BL, BR, B0, B1, B2; FLA_Obj CTL, CTR, C00, C01, C02, CBL, CBR, C10, C11, C12, C20, C21, C22; dim_t b; FLA_Scalr_internal( FLA_UPPER_TRIANGULAR, beta, C, FLA_Cntl_sub_scalr( cntl ) ); FLA_Part_1x2( A, &AL, &AR, 0, FLA_RIGHT ); FLA_Part_1x2( B, &BL, &BR, 0, FLA_RIGHT ); FLA_Part_2x2( C, &CTL, &CTR, &CBL, &CBR, 0, 0, FLA_BR ); while ( FLA_Obj_width( AR ) < FLA_Obj_width( A ) ){ b = FLA_Determine_blocksize( AL, FLA_LEFT, FLA_Cntl_blocksize( cntl ) ); FLA_Repart_1x2_to_1x3( AL, /**/ AR, &A0, &A1, /**/ &A2, b, FLA_LEFT ); FLA_Repart_1x2_to_1x3( BL, /**/ BR, &B0, &B1, /**/ &B2, b, FLA_LEFT ); FLA_Repart_2x2_to_3x3( CTL, /**/ CTR, &C00, &C01, /**/ &C02, &C10, &C11, /**/ &C12, /* ************* */ /* ******************** */ CBL, /**/ CBR, &C20, &C21, /**/ &C22, b, b, FLA_TL ); /*------------------------------------------------------------*/ /* C01 = C01 + B0' * A1 */ FLA_Gemm_internal( FLA_CONJ_TRANSPOSE, FLA_NO_TRANSPOSE, alpha, B0, A1, FLA_ONE, C01, FLA_Cntl_sub_gemm1( cntl ) ); /* C12 = C12 + A1' * B2 */ FLA_Gemm_internal( FLA_CONJ_TRANSPOSE, FLA_NO_TRANSPOSE, alpha, A1, B2, FLA_ONE, C12, FLA_Cntl_sub_gemm2( cntl ) ); /* C11 = C11 + A1' * B1 + B1' * A1 */ FLA_Her2k_internal( FLA_UPPER_TRIANGULAR, FLA_CONJ_TRANSPOSE, alpha, A1, B1, FLA_ONE, C11, FLA_Cntl_sub_her2k( cntl ) ); /*------------------------------------------------------------*/ FLA_Cont_with_1x3_to_1x2( &AL, /**/ &AR, A0, /**/ A1, A2, FLA_RIGHT ); FLA_Cont_with_1x3_to_1x2( &BL, /**/ &BR, B0, /**/ B1, B2, FLA_RIGHT ); FLA_Cont_with_3x3_to_2x2( &CTL, /**/ &CTR, C00, /**/ C01, C02, /* ************** */ /* ****************** */ C10, /**/ C11, C12, &CBL, /**/ &CBR, C20, /**/ C21, C22, FLA_BR ); } return FLA_SUCCESS; }