/*
 * Symm_ru_unb_var4 -- unblocked symmetric matrix-matrix multiply, variant 4.
 *
 * A is swept along its diagonal (top-left to bottom-right) while B and C are
 * swept one column at a time (left to right).  Each step reads only alpha11
 * and a12t from A -- the diagonal entry and the part of its row to the right --
 * so the strictly lower triangle of A is never referenced.
 *
 * Per iteration:
 *   c1 := alpha11 * b1 + c1
 *   c1 := B2 * a12t'   + c1
 *   C2 := b1 * a12t    + C2
 *
 * Over the whole sweep this accumulates C := B * A + C for a symmetric A
 * whose upper triangle holds the data.
 *
 * Parameters:
 *   A  matrix swept along its diagonal (read only).
 *   B  matrix swept by columns (read only).
 *   C  matrix swept by columns; updated in place.
 *
 * Returns FLA_SUCCESS.
 */
int Symm_ru_unb_var4( FLA_Obj A, FLA_Obj B, FLA_Obj C )
{
  FLA_Obj ATL,   ATR,      A00,  a01,     A02,
          ABL,   ABR,      a10t, alpha11, a12t,
                           A20,  a21,     A22;

  FLA_Obj BL,    BR,       B0,  b1,  B2;

  FLA_Obj CL,    CR,       C0,  c1,  C2;

  FLA_Part_2x2( A,    &ATL, &ATR,
                      &ABL, &ABR,     0, 0, FLA_TL );

  FLA_Part_1x2( B,    &BL,  &BR,      0, FLA_LEFT );

  FLA_Part_1x2( C,    &CL,  &CR,      0, FLA_LEFT );

  while ( FLA_Obj_length( ATL ) < FLA_Obj_length( A ) ){

    FLA_Repart_2x2_to_3x3( ATL, /**/ ATR,       &A00,  /**/ &a01,     &A02,
                        /* ************* */   /* ************************** */
                                                &a10t, /**/ &alpha11, &a12t,
                           ABL, /**/ ABR,       &A20,  /**/ &a21,     &A22,
                           1, 1, FLA_BR );

    FLA_Repart_1x2_to_1x3( BL,  /**/ BR,        &B0, /**/ &b1, &B2,
                           1, FLA_RIGHT );

    FLA_Repart_1x2_to_1x3( CL,  /**/ CR,        &C0, /**/ &c1, &C2,
                           1, FLA_RIGHT );

    /*------------------------------------------------------------*/

    /* c1 = (b1 * alpha11) + c1 */
    FLA_Axpy( alpha11, b1, c1 );

    /* c1 = (B2 * a12t') + c1 */
    FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, B2, a12t, FLA_ONE, c1 );

    /* C2 = (b1 * a12t) + C2
     * The column vector b1 is the x operand and the row a12t the y operand,
     * matching the other FLA_Ger calls in this file; the update must land in
     * C2 (the columns still to be visited), not in C0. */
    FLA_Ger( FLA_ONE, b1, a12t, C2 );

    /*------------------------------------------------------------*/

    FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR,       A00,  a01,     /**/ A02,
                                                     a10t, alpha11, /**/ a12t,
                            /* ************** */  /* ************************ */
                              &ABL, /**/ &ABR,       A20,  a21,     /**/ A22,
                              FLA_TL );

    FLA_Cont_with_1x3_to_1x2( &BL,  /**/ &BR,        B0, b1, /**/ B2,
                              FLA_LEFT );

    FLA_Cont_with_1x3_to_1x2( &CL,  /**/ &CR,        C0, c1, /**/ C2,
                              FLA_LEFT );
  }

  return FLA_SUCCESS;
}
/*
 * Trsm_unb_var2 -- unblocked triangular solve with multiple right-hand
 * sides, variant 2.
 *
 * L is swept along its diagonal (top-left to bottom-right) while B is swept
 * one row at a time (top to bottom).  Each step reads only lambda11 and l21
 * from L, i.e. the diagonal entry and the column below it.
 *
 * Per iteration:
 *   b1t := b1t / lambda11
 *   B2  := B2 - l21 * b1t
 *
 * This is a forward-substitution sweep: on return B has been overwritten
 * with the X satisfying L * X = B for the lower triangle of L.
 *
 * Parameters:
 *   L  triangular matrix (read only).
 *   B  right-hand sides; overwritten in place.
 *
 * Returns FLA_SUCCESS.
 */
int Trsm_unb_var2( FLA_Obj L, FLA_Obj B )
{
  FLA_Obj LTL,   LTR,      L00,  l01,      L02,
          LBL,   LBR,      l10t, lambda11, l12t,
                           L20,  l21,      L22;

  FLA_Obj BT,              B0,
          BB,              b1t,
                           B2;

  FLA_Part_2x2( L,    &LTL, &LTR,
                      &LBL, &LBR,     0, 0, FLA_TL );

  FLA_Part_2x1( B,    &BT,
                      &BB,            0, FLA_TOP );

  while ( FLA_Obj_length( LTL ) < FLA_Obj_length( L ) ){

    FLA_Repart_2x2_to_3x3( LTL, /**/ LTR,       &L00,  /**/ &l01,      &L02,
                        /* ************* */   /* *************************** */
                                                &l10t, /**/ &lambda11, &l12t,
                           LBL, /**/ LBR,       &L20,  /**/ &l21,      &L22,
                           1, 1, FLA_BR );

    FLA_Repart_2x1_to_3x1( BT,                &B0,
                        /* ** */            /* *** */
                                              &b1t,
                           BB,                &B2,        1, FLA_BOTTOM );

    /*------------------------------------------------------------*/

    /* b1t = b1t / lambda11 */
    FLA_Inv_scal( lambda11, b1t );

    /* B2 = B2 - l21 * b1t */
    FLA_Ger( FLA_MINUS_ONE, l21, b1t, B2 );

    /*------------------------------------------------------------*/

    FLA_Cont_with_3x3_to_2x2( &LTL, /**/ &LTR,       L00,  l01,      /**/ L02,
                                                     l10t, lambda11, /**/ l12t,
                            /* ************** */  /* ************************* */
                              &LBL, /**/ &LBR,       L20,  l21,      /**/ L22,
                              FLA_TL );

    FLA_Cont_with_3x1_to_2x1( &BT,                B0,
                                                  b1t,
                            /* ** */           /* *** */
                              &BB,                B2,     FLA_TOP );
  }

  return FLA_SUCCESS;
}
/*
 * Gemm_unb_var1 -- unblocked general matrix-matrix multiply, variant 1.
 *
 * Walks the columns of A (left to right) in lock step with the rows of B
 * (top to bottom) and, for each column/row pair, applies one rank-1 update
 * to the whole of C:
 *
 *   C := a1 * b1t + C
 *
 * Parameters:
 *   A  left operand, consumed column by column (read only).
 *   B  right operand, consumed row by row (read only).
 *   C  accumulator; updated in place.
 *
 * Returns FLA_SUCCESS.
 */
int Gemm_unb_var1( FLA_Obj A, FLA_Obj B, FLA_Obj C )
{
  /* Column views of A: already processed | current column | remaining. */
  FLA_Obj ALeft, ARight;
  FLA_Obj A0, a1, A2;

  /* Row views of B: already processed / current row / remaining. */
  FLA_Obj BTop, BBot;
  FLA_Obj B0, b1t, B2;

  /* Start with empty "processed" parts on the left of A and top of B. */
  FLA_Part_1x2( A, &ALeft, &ARight, 0, FLA_LEFT );
  FLA_Part_2x1( B, &BTop,
                   &BBot,           0, FLA_TOP );

  while ( FLA_Obj_width( ALeft ) < FLA_Obj_width( A ) )
  {
    /* Expose the next column of A and the next row of B. */
    FLA_Repart_1x2_to_1x3( ALeft, /**/ ARight,
                           &A0, /**/ &a1, &A2,
                           1, FLA_RIGHT );

    FLA_Repart_2x1_to_3x1( BTop,      &B0,
                        /* ** */    /* *** */
                                      &b1t,
                           BBot,      &B2,
                           1, FLA_BOTTOM );

    /*------------------------------------------------------------*/

    /* C = a1 * b1t + C */
    FLA_Ger( FLA_ONE, a1, b1t, C );

    /*------------------------------------------------------------*/

    /* Fold the exposed column/row into the processed parts. */
    FLA_Cont_with_1x3_to_1x2( &ALeft, /**/ &ARight,
                              A0, a1, /**/ A2,
                              FLA_LEFT );

    FLA_Cont_with_3x1_to_2x1( &BTop,     B0,
                                         b1t,
                            /* ** */   /* *** */
                              &BBot,     B2,
                              FLA_TOP );
  }

  return FLA_SUCCESS;
}
/*
 * Symm_unb_var7 -- unblocked symmetric matrix-matrix multiply, variant 7.
 *
 * A is walked along its diagonal (top-left to bottom-right) while B and C
 * are walked one row at a time (top to bottom).  From A, each step reads
 * a10t, alpha11 and a21: the row left of the diagonal entry, the diagonal
 * entry itself, and the column below it.
 *
 * Per iteration, the current row b1t of B contributes to all of C:
 *   C0  := C0  + a10t * b1t    (a10t supplied as the x vector of FLA_Ger;
 *                               presumably used as a column -- confirm)
 *   c1t := c1t + alpha11 * b1t
 *   C2  := C2  + a21 * b1t
 *
 * Parameters:
 *   A  matrix walked along its diagonal (read only).
 *   B  matrix walked by rows (read only).
 *   C  matrix walked by rows; updated in place.
 *
 * Returns FLA_SUCCESS.
 */
int Symm_unb_var7( FLA_Obj A, FLA_Obj B, FLA_Obj C )
{
  /* Quadrants of A and the 3x3 refinement exposed in each step. */
  FLA_Obj ATL, ATR,
          ABL, ABR;
  FLA_Obj A00,  a01,     A02,
          a10t, alpha11, a12t,
          A20,  a21,     A22;

  /* Row partitions of B and C. */
  FLA_Obj BTop, BBot, B0, b1t, B2;
  FLA_Obj CTop, CBot, C0, c1t, C2;

  FLA_Part_2x2( A, &ATL, &ATR,
                   &ABL, &ABR,  0, 0, FLA_TL );
  FLA_Part_2x1( B, &BTop,
                   &BBot,       0, FLA_TOP );
  FLA_Part_2x1( C, &CTop,
                   &CBot,       0, FLA_TOP );

  while ( FLA_Obj_length( ATL ) < FLA_Obj_length( A ) )
  {
    /* Expose the next diagonal element of A and the next rows of B and C. */
    FLA_Repart_2x2_to_3x3( ATL, /**/ ATR,      &A00,  /**/ &a01,     &A02,
                        /* ************* */  /* ************************** */
                                               &a10t, /**/ &alpha11, &a12t,
                           ABL, /**/ ABR,      &A20,  /**/ &a21,     &A22,
                           1, 1, FLA_BR );

    FLA_Repart_2x1_to_3x1( BTop,      &B0,
                        /* ** */    /* *** */
                                      &b1t,
                           BBot,      &B2,
                           1, FLA_BOTTOM );

    FLA_Repart_2x1_to_3x1( CTop,      &C0,
                        /* ** */    /* *** */
                                      &c1t,
                           CBot,      &C2,
                           1, FLA_BOTTOM );

    /*------------------------------------------------------------*/

    /* C0 = C0 + a10t * b1t */
    FLA_Ger( FLA_ONE, a10t, b1t, C0 );

    /* c1t = c1t + alpha11 * b1t */
    FLA_Axpy( alpha11, b1t, c1t );

    /* C2 = C2 + a21 * b1t */
    FLA_Ger( FLA_ONE, a21, b1t, C2 );

    /*------------------------------------------------------------*/

    /* Move the exposed parts into the processed regions. */
    FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR,      A00,  a01,     /**/ A02,
                                                    a10t, alpha11, /**/ a12t,
                            /* ************** */  /* ************************ */
                              &ABL, /**/ &ABR,      A20,  a21,     /**/ A22,
                              FLA_TL );

    FLA_Cont_with_3x1_to_2x1( &BTop,     B0,
                                         b1t,
                            /* ** */   /* *** */
                              &BBot,     B2,
                              FLA_TOP );

    FLA_Cont_with_3x1_to_2x1( &CTop,     C0,
                                         c1t,
                            /* ** */   /* *** */
                              &CBot,     C2,
                              FLA_TOP );
  }

  return FLA_SUCCESS;
}
// ============================================================================ void compute_case4b( int size_a, int size_b, int size_c, int size_d, int size_i, int size_j, FLA_Obj cb_A, FLA_Obj cb_B, FLA_Obj cb_C, int print_data ) { FLA_Obj slice_C; int datatype, size_ab, size_abc, size_ia, size_iaj, size_jc, size_jci, iter_a, iter_b, iter_c, iter_d, iter_i, iter_j, ldim_slice_C; size_t idx_A, idx_B, idx_C; double * buff_cb_A, * buff_cb_B, * buff_cb_C, * buff_slice_C, d_one = 1.0; // Some initializations. datatype = FLA_Obj_datatype( cb_A ); buff_cb_A = ( double * ) FLA_Obj_buffer_at_view( cb_A ); buff_cb_B = ( double * ) FLA_Obj_buffer_at_view( cb_B ); buff_cb_C = ( double * ) FLA_Obj_buffer_at_view( cb_C ); size_ab = size_a * size_b; size_abc = size_a * size_b * size_c; size_ia = size_i * size_a; size_iaj = size_i * size_a * size_j; size_jc = size_j * size_c; size_jci = size_j * size_c * size_i; // Show data. if( print_data == 1 ) { FLA_Obj_show( " cb_A_i = [ ", cb_A, "%le", " ];" ); FLA_Obj_show( " cb_B_i = [ ", cb_B, "%le", " ];" ); FLA_Obj_show( " cb_C_i = [ ", cb_C, "%le", " ];" ); } // Prepare temporal slices without buffer. FLA_Obj_create_without_buffer( datatype, size_a, size_c, & slice_C ); #if 0 FLA_Obj_create_without_buffer( datatype, size_a, 1, & slice_A ); FLA_Obj_create_without_buffer( datatype, size_c, 1, & slice_B ); #endif // Perform computation. for( iter_b = 0; iter_b < size_b; iter_b++ ) { for( iter_d = 0; iter_d < size_d; iter_d++ ) { // Define slice_C. iter_a = 0; iter_c = 0; idx_C = ( ( size_t ) iter_a ) + ( ( size_t ) iter_b * size_a ) + ( ( size_t ) iter_c * size_ab ) + ( ( size_t ) iter_d * size_abc ); FLA_Obj_attach_buffer( & buff_cb_C[ idx_C ], 1, size_ab, & slice_C ); buff_slice_C = ( double * ) FLA_Obj_buffer_at_view( slice_C ); ldim_slice_C = FLA_Obj_col_stride( slice_C ); // Initialize slice_C. 
MyFLA_Obj_set_to_zero( slice_C ); for( iter_i = 0; iter_i < size_i; iter_i++ ) { for( iter_j = 0; iter_j < size_j; iter_j++ ) { #if 0 // Define slice_A. FLA_Obj_attach_buffer( & buff_cb_A[ iter_i + 0 * size_i + iter_j * size_ia + iter_b * size_iaj ], size_i, 1, & slice_A ); // Define slice_B. FLA_Obj_attach_buffer( & buff_cb_B[ iter_j + 0 * size_j + iter_i * size_jc + iter_d * size_jci ], size_j, 1, & slice_B ); // Compute DGER operation. FLA_Ger( FLA_ONE, slice_A, slice_B, slice_C ); #endif idx_A = ( ( size_t ) iter_i ) + ( ( size_t ) 0 * size_i ) + ( ( size_t ) iter_j * size_ia ) + ( ( size_t ) iter_b * size_iaj ); idx_B = ( ( size_t ) iter_j ) + ( ( size_t ) 0 * size_j ) + ( ( size_t ) iter_i * size_jc ) + ( ( size_t ) iter_d * size_jci ); dger_( & size_a, & size_c, & d_one, & buff_cb_A[ idx_A ], & size_i, & buff_cb_B[ idx_B ], & size_j, buff_slice_C, & ldim_slice_C ); } } } } // Show data. if( print_data == 1 ) { FLA_Obj_show( " cb_A_f = [ ", cb_A, "%le", " ];" ); FLA_Obj_show( " cb_B_f = [ ", cb_B, "%le", " ];" ); FLA_Obj_show( " cb_C_f = [ ", cb_C, "%le", " ];" ); } // Remove temporal slices. FLA_Obj_free_without_buffer( & slice_C ); #if 0 FLA_Obj_free_without_buffer( & slice_A ); FLA_Obj_free_without_buffer( & slice_B ); #endif }