/*
 * Unblocked symmetric matrix-matrix multiply with A on the right (variant 4).
 *
 * Accumulates B * A into C, one column of B / C and one diagonal element of A
 * per iteration.  Only the diagonal element (alpha11) and the row segment to
 * the right of it (a12t) are read from A, i.e. the upper triangle; the
 * symmetric counterpart below the diagonal is never referenced.
 *
 * Per iteration, with b1 / c1 the current columns and B2 / C2 the columns to
 * their right:
 *   c1 := c1 + b1 * alpha11        (diagonal contribution)
 *   c1 := c1 + B2 * a12t'          (contribution of the mirrored lower part)
 *   C2 := C2 + b1 * a12t           (contribution of the stored upper part)
 *
 * Parameters:
 *   A - square matrix; presumably symmetric with valid data in its upper
 *       triangle (the loop runs until all rows of A have been visited).
 *   B - input matrix, traversed column by column.
 *   C - matrix updated in place, traversed column by column.
 *
 * Returns FLA_SUCCESS.
 */
int Symm_ru_unb_var4( FLA_Obj A, FLA_Obj B, FLA_Obj C )
{
  FLA_Obj ATL,   ATR,      A00,  a01,     A02, 
          ABL,   ABR,      a10t, alpha11, a12t,
                           A20,  a21,     A22;

  FLA_Obj BL,    BR,       B0,  b1,  B2;

  FLA_Obj CL,    CR,       C0,  c1,  C2;

  FLA_Part_2x2( A,    &ATL, &ATR,
                      &ABL, &ABR,     0, 0, FLA_TL );

  FLA_Part_1x2( B,    &BL,  &BR,      0, FLA_LEFT );

  FLA_Part_1x2( C,    &CL,  &CR,      0, FLA_LEFT );

  while ( FLA_Obj_length( ATL ) < FLA_Obj_length( A ) ){

    FLA_Repart_2x2_to_3x3( ATL, /**/ ATR,       &A00,  /**/ &a01,     &A02,
                        /* ************* */   /* ************************** */
                                                &a10t, /**/ &alpha11, &a12t,
                           ABL, /**/ ABR,       &A20,  /**/ &a21,     &A22,
                           1, 1, FLA_BR );

    FLA_Repart_1x2_to_1x3( BL,  /**/ BR,        &B0, /**/ &b1, &B2,
                           1, FLA_RIGHT );

    FLA_Repart_1x2_to_1x3( CL,  /**/ CR,        &C0, /**/ &c1, &C2,
                           1, FLA_RIGHT );

    /*------------------------------------------------------------*/

    /* c1 = b1 * alpha11 + c1 */
    FLA_Axpy( alpha11, b1, c1 );

    /* c1 = B2 * a12t' + c1 */
    FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, B2, a12t, FLA_ONE, c1 );

    /* C2 = b1 * a12t + C2
     * b1 supplies the rows and a12t the columns of the rank-1 update, and the
     * target is the trailing part C2 (same width as a12t), not C0. */
    FLA_Ger( FLA_ONE, b1, a12t, C2 );

    /*------------------------------------------------------------*/

    FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR,       A00,  a01,     /**/ A02,
                                                     a10t, alpha11, /**/ a12t,
                            /* ************** */  /* ************************ */
                              &ABL, /**/ &ABR,       A20,  a21,     /**/ A22,
                              FLA_TL );

    FLA_Cont_with_1x3_to_1x2( &BL,  /**/ &BR,        B0, b1, /**/ B2,
                              FLA_LEFT );

    FLA_Cont_with_1x3_to_1x2( &CL,  /**/ &CR,        C0, c1, /**/ C2,
                              FLA_LEFT );

  }

  return FLA_SUCCESS;
}
/* Example #2 */
/* 0 */
/*
 * Unblocked triangular solve, variant 2; B is overwritten in place.
 *
 * Walks down the diagonal of L one element at a time.  For the current
 * diagonal element lambda11, the column l21 below it, the matching row b1t
 * of B and the rows B2 below that row:
 *   b1t := b1t / lambda11
 *   B2  := B2 - l21 * b1t
 * Only lambda11 and l21 are read from L, so the part of L above the diagonal
 * is never referenced (presumably L is lower triangular and the routine
 * solves L * X = B -- confirm against the caller).
 *
 * Returns FLA_SUCCESS.
 */
int Trsm_unb_var2( FLA_Obj L, FLA_Obj B )
{
  /* Quadrants of L and their 3x3 refinement. */
  FLA_Obj LTL, LTR, LBL, LBR;
  FLA_Obj L00,  l01,      L02;
  FLA_Obj l10t, lambda11, l12t;
  FLA_Obj L20,  l21,      L22;

  /* Top / bottom parts of B and their 3x1 refinement. */
  FLA_Obj BT, BB;
  FLA_Obj B0, b1t, B2;

  /* Start with empty top-left / top parts. */
  FLA_Part_2x2( L, &LTL, &LTR, &LBL, &LBR, 0, 0, FLA_TL );
  FLA_Part_2x1( B, &BT, &BB, 0, FLA_TOP );

  while ( FLA_Obj_length( LTL ) < FLA_Obj_length( L ) )
  {
    /* Expose the next diagonal element of L and the next row of B. */
    FLA_Repart_2x2_to_3x3( LTL, LTR,
                           &L00,  &l01,      &L02,
                           &l10t, &lambda11, &l12t,
                           LBL, LBR,
                           &L20,  &l21,      &L22,
                           1, 1, FLA_BR );

    FLA_Repart_2x1_to_3x1( BT, &B0, &b1t, BB, &B2, 1, FLA_BOTTOM );

    /* Scale the current row by the inverse of the diagonal element. */
    FLA_Inv_scal( lambda11, b1t );

    /* Eliminate the current row's contribution from the rows below it. */
    FLA_Ger( FLA_MINUS_ONE, l21, b1t, B2 );

    /* Fold the processed row / column into the top-left parts. */
    FLA_Cont_with_3x3_to_2x2( &LTL, &LTR,
                              L00,  l01,      L02,
                              l10t, lambda11, l12t,
                              &LBL, &LBR,
                              L20,  l21,      L22,
                              FLA_TL );

    FLA_Cont_with_3x1_to_2x1( &BT, B0, b1t, &BB, B2, FLA_TOP );
  }

  return FLA_SUCCESS;
}
/* Example #3 */
/* 0 */
/*
 * Unblocked general matrix-matrix multiply, variant 1.
 *
 * Accumulates A * B into C as a sum of rank-1 updates: each iteration takes
 * the next column a1 of A and the next row b1t of B and performs
 *   C := C + a1 * b1t
 * The loop runs once per column of A.
 *
 * Returns FLA_SUCCESS.
 */
int Gemm_unb_var1( FLA_Obj A, FLA_Obj B, FLA_Obj C )
{
  /* Left / right parts of A and their 1x3 refinement. */
  FLA_Obj AL, AR;
  FLA_Obj A0, a1, A2;

  /* Top / bottom parts of B and their 3x1 refinement. */
  FLA_Obj BT, BB;
  FLA_Obj B0, b1t, B2;

  /* Start with an empty left part of A and an empty top part of B. */
  FLA_Part_1x2( A, &AL, &AR, 0, FLA_LEFT );
  FLA_Part_2x1( B, &BT, &BB, 0, FLA_TOP );

  while ( FLA_Obj_width( AL ) < FLA_Obj_width( A ) )
  {
    /* Expose the next column of A and the next row of B. */
    FLA_Repart_1x2_to_1x3( AL, AR, &A0, &a1, &A2, 1, FLA_RIGHT );
    FLA_Repart_2x1_to_3x1( BT, &B0, &b1t, BB, &B2, 1, FLA_BOTTOM );

    /* C = C + a1 * b1t */
    FLA_Ger( FLA_ONE, a1, b1t, C );

    /* Fold the processed column / row into the left / top parts. */
    FLA_Cont_with_1x3_to_1x2( &AL, &AR, A0, a1, A2, FLA_LEFT );
    FLA_Cont_with_3x1_to_2x1( &BT, B0, b1t, &BB, B2, FLA_TOP );
  }

  return FLA_SUCCESS;
}
/*
 * Unblocked symmetric matrix-matrix multiply with A on the left (variant 7).
 *
 * Accumulates A * B into C.  Each iteration exposes one diagonal element of A
 * (alpha11), the row segment to its left (a10t), the column segment below it
 * (a21), and the matching rows b1t / c1t of B / C, then applies
 *   C0  := C0  + a10t' * b1t
 *   c1t := c1t + alpha11 * b1t
 *   C2  := C2  + a21 * b1t
 * Only a10t, alpha11 and a21 are read from A, i.e. the diagonal and the part
 * below it; a10t stands in for the column above the diagonal (presumably A is
 * symmetric with valid data in its lower triangle -- confirm against caller).
 *
 * Returns FLA_SUCCESS.
 */
int Symm_unb_var7( FLA_Obj A, FLA_Obj B, FLA_Obj C )
{
  /* Quadrants of A and their 3x3 refinement. */
  FLA_Obj ATL, ATR, ABL, ABR;
  FLA_Obj A00,  a01,     A02;
  FLA_Obj a10t, alpha11, a12t;
  FLA_Obj A20,  a21,     A22;

  /* Top / bottom parts of B and C and their 3x1 refinements. */
  FLA_Obj BT, BB, B0, b1t, B2;
  FLA_Obj CT, CB, C0, c1t, C2;

  /* Start with empty top-left / top parts. */
  FLA_Part_2x2( A, &ATL, &ATR, &ABL, &ABR, 0, 0, FLA_TL );
  FLA_Part_2x1( B, &BT, &BB, 0, FLA_TOP );
  FLA_Part_2x1( C, &CT, &CB, 0, FLA_TOP );

  while ( FLA_Obj_length( ATL ) < FLA_Obj_length( A ) )
  {
    /* Expose the next diagonal element of A and the next rows of B and C. */
    FLA_Repart_2x2_to_3x3( ATL, ATR,
                           &A00,  &a01,     &A02,
                           &a10t, &alpha11, &a12t,
                           ABL, ABR,
                           &A20,  &a21,     &A22,
                           1, 1, FLA_BR );

    FLA_Repart_2x1_to_3x1( BT, &B0, &b1t, BB, &B2, 1, FLA_BOTTOM );
    FLA_Repart_2x1_to_3x1( CT, &C0, &c1t, CB, &C2, 1, FLA_BOTTOM );

    /* C0 = C0 + a10t' * b1t */
    FLA_Ger( FLA_ONE, a10t, b1t, C0 );

    /* c1t = c1t + alpha11 * b1t */
    FLA_Axpy( alpha11, b1t, c1t );

    /* C2 = C2 + a21 * b1t */
    FLA_Ger( FLA_ONE, a21, b1t, C2 );

    /* Fold the processed row / column into the top-left / top parts. */
    FLA_Cont_with_3x3_to_2x2( &ATL, &ATR,
                              A00,  a01,     A02,
                              a10t, alpha11, a12t,
                              &ABL, &ABR,
                              A20,  a21,     A22,
                              FLA_TL );

    FLA_Cont_with_3x1_to_2x1( &BT, B0, b1t, &BB, B2, FLA_TOP );
    FLA_Cont_with_3x1_to_2x1( &CT, C0, c1t, &CB, C2, FLA_TOP );
  }

  return FLA_SUCCESS;
}
// ============================================================================
/*
 * Accumulates a tensor contraction into cb_C using one BLAS rank-1 update
 * (dger_) per ( i, j ) pair.
 *
 * The index arithmetic treats the three buffers as packed arrays of doubles
 * with the following layouts (fastest-varying index first):
 *   cb_A : ( i, a, j, b )
 *   cb_B : ( j, c, i, d )
 *   cb_C : ( a, b, c, d )
 * For every fixed ( b, d ) the size_a x size_c slice C( :, b, :, d ) is set
 * to zero and then updated, for all i and j, as
 *   C( :, b, :, d ) += A( i, :, j, b ) * B( j, :, i, d )^T
 *
 * Parameters:
 *   size_a .. size_j - extents of the six tensor indices.
 *   cb_A, cb_B       - input objects; only their data buffers are read.
 *   cb_C             - output object; its data buffer is overwritten.
 *   print_data       - when equal to 1, the three objects are printed before
 *                      and after the computation.
 *
 * All stride products are computed in size_t: products of three int extents
 * can exceed INT_MAX for large tensors, which would be undefined behaviour
 * in int arithmetic and would corrupt the buffer offsets.
 */
void compute_case4b( int size_a, int size_b, int size_c, int size_d,
                     int size_i, int size_j, FLA_Obj cb_A, FLA_Obj cb_B, FLA_Obj cb_C,
                     int print_data ) {
    FLA_Obj  slice_C;
    int      datatype, iter_b, iter_d, iter_i, iter_j, ldim_slice_C;
    size_t   size_ab, size_abc, size_ia, size_iaj, size_jc, size_jci,
             idx_A, idx_B, idx_C;
    double   * buff_cb_A, * buff_cb_B, * buff_cb_C, * buff_slice_C, d_one = 1.0;

    // Some initializations.
    datatype  = FLA_Obj_datatype( cb_A );
    buff_cb_A = ( double * ) FLA_Obj_buffer_at_view( cb_A );
    buff_cb_B = ( double * ) FLA_Obj_buffer_at_view( cb_B );
    buff_cb_C = ( double * ) FLA_Obj_buffer_at_view( cb_C );

    // Strides of cb_C ( a, b, c, d ).
    size_ab  = ( size_t ) size_a * ( size_t ) size_b;
    size_abc = size_ab * ( size_t ) size_c;

    // Strides of cb_A ( i, a, j, b ).
    size_ia  = ( size_t ) size_i * ( size_t ) size_a;
    size_iaj = size_ia * ( size_t ) size_j;

    // Strides of cb_B ( j, c, i, d ).
    size_jc  = ( size_t ) size_j * ( size_t ) size_c;
    size_jci = size_jc * ( size_t ) size_i;

    // Show data.
    if( print_data == 1 ) {
        FLA_Obj_show( " cb_A_i = [ ", cb_A, "%le", " ];" );
        FLA_Obj_show( " cb_B_i = [ ", cb_B, "%le", " ];" );
        FLA_Obj_show( " cb_C_i = [ ", cb_C, "%le", " ];" );
    }

    // Prepare temporal slice without buffer.
    FLA_Obj_create_without_buffer( datatype, size_a, size_c, & slice_C );

    // Perform computation.
    for( iter_b = 0; iter_b < size_b; iter_b++ ) {

        for( iter_d = 0; iter_d < size_d; iter_d++ ) {

            // Define slice_C = C( :, b, :, d ): it starts at a = 0, c = 0,
            // has unit row stride and a column stride of size_ab.
            idx_C = ( ( size_t ) iter_b * ( size_t ) size_a ) +
                    ( ( size_t ) iter_d * size_abc );
            FLA_Obj_attach_buffer( & buff_cb_C[ idx_C ], 1, size_ab, & slice_C );
            buff_slice_C = ( double * ) FLA_Obj_buffer_at_view( slice_C );
            ldim_slice_C = FLA_Obj_col_stride( slice_C );

            // Initialize slice_C.
            MyFLA_Obj_set_to_zero( slice_C );

            for( iter_i = 0; iter_i < size_i; iter_i++ ) {

                for( iter_j = 0; iter_j < size_j; iter_j++ ) {

                    // Start of A( i, :, j, b ): elements are size_i apart.
                    idx_A = ( ( size_t ) iter_i ) +
                            ( ( size_t ) iter_j * size_ia ) +
                            ( ( size_t ) iter_b * size_iaj );

                    // Start of B( j, :, i, d ): elements are size_j apart.
                    idx_B = ( ( size_t ) iter_j ) +
                            ( ( size_t ) iter_i * size_jc ) +
                            ( ( size_t ) iter_d * size_jci );

                    // Rank-1 update: slice_C += A( i, :, j, b ) * B( j, :, i, d )^T.
                    dger_( & size_a, & size_c,
                           & d_one,
                           & buff_cb_A[ idx_A ], & size_i,
                           & buff_cb_B[ idx_B ], & size_j,
                           buff_slice_C, & ldim_slice_C );
                }
            }

        }
    }

    // Show data.
    if( print_data == 1 ) {
        FLA_Obj_show( " cb_A_f = [ ", cb_A, "%le", " ];" );
        FLA_Obj_show( " cb_B_f = [ ", cb_B, "%le", " ];" );
        FLA_Obj_show( " cb_C_f = [ ", cb_C, "%le", " ];" );
    }

    // Remove temporal slice.
    FLA_Obj_free_without_buffer( & slice_C );
}