예제 #1
0
/*
 * Argument checks for the pivoted-LU solve routine.
 *
 * Runs the following checks, in this order:
 *   - A is a floating-point, non-constant object;
 *   - B and X have the same datatype as A;
 *   - p is an integer object;
 *   - A is square;
 *   - p is a column vector and passes FLA_Check_vector_dim_min() against
 *     FLA_Obj_min_dim( A );
 *   - A, X and B pass FLA_Check_matrix_matrix_dims() with no transposition
 *     requested for either operand.
 *
 * Each result is handed to FLA_Check_error_code(); the function itself
 * always returns FLA_SUCCESS.
 */
FLA_Error FLA_LU_piv_solve_check( FLA_Obj A, FLA_Obj p, FLA_Obj B, FLA_Obj X )
{
    /* Datatype requirements. */
    FLA_Check_error_code( FLA_Check_floating_object( A ) );
    FLA_Check_error_code( FLA_Check_nonconstant_object( A ) );
    FLA_Check_error_code( FLA_Check_identical_object_datatype( A, B ) );
    FLA_Check_error_code( FLA_Check_identical_object_datatype( A, X ) );
    FLA_Check_error_code( FLA_Check_int_object( p ) );

    /* Shape requirements. */
    FLA_Check_error_code( FLA_Check_square( A ) );
    FLA_Check_error_code( FLA_Check_col_vector( p ) );
    FLA_Check_error_code( FLA_Check_vector_dim_min( p, FLA_Obj_min_dim( A ) ) );
    FLA_Check_error_code( FLA_Check_matrix_matrix_dims( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, A, X, B ) );

    return FLA_SUCCESS;
}
/*
 * Walks the main diagonal of A from the top-left corner, one entry per
 * iteration, and stops at the first entry that FLA_Obj_equals() reports as
 * equal to FLA_ZERO.
 *
 * Returns FLA_Obj_length( A00 ) for the step at which that entry was found,
 * or FLA_SUCCESS when FLA_Obj_min_dim( A ) diagonal entries have been
 * visited without a match. If the error-checking level is at least
 * FLA_MIN_ERROR_CHECKING, FLA_LU_find_zero_on_diagonal_check() is called on
 * A first.
 */
FLA_Error FLA_LU_find_zero_on_diagonal( FLA_Obj A )
{
  /* 2x2 view of A that tracks progress along the diagonal. */
  FLA_Obj ATL, ATR,
          ABL, ABR;

  /* 3x3 refinement exposing the current diagonal element alpha11. */
  FLA_Obj A00,  a01,     A02,
          a10t, alpha11, a12t,
          A20,  a21,     A22;

  if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING )
  {
    FLA_LU_find_zero_on_diagonal_check( A );
  }

  /* Begin with a 0 x 0 top-left quadrant. */
  FLA_Part_2x2( A, &ATL, &ATR, &ABL, &ABR, 0, 0, FLA_TL );

  while ( FLA_Obj_length( ATL ) < FLA_Obj_min_dim( A ) )
  {
    /* Expose the next diagonal element (1 x 1, taken from ABR). */
    FLA_Repart_2x2_to_3x3( ATL, ATR,
                           &A00,  &a01,     &A02,
                           &a10t, &alpha11, &a12t,
                           ABL, ABR,
                           &A20,  &a21,     &A22,
                           1, 1, FLA_BR );

    /* A zero diagonal entry ends the scan immediately. */
    if ( FLA_Obj_equals( alpha11, FLA_ZERO ) )
    {
      return FLA_Obj_length( A00 );
    }

    /* Fold the visited element into the top-left quadrant. */
    FLA_Cont_with_3x3_to_2x2( &ATL, &ATR,
                              A00,  a01,     A02,
                              a10t, alpha11, a12t,
                              &ABL, &ABR,
                              A20,  a21,     A22,
                              FLA_TL );
  }

  return FLA_SUCCESS;
}
예제 #3
0
/*
 * Argument checks for the QR (UT transform) solve routine.
 *
 * Runs the following checks, in this order:
 *   - A is a floating-point, non-constant object;
 *   - T, B and X have the same datatype as A;
 *   - the width of T equals FLA_Obj_min_dim( A );
 *   - A, X and B pass FLA_Check_matrix_matrix_dims() with no transposition
 *     requested for either operand;
 *   - A passes FLA_Check_object_length_min() against its own width.
 *
 * Each result is handed to FLA_Check_error_code(); the function itself
 * always returns FLA_SUCCESS.
 */
FLA_Error FLA_QR_UT_solve_check( FLA_Obj A, FLA_Obj T, FLA_Obj B, FLA_Obj X )
{
  /* Datatype requirements. */
  FLA_Check_error_code( FLA_Check_floating_object( A ) );
  FLA_Check_error_code( FLA_Check_nonconstant_object( A ) );
  FLA_Check_error_code( FLA_Check_identical_object_datatype( A, T ) );
  FLA_Check_error_code( FLA_Check_identical_object_datatype( A, B ) );
  FLA_Check_error_code( FLA_Check_identical_object_datatype( A, X ) );

  /* Shape requirements. */
  FLA_Check_error_code( FLA_Check_object_width_equals( T, FLA_Obj_min_dim( A ) ) );
  FLA_Check_error_code( FLA_Check_matrix_matrix_dims( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, A, X, B ) );
  FLA_Check_error_code( FLA_Check_object_length_min( A, FLA_Obj_width( A ) ) );

  return FLA_SUCCESS;
}
예제 #4
0
/*
 * Tests whether a diagonal offset fits inside A.
 *
 * Returns FLA_INVALID_DIAG_OFFSET when FLA_Obj_min_dim( A ) is less than or
 * equal to the magnitude of offset, and FLA_SUCCESS otherwise. The sign of
 * offset is ignored.
 */
FLA_Error FLA_Check_valid_diag_offset( FLA_Obj A, FLA_Diag_off offset )
{
  if ( FLA_Obj_min_dim( A ) <= abs( offset ) )
  {
    return FLA_INVALID_DIAG_OFFSET;
  }

  return FLA_SUCCESS;
}
예제 #5
0
// Unblocked variant 1 of the QR (UT transform) factorization.
//
// Walks A from the top-left to the bottom-right corner one diagonal element
// at a time, and t from left to right one entry at a time. At each step a
// Householder transform is generated from alpha11 and a21, its scalar is
// stored in tau1 (the current entry of t), and the transform is applied from
// the left to the remaining columns ( a12t over A22 ). A is updated in place.
//
//   A  matrix to factor (overwritten)
//   t  receives one Householder scalar per step
//
// Always returns FLA_SUCCESS. No argument checking is done here.
FLA_Error FLA_QR_UT_unb_var1( FLA_Obj A, FLA_Obj t )
{
  FLA_Obj ATL,   ATR,      A00,  a01,     A02, 
          ABL,   ABR,      a10t, alpha11, a12t,
                           A20,  a21,     A22;

  FLA_Obj tLt,   tRt,      t0t,  tau1,  t2t;


  // Start with an empty top-left quadrant of A and an empty left part of t.
  FLA_Part_2x2( A,    &ATL, &ATR,
                      &ABL, &ABR,     0, 0, FLA_TL );

  FLA_Part_1x2( t,    &tLt,  &tRt,      0, FLA_LEFT );

  // Continue until the unprocessed quadrant ABR has a zero dimension.
  while ( FLA_Obj_min_dim( ABR ) > 0 ){

    FLA_Repart_2x2_to_3x3( ATL, /**/ ATR,       &A00,  /**/ &a01,     &A02,
                        /* ************* */   /* ************************** */
                                                &a10t, /**/ &alpha11, &a12t,
                           ABL, /**/ ABR,       &A20,  /**/ &a21,     &A22,
                           1, 1, FLA_BR );

    FLA_Repart_1x2_to_1x3( tLt,  /**/ tRt,      &t0t, /**/ &tau1, &t2t,
                           1, FLA_RIGHT );

    /*------------------------------------------------------------*/

    // Compute tau1 and u21 from alpha11 and a21 such that tau1 and u21
    // determine a Householder transform H such that applying H from the
    // left to the column vector consisting of alpha11 and a21 annihilates
    // the entries in a21 (and updates alpha11). The call below is given
    // a21 itself, so u21 is presumably stored over a21 -- confirm against
    // FLA_Househ2_UT.
    FLA_Househ2_UT( FLA_LEFT,
                    alpha11,
                    a21, tau1 );

    // ( a12t ) = H ( a12t )
    // ( A22  )     ( A22  )
    //
    // where H is formed from tau1 and the vector held in a21.
    FLA_Apply_H2_UT( FLA_LEFT, tau1, a21, a12t,
                                          A22 );

    /*------------------------------------------------------------*/

    FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR,       A00,  a01,     /**/ A02,
                                                     a10t, alpha11, /**/ a12t,
                            /* ************** */  /* ************************ */
                              &ABL, /**/ &ABR,       A20,  a21,     /**/ A22,
                              FLA_TL );

    FLA_Cont_with_1x3_to_1x2( &tLt,  /**/ &tRt,      t0t, tau1, /**/ t2t,
                              FLA_LEFT );
  }

  return FLA_SUCCESS;
}
예제 #6
0
/*
 * Checks that the Householder panel matrix T is large enough for A.
 *
 * T is rejected with FLA_HOUSEH_PANEL_MATRIX_TOO_SMALL when either
 *   - its length is below the FLA_DIMENSION_MIN blocksize queried for the
 *     datatype of A, or
 *   - its width is below FLA_Obj_min_dim( A ).
 * Otherwise FLA_SUCCESS is returned.
 */
FLA_Error FLA_Check_householder_panel_dims( FLA_Obj A, FLA_Obj T )
{
  dim_t nb_alg     = FLA_Query_blocksize( FLA_Obj_datatype( A ), FLA_DIMENSION_MIN );
  int   too_short  = ( FLA_Obj_length( T ) < nb_alg );
  int   too_narrow = ( FLA_Obj_width( T ) < FLA_Obj_min_dim( A ) );

  return ( too_short || too_narrow ) ? FLA_HOUSEH_PANEL_MATRIX_TOO_SMALL
                                     : FLA_SUCCESS;
}
/*
 * Argument checks for extracting the real diagonals of a bidiagonal
 * reduction.
 *
 * Runs the following checks, in this order:
 *   - A is a floating-point, non-constant object;
 *   - d is a non-constant, real vector with the same precision as A and
 *     dimension FLA_Obj_min_dim( A );
 *   - unless that dimension is exactly 1, e is a non-constant, real vector
 *     with the same precision as A and dimension FLA_Obj_min_dim( A ) - 1.
 *
 * When the minimum dimension of A is 1, e is not inspected at all.
 * Each result is handed to FLA_Check_error_code(); the function itself
 * always returns FLA_SUCCESS.
 */
FLA_Error FLA_Bidiag_UT_extract_real_diagonals_check( FLA_Obj A, FLA_Obj d, FLA_Obj e )
{
  dim_t n_diag;

  FLA_Check_error_code( FLA_Check_floating_object( A ) );
  FLA_Check_error_code( FLA_Check_nonconstant_object( A ) );

  n_diag = FLA_Obj_min_dim( A );

  /* Main diagonal vector. */
  FLA_Check_error_code( FLA_Check_nonconstant_object( d ) );
  FLA_Check_error_code( FLA_Check_real_object( d ) );
  FLA_Check_error_code( FLA_Check_identical_object_precision( A, d ) );
  FLA_Check_error_code( FLA_Check_if_vector( d ) );
  FLA_Check_error_code( FLA_Check_vector_dim( d, n_diag ) );

  /* With a single diagonal entry there is no off-diagonal to validate. */
  if ( n_diag == 1 )
  {
    return FLA_SUCCESS;
  }

  /* Off-diagonal vector, one entry shorter than d. */
  FLA_Check_error_code( FLA_Check_nonconstant_object( e ) );
  FLA_Check_error_code( FLA_Check_real_object( e ) );
  FLA_Check_error_code( FLA_Check_identical_object_precision( A, e ) );
  FLA_Check_error_code( FLA_Check_if_vector( e ) );
  FLA_Check_error_code( FLA_Check_vector_dim( e, n_diag - 1 ) );

  return FLA_SUCCESS;
}
예제 #8
0
// Allocates the matrices TU and TV that hold the block Householder
// transforms accumulated during a bidiagonal reduction of A.
//
//   A   matrix whose datatype, minimum dimension and row stride determine
//       the datatype, shape and strides of the new objects
//   TU  if non-NULL, receives a newly created object
//   TV  if non-NULL, receives a newly created object
//
// Both objects are created with the same datatype, dimensions and strides.
// Always returns FLA_SUCCESS; the results of FLA_Obj_create() are not
// inspected.
FLA_Error FLA_Bidiag_UT_create_T( FLA_Obj A, FLA_Obj* TU, FLA_Obj* TV )
{
  FLA_Datatype dt_A;
  dim_t        m_T, n_T;
  dim_t        rs_T, cs_T;
  int          a_has_unit_row_stride;

  dt_A = FLA_Obj_datatype( A );

  // Library blocksize for this datatype, scaled by the bidiagonal
  // inner-to-outer ratio.
  m_T = FLA_Query_blocksize( dt_A, FLA_DIMENSION_MIN );
  m_T = ( dim_t )( ( ( double ) m_T ) * FLA_BIDIAG_INNER_TO_OUTER_B_RATIO );

  // The number of columns of T is the minimum dimension of A.
  n_T = FLA_Obj_min_dim( A );

  // NOTE(review): this hard-coded value overrides the blocksize that was
  // queried and scaled just above, so that computation currently has no
  // effect on the result. Confirm whether the override is intentional.
  m_T = 5;

  // Never use more rows than the minimum dimension of A.
  m_T = min( m_T, n_T );

  // Choose the strides of T from the row stride of A: a unit row stride in
  // A yields a unit row stride in T; otherwise T gets a unit column stride.
  a_has_unit_row_stride = ( FLA_Obj_row_stride( A ) == 1 );
  rs_T = a_has_unit_row_stride ? 1   : n_T;
  cs_T = a_has_unit_row_stride ? m_T : 1;

  // Create the requested m_T x n_T objects. Per the original author's note,
  // a zero dimension here causes complaints later in apply_q.
  if ( TU != NULL )
  {
    FLA_Obj_create( dt_A, m_T, n_T, rs_T, cs_T, TU );
  }
  if ( TV != NULL )
  {
    FLA_Obj_create( dt_A, m_T, n_T, rs_T, cs_T, TV );
  }

  return FLA_SUCCESS;
}
예제 #9
0
// Blocked variant 4 of the upper-triangular scaling operation.
//
// Sweeps A from the bottom-right corner towards the top-left. At each step
// the diagonal block A11 is scaled by alpha through FLA_Scalr_internal()
// (upper-triangular case) and the block A01 above it is scaled by alpha
// through FLA_Scal_internal().
//
//   alpha  scaling factor
//   A      matrix updated in place
//   cntl   control structure providing the blocksize and the sub-operation
//          controls passed to the two internal calls
//
// Always returns FLA_SUCCESS.
FLA_Error FLA_Scalr_u_blk_var4( FLA_Obj alpha, FLA_Obj A, fla_scalr_t* cntl )
{
  FLA_Obj ATL, ATR,
          ABL, ABR;

  FLA_Obj A00, A01, A02,
          A10, A11, A12,
          A20, A21, A22;

  dim_t nb;

  // Begin with an empty bottom-right quadrant; ATL is all of A.
  FLA_Part_2x2( A, &ATL, &ATR, &ABL, &ABR, 0, 0, FLA_BR );

  while ( FLA_Obj_min_dim( ATL ) > 0 )
  {
    nb = FLA_Determine_blocksize( ATL, FLA_TL, FLA_Cntl_blocksize( cntl ) );

    // Carve an nb x nb block A11 out of the top-left quadrant.
    FLA_Repart_2x2_to_3x3( ATL, ATR,
                           &A00, &A01, &A02,
                           &A10, &A11, &A12,
                           ABL, ABR,
                           &A20, &A21, &A22,
                           nb, nb, FLA_TL );

    // A11 = alpha * triu( A11 );
    FLA_Scalr_internal( FLA_UPPER_TRIANGULAR, alpha, A11,
                        FLA_Cntl_sub_scalr( cntl ) );

    // A01 = alpha * A01;
    FLA_Scal_internal( alpha, A01,
                       FLA_Cntl_sub_scal( cntl ) );

    // Hand the processed block over to the bottom-right quadrant.
    FLA_Cont_with_3x3_to_2x2( &ATL, &ATR,
                              A00, A01, A02,
                              A10, A11, A12,
                              &ABL, &ABR,
                              A20, A21, A22,
                              FLA_BR );
  }

  return FLA_SUCCESS;
}
예제 #10
0
// Allocates the matrix T that holds the block Householder transforms
// accumulated during an LQ factorization of A.
//
//   A  matrix whose datatype, dimensions and row stride determine the
//      datatype, shape and strides of T
//   T  receives the newly created object
//
// T has min( scaled library blocksize, FLA_Obj_min_dim( A ) ) rows and
// FLA_Obj_length( A ) columns. Always returns FLA_SUCCESS; the result of
// FLA_Obj_create() is not inspected.
FLA_Error FLA_LQ_UT_create_T( FLA_Obj A, FLA_Obj* T )
{
  FLA_Datatype dt_A;
  dim_t        m_T, n_T;
  dim_t        rs_T, cs_T;
  int          a_has_unit_row_stride;

  dt_A = FLA_Obj_datatype( A );

  // Library blocksize for this datatype, scaled by the LQ inner-to-outer
  // ratio and capped at the minimum dimension of A.
  m_T = FLA_Query_blocksize( dt_A, FLA_DIMENSION_MIN );
  m_T = ( dim_t )( ( ( double ) m_T ) * FLA_LQ_INNER_TO_OUTER_B_RATIO );
  m_T = min(m_T, FLA_Obj_min_dim( A ));

  // The number of columns of T is the length of A.
  n_T = FLA_Obj_length( A );

  // Choose the strides of T from the row stride of A: a unit row stride in
  // A yields a unit row stride in T; otherwise T gets a unit column stride.
  a_has_unit_row_stride = ( FLA_Obj_row_stride( A ) == 1 );
  rs_T = a_has_unit_row_stride ? 1   : n_T;
  cs_T = a_has_unit_row_stride ? m_T : 1;

  FLA_Obj_create( dt_A, m_T, n_T, rs_T, cs_T, T );

  return FLA_SUCCESS;
}
예제 #11
0
// Returns the size of A along the dimension selected by `from`:
//   FLA_TOP / FLA_BOTTOM           -> FLA_Obj_length( A )
//   FLA_LEFT / FLA_RIGHT           -> FLA_Obj_width( A )
//   FLA_TL, FLA_TR, FLA_BL, FLA_BR -> FLA_Obj_min_dim( A )
// Any other value prints a message and calls FLA_Abort(); should that call
// return, the result is 0.
int FLA_task_determine_matrix_size( FLA_Obj A, FLA_Quadrant from )
{
	switch( from )
	{
		// Moving vertically: the relevant size is the number of rows.
		case FLA_TOP:
		case FLA_BOTTOM:
			return FLA_Obj_length( A );

		// Moving horizontally: the relevant size is the number of columns.
		case FLA_LEFT:
		case FLA_RIGHT:
			return FLA_Obj_width( A );

		// Moving diagonally. A may be the full (possibly rectangular)
		// object, so the smaller of the two dimensions is used; for a
		// square partition this gives the same answer as either one.
		case FLA_TL:
		case FLA_TR:
		case FLA_BL:
		case FLA_BR:
			return FLA_Obj_min_dim( A );

		default:
			FLA_Print_message( "Unexpected default in switch statement!", __FILE__, __LINE__ );
			FLA_Abort();
	}

	return 0;
}
예제 #12
0
/*
 * Argument checks for the LQ factorization routine.
 *
 * Runs the following checks, in this order:
 *   - A is a floating-point, non-constant object;
 *   - t has the same datatype as A;
 *   - t is a column vector that passes FLA_Check_col_storage();
 *   - the dimension of t equals FLA_Obj_min_dim( A ).
 *
 * Each result is handed to FLA_Check_error_code(); the function itself
 * always returns FLA_SUCCESS.
 */
FLA_Error FLA_LQ_check( FLA_Obj A, FLA_Obj t )
{
  /* Requirements on A. */
  FLA_Check_error_code( FLA_Check_floating_object( A ) );
  FLA_Check_error_code( FLA_Check_nonconstant_object( A ) );

  /* Requirements on t, relative to A. */
  FLA_Check_error_code( FLA_Check_identical_object_datatype( A, t ) );
  FLA_Check_error_code( FLA_Check_col_vector( t ) );
  FLA_Check_error_code( FLA_Check_col_storage( t ) );
  FLA_Check_error_code( FLA_Check_vector_dim( t, FLA_Obj_min_dim( A ) ) );

  return FLA_SUCCESS;
}
예제 #13
0
// Adds the scalar pointed to by diag_value to every element on the main
// diagonal of A.
//
//   diag_value  pointer to a scalar; it is read as float, double, scomplex
//               or dcomplex according to the datatype of A
//   A           matrix updated in place; FLA_Obj_min_dim( A ) diagonal
//               elements are touched
//
// Objects whose datatype is not one of the four handled below are left
// unchanged. When the error-checking level is at least
// FLA_MIN_ERROR_CHECKING, FLA_Add_to_diag_check() is called first.
// Always returns FLA_SUCCESS.
FLA_Error FLA_Add_to_diag( void* diag_value, FLA_Obj A )
{
  FLA_Datatype dt_A;
  dim_t        i, n_diag;
  dim_t        rs, cs;
  dim_t        diag_stride;

  if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING )
  {
    FLA_Add_to_diag_check( diag_value, A );
  }

  dt_A   = FLA_Obj_datatype( A );
  n_diag = FLA_Obj_min_dim( A );
  rs     = FLA_Obj_row_stride( A );
  cs     = FLA_Obj_col_stride( A );

  // Moving one step down the diagonal advances one row and one column.
  diag_stride = rs + cs;

  switch ( dt_A )
  {
    case FLA_FLOAT:
    {
      float* a     = ( float * ) FLA_FLOAT_PTR( A );
      float* delta = ( float * ) diag_value;

      for ( i = 0; i < n_diag; ++i )
      {
        a[ i * diag_stride ] += *delta;
      }
      break;
    }

    case FLA_DOUBLE:
    {
      double* a     = ( double * ) FLA_DOUBLE_PTR( A );
      double* delta = ( double * ) diag_value;

      for ( i = 0; i < n_diag; ++i )
      {
        a[ i * diag_stride ] += *delta;
      }
      break;
    }

    case FLA_COMPLEX:
    {
      scomplex* a     = ( scomplex * ) FLA_COMPLEX_PTR( A );
      scomplex* delta = ( scomplex * ) diag_value;

      for ( i = 0; i < n_diag; ++i )
      {
        a[ i * diag_stride ].real += delta->real;
        a[ i * diag_stride ].imag += delta->imag;
      }
      break;
    }

    case FLA_DOUBLE_COMPLEX:
    {
      dcomplex* a     = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( A );
      dcomplex* delta = ( dcomplex * ) diag_value;

      for ( i = 0; i < n_diag; ++i )
      {
        a[ i * diag_stride ].real += delta->real;
        a[ i * diag_stride ].imag += delta->imag;
      }
      break;
    }

    default:
      // Any other datatype: nothing is modified.
      break;
  }

  return FLA_SUCCESS;
}
예제 #14
0
// Computes a singular value decomposition of A by reducing it to bidiagonal
// form and then running the bidiagonal SVD kernel FLA_Bsvd_v_opt_var1().
//
//   n_iter_max  forwarded to FLA_Bsvd_v_opt_var1()
//   A           input matrix; overwritten (scaled in place, reduced, and in
//               the tall case also used as scratch space for a product)
//   s           receives the values computed in d, sorted and rescaled
//   U, V        receive the matrices formed from the Householder data and
//               then updated by the bidiagonal SVD kernel
//   k_accum     number of columns of the Givens-scalar matrices G and H
//   b_alg       forwarded to FLA_Bsvd_v_opt_var1()
//
// Two paths are used. If m_A < (17/9) * n_A the bidiagonal reduction is
// applied to A directly. Otherwise a QR factorization is done first and the
// reduction is applied to the n_A x n_A upper-triangular factor R.
//
// Returns the value returned by FLA_Bsvd_v_opt_var1().
FLA_Error FLA_Svd_uv_unb_var1( dim_t n_iter_max, FLA_Obj A, FLA_Obj s, FLA_Obj U, FLA_Obj V, dim_t k_accum, dim_t b_alg )
{
    FLA_Error    r_val = FLA_SUCCESS;
    FLA_Datatype dt;
    FLA_Datatype dt_real;
    FLA_Datatype dt_comp;
    FLA_Obj      scale, T, S, rL, rR, d, e, G, H;
    dim_t        m_A, n_A;
    dim_t        min_m_n;
    dim_t        n_GH;
    double       crossover_ratio = 17.0 / 9.0;

    n_GH    = k_accum;

    m_A     = FLA_Obj_length( A );
    n_A     = FLA_Obj_width( A );
    min_m_n = FLA_Obj_min_dim( A );
    dt      = FLA_Obj_datatype( A );
    dt_real = FLA_Obj_datatype_proj_to_real( A );
    dt_comp = FLA_Obj_datatype_proj_to_complex( A );

    // Create matrices to hold block Householder transformations.
    FLA_Bidiag_UT_create_T( A, &T, &S );

    // Create vectors to hold the realifying scalars.
    FLA_Obj_create( dt,      min_m_n,      1, 0, 0, &rL );
    FLA_Obj_create( dt,      min_m_n,      1, 0, 0, &rR );

    // Create vectors to hold the diagonal (d) and the off-diagonal (e),
    // which is one element shorter.
    // NOTE(review): when min_m_n is 1, e (and G, H below) are created with
    // zero rows; when it is 0 the subtraction wraps around if dim_t is
    // unsigned. Confirm that callers exclude those shapes.
    FLA_Obj_create( dt_real, min_m_n,      1, 0, 0, &d );
    FLA_Obj_create( dt_real, min_m_n-1,    1, 0, 0, &e );

    // Create matrices to hold the left and right Givens scalars.
    FLA_Obj_create( dt_comp, min_m_n-1, n_GH, 0, 0, &G );
    FLA_Obj_create( dt_comp, min_m_n-1, n_GH, 0, 0, &H );

    // Create a real scaling factor.
    FLA_Obj_create( dt_real, 1, 1, 0, 0, &scale );

    // Compute a scaling factor. The tests against FLA_ONE below treat a
    // value of one as "no scaling needed".
    FLA_Svd_compute_scaling( A, scale );

    // Scale the matrix if scale is non-unit.
    if ( !FLA_Obj_equals( scale, FLA_ONE ) )
        FLA_Scal( scale, A );

    if ( m_A < crossover_ratio * n_A )
    {
        // Reduce the matrix to bidiagonal form.
        // Apply scalars to rotate elements on the superdiagonal to the real domain.
        // Extract the diagonal and superdiagonal from A.
        FLA_Bidiag_UT( A, T, S );
        FLA_Bidiag_UT_realify( A, rL, rR );
        FLA_Bidiag_UT_extract_real_diagonals( A, d, e );

        // Form U and V.
        FLA_Bidiag_UT_form_U( A, T, U );
        FLA_Bidiag_UT_form_V( A, S, V );

        // Apply the realifying scalars in rL and rR to U and V, respectively.
        // Only the leftmost min_m_n columns of U and V are touched.
        {
            FLA_Obj UL, UR;
            FLA_Obj VL, VR;

            FLA_Part_1x2( U,   &UL, &UR,   min_m_n, FLA_LEFT );
            FLA_Part_1x2( V,   &VL, &VR,   min_m_n, FLA_LEFT );

            FLA_Apply_diag_matrix( FLA_RIGHT, FLA_CONJUGATE,    rL, UL );
            FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, rR, VL );
        }

        // Perform a singular value decomposition on the bidiagonal matrix.
        r_val = FLA_Bsvd_v_opt_var1( n_iter_max, d, e, G, H, U, V, b_alg );
    }
    else // if ( crossover_ratio * n_A <= m_A )
    {
        FLA_Obj TQ, R;
        FLA_Obj AT,
                AB;
        FLA_Obj UL, UR;

        // Perform a QR factorization on A and form Q in U.
        FLA_QR_UT_create_T( A, &TQ );
        FLA_QR_UT( A, TQ );
        FLA_QR_UT_form_Q( A, TQ, U );
        FLA_Obj_free( &TQ );

        // Set the lower triangle of R to zero and then copy the upper
        // triangle of A to R.
        FLA_Part_2x1( A,   &AT,
                           &AB,   n_A, FLA_TOP );
        FLA_Obj_create( dt, n_A, n_A, 0, 0, &R );
        FLA_Setr( FLA_LOWER_TRIANGULAR, FLA_ZERO, R );
        FLA_Copyr( FLA_UPPER_TRIANGULAR, AT, R );

        // Reduce R to bidiagonal form.
        // Apply scalars to rotate elements on the superdiagonal to the real domain.
        // Extract the diagonal and superdiagonal from R.
        FLA_Bidiag_UT( R, T, S );
        FLA_Bidiag_UT_realify( R, rL, rR );
        FLA_Bidiag_UT_extract_real_diagonals( R, d, e );

        // Form V from right Householder vectors in upper triangle of R.
        // This must happen before R is overwritten by the next call.
        FLA_Bidiag_UT_form_V( R, S, V );

        // Form U in R (in place).
        FLA_Bidiag_UT_form_U( R, T, R );

        // Apply the realifying scalars in rL and rR to R (which now plays
        // the role of U for the reduced problem) and V, respectively.
        FLA_Apply_diag_matrix( FLA_RIGHT, FLA_CONJUGATE,    rL, R );
        FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, rR, V );

        // Perform a singular value decomposition on the bidiagonal matrix.
        r_val = FLA_Bsvd_v_opt_var1( n_iter_max, d, e, G, H, R, V, b_alg );

        // Multiply R into the leftmost n_A columns of U, storing the result
        // in A and then copying back to those columns of U.
        FLA_Part_1x2( U,   &UL, &UR,   n_A, FLA_LEFT );
        FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
                  FLA_ONE, UL, R, FLA_ZERO, A );
        FLA_Copy( A, UL );

        FLA_Obj_free( &R );
    }

    // Copy the values computed in d to the output vector s.
    FLA_Copy( d, s );

    // Sort the singular values and singular vectors in descending order.
    FLA_Sort_svd( FLA_BACKWARD, s, U, V );

    // If the matrix was scaled, rescale the singular values.
    if ( !FLA_Obj_equals( scale, FLA_ONE ) )
        FLA_Inv_scal( scale, s );

    // Release every temporary created above.
    FLA_Obj_free( &scale );
    FLA_Obj_free( &T );
    FLA_Obj_free( &S );
    FLA_Obj_free( &rL );
    FLA_Obj_free( &rR );
    FLA_Obj_free( &d );
    FLA_Obj_free( &e );
    FLA_Obj_free( &G );
    FLA_Obj_free( &H );

    return r_val;
}
// Blocked variant 3 of the FLA_Apply_Q_UT "rnfr" case.
//
// Sweeps A and TW from the top-left to the bottom-right corner and B from
// left to right, b rows/columns at a time. At each step the diagonal block
// A11, the block A12 to its right and the matching block T11 of TW are used
// to update the column panels B1 and B2 of B in place. The top-left corner
// of W serves as workspace.
//
//   A     source of U11 (A11, used as unit upper triangular) and U12 (A12)
//   TW    source of the triangular factor T11 for each step
//   W     workspace; its top-left b x FLA_Obj_length( B1 ) corner is used
//   B     matrix updated in place
//   cntl  control structure providing the blocksize and the sub-operation
//         controls for every internal call
//
// Always returns FLA_SUCCESS.
FLA_Error FLA_Apply_Q_UT_rnfr_blk_var3( FLA_Obj A, FLA_Obj TW, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl )
{
  FLA_Obj ATL,   ATR,      A00, A01, A02, 
          ABL,   ABR,      A10, A11, A12,
                           A20, A21, A22;

  FLA_Obj TWTL,  TWTR,     TW00, TW01, TW02, 
          TWBL,  TWBR,     TW10,  T11,  W12,
                           TW20, TW21, TW22;

  FLA_Obj WTL,   WTR,
          WBL,   WBR;

  FLA_Obj BL,    BR,       B0,  B1,  B2;

  dim_t b;

  FLA_Part_2x2( A,    &ATL, &ATR,
                      &ABL, &ABR,     0, 0, FLA_TL );

  FLA_Part_2x2( TW,   &TWTL, &TWTR,
                      &TWBL, &TWBR,     0, 0, FLA_TL );

  FLA_Part_1x2( B,    &BL,  &BR,      0, FLA_LEFT );

  // Continue until the unprocessed quadrant ABR has a zero dimension.
  while ( FLA_Obj_min_dim( ABR ) > 0 ){

    b = FLA_Determine_blocksize( ABR, FLA_BR, FLA_Cntl_blocksize( cntl ) );

    FLA_Repart_2x2_to_3x3( ATL, /**/ ATR,       &A00, /**/ &A01, &A02,
                        /* ************* */   /* ******************** */
                                                &A10, /**/ &A11, &A12,
                           ABL, /**/ ABR,       &A20, /**/ &A21, &A22,
                           b, b, FLA_BR );

    FLA_Repart_2x2_to_3x3( TWTL, /**/ TWTR,       &TW00, /**/ &TW01, &TW02,
                        /* *************** */   /* *********************** */
                                                  &TW10, /**/  &T11,  &W12,
                           TWBL, /**/ TWBR,       &TW20, /**/ &TW21, &TW22,
                           b, b, FLA_BR );

    FLA_Repart_1x2_to_1x3( BL,  /**/ BR,        &B0, /**/ &B1, &B2,
                           b, FLA_RIGHT );

    /*------------------------------------------------------------*/

    // Take the top-left corner of W, sized b x length( B1 ), as workspace
    // for the transpose of B1.
    FLA_Part_2x2( W,    &WTL, &WTR,
                        &WBL, &WBR,     b, FLA_Obj_length( B1 ), FLA_TL );

    // WTL = B1^T;

    FLA_Copyt_internal( FLA_TRANSPOSE, B1, WTL,
                        FLA_Cntl_sub_copyt( cntl ) );

    // U11 = unit upper triangle of A11 (see the FLA_UPPER_TRIANGULAR and
    //       FLA_UNIT_DIAG arguments below);
    // U12 = A12;
    // Let WTL^T be conformal to B1.
    //
    // WTL^T = ( B1 * U11^T + B2 * U12^T ) * inv( triu(T11) );
    // WTL   = inv( triu(T11) )^T * ( U11 * B1^T + U12 * B2^T );

    // WTL = U11 * WTL;
    FLA_Trmm_internal( FLA_LEFT, FLA_UPPER_TRIANGULAR,
                       FLA_NO_TRANSPOSE, FLA_UNIT_DIAG,
                       FLA_ONE, A11, WTL,
                       FLA_Cntl_sub_trmm1( cntl ) );

    // WTL = WTL + U12 * B2^T;
    FLA_Gemm_internal( FLA_NO_TRANSPOSE, FLA_TRANSPOSE, 
                       FLA_ONE, A12, B2, FLA_ONE, WTL,
                       FLA_Cntl_sub_gemm1( cntl ) );

    // WTL = inv( triu(T11) )^T * WTL;
    FLA_Trsm_internal( FLA_LEFT, FLA_UPPER_TRIANGULAR,
                       FLA_TRANSPOSE, FLA_NONUNIT_DIAG,
                       FLA_ONE, T11, WTL,
                       FLA_Cntl_sub_trsm( cntl ) );

    // B2 = B2 - WTL^T * conj(U12);
    // B1 = B1 - WTL^T * conj(U11);
    //    = B1 - ( U11' * WTL )^T;

    FLA_Gemm_internal( FLA_TRANSPOSE, FLA_CONJ_NO_TRANSPOSE,
                       FLA_MINUS_ONE, WTL, A12, FLA_ONE, B2,
                       FLA_Cntl_sub_gemm2( cntl ) );

    // WTL is overwritten with -( U11' * WTL ) here, so the B2 update above
    // has to come first; the Axpyt then adds WTL^T to B1.
    FLA_Trmm_internal( FLA_LEFT, FLA_UPPER_TRIANGULAR,
                       FLA_CONJ_TRANSPOSE, FLA_UNIT_DIAG,
                       FLA_MINUS_ONE, A11, WTL,
                       FLA_Cntl_sub_trmm2( cntl ) );

    FLA_Axpyt_internal( FLA_TRANSPOSE, FLA_ONE, WTL, B1,
                        FLA_Cntl_sub_axpyt( cntl ) );

    /*------------------------------------------------------------*/

    FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR,       A00, A01, /**/ A02,
                                                     A10, A11, /**/ A12,
                            /* ************** */  /* ****************** */
                              &ABL, /**/ &ABR,       A20, A21, /**/ A22,
                              FLA_TL );

    FLA_Cont_with_3x3_to_2x2( &TWTL, /**/ &TWTR,       TW00, TW01, /**/ TW02,
                                                       TW10,  T11, /**/  W12,
                            /* **************** */  /* ********************* */
                              &TWBL, /**/ &TWBR,       TW20, TW21, /**/ TW22,
                              FLA_TL );

    FLA_Cont_with_1x3_to_1x2( &BL,  /**/ &BR,        B0, B1, /**/ B2,
                              FLA_LEFT );
  }

  return FLA_SUCCESS;
}
예제 #16
0
// Reduces A to bidiagonal form by calling the external LAPACK routine
// ?gebrd (s/d/c/z, chosen by the datatype of A) through the F77_ wrappers.
//
//   A   matrix to reduce; its buffer is passed to ?gebrd together with its
//       column stride as the leading dimension
//   tu  buffer passed as the first scalar-factor array of ?gebrd
//   tv  buffer passed as the second scalar-factor array of ?gebrd
//
// The diagonal and off-diagonal outputs of ?gebrd are written to temporary
// vectors d and e that are freed before returning, so they are not made
// available to the caller.
//
// Returns the LAPACK info value (0 if A has a datatype not handled below),
// or FLA_SUCCESS immediately when A has a zero dimension. When
// FLA_ENABLE_EXTERNAL_LAPACK_INTERFACES is not defined, the body reduces to
// FLA_Check_error_code( FLA_EXTERNAL_LAPACK_NOT_IMPLEMENTED ) and info is
// returned with its initial value of 0.
FLA_Error FLA_Bidiag_blk_external( FLA_Obj A, FLA_Obj tu, FLA_Obj tv )
{
  int          info = 0;
#ifdef FLA_ENABLE_EXTERNAL_LAPACK_INTERFACES
  FLA_Datatype datatype;
  int          m_A, n_A, cs_A;
  int          min_m_n, max_m_n;
  int          lwork;
  FLA_Obj      d, e, work_obj;

  // Argument checking runs only at the full error-checking level.
  if ( FLA_Check_error_level() == FLA_FULL_ERROR_CHECKING )
    FLA_Bidiag_check( A, tu, tv );

  // Nothing to do for an empty matrix.
  if ( FLA_Obj_has_zero_dim( A ) ) return FLA_SUCCESS;

  datatype = FLA_Obj_datatype( A );

  m_A      = FLA_Obj_length( A );
  n_A      = FLA_Obj_width( A );
  min_m_n  = FLA_Obj_min_dim( A );
  // NOTE(review): max_m_n is assigned here but never read afterwards.
  max_m_n  = FLA_Obj_max_dim( A );
  cs_A     = FLA_Obj_col_stride( A );

  // Temporary real vectors for the diagonal and off-diagonal that ?gebrd
  // produces.
  FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), min_m_n,     1, 0, 0, &d );
  FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), min_m_n - 1, 1, 0, 0, &e );

  // Workspace of (m_A + n_A) * blocksize elements of A's datatype.
  lwork    = (m_A + n_A) * FLA_Query_blocksize( datatype, FLA_DIMENSION_MIN );
  FLA_Obj_create( datatype, lwork, 1, 0, 0, &work_obj );


  // Dispatch to the ?gebrd variant matching the datatype of A. In the
  // complex cases d and e are still real-valued buffers.
  switch( datatype ){

  case FLA_FLOAT:
  {
    float* buff_A    = ( float * ) FLA_FLOAT_PTR( A );
    float* buff_d    = ( float * ) FLA_FLOAT_PTR( d );
    float* buff_e    = ( float * ) FLA_FLOAT_PTR( e );
    float* buff_tu   = ( float * ) FLA_FLOAT_PTR( tu );
    float* buff_tv   = ( float * ) FLA_FLOAT_PTR( tv );
    float* buff_work = ( float * ) FLA_FLOAT_PTR( work_obj );

    F77_sgebrd( &m_A,
                &n_A,
                buff_A, &cs_A,
                buff_d,
                buff_e,
                buff_tu,
                buff_tv,
                buff_work,
                &lwork,
                &info );

    break;
  }

  case FLA_DOUBLE:
  {
    double* buff_A    = ( double * ) FLA_DOUBLE_PTR( A );
    double* buff_d    = ( double * ) FLA_DOUBLE_PTR( d );
    double* buff_e    = ( double * ) FLA_DOUBLE_PTR( e );
    double* buff_tu   = ( double * ) FLA_DOUBLE_PTR( tu );
    double* buff_tv   = ( double * ) FLA_DOUBLE_PTR( tv );
    double* buff_work = ( double * ) FLA_DOUBLE_PTR( work_obj );

    F77_dgebrd( &m_A,
                &n_A,
                buff_A, &cs_A,
                buff_d,
                buff_e,
                buff_tu,
                buff_tv,
                buff_work,
                &lwork,
                &info );

    break;
  } 

  case FLA_COMPLEX:
  {
    scomplex* buff_A    = ( scomplex * ) FLA_COMPLEX_PTR( A );
    float*    buff_d    = ( float    * ) FLA_FLOAT_PTR( d );
    float*    buff_e    = ( float    * ) FLA_FLOAT_PTR( e );
    scomplex* buff_tu   = ( scomplex * ) FLA_COMPLEX_PTR( tu );
    scomplex* buff_tv   = ( scomplex * ) FLA_COMPLEX_PTR( tv );
    scomplex* buff_work = ( scomplex * ) FLA_COMPLEX_PTR( work_obj );

    F77_cgebrd( &m_A,
                &n_A,
                buff_A, &cs_A,
                buff_d,
                buff_e,
                buff_tu,
                buff_tv,
                buff_work,
                &lwork,
                &info );

    break;
  } 

  case FLA_DOUBLE_COMPLEX:
  {
    dcomplex* buff_A    = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( A );
    double*   buff_d    = ( double   * ) FLA_DOUBLE_PTR( d );
    double*   buff_e    = ( double   * ) FLA_DOUBLE_PTR( e );
    dcomplex* buff_tu   = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( tu );
    dcomplex* buff_tv   = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( tv );
    dcomplex* buff_work = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( work_obj );

    F77_zgebrd( &m_A,
                &n_A,
                buff_A, &cs_A,
                buff_d,
                buff_e,
                buff_tu,
                buff_tv,
                buff_work,
                &lwork,
                &info );

    break;
  } 

  }

  // Release the temporaries; only A, tu and tv carry results back.
  FLA_Obj_free( &d );
  FLA_Obj_free( &e );
  FLA_Obj_free( &work_obj );
#else
  FLA_Check_error_code( FLA_EXTERNAL_LAPACK_NOT_IMPLEMENTED );
#endif

  return info;
}
예제 #17
0
/*
  FLA_Svd_uv_var2_components

  Singular value decomposition driver (variant 2) that additionally reports
  how much wall-clock time (as measured by FLA_Clock()) each phase consumed.

  Parameters:
    n_iter_max  - forwarded unchanged to FLA_Bsvd_v_opt_var2().
    k_accum     - number of columns given to the Givens scalar matrices G, H.
    b_alg       - forwarded unchanged to FLA_Bsvd_v_opt_var2().
    A           - matrix to decompose; it is overwritten (bidiagonal
                  reduction / QR factorization are done in place, and on the
                  QR path it is also used as scratch for the final product).
    s           - receives the singular values (copied from d, then sorted).
    U, V        - receive the left / right singular vectors.
    dtime_bred  - out: time spent reducing to bidiagonal form.
    dtime_bsvd  - out: time spent in the bidiagonal SVD.
    dtime_appq  - out: time spent forming U, V (and Q on the QR path).
    dtime_qrfa  - out: time spent in the QR factorization (QR path only).
    dtime_gemm  - out: time spent in the final matrix multiply (QR path only).

  Every timing output is set to 0.0 on entry, so phases that a given path
  does not execute are reported as zero instead of being left unwritten.

  Returns:
    FLA_SUCCESS for the min-dimension-1 shortcut; otherwise the value
    returned by FLA_Bsvd_v_opt_var2() (r_val stays FLA_SUCCESS if that
    routine is never reached).

  Only m_A >= n_A is implemented; m_A < n_A raises FLA_NOT_YET_IMPLEMENTED
  through FLA_Check_error_code().
*/
FLA_Error FLA_Svd_uv_var2_components( dim_t n_iter_max, dim_t k_accum, dim_t b_alg,
                                      FLA_Obj A, FLA_Obj s, FLA_Obj U, FLA_Obj V,
                                      double* dtime_bred, double* dtime_bsvd, double* dtime_appq,
                                      double* dtime_qrfa, double* dtime_gemm )
{
	FLA_Error    r_val = FLA_SUCCESS;
	FLA_Datatype dt;
	FLA_Datatype dt_real;
	FLA_Datatype dt_comp;
	FLA_Obj      T, S, rL, rR, d, e, G, H, RG, RH, W;
	dim_t        m_A, n_A;
	dim_t        min_m_n;
	dim_t        n_GH;
	double       crossover_ratio = 17.0 / 9.0;
	double       dtime_temp;

	// Not every code path runs every phase (e.g. the direct path never does
	// a QR factorization or the trailing gemm, and the shortcut below runs
	// no timed phase at all). Start all timers at zero so the caller never
	// reads a value this function did not write.
	*dtime_bred = 0.0;
	*dtime_bsvd = 0.0;
	*dtime_appq = 0.0;
	*dtime_qrfa = 0.0;
	*dtime_gemm = 0.0;

	n_GH    = k_accum;

	m_A     = FLA_Obj_length( A );
	n_A     = FLA_Obj_width( A );
	min_m_n = FLA_Obj_min_dim( A );
	dt      = FLA_Obj_datatype( A );
	dt_real = FLA_Obj_datatype_proj_to_real( A );
	dt_comp = FLA_Obj_datatype_proj_to_complex( A );

	// If the matrix is a scalar, then the SVD is easy.
	// NOTE(review): the test is on the *minimum* dimension, so an m x 1 or
	// 1 x n matrix also takes this branch -- confirm callers only rely on it
	// for the 1 x 1 case.
	if ( min_m_n == 1 )
	{
		FLA_Copy( A, s );
		FLA_Set_to_identity( U );
		FLA_Set_to_identity( V );

		return FLA_SUCCESS;
	}

	// Create matrices to hold block Householder transformations.
	FLA_Bidiag_UT_create_T( A, &T, &S );

	// Create vectors to hold the realifying scalars.
	FLA_Obj_create( dt,      min_m_n,      1, 0, 0, &rL );
	FLA_Obj_create( dt,      min_m_n,      1, 0, 0, &rR );

	// Create vectors to hold the diagonal and off-diagonal of the
	// bidiagonal matrix.
	FLA_Obj_create( dt_real, min_m_n,      1, 0, 0, &d );
	FLA_Obj_create( dt_real, min_m_n-1,    1, 0, 0, &e );

	// Create matrices to hold the left and right Givens scalars.
	FLA_Obj_create( dt_comp, min_m_n-1, n_GH, 0, 0, &G );
	FLA_Obj_create( dt_comp, min_m_n-1, n_GH, 0, 0, &H );

	// Create matrices to hold the left and right Givens matrices, plus a
	// workspace W with the same dimensions as A.
	FLA_Obj_create( dt_real, min_m_n, min_m_n, 0, 0, &RG );
	FLA_Obj_create( dt_real, min_m_n, min_m_n, 0, 0, &RH );
	FLA_Obj_create( dt,      m_A,     n_A,     0, 0, &W );

	if ( m_A >= n_A )
	{
		if ( m_A < crossover_ratio * n_A )
		{
			// Direct path: bidiagonalize A itself.

			dtime_temp = FLA_Clock();
			{
			// Reduce the matrix to bidiagonal form.
			// Apply scalars to rotate the off-diagonal elements to the real domain.
			// Extract the diagonal and off-diagonal from A.
			FLA_Bidiag_UT( A, T, S );
			FLA_Bidiag_UT_realify( A, rL, rR );
			FLA_Bidiag_UT_extract_diagonals( A, d, e );
			}
			*dtime_bred = FLA_Clock() - dtime_temp;

			dtime_temp = FLA_Clock();
			{
			// Form U and V.
			FLA_Bidiag_UT_form_U( A, T, U );
			FLA_Bidiag_UT_form_V( A, S, V );
			}
			*dtime_appq = FLA_Clock() - dtime_temp;

			// Apply the realifying scalars in rL and rR to U and V, respectively.
			// Only the leading min_m_n columns of U and V are scaled.
			{
				FLA_Obj UL, UR;
				FLA_Obj VL, VR;

				FLA_Part_1x2( U,   &UL, &UR,   min_m_n, FLA_LEFT );
				FLA_Part_1x2( V,   &VL, &VR,   min_m_n, FLA_LEFT );

				FLA_Apply_diag_matrix( FLA_RIGHT, FLA_CONJUGATE,    rL, UL );
				FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, rR, VL );
			}

			dtime_temp = FLA_Clock();
			{
			// Perform a singular value decomposition on the bidiagonal matrix.
			r_val = FLA_Bsvd_v_opt_var2( n_iter_max, d, e, G, H, RG, RH, W, U, V, b_alg );
			}
			*dtime_bsvd = FLA_Clock() - dtime_temp;
		}
		else // if ( crossover_ratio * n_A <= m_A )
		{
			// Tall path: QR-factorize A first, then bidiagonalize the
			// n_A x n_A triangular factor R.

			FLA_Obj TQ, R;
			FLA_Obj AT,
			        AB;
			FLA_Obj UL, UR;

			// Row count (blocksize) of the QR block Householder matrix TQ.
			// It is fixed here rather than chosen by FLA_QR_UT_create_T().
			dim_t   b_qr = 32;

			FLA_Obj_create( dt, b_qr, n_A, 0, 0, &TQ );

			dtime_temp = FLA_Clock();
			{
			// Perform a QR factorization on A.
			FLA_QR_UT( A, TQ );
			}
			*dtime_qrfa = FLA_Clock() - dtime_temp;

			dtime_temp = FLA_Clock();
			{
			// Form Q in U.
			FLA_QR_UT_form_Q( A, TQ, U );
			}
			*dtime_appq = FLA_Clock() - dtime_temp;

			FLA_Obj_free( &TQ );

			// Set the lower triangle of R to zero and then copy the upper
			// triangle of A to R.
			FLA_Part_2x1( A,   &AT,
			                   &AB,   n_A, FLA_TOP );
			FLA_Obj_create( dt, n_A, n_A, 0, 0, &R );
			FLA_Setr( FLA_LOWER_TRIANGULAR, FLA_ZERO, R );
			FLA_Copyr( FLA_UPPER_TRIANGULAR, AT, R );

			dtime_temp = FLA_Clock();
			{
			// Reduce the matrix to bidiagonal form.
			// Apply scalars to rotate elements on the superdiagonal to the real domain.
			// Extract the diagonal and superdiagonal from R.
			FLA_Bidiag_UT( R, T, S );
			FLA_Bidiag_UT_realify( R, rL, rR );
			FLA_Bidiag_UT_extract_diagonals( R, d, e );
			}
			*dtime_bred = FLA_Clock() - dtime_temp;

			dtime_temp = FLA_Clock();
			{
			// Form V from right Householder vectors in upper triangle of R.
			FLA_Bidiag_UT_form_V( R, S, V );

			// Form U in R (R is overwritten with the left factor).
			FLA_Bidiag_UT_form_U( R, T, R );
			}
			// Accumulate onto the time already recorded for forming Q.
			*dtime_appq += FLA_Clock() - dtime_temp;

			// Apply the realifying scalars in rL and rR to U (held in R) and V,
			// respectively.
			FLA_Apply_diag_matrix( FLA_RIGHT, FLA_CONJUGATE,    rL, R );
			FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, rR, V );

			dtime_temp = FLA_Clock();
			{
			// Perform a singular value decomposition on the bidiagonal matrix.
			r_val = FLA_Bsvd_v_opt_var2( n_iter_max, d, e, G, H, RG, RH, W, R, V, b_alg );
			}
			*dtime_bsvd = FLA_Clock() - dtime_temp;

			dtime_temp = FLA_Clock();
			{
			// Multiply R into U, storing the result in A and then copying back
			// to U.
			FLA_Part_1x2( U,   &UL, &UR,   n_A, FLA_LEFT );
			FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
			          FLA_ONE, UL, R, FLA_ZERO, A );
			FLA_Copy( A, UL );
			}
			*dtime_gemm = FLA_Clock() - dtime_temp;

			FLA_Obj_free( &R );
		}
	}
	else // if ( m_A < n_A )
	{
		FLA_Check_error_code( FLA_NOT_YET_IMPLEMENTED );
	}

	// Copy the converged singular values to the output vector.
	FLA_Copy( d, s );

	// Sort the singular values and singular vectors in descending order.
	FLA_Sort_svd( FLA_BACKWARD, s, U, V );

	FLA_Obj_free( &T );
	FLA_Obj_free( &S );
	FLA_Obj_free( &rL );
	FLA_Obj_free( &rR );
	FLA_Obj_free( &d );
	FLA_Obj_free( &e );
	FLA_Obj_free( &G );
	FLA_Obj_free( &H );
	FLA_Obj_free( &RG );
	FLA_Obj_free( &RH );
	FLA_Obj_free( &W );

	return r_val;
}
/*
  FLA_Apply_Q_UT_lnfc_blk_var1

  Blocked update  B := Q B  (Q applied from the left, not transposed), where

    Q  =  H(0) H(1) ... H(k-1)

  is a forward product of Householder transformations and the vector that
  defines H(i) is stored below the diagonal in column i of A.

  Traversal
  ---------
  Written out, the operation is

    B :=  H(0) H(1) ... H(k-1) B

  so H(k-1) touches B first. Its vector sits in the last column of A, hence
  A is walked from the bottom-right corner toward the top-left.

  Block reflector
  ---------------
  b consecutive transformations are applied together through

    H  =  H(i) H(i+1) ... H(i+b-1)  =  I - U inv(T) U'

  with
    - U : strictly lower trapezoidal matrix (implicit unit diagonal) of the
          Householder vectors held below the diagonal of A in columns
          i .. i+b-1;
    - T : upper triangular block Householder matrix for those same vectors.

  One step is therefore

    B  :=  H B  =  B - U inv(T) U' B

  Partitioned form
  ----------------
  Split

    U -> / U11 \    B -> / B1 \
         \ U21 /         \ B2 /

  where U11 is the strictly lower triangle of A11 (implicit unit diagonal),
  U21 is A21, and T1 is the upper triangular block of the row-panel matrix T
  that belongs to the current vectors. Substituting,

    / B1 \  :=  / B1 \ - / U11 \ inv(T1) ( U11' B1 + U21' B2 )
    \ B2 /      \ B2 /   \ U21 /

  that is

    B1  :=  B1 - U11 inv(T1) ( U11' B1 + U21' B2 )
    B2  :=  B2 - U21 inv(T1) ( U11' B1 + U21' B2 )

  The factor  inv(T1) ( U11' B1 + U21' B2 )  is shared by both updates; it is
  formed once in workspace (WTL) and reused.

  Parameters:
    A    - holds the Householder vectors below its diagonal.
    T    - row-panel matrix of block Householder factors; its length is taken
           as the algorithmic blocksize.
    W    - workspace; its top-left b x width(B1) corner is used each step.
    B    - matrix updated in place.
    cntl - control tree supplying the sub-operation controls.

  Returns FLA_SUCCESS.
*/
FLA_Error FLA_Apply_Q_UT_lnfc_blk_var1( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl )
{
  FLA_Obj ATL, ATR, ABL, ABR;
  FLA_Obj A00, A01, A02,
          A10, A11, A12,
          A20, A21, A22;

  FLA_Obj TL, TR;
  FLA_Obj T0, T1, T2;
  FLA_Obj T1T, T2B;

  FLA_Obj WTL, WTR,
          WBL, WBR;

  FLA_Obj BT, BB;
  FLA_Obj B0, B1, B2;

  dim_t   m_A, n_A;
  dim_t   m_BR, n_BR;
  dim_t   nb_max, nb;

  // The length of T dictates the algorithmic blocksize.
  nb_max = FLA_Obj_length( T );

  // A is traversed from BR to TL, so for a non-square A the starting
  // bottom-right quadrant must absorb the excess rows (m > n) or the excess
  // columns (m < n). For square A both offsets are zero.
  m_A  = FLA_Obj_length( A );
  n_A  = FLA_Obj_width( A );
  m_BR = ( m_A > n_A ) ? m_A - n_A : 0;
  n_BR = ( m_A < n_A ) ? n_A - m_A : 0;

  FLA_Part_2x2( A, &ATL, &ATR,
                   &ABL, &ABR,
                m_BR, n_BR, FLA_BR );

  // T is tied to A: only its first min_dim(A) columns are in play.
  FLA_Part_1x2( T, &TL, &TR, FLA_Obj_min_dim( A ), FLA_LEFT );

  FLA_Part_2x1( B, &BT,
                   &BB,
                m_BR, FLA_BOTTOM );

  while ( FLA_Obj_min_dim( ATL ) > 0 )
  {
    dim_t k_left = FLA_Obj_min_dim( ATL );

    nb = ( nb_max < k_left ) ? nb_max : k_left;

    // T was filled left to right but is consumed right to left, so the very
    // first block taken (nothing yet in TR) may be the short trailing one.
    if ( FLA_Obj_width( TR ) == 0 )
    {
      dim_t n_tail = FLA_Obj_width( T ) % nb_max;

      if ( n_tail > 0 )
        nb = n_tail;
    }

    FLA_Repart_2x2_to_3x3( ATL, ATR,    &A00, &A01, &A02,
                                        &A10, &A11, &A12,
                           ABL, ABR,    &A20, &A21, &A22,
                           nb, nb, FLA_TL );

    FLA_Repart_1x2_to_1x3( TL, TR,      &T0, &T1, &T2,
                           nb, FLA_LEFT );

    FLA_Repart_2x1_to_3x1( BT,          &B0,
                                        &B1,
                           BB,          &B2,
                           nb, FLA_TOP );

    /* ---------------------------------------------------------------- */

    // Trim T1 and W to the shapes needed for this step.
    FLA_Part_2x1( T1, &T1T,
                      &T2B,
                  nb, FLA_TOP );

    FLA_Part_2x2( W, &WTL, &WTR,
                     &WBL, &WBR,
                  nb, FLA_Obj_width( B1 ), FLA_TL );

    // Step 1:  WTL = B1
    FLA_Copyt_internal( FLA_NO_TRANSPOSE, B1, WTL,
                        FLA_Cntl_sub_copyt( cntl ) );

    // Step 2:  WTL = inv( triu(T1T) ) * ( U11' * B1 + U21' * B2 )
    //          with U11 = trilu( A11 ) and U21 = A21.
    FLA_Trmm_internal( FLA_LEFT, FLA_LOWER_TRIANGULAR,
                       FLA_CONJ_TRANSPOSE, FLA_UNIT_DIAG,
                       FLA_ONE, A11, WTL,
                       FLA_Cntl_sub_trmm1( cntl ) );

    FLA_Gemm_internal( FLA_CONJ_TRANSPOSE, FLA_NO_TRANSPOSE,
                       FLA_ONE, A21, B2, FLA_ONE, WTL,
                       FLA_Cntl_sub_gemm1( cntl ) );

    FLA_Trsm_internal( FLA_LEFT, FLA_UPPER_TRIANGULAR,
                       FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG,
                       FLA_ONE, T1T, WTL,
                       FLA_Cntl_sub_trsm( cntl ) );

    // Step 3:  B2 = B2 - U21 * WTL
    FLA_Gemm_internal( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
                       FLA_MINUS_ONE, A21, WTL, FLA_ONE, B2,
                       FLA_Cntl_sub_gemm2( cntl ) );

    // Step 4:  B1 = B1 - U11 * WTL
    //          (WTL is first overwritten with -U11 * WTL, then added to B1).
    FLA_Trmm_internal( FLA_LEFT, FLA_LOWER_TRIANGULAR,
                       FLA_NO_TRANSPOSE, FLA_UNIT_DIAG,
                       FLA_MINUS_ONE, A11, WTL,
                       FLA_Cntl_sub_trmm2( cntl ) );

    FLA_Axpyt_internal( FLA_NO_TRANSPOSE, FLA_ONE, WTL, B1,
                        FLA_Cntl_sub_axpyt( cntl ) );

    /* ---------------------------------------------------------------- */

    FLA_Cont_with_3x3_to_2x2( &ATL, &ATR,    A00, A01, A02,
                                             A10, A11, A12,
                              &ABL, &ABR,    A20, A21, A22,
                              FLA_BR );

    FLA_Cont_with_1x3_to_1x2( &TL, &TR,      T0, T1, T2,
                              FLA_RIGHT );

    FLA_Cont_with_3x1_to_2x1( &BT,           B0,
                                             B1,
                              &BB,           B2,
                              FLA_BOTTOM );
  }

  return FLA_SUCCESS;
}
예제 #19
0
/*
  FLA_QR_UT_piv_blk_var2

  Blocked QR factorization with column pivoting via the UT transform.
  A is swept from the top-left to the bottom-right in steps of at most
  length(T) columns; T, p and w are advanced in lock-step with A.

  Parameters:
    A    - matrix factored in place.
    T    - row-panel matrix; its length is the algorithmic blocksize. The
           part to the right of the current block (W12) also carries the
           partial updates consumed by the block update below.
    w    - vector handed (its not-yet-processed part, wB) to the pivoted
           sub-factorization.
    p    - pivot vector; p1 receives the pivots of the current block.
    cntl - control tree supplying the sub-factorization control.

  Returns FLA_SUCCESS.
*/
FLA_Error FLA_QR_UT_piv_blk_var2( FLA_Obj A, FLA_Obj T, FLA_Obj w, FLA_Obj p, fla_qrut_t* cntl )
{
  FLA_Obj ATL, ATR, ABL, ABR;
  FLA_Obj A00, A01, A02,
          A10, A11, A12,
          A20, A21, A22;

  FLA_Obj TL, TR;
  FLA_Obj T0, T1, W12;
  FLA_Obj TT, TB;

  FLA_Obj pT, pB;
  FLA_Obj p0, p1, p2;

  FLA_Obj wT, wB;
  FLA_Obj w0, w1, w2;

  dim_t   nb_max, nb;

  // The length of T dictates the algorithmic blocksize.
  nb_max = FLA_Obj_length( T );

  FLA_Part_2x2( A, &ATL, &ATR,
                   &ABL, &ABR,
                0, 0, FLA_TL );

  FLA_Part_1x2( T, &TL, &TR, 0, FLA_LEFT );

  FLA_Part_2x1( p, &pT,
                   &pB,
                0, FLA_TOP );

  FLA_Part_2x1( w, &wT,
                   &wB,
                0, FLA_TOP );

  while ( FLA_Obj_min_dim( ABR ) > 0 )
  {
    dim_t k_left = FLA_Obj_min_dim( ABR );

    nb = ( nb_max < k_left ) ? nb_max : k_left;

    FLA_Repart_2x2_to_3x3( ATL, ATR,    &A00, &A01, &A02,
                                        &A10, &A11, &A12,
                           ABL, ABR,    &A20, &A21, &A22,
                           nb, nb, FLA_BR );

    FLA_Repart_1x2_to_1x3( TL, TR,      &T0, &T1, &W12,
                           nb, FLA_RIGHT );

    FLA_Repart_2x1_to_3x1( pT,          &p0,
                                        &p1,
                           pB,          &p2,
                           nb, FLA_BOTTOM );

    FLA_Repart_2x1_to_3x1( wT,          &w0,
                                        &w1,
                           wB,          &w2,
                           nb, FLA_BOTTOM );

    /* ---------------------------------------------------------------- */

    // Keep only the top nb rows of the remaining part of T so that it
    // matches the current blocksize.
    FLA_Part_2x1( TR, &TT,
                      &TB,
                  nb, FLA_TOP );

    // Unblocked (BLAS2-oriented) pivoted QR via the UT transform on ABR:
    //
    //   ABR  ->  QB1 R11
    //
    //  - QB1 is defined by UB1 (stored column-wise below the diagonal of
    //    ( A11 A21 )^T) together with the upper triangle of T1;
    //  - R11 ends up in ( A11 A12 );
    //  - W12 holds T and the partial updates for FLA_Apply_Q_UT_piv_var.
    FLA_QR_UT_piv_internal( ABR, TT, wB, p1,
                            FLA_Cntl_sub_qrut( cntl ) );

    if ( FLA_Obj_width( A12 ) > 0 )
    {
      // Block update of the trailing matrix:  A22 = A22 - A21 * TT,
      // where TT is now the top nb rows of W12.
      FLA_Part_2x1( W12, &TT,
                         &TB,
                    nb, FLA_TOP );

      FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
                         FLA_MINUS_ONE, A21, TT, FLA_ONE, A22 );
    }

    // Propagate this block's pivots to the columns above it (ATR).
    FLA_Apply_pivots( FLA_RIGHT, FLA_TRANSPOSE, p1, ATR );

    /* ---------------------------------------------------------------- */

    FLA_Cont_with_3x3_to_2x2( &ATL, &ATR,    A00, A01, A02,
                                             A10, A11, A12,
                              &ABL, &ABR,    A20, A21, A22,
                              FLA_TL );

    FLA_Cont_with_1x3_to_1x2( &TL, &TR,      T0, T1, W12,
                              FLA_LEFT );

    FLA_Cont_with_3x1_to_2x1( &pT,           p0,
                                             p1,
                              &pB,           p2,
                              FLA_TOP );

    FLA_Cont_with_3x1_to_2x1( &wT,           w0,
                                             w1,
                              &wB,           w2,
                              FLA_TOP );
  }

  return FLA_SUCCESS;
}
예제 #20
0
/*
  FLA_Copyr_u_blk_var4

  Blocked copy of the upper triangle of A into B. Both matrices are swept
  from the bottom-right toward the top-left; at each step the diagonal
  block's upper triangle and the panel above it are copied.

  Parameters:
    A    - source matrix.
    B    - destination matrix, partitioned conformally with A.
    cntl - control tree supplying the blocksize and the sub-copy controls.

  Returns FLA_SUCCESS.
*/
FLA_Error FLA_Copyr_u_blk_var4( FLA_Obj A, FLA_Obj B, fla_copyr_t* cntl )
{
  FLA_Obj ATL, ATR, ABL, ABR;
  FLA_Obj A00, A01, A02,
          A10, A11, A12,
          A20, A21, A22;

  FLA_Obj BTL, BTR, BBL, BBR;
  FLA_Obj B00, B01, B02,
          B10, B11, B12,
          B20, B21, B22;

  dim_t   nb;

  // Start with everything in the top-left quadrants.
  FLA_Part_2x2( A, &ATL, &ATR,
                   &ABL, &ABR,
                0, 0, FLA_BR );

  FLA_Part_2x2( B, &BTL, &BTR,
                   &BBL, &BBR,
                0, 0, FLA_BR );

  while ( FLA_Obj_min_dim( ATL ) > 0 )
  {
    nb = FLA_Determine_blocksize( ATL, FLA_TL, FLA_Cntl_blocksize( cntl ) );

    FLA_Repart_2x2_to_3x3( ATL, ATR,    &A00, &A01, &A02,
                                        &A10, &A11, &A12,
                           ABL, ABR,    &A20, &A21, &A22,
                           nb, nb, FLA_TL );

    FLA_Repart_2x2_to_3x3( BTL, BTR,    &B00, &B01, &B02,
                                        &B10, &B11, &B12,
                           BBL, BBR,    &B20, &B21, &B22,
                           nb, nb, FLA_TL );

    /* ---------------------------------------------------------------- */

    // Diagonal block:  B11 = triu( A11 )
    FLA_Copyr_internal( FLA_UPPER_TRIANGULAR, A11, B11,
                        FLA_Cntl_sub_copyr( cntl ) );

    // Panel above the diagonal block:  B01 = A01
    FLA_Copy_internal( A01, B01,
                       FLA_Cntl_sub_copy( cntl ) );

    /* ---------------------------------------------------------------- */

    FLA_Cont_with_3x3_to_2x2( &ATL, &ATR,    A00, A01, A02,
                                             A10, A11, A12,
                              &ABL, &ABR,    A20, A21, A22,
                              FLA_BR );

    FLA_Cont_with_3x3_to_2x2( &BTL, &BTR,    B00, B01, B02,
                                             B10, B11, B12,
                              &BBL, &BBR,    B20, B21, B22,
                              FLA_BR );
  }

  return FLA_SUCCESS;
}
/*
  FLA_Apply_Q_UT_lhfr_blk_var1

  Blocked update  B := Q' B  (conjugate-transpose of Q applied from the
  left), where

    Q  =  H(0) H(1) ... H(k-1)

  is a forward product of Householder transformations and the vector that
  defines H(i) is stored above the diagonal in row i of A.

  Traversal
  ---------
  Written out, the operation is

    B :=  ( H(0) H(1) ... H(k-1) )' B
       =  H(k-1)' ... H(1)' H(0)' B

  so H(0) touches B first. Its vector sits in the first row of A, hence A is
  walked from the top-left corner toward the bottom-right.

  Block reflector
  ---------------
  b consecutive transformations are applied together through

    H  =  ( H(i) H(i+1) ... H(i+b-1) )'  =  ( I - U inv(T) U' )'

  with
    - U^T : strictly upper trapezoidal matrix (implicit unit diagonal) of
            the Householder vectors held above the diagonal of A in rows
            i .. i+b-1;
    - T   : upper triangular block Householder matrix for those vectors.

  One step is therefore

    B  :=  H B  =  ( I - U inv(T)' U' ) B  =  B - U inv(T)' U' B

  Partitioned form
  ----------------
  Split

    U^T -> ( U11 U12 )    B -> / B1 \    T -> ( T1 T2 )
                               \ B2 /

  where U11 is the strictly upper triangle of A11 (implicit unit diagonal),
  U12 is A12, and T1 is an upper triangular block of the row-panel matrix T.
  Substituting,

    / B1 \  :=  / B1 \ - / U11^T \ inv(T1)' ( conj(U11) B1 + conj(U12) B2 )
    \ B2 /      \ B2 /   \ U12^T /

  that is

    B1  :=  B1 - U11^T inv(T1)' ( conj(U11) B1 + conj(U12) B2 )
    B2  :=  B2 - U12^T inv(T1)' ( conj(U11) B1 + conj(U12) B2 )

  The factor  inv(T1)' ( conj(U11) B1 + conj(U12) B2 )  is shared by both
  updates; it is formed once in workspace (WTL) and reused.

  Parameters:
    A    - holds the Householder vectors above its diagonal.
    T    - row-panel matrix of block Householder factors; its length is taken
           as the algorithmic blocksize.
    W    - workspace; its top-left b x width(B1) corner is used each step.
    B    - matrix updated in place.
    cntl - control tree supplying the sub-operation controls.

  Returns FLA_SUCCESS.
*/
FLA_Error FLA_Apply_Q_UT_lhfr_blk_var1( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl )
{
  FLA_Obj ATL, ATR, ABL, ABR;
  FLA_Obj A00, A01, A02,
          A10, A11, A12,
          A20, A21, A22;

  FLA_Obj TL, TR;
  FLA_Obj T0, T1, T2;
  FLA_Obj T1T, T2B;

  FLA_Obj WTL, WTR,
          WBL, WBR;

  FLA_Obj BT, BB;
  FLA_Obj B0, B1, B2;

  dim_t   nb_max, nb;

  // The length of T dictates the algorithmic blocksize.
  nb_max = FLA_Obj_length( T );

  FLA_Part_2x2( A, &ATL, &ATR,
                   &ABL, &ABR,
                0, 0, FLA_TL );

  FLA_Part_1x2( T, &TL, &TR, 0, FLA_LEFT );

  FLA_Part_2x1( B, &BT,
                   &BB,
                0, FLA_TOP );

  while ( FLA_Obj_min_dim( ABR ) > 0 )
  {
    dim_t k_left = FLA_Obj_min_dim( ABR );

    nb = ( nb_max < k_left ) ? nb_max : k_left;

    FLA_Repart_2x2_to_3x3( ATL, ATR,    &A00, &A01, &A02,
                                        &A10, &A11, &A12,
                           ABL, ABR,    &A20, &A21, &A22,
                           nb, nb, FLA_BR );

    FLA_Repart_1x2_to_1x3( TL, TR,      &T0, &T1, &T2,
                           nb, FLA_RIGHT );

    FLA_Repart_2x1_to_3x1( BT,          &B0,
                                        &B1,
                           BB,          &B2,
                           nb, FLA_BOTTOM );

    /* ---------------------------------------------------------------- */

    // Trim T1 and W to the shapes needed for this step.
    FLA_Part_2x1( T1, &T1T,
                      &T2B,
                  nb, FLA_TOP );

    FLA_Part_2x2( W, &WTL, &WTR,
                     &WBL, &WBR,
                  nb, FLA_Obj_width( B1 ), FLA_TL );

    // Step 1:  WTL = B1
    FLA_Copyt_internal( FLA_NO_TRANSPOSE, B1, WTL,
                        FLA_Cntl_sub_copyt( cntl ) );

    // Step 2:  WTL = inv( triu(T1T) )' * ( conj(U11) * B1 + conj(U12) * B2 )
    //          with U11 = triuu( A11 ) and U12 = A12.
    FLA_Trmm_internal( FLA_LEFT, FLA_UPPER_TRIANGULAR,
                       FLA_CONJ_NO_TRANSPOSE, FLA_UNIT_DIAG,
                       FLA_ONE, A11, WTL,
                       FLA_Cntl_sub_trmm1( cntl ) );

    FLA_Gemm_internal( FLA_CONJ_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
                       FLA_ONE, A12, B2, FLA_ONE, WTL,
                       FLA_Cntl_sub_gemm1( cntl ) );

    FLA_Trsm_internal( FLA_LEFT, FLA_UPPER_TRIANGULAR,
                       FLA_CONJ_TRANSPOSE, FLA_NONUNIT_DIAG,
                       FLA_ONE, T1T, WTL,
                       FLA_Cntl_sub_trsm( cntl ) );

    // Step 3:  B2 = B2 - U12^T * WTL
    FLA_Gemm_internal( FLA_TRANSPOSE, FLA_NO_TRANSPOSE,
                       FLA_MINUS_ONE, A12, WTL, FLA_ONE, B2,
                       FLA_Cntl_sub_gemm2( cntl ) );

    // Step 4:  B1 = B1 - U11^T * WTL
    //          (WTL is first overwritten with -U11^T * WTL, then added to B1).
    FLA_Trmm_internal( FLA_LEFT, FLA_UPPER_TRIANGULAR,
                       FLA_TRANSPOSE, FLA_UNIT_DIAG,
                       FLA_MINUS_ONE, A11, WTL,
                       FLA_Cntl_sub_trmm2( cntl ) );

    FLA_Axpyt_internal( FLA_NO_TRANSPOSE, FLA_ONE, WTL, B1,
                        FLA_Cntl_sub_axpyt( cntl ) );

    /* ---------------------------------------------------------------- */

    FLA_Cont_with_3x3_to_2x2( &ATL, &ATR,    A00, A01, A02,
                                             A10, A11, A12,
                              &ABL, &ABR,    A20, A21, A22,
                              FLA_TL );

    FLA_Cont_with_1x3_to_1x2( &TL, &TR,      T0, T1, T2,
                              FLA_LEFT );

    FLA_Cont_with_3x1_to_2x1( &BT,           B0,
                                             B1,
                              &BB,           B2,
                              FLA_TOP );
  }

  return FLA_SUCCESS;
}
예제 #22
0
/*
  FLA_QR_UT_blk_var1

  Blocked QR factorization via the UT transform. A is swept from the
  top-left to the bottom-right; at each step the current column panel is
  factored and the resulting block reflector is applied to the columns to
  its right.

  Parameters:
    A    - matrix factored in place.
    T    - row-panel matrix receiving the block Householder factors; its
           length is the algorithmic blocksize. The not-yet-used part to the
           right of the current block (W12) is passed as workspace to the
           trailing update.
    cntl - control tree supplying the sub-factorization and apply-Q controls.

  Returns FLA_SUCCESS.
*/
FLA_Error FLA_QR_UT_blk_var1( FLA_Obj A, FLA_Obj T, fla_qrut_t* cntl )
{
  FLA_Obj ATL, ATR, ABL, ABR;
  FLA_Obj A00, A01, A02,
          A10, A11, A12,
          A20, A21, A22;

  FLA_Obj TL, TR;
  FLA_Obj T0, T1, W12;
  FLA_Obj T1T, T2B;

  FLA_Obj AB1, AB2;

  dim_t   nb_max, nb;

  // The length of T dictates the algorithmic blocksize.
  nb_max = FLA_Obj_length( T );

  FLA_Part_2x2( A, &ATL, &ATR,
                   &ABL, &ABR,
                0, 0, FLA_TL );

  FLA_Part_1x2( T, &TL, &TR, 0, FLA_LEFT );

  while ( FLA_Obj_min_dim( ABR ) > 0 )
  {
    dim_t k_left = FLA_Obj_min_dim( ABR );

    nb = ( nb_max < k_left ) ? nb_max : k_left;

    FLA_Repart_2x2_to_3x3( ATL, ATR,    &A00, &A01, &A02,
                                        &A10, &A11, &A12,
                           ABL, ABR,    &A20, &A21, &A22,
                           nb, nb, FLA_BR );

    FLA_Repart_1x2_to_1x3( TL, TR,      &T0, &T1, &W12,
                           nb, FLA_RIGHT );

    /* ---------------------------------------------------------------- */

    // View the current column panel ( A11 over A21 ) as one object, and
    // keep only the top nb rows of T1.
    FLA_Merge_2x1( A11,
                   A21, &AB1 );

    FLA_Part_2x1( T1, &T1T,
                      &T2B,
                  nb, FLA_TOP );

    // Factor the panel via the UT transform:
    //
    //   / A11 \  ->  QB1 R11
    //   \ A21 /
    //
    //  - QB1 is defined by UB1 (stored column-wise below the diagonal of
    //    AB1) and T11 (stored in the upper triangle of T1T);
    //  - R11 overwrites the upper triangle of AB1.
    FLA_QR_UT_internal( AB1, T1T,
                        FLA_Cntl_sub_qrut( cntl ) );

    if ( FLA_Obj_width( A12 ) > 0 )
    {
      // View the trailing columns ( A12 over A22 ) as one object.
      FLA_Merge_2x1( A12,
                     A22, &AB2 );

      // Apply the Householder transforms defined by UB1 and T11:
      //
      //   / A12 \  :=  QB1' / A12 \
      //   \ A22 /           \ A22 /
      FLA_Apply_Q_UT_internal( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_FORWARD, FLA_COLUMNWISE,
                               AB1, T1T, W12, AB2,
                               FLA_Cntl_sub_apqut( cntl ) );
    }

    /* ---------------------------------------------------------------- */

    FLA_Cont_with_3x3_to_2x2( &ATL, &ATR,    A00, A01, A02,
                                             A10, A11, A12,
                              &ABL, &ABR,    A20, A21, A22,
                              FLA_TL );

    FLA_Cont_with_1x3_to_1x2( &TL, &TR,      T0, T1, W12,
                              FLA_LEFT );
  }

  return FLA_SUCCESS;
}
예제 #23
0
/*
  FLA_LQ_UT_unb_var2

  Unblocked LQ factorization via the UT transform. A and T are swept
  together from the top-left to the bottom-right, one row/column at a time.
  Each step computes one Householder transform from the current row of A,
  applies it from the right to the rows below, and fills in the matching
  column of T.

  Parameters:
    A - matrix factored in place.
    T - matrix receiving tau11 on its diagonal and t01 above it.

  Returns FLA_SUCCESS.
*/
FLA_Error FLA_LQ_UT_unb_var2( FLA_Obj A, FLA_Obj T )
{
  FLA_Obj ATL, ATR, ABL, ABR;
  FLA_Obj A00,  a01,     A02,
          a10t, alpha11, a12t,
          A20,  a21,     A22;

  FLA_Obj TTL, TTR, TBL, TBR;
  FLA_Obj T00,  t01,   T02,
          t10t, tau11, t12t,
          T20,  t21,   T22;

  FLA_Part_2x2( A, &ATL, &ATR,
                   &ABL, &ABR,
                0, 0, FLA_TL );

  FLA_Part_2x2( T, &TTL, &TTR,
                   &TBL, &TBR,
                0, 0, FLA_TL );

  while ( FLA_Obj_min_dim( ABR ) > 0 )
  {
    FLA_Repart_2x2_to_3x3( ATL, ATR,    &A00,  &a01,     &A02,
                                        &a10t, &alpha11, &a12t,
                           ABL, ABR,    &A20,  &a21,     &A22,
                           1, 1, FLA_BR );

    FLA_Repart_2x2_to_3x3( TTL, TTR,    &T00,  &t01,   &T02,
                                        &t10t, &tau11, &t12t,
                           TBL, TBR,    &T20,  &t21,   &T22,
                           1, 1, FLA_BR );

    /* ---------------------------------------------------------------- */

    // From alpha11 and a12t, compute tau11 and u12t defining a Householder
    // transform H which, applied from the right to the row ( alpha11 a12t ),
    // zeroes the entries of a12t and updates alpha11. u12t overwrites a12t.
    FLA_Househ2_UT( FLA_RIGHT, alpha11, a12t,
                    tau11 );

    // Apply H (built from tau11 and u12t) to the rows below:
    //   ( a21 A22 ) = ( a21 A22 ) H
    FLA_Apply_H2_UT( FLA_RIGHT, tau11, a12t, a21, A22 );

    // Fill the column of T above tau11:
    //   t01 = conj(a01) + conj(A02) * u12t^T
    FLA_Copyt_external( FLA_CONJ_NO_TRANSPOSE, a01, t01 );
    FLA_Gemvc_external( FLA_CONJ_NO_TRANSPOSE, FLA_NO_CONJUGATE, FLA_ONE, A02, a12t, FLA_ONE, t01 );

    /* ---------------------------------------------------------------- */

    FLA_Cont_with_3x3_to_2x2( &ATL, &ATR,    A00,  a01,     A02,
                                             a10t, alpha11, a12t,
                              &ABL, &ABR,    A20,  a21,     A22,
                              FLA_TL );

    FLA_Cont_with_3x3_to_2x2( &TTL, &TTR,    T00,  t01,   T02,
                                             t10t, tau11, t12t,
                              &TBL, &TBR,    T20,  t21,   T22,
                              FLA_TL );
  }

  return FLA_SUCCESS;
}
/*
  Interface summary (derived from the body below):

    A    - matrix holding the Householder vectors below its diagonal (see
           the derivation that follows).
    T    - row panel of triangular factors; its length (number of rows) is
           used as the algorithmic blocksize.
    W    - workspace; each iteration uses its top-left b x length(B1) block.
    B    - matrix updated in place, one column panel B1 (together with the
           trailing panel B2) per iteration.
    cntl - control tree; its copyt, trmm1, gemm1, trsm, gemm2, trmm2 and
           axpyt sub-trees are forwarded to the matching *_internal calls.

  Always returns FLA_SUCCESS; return values of the inner calls are not
  inspected.
*/
FLA_Error FLA_Apply_Q_UT_rhbc_blk_var1( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl )
/*
  Apply the conjugate-transpose of a unitary matrix Q to a matrix B from the
  right,

    B :=  B Q'

  where Q is the backward product of Householder transformations:

    Q  =  H(k-1) ... H(1) H(0)

  where H(i) corresponds to the Householder vector stored below the diagonal
  in the ith column of A. Thus, the operation becomes:

    B :=  B Q'
       =  B ( H(k-1) ... H(1) H(0) )'
       =  B ( H(k-1)' ... H(1)' H(0)' )'
       =  B ( H(0) H(1) ... H(k-1) )
       =  B H(0) H(1) ... H(k-1)

  From this, we can see that we must move through A from top-left to bottom-
  right, since the Householder vector for H(0) was stored in the first column
  of A. We intend to apply blocks of reflectors at a time, where a block
  reflector H of b consecutive Householder transforms may be expressed as:

    H  =  ( H(i) H(i+1) ... H(i+b-1) )
       =  ( I - U inv(T) U' )

  where:
    - U is the strictly lower trapezoidal (with implicit unit diagonal) matrix
      of Householder vectors, stored below the diagonal of A in columns i through
      i+b-1, corresponding to H(i) through H(i+b-1).
    - T is the upper triangular block Householder matrix corresponding to
      Householder vectors i through i+b-1.

  Consider applying H to B as an intermediate step towards applying all of Q':

    B  :=  B H
        =  B ( I - U inv(T) U' )
        =  B - B U inv(T) U'

  We must move from top-left to bottom-right. So, we partition:

    U -> / U11 \  B -> ( B1 B2 )  T -> ( T1 T2 )
         \ U21 /
  where:
    - U11 is stored in strictly lower triangle of A11 with implicit unit
      diagonal.
    - U21 is stored in A21.
    - T1 is an upper triangular block of row-panel matrix T.

  Substituting repartitioned U, B, and T, we have:

    ( B1 B2 ) :=  ( B1 B2 ) - ( B1 B2 ) / U11 \ inv(T1) / U11 \'
                                        \ U21 /         \ U21 /
               =  ( B1 B2 ) - ( B1 B2 ) / U11 \ inv(T1) ( U11' U21' )
                                        \ U21 /
               =  ( B1 B2 ) - ( B1 U11 + B2 U21 ) inv(T1) ( U11' U21' )

  Thus, B1 is updated as:

      B1    :=      B1   -   ( B1 U11 + B2 U21 ) inv(T1) U11'

  And B2 is updated as:

      B2    :=      B2   -   ( B1 U11 + B2 U21 ) inv(T1) U21'

  Note that:

    ( B1 U11 + B2 U21 ) inv(T1)

  is common to both updates, and thus may be computed and stored in
  workspace, and then re-used.

  -FGVZ
*/
{
  FLA_Obj ATL,   ATR,      A00, A01, A02, 
          ABL,   ABR,      A10, A11, A12,
                           A20, A21, A22;

  FLA_Obj TL,    TR,       T0,  T1,  T2;

  FLA_Obj T1T,
          T2B;

  FLA_Obj WTL,  WTR,
          WBL,  WBR;

  FLA_Obj BL,    BR,       B0,  B1,  B2;

  dim_t   b_alg, b;

  // Query the algorithmic blocksize by inspecting the length of T.
  b_alg = FLA_Obj_length( T );

  // Start with empty top-left / left partitions: A, T and B are all
  // traversed from the top-left (left) towards the bottom-right (right).
  FLA_Part_2x2( A,    &ATL, &ATR,
                      &ABL, &ABR,     0, 0, FLA_TL );

  FLA_Part_1x2( T,    &TL,  &TR,      0, FLA_LEFT );

  FLA_Part_1x2( B,    &BL,  &BR,      0, FLA_LEFT );

  while ( FLA_Obj_min_dim( ABR ) > 0 ){

    // The final block may be narrower than the algorithmic blocksize.
    b = min( b_alg, FLA_Obj_min_dim( ABR ) );

    FLA_Repart_2x2_to_3x3( ATL, /**/ ATR,       &A00, /**/ &A01, &A02,
                        /* ************* */   /* ******************** */
                                                &A10, /**/ &A11, &A12,
                           ABL, /**/ ABR,       &A20, /**/ &A21, &A22,
                           b, b, FLA_BR );

    FLA_Repart_1x2_to_1x3( TL,  /**/ TR,        &T0, /**/ &T1, &T2,
                           b, FLA_RIGHT );

    FLA_Repart_1x2_to_1x3( BL,  /**/ BR,        &B0, /**/ &B1, &B2,
                           b, FLA_RIGHT );

    /*------------------------------------------------------------*/

    // Use only the top b rows of T1, so that T1T is b x b even when b is
    // smaller than the length of T.
    FLA_Part_2x1( T1,    &T1T, 
                         &T2B,     b, FLA_TOP );

    // Carve a b x length(B1) block out of the workspace; it holds B1^T.
    FLA_Part_2x2( W,     &WTL, &WTR,
                         &WBL, &WBR,     b, FLA_Obj_length( B1 ), FLA_TL );

    // WTL = B1^T;

    FLA_Copyt_internal( FLA_TRANSPOSE, B1, WTL,
                        FLA_Cntl_sub_copyt( cntl ) );

    // U11 = trilu( A11 );
    // U21 = A21;
    // Let WTL^T be conformal to B1.
    //
    // WTL^T = ( B1 * U11 + B2 * U21 ) * inv( triu(T1T) );
    // WTL   = inv( triu(T1T)^T ) * ( U11^T * B1^T + U21^T * B2^T );

    // WTL = U11^T * WTL
    FLA_Trmm_internal( FLA_LEFT, FLA_LOWER_TRIANGULAR,
                       FLA_TRANSPOSE, FLA_UNIT_DIAG,
                       FLA_ONE, A11, WTL,
                       FLA_Cntl_sub_trmm1( cntl ) );

    // WTL = WTL + U21^T * B2^T
    FLA_Gemm_internal( FLA_TRANSPOSE, FLA_TRANSPOSE, 
                       FLA_ONE, A21, B2, FLA_ONE, WTL,
                       FLA_Cntl_sub_gemm1( cntl ) );

    // WTL = inv( triu(T1T)^T ) * WTL
    FLA_Trsm_internal( FLA_LEFT, FLA_UPPER_TRIANGULAR,
                       FLA_TRANSPOSE, FLA_NONUNIT_DIAG,
                       FLA_ONE, T1T, WTL,
                       FLA_Cntl_sub_trsm( cntl ) );

    // B2 = B2 - WTL^T * U21';
    // B1 = B1 - WTL^T * U11';
    //    = B1 - ( conj(U11) * WTL )^T;

    // B2 must be updated before WTL is overwritten by the Trmm below.
    FLA_Gemm_internal( FLA_TRANSPOSE, FLA_CONJ_TRANSPOSE,
                       FLA_MINUS_ONE, WTL, A21, FLA_ONE, B2,
                       FLA_Cntl_sub_gemm2( cntl ) );

    // WTL = -conj(U11) * WTL (the minus sign is folded into the scalar).
    FLA_Trmm_internal( FLA_LEFT, FLA_LOWER_TRIANGULAR,
                       FLA_CONJ_NO_TRANSPOSE, FLA_UNIT_DIAG,
                       FLA_MINUS_ONE, A11, WTL,
                       FLA_Cntl_sub_trmm2( cntl ) );

    // B1 = B1 + WTL^T
    FLA_Axpyt_internal( FLA_TRANSPOSE, FLA_ONE, WTL, B1,
                        FLA_Cntl_sub_axpyt( cntl ) );

    /*------------------------------------------------------------*/

    FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR,       A00, A01, /**/ A02,
                                                     A10, A11, /**/ A12,
                            /* ************** */  /* ****************** */
                              &ABL, /**/ &ABR,       A20, A21, /**/ A22,
                              FLA_TL );

    FLA_Cont_with_1x3_to_1x2( &TL,  /**/ &TR,        T0, T1, /**/ T2,
                              FLA_LEFT );

    FLA_Cont_with_1x3_to_1x2( &BL,  /**/ &BR,        B0, B1, /**/ B2,
                              FLA_LEFT );
  }

  return FLA_SUCCESS;
}
예제 #25
0
/*
  Blocked driver (variant 2) that sweeps A from its top-left to its
  bottom-right corner in nb x nb diagonal steps. At every step the trailing
  submatrix ABR is handed to FLA_Bidiag_UT_u_step_opt_var2 together with the
  leading nb x nb corners of the current column panels of TU and TV.

  The step size is the length of TU, capped by the smaller dimension of the
  remaining trailing submatrix. Always returns FLA_SUCCESS; the return value
  of the step routine is not inspected.
*/
FLA_Error FLA_Bidiag_UT_u_blk_var2( FLA_Obj A, FLA_Obj TU, FLA_Obj TV )
{
  // Quadrant views of A, plus the 3x3 refinement used while advancing.
  FLA_Obj  ATL, ATR,
           ABL, ABR;
  FLA_Obj  A00, A01, A02,
           A10, A11, A12,
           A20, A21, A22;

  // Left/right views of the TU and TV panels and their 1x3 refinements.
  FLA_Obj  TUL, TUR, TU0, TU1, TU2;
  FLA_Obj  TVL, TVR, TV0, TV1, TV2;

  // Leading nb x nb corners of TU1 / TV1. The remaining three quadrants of
  // each partitioning are written to the throw-away views below.
  FLA_Obj  TU1_head;
  FLA_Obj  TV1_head;
  FLA_Obj  unused_tr, unused_bl, unused_br;

  // Upper bound on the step size, taken from the length of TU.
  dim_t    nb_max = FLA_Obj_length( TU );
  dim_t    nb;

  // Everything starts out in the bottom-right (right) partition.
  FLA_Part_2x2( A, &ATL, &ATR,
                   &ABL, &ABR, 0, 0, FLA_TL );
  FLA_Part_1x2( TU, &TUL, &TUR, 0, FLA_LEFT );
  FLA_Part_1x2( TV, &TVL, &TVR, 0, FLA_LEFT );

  while ( FLA_Obj_min_dim( ABR ) > 0 )
  {
    // The last step may be smaller than nb_max.
    nb = min( nb_max, FLA_Obj_min_dim( ABR ) );

    // Expose the next nb x nb diagonal block of A and the next nb columns
    // of TU and TV.
    FLA_Repart_2x2_to_3x3( ATL, ATR, &A00, &A01, &A02,
                                     &A10, &A11, &A12,
                           ABL, ABR, &A20, &A21, &A22,
                           nb, nb, FLA_BR );
    FLA_Repart_1x2_to_1x3( TUL, TUR, &TU0, &TU1, &TU2,
                           nb, FLA_RIGHT );
    FLA_Repart_1x2_to_1x3( TVL, TVR, &TV0, &TV1, &TV2,
                           nb, FLA_RIGHT );

    /*------------------------------------------------------------*/

    // Keep only the top-left nb x nb corner of each T panel.
    FLA_Part_2x2( TU1, &TU1_head,  &unused_tr,
                       &unused_bl, &unused_br, nb, nb, FLA_TL );

    FLA_Part_2x2( TV1, &TV1_head,  &unused_tr,
                       &unused_bl, &unused_br, nb, nb, FLA_TL );

    // Perform one step on the trailing submatrix. (The unb and ofu step
    // variants take the same arguments and were previously used here.)
    FLA_Bidiag_UT_u_step_opt_var2( ABR, TU1_head, TV1_head );

    /*------------------------------------------------------------*/

    // Fold the processed blocks into the top-left (left) partitions.
    FLA_Cont_with_3x3_to_2x2( &ATL, &ATR, A00, A01, A02,
                                          A10, A11, A12,
                              &ABL, &ABR, A20, A21, A22,
                              FLA_TL );
    FLA_Cont_with_1x3_to_1x2( &TUL, &TUR, TU0, TU1, TU2,
                              FLA_LEFT );
    FLA_Cont_with_1x3_to_1x2( &TVL, &TVR, TV0, TV1, TV2,
                              FLA_LEFT );
  }

  return FLA_SUCCESS;
}
예제 #26
0
/*
  Reference SVD driver built from external (LAPACK-style) components that
  reports the wall-clock time spent in each phase.

    A          - input matrix; overwritten by the factorizations below.
    s          - receives a copy of the computed diagonal d, then sorted
                 together with U and V by FLA_Sort_svd.
    U, V       - receive the singular vectors.
    dtime_bred - out: time for the reduction to bidiagonal form.
    dtime_bsvd - out: time for the bidiagonal divide-and-conquer SVD.
    dtime_appq - out: time for forming U and V (and, on the QR path, Q).
    dtime_qrfa - out: time for the QR factorization (0.0 on the direct path).
    dtime_gemm - out: time for the final GEMM (0.0 on the direct path).

  Two paths exist for m_A >= n_A, selected by crossover_ratio (1.6):
    - m_A <  1.6 * n_A: bidiagonalize A directly.
    - otherwise:        QR-factorize A first, then bidiagonalize the top
                        n_A x n_A part.
  For m_A < n_A, FLA_NOT_YET_IMPLEMENTED is passed to FLA_Check_error_code.

  Returns FLA_SUCCESS.
*/
FLA_Error REF_Svdd_uv_components( FLA_Obj A, FLA_Obj s, FLA_Obj U, FLA_Obj V,
                                  double* dtime_bred, double* dtime_bsvd, double* dtime_appq,
                                  double* dtime_qrfa, double* dtime_gemm )
/*
  Disabled alternative body: delegate everything to FLA_Svdd_external and
  report a dummy time of 1 for every phase.

{
  *dtime_bred = 1;
  *dtime_bsvd = 1;
  *dtime_appq = 1;
  *dtime_qrfa = 1;
  *dtime_gemm = 1;

  return FLA_Svdd_external( FLA_SVD_VECTORS_ALL, A, s, U, V );
}
*/

{
  FLA_Datatype dt_A;
  FLA_Datatype dt_A_real;
  dim_t        m_A, n_A;
  dim_t        min_m_n;
  FLA_Obj      tq, tu, tv, d, e, Ur, Vr, W;
  FLA_Obj      eT, epsilonB;
  FLA_Uplo     uplo = FLA_UPPER_TRIANGULAR;
  double       crossover_ratio = 16.0 / 10.0;
  double       dtime_temp;

  dt_A      = FLA_Obj_datatype( A );
  dt_A_real = FLA_Obj_datatype_proj_to_real( A );
  m_A       = FLA_Obj_length( A );
  n_A       = FLA_Obj_width( A );

  min_m_n   = FLA_Obj_min_dim( A );

  // Temporaries: tq/tu/tv are passed to the QR and bidiagonal routines,
  // d/e receive the extracted diagonals, and Ur/Vr (n_A x n_A, real
  // projection of A's datatype) receive the output of the bidiagonal SVD.
  FLA_Obj_create( dt_A,      min_m_n, 1,   0, 0, &tq );
  FLA_Obj_create( dt_A,      min_m_n, 1,   0, 0, &tu );
  FLA_Obj_create( dt_A,      min_m_n, 1,   0, 0, &tv );
  FLA_Obj_create( dt_A_real, min_m_n, 1,   0, 0, &d );
  FLA_Obj_create( dt_A_real, min_m_n, 1,   0, 0, &e );
  FLA_Obj_create( dt_A_real, n_A,     n_A, 0, 0, &Ur );
  FLA_Obj_create( dt_A_real, n_A,     n_A, 0, 0, &Vr );


  // eT is e without its last element; only eT is filled by
  // FLA_Bidiag_UT_extract_diagonals, while the full e is passed to
  // FLA_Bsvdd_external.
  FLA_Part_2x1( e,   &eT,
                     &epsilonB,    1, FLA_BOTTOM );

  if ( m_A >= n_A )
  {
    if ( m_A < crossover_ratio * n_A )
    {
      // Direct path: bidiagonalize A itself.
      dtime_temp = FLA_Clock();
      {
        // Reduce to bidiagonal form.
        FLA_Bidiag_blk_external( A, tu, tv );
        FLA_Bidiag_UT_extract_diagonals( A, d, eT );
      }
      *dtime_bred = FLA_Clock() - dtime_temp;


      dtime_temp = FLA_Clock();
      {
        // Divide-and-conquer algorithm.
        FLA_Bsvdd_external( uplo, d, e, Ur, Vr );
      }
      *dtime_bsvd = FLA_Clock() - dtime_temp;


      dtime_temp = FLA_Clock();
      {
        // Form U.
        FLA_Copy_external( Ur, U );
        FLA_Bidiag_apply_U_external( FLA_LEFT, FLA_NO_TRANSPOSE, A, tu, U );

        // Form V.
        FLA_Copy_external( Vr, V );
        FLA_Bidiag_apply_V_external( FLA_RIGHT, FLA_CONJ_TRANSPOSE, A, tv, V );
      }
      *dtime_appq = FLA_Clock() - dtime_temp;


      // No QR factorization or GEMM is performed on this path.
      *dtime_qrfa = 0.0;
      *dtime_gemm = 0.0;
    }
    else
    {
      // QR path: factor A first, then work on its top n_A x n_A part.
      FLA_Obj AT,
              AB;
      FLA_Obj UL, UR;

      // AT is the top n_A rows of A; UL is the left n_A columns of U.
      FLA_Part_2x1( A,   &AT,
                         &AB,        n_A, FLA_TOP );
      FLA_Part_1x2( U,   &UL, &UR,   n_A, FLA_LEFT );

      // Create a temporary n-by-n matrix W.
      FLA_Obj_create( dt_A, n_A, n_A, 0, 0, &W );

      dtime_temp = FLA_Clock();
      {
        // Perform a QR factorization.
        FLA_QR_blk_external( A, tq );
        // Copy the lower triangular part of A into UL, then zero it in A.
        FLA_Copyr_external( FLA_LOWER_TRIANGULAR, A, UL );
        FLA_Setr( FLA_LOWER_TRIANGULAR, FLA_ZERO, A );
      }
      *dtime_qrfa = FLA_Clock() - dtime_temp;


      dtime_temp = FLA_Clock();
      {
        // Form Q.
        FLA_QR_form_Q_external( U, tq );
      }
      *dtime_appq = FLA_Clock() - dtime_temp;


      dtime_temp = FLA_Clock();
      {
        // Reduce R to bidiagonal form.
        FLA_Bidiag_blk_external( AT, tu, tv );
        // NOTE(review): the diagonals are extracted from A rather than AT;
        // presumably equivalent because AT is the top n_A rows of A --
        // confirm.
        FLA_Bidiag_UT_extract_diagonals( A, d, eT );
      }
      *dtime_bred = FLA_Clock() - dtime_temp;


      dtime_temp = FLA_Clock();
      {
        // Divide-and-conquer algorithm.
        FLA_Bsvdd_external( uplo, d, e, Ur, Vr );
      }
      *dtime_bsvd = FLA_Clock() - dtime_temp;


      dtime_temp = FLA_Clock();
      {
        // Form U in W.
        FLA_Copy_external( Ur, W );
        FLA_Bidiag_apply_U_external( FLA_LEFT, FLA_NO_TRANSPOSE, AT, tu, W );

        // Form V.
        FLA_Copy_external( Vr, V );
        FLA_Bidiag_apply_V_external( FLA_RIGHT, FLA_CONJ_TRANSPOSE, AT, tv, V );
      }
      // Accumulated on top of the time already spent forming Q.
      *dtime_appq += FLA_Clock() - dtime_temp;


      dtime_temp = FLA_Clock();
      {
        // Compute A := UL * W, using A as scratch space, and then copy
        // the product back into UL.
        FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
                           FLA_ONE, UL, W, FLA_ZERO, A );
        FLA_Copy( A, UL );
      }
      *dtime_gemm = FLA_Clock() - dtime_temp;


      // Free W.
      FLA_Obj_free( &W );
    }
  }
  else
  {
    // m_A < n_A is not supported.
    // NOTE(review): none of the dtime_* outputs are written on this path,
    // and the code below still runs if FLA_Check_error_code returns --
    // confirm that it aborts.
    FLA_Check_error_code( FLA_NOT_YET_IMPLEMENTED );
  }

  // Copy singular values to output vector.
  FLA_Copy( d, s );

  // Sort singular values and vectors.
  FLA_Sort_svd( FLA_BACKWARD, s, U, V );

  FLA_Obj_free( &tq );
  FLA_Obj_free( &tu );
  FLA_Obj_free( &tv );
  FLA_Obj_free( &d );
  FLA_Obj_free( &e );
  FLA_Obj_free( &Ur );
  FLA_Obj_free( &Vr );

  return FLA_SUCCESS;
}
/*
  Interface summary (derived from the body below):

    A    - matrix holding the Householder vectors above its diagonal, one
           per row (see the derivation that follows).
    T    - row panel of triangular factors; its length (number of rows) is
           used as the algorithmic blocksize.
    W    - workspace; each iteration uses its top-left b x length(B1) block.
    B    - matrix updated in place, one column panel B1 (together with the
           trailing panel B2) per iteration.
    cntl - control tree; its copyt, trmm1, gemm1, trsm, gemm2, trmm2 and
           axpyt sub-trees are forwarded to the matching *_internal calls.

  Always returns FLA_SUCCESS; return values of the inner calls are not
  inspected.
*/
FLA_Error FLA_Apply_Q_UT_rnbr_blk_var1( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl )
/*
  Apply a unitary matrix Q to a matrix B from the right,

    B :=  B Q

  where Q is the backward product of Householder transformations:

    Q  =  H(k-1) ... H(1) H(0)

  where H(i) corresponds to the Householder vector stored above the diagonal
  in the ith row of A. Thus, the operation becomes:

    B :=  B Q
       =  B ( H(k-1) ... H(1) H(0) )
       =  B ( H(k-1)' ... H(1)' H(0)' )
       =  B ( H(0) H(1) ... H(k-1) )'
       =  B H(k-1)' ... H(1)' H(0)'

  From this, we can see that we must move through A from bottom-right to top-
  left, since the Householder vector for H(k-1) was stored in the last row
  of A. We intend to apply blocks of reflectors at a time, where a block
  reflector H of b consecutive Householder transforms may be expressed as:

    H  =  ( H(i) H(i+1) ... H(i+b-1) )'
       =  ( I - U inv(T) U' )'

  where:
    - U^T is the strictly upper trapezoidal (with implicit unit diagonal) matrix
      of Householder vectors, stored above the diagonal of A in rows i through
      i+b-1, corresponding to H(i) through H(i+b-1).
    - T is the upper triangular block Householder matrix corresponding to
      Householder vectors i through i+b-1.

  Consider applying H to B as an intermediate step towards applying all of Q:

    B  :=  B H
        =  B ( I - U inv(T) U' )'
        =  B ( I - U inv(T)' U' )
        =  B - B U inv(T)' U'

  We must move from bottom-right to top-left. So, we partition:

    U^T -> ( U11 U12 )  B -> ( B1 B2 )  T -> ( T2 T1 )

  where:
    - U11 is stored in strictly upper triangle of A11 with implicit unit
      diagonal.
    - U12 is stored in A12.
    - T1 is an upper triangular block of row-panel matrix T.

  Substituting repartitioned U, B, and T, we have:

    ( B1 B2 ) :=  ( B1 B2 ) - ( B1 B2 ) ( U11 U12 )^T inv(T1)' conj( U11 U12 )
               =  ( B1 B2 ) - ( B1 B2 ) / U11^T \ inv(T1)' conj( U11 U12 )
                                        \ U12^T /
               =  ( B1 B2 ) - ( B1 U11^T + B2 U12^T ) inv(T1)' conj( U11 U12 )

  Thus, B1 is updated as:

      B1    :=      B1   -   ( B1 U11^T + B2 U12^T ) inv(T1)' conj(U11)

  And B2 is updated as:

      B2    :=      B2   -   ( B1 U11^T + B2 U12^T ) inv(T1)' conj(U12)

  Note that:

    ( B1 U11^T + B2 U12^T ) inv(T1)'

  is common to both updates, and thus may be computed and stored in
  workspace, and then re-used.

  -FGVZ
*/
{
  FLA_Obj ATL,   ATR,      A00, A01, A02, 
          ABL,   ABR,      A10, A11, A12,
                           A20, A21, A22;

  FLA_Obj TL,    TR,       T0,  T1,  T2;

  FLA_Obj T1T,
          T2B;

  FLA_Obj WTL,  WTR,
          WBL,  WBR;

  FLA_Obj BL,    BR,       B0,  B1,  B2;

  dim_t   b_alg, b;
  dim_t   m_BR, n_BR;

  // Query the algorithmic blocksize by inspecting the length of T.
  b_alg = FLA_Obj_length( T );

  // If A is not square, then we have to initialize our partitionings
  // carefully so that we begin in the proper location in A and B (since we
  // traverse matrix A from BR to TL). The excess rows or columns of A are
  // placed in the initial bottom-right partition so that ATL starts out
  // square.
  if ( FLA_Obj_length( A ) < FLA_Obj_width( A ) )
  {
    m_BR = 0;
    n_BR = FLA_Obj_width( A ) - FLA_Obj_length( A );
  }
  else if ( FLA_Obj_length( A ) > FLA_Obj_width( A ) )
  {
    m_BR = FLA_Obj_length( A ) - FLA_Obj_width( A );
    n_BR = 0;
  }
  else
  {
    m_BR = 0;
    n_BR = 0;
  }

  FLA_Part_2x2( A,    &ATL, &ATR,
                      &ABL, &ABR,     m_BR, n_BR, FLA_BR );

  // A and T are dependent; the left partition of T is sized by the
  // smaller dimension of A.
  FLA_Part_1x2( T,    &TL,  &TR,      FLA_Obj_min_dim( A ), FLA_LEFT );

  // Note that A stores its reflectors row-wise, so the corresponding
  // right partition of B must be n_BR columns wide.
  FLA_Part_1x2( B,    &BL,  &BR,      n_BR, FLA_RIGHT );

  while ( FLA_Obj_min_dim( ATL ) > 0 ){

    b = min( b_alg, FLA_Obj_min_dim( ATL ) );

    // Since T was filled from left to right, and since we need to access them
    // in reverse order, we need to handle the case where the last block is
    // smaller than the other b x b blocks. This applies only while TR is
    // still empty and the width of T is not a multiple of b_alg.
    if ( FLA_Obj_width( TR ) == 0 && FLA_Obj_width( T ) % b_alg > 0 )
      b = FLA_Obj_width( T ) % b_alg;

    FLA_Repart_2x2_to_3x3( ATL, /**/ ATR,       &A00, &A01, /**/ &A02,
                                                &A10, &A11, /**/ &A12,
                        /* ************* */   /* ******************** */
                           ABL, /**/ ABR,       &A20, &A21, /**/ &A22,
                           b, b, FLA_TL );

    FLA_Repart_1x2_to_1x3( TL,  /**/ TR,        &T0, &T1, /**/ &T2,
                           b, FLA_LEFT );

    FLA_Repart_1x2_to_1x3( BL,  /**/ BR,        &B0, &B1, /**/ &B2,
                           b, FLA_LEFT );

    /*------------------------------------------------------------*/

    // Use only the top b rows of T1, so that T1T is b x b even when b is
    // smaller than the length of T.
    FLA_Part_2x1( T1,    &T1T, 
                         &T2B,     b, FLA_TOP );

    // Carve a b x length(B1) block out of the workspace; it holds B1^T.
    FLA_Part_2x2( W,     &WTL, &WTR,
                         &WBL, &WBR,     b, FLA_Obj_length( B1 ), FLA_TL );

    // WTL = B1^T;

    FLA_Copyt_internal( FLA_TRANSPOSE, B1, WTL,
                        FLA_Cntl_sub_copyt( cntl ) );

    // U11 = triuu( A11 );
    // U12 = A12;
    // Let WTL^T be conformal to B1.
    //
    // WTL^T = ( B1 * U11^T + B2 * U12^T ) * inv( triu(T1T)' );
    // WTL   = inv( conj(triu(T1T)) ) * ( U11 * B1^T + U12 * B2^T );

    // WTL = U11 * WTL
    FLA_Trmm_internal( FLA_LEFT, FLA_UPPER_TRIANGULAR,
                       FLA_NO_TRANSPOSE, FLA_UNIT_DIAG,
                       FLA_ONE, A11, WTL,
                       FLA_Cntl_sub_trmm1( cntl ) );

    // WTL = WTL + U12 * B2^T
    FLA_Gemm_internal( FLA_NO_TRANSPOSE, FLA_TRANSPOSE, 
                       FLA_ONE, A12, B2, FLA_ONE, WTL,
                       FLA_Cntl_sub_gemm1( cntl ) );

    // WTL = inv( conj(triu(T1T)) ) * WTL
    FLA_Trsm_internal( FLA_LEFT, FLA_UPPER_TRIANGULAR,
                       FLA_CONJ_NO_TRANSPOSE, FLA_NONUNIT_DIAG,
                       FLA_ONE, T1T, WTL,
                       FLA_Cntl_sub_trsm( cntl ) );

    // B2 = B2 - WTL^T * conj(U12);
    // B1 = B1 - WTL^T * conj(U11);
    //    = B1 - ( U11' * WTL )^T;

    // B2 must be updated before WTL is overwritten by the Trmm below.
    FLA_Gemm_internal( FLA_TRANSPOSE, FLA_CONJ_NO_TRANSPOSE,
                       FLA_MINUS_ONE, WTL, A12, FLA_ONE, B2,
                       FLA_Cntl_sub_gemm2( cntl ) );

    // WTL = -U11' * WTL (the minus sign is folded into the scalar).
    FLA_Trmm_internal( FLA_LEFT, FLA_UPPER_TRIANGULAR,
                       FLA_CONJ_TRANSPOSE, FLA_UNIT_DIAG,
                       FLA_MINUS_ONE, A11, WTL,
                       FLA_Cntl_sub_trmm2( cntl ) );

    // B1 = B1 + WTL^T
    FLA_Axpyt_internal( FLA_TRANSPOSE, FLA_ONE, WTL, B1,
                        FLA_Cntl_sub_axpyt( cntl ) );

    /*------------------------------------------------------------*/

    FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR,       A00, /**/ A01, A02,
                            /* ************** */  /* ****************** */
                                                     A10, /**/ A11, A12,
                              &ABL, /**/ &ABR,       A20, /**/ A21, A22,
                              FLA_BR );

    FLA_Cont_with_1x3_to_1x2( &TL,  /**/ &TR,        T0, /**/ T1, T2,
                              FLA_RIGHT );

    FLA_Cont_with_1x3_to_1x2( &BL,  /**/ &BR,        B0, /**/ B1, B2,
                              FLA_RIGHT );
  }

  return FLA_SUCCESS;
}
예제 #28
0
/*
  Unblocked Householder QR sweep (variant 2): A and T are traversed one
  row/column at a time from top-left to bottom-right.

  In each iteration:
    - a Householder transform is computed from alpha11 and a21; tau11, the
      current diagonal element of T, receives the associated scalar;
    - the transform is applied from the left to a12t and A22;
    - t01, the part of the current column of T above the diagonal, is set
      to a10t' + A20' * u21.

  A and T are both updated in place. Always returns FLA_SUCCESS; return
  values of the inner calls are not inspected.
*/
FLA_Error FLA_QR_UT_unb_var2( FLA_Obj A, FLA_Obj T )
{
  FLA_Obj ATL,   ATR,      A00,  a01,     A02, 
          ABL,   ABR,      a10t, alpha11, a12t,
                           A20,  a21,     A22;

  FLA_Obj TTL,   TTR,      T00,  t01,   T02, 
          TBL,   TBR,      t10t, tau11, t12t,
                           T20,  t21,   T22;


  // Start with empty top-left partitions of both A and T.
  FLA_Part_2x2( A,    &ATL, &ATR,
                      &ABL, &ABR,     0, 0, FLA_TL );

  FLA_Part_2x2( T,    &TTL, &TTR,
                      &TBL, &TBR,     0, 0, FLA_TL );

  while ( FLA_Obj_min_dim( ABR ) > 0 ){

    // Expose the next diagonal element of A and of T.
    FLA_Repart_2x2_to_3x3( ATL, /**/ ATR,       &A00,  /**/ &a01,     &A02,
                        /* ************* */   /* ************************** */
                                                &a10t, /**/ &alpha11, &a12t,
                           ABL, /**/ ABR,       &A20,  /**/ &a21,     &A22,
                           1, 1, FLA_BR );

    FLA_Repart_2x2_to_3x3( TTL, /**/ TTR,       &T00,  /**/ &t01,   &T02,
                        /* ************* */   /* ************************ */
                                                &t10t, /**/ &tau11, &t12t,
                           TBL, /**/ TBR,       &T20,  /**/ &t21,   &T22,
                           1, 1, FLA_BR );

    /*------------------------------------------------------------*/

    // Compute tau11 and u21 from alpha11 and a21 such that tau11 and u21
    // determine a Householder transform H such that applying H from the
    // left to the column vector consisting of alpha11 and a21 annihilates
    // the entries in a21 (and updates alpha11).
    FLA_Househ2_UT( FLA_LEFT,
                    alpha11,
                    a21, tau11 );

    /* / a12t \ =  H / a12t \
       \ A22  /      \ A22  /

       where H is formed from tau11 and u21. */
    FLA_Apply_H2_UT( FLA_LEFT, tau11, a21, a12t,
                                           A22 );

    // t01 = a10t' + A20' * u21;
    FLA_Copyt_external( FLA_CONJ_TRANSPOSE, a10t, t01 );
    FLA_Gemv_external( FLA_CONJ_TRANSPOSE, FLA_ONE, A20, a21, FLA_ONE, t01 );

    /*------------------------------------------------------------*/

    // Move the processed row/column into the top-left partitions.
    FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR,       A00,  a01,     /**/ A02,
                                                     a10t, alpha11, /**/ a12t,
                            /* ************** */  /* ************************ */
                              &ABL, /**/ &ABR,       A20,  a21,     /**/ A22,
                              FLA_TL );

    FLA_Cont_with_3x3_to_2x2( &TTL, /**/ &TTR,       T00,  t01,   /**/ T02,
                                                     t10t, tau11, /**/ t12t,
                            /* ************** */  /* ********************** */
                              &TBL, /**/ &TBR,       T20,  t21,   /**/ T22,
                              FLA_TL );

  }

  return FLA_SUCCESS;
}