示例#1
0
FLA_Error FLA_Scalc_check( FLA_Conj conjalpha, FLA_Obj alpha, FLA_Obj A )
{
    FLA_Error e_val;

    e_val = FLA_Check_valid_conj( conjalpha );
    FLA_Check_error_code( e_val );

    e_val = FLA_Check_floating_object( A );
    FLA_Check_error_code( e_val );

    e_val = FLA_Check_nonconstant_object( A );
    FLA_Check_error_code( e_val );

    if ( FLA_Obj_is_real( A ) )
    {
        e_val = FLA_Check_consistent_object_datatype( A, alpha );
        FLA_Check_error_code( e_val );
    }
    else
    {
        e_val = FLA_Check_identical_object_precision( A, alpha );
        FLA_Check_error_code( e_val );
    }

    e_val = FLA_Check_if_scalar( alpha );
    FLA_Check_error_code( e_val );

    return FLA_SUCCESS;
}
示例#2
0
void time_QR_UT(
                 int variant, int type, int nrepeats, int m, int n,
                 FLA_Obj A, FLA_Obj A_ref, FLA_Obj t, FLA_Obj T, FLA_Obj W, FLA_Obj b, FLA_Obj b_orig,
                 double *dtime, double *diff, double *gflops )
{
  int
    irep;

  double
    dtime_old = 1.0e9;

  FLA_Obj
    A_save, b_save, norm;


  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_save );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, b, &b_save );

  if ( FLA_Obj_is_single_precision( A ) )
    FLA_Obj_create( FLA_FLOAT, 1, 1, 0, 0, &norm );
  else
    FLA_Obj_create( FLA_DOUBLE, 1, 1, 0, 0, &norm );

  FLA_Copy_external( A, A_save );
  FLA_Copy_external( b, b_save );


  for ( irep = 0 ; irep < nrepeats; irep++ ){

    FLA_Copy_external( A_save, A );

    *dtime = FLA_Clock();

    switch( variant ){

    case 0:{
      switch( type ){
      case FLA_ALG_REFERENCE:
        REF_QR_UT( A, t );
        break;
      case FLA_ALG_FRONT:
        FLA_QR_UT( A, T );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    }

    *dtime = FLA_Clock() - *dtime;
    dtime_old = min( *dtime, dtime_old );

  }

  if ( type == FLA_ALG_REFERENCE )
  {
    FLA_Obj AT, AB;
    FLA_Obj bT, bB;
    FLA_Obj y;

    FLA_Obj_create( FLA_Obj_datatype( b ), n, 1, 0, 0, &y );

    FLA_Copy_external( b, b_orig );

    if ( FLA_Obj_is_real( A ) )
      FLA_Apply_Q_blk_external( FLA_LEFT, FLA_TRANSPOSE, FLA_COLUMNWISE, A, t, b );
    else
      FLA_Apply_Q_blk_external( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_COLUMNWISE, A, t, b );

    FLA_Part_2x1( A,    &AT,
                        &AB,    FLA_Obj_width( A ), FLA_TOP );
    FLA_Part_2x1( b,    &bT,
                        &bB,    FLA_Obj_width( A ), FLA_TOP );
    FLA_Trsm_external( FLA_LEFT, FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE,
                       FLA_NONUNIT_DIAG, FLA_ONE, AT, bT );
    FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, A_save, bT, FLA_ONE, b_orig );
    FLA_Gemv_external( FLA_CONJ_TRANSPOSE, FLA_ONE, A_save, b_orig, FLA_ZERO, y );
    FLA_Nrm2_external( y, norm );
    FLA_Obj_extract_real_scalar( norm, diff );

    FLA_Obj_free( &y );
  }
  else
  {
    FLA_Obj x, y;

    FLA_Obj_create( FLA_Obj_datatype( b ), n, 1, 0, 0, &y );
    FLA_Obj_create( FLA_Obj_datatype( b ), n, 1, 0, 0, &x );

    FLA_Copy_external( b, b_orig );

    FLA_QR_UT_solve( A, T, b, x );

    FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, A_save, x, FLA_ONE, b_orig );
    FLA_Gemv_external( FLA_CONJ_TRANSPOSE, FLA_ONE, A_save, b_orig, FLA_ZERO, y );
    FLA_Nrm2_external( y, norm );
    FLA_Obj_extract_real_scalar( norm, diff );

    FLA_Obj_free( &x );
    FLA_Obj_free( &y );
  }

  *gflops = (         2.0   * m * n * n -
              ( 2.0 / 3.0 ) * n * n * n ) /
            dtime_old / 1e9;
  if ( FLA_Obj_is_complex( A ) )
    *gflops *= 4.0;

  *dtime = dtime_old;

  FLA_Copy_external( A_save, A );
  FLA_Copy_external( b_save, b );

  FLA_Obj_free( &A_save );
  FLA_Obj_free( &b_save );
  FLA_Obj_free( &norm );
}
FLA_Error FLA_Bidiag_apply_V_external( FLA_Side side, FLA_Trans trans, FLA_Obj A, FLA_Obj t, FLA_Obj B )
{
  int          info = 0;
#ifdef FLA_ENABLE_EXTERNAL_LAPACK_INTERFACES
  FLA_Datatype datatype;
  // int          m_A, n_A;
  int          m_B, n_B;
  int          cs_A;
  int          cs_B;
  int          k_t;
  int          lwork;
  FLA_Obj      work;
  char         blas_side;
  char         blas_vect = 'P';
  char         blas_trans;
  int          i;

  //if ( FLA_Check_error_level() == FLA_FULL_ERROR_CHECKING )
  //  FLA_Apply_Q_check( side, trans, storev, A, t, B );

  if ( FLA_Obj_has_zero_dim( A ) ) return FLA_SUCCESS;

  datatype = FLA_Obj_datatype( A );

  // m_A      = FLA_Obj_length( A );
  // n_A      = FLA_Obj_width( A );
  cs_A     = FLA_Obj_col_stride( A );

  m_B      = FLA_Obj_length( B );
  n_B      = FLA_Obj_width( B );
  cs_B     = FLA_Obj_col_stride( B );

  if ( blas_vect == 'Q' ) k_t = FLA_Obj_vector_dim( t );
  else                    k_t = FLA_Obj_vector_dim( t ) + 1;

  if ( FLA_Obj_is_real( A ) && trans == FLA_CONJ_TRANSPOSE )
    trans = FLA_TRANSPOSE;

  FLA_Param_map_flame_to_netlib_side( side, &blas_side );
  FLA_Param_map_flame_to_netlib_trans( trans, &blas_trans );


  // Make a workspace query the first time through. This will provide us with
  // and ideal workspace size based on an internal block size.
  lwork = -1;
  FLA_Obj_create( datatype, 1, 1, 0, 0, &work );

  for ( i = 0; i < 2; ++i )
  {
    if ( i == 1 )
    {
      // Grab the queried ideal workspace size from the work array, free the
      // work object, and then re-allocate the workspace with the ideal size.
      if      ( datatype == FLA_FLOAT || datatype == FLA_COMPLEX )
        lwork = ( int ) *FLA_FLOAT_PTR( work );
      else if ( datatype == FLA_DOUBLE || datatype == FLA_DOUBLE_COMPLEX )
        lwork = ( int ) *FLA_DOUBLE_PTR( work );

      FLA_Obj_free( &work );
      FLA_Obj_create( datatype, lwork, 1, 0, 0, &work );
    }

    switch( datatype ){
  
    case FLA_FLOAT:
    {
      float *buff_A    = ( float * ) FLA_FLOAT_PTR( A );
      float *buff_t    = ( float * ) FLA_FLOAT_PTR( t );
      float *buff_B    = ( float * ) FLA_FLOAT_PTR( B );
      float *buff_work = ( float * ) FLA_FLOAT_PTR( work );
  
      F77_sormbr( &blas_vect,
                  &blas_side,
                  &blas_trans,
                  &m_B,
                  &n_B,
                  &k_t,
                  buff_A,    &cs_A,
                  buff_t,
                  buff_B,    &cs_B,
                  buff_work, &lwork,
                  &info );
  
      break;
    }
  
    case FLA_DOUBLE:
    {
      double *buff_A    = ( double * ) FLA_DOUBLE_PTR( A );
      double *buff_t    = ( double * ) FLA_DOUBLE_PTR( t );
      double *buff_B    = ( double * ) FLA_DOUBLE_PTR( B );
      double *buff_work = ( double * ) FLA_DOUBLE_PTR( work );
  
      F77_dormbr( &blas_vect,
                  &blas_side,
                  &blas_trans,
                  &m_B,
                  &n_B,
                  &k_t,
                  buff_A,    &cs_A,
                  buff_t,
                  buff_B,    &cs_B,
                  buff_work, &lwork,
                  &info );
  
      break;
    }
  
    case FLA_COMPLEX:
    {
      scomplex *buff_A    = ( scomplex * ) FLA_COMPLEX_PTR( A );
      scomplex *buff_t    = ( scomplex * ) FLA_COMPLEX_PTR( t );
      scomplex *buff_B    = ( scomplex * ) FLA_COMPLEX_PTR( B );
      scomplex *buff_work = ( scomplex * ) FLA_COMPLEX_PTR( work );
  
      F77_cunmbr( &blas_vect,
                  &blas_side,
                  &blas_trans,
                  &m_B,
                  &n_B,
                  &k_t,
                  buff_A,    &cs_A,
                  buff_t,
                  buff_B,    &cs_B,
                  buff_work, &lwork,
                  &info );
  
      break;
    }
  
    case FLA_DOUBLE_COMPLEX:
    {
      dcomplex *buff_A    = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( A );
      dcomplex *buff_t    = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( t );
      dcomplex *buff_B    = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( B );
      dcomplex *buff_work = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( work );
  
      F77_zunmbr( &blas_vect,
                  &blas_side,
                  &blas_trans,
                  &m_B,
                  &n_B,
                  &k_t,
                  buff_A,    &cs_A,
                  buff_t,
                  buff_B,    &cs_B,
                  buff_work, &lwork,
                  &info );
  
      break;
    }

    }
  }

  FLA_Obj_free( &work );
#else
  FLA_Check_error_code( FLA_EXTERNAL_LAPACK_NOT_IMPLEMENTED );
#endif

  return info;
}
示例#4
0
void time_Apply_G_rf(
               int variant, int type, int n_repeats, int m, int k, int n, int b_alg,
               FLA_Obj A, FLA_Obj A_ref, FLA_Obj G, FLA_Obj P,
               double *dtime, double *diff, double *gflops )
{
  int irep;

  double
    dtime_old = 1.0e9;

  FLA_Obj
    A_save, G_save, norm;

  if ( FLA_Obj_is_real( A ) )
  {
    if (
       //( variant == 1 && type == FLA_ALG_UNB_OPT ) ||
       //( variant == 1 && type == FLA_ALG_UNB_ASM ) ||
       //( variant == 1 && type == FLA_ALG_BLOCKED ) ||
       //( variant == 2 && type == FLA_ALG_UNB_OPT ) ||
       //( variant == 2 && type == FLA_ALG_UNB_ASM ) ||
       //( variant == 2 && type == FLA_ALG_BLOCKED ) ||

       //( variant == 3 && type == FLA_ALG_UNB_OPT ) ||
       //( variant == 3 && type == FLA_ALG_UNB_ASM ) ||
       //( variant == 3 && type == FLA_ALG_BLOCKED ) ||
       //( variant == 6 && type == FLA_ALG_UNB_OPT ) ||
       //( variant == 6 && type == FLA_ALG_UNB_ASM ) ||
       //( variant == 6 && type == FLA_ALG_BLOCKED ) ||
       //( variant == 9 && type == FLA_ALG_UNB_OPT ) ||
       //( variant == 9 && type == FLA_ALG_UNB_ASM ) ||
       //( variant == 9 && type == FLA_ALG_BLOCKED ) ||

       ( variant == 4 ) ||
       ( variant == 5 ) ||
       ( variant == 7 ) ||
       ( variant == 8 ) ||
       FALSE
    ) 
    {
      *gflops = 0.0;
      *diff   = 0.0;
      return;
    }
  }
  else if ( FLA_Obj_is_complex( A ) )
  {
    if (
       //( variant == 1 && type == FLA_ALG_UNB_OPT ) ||
       //( variant == 1 && type == FLA_ALG_UNB_ASM ) ||
       //( variant == 1 && type == FLA_ALG_BLOCKED ) ||
       //( variant == 2 && type == FLA_ALG_UNB_OPT ) ||
       //( variant == 2 && type == FLA_ALG_UNB_ASM ) ||
       //( variant == 2 && type == FLA_ALG_BLOCKED ) ||

       //( variant == 3 && type == FLA_ALG_UNB_OPT ) ||
       //( variant == 3 && type == FLA_ALG_UNB_ASM ) ||
       //( variant == 3 && type == FLA_ALG_BLOCKED ) ||
       //( variant == 6 && type == FLA_ALG_UNB_OPT ) ||
       //( variant == 6 && type == FLA_ALG_UNB_ASM ) ||
       //( variant == 6 && type == FLA_ALG_BLOCKED ) ||
       //( variant == 9 && type == FLA_ALG_UNB_OPT ) ||
       //( variant == 9 && type == FLA_ALG_UNB_ASM ) ||
       //( variant == 9 && type == FLA_ALG_BLOCKED ) ||

       ( variant == 4 ) ||
       ( variant == 5 ) ||
       ( variant == 7 ) ||
       ( variant == 8 ) ||
       FALSE
    )
    {
      *gflops = 0.0;
      *diff   = 0.0;
      return;
    }
  }

  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_save );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, G, &G_save );
  FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );

  //dim_t b_flash_m = b_alg;
  //dim_t b_flash_n = n;
  //FLASH_Obj_create_hier_copy_of_flat_ext( A, 1, &b_flash_m, &b_flash_n, &AH ); 

//printf ( "flash dims: %d x %d\n", FLA_Obj_length( AH ), FLA_Obj_width( AH ) );

  FLA_Copy_external( A, A_save );
  FLA_Copy_external( G, G_save );

  for ( irep = 0 ; irep < n_repeats; irep++ ){

    FLA_Copy_external( A_save, A );
    FLA_Copy_external( G_save, G );
    //FLASH_Obj_hierarchify( A_save, AH );

    *dtime = FLA_Clock();

    switch( variant ){

    case 0:
      break;

    // Time variant 1
    case 1:
    {
      switch( type ){
      case FLA_ALG_UNB_OPT:
        FLA_Apply_G_rf_opt_var1( G, A );
        break;
      case FLA_ALG_UNB_ASM:
        FLA_Apply_G_rf_asm_var1( G, A );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Apply_G_rf_blk_var1( G, A, b_alg );
        break;
      }
      break;
    }

    // Time variant 2
    case 2:
    {
      switch( type ){
      case FLA_ALG_UNB_OPT:
        FLA_Apply_G_rf_opt_var2( G, A );
        break;
      case FLA_ALG_UNB_ASM:
        FLA_Apply_G_rf_asm_var2( G, A );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Apply_G_rf_blk_var2( G, A, b_alg );
        break;
      }
      break;
    }

    // Time variant 3
    case 3:
    {
      switch( type ){
      case FLA_ALG_UNB_OPT:
        FLA_Apply_G_rf_opt_var3( G, A );
        break;
      case FLA_ALG_UNB_ASM:
        FLA_Apply_G_rf_asm_var3( G, A );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Apply_G_rf_blk_var3( G, A, b_alg );
        break;
      }
      break;
    }

    // Time variant 6
    case 6:
    {
      switch( type ){
      case FLA_ALG_UNB_OPT:
        FLA_Apply_G_rf_opt_var6( G, A );
        break;
      case FLA_ALG_UNB_ASM:
        FLA_Apply_G_rf_asm_var6( G, A );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Apply_G_rf_blk_var6( G, A, b_alg );
        break;
      }
      break;
    }

    // Time variant 9
    case 9:
    {
      switch( type ){
      case FLA_ALG_UNB_OPT:
        FLA_Apply_G_rf_opt_var9( G, A );
        break;
      case FLA_ALG_UNB_ASM:
        FLA_Apply_G_rf_asm_var9( G, A );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Apply_G_rf_blk_var9( G, A, b_alg );
        break;
      }
      break;
    }


    }

    *dtime = FLA_Clock() - *dtime;
    dtime_old = min( *dtime, dtime_old );

  }

  if ( variant == 1 && type == FLA_ALG_UNB_OPT )
  {
    //FLA_Obj_show( "A_ref", A, "%9.2e + %9.2e ", "" );
    //FLA_Obj_show( "A", A, "%9.2e ", "" );

    FLA_Copy( A, A_ref );
    *diff = 0.0;
  }
  else
  {
    //FLA_Obj_show( "A", A, "%9.2e + %9.2e ", "" );

//if ( variant == 7 && type == FLA_ALG_UNB_ASM )
    //FLA_Obj_show( "A", A, "%9.2e", "" );

    //if ( variant == 9 ) FLASH_Obj_flatten( AH, A );

    FLA_Axpy( FLA_MINUS_ONE, A_ref, A );
    FLA_Norm_frob( A, norm );
    FLA_Obj_extract_real_scalar( norm, diff );

    //*diff = FLA_Max_elemwise_diff( A_ref, A );
  }


  *gflops = 6.0 * k * m * ( n - 1 ) /
            dtime_old / 1e9;

  if ( FLA_Obj_is_complex( A ) )
    *gflops *= 2.0;

  *dtime = dtime_old;

  FLA_Copy_external( A_save, A );
  FLA_Copy_external( G_save, G );

  //FLASH_Obj_free( &AH );

  FLA_Obj_free( &A_save );
  FLA_Obj_free( &G_save );
  FLA_Obj_free( &norm );
}
示例#5
0
void time_Hess_UT(
                 int variant, int type, int nrepeats, int m,
                 FLA_Obj A, FLA_Obj A_ref, FLA_Obj t, FLA_Obj T, FLA_Obj W,
                 double *dtime, double *diff, double *gflops )
{
  int
    irep;

  double
    dtime_old = 1.0e9;

  FLA_Obj
    A_save, norm;


  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_save );

  FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );

  FLA_Copy_external( A, A_save );

  for ( irep = 0 ; irep < nrepeats; irep++ ){

    FLA_Copy_external( A_save, A );

    *dtime = FLA_Clock();

    switch( variant ){

    case 0:{
      switch( type ){
      case FLA_ALG_REFERENCE:
        REF_Hess_UT( A, t );
        break;
      case FLA_ALG_FRONT:
        FLA_Hess_UT( A, T );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    }

    *dtime = FLA_Clock() - *dtime;
    dtime_old = min( *dtime, dtime_old );

  }

  //if ( type == FLA_ALG_REFERENCE )
  //{
  //  ;
  //}
  //else
  {
    FLA_Obj    AT, AB;
    FLA_Obj Q, QT, QB;
    FLA_Obj E, ET, EB;
    FLA_Obj F;
    dim_t   m_A, m_T;

    m_A = FLA_Obj_length( A );
    m_T = FLA_Obj_length( T );

    FLA_Obj_create( FLA_Obj_datatype( A ), m_A, m_A, 0, 0, &Q );
    FLA_Set_to_identity( Q );

    FLA_Part_2x1( Q,    &QT,
                        &QB,    1, FLA_TOP );
    FLA_Part_2x1( A,    &AT,
                        &AB,    1, FLA_TOP );

    if ( type == FLA_ALG_REFERENCE )
    {
      if ( FLA_Obj_is_real( A ) )
        FLA_Apply_Q_blk_external( FLA_LEFT, FLA_TRANSPOSE, FLA_COLUMNWISE, AB, t, QB );
      else
        FLA_Apply_Q_blk_external( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_COLUMNWISE, AB, t, QB );
    }
    else
      FLA_Apply_Q_UT( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_FORWARD, FLA_COLUMNWISE, AB, T, W, QB );

    FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &E );
    FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &F );

    FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
                       FLA_ONE, A_save, Q, FLA_ZERO, E );
    FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
                       FLA_ONE, Q, E, FLA_ZERO, F );

    FLA_Copy( A, E );
    FLA_Part_2x1( E,    &ET,
                        &EB,    1, FLA_TOP );
    FLA_Triangularize( FLA_UPPER_TRIANGULAR, FLA_NONUNIT_DIAG, EB );

    *diff = FLA_Max_elemwise_diff( E, F );

    FLA_Obj_free( &Q );
    FLA_Obj_free( &E );
    FLA_Obj_free( &F );
  }

  *gflops = ( 10.0 / 3.0 * m * m * m ) /
            dtime_old / 1e9;
  if ( FLA_Obj_is_complex( A ) )
    *gflops *= 4.0;

  *dtime = dtime_old;

  FLA_Copy_external( A_save, A );

  FLA_Obj_free( &A_save );
  FLA_Obj_free( &norm );
}
示例#6
0
void time_Bidiag_UT(
                 int param_combo, int type, int nrepeats, int m, int n,
                 FLA_Obj A, FLA_Obj tu, FLA_Obj tv, FLA_Obj TU, FLA_Obj TV,
                 double *dtime, double *diff, double *gflops )
{
  int
    irep;

  double
    dtime_old = 1.0e9;

  FLA_Obj
    A_save, norm;


  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_save );

  FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );

  FLA_Copy_external( A, A_save );

  for ( irep = 0 ; irep < nrepeats; irep++ ){

    FLA_Copy_external( A_save, A );

    *dtime = FLA_Clock();

    switch( param_combo ){

    case 0:
    {
      switch( type )
      {
      case FLA_ALG_REFERENCE:
        REF_Bidiag_UT( A, tu, tv );
        break;
      case FLA_ALG_FRONT:
        FLA_Bidiag_UT( A, TU, TV );
        break;
      }

      break;
    }

    }

    *dtime = FLA_Clock() - *dtime;
    dtime_old = min( *dtime, dtime_old );

  }

  {
    FLA_Obj AL, AR;
    FLA_Obj ATL, ATR,
            ABL, ABR;
    FLA_Obj QU;
    FLA_Obj QV, QVL, QVR;
    FLA_Obj E, EL, ER;
    FLA_Obj F;
    FLA_Obj WU, WV, eye;
    FLA_Obj tvT,
            tvB;
    dim_t   m_A, n_A, m_TU;


//FLA_Obj_show( "A_save", A_save, "%10.3e", "" );

    m_A = FLA_Obj_length( A );
    n_A = FLA_Obj_width( A );
    m_TU = FLA_Obj_length( TU );

    FLA_Obj_create( FLA_Obj_datatype( A ), m_A,  m_A, 0, 0, &QU );
    FLA_Obj_create( FLA_Obj_datatype( A ), n_A,  n_A, 0, 0, &QV );

    FLA_Obj_create( FLA_Obj_datatype( A ), m_TU,  m_A, 0, 0, &WU );
    FLA_Obj_create( FLA_Obj_datatype( A ), m_TU,  n_A, 0, 0, &WV );

    FLA_Set_to_identity( QU );
    FLA_Set_to_identity( QV );

    FLA_Part_1x2( QV,   &QVL, &QVR,   1, FLA_LEFT );
    FLA_Part_1x2( A,    &AL,  &AR,    1, FLA_LEFT );
    FLA_Part_2x2( A,    &ATL, &ATR,
                        &ABL, &ABR,   1, 1, FLA_BL );
    FLA_Part_2x1( tv,   &tvT,
                        &tvB,   1, FLA_BOTTOM );

    if ( type == FLA_ALG_REFERENCE )
    {
      if ( FLA_Obj_is_real( A ) )
        FLA_Apply_Q_blk_external( FLA_LEFT, FLA_TRANSPOSE, FLA_COLUMNWISE, A, tu, QU );
      else
        FLA_Apply_Q_blk_external( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_COLUMNWISE, A, tu, QU );
      //FLA_Apply_Q_blk_external( FLA_RIGHT, FLA_NO_TRANSPOSE, FLA_ROWWISE, AR, tv, QVR );
      //
      // Need to apply backwards transformation, since vectors are stored columnwise.
      // QL? RQ?
      //
      //FLA_Apply_Q_blk_external( FLA_RIGHT, FLA_NO_TRANSPOSE, FLA_ROWWISE, ATR, tvT, QVR );
      //FLA_Apply_Q_blk_external( FLA_RIGHT, FLA_CONJ_TRANSPOSE, FLA_ROWWISE, ATR, tvT, QVR );
      //FLA_Apply_Q_blk_external( FLA_RIGHT, FLA_CONJ_TRANSPOSE, FLA_ROWWISE, AR, tvT, QVR );
    }
    else
    {
      FLA_Apply_Q_UT( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_FORWARD, FLA_COLUMNWISE, A, TU, WU, QU );
      FLA_Apply_Q_UT( FLA_RIGHT, FLA_NO_TRANSPOSE, FLA_FORWARD, FLA_ROWWISE, AR, TV, WV, QVR );
    }

/*
    FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &eye );     
    FLA_Set_to_identity( eye );

    //FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
    //          FLA_ONE, QV, QV, FLA_MINUS_ONE, eye );
    FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
              FLA_ONE, QU, QU, FLA_MINUS_ONE, eye );

FLA_Obj_show( "eye", eye, "%10.3e", "" );
    FLA_Norm_frob( eye, norm );
    FLA_Obj_extract_real_scalar( norm, diff );
    FLA_Obj_free( &eye );
*/

    FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &E );
    FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &F );

    FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
              FLA_ONE, A_save, QV, FLA_ZERO, E );
    FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
              FLA_ONE, QU, E, FLA_ZERO, F );

//FLA_Obj_show( "A_save", A_save, "%10.3e", "" );

    FLA_Copy( A, E );
    FLA_Triangularize( FLA_UPPER_TRIANGULAR, FLA_NONUNIT_DIAG, E );
    FLA_Part_1x2( E,    &EL, &ER,      1, FLA_LEFT );
    FLA_Triangularize( FLA_LOWER_TRIANGULAR, FLA_NONUNIT_DIAG, ER );

//FLA_Obj_show( "B", E, "%10.3e", "" );
//FLA_Obj_show( "Q'AV", F, "%10.3e", "" );
//FLA_Obj_show( "B", E, "%10.3e + %10.3e ", "" );
//FLA_Obj_show( "Q'AV", F, "%10.3e + %10.3e ", "" );

    *diff = FLA_Max_elemwise_diff( E, F );
    FLA_Obj_free( &E );
    FLA_Obj_free( &F );

    FLA_Obj_free( &QU );
    FLA_Obj_free( &QV );
    FLA_Obj_free( &WU );
    FLA_Obj_free( &WV );
  }

  *gflops = 4.0 * n * n * ( m - n / 3.0 ) /
            dtime_old / 1e9;
  if ( FLA_Obj_is_complex( A ) )
    *gflops *= 4.0;

  *dtime = dtime_old;

  FLA_Copy_external( A_save, A );

  FLA_Obj_free( &A_save );
  FLA_Obj_free( &norm );
}