/*
 * Symm_ru_unb_var4
 *
 * Unblocked variant 4 of the update C := B * A + C.  A is swept from its
 * top-left to its bottom-right corner while B and C are swept from left to
 * right, exposing one column (b1, c1) per iteration.
 *
 * Only alpha11 and a12t are read from A, i.e. the diagonal element and the
 * part of the current row to its right, so only the upper triangle of A is
 * referenced.  (Presumably "ru" = right-side multiply, upper-stored
 * symmetric A.)
 *
 * Parameters:
 *   A - matrix whose upper triangle supplies the operand (read only).
 *   B - matrix partitioned by columns, same column count as A (read only).
 *   C - matrix partitioned like B; updated in place.
 *
 * Returns FLA_SUCCESS.
 */
int Symm_ru_unb_var4( FLA_Obj A, FLA_Obj B, FLA_Obj C )
{
  FLA_Obj ATL,   ATR,      A00,  a01,     A02, 
          ABL,   ABR,      a10t, alpha11, a12t,
                           A20,  a21,     A22;

  FLA_Obj BL,    BR,       B0,  b1,  B2;

  FLA_Obj CL,    CR,       C0,  c1,  C2;

  FLA_Part_2x2( A,    &ATL, &ATR,
                      &ABL, &ABR,     0, 0, FLA_TL );

  FLA_Part_1x2( B,    &BL,  &BR,      0, FLA_LEFT );

  FLA_Part_1x2( C,    &CL,  &CR,      0, FLA_LEFT );

  while ( FLA_Obj_length( ATL ) < FLA_Obj_length( A ) ){

    FLA_Repart_2x2_to_3x3( ATL, /**/ ATR,       &A00,  /**/ &a01,     &A02,
                        /* ************* */   /* ************************** */
                                                &a10t, /**/ &alpha11, &a12t,
                           ABL, /**/ ABR,       &A20,  /**/ &a21,     &A22,
                           1, 1, FLA_BR );

    FLA_Repart_1x2_to_1x3( BL,  /**/ BR,        &B0, /**/ &b1, &B2,
                           1, FLA_RIGHT );

    FLA_Repart_1x2_to_1x3( CL,  /**/ CR,        &C0, /**/ &c1, &C2,
                           1, FLA_RIGHT );

    /*------------------------------------------------------------*/

    // c1 = b1 * alpha11 + c1   (diagonal contribution)
    FLA_Axpy( alpha11, b1, c1 );

    // c1 = B2 * a12t' + c1     (entries of A below the diagonal, taken by
    //                           symmetry from the stored row a12t)
    FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, B2, a12t, FLA_ONE, c1 );

    // C2 = b1 * a12t + C2      (rank-1 update of the columns still to come;
    //                           x = b1 spans the rows of C2, y = a12t spans
    //                           its columns)
    FLA_Ger( FLA_ONE, b1, a12t, C2 );

    /*------------------------------------------------------------*/

    FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR,       A00,  a01,     /**/ A02,
                                                     a10t, alpha11, /**/ a12t,
                            /* ************** */  /* ************************ */
                              &ABL, /**/ &ABR,       A20,  a21,     /**/ A22,
                              FLA_TL );

    FLA_Cont_with_1x3_to_1x2( &BL,  /**/ &BR,        B0, b1, /**/ B2,
                              FLA_LEFT );

    FLA_Cont_with_1x3_to_1x2( &CL,  /**/ &CR,        C0, c1, /**/ C2,
                              FLA_LEFT );

  }

  return FLA_SUCCESS;
}
/*
 * FLA_Hess_UT_step_unb_var2
 *
 * Unblocked sweep over the leading columns of A.  The number of iterations
 * equals FLA_Obj_length( T ).  In every iteration where the trailing block
 * A22 is non-empty, a Householder transform is generated from the
 * sub-diagonal column a21 (its scalar is written to tau11) and applied to
 * a12t, A02 and A22; the vector A20' * u21 is written to t01.
 * (Judging by the name this is one step of a reduction to Hessenberg form
 * with the transforms accumulated in T -- the code itself only shows the
 * updates listed above.)
 *
 * Parameters:
 *   A - matrix updated in place.
 *   T - receives tau11 on its diagonal and t01 above it; its length also
 *       determines how many columns of A are processed.
 *
 * Returns FLA_SUCCESS.  All temporaries created here are freed before return.
 */
FLA_Error FLA_Hess_UT_step_unb_var2( FLA_Obj A, FLA_Obj T )
{
  FLA_Obj  ATL,   ATR,      A00,  a01,     A02, 
           ABL,   ABR,      a10t, alpha11, a12t,
                            A20,  a21,     A22;
  FLA_Obj  TTL,   TTR,      T00,  t01,   T02, 
           TBL,   TBR,      t10t, tau11, t12t,
                            T20,  t21,   T22;
  FLA_Obj  yT,              y0,
           yB,              psi1,
                            y2;
  FLA_Obj  zT,              z0,
           zB,              zeta1,
                            z2;
  FLA_Obj  y, z;
           
  FLA_Obj  inv_tau11;
  FLA_Obj  minus_inv_tau11;
  FLA_Obj  first_elem;
  FLA_Obj  beta;
  FLA_Obj  conj_beta;
  FLA_Obj  dot_product;

  FLA_Obj  a21_t,
           a21_b;

  FLA_Datatype datatype_A;
  dim_t        m_A;
  dim_t        b_alg;


  // The number of columns to process is taken from the length of T.
  b_alg      = FLA_Obj_length( T );

  datatype_A = FLA_Obj_datatype( A );
  m_A        = FLA_Obj_length( A );

  // Six 1x1 scalar temporaries and two m_A x 1 work vectors (y, z), all
  // created with the datatype of A.
  FLA_Obj_create( datatype_A, 1,   1, 0, 0, &inv_tau11 );
  FLA_Obj_create( datatype_A, 1,   1, 0, 0, &minus_inv_tau11 );
  FLA_Obj_create( datatype_A, 1,   1, 0, 0, &first_elem );
  FLA_Obj_create( datatype_A, 1,   1, 0, 0, &beta );
  FLA_Obj_create( datatype_A, 1,   1, 0, 0, &conj_beta );
  FLA_Obj_create( datatype_A, 1,   1, 0, 0, &dot_product );
  FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &y );
  FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &z );

  // Start with empty top-left / top partitions; they grow by one row and
  // column per iteration.
  FLA_Part_2x2( A,    &ATL, &ATR,
                      &ABL, &ABR,     0, 0, FLA_TL );
  FLA_Part_2x2( T,    &TTL, &TTR,
                      &TBL, &TBR,     0, 0, FLA_TL );
  FLA_Part_2x1( y,    &yT, 
                      &yB,            0, FLA_TOP );
  FLA_Part_2x1( z,    &zT, 
                      &zB,            0, FLA_TOP );

  // Iterate until ATL has b_alg rows.
  while ( FLA_Obj_length( ATL ) < b_alg )
  {
    FLA_Repart_2x2_to_3x3( ATL, /**/ ATR,       &A00,  /**/ &a01,     &A02,
                        /* ************* */   /* ************************** */
                                                &a10t, /**/ &alpha11, &a12t,
                           ABL, /**/ ABR,       &A20,  /**/ &a21,     &A22,
                           1, 1, FLA_BR );
    FLA_Repart_2x2_to_3x3( TTL, /**/ TTR,       &T00,  /**/ &t01,   &T02,
                        /* ************* */   /* ************************** */
                                                &t10t, /**/ &tau11, &t12t,
                           TBL, /**/ TBR,       &T20,  /**/ &t21,   &T22,
                           1, 1, FLA_BR );
    FLA_Repart_2x1_to_3x1( yT,                &y0, 
                        /* ** */            /* **** */
                                              &psi1, 
                           yB,                &y2,        1, FLA_BOTTOM );
    FLA_Repart_2x1_to_3x1( zT,                &z0, 
                        /* ** */            /* ***** */
                                              &zeta1, 
                           zB,                &z2,        1, FLA_BOTTOM );

    /*------------------------------------------------------------*/

    // The whole update is skipped when the trailing block A22 is empty.
    if ( FLA_Obj_length( A22 ) > 0 )
    {
      // Split a21 into its first element and the remainder, the form in
      // which FLA_Househ2_UT takes its input.
      FLA_Part_2x1( a21,    &a21_t,
                            &a21_b,        1, FLA_TOP );

      // [ u21, tau11, a21 ] = House( a21 );
      FLA_Househ2_UT( FLA_LEFT,
                      a21_t,
                      a21_b, tau11 );

      // inv_tau11            =  1 / tau11;
      // minus_inv_tau11      = -1 / tau11;
      FLA_Set( FLA_ONE, inv_tau11 );
      FLA_Inv_scalc( FLA_NO_CONJUGATE, tau11, inv_tau11 );
      FLA_Copy( inv_tau11, minus_inv_tau11 );
      FLA_Scal( FLA_MINUS_ONE, minus_inv_tau11 );

      // Save first element of a21_t and set it to one so we can use a21 as
      // u21 in subsequent computations. We will restore a21_t later on.
      FLA_Copy( a21_t, first_elem );
      FLA_Set( FLA_ONE, a21_t );

      // y21 = A22' * u21;
      FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, y2 );

      // z21 = A22 * u21;
      FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, z2 );

      // beta      = u21' * z21 / 2;
      // conj_beta = conj(beta);
      FLA_Dotc( FLA_CONJUGATE, a21, z2, beta );
      FLA_Inv_scal( FLA_TWO, beta );
      FLA_Copyt( FLA_CONJ_NO_TRANSPOSE, beta, conj_beta );

      // y21' = ( y21' - beta / tau * u21' ) / tau;
      // y21  = ( y21 - conj(beta) / tau * u21 ) / tau;
      // Note: conj_beta is scaled in place and no longer holds conj(beta)
      // afterwards.
      FLA_Scal( minus_inv_tau11, conj_beta );
      FLA_Axpy( conj_beta, a21, y2 );
      FLA_Scal( inv_tau11, y2 );

      // z21 = ( z21 - beta / tau * u21 ) / tau;
      // Note: beta is likewise scaled in place here.
      FLA_Scal( minus_inv_tau11, beta );
      FLA_Axpy( beta, a21, z2 );
      FLA_Scal( inv_tau11, z2 );

      // a12t = a12t * ( I - u21 * u21' / tau );
      //      = a12t - ( a12t * u21 ) * u21' / tau;
      FLA_Dot( a12t, a21, dot_product );
      FLA_Scal( minus_inv_tau11, dot_product );
      FLA_Axpyt( FLA_CONJ_TRANSPOSE, dot_product, a21, a12t );

      // A02 = A02 * ( I - u21 * u21' / tau );
      //     = A02 - ( A02 * u21 ) * u21' / tau;
      // y0 is used as scratch space for the product A02 * u21.
      FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A02, a21, FLA_ZERO, y0 );
      FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, minus_inv_tau11, y0, a21, A02 );

      // A22 = A22 - u21 * y21' - z21 * u21';
      FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, a21, y2, A22 );
      FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, z2, a21, A22 );

      // t01 = U20' * u21;
      FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A20, a21, FLA_ZERO, t01 );

      // Restore first element of a21.
      FLA_Copy( first_elem, a21_t );
    }

    /*------------------------------------------------------------*/

    // Move the exposed row/column into the "done" partitions.
    FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR,       A00,  a01,     /**/ A02,
                                                     a10t, alpha11, /**/ a12t,
                            /* ************** */  /* ************************ */
                              &ABL, /**/ &ABR,       A20,  a21,     /**/ A22,
                              FLA_TL );
    FLA_Cont_with_3x3_to_2x2( &TTL, /**/ &TTR,       T00,  t01,   /**/ T02,
                                                     t10t, tau11, /**/ t12t,
                            /* ************** */  /* ************************ */
                              &TBL, /**/ &TBR,       T20,  t21,   /**/ T22,
                              FLA_TL );
    FLA_Cont_with_3x1_to_2x1( &yT,                y0, 
                                                  psi1, 
                            /* ** */           /* **** */
                              &yB,                y2,     FLA_TOP );
    FLA_Cont_with_3x1_to_2x1( &zT,                z0, 
                                                  zeta1, 
                            /* ** */           /* ***** */
                              &zB,                z2,     FLA_TOP );
  }

  // Release every temporary created at the top of the function.
  FLA_Obj_free( &inv_tau11 );
  FLA_Obj_free( &minus_inv_tau11 );
  FLA_Obj_free( &first_elem );
  FLA_Obj_free( &beta );
  FLA_Obj_free( &conj_beta );
  FLA_Obj_free( &dot_product );
  FLA_Obj_free( &y );
  FLA_Obj_free( &z );

  return FLA_SUCCESS;
}
Exemple #3
0
/*
 * time_Apply_G_rf
 *
 * Times one FLA_Apply_G_rf_* implementation, selected by (variant, type),
 * and reports the best of n_repeats runs together with an accuracy figure.
 *
 * Parameters:
 *   variant   - 1, 2, 3, 6 or 9 select an implementation; 0 times an empty
 *               run; 4, 5, 7 and 8 are skipped (all outputs set to zero).
 *   type      - FLA_ALG_UNB_OPT, FLA_ALG_UNB_ASM or FLA_ALG_BLOCKED.
 *   n_repeats - number of timed repetitions; the minimum time is kept.
 *   m, k, n   - sizes used only in the flop-count formula below.
 *   b_alg     - block size handed to the blocked implementations.
 *   A         - matrix the routines operate on; restored on return.
 *   A_ref     - reference result: written when variant == 1 and
 *               type == FLA_ALG_UNB_OPT, read for every other combination.
 *   G         - second operand of the routines; restored on return.
 *   P         - unused.
 *   dtime     - out: best wall-clock time.
 *   diff      - out: Frobenius norm of ( A - A_ref ), or 0.0 for the run
 *               that produces A_ref.
 *   gflops    - out: 6 * k * m * ( n - 1 ) / time / 1e9, doubled when A is
 *               complex.
 */
void time_Apply_G_rf(
               int variant, int type, int n_repeats, int m, int k, int n, int b_alg,
               FLA_Obj A, FLA_Obj A_ref, FLA_Obj G, FLA_Obj P,
               double *dtime, double *diff, double *gflops )
{
  int     irep;
  double  dtime_old = 1.0e9;
  FLA_Obj A_save, G_save, norm;

  // Variants 4, 5, 7 and 8 have no dispatch entry below, for real and
  // complex data alike.  Zero every output -- including *dtime, which the
  // caller would otherwise read uninitialized -- and bail out.
  if ( ( FLA_Obj_is_real( A ) || FLA_Obj_is_complex( A ) ) &&
       ( variant == 4 || variant == 5 || variant == 7 || variant == 8 ) )
  {
    *dtime  = 0.0;
    *gflops = 0.0;
    *diff   = 0.0;
    return;
  }

  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_save );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, G, &G_save );
  FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );

  // Keep pristine copies so every repetition starts from the same input.
  FLA_Copy_external( A, A_save );
  FLA_Copy_external( G, G_save );

  for ( irep = 0 ; irep < n_repeats; irep++ )
  {
    FLA_Copy_external( A_save, A );
    FLA_Copy_external( G_save, G );

    *dtime = FLA_Clock();

    switch ( variant )
    {
    case 1:
      switch ( type )
      {
      case FLA_ALG_UNB_OPT:
        FLA_Apply_G_rf_opt_var1( G, A );
        break;
      case FLA_ALG_UNB_ASM:
        FLA_Apply_G_rf_asm_var1( G, A );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Apply_G_rf_blk_var1( G, A, b_alg );
        break;
      default:
        break;
      }
      break;

    case 2:
      switch ( type )
      {
      case FLA_ALG_UNB_OPT:
        FLA_Apply_G_rf_opt_var2( G, A );
        break;
      case FLA_ALG_UNB_ASM:
        FLA_Apply_G_rf_asm_var2( G, A );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Apply_G_rf_blk_var2( G, A, b_alg );
        break;
      default:
        break;
      }
      break;

    case 3:
      switch ( type )
      {
      case FLA_ALG_UNB_OPT:
        FLA_Apply_G_rf_opt_var3( G, A );
        break;
      case FLA_ALG_UNB_ASM:
        FLA_Apply_G_rf_asm_var3( G, A );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Apply_G_rf_blk_var3( G, A, b_alg );
        break;
      default:
        break;
      }
      break;

    case 6:
      switch ( type )
      {
      case FLA_ALG_UNB_OPT:
        FLA_Apply_G_rf_opt_var6( G, A );
        break;
      case FLA_ALG_UNB_ASM:
        FLA_Apply_G_rf_asm_var6( G, A );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Apply_G_rf_blk_var6( G, A, b_alg );
        break;
      default:
        break;
      }
      break;

    case 9:
      switch ( type )
      {
      case FLA_ALG_UNB_OPT:
        FLA_Apply_G_rf_opt_var9( G, A );
        break;
      case FLA_ALG_UNB_ASM:
        FLA_Apply_G_rf_asm_var9( G, A );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Apply_G_rf_blk_var9( G, A, b_alg );
        break;
      default:
        break;
      }
      break;

    case 0:
    default:
      // Nothing to run: only the timer overhead is measured.
      break;
    }

    *dtime = FLA_Clock() - *dtime;
    dtime_old = min( *dtime, dtime_old );
  }

  if ( variant == 1 && type == FLA_ALG_UNB_OPT )
  {
    // This combination defines the reference result for all the others.
    FLA_Copy( A, A_ref );
    *diff = 0.0;
  }
  else
  {
    // diff = || A - A_ref ||_F ; A is overwritten, then restored below.
    FLA_Axpy( FLA_MINUS_ONE, A_ref, A );
    FLA_Norm_frob( A, norm );
    FLA_Obj_extract_real_scalar( norm, diff );
  }

  // The leading 6.0 keeps the whole product in double precision.
  *gflops = 6.0 * k * m * ( n - 1 ) /
            dtime_old / 1e9;

  if ( FLA_Obj_is_complex( A ) )
    *gflops *= 2.0;

  *dtime = dtime_old;

  // Hand the operands back in their original state.
  FLA_Copy_external( A_save, A );
  FLA_Copy_external( G_save, G );

  FLA_Obj_free( &A_save );
  FLA_Obj_free( &G_save );
  FLA_Obj_free( &norm );
}
/*
 * Symm_ru_unb_var6
 *
 * Unblocked variant 6 of C := B * A + C.  A is traversed from its
 * bottom-right corner towards the top-left, and B and C from right to left,
 * one column per iteration.  Each iteration finishes one column c1 of C
 * using a01, alpha11 and a12t, i.e. only entries on or above the diagonal
 * of A.
 *
 * Returns FLA_SUCCESS.
 */
int Symm_ru_unb_var6( FLA_Obj A, FLA_Obj B, FLA_Obj C )
{
  /* Quadrants of A and the 3x3 view exposed inside the loop. */
  FLA_Obj ATL, ATR,
          ABL, ABR;
  FLA_Obj A00,  a01,     A02,
          a10t, alpha11, a12t,
          A20,  a21,     A22;

  /* Column partitions of B and C. */
  FLA_Obj BL, BR;
  FLA_Obj B0, b1, B2;
  FLA_Obj CL, CR;
  FLA_Obj C0, c1, C2;

  /* Everything starts in the "not yet processed" part: ABR, BR and CR are
     empty. */
  FLA_Part_2x2( A, &ATL, &ATR,
                   &ABL, &ABR, 0, 0, FLA_BR );
  FLA_Part_1x2( C, &CL, &CR, 0, FLA_RIGHT );
  FLA_Part_1x2( B, &BL, &BR, 0, FLA_RIGHT );

  while ( FLA_Obj_length( ABR ) < FLA_Obj_length( A ) )
  {
    /* Peel the last row/column off ATL, and the last column off BL and CL. */
    FLA_Repart_2x2_to_3x3( ATL, ATR, &A00,  &a01,     &A02,
                                     &a10t, &alpha11, &a12t,
                           ABL, ABR, &A20,  &a21,     &A22,
                           1, 1, FLA_TL );
    FLA_Repart_1x2_to_1x3( CL, CR, &C0, &c1, &C2, 1, FLA_LEFT );
    FLA_Repart_1x2_to_1x3( BL, BR, &B0, &b1, &B2, 1, FLA_LEFT );

    /* c1 += B0 * a01 */
    FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, B0, a01, FLA_ONE, c1 );

    /* c1 += alpha11 * b1 */
    FLA_Axpy( alpha11, b1, c1 );

    /* c1 += B2 * a12t'  (a12t stands in for a21) */
    FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, B2, a12t, FLA_ONE, c1 );

    /* Fold the processed row/column into ABR, BR and CR. */
    FLA_Cont_with_3x3_to_2x2( &ATL, &ATR, A00,  a01,     A02,
                                          a10t, alpha11, a12t,
                              &ABL, &ABR, A20,  a21,     A22,
                              FLA_BR );
    FLA_Cont_with_1x3_to_1x2( &CL, &CR, C0, c1, C2, FLA_RIGHT );
    FLA_Cont_with_1x3_to_1x2( &BL, &BR, B0, b1, B2, FLA_RIGHT );
  }

  return FLA_SUCCESS;
}
/*
 * Symm_unb_var6
 *
 * Unblocked variant 6 of C := A * B + C.  A is traversed from its top-left
 * to its bottom-right corner while B and C are traversed top to bottom, one
 * row per iteration.  Each iteration finishes one row c1t of C using a10t,
 * alpha11 and a21, i.e. only entries on or below the diagonal of A.
 *
 * Returns FLA_SUCCESS.
 */
int Symm_unb_var6( FLA_Obj A, FLA_Obj B, FLA_Obj C )
{
  /* Quadrants of A and the 3x3 view exposed inside the loop. */
  FLA_Obj ATL, ATR,
          ABL, ABR;
  FLA_Obj A00,  a01,     A02,
          a10t, alpha11, a12t,
          A20,  a21,     A22;

  /* Row partitions of B and C. */
  FLA_Obj BT, BB;
  FLA_Obj B0, b1t, B2;
  FLA_Obj CT, CB;
  FLA_Obj C0, c1t, C2;

  /* Nothing has been processed yet: ATL, BT and CT are empty. */
  FLA_Part_2x2( A, &ATL, &ATR,
                   &ABL, &ABR, 0, 0, FLA_TL );
  FLA_Part_2x1( C, &CT,
                   &CB, 0, FLA_TOP );
  FLA_Part_2x1( B, &BT,
                   &BB, 0, FLA_TOP );

  while ( FLA_Obj_length( ATL ) < FLA_Obj_length( A ) )
  {
    /* Expose the next row/column of A and the next row of B and C. */
    FLA_Repart_2x2_to_3x3( ATL, ATR, &A00,  &a01,     &A02,
                                     &a10t, &alpha11, &a12t,
                           ABL, ABR, &A20,  &a21,     &A22,
                           1, 1, FLA_BR );
    FLA_Repart_2x1_to_3x1( CT, &C0,
                               &c1t,
                           CB, &C2, 1, FLA_BOTTOM );
    FLA_Repart_2x1_to_3x1( BT, &B0,
                               &b1t,
                           BB, &B2, 1, FLA_BOTTOM );

    /* c1t += a10t * B0, computed as B0' * a10t' */
    FLA_Gemv( FLA_TRANSPOSE, FLA_ONE, B0, a10t, FLA_ONE, c1t );

    /* c1t += a21' * B2, computed as B2' * a21 */
    FLA_Gemv( FLA_TRANSPOSE, FLA_ONE, B2, a21, FLA_ONE, c1t );

    /* c1t += alpha11 * b1t */
    FLA_Axpy( alpha11, b1t, c1t );

    /* Fold the processed row/column into ATL, BT and CT. */
    FLA_Cont_with_3x3_to_2x2( &ATL, &ATR, A00,  a01,     A02,
                                          a10t, alpha11, a12t,
                              &ABL, &ABR, A20,  a21,     A22,
                              FLA_TL );
    FLA_Cont_with_3x1_to_2x1( &CT, C0,
                                   c1t,
                              &CB, C2, FLA_TOP );
    FLA_Cont_with_3x1_to_2x1( &BT, B0,
                                   b1t,
                              &BB, B2, FLA_TOP );
  }

  return FLA_SUCCESS;
}
Exemple #6
0
/*
 * time_Tevd_v
 *
 * Times one Tevd_v implementation and checks its result.
 *
 *   variant 0                   -> REF_Tevd_v
 *   variant 1, FLA_ALG_UNB_OPT  -> FLA_Tevd_v_opt_var1
 *   variant 2, FLA_ALG_UNB_OPT  -> FLA_Tevd_v_opt_var2
 *
 * A, G, d and e are reset from saved copies before every repetition and
 * once more before returning.  l receives a copy of d, which is then
 * sorted together with the columns of A by FLA_Sort_evd.
 *
 * Outputs:
 *   dtime  - best time over n_repeats runs.
 *   diff1  - Frobenius norm of tril( V * diag(l) * V' ) - A_orig, where V is
 *            the sorted content of A after the last run.
 *   diff2  - Frobenius norm of V * V' - I.
 *   gflops - flop-count formula divided by the best time.
 *
 * k_accum is accepted but not used here.
 */
void time_Tevd_v(
               int variant, int type, int n_repeats, int m, int k_accum, int b_alg, int n_iter_max,
               FLA_Obj A_orig, FLA_Obj d, FLA_Obj e, FLA_Obj G, FLA_Obj R, FLA_Obj W, FLA_Obj A, FLA_Obj l,
               double *dtime, double *diff1, double* diff2, double *gflops )
{
  double  k         = 2.00;
  double  best_time = 1.0e9;
  double  cubic_weight;
  int     rep;

  FLA_Obj A_save, G_save, d_save, e_save;
  FLA_Obj V, A_rev_evd, norm, eye;

  /* Hook for disabling individual variant/type combinations: OR the
     conditions to be skipped in front of FALSE. */
  if ( FALSE )
  {
    *dtime  = 0.0;
    *gflops = 0.0;
    *diff1  = 0.0;
    *diff2  = 0.0;
    return;
  }

  /* Snapshot the inputs. */
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_save );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, G, &G_save );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, d, &d_save );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, e, &e_save );

  FLA_Copy_external( A, A_save );
  FLA_Copy_external( G, G_save );
  FLA_Copy_external( d, d_save );
  FLA_Copy_external( e, e_save );

  for ( rep = 0; rep < n_repeats; rep++ )
  {
    FLA_Copy_external( A_save, A );
    FLA_Copy_external( G_save, G );
    FLA_Copy_external( d_save, d );
    FLA_Copy_external( e_save, e );

    *dtime = FLA_Clock();

    if ( variant == 0 )
    {
      REF_Tevd_v( d, e, A );
    }
    else if ( variant == 1 )
    {
      if ( type == FLA_ALG_UNB_OPT )
        FLA_Tevd_v_opt_var1( n_iter_max, d, e, G, A, b_alg );
    }
    else if ( variant == 2 )
    {
      if ( type == FLA_ALG_UNB_OPT )
        FLA_Tevd_v_opt_var2( n_iter_max, d, e, G, R, W, A, b_alg );
    }

    *dtime = FLA_Clock() - *dtime;
    best_time = min( *dtime, best_time );
  }

  /* ---- Accuracy checks on the result of the last repetition ---- */

  FLA_Copy( d, l );
  FLA_Sort_evd( FLA_FORWARD, l, A );

  FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &V );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_rev_evd );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &eye );
  FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );

  /* A_rev_evd = tril( ( A * diag(l) ) * V' ) */
  FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, l, A );
  FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
            FLA_ONE, A, V, FLA_ZERO, A_rev_evd );
  FLA_Triangularize( FLA_LOWER_TRIANGULAR, FLA_NONUNIT_DIAG, A_rev_evd );

  /* diff1 = || A_rev_evd - A_orig ||_F */
  FLA_Axpy( FLA_MINUS_ONE, A_orig, A_rev_evd );
  FLA_Norm_frob( A_rev_evd, norm );
  FLA_Obj_extract_real_scalar( norm, diff1 );

  /* diff2 = || V * V' - I ||_F ; A_rev_evd is reused to hold a copy of V. */
  FLA_Set_to_identity( eye );
  FLA_Copy( V, A_rev_evd );
  FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
            FLA_ONE, V, A_rev_evd, FLA_MINUS_ONE, eye );
  FLA_Norm_frob( eye, norm );
  FLA_Obj_extract_real_scalar( norm, diff2 );

  FLA_Obj_free( &V );
  FLA_Obj_free( &A_rev_evd );
  FLA_Obj_free( &eye );
  FLA_Obj_free( &norm );

  /* ---- Performance figure ---- */

  /* The cubic term counts twice for complex data. */
  if ( FLA_Obj_is_complex( A ) )
    cubic_weight = 2.0;
  else
    cubic_weight = 1.0;

  *gflops = (
                             (       4.5 * k * m * m     ) +
              cubic_weight * (       3.0 * k * m * m * m ) ) /
            best_time / 1e9;

  *dtime = best_time;

  /* Hand the operands back unchanged and drop the snapshots. */
  FLA_Copy_external( A_save, A );
  FLA_Copy_external( G_save, G );
  FLA_Copy_external( d_save, d );
  FLA_Copy_external( e_save, e );

  FLA_Obj_free( &A_save );
  FLA_Obj_free( &G_save );
  FLA_Obj_free( &d_save );
  FLA_Obj_free( &e_save );
}
Exemple #7
0
/*
 * time_Lyap_h
 *
 * Times one Lyap_h implementation, selected by (variant, type), and reports
 * the best of n_repeats runs together with a residual.
 *
 * Parameters:
 *   variant   - 0 runs REF_Lyap_h; 1..4 select FLA_Lyap_h_{unb,opt,blk}_varN.
 *   type      - FLA_ALG_UNBLOCKED, FLA_ALG_UNB_OPT or FLA_ALG_BLOCKED.
 *               FLA_ALG_UNB_OPT with variant > 4 is skipped (all outputs set
 *               to zero).
 *   n_repeats - number of timed repetitions; the minimum time is kept.
 *   m         - problem size used in the flop-count formula.
 *   nb_alg    - block size for the blocked control tree.
 *   isgn, A   - operands passed through to the routines (not modified here).
 *   C         - right-hand side, overwritten by each run and restored on
 *               return.
 *   C_ref     - unused.
 *   scale     - passed to the reference and blocked routines.
 *   dtime     - out: best wall-clock time.
 *   diff      - out: 1-norm of isgn * ( A' * X + X * A ) - C_save, where X is
 *               the Hermitianized (from the upper triangle) result of the
 *               last run.
 *   gflops    - out: ( 2/3 ) * m^3 / time / 1e9, times 4 for complex C.
 */
void time_Lyap_h(
               int variant, int type, int n_repeats, int m, int nb_alg,
               FLA_Obj isgn, FLA_Obj A, FLA_Obj C, FLA_Obj C_ref, FLA_Obj scale,
               double *dtime, double *diff, double *gflops )
{
  int irep;

  double dtime_old = 1.0e9;

  FLA_Obj C_save, norm;
  FLA_Obj X, W;

  fla_blocksize_t* bp;
  fla_lyap_t*      cntl_lyap_blk;

  // No optimized implementation exists past variant 4.  Zero every output,
  // including *dtime, so the caller never reads an unset value.
  if ( type == FLA_ALG_UNB_OPT && variant > 4 )
  {
    *dtime  = 0.0;
    *gflops = 0.0;
    *diff   = 0.0;
    return;
  }

  // Only the blocked variants take a control tree; the unblocked and
  // optimized entry points below are called without one.
  bp               = FLA_Blocksize_create( nb_alg, nb_alg, nb_alg, nb_alg );
  cntl_lyap_blk    = FLA_Cntl_lyap_obj_create( FLA_FLAT,
                                               FLA_BLK_VAR_OFFSET + variant,
                                               bp,
                                               fla_scal_cntl_blas,
                                               fla_lyap_cntl_leaf,
                                               fla_sylv_cntl,
                                               fla_gemm_cntl_blas,
                                               fla_gemm_cntl_blas,
                                               fla_hemm_cntl_blas,
                                               fla_her2k_cntl_blas );

  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &C_save );
  FLA_Obj_create( FLA_Obj_datatype_proj_to_real( C ), 1, 1, 0, 0, &norm );

  // Keep a pristine copy so every repetition starts from the same input.
  FLA_Copy_external( C, C_save );

  for ( irep = 0 ; irep < n_repeats; irep++ )
  {
    FLA_Copy_external( C_save, C );

    *dtime = FLA_Clock();

    switch( variant )
    {

    case 0:
      REF_Lyap_h( isgn, A, C, scale );
      break;

    case 1:
      switch( type )
      {
        case FLA_ALG_UNBLOCKED:
          FLA_Lyap_h_unb_var1( isgn, A, C );
          break;
        case FLA_ALG_UNB_OPT:
          FLA_Lyap_h_opt_var1( isgn, A, C );
          break;
        case FLA_ALG_BLOCKED:
          FLA_Lyap_h_blk_var1( isgn, A, C, scale, cntl_lyap_blk );
          break;
        default:
          break;
      }
      break;

    case 2:
      switch( type )
      {
        case FLA_ALG_UNBLOCKED:
          FLA_Lyap_h_unb_var2( isgn, A, C );
          break;
        case FLA_ALG_UNB_OPT:
          FLA_Lyap_h_opt_var2( isgn, A, C );
          break;
        case FLA_ALG_BLOCKED:
          FLA_Lyap_h_blk_var2( isgn, A, C, scale, cntl_lyap_blk );
          break;
        default:
          break;
      }
      break;

    case 3:
      switch( type )
      {
        case FLA_ALG_UNBLOCKED:
          FLA_Lyap_h_unb_var3( isgn, A, C );
          break;
        case FLA_ALG_UNB_OPT:
          FLA_Lyap_h_opt_var3( isgn, A, C );
          break;
        case FLA_ALG_BLOCKED:
          FLA_Lyap_h_blk_var3( isgn, A, C, scale, cntl_lyap_blk );
          break;
        default:
          break;
      }
      break;

    case 4:
      switch( type )
      {
        case FLA_ALG_UNBLOCKED:
          FLA_Lyap_h_unb_var4( isgn, A, C );
          break;
        case FLA_ALG_UNB_OPT:
          FLA_Lyap_h_opt_var4( isgn, A, C );
          break;
        case FLA_ALG_BLOCKED:
          FLA_Lyap_h_blk_var4( isgn, A, C, scale, cntl_lyap_blk );
          break;
        default:
          break;
      }
      break;

    default:
      // Unknown variant: nothing is run, only timer overhead is measured.
      break;
    }

    *dtime = FLA_Clock() - *dtime;
    dtime_old = min( *dtime, dtime_old );
  }

  FLA_Blocksize_free( bp );
  FLA_Cntl_obj_free( cntl_lyap_blk );

  // Residual check: W = isgn * ( A' * X + X * A ) - C_save.
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &X );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &W );

  FLA_Copy( C, X );
  FLA_Hermitianize( FLA_UPPER_TRIANGULAR, X );

  FLA_Gemm( FLA_CONJ_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A, X, FLA_ZERO, W );
  FLA_Gemm( FLA_NO_TRANSPOSE,   FLA_NO_TRANSPOSE, FLA_ONE, X, A, FLA_ONE,  W );
  FLA_Scal( isgn, W );

  FLA_Axpy( FLA_MINUS_ONE, C_save, W );
  FLA_Norm1( W, norm );
  FLA_Obj_extract_real_scalar( norm, diff );

  FLA_Obj_free( &X );
  FLA_Obj_free( &W );

  // Promote m to double before cubing: m * m * m in int overflows (undefined
  // behaviour) as soon as m exceeds 1290.
  *gflops = ( 2.0 / 3.0 ) * ( (double) m * m * m ) / 
            dtime_old / 1e9;

  if ( FLA_Obj_is_complex( C ) )
    *gflops *= 4.0;

  *dtime = dtime_old;

  // Hand C back in its original state.
  FLA_Copy_external( C_save, C );

  FLA_Obj_free( &C_save );
  FLA_Obj_free( &norm );
}
/*
 * time_Hevd_lv_components
 *
 * Times one Hevd_lv implementation, keeping the component times reported by
 * the routine for the fastest of n_repeats runs, and checks the result.
 *
 *   variant  0 / -1 / -2  -> REF_Hevd_lv_components / REF_Hevdd_... / REF_Hevdr_...
 *   variant  1 / 2        -> FLA_Hevd_lv_var{1,2}_components
 *   variant -3 / -4 / -5  -> currently skipped (all outputs set to zero)
 *
 * Outputs:
 *   dtime, dtime_tred, dtime_tevd, dtime_appq
 *            - total and per-component times of the fastest run.
 *   gflops, gflops_tred, gflops_tevd, gflops_appq
 *            - flop-count formulas divided by the matching time.
 *   diff1    - Frobenius norm of tril( V * diag(l) * V' ) - A_save.
 *   diff2    - Frobenius norm of V * V' - I.
 *   k_perf   - value returned by the FLA variants, 0 for the REF ones.
 *
 * A is restored from its saved copy before returning.
 */
void time_Hevd_lv_components(
               int variant, int type, int n_repeats, int m, int n_iter_max, int k_accum, int b_alg,
               FLA_Obj A, FLA_Obj l,
               double* dtime, double* diff1, double* diff2, double* gflops,
               double* dtime_tred, double* gflops_tred,
               double* dtime_tevd, double* gflops_tevd,
               double* dtime_appq, double* gflops_appq, int* k_perf )
{
  double  k               = 2.00;
  double  dtime_save      = 1.0e9;
  double  dtime_tred_save = 1.0e9;
  double  dtime_tevd_save = 1.0e9;
  double  dtime_appq_save = 1.0e9;
  double  flops_tred, flops_tevd, flops_appq;
  double  mult_tred,  mult_tevd,  mult_appq;
  int     rep;
  int     is_cplx;

  FLA_Obj A_save, Z;
  FLA_Obj V, A_rev_evd, norm, eye;

  /* Disabled variants: report zeros and leave. */
  if ( variant == -3 || variant == -4 || variant == -5 )
  {
    *gflops      = 0.0;
    *dtime       = 0.0;
    *diff1       = 0.0;
    *diff2       = 0.0;
    *dtime_tred  = 0.0;
    *dtime_tevd  = 0.0;
    *dtime_appq  = 0.0;
    *gflops_tred = 0.0;
    *gflops_tevd = 0.0;
    *gflops_appq = 0.0;
    *k_perf      = 0;
    return;
  }

  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_save );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &Z );

  FLA_Copy_external( A, A_save );

  for ( rep = 0; rep < n_repeats; rep++ )
  {
    FLA_Copy_external( A_save, A );

    *dtime = FLA_Clock();

    switch ( variant )
    {
    case -3:
      *k_perf = 0;
      REF_Hevd_lv( A, l,
                   dtime_tred, dtime_tevd, dtime_appq );
      break;

    case -4:
      *k_perf = 0;
      REF_Hevdd_lv( A, l,
                    dtime_tred, dtime_tevd, dtime_appq );
      break;

    case -5:
      *k_perf = 0;
      REF_Hevdr_lv( A, l, Z,
                    dtime_tred, dtime_tevd, dtime_appq );
      break;

    case 0:
      *k_perf = 0;
      REF_Hevd_lv_components( A, l,
                              dtime_tred, dtime_tevd, dtime_appq );
      break;

    case -1:
      *k_perf = 0;
      REF_Hevdd_lv_components( A, l,
                               dtime_tred, dtime_tevd, dtime_appq );
      break;

    case -2:
      *k_perf = 0;
      REF_Hevdr_lv_components( A, l, Z,
                               dtime_tred, dtime_tevd, dtime_appq );
      break;

    case 1:
      *k_perf = FLA_Hevd_lv_var1_components( n_iter_max, A, l, k_accum, b_alg,
                                             dtime_tred, dtime_tevd, dtime_appq );
      break;

    case 2:
      *k_perf = FLA_Hevd_lv_var2_components( n_iter_max, A, l, k_accum, b_alg,
                                             dtime_tred, dtime_tevd, dtime_appq );
      break;
    }

    *dtime = FLA_Clock() - *dtime;

    /* Remember the component times belonging to the fastest total. */
    if ( *dtime < dtime_save )
    {
      dtime_save      = *dtime;
      dtime_tred_save = *dtime_tred;
      dtime_tevd_save = *dtime_tevd;
      dtime_appq_save = *dtime_appq;
    }
  }

  *dtime      = dtime_save;
  *dtime_tred = dtime_tred_save;
  *dtime_tevd = dtime_tevd_save;
  *dtime_appq = dtime_appq_save;

  /* ---- Accuracy checks on the result of the last repetition ---- */

  /* Variants -2 and -5 are handed Z as an extra output; take the result
     from there. */
  if ( variant == -2 || variant == -5 )
    FLA_Copy( Z, A );

  FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &V );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_rev_evd );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &eye );
  FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );

  /* A_rev_evd = tril( ( A * diag(l) ) * V' ) */
  FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, l, A );
  FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
            FLA_ONE, A, V, FLA_ZERO, A_rev_evd );
  FLA_Triangularize( FLA_LOWER_TRIANGULAR, FLA_NONUNIT_DIAG, A_rev_evd );

  /* diff1 = || A_rev_evd - A_save ||_F */
  FLA_Axpy( FLA_MINUS_ONE, A_save, A_rev_evd );
  FLA_Norm_frob( A_rev_evd, norm );
  FLA_Obj_extract_real_scalar( norm, diff1 );

  /* diff2 = || V * V' - I ||_F ; A_rev_evd is reused to hold a copy of V. */
  FLA_Set_to_identity( eye );
  FLA_Copy( V, A_rev_evd );
  FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
            FLA_ONE, V, A_rev_evd, FLA_MINUS_ONE, eye );
  FLA_Norm_frob( eye, norm );
  FLA_Obj_extract_real_scalar( norm, diff2 );

  FLA_Obj_free( &V );
  FLA_Obj_free( &A_rev_evd );
  FLA_Obj_free( &eye );
  FLA_Obj_free( &norm );

  /* ---- Performance figures ---- */

  flops_tred = ( ( 4.0 / 3.0 )   * m * m * m );
  flops_tevd = (   4.5           * k * m * m     +
                   3.0           * k * m * m * m );

  /* Variants -1, -2, -4 and -5 are charged 2 m^3 for the back-transform,
     all others (4/3) m^3. */
  switch ( variant )
  {
  case -1:
  case -2:
  case -4:
  case -5:
    flops_appq = ( 2.0           * m * m * m );
    break;
  default:
    flops_appq = ( 4.0 / 3.0     * m * m * m );
    break;
  }

  /* Complex data multiplies the work of each stage by a fixed factor. */
  is_cplx   = FLA_Obj_is_complex( A ) ? 1 : 0;
  mult_tred = is_cplx ? 4.0 : 1.0;
  mult_tevd = is_cplx ? 2.0 : 1.0;
  mult_appq = is_cplx ? 4.0 : 1.0;

  *gflops = ( mult_tred * flops_tred + 
              mult_tevd * flops_tevd + 
              mult_appq * flops_appq ) / *dtime / 1e9;

  *gflops_tred = ( mult_tred * flops_tred ) / *dtime_tred / 1e9;
  *gflops_tevd = ( mult_tevd * flops_tevd ) / *dtime_tevd / 1e9;
  *gflops_appq = ( mult_appq * flops_appq ) / *dtime_appq / 1e9;

  /* Hand A back unchanged and drop the work objects. */
  FLA_Copy_external( A_save, A );

  FLA_Obj_free( &A_save );
  FLA_Obj_free( &Z );
}