int Symm_ru_blk_var6( FLA_Obj A, FLA_Obj B, FLA_Obj C, int nb_alg )
{
  FLA_Obj ATL,   ATR,      A00, A01, A02, 
          ABL,   ABR,      A10, A11, A12,
                           A20, A21, A22;

  FLA_Obj BL,    BR,       B0,  B1,  B2;

  FLA_Obj CL,    CR,       C0,  C1,  C2;

  int b;

  FLA_Part_2x2( A,    &ATL, &ATR,
                      &ABL, &ABR,     0, 0, FLA_BR );

  FLA_Part_1x2( B,    &BL,  &BR,      0, FLA_RIGHT );

  FLA_Part_1x2( C,    &CL,  &CR,      0, FLA_RIGHT );

  while ( FLA_Obj_length( ABR ) < FLA_Obj_length( A ) ){

    b = min( FLA_Obj_length( ATL ), nb_alg );

    FLA_Repart_2x2_to_3x3( ATL, /**/ ATR,       &A00, &A01, /**/ &A02,
                                                &A10, &A11, /**/ &A12,
                        /* ************* */   /* ******************** */
                           ABL, /**/ ABR,       &A20, &A21, /**/ &A22,
                           b, b, FLA_TL );

    FLA_Repart_1x2_to_1x3( BL,  /**/ BR,        &B0, &B1, /**/ &B2,
                           b, FLA_LEFT );

    FLA_Repart_1x2_to_1x3( CL,  /**/ CR,        &C0, &C1, /**/ &C2,
                           b, FLA_LEFT );

    /*------------------------------------------------------------*/

    /*C1 = B0 * A01 + B1 * A11 + B2 * A12' + C1;*/
    FLA_Gemm(FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, B0, A01, FLA_ONE, C1);
    FLA_Gemm(FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, B1, A11, FLA_ONE, C1);
    FLA_Gemm(FLA_NO_TRANSPOSE, FLA_TRANSPOSE, FLA_ONE, B2, A12, FLA_ONE, C1);

    /*------------------------------------------------------------*/

    FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR,       A00, /**/ A01, A02,
                            /* ************** */  /* ****************** */
                                                     A10, /**/ A11, A12,
                              &ABL, /**/ &ABR,       A20, /**/ A21, A22,
                              FLA_BR );

    FLA_Cont_with_1x3_to_1x2( &BL,  /**/ &BR,        B0, /**/ B1, B2,
                              FLA_RIGHT );

    FLA_Cont_with_1x3_to_1x2( &CL,  /**/ &CR,        C0, /**/ C1, C2,
                              FLA_RIGHT );

  }

  return FLA_SUCCESS;
}
Example #2
0
FLA_Error LU_blk_var4( FLA_Obj A, int nb_alg )
{
  FLA_Obj ATL,   ATR,      A00, A01, A02, 
          ABL,   ABR,      A10, A11, A12,
                           A20, A21, A22;

  dim_t b;

  FLA_Part_2x2( A,    &ATL, &ATR,
                      &ABL, &ABR,     0, 0, FLA_TL );

  while ( FLA_Obj_length( ATL ) < FLA_Obj_length( A ) ){

    b = min( FLA_Obj_length( ABR ), nb_alg );

    FLA_Repart_2x2_to_3x3( ATL, /**/ ATR,       &A00, /**/ &A01, &A02,
                        /* ************* */   /* ******************** */
                                                &A10, /**/ &A11, &A12,
                           ABL, /**/ ABR,       &A20, /**/ &A21, &A22,
                           b, b, FLA_BR );

    /*------------------------------------------------------------*/

    /* A11 = A11 - A10 * A01 ); */
    FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
              FLA_MINUS_ONE, A10, A01, FLA_ONE, A11 );

    /* A11 = LU( A11 ); */
    LU_unb_var4( A11 );

    /* A12 = A12 - A10 * A02; */
    FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
              FLA_MINUS_ONE, A10, A02, FLA_ONE, A12 );

    /* A12 = inv( trilu( A11 ) ) * A12; */
    FLA_Trsm( FLA_LEFT, FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_UNIT_DIAG,
              FLA_ONE, A11, A12 );

    /* A21 = A21 - A20 * A01; */
    FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
              FLA_MINUS_ONE, A20, A01, FLA_ONE, A21 );

    /* A21 = A21 * inv( triu( A11 ) ); */
    FLA_Trsm( FLA_RIGHT, FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG,
              FLA_ONE, A11, A21 );

    /*------------------------------------------------------------*/

    FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR,       A00, A01, /**/ A02,
                                                     A10, A11, /**/ A12,
                            /* ************** */  /* ****************** */
                              &ABL, /**/ &ABR,       A20, A21, /**/ A22,
                              FLA_TL );
  }

  return FLA_SUCCESS;
}
Example #3
0
int LU_blk_var3( FLA_Obj A, int nb_alg )
{
  FLA_Obj ATL,   ATR,      A00, A01, A02, 
          ABL,   ABR,      A10, A11, A12,
                           A20, A21, A22;

  int b;

  FLA_Part_2x2( A,    &ATL, &ATR,
                      &ABL, &ABR,     0, 0, FLA_TL );

  while ( FLA_Obj_length( ATL ) < FLA_Obj_length( A ) ){

    b = min( FLA_Obj_length( ABR ), nb_alg );

    FLA_Repart_2x2_to_3x3( ATL, /**/ ATR,       &A00, /**/ &A01, &A02,
                        /* ************* */   /* ******************** */
                                                &A10, /**/ &A11, &A12,
                           ABL, /**/ ABR,       &A20, /**/ &A21, &A22,
                           b, b, FLA_BR );

    /*------------------------------------------------------------*/

    // A01 := inv(L00) * A01
    FLA_Trsm( FLA_LEFT, FLA_LOWER_TRIANGULAR,
	      FLA_NO_TRANSPOSE, FLA_UNIT_DIAG,
	      FLA_ONE, A00, A01 );

    // A11 := LU(A11 - A10 * A01)
    FLA_Gemm(FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
                FLA_MINUS_ONE, A10, A01, FLA_ONE, A11);
    LU_unb_var3(A11);

    // A21 := (A21 - A20 * A01) * inv(U11)
    FLA_Gemm(FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
	      FLA_MINUS_ONE, A20, A01, FLA_ONE, A21);
    FLA_Trsm( FLA_RIGHT, FLA_UPPER_TRIANGULAR,
	      FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG,
	      FLA_ONE, A11, A21);

    /*------------------------------------------------------------*/

    FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR,       A00, A01, /**/ A02,
                                                     A10, A11, /**/ A12,
                            /* ************** */  /* ****************** */
                              &ABL, /**/ &ABR,       A20, A21, /**/ A22,
                              FLA_TL );

  }

  return FLA_SUCCESS;
}
// ============================================================================
void compute_case1( int m, int n, int k, int l, 
         FLA_Obj cb_A, FLA_Obj cb_B, FLA_Obj C, int print_data ) {
  FLA_Obj  slice_A, slice_B;
  int      datatype, h;
  double   * buff_cb_A, * buff_cb_B;

  // Some initializations.
  datatype  = FLA_Obj_datatype( cb_A );
  buff_cb_A = ( double * ) FLA_Obj_buffer_at_view( cb_A );
  buff_cb_B = ( double * ) FLA_Obj_buffer_at_view( cb_B );

  // Prepare temporal slices.
  FLA_Obj_create_without_buffer( datatype, m, k, & slice_A );
  FLA_Obj_create_without_buffer( datatype, n, k, & slice_B );

  // Initialize matrix C for the result.
  MyFLA_Obj_set_to_zero( C );

  // Show data.
  if( print_data == 1 ) {
    FLA_Obj_show( " Ci = [ ", C, "%le", " ];" );
    FLA_Obj_show( " cb_A = [ ", cb_A, "%le", " ];" );
    FLA_Obj_show( " cb_B = [ ", cb_B, "%le", " ];" );
  }

  // Perform computation.
  for( h = 0; h < l; h++ ) {
    FLA_Obj_attach_buffer( buff_cb_A + m * k * h, 1, m, & slice_A );
    FLA_Obj_attach_buffer( buff_cb_B + n * k * h, 1, n, & slice_B );
    FLA_Gemm( FLA_NO_TRANSPOSE, FLA_TRANSPOSE, 
              FLA_ONE, slice_A, slice_B, FLA_ONE, C );
  }
  
  // Remove temporal slices.
  FLA_Obj_free_without_buffer( & slice_A );
  FLA_Obj_free_without_buffer( & slice_B );

  // Show data.
  if( print_data == 1 ) {
    FLA_Obj_show( " Cf = [ ", C, "%le", " ];" );
  }
}
Example #5
0
int main(int argc, char *argv[])
{
  int 
    m_input,
    m,
    p_first, p_last, p_inc,
    p,
    k_accum,
    b_alg,
    n_iter_max,
    variant,
    n_repeats,
    i,
    n_variants = 2;

  char *colors = "brkgmcbrkg";
  char *ticks  = "o+*xso+*xs";
  char m_dim_desc[14];
  char m_dim_tag[10];

  double max_gflops=6.0;

  double
    dtime,
    gflops,
    diff1, diff2;

  FLA_Datatype datatype, dt_real;

  FLA_Obj
    A, l, Q, Ql, TT, r, d, e, A_orig, G, R, W2, de, alpha;

  FLA_Init();


  fprintf( stdout, "%c number of repeats:", '%' );
  scanf( "%d", &n_repeats );
  fprintf( stdout, "%c %d\n", '%', n_repeats );

  fprintf( stdout, "%c enter n_iter_max (per eigenvalue): ", '%' );
  scanf( "%d", &n_iter_max );
  fprintf( stdout, "%c %d\n", '%', n_iter_max );

  fprintf( stdout, "%c enter number of sets of Givens rotations to accumulate:", '%' );
  scanf( "%d", &k_accum );
  fprintf( stdout, "%c %d\n", '%', k_accum );

  fprintf( stdout, "%c enter blocking size for application of G:", '%' );
  scanf( "%d", &b_alg );
  fprintf( stdout, "%c %d\n", '%', b_alg );

  fprintf( stdout, "%c enter problem size first, last, inc:", '%' );
  scanf( "%d%d%d", &p_first, &p_last, &p_inc );
  fprintf( stdout, "%c %d %d %d\n", '%', p_first, p_last, p_inc );

  fprintf( stdout, "%c enter m (-1 means bind to problem size): ", '%' );
  scanf( "%d", &m_input );
  fprintf( stdout, "%c %d\n", '%', m_input );


  fprintf( stdout, "\n" );


  if     ( m_input >  0 ) {
    sprintf( m_dim_desc, "m = %d", m_input );
    sprintf( m_dim_tag,  "m%dc", m_input);
  }
  else if( m_input <  -1 ) {
    sprintf( m_dim_desc, "m = p/%d", -m_input );
    sprintf( m_dim_tag,  "m%dp", -m_input );
  }
  else if( m_input == -1 ) {
    sprintf( m_dim_desc, "m = p" );
    sprintf( m_dim_tag,  "m%dp", 1 );
  }


  for ( p = p_first, i = 1; p <= p_last; p += p_inc, i += 1 )
  {

    m = m_input;

    if( m < 0 ) m = p / abs(m_input);

    //datatype = FLA_FLOAT;
    //datatype = FLA_DOUBLE;
    //datatype = FLA_COMPLEX;
    datatype = FLA_DOUBLE_COMPLEX;

    FLA_Obj_create( datatype,  m,         m, 0, 0, &A );
    FLA_Obj_create( datatype,  m,         m, 0, 0, &A_orig );
    FLA_Obj_create( datatype,  m,         m, 0, 0, &Q );
    FLA_Obj_create( datatype,  m,         m, 0, 0, &Ql );
    FLA_Obj_create( datatype,  m,         1, 0, 0, &r );
    FLA_Obj_create( datatype,  m,         m, 0, 0, &W2 );
    FLA_Obj_create( datatype,  m-1, k_accum, 0, 0, &G );

	dt_real = FLA_Obj_datatype_proj_to_real( A );

    FLA_Obj_create( dt_real, m,      1, 0, 0, &l );
    FLA_Obj_create( dt_real, m,      1, 0, 0, &d );
    FLA_Obj_create( dt_real, m-1,    1, 0, 0, &e );
    FLA_Obj_create( dt_real, m,      m, 0, 0, &R );

	FLA_Obj_create( dt_real, 1,      1, 0, 0, &alpha );

	*FLA_DOUBLE_PTR( alpha ) = 1.0 / ( sqrt( sqrt( (double) m ) ) );

	FLA_Random_unitary_matrix( Q );

	//FLA_Fill_with_uniform_dist( FLA_ONE,   l );
	//FLA_Fill_with_inverse_dist( FLA_ONE,   l );
	FLA_Fill_with_geometric_dist( alpha,   l );


    {
      FLA_Copy( Q, Ql );
      FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, l, Ql );
      FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
                FLA_ONE, Ql, Q, FLA_ZERO, A );
      FLA_Triangularize( FLA_LOWER_TRIANGULAR, FLA_NONUNIT_DIAG, A );
      FLA_Copy( A, A_orig );
    }

    FLA_Set( FLA_ZERO, l );
    FLA_Set( FLA_ZERO, Q );

	FLA_Tridiag_UT_create_T( A, &TT );
	FLA_Tridiag_UT( FLA_LOWER_TRIANGULAR, A, TT );
	FLA_Tridiag_UT_realify( FLA_LOWER_TRIANGULAR, A, r );
	FLA_Tridiag_UT_extract_diagonals( FLA_LOWER_TRIANGULAR, A, d, e );
	FLA_Tridiag_UT_form_Q( FLA_LOWER_TRIANGULAR, A, TT );
	FLA_Apply_diag_matrix( FLA_RIGHT, FLA_CONJUGATE, r, A );
    FLA_Obj_free( &TT );

    time_Tevd_v( 0, FLA_ALG_REFERENCE, n_repeats, m, k_accum, b_alg, n_iter_max,
                 A_orig, d, e, G, R, W2, A, l, &dtime, &diff1, &diff2, &gflops );

    fprintf( stdout, "data_REFq( %d, 1:3 ) = [ %d %6.3lf %9.2e %6.2le %6.2le ]; \n", i, p, gflops, dtime, diff1, diff2 );
    fflush( stdout );

    for ( variant = 1; variant <= n_variants; variant++ ){
      
      fprintf( stdout, "data_var%d( %d, 1:3 ) = [ %d ", variant, i, p );
      fflush( stdout );

      time_Tevd_v( variant, FLA_ALG_UNB_OPT, n_repeats, m, k_accum, b_alg, n_iter_max,
                   A_orig, d, e, G, R, W2, A, l, &dtime, &diff1, &diff2, &gflops );

      fprintf( stdout, "%6.3lf %9.2e %6.2le %6.2le ", gflops, dtime, diff1, diff2 );
      fflush( stdout );

      fprintf( stdout, "];\n" );
      fflush( stdout );
    }

    fprintf( stdout, "\n" );

    FLA_Obj_free( &A );
    FLA_Obj_free( &A_orig );
    FLA_Obj_free( &Q );
    FLA_Obj_free( &Ql );
    FLA_Obj_free( &G );
    FLA_Obj_free( &W2 );
    FLA_Obj_free( &r );
    FLA_Obj_free( &l );
    FLA_Obj_free( &d );
    FLA_Obj_free( &e );
    FLA_Obj_free( &R );
    FLA_Obj_free( &alpha );
  }

/*
  fprintf( stdout, "figure;\n" );

  fprintf( stdout, "plot( data_REF( :,1 ), data_REF( :, 2 ), '-' ); \n" );

  fprintf( stdout, "hold on;\n" );

  for ( i = 1; i <= n_variants; i++ ) {
    fprintf( stdout, "plot( data_var%d( :,1 ), data_var%d( :, 2 ), '%c:%c' ); \n",
            i, i, colors[ i-1 ], ticks[ i-1 ] );
    fprintf( stdout, "plot( data_var%d( :,1 ), data_var%d( :, 4 ), '%c-.%c' ); \n",
            i, i, colors[ i-1 ], ticks[ i-1 ] );
  }

  fprintf( stdout, "legend( ... \n" );
  fprintf( stdout, "'Reference', ... \n" );

  for ( i = 1; i < n_variants; i++ )
    fprintf( stdout, "'unb\\_var%d', 'blk\\_var%d', ... \n", i, i );
  fprintf( stdout, "'unb\\_var%d', 'blk\\_var%d' ); \n", i, i );

  fprintf( stdout, "xlabel( 'problem size p' );\n" );
  fprintf( stdout, "ylabel( 'GFLOPS/sec.' );\n" );
  fprintf( stdout, "axis( [ 0 %d 0 %.2f ] ); \n", p_last, max_gflops );
  fprintf( stdout, "title( 'FLAME Hevd_lv performance (%s, %s)' );\n", 
           m_dim_desc, n_dim_desc );
  fprintf( stdout, "print -depsc tridiag_%s_%s.eps\n", m_dim_tag, n_dim_tag );
  fprintf( stdout, "hold off;\n");
  fflush( stdout );
*/

  FLA_Finalize( );

  return 0;
}
Example #6
0
FLA_Error FLA_Svd_uv_unb_var1( dim_t n_iter_max, FLA_Obj A, FLA_Obj s, FLA_Obj U, FLA_Obj V, dim_t k_accum, dim_t b_alg )
{
    FLA_Error    r_val = FLA_SUCCESS;
    FLA_Datatype dt;
    FLA_Datatype dt_real;
    FLA_Datatype dt_comp;
    FLA_Obj      scale, T, S, rL, rR, d, e, G, H;
    dim_t        m_A, n_A;
    dim_t        min_m_n;
    dim_t        n_GH;
    double       crossover_ratio = 17.0 / 9.0;

    n_GH    = k_accum;

    m_A     = FLA_Obj_length( A );
    n_A     = FLA_Obj_width( A );
    min_m_n = FLA_Obj_min_dim( A );
    dt      = FLA_Obj_datatype( A );
    dt_real = FLA_Obj_datatype_proj_to_real( A );
    dt_comp = FLA_Obj_datatype_proj_to_complex( A );

    // Create matrices to hold block Householder transformations.
    FLA_Bidiag_UT_create_T( A, &T, &S );

    // Create vectors to hold the realifying scalars.
    FLA_Obj_create( dt,      min_m_n,      1, 0, 0, &rL );
    FLA_Obj_create( dt,      min_m_n,      1, 0, 0, &rR );

    // Create vectors to hold the diagonal and sub-diagonal.
    FLA_Obj_create( dt_real, min_m_n,      1, 0, 0, &d );
    FLA_Obj_create( dt_real, min_m_n-1,    1, 0, 0, &e );

    // Create matrices to hold the left and right Givens scalars.
    FLA_Obj_create( dt_comp, min_m_n-1, n_GH, 0, 0, &G );
    FLA_Obj_create( dt_comp, min_m_n-1, n_GH, 0, 0, &H );

    // Create a real scaling factor.
    FLA_Obj_create( dt_real, 1, 1, 0, 0, &scale );

    // Compute a scaling factor; If none is needed, sigma will be set to one.
    FLA_Svd_compute_scaling( A, scale );

    // Scale the matrix if scale is non-unit.
    if ( !FLA_Obj_equals( scale, FLA_ONE ) )
        FLA_Scal( scale, A );

    if ( m_A < crossover_ratio * n_A )
    {
        // Reduce the matrix to bidiagonal form.
        // Apply scalars to rotate elements on the superdiagonal to the real domain.
        // Extract the diagonal and superdiagonal from A.
        FLA_Bidiag_UT( A, T, S );
        FLA_Bidiag_UT_realify( A, rL, rR );
        FLA_Bidiag_UT_extract_real_diagonals( A, d, e );

        // Form U and V.
        FLA_Bidiag_UT_form_U( A, T, U );
        FLA_Bidiag_UT_form_V( A, S, V );

        // Apply the realifying scalars in rL and rR to U and V, respectively.
        {
            FLA_Obj UL, UR;
            FLA_Obj VL, VR;

            FLA_Part_1x2( U,   &UL, &UR,   min_m_n, FLA_LEFT );
            FLA_Part_1x2( V,   &VL, &VR,   min_m_n, FLA_LEFT );

            FLA_Apply_diag_matrix( FLA_RIGHT, FLA_CONJUGATE,    rL, UL );
            FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, rR, VL );
        }

        // Perform a singular value decomposition on the bidiagonal matrix.
        r_val = FLA_Bsvd_v_opt_var1( n_iter_max, d, e, G, H, U, V, b_alg );
    }
    else // if ( crossover_ratio * n_A <= m_A )
    {
        FLA_Obj TQ, R;
        FLA_Obj AT,
                AB;
        FLA_Obj UL, UR;

        // Perform a QR factorization on A and form Q in U.
        FLA_QR_UT_create_T( A, &TQ );
        FLA_QR_UT( A, TQ );
        FLA_QR_UT_form_Q( A, TQ, U );
        FLA_Obj_free( &TQ );

        // Set the lower triangle of R to zero and then copy the upper
        // triangle of A to R.
        FLA_Part_2x1( A,   &AT,
                           &AB,   n_A, FLA_TOP );
        FLA_Obj_create( dt, n_A, n_A, 0, 0, &R );
        FLA_Setr( FLA_LOWER_TRIANGULAR, FLA_ZERO, R );
        FLA_Copyr( FLA_UPPER_TRIANGULAR, AT, R );

        // Reduce the matrix to bidiagonal form.
        // Apply scalars to rotate elements on the superdiagonal to the real domain.
        // Extract the diagonal and superdiagonal from A.
        FLA_Bidiag_UT( R, T, S );
        FLA_Bidiag_UT_realify( R, rL, rR );
        FLA_Bidiag_UT_extract_real_diagonals( R, d, e );

        // Form V from right Householder vectors in upper triangle of R.
        FLA_Bidiag_UT_form_V( R, S, V );

        // Form U in R.
        FLA_Bidiag_UT_form_U( R, T, R );

        // Apply the realifying scalars in rL and rR to U and V, respectively.
        FLA_Apply_diag_matrix( FLA_RIGHT, FLA_CONJUGATE,    rL, R );
        FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, rR, V );

        // Perform a singular value decomposition on the bidiagonal matrix.
        r_val = FLA_Bsvd_v_opt_var1( n_iter_max, d, e, G, H, R, V, b_alg );

        // Multiply R into U, storing the result in A and then copying back
        // to U.
        FLA_Part_1x2( U,   &UL, &UR,   n_A, FLA_LEFT );
        FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
                  FLA_ONE, UL, R, FLA_ZERO, A );
        FLA_Copy( A, UL );

        FLA_Obj_free( &R );
    }

    // Copy the converged eigenvalues to the output vector.
    FLA_Copy( d, s );

    // Sort the singular values and singular vectors in descending order.
    FLA_Sort_svd( FLA_BACKWARD, s, U, V );

    // If the matrix was scaled, rescale the singular values.
    if ( !FLA_Obj_equals( scale, FLA_ONE ) )
        FLA_Inv_scal( scale, s );

    FLA_Obj_free( &scale );
    FLA_Obj_free( &T );
    FLA_Obj_free( &S );
    FLA_Obj_free( &rL );
    FLA_Obj_free( &rR );
    FLA_Obj_free( &d );
    FLA_Obj_free( &e );
    FLA_Obj_free( &G );
    FLA_Obj_free( &H );

    return r_val;
}
FLA_Error FLA_Svd_uv_var2_components( dim_t n_iter_max, dim_t k_accum, dim_t b_alg,
                                      FLA_Obj A, FLA_Obj s, FLA_Obj U, FLA_Obj V,
                                      double* dtime_bred, double* dtime_bsvd, double* dtime_appq,
                                      double* dtime_qrfa, double* dtime_gemm )
{
	FLA_Error    r_val = FLA_SUCCESS;
	FLA_Datatype dt;
	FLA_Datatype dt_real;
	FLA_Datatype dt_comp;
	FLA_Obj      T, S, rL, rR, d, e, G, H, RG, RH, W;
	dim_t        m_A, n_A;
	dim_t        min_m_n;
	dim_t        n_GH;
	double       crossover_ratio = 17.0 / 9.0;
	double       dtime_temp;

	n_GH    = k_accum;

	m_A     = FLA_Obj_length( A );
	n_A     = FLA_Obj_width( A );
	min_m_n = FLA_Obj_min_dim( A );
	dt      = FLA_Obj_datatype( A );
	dt_real = FLA_Obj_datatype_proj_to_real( A );
	dt_comp = FLA_Obj_datatype_proj_to_complex( A );

	// If the matrix is a scalar, then the SVD is easy.
	if ( min_m_n == 1 )
	{
		FLA_Copy( A, s );
		FLA_Set_to_identity( U );
		FLA_Set_to_identity( V );

		return FLA_SUCCESS;
	}

	// Create matrices to hold block Householder transformations.
	FLA_Bidiag_UT_create_T( A, &T, &S );

	// Create vectors to hold the realifying scalars.
	FLA_Obj_create( dt,      min_m_n,      1, 0, 0, &rL );
	FLA_Obj_create( dt,      min_m_n,      1, 0, 0, &rR );

	// Create vectors to hold the diagonal and sub-diagonal.
	FLA_Obj_create( dt_real, min_m_n,      1, 0, 0, &d );
	FLA_Obj_create( dt_real, min_m_n-1,    1, 0, 0, &e );

	// Create matrices to hold the left and right Givens scalars.
	FLA_Obj_create( dt_comp, min_m_n-1, n_GH, 0, 0, &G );
	FLA_Obj_create( dt_comp, min_m_n-1, n_GH, 0, 0, &H );

	// Create matrices to hold the left and right Givens matrices.
	FLA_Obj_create( dt_real, min_m_n, min_m_n, 0, 0, &RG );
	FLA_Obj_create( dt_real, min_m_n, min_m_n, 0, 0, &RH );
	FLA_Obj_create( dt,      m_A,     n_A,     0, 0, &W );

	if ( m_A >= n_A )
	{
		if ( m_A < crossover_ratio * n_A )
		{
			dtime_temp = FLA_Clock();
			{
			// Reduce the matrix to bidiagonal form.
			// Apply scalars to rotate elements on the sub-diagonal to the real domain.
			// Extract the diagonal and sub-diagonal from A.
			FLA_Bidiag_UT( A, T, S );
			FLA_Bidiag_UT_realify( A, rL, rR );
			FLA_Bidiag_UT_extract_diagonals( A, d, e );
			}
			*dtime_bred = FLA_Clock() - dtime_temp;

			dtime_temp = FLA_Clock();
			{
			// Form U and V.
			FLA_Bidiag_UT_form_U( A, T, U );
			FLA_Bidiag_UT_form_V( A, S, V );
			}
			*dtime_appq = FLA_Clock() - dtime_temp;

			// Apply the realifying scalars in rL and rR to U and V, respectively.
			{
				FLA_Obj UL, UR;
				FLA_Obj VL, VR;

				FLA_Part_1x2( U,   &UL, &UR,   min_m_n, FLA_LEFT );
				FLA_Part_1x2( V,   &VL, &VR,   min_m_n, FLA_LEFT );

				FLA_Apply_diag_matrix( FLA_RIGHT, FLA_CONJUGATE,    rL, UL );
				FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, rR, VL );
			}

			dtime_temp = FLA_Clock();
			{
			// Perform a singular value decomposition on the bidiagonal matrix.
			r_val = FLA_Bsvd_v_opt_var2( n_iter_max, d, e, G, H, RG, RH, W, U, V, b_alg );
			}
			*dtime_bsvd = FLA_Clock() - dtime_temp;
		}
		else // if ( crossover_ratio * n_A <= m_A )
		{
			FLA_Obj TQ, R;
			FLA_Obj AT,
			        AB;
			FLA_Obj UL, UR;

			//FLA_QR_UT_create_T( A, &TQ );
			FLA_Obj_create( dt, 32, n_A, 0, 0, &TQ );

			dtime_temp = FLA_Clock();
			{
			// Perform a QR factorization on A and form Q in U.
			FLA_QR_UT( A, TQ );
			}
			*dtime_qrfa = FLA_Clock() - dtime_temp;

			dtime_temp = FLA_Clock();
			{
			FLA_QR_UT_form_Q( A, TQ, U );
			}
			*dtime_appq = FLA_Clock() - dtime_temp;

			FLA_Obj_free( &TQ );

			// Set the lower triangle of R to zero and then copy the upper
			// triangle of A to R.
			FLA_Part_2x1( A,   &AT,
			                   &AB,   n_A, FLA_TOP );
			FLA_Obj_create( dt, n_A, n_A, 0, 0, &R );
			FLA_Setr( FLA_LOWER_TRIANGULAR, FLA_ZERO, R );
			FLA_Copyr( FLA_UPPER_TRIANGULAR, AT, R );

			dtime_temp = FLA_Clock();
			{
			// Reduce the matrix to bidiagonal form.
			// Apply scalars to rotate elements on the superdiagonal to the real domain.
			// Extract the diagonal and superdiagonal from A.
			FLA_Bidiag_UT( R, T, S );
			FLA_Bidiag_UT_realify( R, rL, rR );
			FLA_Bidiag_UT_extract_diagonals( R, d, e );
			}
			*dtime_bred = FLA_Clock() - dtime_temp;

			dtime_temp = FLA_Clock();
			{
			// Form V from right Householder vectors in upper triangle of R.
			FLA_Bidiag_UT_form_V( R, S, V );

			// Form U in R.
			FLA_Bidiag_UT_form_U( R, T, R );
			}
			*dtime_appq += FLA_Clock() - dtime_temp;

			// Apply the realifying scalars in rL and rR to U and V, respectively.
			FLA_Apply_diag_matrix( FLA_RIGHT, FLA_CONJUGATE,    rL, R );
			FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, rR, V );

			dtime_temp = FLA_Clock();
			{
			// Perform a singular value decomposition on the bidiagonal matrix.
			r_val = FLA_Bsvd_v_opt_var2( n_iter_max, d, e, G, H, RG, RH, W, R, V, b_alg );
			}
			*dtime_bsvd = FLA_Clock() - dtime_temp;

			dtime_temp = FLA_Clock();
			{
			// Multiply R into U, storing the result in A and then copying back
			// to U.
			FLA_Part_1x2( U,   &UL, &UR,   n_A, FLA_LEFT );
			FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
			          FLA_ONE, UL, R, FLA_ZERO, A );
			FLA_Copy( A, UL );
			}
			*dtime_gemm = FLA_Clock() - dtime_temp;

			FLA_Obj_free( &R );
		}
	}
	else // if ( m_A < n_A )
	{
		FLA_Check_error_code( FLA_NOT_YET_IMPLEMENTED );
	}

	// Copy the converged eigenvalues to the output vector.
	FLA_Copy( d, s );

	// Sort the singular values and singular vectors in descending order.
	FLA_Sort_svd( FLA_BACKWARD, s, U, V );

	FLA_Obj_free( &T );
	FLA_Obj_free( &S );
	FLA_Obj_free( &rL );
	FLA_Obj_free( &rR );
	FLA_Obj_free( &d );
	FLA_Obj_free( &e );
	FLA_Obj_free( &G );
	FLA_Obj_free( &H );
	FLA_Obj_free( &RG );
	FLA_Obj_free( &RH );
	FLA_Obj_free( &W );

	return r_val;
}
int Symm_blk_var5( FLA_Obj A, FLA_Obj B, FLA_Obj C, int nb_alg )
{
  FLA_Obj ATL,   ATR,      A00, A01, A02, 
          ABL,   ABR,      A10, A11, A12,
                           A20, A21, A22;

  FLA_Obj BT,              B0,
          BB,              B1,
                           B2;

  FLA_Obj CT,              C0,
          CB,              C1,
                           C2;

  int b;

  FLA_Part_2x2( A,    &ATL, &ATR,
                      &ABL, &ABR,     0, 0, FLA_TL );

  FLA_Part_2x1( B,    &BT, 
                      &BB,            0, FLA_TOP );

  FLA_Part_2x1( C,    &CT, 
                      &CB,            0, FLA_TOP );

  while ( FLA_Obj_length( ATL ) < FLA_Obj_length( A ) ){

    b = min( FLA_Obj_length( ABR ), nb_alg );

    FLA_Repart_2x2_to_3x3( ATL, /**/ ATR,       &A00, /**/ &A01, &A02,
                        /* ************* */   /* ******************** */
                                                &A10, /**/ &A11, &A12,
                           ABL, /**/ ABR,       &A20, /**/ &A21, &A22,
                           b, b, FLA_BR );

    FLA_Repart_2x1_to_3x1( BT,                &B0, 
                        /* ** */            /* ** */
                                              &B1, 
                           BB,                &B2,        b, FLA_BOTTOM );

    FLA_Repart_2x1_to_3x1( CT,                &C0, 
                        /* ** */            /* ** */
                                              &C1, 
                           CB,                &C2,        b, FLA_BOTTOM );

    /*------------------------------------------------------------*/
	  
    // C2 = C2 + A21*B1;
    FLA_Gemm(FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A21, B1, FLA_ONE, C2);

    // C1 = C1 + A11*B1 + A21^T*B2;
    // FLA_Gemm(FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A11, B1, FLA_ONE, C1);
    FLA_Symm(FLA_LEFT, FLA_LOWER_TRIANGULAR, FLA_ONE, A11, B1, FLA_ONE, C1);
    FLA_Gemm(FLA_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A21, B2, FLA_ONE, C1);

    /*------------------------------------------------------------*/

    FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR,       A00, A01, /**/ A02,
                                                     A10, A11, /**/ A12,
                            /* ************** */  /* ****************** */
                              &ABL, /**/ &ABR,       A20, A21, /**/ A22,
                              FLA_TL );

    FLA_Cont_with_3x1_to_2x1( &BT,                B0, 
                                                  B1, 
                            /* ** */           /* ** */
                              &BB,                B2,     FLA_TOP );

    FLA_Cont_with_3x1_to_2x1( &CT,                C0, 
                                                  C1, 
                            /* ** */           /* ** */
                              &CB,                C2,     FLA_TOP );

  }

  return FLA_SUCCESS;
}
Example #9
0
void FLA_Gemm_kernel( FLA_Obj alpha, FLA_Obj packed_A, 
		      FLA_Obj packed_B, FLA_Obj packed_C )
{
  FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, alpha, packed_A, packed_B,
	    FLA_ONE, packed_C );
}
Example #10
0
void time_Gemm( 
               int param_combo, int type, int nrepeats, int m, int k, int n,
               FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj C_ref,
               double *dtime, double *diff, double *gflops )
{
  int
    irep;

  double
    dtime_old = 1.0e9;

  FLA_Obj
    C_old;

  if ( param_combo != 4 )
  {
    *gflops = 0.0;
    *diff   = 0.0;
    return;
  }

  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &C_old );

  FLA_Copy_external( C, C_old );


  for ( irep = 0 ; irep < nrepeats; irep++ ){
    FLA_Copy_external( C_old, C );

    *dtime = FLA_Clock();

    switch( param_combo ){

    // Time parameter combination 0
    case 0:{
      switch( type ){
      case FLA_ALG_REFERENCE:
        REF_Gemm( FLA_CONJ_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
        break;
      case FLA_ALG_FRONT:
        FLA_Gemm( FLA_CONJ_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    // Time parameter combination 1
    case 1:{
      switch( type ){
      case FLA_ALG_REFERENCE:
        REF_Gemm( FLA_CONJ_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
        break;
      case FLA_ALG_FRONT:
        FLA_Gemm( FLA_CONJ_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    // Time parameter combination 2
    case 2:{
      switch( type ){
      case FLA_ALG_REFERENCE:
        REF_Gemm( FLA_CONJ_TRANSPOSE, FLA_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
        break;
      case FLA_ALG_FRONT:
        FLA_Gemm( FLA_CONJ_TRANSPOSE, FLA_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    // Time parameter combination 3
    case 3:{
      switch( type ){
      case FLA_ALG_REFERENCE:
        REF_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
        break;
      case FLA_ALG_FRONT:
        FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    // Time parameter combination 4
    case 4:{
      switch( type ){
      case FLA_ALG_REFERENCE:
        REF_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
        break;
      case FLA_ALG_FRONT:
        //FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
        //FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
        FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A, B, FLA_ONE, C );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    // Time parameter combination 5
    case 5:{
      switch( type ){
      case FLA_ALG_REFERENCE:
        REF_Gemm( FLA_NO_TRANSPOSE, FLA_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
        break;
      case FLA_ALG_FRONT:
        FLA_Gemm( FLA_NO_TRANSPOSE, FLA_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    // Time parameter combination 6
    case 6:{
      switch( type ){
      case FLA_ALG_REFERENCE:
        REF_Gemm( FLA_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
        break;
      case FLA_ALG_FRONT:
        FLA_Gemm( FLA_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    // Time parameter combination 7
    case 7:{
      switch( type ){
      case FLA_ALG_REFERENCE:
        REF_Gemm( FLA_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
        break;
      case FLA_ALG_FRONT:
        FLA_Gemm( FLA_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    // Time parameter combination 8
    case 8:{
      switch( type ){
      case FLA_ALG_REFERENCE:
        REF_Gemm( FLA_TRANSPOSE, FLA_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
        break;
      case FLA_ALG_FRONT:
        FLA_Gemm( FLA_TRANSPOSE, FLA_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    }
	
    *dtime = FLA_Clock() - *dtime;
    dtime_old = min( *dtime, dtime_old );
  }


/*
  if ( type == FLA_ALG_REFERENCE )
  {
    FLA_Copy_external( C, C_ref );
    *diff = 0.0;
  }
  else
  {
    *diff = FLA_Max_elemwise_diff( C, C_ref );
  }
*/

  *gflops = 2.0 * m * k * n / 
            dtime_old / 
            1.0e9;

  if ( param_combo == 0 ||
       param_combo == 1 ||
       param_combo == 2 ||
       param_combo == 3 ||
       param_combo == 6 )
  *gflops *= 4.0;

  *dtime = dtime_old;

  FLA_Copy_external( C_old, C );

  FLA_Obj_free( &C_old );
}
Example #11
0
void time_Tevd_v(
               int variant, int type, int n_repeats, int m, int k_accum, int b_alg, int n_iter_max,
               FLA_Obj A_orig, FLA_Obj d, FLA_Obj e, FLA_Obj G, FLA_Obj R, FLA_Obj W, FLA_Obj A, FLA_Obj l,
               double *dtime, double *diff1, double* diff2, double *gflops )
{
  int irep;

  double
    k, dtime_old = 1.0e9;

  FLA_Obj
    A_save, G_save, d_save, e_save;

  if (
       //( variant == 0 ) ||
       //( variant == 1 && type == FLA_ALG_UNB_OPT ) ||
       //( variant == 2 && type == FLA_ALG_UNB_OPT ) ||
       FALSE
     )
  {
    *dtime  = 0.0;
    *gflops = 0.0;
    *diff1  = 0.0;
    *diff2  = 0.0;
    return;
  }

  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_save );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, G, &G_save );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, d, &d_save );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, e, &e_save );

  FLA_Copy_external( A, A_save );
  FLA_Copy_external( G, G_save );
  FLA_Copy_external( d, d_save );
  FLA_Copy_external( e, e_save );

  for ( irep = 0 ; irep < n_repeats; irep++ ){

    FLA_Copy_external( A_save, A );
    FLA_Copy_external( G_save, G );
    FLA_Copy_external( d_save, d );
    FLA_Copy_external( e_save, e );

    *dtime = FLA_Clock();

    switch( variant ){

    case 0:
      REF_Tevd_v( d, e, A );
      break;

    // Time variant 1
    case 1:
    {
      switch( type ){
      case FLA_ALG_UNB_OPT:
        FLA_Tevd_v_opt_var1( n_iter_max, d, e, G, A, b_alg );
        break;
      }
      break;
    }

    // Time variant 2
    case 2:
    {
      switch( type ){
      case FLA_ALG_UNB_OPT:
        FLA_Tevd_v_opt_var2( n_iter_max, d, e, G, R, W, A, b_alg );
        break;
      }
      break;
    }

    }

    *dtime = FLA_Clock() - *dtime;
    dtime_old = min( *dtime, dtime_old );

  }
  {
    FLA_Obj V, A_rev_evd, norm, eye;

	FLA_Copy( d, l );

//FLA_Obj_show( "A_save", A_save, "%9.2e + %9.2e ", "" );
//FLA_Obj_show( "A_evd", A, "%9.2e + %9.2e ", "" );
	FLA_Sort_evd( FLA_FORWARD, l, A );

    FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &V ); 
    FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_rev_evd ); 
    FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &eye ); 
    FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );


    FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, l, A );

    FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
              FLA_ONE, A, V, FLA_ZERO, A_rev_evd );
    FLA_Triangularize( FLA_LOWER_TRIANGULAR, FLA_NONUNIT_DIAG, A_rev_evd );

/*
    FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
              FLA_ONE, A, D, FLA_ZERO, A_rev_evd );
    FLA_Copy( A_rev_evd, D );
    FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
              FLA_ONE, D, V, FLA_ZERO, A_rev_evd );
    FLA_Triangularize( FLA_LOWER_TRIANGULAR, FLA_NONUNIT_DIAG, A_rev_evd );
*/
//FLA_Obj_show( "A_rev_evd", A_rev_evd, "%9.2e + %9.2e ", "" );
 
    FLA_Axpy( FLA_MINUS_ONE, A_orig, A_rev_evd );
    FLA_Norm_frob( A_rev_evd, norm );
    FLA_Obj_extract_real_scalar( norm, diff1 );
    //*diff = FLA_Max_elemwise_diff( A_orig, A_rev_evd );

    FLA_Set_to_identity( eye );
	FLA_Copy( V, A_rev_evd );
    FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
              FLA_ONE, V, A_rev_evd, FLA_MINUS_ONE, eye );
    FLA_Norm_frob( eye, norm );
    FLA_Obj_extract_real_scalar( norm, diff2 );

/*
FLA_Obj_free( &EL );
FLA_Obj_free( &EU );
FLA_Obj_free( &D );
FLA_Obj_free( &dc );
FLA_Obj_free( &ec );
*/

    FLA_Obj_free( &V );
    FLA_Obj_free( &A_rev_evd );
    FLA_Obj_free( &eye );
    FLA_Obj_free( &norm );
  }

  k = 2.00;

  if ( FLA_Obj_is_complex( A ) )
  {
    *gflops = (
                      (       4.5 * k * m * m     ) +
                2.0 * (       3.0 * k * m * m * m ) ) / 
              dtime_old / 1e9;
  }
  else 
  {
    *gflops = (
                      (       4.5 * k * m * m     ) +
                1.0 * (       3.0 * k * m * m * m ) ) / 
              dtime_old / 1e9;
  }

  *dtime = dtime_old;

  FLA_Copy_external( A_save, A );
  FLA_Copy_external( G_save, G );
  FLA_Copy_external( d_save, d );
  FLA_Copy_external( e_save, e );

  FLA_Obj_free( &A_save );
  FLA_Obj_free( &G_save );
  FLA_Obj_free( &d_save );
  FLA_Obj_free( &e_save );
}
Example #12
0
void libfla_test_hessut_experiment( test_params_t params,
                                    unsigned int  var,
                                    char*         sc_str,
                                    FLA_Datatype  datatype,
                                    unsigned int  p_cur,
                                    unsigned int  pci,
                                    unsigned int  n_repeats,
                                    signed int    impl,
                                    double*       perf,
                                    double*       residual )
{
	dim_t        b_alg_flat = params.b_alg_flat;
	double       time_min   = 1e9;
	double       time;
	unsigned int i;
	unsigned int m;
	signed int   m_input    = -1;
	FLA_Obj      A, T, W, Qh, AQ, QhAQ, norm;
	FLA_Obj      AT, AB;
	FLA_Obj      QhT, QhB;
	FLA_Obj      A_save;

	// Determine the dimensions.
	if ( m_input < 0 ) m = p_cur * abs(m_input);
	else               m = p_cur;

	// Create the matrices for the current operation.
	libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[0], m, m, &A );

	if ( impl == FLA_TEST_FLAT_FRONT_END ||
	     impl == FLA_TEST_FLAT_BLK_VAR )
	{
		libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], b_alg_flat, m, &T );
		libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], b_alg_flat, m, &W );
	}
	else
	{
		libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], m, m, &T );
		libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], m, m, &W );
	}

	// Initialize the test matrices.
	FLA_Random_matrix( A );

	// Save the original object contents in a temporary object.
	FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &A_save );

	// Create auxiliary matrices to be used when checking the result.
	FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &Qh );
	FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &AQ );
	FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &QhAQ );

	// Create a real scalar object to hold the norm of A.
	FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );

	// Create a control tree for the individual variants.
	if ( impl == FLA_TEST_FLAT_UNB_VAR ||
	     impl == FLA_TEST_FLAT_OPT_VAR ||
	     impl == FLA_TEST_FLAT_BLK_VAR )
		libfla_test_hessut_cntl_create( var, b_alg_flat );

	// Repeat the experiment n_repeats times and record results.
	for ( i = 0; i < n_repeats; ++i )
	{
		FLA_Copy_external( A_save, A );
		
		time = FLA_Clock();

		libfla_test_hessut_impl( impl, A, T );
		
		time = FLA_Clock() - time;
		time_min = min( time_min, time );
	}

	// Free the control trees if we're testing the variants.
	if ( impl == FLA_TEST_FLAT_UNB_VAR ||
	     impl == FLA_TEST_FLAT_OPT_VAR ||
	     impl == FLA_TEST_FLAT_BLK_VAR )
		libfla_test_hessut_cntl_free();

	// Compute the performance of the best experiment repeat.
	*perf = ( 10.0 / 3.0 * m * m * m ) / time_min / FLOPS_PER_UNIT_PERF;
	if ( FLA_Obj_is_complex( A ) ) *perf *= 4.0;

	// Check the result by computing R - Q' A_orig Q.
	FLA_Set_to_identity( Qh );
	FLA_Part_2x1( Qh,   &QhT,
	                    &QhB,   1, FLA_TOP );
	FLA_Part_2x1( A,    &AT,
	                    &AB,    1, FLA_TOP );
	FLA_Apply_Q_UT( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_FORWARD, FLA_COLUMNWISE,
	                AB, T, W, QhB );
	FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
	          FLA_ONE, A_save, Qh, FLA_ZERO, AQ );
	FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
	          FLA_ONE, Qh, AQ, FLA_ZERO, QhAQ );
	FLA_Triangularize( FLA_UPPER_TRIANGULAR, FLA_NONUNIT_DIAG, AB );
	*residual = FLA_Max_elemwise_diff( A, QhAQ );

	// Free the supporting flat objects.
	FLA_Obj_free( &W );
	FLA_Obj_free( &Qh );
	FLA_Obj_free( &AQ );
	FLA_Obj_free( &QhAQ );
	FLA_Obj_free( &norm );
	FLA_Obj_free( &A_save );

	// Free the flat test matrices.
	FLA_Obj_free( &A );
	FLA_Obj_free( &T );
}
Example #13
0
FLA_Error REF_Gemm( FLA_Trans transa, FLA_Trans transb, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C )
{
  FLA_Gemm( transa, transb, alpha, A, B, beta, C );
  
  return 0;
}
Example #14
0
void time_Lyap_h(
               int variant, int type, int n_repeats, int m, int nb_alg,
               FLA_Obj isgn, FLA_Obj A, FLA_Obj C, FLA_Obj C_ref, FLA_Obj scale,
               double *dtime, double *diff, double *gflops )
{
  int irep;

  double dtime_old = 1.0e9;

  FLA_Obj C_save, norm;

  fla_blocksize_t* bp;
  fla_lyap_t*      cntl_lyap_unb;
  fla_lyap_t*      cntl_lyap_opt;
  fla_lyap_t*      cntl_lyap_blk;

  if ( type == FLA_ALG_UNB_OPT && variant > 4 )
  {
    *gflops = 0.0;
    *diff   = 0.0;
    return;
  }

  bp               = FLA_Blocksize_create( nb_alg, nb_alg, nb_alg, nb_alg );
  cntl_lyap_unb    = FLA_Cntl_lyap_obj_create( FLA_FLAT,
                                               FLA_UNB_VAR_OFFSET + variant,
                                               NULL,
                                               NULL,
                                               NULL,
                                               NULL,
                                               NULL,
                                               NULL,
                                               NULL,
                                               NULL );
  cntl_lyap_opt    = FLA_Cntl_lyap_obj_create( FLA_FLAT,
                                               FLA_OPT_VAR_OFFSET + variant,
                                               NULL,
                                               NULL,
                                               NULL,
                                               NULL,
                                               NULL,
                                               NULL,
                                               NULL,
                                               NULL );
  cntl_lyap_blk    = FLA_Cntl_lyap_obj_create( FLA_FLAT,
                                               FLA_BLK_VAR_OFFSET + variant,
                                               bp,
                                               fla_scal_cntl_blas,
                                               fla_lyap_cntl_leaf,
                                               fla_sylv_cntl,
                                               fla_gemm_cntl_blas,
                                               fla_gemm_cntl_blas,
                                               fla_hemm_cntl_blas,
                                               fla_her2k_cntl_blas );

  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &C_save );
  FLA_Obj_create( FLA_Obj_datatype_proj_to_real( C ), 1, 1, 0, 0, &norm );

  FLA_Copy_external( C, C_save );


  for ( irep = 0 ; irep < n_repeats; irep++ )
  {
    FLA_Copy_external( C_save, C );

    *dtime = FLA_Clock();

    switch( variant )
    {

    case 0:
      REF_Lyap_h( isgn, A, C, scale );
      break;

    case 1:
    {
      switch( type )
      {
        case FLA_ALG_UNBLOCKED:
          FLA_Lyap_h_unb_var1( isgn, A, C );
          break;
        case FLA_ALG_UNB_OPT:
          FLA_Lyap_h_opt_var1( isgn, A, C );
          break;
        case FLA_ALG_BLOCKED:
          FLA_Lyap_h_blk_var1( isgn, A, C, scale, cntl_lyap_blk );
          break;
      }
      break;
    }

    case 2:
    {
      switch( type )
      {
        case FLA_ALG_UNBLOCKED:
          FLA_Lyap_h_unb_var2( isgn, A, C );
          break;
        case FLA_ALG_UNB_OPT:
          FLA_Lyap_h_opt_var2( isgn, A, C );
          break;
        case FLA_ALG_BLOCKED:
          FLA_Lyap_h_blk_var2( isgn, A, C, scale, cntl_lyap_blk );
          break;
      }
      break;
    }

    case 3:
    {
      switch( type )
      {
        case FLA_ALG_UNBLOCKED:
          FLA_Lyap_h_unb_var3( isgn, A, C );
          break;
        case FLA_ALG_UNB_OPT:
          FLA_Lyap_h_opt_var3( isgn, A, C );
          break;
        case FLA_ALG_BLOCKED:
          FLA_Lyap_h_blk_var3( isgn, A, C, scale, cntl_lyap_blk );
          break;
      }
      break;
    }

    case 4:
    {
      switch( type )
      {
        case FLA_ALG_UNBLOCKED:
          FLA_Lyap_h_unb_var4( isgn, A, C );
          break;
        case FLA_ALG_UNB_OPT:
          FLA_Lyap_h_opt_var4( isgn, A, C );
          break;
        case FLA_ALG_BLOCKED:
          FLA_Lyap_h_blk_var4( isgn, A, C, scale, cntl_lyap_blk );
          break;
      }
      break;
    }

    }

    *dtime = FLA_Clock() - *dtime;
    dtime_old = min( *dtime, dtime_old );
  }


  FLA_Blocksize_free( bp );
  FLA_Cntl_obj_free( cntl_lyap_unb );
  FLA_Cntl_obj_free( cntl_lyap_opt );
  FLA_Cntl_obj_free( cntl_lyap_blk );


/*
  if ( variant == 0 )
  {
    FLA_Copy_external( C, C_ref );
    *diff = 0.0;
  }
  else
  {
    FLA_Hermitianize( FLA_UPPER_TRIANGULAR, C );
    *diff = FLA_Max_elemwise_diff( C, C_ref );
  }
*/
  {
    FLA_Obj X, W;

    FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &X );
    FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &W );

    FLA_Copy( C, X );
    FLA_Hermitianize( FLA_UPPER_TRIANGULAR, X );

    FLA_Gemm( FLA_CONJ_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A, X, FLA_ZERO, W );
    FLA_Gemm( FLA_NO_TRANSPOSE,   FLA_NO_TRANSPOSE, FLA_ONE, X, A, FLA_ONE,  W );
    FLA_Scal( isgn, W );
/*
    if ( variant == 3 && type == FLA_ALG_UNBLOCKED )
    {
      FLA_Obj_show( "W", W, "%10.3e + %10.3e ", "" );
      FLA_Obj_show( "C_save", C_save, "%10.3e + %10.3e ", "" );
    }
*/
    FLA_Axpy( FLA_MINUS_ONE, C_save, W );
    FLA_Norm1( W, norm );
    FLA_Obj_extract_real_scalar( norm, diff );

    FLA_Obj_free( &X );
    FLA_Obj_free( &W );
  }

  *gflops = ( 2.0 / 3.0 ) * ( m * m * m ) / 
            dtime_old / 1e9;

  if ( FLA_Obj_is_complex( C ) )
    *gflops *= 4.0;

  *dtime = dtime_old;

  FLA_Copy_external( C_save, C );

  FLA_Obj_free( &C_save );
  FLA_Obj_free( &norm );
}
void time_Hevd_lv_components(
               int variant, int type, int n_repeats, int m, int n_iter_max, int k_accum, int b_alg,
               FLA_Obj A, FLA_Obj l,
               double* dtime, double* diff1, double* diff2, double* gflops,
               double* dtime_tred, double* gflops_tred,
               double* dtime_tevd, double* gflops_tevd,
               double* dtime_appq, double* gflops_appq, int* k_perf )
{
  int     i;
  double  k;
  double  dtime_save      = 1.0e9;
  double  dtime_tred_save = 1.0e9;
  double  dtime_tevd_save = 1.0e9;
  double  dtime_appq_save = 1.0e9;
  double  flops_tred;
  double  flops_tevd;
  double  flops_appq;
  double  mult_tred;
  double  mult_tevd;
  double  mult_appq;

  FLA_Obj A_save, Z;

  if (
       ( variant == -3 ) ||
       ( variant == -4 ) ||
       ( variant == -5 ) ||
       //( variant == 0 ) ||
       //( variant == -1 ) ||
       //( variant == -2 ) ||
       //( variant == 1 ) ||
       //( variant == 2 ) ||
       //( variant == 3 ) ||
       //( variant == 4 ) ||
       FALSE
     )
  {
    *gflops      = 0.0;
    *dtime       = 0.0;
    *diff1       = 0.0;
    *diff2       = 0.0;
    *dtime_tred  = 0.0;
    *dtime_tevd  = 0.0;
    *dtime_appq  = 0.0;
    *gflops_tred = 0.0;
    *gflops_tevd = 0.0;
    *gflops_appq = 0.0;
    *k_perf      = 0;
    return;
  }

  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_save );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &Z );

  FLA_Copy_external( A, A_save );

  for ( i = 0 ; i < n_repeats; i++ ){

    FLA_Copy_external( A_save, A );

    *dtime = FLA_Clock();

    switch( variant ){

    case -3:
    {
      *k_perf = 0;
      REF_Hevd_lv( A, l,
                   dtime_tred, dtime_tevd, dtime_appq );
      break;
    }

    case -4:
    {
      *k_perf = 0;
      REF_Hevdd_lv( A, l,
                    dtime_tred, dtime_tevd, dtime_appq );
      break;
    }

    case -5:
    {
      *k_perf = 0;
      REF_Hevdr_lv( A, l, Z,
                    dtime_tred, dtime_tevd, dtime_appq );
      break;
    }

    case 0:
    {
      *k_perf = 0;
      REF_Hevd_lv_components( A, l,
                              dtime_tred, dtime_tevd, dtime_appq );
      break;
    }

    case -1:
    {
      *k_perf = 0;
      REF_Hevdd_lv_components( A, l,
                               dtime_tred, dtime_tevd, dtime_appq );
      break;
    }

    case -2:
    {
      *k_perf = 0;
      REF_Hevdr_lv_components( A, l, Z,
                               dtime_tred, dtime_tevd, dtime_appq );
      break;
    }

    // Time variant 1
    case 1:
    {
      *k_perf = FLA_Hevd_lv_var1_components( n_iter_max, A, l, k_accum, b_alg,
                                             dtime_tred, dtime_tevd, dtime_appq );
      break;
    }

    // Time variant 2
    case 2:
    {
      *k_perf = FLA_Hevd_lv_var2_components( n_iter_max, A, l, k_accum, b_alg,
                                             dtime_tred, dtime_tevd, dtime_appq );
      break;
    }

    }

    *dtime = FLA_Clock() - *dtime;
    if ( *dtime < dtime_save )
    {
      dtime_save      = *dtime;
      dtime_tred_save = *dtime_tred;
      dtime_tevd_save = *dtime_tevd;
      dtime_appq_save = *dtime_appq;
    }
  }

  *dtime      = dtime_save;
  *dtime_tred = dtime_tred_save;
  *dtime_tevd = dtime_tevd_save;
  *dtime_appq = dtime_appq_save;

//if ( variant == -3 || variant == 0 )
//printf( "\ndtime is %9.3e\n", *dtime );

  {
    FLA_Obj V, A_rev_evd, norm, eye;

    if ( variant == -2 || variant == -5 ) FLA_Copy( Z, A );

    FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &V ); 
    FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_rev_evd ); 
    FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &eye ); 
    FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );

    FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, l, A );

    FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
              FLA_ONE, A, V, FLA_ZERO, A_rev_evd );
    FLA_Triangularize( FLA_LOWER_TRIANGULAR, FLA_NONUNIT_DIAG, A_rev_evd );

//FLA_Obj_show( "A_rev_evd", A_rev_evd, "%9.2e + %9.2e ", "" );
 
    FLA_Axpy( FLA_MINUS_ONE, A_save, A_rev_evd );
    FLA_Norm_frob( A_rev_evd, norm );
    FLA_Obj_extract_real_scalar( norm, diff1 );

    FLA_Set_to_identity( eye );
	FLA_Copy( V, A_rev_evd );
    FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
              FLA_ONE, V, A_rev_evd, FLA_MINUS_ONE, eye );
    FLA_Norm_frob( eye, norm );
    FLA_Obj_extract_real_scalar( norm, diff2 );

    FLA_Obj_free( &V );
    FLA_Obj_free( &A_rev_evd );
    FLA_Obj_free( &eye );
    FLA_Obj_free( &norm );
  }

  k = 2.00;

  flops_tred = ( ( 4.0 / 3.0 )   * m * m * m );
  flops_tevd = (   4.5           * k * m * m     +
                   3.0           * k * m * m * m );

  if ( variant == -1 || variant == -2 ||
       variant == -4 || variant == -5 )
    flops_appq = ( 2.0           * m * m * m );
  else
    flops_appq = ( 4.0 / 3.0     * m * m * m );

/*
  if ( FLA_Obj_is_complex( A ) )
  {
    *gflops      = ( 4.0 * flops_tred + 
                     2.0 * flops_tevd + 
                     4.0 * flops_appq ) / *dtime      / 1e9;

    *gflops_tred = ( 4.0 * flops_tred ) / *dtime_tred / 1e9;
    *gflops_tevd = ( 2.0 * flops_tevd ) / *dtime_tevd / 1e9;
    *gflops_appq = ( 4.0 * flops_appq ) / *dtime_appq / 1e9;
  }
  else
  {
    *gflops      = ( 1.0 * flops_tred + 
                     1.0 * flops_tevd + 
                     1.0 * flops_appq ) / *dtime      / 1e9;

    *gflops_tred = ( 1.0 * flops_tred ) / *dtime_tred / 1e9;
    *gflops_tevd = ( 1.0 * flops_tevd ) / *dtime_tevd / 1e9;
    *gflops_appq = ( 1.0 * flops_appq ) / *dtime_appq / 1e9;
  }
*/

  if ( FLA_Obj_is_complex( A ) )
  {
    mult_tred = 4.0;
    mult_tevd = 2.0;
    mult_appq = 4.0;
  }
  else
  {
    mult_tred = 1.0;
    mult_tevd = 1.0;
    mult_appq = 1.0;
  }

  *gflops = ( mult_tred * flops_tred + 
              mult_tevd * flops_tevd + 
              mult_appq * flops_appq ) / *dtime / 1e9;

  *gflops_tred = ( mult_tred * flops_tred ) / *dtime_tred / 1e9;
  *gflops_tevd = ( mult_tevd * flops_tevd ) / *dtime_tevd / 1e9;
  *gflops_appq = ( mult_appq * flops_appq ) / *dtime_appq / 1e9;

  FLA_Copy_external( A_save, A );

  FLA_Obj_free( &A_save );
  FLA_Obj_free( &Z );
}
Example #16
0
void time_Bidiag_UT(
                 int param_combo, int type, int nrepeats, int m, int n,
                 FLA_Obj A, FLA_Obj tu, FLA_Obj tv, FLA_Obj TU, FLA_Obj TV,
                 double *dtime, double *diff, double *gflops )
{
  int
    irep;

  double
    dtime_old = 1.0e9;

  FLA_Obj
    A_save, norm;


  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_save );

  FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );

  FLA_Copy_external( A, A_save );

  for ( irep = 0 ; irep < nrepeats; irep++ ){

    FLA_Copy_external( A_save, A );

    *dtime = FLA_Clock();

    switch( param_combo ){

    case 0:
    {
      switch( type )
      {
      case FLA_ALG_REFERENCE:
        REF_Bidiag_UT( A, tu, tv );
        break;
      case FLA_ALG_FRONT:
        FLA_Bidiag_UT( A, TU, TV );
        break;
      }

      break;
    }

    }

    *dtime = FLA_Clock() - *dtime;
    dtime_old = min( *dtime, dtime_old );

  }

  {
    FLA_Obj AL, AR;
    FLA_Obj ATL, ATR,
            ABL, ABR;
    FLA_Obj QU;
    FLA_Obj QV, QVL, QVR;
    FLA_Obj E, EL, ER;
    FLA_Obj F;
    FLA_Obj WU, WV, eye;
    FLA_Obj tvT,
            tvB;
    dim_t   m_A, n_A, m_TU;


//FLA_Obj_show( "A_save", A_save, "%10.3e", "" );

    m_A = FLA_Obj_length( A );
    n_A = FLA_Obj_width( A );
    m_TU = FLA_Obj_length( TU );

    FLA_Obj_create( FLA_Obj_datatype( A ), m_A,  m_A, 0, 0, &QU );
    FLA_Obj_create( FLA_Obj_datatype( A ), n_A,  n_A, 0, 0, &QV );

    FLA_Obj_create( FLA_Obj_datatype( A ), m_TU,  m_A, 0, 0, &WU );
    FLA_Obj_create( FLA_Obj_datatype( A ), m_TU,  n_A, 0, 0, &WV );

    FLA_Set_to_identity( QU );
    FLA_Set_to_identity( QV );

    FLA_Part_1x2( QV,   &QVL, &QVR,   1, FLA_LEFT );
    FLA_Part_1x2( A,    &AL,  &AR,    1, FLA_LEFT );
    FLA_Part_2x2( A,    &ATL, &ATR,
                        &ABL, &ABR,   1, 1, FLA_BL );
    FLA_Part_2x1( tv,   &tvT,
                        &tvB,   1, FLA_BOTTOM );

    if ( type == FLA_ALG_REFERENCE )
    {
      if ( FLA_Obj_is_real( A ) )
        FLA_Apply_Q_blk_external( FLA_LEFT, FLA_TRANSPOSE, FLA_COLUMNWISE, A, tu, QU );
      else
        FLA_Apply_Q_blk_external( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_COLUMNWISE, A, tu, QU );
      //FLA_Apply_Q_blk_external( FLA_RIGHT, FLA_NO_TRANSPOSE, FLA_ROWWISE, AR, tv, QVR );
      //
      // Need to apply backwards transformation, since vectors are stored columnwise.
      // QL? RQ?
      //
      //FLA_Apply_Q_blk_external( FLA_RIGHT, FLA_NO_TRANSPOSE, FLA_ROWWISE, ATR, tvT, QVR );
      //FLA_Apply_Q_blk_external( FLA_RIGHT, FLA_CONJ_TRANSPOSE, FLA_ROWWISE, ATR, tvT, QVR );
      //FLA_Apply_Q_blk_external( FLA_RIGHT, FLA_CONJ_TRANSPOSE, FLA_ROWWISE, AR, tvT, QVR );
    }
    else
    {
      FLA_Apply_Q_UT( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_FORWARD, FLA_COLUMNWISE, A, TU, WU, QU );
      FLA_Apply_Q_UT( FLA_RIGHT, FLA_NO_TRANSPOSE, FLA_FORWARD, FLA_ROWWISE, AR, TV, WV, QVR );
    }

/*
    FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &eye );     
    FLA_Set_to_identity( eye );

    //FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
    //          FLA_ONE, QV, QV, FLA_MINUS_ONE, eye );
    FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
              FLA_ONE, QU, QU, FLA_MINUS_ONE, eye );

FLA_Obj_show( "eye", eye, "%10.3e", "" );
    FLA_Norm_frob( eye, norm );
    FLA_Obj_extract_real_scalar( norm, diff );
    FLA_Obj_free( &eye );
*/

    FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &E );
    FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &F );

    FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
              FLA_ONE, A_save, QV, FLA_ZERO, E );
    FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
              FLA_ONE, QU, E, FLA_ZERO, F );

//FLA_Obj_show( "A_save", A_save, "%10.3e", "" );

    FLA_Copy( A, E );
    FLA_Triangularize( FLA_UPPER_TRIANGULAR, FLA_NONUNIT_DIAG, E );
    FLA_Part_1x2( E,    &EL, &ER,      1, FLA_LEFT );
    FLA_Triangularize( FLA_LOWER_TRIANGULAR, FLA_NONUNIT_DIAG, ER );

//FLA_Obj_show( "B", E, "%10.3e", "" );
//FLA_Obj_show( "Q'AV", F, "%10.3e", "" );
//FLA_Obj_show( "B", E, "%10.3e + %10.3e ", "" );
//FLA_Obj_show( "Q'AV", F, "%10.3e + %10.3e ", "" );

    *diff = FLA_Max_elemwise_diff( E, F );
    FLA_Obj_free( &E );
    FLA_Obj_free( &F );

    FLA_Obj_free( &QU );
    FLA_Obj_free( &QV );
    FLA_Obj_free( &WU );
    FLA_Obj_free( &WV );
  }

  *gflops = 4.0 * n * n * ( m - n / 3.0 ) /
            dtime_old / 1e9;
  if ( FLA_Obj_is_complex( A ) )
    *gflops *= 4.0;

  *dtime = dtime_old;

  FLA_Copy_external( A_save, A );

  FLA_Obj_free( &A_save );
  FLA_Obj_free( &norm );
}
// ============================================================================
void compute_case2b( int size_a, int size_b, int size_c, int size_d,
         int size_i, int size_j, FLA_Obj cb_A, FLA_Obj cb_B, FLA_Obj cb_C, 
         int print_data ) {
  FLA_Obj  slice_A, slice_B, slice_C;
  int      datatype, size_ab, size_abc, size_ia, size_iaj, size_jc, size_jci,
           iter_a, iter_b, iter_c, iter_d, iter_i, iter_j, 
           ii, jj, ldim_slice_B;
  size_t   idx_A, idx_B, idx_C;
  double   * buff_cb_A, * buff_cb_B, * buff_cb_C, * buff_slice_B;

  // Some initializations.
  datatype  = FLA_Obj_datatype( cb_A );
  buff_cb_A = ( double * ) FLA_Obj_buffer_at_view( cb_A );
  buff_cb_B = ( double * ) FLA_Obj_buffer_at_view( cb_B );
  buff_cb_C = ( double * ) FLA_Obj_buffer_at_view( cb_C );
  
  size_ab  = size_a * size_b;
  size_abc = size_a * size_b * size_c;

  size_ia  = size_i * size_a;
  size_iaj = size_i * size_a * size_j;

  size_jc  = size_j * size_c;
  size_jci = size_j * size_c * size_i;

  // Show data.
  if( print_data == 1 ) {
    FLA_Obj_show( " cb_A_i = [ ", cb_A, "%le", " ];" );
    FLA_Obj_show( " cb_B_i = [ ", cb_B, "%le", " ];" );
    FLA_Obj_show( " cb_C_i = [ ", cb_C, "%le", " ];" );
  }

  // Prepare temporal slices without buffer.
  FLA_Obj_create_without_buffer( datatype, size_i, size_a, & slice_A );
  FLA_Obj_create( datatype, size_i, size_d, 0, 0, & slice_B );
  FLA_Obj_create_without_buffer( datatype, size_a, size_d, & slice_C );

  // Perform computation.
  for( iter_b = 0; iter_b < size_b; iter_b++ ) {
   
    for( iter_c = 0; iter_c < size_c; iter_c++ ) {

      iter_a = 0;
      iter_d = 0;
      iter_i = 0;

      idx_C = ( ( size_t ) iter_a ) +
              ( ( size_t ) iter_b * size_a ) +
              ( ( size_t ) iter_c * size_ab ) +
              ( ( size_t ) iter_d * size_abc ),
      FLA_Obj_attach_buffer( & buff_cb_C[ idx_C ], 1, size_abc, & slice_C );
      MyFLA_Obj_set_to_zero( slice_C );
 
      for( iter_j = 0; iter_j < size_j; iter_j++ ) {

        // Define Ai.
        idx_A = ( ( size_t ) iter_i ) +
                ( ( size_t ) iter_a * size_i ) +
                ( ( size_t ) iter_j * size_ia ) +
                ( ( size_t ) iter_b * size_iaj );
        FLA_Obj_attach_buffer( & buff_cb_A[ idx_A ], 1, size_i, & slice_A );

        // Define Bi.
        buff_slice_B = ( double * ) FLA_Obj_buffer_at_view( slice_B );
        ldim_slice_B = FLA_Obj_col_stride( slice_B );
        for( jj = 0; jj < size_d; jj++ ) {
          for( ii = 0; ii < size_i; ii++ ) {
            idx_B = ( ( size_t ) iter_j ) +
                    ( ( size_t ) iter_c * size_j ) + 
                    ( ( size_t ) ii * size_jc ) +
                    ( ( size_t ) jj * size_jci );
            buff_slice_B[ ii + jj * ldim_slice_B ] = buff_cb_B[ idx_B ];
          }
        }

        // Compute Ai' * Bi.
        FLA_Gemm( FLA_TRANSPOSE, FLA_NO_TRANSPOSE, 
                  FLA_ONE, slice_A, slice_B, FLA_ONE, slice_C );
      }
    }
  }

  // Show data.
  if( print_data == 1 ) {
    FLA_Obj_show( " cb_A_f = [ ", cb_A, "%le", " ];" );
    FLA_Obj_show( " cb_B_f = [ ", cb_B, "%le", " ];" );
    FLA_Obj_show( " cb_C_f = [ ", cb_C, "%le", " ];" );
  }

  // Remove temporal slices.
  FLA_Obj_free_without_buffer( & slice_A );
  FLA_Obj_free( & slice_B );
  FLA_Obj_free_without_buffer( & slice_C );
}
Example #18
0
int Trsm_blk_var1( FLA_Obj L, FLA_Obj B, int nb_alg )
{
  FLA_Obj LTL,   LTR,      L00, L01, L02, 
          LBL,   LBR,      L10, L11, L12,
                           L20, L21, L22;

  FLA_Obj BT,              B0,
          BB,              B1,
                           B2;

  int b;

  FLA_Part_2x2( L,    &LTL, &LTR,
                      &LBL, &LBR,     0, 0, FLA_TL );

  FLA_Part_2x1( B,    &BT, 
                      &BB,            0, FLA_TOP );

  while ( FLA_Obj_length( LTL ) < FLA_Obj_length( L ) ){

    b = min( FLA_Obj_length( LBR ), nb_alg );

    FLA_Repart_2x2_to_3x3( LTL, /**/ LTR,       &L00, /**/ &L01, &L02,
                        /* ************* */   /* ******************** */
                                                &L10, /**/ &L11, &L12,
                           LBL, /**/ LBR,       &L20, /**/ &L21, &L22,
                           b, b, FLA_BR );

    FLA_Repart_2x1_to_3x1( BT,                &B0, 
                        /* ** */            /* ** */
                                              &B1, 
                           BB,                &B2,        b, FLA_BOTTOM );

    /*------------------------------------------------------------*/

    /* B1 = B1 - L10 * B0 */
    FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, 
	      FLA_MINUS_ONE, L10, B0, FLA_ONE, B1 );

    /* B1 = inv( L11 ) B1 */
    Trsm_unb_var1( L11, B1 );
    //Alternatively, try
    //    FLA_Trsm( FLA_LEFT, FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG,
    //	      FLA_ONE, L11, B1 );

    /*------------------------------------------------------------*/

    FLA_Cont_with_3x3_to_2x2( &LTL, /**/ &LTR,       L00, L01, /**/ L02,
                                                     L10, L11, /**/ L12,
                            /* ************** */  /* ****************** */
                              &LBL, /**/ &LBR,       L20, L21, /**/ L22,
                              FLA_TL );

    FLA_Cont_with_3x1_to_2x1( &BT,                B0, 
                                                  B1, 
                            /* ** */           /* ** */
                              &BB,                B2,     FLA_TOP );

  }

  return FLA_SUCCESS;
}
FLA_Error FLA_Svd_ext_u_unb_var1( FLA_Svd_type jobu, FLA_Svd_type jobv,
                                  dim_t n_iter_max,
                                  FLA_Obj A, FLA_Obj s, FLA_Obj U, FLA_Obj V,
                                  dim_t k_accum,
                                  dim_t b_alg )
{
    FLA_Error    r_val = FLA_SUCCESS;
    FLA_Datatype dt;
    FLA_Datatype dt_real;
    FLA_Datatype dt_comp;
    FLA_Obj      scale, T, S, rL, rR, d, e, G, H, C; // C is dummy.
    dim_t        m_A, n_A, min_m_n;
    dim_t        n_GH;
    double       crossover_ratio = 17.0 / 9.0;
    FLA_Bool     u_is_formed = FALSE, 
                 v_is_formed = FALSE;
    int          apply_scale;

    n_GH    = k_accum;

    m_A     = FLA_Obj_length( A );
    n_A     = FLA_Obj_width( A );
    min_m_n = min( m_A, n_A );
    dt      = FLA_Obj_datatype( A );
    dt_real = FLA_Obj_datatype_proj_to_real( A );
    dt_comp = FLA_Obj_datatype_proj_to_complex( A );

    // Create matrices to hold block Householder transformations.
    FLA_Bidiag_UT_create_T( A, &T, &S );

    // Create vectors to hold the realifying scalars.
    if ( FLA_Obj_is_complex( A ) )
    {
        FLA_Obj_create( dt,      min_m_n,      1, 0, 0, &rL );
        FLA_Obj_create( dt,      min_m_n,      1, 0, 0, &rR );
    }

    // Create vectors to hold the diagonal and sub-diagonal.
    FLA_Obj_create( dt_real, min_m_n,      1, 0, 0, &d );
    FLA_Obj_create( dt_real, min_m_n-1,    1, 0, 0, &e );

    // Create matrices to hold the left and right Givens scalars.
    FLA_Obj_create( dt_comp, min_m_n-1, n_GH, 0, 0, &G );
    FLA_Obj_create( dt_comp, min_m_n-1, n_GH, 0, 0, &H );

    // Create a real scaling factor.
    FLA_Obj_create( dt_real, 1, 1, 0, 0, &scale );

    // Scale matrix A if necessary. 
    FLA_Max_abs_value( A, scale );
    apply_scale =
      ( FLA_Obj_gt( scale, FLA_OVERFLOW_SQUARE_THRES  ) == TRUE ) -     
      ( FLA_Obj_lt( scale, FLA_UNDERFLOW_SQUARE_THRES ) == TRUE ); 
    
    if ( apply_scale )
      FLA_Scal( apply_scale > 0 ? FLA_SAFE_MIN : FLA_SAFE_INV_MIN, A );   

    if ( m_A < crossover_ratio * n_A )
    {
        // Reduce the matrix to bidiagonal form.
        // Apply scalars to rotate elements on the superdiagonal to the real domain.
        // Extract the diagonal and superdiagonal from A.
        FLA_Bidiag_UT( A, T, S );
        if ( FLA_Obj_is_complex( A ) )
            FLA_Bidiag_UT_realify( A, rL, rR );
        FLA_Bidiag_UT_extract_real_diagonals( A, d, e );

        // Form U and V.
        if ( u_is_formed == FALSE )
        {
            switch ( jobu )
            {
            case FLA_SVD_VECTORS_MIN_OVERWRITE:
                if ( jobv != FLA_SVD_VECTORS_NONE )
                    FLA_Bidiag_UT_form_V_ext( FLA_UPPER_TRIANGULAR, A, S, FLA_NO_TRANSPOSE, V );
                v_is_formed = TRUE; // For this case, V should be formed here.
                U = A;
            case FLA_SVD_VECTORS_ALL:
            case FLA_SVD_VECTORS_MIN_COPY:
                FLA_Bidiag_UT_form_U_ext( FLA_UPPER_TRIANGULAR, A, T, FLA_NO_TRANSPOSE, U );
                u_is_formed = TRUE;
                break;
            case FLA_SVD_VECTORS_NONE:
                // Do nothing
                break;
            }
        }
        if ( v_is_formed == FALSE )
        {
            if ( jobv == FLA_SVD_VECTORS_MIN_OVERWRITE )
            {
                FLA_Bidiag_UT_form_V_ext( FLA_UPPER_TRIANGULAR, A, S, FLA_CONJ_TRANSPOSE, A );
                v_is_formed = TRUE; /* and */
                V = A; // This V is actually V^H.

                // V^H -> V
                FLA_Obj_flip_base( &V );
                FLA_Obj_flip_view( &V );
                if ( FLA_Obj_is_complex( A ) )
                    FLA_Conjugate( V );
            }
            else if ( jobv != FLA_SVD_VECTORS_NONE )
            {
                FLA_Bidiag_UT_form_V_ext( FLA_UPPER_TRIANGULAR, A, S, FLA_NO_TRANSPOSE, V );
                v_is_formed = TRUE;
            }
        }

        // For complex matrices, apply realification transformation.
        if ( FLA_Obj_is_complex( A ) && jobu != FLA_SVD_VECTORS_NONE )
        {
            FLA_Obj UL, UR;
            FLA_Part_1x2( U,   &UL, &UR,   min_m_n, FLA_LEFT );
            FLA_Apply_diag_matrix( FLA_RIGHT, FLA_CONJUGATE,    rL, UL );
        }
        if ( FLA_Obj_is_complex( A ) && jobv != FLA_SVD_VECTORS_NONE )
        {
            FLA_Obj VL, VR;
            FLA_Part_1x2( V,   &VL, &VR,   min_m_n, FLA_LEFT );
            FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, rR, VL );
        }

        // Perform a singular value decomposition on the upper bidiagonal matrix.
        r_val = FLA_Bsvd_ext_opt_var1( n_iter_max,
                                       d, e, G, H,
                                       jobu, U, jobv, V,
                                       FALSE, C, // C is not referenced
                                       b_alg );
    }
    else // if ( crossover_ratio * n_A <= m_A )
    {
        FLA_Obj TQ, R;
        FLA_Obj AT,
                AB;

        // Perform a QR factorization on A.
        FLA_QR_UT_create_T( A, &TQ );
        FLA_QR_UT( A, TQ );

        // Set the lower triangle of R to zero and then copy the upper
        // triangle of A to R.
        FLA_Part_2x1( A,   &AT,
                           &AB,   n_A, FLA_TOP );
        FLA_Obj_create( dt, n_A, n_A, 0, 0, &R );
        FLA_Setr( FLA_LOWER_TRIANGULAR, FLA_ZERO, R );
        FLA_Copyr( FLA_UPPER_TRIANGULAR, AT, R );

        // Form U; if necessary overwrite on A.
        if ( u_is_formed == FALSE )
        {
            switch ( jobu )
            {
            case FLA_SVD_VECTORS_MIN_OVERWRITE:
                U = A;
            case FLA_SVD_VECTORS_ALL:
            case FLA_SVD_VECTORS_MIN_COPY:
                FLA_QR_UT_form_Q( A, TQ, U );
                u_is_formed = TRUE;
                break;
            case FLA_SVD_VECTORS_NONE:
                // Do nothing
                break;
            }
        }
        FLA_Obj_free( &TQ );

        // Reduce the matrix to bidiagonal form.
        // Apply scalars to rotate elements on the superdiagonal to the real domain.
        // Extract the diagonal and superdiagonal from A.
        FLA_Bidiag_UT( R, T, S );
        if ( FLA_Obj_is_complex( R ) )
            FLA_Bidiag_UT_realify( R, rL, rR );
        FLA_Bidiag_UT_extract_real_diagonals( R, d, e );

        if ( v_is_formed == FALSE )
        {
            if ( jobv == FLA_SVD_VECTORS_MIN_OVERWRITE )
            {
                FLA_Bidiag_UT_form_V_ext( FLA_UPPER_TRIANGULAR, R, S, FLA_CONJ_TRANSPOSE, AT );
                v_is_formed = TRUE; /* and */
                V = AT; // This V is actually V^H.

                // V^H -> V
                FLA_Obj_flip_base( &V );
                FLA_Obj_flip_view( &V );
                if ( FLA_Obj_is_complex( A ) )
                    FLA_Conjugate( V );
            }
            else if ( jobv != FLA_SVD_VECTORS_NONE )
            {
                FLA_Bidiag_UT_form_V_ext( FLA_UPPER_TRIANGULAR, R, S, FLA_NO_TRANSPOSE, V );
                v_is_formed = TRUE;
            }
        }

        // Apply householder vectors U in R.
        FLA_Bidiag_UT_form_U_ext( FLA_UPPER_TRIANGULAR, R, T, FLA_NO_TRANSPOSE, R );

        // Apply the realifying scalars in rL and rR to U and V, respectively.
        if ( FLA_Obj_is_complex( A ) && jobu != FLA_SVD_VECTORS_NONE )
        {
            FLA_Obj RL, RR;
            FLA_Part_1x2( R,   &RL, &RR,   min_m_n, FLA_LEFT );
            FLA_Apply_diag_matrix( FLA_RIGHT, FLA_CONJUGATE,    rL, RL );
        }
        if ( FLA_Obj_is_complex( A ) && jobv != FLA_SVD_VECTORS_NONE )
        {
            FLA_Obj VL, VR;
            FLA_Part_1x2( V,   &VL, &VR,   min_m_n, FLA_LEFT );
            FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, rR, VL );
        }

        // Perform a singular value decomposition on the bidiagonal matrix.
        r_val = FLA_Bsvd_ext_opt_var1( n_iter_max,
                                       d, e, G, H,
                                       jobu, R, jobv, V,
                                       FALSE, C,
                                       b_alg );

        // Multiply R into U, storing the result in A and then copying back
        // to U.
        if ( jobu != FLA_SVD_VECTORS_NONE )
        {
            FLA_Obj UL, UR;
            FLA_Part_1x2( U,   &UL, &UR,   min_m_n, FLA_LEFT );

            if ( jobu == FLA_SVD_VECTORS_MIN_OVERWRITE || 
                 jobv == FLA_SVD_VECTORS_MIN_OVERWRITE )
            {
                FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, UL, &C );
                FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
                          FLA_ONE, UL, R, FLA_ZERO, C );
                FLA_Copy( C, UL );
                FLA_Obj_free( &C );
            }
            else
            {
                FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
                          FLA_ONE, UL, R, FLA_ZERO, A );
                FLA_Copy( A, UL );
            }
        }
        FLA_Obj_free( &R );
    }

    // Copy the converged eigenvalues to the output vector.
    FLA_Copy( d, s );

    // No sort is required as it is applied on FLA_Bsvd.

    if ( apply_scale ) 
      FLA_Scal( apply_scale < 0 ? FLA_SAFE_MIN : FLA_SAFE_INV_MIN, s ); 

    // When V is overwritten, flip it again.
    if ( jobv == FLA_SVD_VECTORS_MIN_OVERWRITE )
    {
        // Always apply conjugation first wrt dimensions used; then, flip base.
        if ( FLA_Obj_is_complex( V ) )
            FLA_Conjugate( V );
        FLA_Obj_flip_base( &V );
    }

    FLA_Obj_free( &scale );
    FLA_Obj_free( &T );
    FLA_Obj_free( &S );

    if ( FLA_Obj_is_complex( A ) )
    {
        FLA_Obj_free( &rL );
        FLA_Obj_free( &rR );
    }

    FLA_Obj_free( &d );
    FLA_Obj_free( &e );
    FLA_Obj_free( &G );
    FLA_Obj_free( &H );

    return r_val;
}