Example #1
0
FLA_Error REF_Gemm_nn( FLA_Obj A, FLA_Obj B, FLA_Obj C )
{
  FLA_Datatype datatype;
  int          m, k, n, ldim_A, ldim_B, ldim_C;

  datatype = FLA_Obj_datatype( A );
  ldim_A   = FLA_Obj_ldim( A );
  ldim_B   = FLA_Obj_ldim( B );
  ldim_C   = FLA_Obj_ldim( C );
  m        = FLA_Obj_length( A );
  k        = FLA_Obj_width( A );
  n        = FLA_Obj_width( B );
  
  switch( datatype ){
    case FLA_DOUBLE:
    {
      double *buff_A, *buff_B, *buff_C, d_one=1.0;

      buff_A = ( double * ) FLA_Obj_buffer_at_view( A );
      buff_B = ( double * ) FLA_Obj_buffer_at_view( B );
      buff_C = ( double * ) FLA_Obj_buffer_at_view( C );
    
      FLA_C2F( dgemm )( "N", "N", &m, &n, &k,
                        &d_one, buff_A, &ldim_A, buff_B, &ldim_B,
                        &d_one, buff_C, &ldim_C );
    } break;
  }
  
  return 0;
}
Example #2
0
FLA_Error REF_Syrk_ln( FLA_Obj A, FLA_Obj C )
{
  FLA_Datatype datatype;
  int          k, m, ldim_A, ldim_C;

  datatype = FLA_Obj_datatype( A );
  ldim_A   = FLA_Obj_ldim( A );
  ldim_C   = FLA_Obj_ldim( C );
  k        = FLA_Obj_width( A );
  m        = FLA_Obj_length( A );
  
  switch( datatype ){
    case FLA_DOUBLE:
    {
      double *buff_A, *buff_C, d_one=1.0;

      buff_A = ( double * ) FLA_Obj_buffer_at_view( A );
      buff_C = ( double * ) FLA_Obj_buffer_at_view( C );
    
      dsyrk_( "L", "N", &m, &k,
              &d_one, buff_A, &ldim_A, &d_one, buff_C, &ldim_C );
    } break;
  }
  
  return 0;
}
// ============================================================================
void compute_case4a( int size_a, int size_b, int size_c, int size_d,
                     int size_i, int size_j, FLA_Obj cb_A, FLA_Obj cb_B, FLA_Obj cb_C,
                     int print_data ) {
    int      size_ab, size_abc, size_ia, size_iaj, size_jc, size_jci,
             iter_a, iter_b, iter_c, iter_d, iter_i, iter_j;
    double   * buff_cb_A, * buff_cb_B, * buff_cb_C, ci, * ptr_ai, * ptr_bi;

    // Some initializations.
    buff_cb_A = ( double * ) FLA_Obj_buffer_at_view( cb_A );
    buff_cb_B = ( double * ) FLA_Obj_buffer_at_view( cb_B );
    buff_cb_C = ( double * ) FLA_Obj_buffer_at_view( cb_C );

    size_ab  = size_a * size_b;
    size_abc = size_a * size_b * size_c;

    size_ia  = size_i * size_a;
    size_iaj = size_i * size_a * size_j;

    size_jc  = size_j * size_c;
    size_jci = size_j * size_c * size_i;

    // Show data.
    if( print_data == 1 ) {
        FLA_Obj_show( " cb_A_i = [ ", cb_A, "%le", " ];" );
        FLA_Obj_show( " cb_B_i = [ ", cb_B, "%le", " ];" );
        FLA_Obj_show( " cb_C_i = [ ", cb_C, "%le", " ];" );
    }

    // Perform computation.
    for( iter_a = 0; iter_a < size_a; iter_a++ ) {
        for( iter_b = 0; iter_b < size_b; iter_b++ ) {
            for( iter_c = 0; iter_c < size_c; iter_c++ ) {
                for( iter_d = 0; iter_d < size_d; iter_d++ ) {
                    ci = 0.0;
                    for( iter_j = 0; iter_j < size_j; iter_j++ ) {
                        ptr_ai = & buff_cb_A[ 0 + iter_a * size_i +
                                              iter_j * size_ia + iter_b * size_iaj ];
                        ptr_bi = & buff_cb_B[ iter_j + iter_c * size_j + 0 * size_jc +
                                              iter_d * size_jci ];
                        for( iter_i = 0; iter_i < size_i; iter_i++ ) {
                            ci += ( * ptr_ai ) * ( * ptr_bi );
                            ptr_ai++;
                            ptr_bi += size_jc;
                        }
                    }
                    buff_cb_C[ iter_a + iter_b * size_a + iter_c * size_ab +
                               iter_d * size_abc ] = ci;
                }
            }
        }
    }

    // Show data.
    if( print_data == 1 ) {
        FLA_Obj_show( " cb_A_f = [ ", cb_A, "%le", " ];" );
        FLA_Obj_show( " cb_B_f = [ ", cb_B, "%le", " ];" );
        FLA_Obj_show( " cb_C_f = [ ", cb_C, "%le", " ];" );
    }
}
int FLAME_invert_ztau( FLA_Obj t )
{
    dim_t     m    = FLA_Obj_vector_dim( t );
    dim_t     inc  = FLA_Obj_vector_inc( t );
    dcomplex* buff = FLA_Obj_buffer_at_view( t );
    double    one  = 1.0;
    double    conjsign = one; // if conjugate -one;
    double    zero = 0.0;
    double    temp, s, xr_s, xi_s;
    dcomplex* chi;
    int       i;

    for ( i = 0; i < m; ++i )
    {
        chi  = buff + i*inc;
        s    = bl1_fmaxabs( chi->real, chi->imag );
        if ( s != zero )
        {
            xr_s = chi->real / s;
            xi_s = chi->imag / s;
            temp = xr_s * chi->real + xi_s * chi->imag;
            chi->real =            xr_s / temp;
            chi->imag = conjsign * xi_s / temp;
        }
    }
    return 0;
}
Example #5
0
FLA_Bool FLA_Obj_is_identical( FLA_Obj A, FLA_Obj B )
{
  FLA_Bool r_val = FALSE;

  // For LU_piv, if A and B are identical, we do not need copy.
  // Elemtype should be checked as they can have the same buffer pointer
  // but elemtype can be either scalar or matrix.
  if ( A.base != NULL && A.base != NULL )
    if ( ( A.base == B.base ) || 
         ( A.base->elemtype == B.base->elemtype  && 
           A.base->datatype == B.base->datatype ) )
      if ( FLA_Obj_buffer_at_view( A ) == FLA_Obj_buffer_at_view( B ) )
        if ( A.m == B.m && A.n == B.n )
          r_val = TRUE;
  
  return r_val;
}
Example #6
0
FLA_Bool FLA_Obj_is_overlapped( FLA_Obj A, FLA_Obj B )
{
  FLA_Bool r_val = FALSE;

  // For form_Q, if A and B are not overlapped, we do not use in-place forming Q.
  if ( A.base != NULL && A.base != NULL )
    if ( ( A.base == B.base ) || 
         ( A.base->elemtype == B.base->elemtype && 
           A.base->datatype == B.base->datatype ) )
      if ( FLA_Obj_buffer_at_view( A ) == FLA_Obj_buffer_at_view( B ) )
        if ( ( ( A.offm <= B.offm && B.offm < ( A.offm + A.m ) ) &&
               ( A.offn <= B.offn && B.offn < ( A.offn + A.n ) ) ) ||
             ( ( B.offm <= A.offm && A.offm < ( B.offm + B.m ) ) &&
               ( B.offn <= A.offn && A.offn < ( B.offn + B.n ) ) ) )
          r_val = TRUE;
  
  return r_val;
}
// ============================================================================
void compute_case1( int m, int n, int k, int l, 
         FLA_Obj cb_A, FLA_Obj cb_B, FLA_Obj C, int print_data ) {
  FLA_Obj  slice_A, slice_B;
  int      datatype, h;
  double   * buff_cb_A, * buff_cb_B;

  // Some initializations.
  datatype  = FLA_Obj_datatype( cb_A );
  buff_cb_A = ( double * ) FLA_Obj_buffer_at_view( cb_A );
  buff_cb_B = ( double * ) FLA_Obj_buffer_at_view( cb_B );

  // Prepare temporal slices.
  FLA_Obj_create_without_buffer( datatype, m, k, & slice_A );
  FLA_Obj_create_without_buffer( datatype, n, k, & slice_B );

  // Initialize matrix C for the result.
  MyFLA_Obj_set_to_zero( C );

  // Show data.
  if( print_data == 1 ) {
    FLA_Obj_show( " Ci = [ ", C, "%le", " ];" );
    FLA_Obj_show( " cb_A = [ ", cb_A, "%le", " ];" );
    FLA_Obj_show( " cb_B = [ ", cb_B, "%le", " ];" );
  }

  // Perform computation.
  for( h = 0; h < l; h++ ) {
    FLA_Obj_attach_buffer( buff_cb_A + m * k * h, 1, m, & slice_A );
    FLA_Obj_attach_buffer( buff_cb_B + n * k * h, 1, n, & slice_B );
    FLA_Gemm( FLA_NO_TRANSPOSE, FLA_TRANSPOSE, 
              FLA_ONE, slice_A, slice_B, FLA_ONE, C );
  }
  
  // Remove temporal slices.
  FLA_Obj_free_without_buffer( & slice_A );
  FLA_Obj_free_without_buffer( & slice_B );

  // Show data.
  if( print_data == 1 ) {
    FLA_Obj_show( " Cf = [ ", C, "%le", " ];" );
  }
}
// ============================================================================
void compute_case5( int m, int n, int k, int l, 
         FLA_Obj cb_A, FLA_Obj cb_B, FLA_Obj C, int print_data ) {
  int      datatype, i, j, kl;
  double   * buff_cb_A, * buff_cb_B, d_one = 1.0;

  // Some initializations.
  datatype  = FLA_Obj_datatype( cb_A );
  buff_cb_A = ( double * ) FLA_Obj_buffer_at_view( cb_A );
  buff_cb_B = ( double * ) FLA_Obj_buffer_at_view( cb_B );

  // Initialize matrix C for the result.
  MyFLA_Obj_set_to_zero( C );

  // Show data.
  if( print_data == 1 ) {
    FLA_Obj_show( " Ci = [ ", C, "%le", " ];" );
    FLA_Obj_show( " cb_A = [ ", cb_A, "%le", " ];" );
    FLA_Obj_show( " cb_B = [ ", cb_B, "%le", " ];" );
  }

  // Perform computation.
  kl = k * l;
  for( i = 0; i < k; i++ ) {
    for( j = 0; j < l; j++ ) {
      dger_( & m, & n,
             & d_one, ( double * ) buff_cb_A + l * m * i + j,
                      & l,
                      ( double * ) buff_cb_B + i + k * j,
                      & kl,
             ( double * ) FLA_Obj_buffer_at_view( C ), 
             & m );
    }
  }
  
  // Show data.
  if( print_data == 1 ) {
    FLA_Obj_show( " Cf = [ ", C, "%le", " ];" );
  }
}
void FLA_CAQR_UT_inc_init_structure( dim_t p, dim_t nb_part, FLA_Obj R )
{
	dim_t    m, n;
	dim_t    rs, cs;
	dim_t    i, j, ip;
	FLA_Obj* buff_R;

	m      = FLA_Obj_length( R );
	n      = FLA_Obj_width( R );
	rs     = FLA_Obj_row_stride( R );
	cs     = FLA_Obj_col_stride( R );
	buff_R = FLA_Obj_buffer_at_view( R );

	// Fill in R by row panels.
	for ( ip = 0; ip < p; ++ip )
	{
		FLA_Obj* buff_R1 = buff_R + (ip*nb_part)*rs;

		int  m_behind   = ip*nb_part;
		int  m_ahead    = m - m_behind;

		int  m_cur      = min( nb_part, m_ahead );
		int  n_cur      = n;

		// Iterate across columns for the current panel.
		for ( j = 0; j < n_cur; ++j )
		{
			FLA_Obj* rho = buff_R1 + j*cs;

			// Mark the above-diagonal blocks as full.
			for ( i = 0; i < j; ++i )
			{
				rho->base->uplo = FLA_FULL_MATRIX;
				rho += rs;
			}

			// Mark the diagonal block as triangular.
			rho->base->uplo = FLA_UPPER_TRIANGULAR;
			rho += rs;
			
			// Mark the below-diagonal blocks as zero.
			for ( i = j + 1; i < m_cur; ++i )
			{
				rho->base->uplo = FLA_ZERO_MATRIX;
				rho += rs;
			}
		}
	}
}
Example #10
0
void FLA_Gemm_pack_andor_scale_A( FLA_Trans transA, FLA_Obj alpha,
                                  FLA_Obj A, FLA_Obj* packed_A )
{
  int m, n, ldim_A;
  double *buff_packed_A, *buff_packed_aligned_A, *buff_A;

  m      = FLA_Obj_length( A );
  n      = FLA_Obj_width ( A );
  ldim_A = FLA_Obj_ldim  ( A );
  buff_A = ( double* ) FLA_Obj_buffer_at_view( A );

  dgemm_itcopy( m, n, buff_A, ldim_A, FLA_Work_buffer_aligned_A );

  FLA_Obj_create_without_buffer( FLA_DOUBLE, m, n, packed_A );
  //  FLA_Obj_attach_buffer( buff_packed_A, n, packed_A );
}
Example #11
0
void FLA_Gemm_pack_andor_scale_B( FLA_Trans transB, FLA_Obj alpha,
                                  FLA_Obj B, FLA_Obj* packed_B )
{
  int m, n, ldim_B;
  double *buff_packed_B, *buff_packed_aligned_B, *buff_B;

  m      = FLA_Obj_length( B );
  n      = FLA_Obj_width ( B );
  ldim_B = FLA_Obj_ldim  ( B );
  buff_B = ( double* ) FLA_Obj_buffer_at_view( B );

  dgemm_oncopy( m, n, buff_B, ldim_B, FLA_Work_buffer_aligned_B );

  FLA_Obj_create_without_buffer( FLA_DOUBLE, m, n, packed_B );
  //  FLA_Obj_attach_buffer( buff_packed_B, m, packed_B );
}
// Transform tau.
int FLAME_invert_stau( FLA_Obj t )
{
    dim_t  m    = FLA_Obj_vector_dim( t );
    dim_t  inc  = FLA_Obj_vector_inc( t );
    float* buff = FLA_Obj_buffer_at_view( t );
    float  one  = 1.0F;
    float  zero = 0.0F;
    float* chi;
    int    i;

    for ( i = 0; i < m; ++i )
    {
        chi = buff + i*inc;
        if ( *chi != zero )
            *chi = ( one / *chi );
    }
    return 0;
}
Example #13
0
FLA_Error REF_Chol( int time_lapack, FLA_Obj A, int nb_alg )
{
  int n, ldim_A, info;
  
  double *buff_A, sqrt();


  n = FLA_Obj_length( A );
  ldim_A = FLA_Obj_col_stride( A );

  buff_A = (double *) FLA_Obj_buffer_at_view( A );

  if ( time_lapack ){

    //     dpotrfx_( "lower", &n, buff_A, &ldim_A, &info, &nb_alg );

  }
  else{
    int i, j, k;

    for ( j=0; j<n; j++ ){

      /* alpha11 = sqrt( alpha11 ) */
      AA( j, j ) = sqrt( AA( j, j ) );
    
      /* a21 = a21 / alpha11 */
      for ( i=j+1; i<n; i++ )
	AA( i,j ) = AA( i,j )  / AA( j,j );

      /* A22 = A22 - tril( a21 * a21') */
      for ( k=j+1; k<n; k++ )
	for ( i=k; i<n; i++ )
	  AA( i,k ) = AA( i,k ) - AA( i,j ) * AA( k,j );
    }
  }

  return FLA_SUCCESS;
}
Example #14
0
void FLA_Gemm_kernel( FLA_Obj alpha, FLA_Obj packed_A, 
                      FLA_Obj packed_B, FLA_Obj packed_C )
{
  int m, n, k, ldim_C;
  double alpha_value, 
    *buff_A, *buff_B, *buff_C,
    *buff_aligned_A,   *buff_aligned_B;

  m      = FLA_Obj_length( packed_C );
  n      = FLA_Obj_width ( packed_C );
  k      = FLA_Obj_length( packed_A ); 
  ldim_C = FLA_Obj_ldim  ( packed_C );

  //  buff_A = ( double * ) FLA_Obj_buffer_at_view( packed_A );
  //  buff_B = ( double * ) FLA_Obj_buffer_at_view( packed_B );
  buff_C = ( double* ) FLA_Obj_buffer_at_view( packed_C );

  alpha_value = FLA_DOUBLE_VALUE( alpha );

  dgemm_kernel( m, n, k, alpha_value, 
                FLA_Work_buffer_aligned_A, 
                FLA_Work_buffer_aligned_B, buff_C, ldim_C );
}
Example #15
0
int main( int argc, char *argv[] )
{
    int
    i, j,
    size,
    n_threads,
    n_repeats,
    n_trials,
    nb_alg,
    increment,
    begin;

    FLA_Datatype
    datatype = FLA_DOUBLE;

    FLA_Obj
    A;

    double
    b_norm_value = 0.0,
    dtime,
    *dtimes,
    *flops,
    *T;

    char
    output_file_m[100];

    FILE
    *fpp;

    fprintf( stdout, "%c Enter number of repeats: ", '%' );
    scanf( "%d", &n_repeats );
    fprintf( stdout, "%c %d\n", '%', n_repeats );

    fprintf( stdout, "%c Enter blocksize: ", '%' );
    scanf( "%d", &nb_alg );
    fprintf( stdout, "%c %d\n", '%', nb_alg );

    fprintf( stdout, "%c Enter problem size parameters: first, inc, num: ", '%' );
    scanf( "%d%d%d", &begin, &increment, &n_trials );
    fprintf( stdout, "%c %d %d %d\n", '%', begin, increment, n_trials );

    fprintf( stdout, "%c Enter number of threads: ", '%' );
    scanf( "%d", &n_threads );
    fprintf( stdout, "%c %d\n\n", '%', n_threads );

    sprintf( output_file_m, "%s/%s_output.m", OUTPUT_PATH, OUTPUT_FILE );
    fpp = fopen( output_file_m, "a" );

    fprintf( fpp, "%%\n" );
    fprintf( fpp, "%% | Matrix Size |    PLASMA   |\n" );
    fprintf( fpp, "%% |    n x n    |    GFlops   |\n" );
    fprintf( fpp, "%% -----------------------------\n" );

    FLA_Init();
    PLASMA_Init( n_threads );

    PLASMA_Disable( PLASMA_AUTOTUNING );
    PLASMA_Set( PLASMA_TILE_SIZE, nb_alg );
    PLASMA_Set( PLASMA_INNER_BLOCK_SIZE, nb_alg / 4 );

    dtimes = ( double * ) FLA_malloc( n_repeats * sizeof( double ) );
    flops  = ( double * ) FLA_malloc( n_trials  * sizeof( double ) );

    fprintf( fpp, "%s = [\n", OUTPUT_FILE );

    for ( i = 0; i < n_trials; i++ )
    {
        size = begin + i * increment;

        FLA_Obj_create( datatype, size, size, 0, 0, &A );

        for ( j = 0; j < n_repeats; j++ )
        {
            FLA_Random_matrix( A );

            PLASMA_Alloc_Workspace_dgeqrf( size, size, &T );

            dtime = FLA_Clock();

            PLASMA_dgeqrf( size, size, FLA_Obj_buffer_at_view( A ), size, T );

            dtime = FLA_Clock() - dtime;
            dtimes[j] = dtime;

            free( T );
        }

        dtime = dtimes[0];
        for ( j = 1; j < n_repeats; j++ )
            dtime = min( dtime, dtimes[j] );
        flops[i] = 4.0 / 3.0 * size * size * size / dtime / 1e9;

        fprintf( fpp, "   %d   %6.3f\n", size, flops[i] );

        printf( "Time: %e  |  GFlops: %6.3f\n",
                dtime, flops[i] );
        printf( "Matrix size: %d x %d  |  nb_alg: %d\n",
                size, size, nb_alg );
        printf( "Norm of difference: %le\n\n", b_norm_value );

        FLA_Obj_free( &A );
    }

    fprintf( fpp, "];\n" );

    fflush( fpp );
    fclose( fpp );

    FLA_free( dtimes );
    FLA_free( flops );

    PLASMA_Finalize();
    FLA_Finalize();

    return 0;
}
// ============================================================================
void compute_case2b( int size_a, int size_b, int size_c, int size_d,
         int size_i, int size_j, FLA_Obj cb_A, FLA_Obj cb_B, FLA_Obj cb_C, 
         int print_data ) {
  FLA_Obj  slice_A, slice_B, slice_C;
  int      datatype, size_ab, size_abc, size_ia, size_iaj, size_jc, size_jci,
           iter_a, iter_b, iter_c, iter_d, iter_i, iter_j, 
           ii, jj, ldim_slice_B;
  size_t   idx_A, idx_B, idx_C;
  double   * buff_cb_A, * buff_cb_B, * buff_cb_C, * buff_slice_B;

  // Some initializations.
  datatype  = FLA_Obj_datatype( cb_A );
  buff_cb_A = ( double * ) FLA_Obj_buffer_at_view( cb_A );
  buff_cb_B = ( double * ) FLA_Obj_buffer_at_view( cb_B );
  buff_cb_C = ( double * ) FLA_Obj_buffer_at_view( cb_C );
  
  size_ab  = size_a * size_b;
  size_abc = size_a * size_b * size_c;

  size_ia  = size_i * size_a;
  size_iaj = size_i * size_a * size_j;

  size_jc  = size_j * size_c;
  size_jci = size_j * size_c * size_i;

  // Show data.
  if( print_data == 1 ) {
    FLA_Obj_show( " cb_A_i = [ ", cb_A, "%le", " ];" );
    FLA_Obj_show( " cb_B_i = [ ", cb_B, "%le", " ];" );
    FLA_Obj_show( " cb_C_i = [ ", cb_C, "%le", " ];" );
  }

  // Prepare temporal slices without buffer.
  FLA_Obj_create_without_buffer( datatype, size_i, size_a, & slice_A );
  FLA_Obj_create( datatype, size_i, size_d, 0, 0, & slice_B );
  FLA_Obj_create_without_buffer( datatype, size_a, size_d, & slice_C );

  // Perform computation.
  for( iter_b = 0; iter_b < size_b; iter_b++ ) {
   
    for( iter_c = 0; iter_c < size_c; iter_c++ ) {

      iter_a = 0;
      iter_d = 0;
      iter_i = 0;

      idx_C = ( ( size_t ) iter_a ) +
              ( ( size_t ) iter_b * size_a ) +
              ( ( size_t ) iter_c * size_ab ) +
              ( ( size_t ) iter_d * size_abc ),
      FLA_Obj_attach_buffer( & buff_cb_C[ idx_C ], 1, size_abc, & slice_C );
      MyFLA_Obj_set_to_zero( slice_C );
 
      for( iter_j = 0; iter_j < size_j; iter_j++ ) {

        // Define Ai.
        idx_A = ( ( size_t ) iter_i ) +
                ( ( size_t ) iter_a * size_i ) +
                ( ( size_t ) iter_j * size_ia ) +
                ( ( size_t ) iter_b * size_iaj );
        FLA_Obj_attach_buffer( & buff_cb_A[ idx_A ], 1, size_i, & slice_A );

        // Define Bi.
        buff_slice_B = ( double * ) FLA_Obj_buffer_at_view( slice_B );
        ldim_slice_B = FLA_Obj_col_stride( slice_B );
        for( jj = 0; jj < size_d; jj++ ) {
          for( ii = 0; ii < size_i; ii++ ) {
            idx_B = ( ( size_t ) iter_j ) +
                    ( ( size_t ) iter_c * size_j ) + 
                    ( ( size_t ) ii * size_jc ) +
                    ( ( size_t ) jj * size_jci );
            buff_slice_B[ ii + jj * ldim_slice_B ] = buff_cb_B[ idx_B ];
          }
        }

        // Compute Ai' * Bi.
        FLA_Gemm( FLA_TRANSPOSE, FLA_NO_TRANSPOSE, 
                  FLA_ONE, slice_A, slice_B, FLA_ONE, slice_C );
      }
    }
  }

  // Show data.
  if( print_data == 1 ) {
    FLA_Obj_show( " cb_A_f = [ ", cb_A, "%le", " ];" );
    FLA_Obj_show( " cb_B_f = [ ", cb_B, "%le", " ];" );
    FLA_Obj_show( " cb_C_f = [ ", cb_C, "%le", " ];" );
  }

  // Remove temporal slices.
  FLA_Obj_free_without_buffer( & slice_A );
  FLA_Obj_free( & slice_B );
  FLA_Obj_free_without_buffer( & slice_C );
}
Example #17
0
int main(int argc, char *argv[])
{
  int n, nfirst, nlast, ninc, nlast_unb, i, irep,
    nrepeats, nb_alg;

  double
    dtime, dtime_best, 
    gflops, max_gflops,
    diff, d_n;

  FLA_Obj
    A, Aref, Aold, delta;
  
  /* Initialize FLAME */
  FLA_Init( );

  /* Every time trial is repeated "repeat" times and the fastest run in recorded */
  printf( "%% number of repeats:" );
  scanf( "%d", &nrepeats );
  printf( "%% %d\n", nrepeats );

  /* Enter the max GFLOPS attainable 
     This is used to set the y-axis range for the graphs. Here is how
     you figure out what to enter (on Linux machines):
     1) more /proc/cpuinfo   (this lists the contents of this file).
     2) read through this and figure out the clock rate of the machine (in GHz).
     3) Find out (from an expert of from the web) the number of floating point
        instructions that can be performed per core per clock cycle.
     4) Figure out if you are using "multithreaded BLAS" which automatically
        parallelize calls to the Basic Linear Algebra Subprograms.  If so,
        check how many cores are available.
     5) Multiply 2) x 3) x 4) and enter this in response to the below.

     If you enter a value for max GFLOPS that is lower that the maximum that
     is observed in the experiments, then the top of the graph is set to the 
     observed maximum.  Thus, one possibility is to simply set this to 0.0.
  */

  printf( "%% enter max GFLOPS:" );
  scanf( "%lf", &max_gflops );
  printf( "%% %lf\n", max_gflops );

  /* Enter the algorithmic block size */
  printf( "%% enter nb_alg:" );
  scanf( "%d", &nb_alg );
  printf( "%% %d\n", nb_alg );

  /* Timing trials for matrix sizes n=nfirst to nlast in increments 
     of ninc will be performed.  Unblocked versions are only tested to
     nlast_unb */
  printf( "%% enter nfirst, nlast, ninc, nlast_unb:" );
  scanf( "%d%d%d%d", &nfirst, &nlast, &ninc, &nlast_unb );
  printf( "%% %d %d %d %d\n", nfirst, nlast, ninc, nlast_unb );

  i = 1;
  for ( n=nfirst; n<= nlast; n+=ninc ){
   
    /* Allocate space for the matrices */
    FLA_Obj_create( FLA_DOUBLE, n, n, 1, n, &A );
    FLA_Obj_create( FLA_DOUBLE, n, n, 1, n, &Aref );
    FLA_Obj_create( FLA_DOUBLE, n, n, 1, n, &Aold );
    FLA_Obj_create( FLA_DOUBLE, 1, 1, 1, 1, &delta );

    /* Generate random matrix A and save in Aold */
    FLA_Random_matrix( Aold );

    /* Add something large to the diagonal to make sure it isn't ill-conditionsed */
    d_n = ( double ) n;
    *( ( double * ) FLA_Obj_buffer_at_view( delta ) ) = d_n;
    FLA_Shift_diag( FLA_NO_CONJUGATE, delta, Aold );
    
    /* Set gflops = billions of floating point operations that will be performed */
    gflops = 1.0/3.0 * n * n * n * 1.0e-09;

    /* Time the reference implementation */
#if TIME_LAPACK == TRUE

#else
    //    if ( n <= nlast_unb )
#endif
    {
      for ( irep=0; irep<nrepeats; irep++ ){
	FLA_Copy( Aold, Aref );
    
	dtime = FLA_Clock();
    
	REF_Chol( TIME_LAPACK, Aref, nb_alg );
    
	dtime = FLA_Clock() - dtime;
    
	if ( irep == 0 ) 
	  dtime_best = dtime;
	else
	  dtime_best = ( dtime < dtime_best ? dtime : dtime_best );
      }
  
      printf( "data_REF( %d, 1:2 ) = [ %d %le ];\n", i, n,
	      gflops / dtime_best );
      fflush( stdout );
    }  

    /* Time FLA_Chol */

    for ( irep=0; irep<nrepeats; irep++ ){
      FLA_Copy( Aold, A );

      dtime = FLA_Clock();

      FLA_Chol( FLA_LOWER_TRIANGULAR, A );

      dtime = FLA_Clock() - dtime;

      if ( irep == 0 ) 
	dtime_best = dtime;
      else
	dtime_best = ( dtime < dtime_best ? dtime : dtime_best );
    }

    printf( "data_FLAME( %d, 1:2 ) = [ %d %le ];\n", i, n,
            gflops / dtime_best );

    if ( gflops / dtime_best > max_gflops ) 
      max_gflops = gflops / dtime_best;

    fflush( stdout );


    /* Time the your implementations */


    /* Variant 1 unblocked */
    
    if ( n <= nlast_unb ){
      for ( irep=0; irep<nrepeats; irep++ ){

	FLA_Copy( Aold, A );
    
	dtime = FLA_Clock();

#if TIME_UNB_VAR1 == TRUE
	Chol_unb_var1( A );
#else
	REF_Chol( TIME_LAPACK, A, nb_alg );
#endif


	dtime = FLA_Clock() - dtime;

	if ( irep == 0 ) 
	  dtime_best = dtime;
	else
	  dtime_best = ( dtime < dtime_best ? dtime : dtime_best );
      }    

      diff = FLA_Max_elemwise_diff( A, Aref );

      printf( "data_unb_var1( %d, 1:3 ) = [ %d %le  %le];\n", i, n,
	      gflops / dtime_best, diff );
      fflush( stdout );
    }

    /* Variant 1 blocked */

    for ( irep=0; irep<nrepeats; irep++ ){
      FLA_Copy( Aold, A );
    
      dtime = FLA_Clock();

#if TIME_BLK_VAR1 == TRUE
      Chol_blk_var1( A, nb_alg );
#else
      REF_Chol( TIME_LAPACK, A, nb_alg );
#endif

      dtime = FLA_Clock() - dtime;

      if ( irep == 0 ) 
	dtime_best = dtime;
      else
	dtime_best = ( dtime < dtime_best ? dtime : dtime_best );
    }

    diff = FLA_Max_elemwise_diff( A, Aref );

    printf( "data_blk_var1( %d, 1:3 ) = [ %d %le  %le];\n", i, n,
            gflops / dtime_best, diff );
    fflush( stdout );


    /* Variant 2 unblocked */
    if ( n <= nlast_unb ){
      for ( irep=0; irep<nrepeats; irep++ ){
	
	FLA_Copy( Aold, A );
	
	dtime = FLA_Clock();
	

#if TIME_UNB_VAR2 == TRUE
	Chol_unb_var2( A );
#else	
      REF_Chol( TIME_LAPACK, A, nb_alg );
#endif

	dtime = FLA_Clock() - dtime;
	
	if ( irep == 0 ) 
	  dtime_best = dtime;
	else
	  dtime_best = ( dtime < dtime_best ? dtime : dtime_best );
      }    
      
      diff = FLA_Max_elemwise_diff( A, Aref );
      
      printf( "data_unb_var2( %d, 1:3 ) = [ %d %le  %le];\n", i, n,
	      gflops / dtime_best, diff );
      fflush( stdout );
    }

    /* Variant 2 blocked */

    for ( irep=0; irep<nrepeats; irep++ ){
      FLA_Copy( Aold, A );
    
      dtime = FLA_Clock();

#if TIME_BLK_VAR2 == TRUE
      Chol_blk_var2( A, nb_alg );
#else
      REF_Chol( TIME_LAPACK, A, nb_alg );
#endif

      dtime = FLA_Clock() - dtime;

      if ( irep == 0 ) 
	dtime_best = dtime;
      else
	dtime_best = ( dtime < dtime_best ? dtime : dtime_best );
    }

    diff = FLA_Max_elemwise_diff( A, Aref );

    printf( "data_blk_var2( %d, 1:3 ) = [ %d %le  %le];\n", i, n,
            gflops / dtime_best, diff );
    fflush( stdout );

    /* Variant 3 unblocked */
    if ( n <= nlast_unb ){
      for ( irep=0; irep<nrepeats; irep++ ){
	
	FLA_Copy( Aold, A );
	
	dtime = FLA_Clock();
	
#if TIME_UNB_VAR3 == TRUE
	Chol_unb_var3( A );
#else
      REF_Chol( TIME_LAPACK, A, nb_alg );
#endif

	dtime = FLA_Clock() - dtime;
	
	if ( irep == 0 ) 
	  dtime_best = dtime;
	else
	  dtime_best = ( dtime < dtime_best ? dtime : dtime_best );
      }    
      
      diff = FLA_Max_elemwise_diff( A, Aref );
      
      printf( "data_unb_var3( %d, 1:3 ) = [ %d %le  %le];\n", i, n,
	      gflops / dtime_best, diff );
      fflush( stdout );
    }

    /* Variant 3 blocked */

    for ( irep=0; irep<nrepeats; irep++ ){
      FLA_Copy( Aold, A );
    
      dtime = FLA_Clock();

#if TIME_BLK_VAR3 == TRUE
      Chol_blk_var3( A, nb_alg );
#else
      REF_Chol( TIME_LAPACK, A, nb_alg );
#endif

      dtime = FLA_Clock() - dtime;

      if ( irep == 0 ) 
	dtime_best = dtime;
      else
	dtime_best = ( dtime < dtime_best ? dtime : dtime_best );
    }

    diff = FLA_Max_elemwise_diff( A, Aref );

    printf( "data_blk_var3( %d, 1:3 ) = [ %d %le  %le];\n", i, n,
            gflops / dtime_best, diff );
    fflush( stdout );

    FLA_Obj_free( &A );
    FLA_Obj_free( &Aold );
    FLA_Obj_free( &Aref );
    FLA_Obj_free( &delta );
    printf( "\n" );

    i++;
  }

  /* Print the MATLAB commands to plot the data */

  /* Delete all existing figures */
  printf( "close all\n" );


#if OCTAVE == TRUE
  /* Plot the performance of FLAME */
  printf( "plot( data_FLAME( :,1 ), data_FLAME( :, 2 ), '-k;libflame;' ); \n" );

  /* Indicate that you want to add to the existing plot */
  printf( "hold on\n" );

  /* Plot the performance of the reference implementation */
  printf( "plot( data_REF( :,1 ), data_REF( :, 2 ), '-m;reference;' ); \n" );

  /* Plot the performance of your implementations */
  printf( "plot( data_unb_var1( :,1 ), data_unb_var1( :, 2 ), \"-rx;UnbVar1;\" ); \n" );
  printf( "plot( data_unb_var2( :,1 ), data_unb_var2( :, 2 ), \"-go;UnbVar2;\" ); \n" );
  printf( "plot( data_unb_var3( :,1 ), data_unb_var3( :, 2 ), \"-b*;UnbVar3;\" ); \n" );
  printf( "plot( data_blk_var1( :,1 ), data_blk_var1( :, 2 ), \"-rx;BlkVar1;\", \"markersize\", 3 ); \n" );
  printf( "plot( data_blk_var2( :,1 ), data_blk_var2( :, 2 ), \"-go;BlkVar2;\", \"markersize\", 3  ); \n" );
  printf( "plot( data_blk_var3( :,1 ), data_blk_var3( :, 2 ), \"-b*;BlkVar3;\", \"markersize\", 3  ); \n" );

#else

  /* Plot the performance of FLAME */
  printf( "plot( data_FLAME( :,1 ), data_FLAME( :, 2 ), 'k--' ); \n" );

  /* Indicate that you want to add to the existing plot */
  printf( "hold on\n" );

  /* Plot the performance of the reference implementation */
  printf( "plot( data_REF( :,1 ), data_REF( :, 2 ), 'k-' ); \n" );

  /* Plot the performance of your implementations */
  printf( "plot( data_unb_var1( :,1 ), data_unb_var1( :, 2 ), 'r-.x' ); \n" );
  printf( "plot( data_unb_var2( :,1 ), data_unb_var2( :, 2 ), 'g-.o' ); \n" );
  printf( "plot( data_unb_var3( :,1 ), data_unb_var3( :, 2 ), 'b-.*' ); \n" );
  printf( "plot( data_blk_var1( :,1 ), data_blk_var1( :, 2 ), 'r-x'); \n" );
  printf( "plot( data_blk_var2( :,1 ), data_blk_var2( :, 2 ), 'g-o'); \n" );
  printf( "plot( data_blk_var3( :,1 ), data_blk_var3( :, 2 ), 'b-*'); \n" );
#endif

  printf( "hold off \n");

  printf( "xlabel( 'matrix dimension m=n' );\n");
  printf( "ylabel( 'GFLOPS/sec.' );\n");
  printf( "axis( [ 0 %d 0 %3.1f ] ); \n", nlast, max_gflops );

#if OCTAVE == TRUE
  printf( "legend( 2 ); \n" );

  printf(" print -landscape -solid -color -deps -F:24 Chol.eps\n" );
#else
  printf( "legend( 'FLA Chol', ...\n");
  printf( "        'Simple loops', ...\n");
  printf( "        'unb var1', ...\n");
  printf( "        'unb var2', ...\n");
  printf( "        'unb var3', ...\n");
  printf( "        'blk var1', ...\n");
  printf( "        'blk var2', ...\n");
  printf( "        'blk var3', 2);\n");

  printf( "print -r100 -dpdf Chol.pdf\n");
#endif

  FLA_Finalize( );

  exit( 0 );
}
Example #18
0
int main( int argc, char** argv ) {
  FLA_Datatype datatype = TESTTYPE;
  FLA_Datatype realtype = REALTYPE;
  FLA_Obj      
    A, TU, TV, 
    A_copy, A_recovered,
    U, V, Vb, B, Be, d, e, 
    DU, DV;

  FLA_Obj     
    ATL, ATR,
    ABL, ABR, Ae;

  FLA_Uplo     uplo;
  dim_t        m, n, min_m_n;
  FLA_Error    init_result; 

  double       residual_A = 0.0;

  if ( argc == 3 ) {
    m = atoi(argv[1]);
    n = atoi(argv[2]);
    min_m_n = min(m,n);
  } else {
    fprintf(stderr, "       \n");
    fprintf(stderr, "Usage: %s m n\n", argv[0]);
    fprintf(stderr, "       m : matrix length\n");
    fprintf(stderr, "       n : matrix width\n");
    fprintf(stderr, "       \n");
    return -1;
  }
  if ( m == 0 || n == 0 )
    return 0;

  FLA_Init_safe( &init_result );          

  // FLAME Bidiag setup
  FLA_Obj_create( datatype, m, n, 0, 0, &A );
  FLA_Bidiag_UT_create_T( A, &TU, &TV );

  // Rand A and create A_copy.
  FLA_Random_matrix( A ); 
  {
    scomplex *buff_A = FLA_Obj_buffer_at_view( A );
    buff_A[0].real = 4.4011e-01; buff_A[0].imag = -4.0150e-09; buff_A[2].real = -2.2385e-01; buff_A[2].imag = -1.5546e-01; buff_A[4].real = -6.3461e-02; buff_A[4].imag = 2.7892e-01; buff_A[6].real = -1.3197e-01; buff_A[6].imag = 5.0888e-01;  
    buff_A[1].real = 3.3352e-01; buff_A[1].imag = -6.6346e-02; buff_A[3].real = -1.9307e-01; buff_A[3].imag = -8.4066e-02; buff_A[5].real = -6.0446e-03; buff_A[5].imag = 2.2094e-01; buff_A[7].real = -2.3299e-02; buff_A[7].imag = 4.0553e-01;
  }

  //FLA_Set_to_identity( A );
  //FLA_Scal( FLA_MINUS_ONE, A );

  if ( m >= n ) {
    uplo = FLA_UPPER_TRIANGULAR;
    FLA_Part_2x2( A, &ATL, &ATR,
                     &ABL, &ABR, min_m_n - 1, 1, FLA_TL );
    Ae = ATR; 
  } else {
    uplo = FLA_LOWER_TRIANGULAR;
    FLA_Part_2x2( A, &ATL, &ATR,
                     &ABL, &ABR, 1, min_m_n - 1, FLA_TL );
    Ae = ABL;
  }

  FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &A_copy );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_recovered );

  // Bidiag test
  {
    FLA_Obj      norm;
    FLA_Bool     apply_scale;

    FLA_Obj_create( realtype, 1,1, 0,0, &norm );

    FLA_Max_abs_value( A, norm );
    apply_scale = FLA_Obj_gt( norm, FLA_OVERFLOW_SQUARE_THRES ); 

    if ( apply_scale ) FLA_Scal( FLA_SAFE_MIN, A );
    FLA_Bidiag_UT( A, TU, TV );
    if ( apply_scale ) FLA_Bidiag_UT_scale_diagonals( FLA_SAFE_INV_MIN, A ); 

    FLA_Obj_free( &norm );
  }


  // Orthonomal basis U, V. 
  FLA_Obj_create( datatype, m, min_m_n, 0, 0, &U ); FLA_Set( FLA_ZERO, U );
  FLA_Obj_create( datatype, min_m_n, n, 0, 0, &V ); FLA_Set( FLA_ZERO, V );

  FLA_Bidiag_UT_form_U_ext( uplo, A, TU, FLA_NO_TRANSPOSE,   U );
  FLA_Bidiag_UT_form_V_ext( uplo, A, TV, FLA_CONJ_TRANSPOSE, V ); 

  if ( FLA_Obj_is_complex( A ) ){
    FLA_Obj rL, rR;
    
    FLA_Obj_create( datatype, min_m_n, 1, 0, 0, &rL );
    FLA_Obj_create( datatype, min_m_n, 1, 0, 0, &rR );

    FLA_Obj_fshow( stdout, " - Factor no realified - ", A, "% 6.4e", "------");
    FLA_Bidiag_UT_realify( A, rL, rR );
    FLA_Obj_fshow( stdout, " - Factor    realified - ", A, "% 6.4e", "------");

    FLA_Obj_fshow( stdout, " - rL - ", rL, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - rR - ", rR, "% 6.4e", "------");

    FLA_Apply_diag_matrix( FLA_RIGHT, FLA_CONJUGATE, rL, U );
    FLA_Apply_diag_matrix( FLA_LEFT,  FLA_CONJUGATE, rR, V );

    FLA_Obj_free( &rL );
    FLA_Obj_free( &rR );
  }

  // U^H U
  FLA_Obj_create( datatype, min_m_n, min_m_n, 0, 0, &DU );
  FLA_Gemm_external( FLA_CONJ_TRANSPOSE, FLA_NO_TRANSPOSE, 
                     FLA_ONE, U, U, FLA_ZERO, DU );

  // V^H V
  FLA_Obj_create( datatype, min_m_n, min_m_n, 0, 0, &DV );
  FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, 
                     FLA_ONE, V, V, FLA_ZERO, DV );
  
  // Recover the matrix
  FLA_Obj_create( datatype, min_m_n, min_m_n, 0, 0, &B );
  FLA_Set( FLA_ZERO, B );

  // Set B
  FLA_Obj_create( datatype, min_m_n, 1, 0, 0, &d );  
  FLA_Set_diagonal_vector( A, d );
  FLA_Set_diagonal_matrix( d, B );
  FLA_Obj_free( &d );

  if ( min_m_n > 1 ) {
    FLA_Obj_create( datatype, min_m_n - 1 , 1, 0, 0, &e );  
    FLA_Set_diagonal_vector( Ae, e );
    if ( uplo == FLA_UPPER_TRIANGULAR ) {
      FLA_Part_2x2( B, &ATL, &ATR,
                    &ABL, &ABR, min_m_n - 1, 1, FLA_TL );
      Be = ATR;
    } else {
      FLA_Part_2x2( B, &ATL, &ATR,
                    &ABL, &ABR, 1, min_m_n - 1, FLA_TL );
      Be = ABL;
    }
    FLA_Set_diagonal_matrix( e, Be );
    FLA_Obj_free( &e );
  }

  // Vb := B (V^H)
  FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, V, &Vb );
  FLA_Trmm_external( FLA_LEFT, uplo, FLA_NO_TRANSPOSE,
                     FLA_NONUNIT_DIAG, FLA_ONE, B, Vb );

  // A := U Vb
  FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
                     FLA_ONE, U, Vb, FLA_ZERO, A_recovered );

  residual_A    = FLA_Max_elemwise_diff( A_copy, A_recovered );

  if (1) {
    FLA_Obj_fshow( stdout, " - Given - ", A_copy, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - Factor - ", A, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - TU - ", TU, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - TV - ", TV, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - B - ", B, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - U - ", U, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - V - ", V, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - Vb - ", Vb, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - U'U - ", DU,  "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - VV' - ", DV,  "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - Recovered A - ", A_recovered, "% 6.4e", "------");
    fprintf( stdout, "lapack2flame: %lu x %lu: ", m, n);
    fprintf( stdout, "recovery A = %12.10e\n\n", residual_A ) ;
  }
  
  FLA_Obj_free( &A );
  FLA_Obj_free( &TU );
  FLA_Obj_free( &TV );

  FLA_Obj_free( &B );

  FLA_Obj_free( &U );
  FLA_Obj_free( &V );
  FLA_Obj_free( &Vb );

  FLA_Obj_free( &DU );
  FLA_Obj_free( &DV );

  FLA_Obj_free( &A_copy );
  FLA_Obj_free( &A_recovered );


  FLA_Finalize_safe( init_result );     
}
Example #19
0
int main( int argc, char** argv ) {
  FLA_Datatype datatype = TESTTYPE;
  FLA_Obj      A, A_flame, A_lapack, C;
  int          m;
  FLA_Error    init_result; 

  FLA_Obj TU, TV, U_flame, V_flame, d_flame, e_flame, B_flame;
  FLA_Obj tauq, taup, d_lapack, e_lapack, U_lapack, V_lapack, W, B_lapack;
  testtype *buff_tauq, *buff_taup, *buff_d_lapack, *buff_e_lapack, 
    *buff_W, *buff_A_lapack, *buff_U_lapack, *buff_V_lapack;
  int lwork, info, is_flame;
  
  if ( argc == 3 ) {
    m = atoi(argv[1]);
    is_flame = atoi(argv[2]);
  } else {
    fprintf(stderr, "       \n");
    fprintf(stderr, "Usage: %s m is_flame\n", argv[0]);
    fprintf(stderr, "       m : matrix length\n");
    fprintf(stderr, "       is_flame : 1 yes, 0 no\n");
    fprintf(stderr, "       \n");
    return -1;
  }
  if ( m == 0 )
    return 0;

  FLA_Init_safe( &init_result );          

  fprintf( stdout, "lapack2flame: %d x %d: \n", m, m);

  FLA_Obj_create( datatype, m, m, 0, 0, &A );
  FLA_Random_matrix( A ); 
  FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &A_flame  );
  FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &A_lapack );

  FLA_Obj_create( datatype, m, m, 0, 0, &C );
  FLA_Random_matrix( C ); 


  if ( is_flame ) {
    fprintf( stdout, " flame executed\n");
    FLA_Bidiag_UT_create_T( A_flame, &TU, &TV );

    FLA_Bidiag_UT( A_flame, TU, TV );
    FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A_flame, &U_flame );
    FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A_flame, &V_flame );

    FLA_Bidiag_UT_form_U( U_flame, TU, U_flame );
    FLA_Bidiag_UT_form_V( V_flame, TV, V_flame );
    
    FLA_Obj_create( datatype, m,      1, 0, 0, &d_flame );
    FLA_Obj_create( datatype, m - 1,  1, 0, 0, &e_flame );
    FLA_Bidiag_UT_extract_diagonals( A_flame, d_flame, e_flame );

    FLA_Obj_create( datatype, m, m, 0, 0, &B_flame ); FLA_Set( FLA_ZERO, B_flame );
    {
      FLA_Obj BTL, BTR, BBL, BBR;
      FLA_Part_2x2( B_flame, &BTL, &BTR, &BBL, &BBR, 1,1, FLA_BL );
      FLA_Set_diagonal_matrix( d_flame, B_flame );
      FLA_Set_diagonal_matrix( e_flame, BTR );
    }

    if (1) {
      fprintf( stdout, " - FLAME ----------\n");
      FLA_Obj_fshow( stdout, " - Given A - ", A, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - A - ", A_flame, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - U - ", U_flame, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - V - ", V_flame, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - d - ", d_flame, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - e - ", e_flame, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - B - ", B_flame, "% 6.4e", "------");
    }
  } else {
    fprintf( stdout, " lapack executed\n");

    FLA_Obj_create( datatype, m, 1, 0, 0, &tauq );
    FLA_Obj_create( datatype, m, 1, 0, 0, &taup );
    FLA_Obj_create( datatype, m,      1, 0, 0, &d_lapack );
    FLA_Obj_create( datatype, m - 1,  1, 0, 0, &e_lapack );

    buff_A_lapack = (testtype*)FLA_Obj_buffer_at_view( A_lapack );
    buff_tauq     = (testtype*)FLA_Obj_buffer_at_view( tauq );
    buff_taup     = (testtype*)FLA_Obj_buffer_at_view( taup );
    buff_d_lapack = (testtype*)FLA_Obj_buffer_at_view( d_lapack );
    buff_e_lapack = (testtype*)FLA_Obj_buffer_at_view( e_lapack );

    lwork = 32*m;
    
    FLA_Obj_create( datatype, lwork, 1, 0, 0, &W );
    buff_W = (testtype*)FLA_Obj_buffer_at_view( W );
    sgebrd_( &m, &m, 
             buff_A_lapack, &m,
             buff_d_lapack,
             buff_e_lapack,
             buff_tauq,
             buff_taup,
             buff_W,
             &lwork,
             &info );

    FLA_Obj_create( datatype, m, m, 0, 0, &U_lapack );
    FLA_Obj_create( datatype, m, m, 0, 0, &V_lapack );
    
    FLA_Copy( A_lapack, U_lapack );
    FLA_Copy( A_lapack, V_lapack );

    buff_U_lapack = (testtype*)FLA_Obj_buffer_at_view( U_lapack );
    buff_V_lapack = (testtype*)FLA_Obj_buffer_at_view( V_lapack );

    sorgbr_( "Q", &m, &m, &m,
             buff_U_lapack, &m,
             buff_tauq, 
             buff_W,
             &lwork,
             &info );
    
    sorgbr_( "P", &m, &m, &m,
             buff_V_lapack, &m,
             buff_taup,
             buff_W,
             &lwork,
             &info );

    FLA_Obj_create( datatype, m, m, 0, 0, &B_lapack ); FLA_Set( FLA_ZERO, B_lapack );
    {
      FLA_Obj BTL, BTR, BBL, BBR;
      FLA_Part_2x2( B_lapack, &BTL, &BTR, &BBL, &BBR, 1,1, FLA_BL );
      FLA_Set_diagonal_matrix( d_lapack, B_lapack );
      FLA_Set_diagonal_matrix( e_lapack, BTR );
    }
    
    FLA_Obj_free( &W );    


    if (1) {
      fprintf( stdout, " - LAPACK ----------\n");
      FLA_Obj_fshow( stdout, " - Given A - ", A, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - A - ", A_lapack, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - U - ", U_lapack, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - V - ", V_lapack, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - d - ", d_lapack, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - e - ", e_lapack, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - B - ", B_lapack, "% 6.4e", "------");
    }
  }

  {
    testtype     dummy;
    int          zero = 0, one = 1;
    FLA_Obj      D_lapack;

    FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &D_lapack ); FLA_Set( FLA_ZERO, D_lapack );

    if ( is_flame ) {
      buff_d_lapack = (testtype*)FLA_Obj_buffer_at_view( d_flame );
      buff_e_lapack = (testtype*)FLA_Obj_buffer_at_view( e_flame );
      buff_U_lapack = (testtype*)FLA_Obj_buffer_at_view( U_flame );
      buff_V_lapack = (testtype*)FLA_Obj_buffer_at_view( V_flame );
    }

    FLA_Obj_create( datatype, 4*m, 1, 0, 0, &W );
    buff_W = (testtype*)FLA_Obj_buffer_at_view( W );
    sbdsqr_( "U", &m, &m, &m, &zero, 
             buff_d_lapack, buff_e_lapack, 
             buff_V_lapack, &m, 
             buff_U_lapack, &m, 
             &dummy, &one, 
             buff_W, &info );
    FLA_Obj_free( &W );
    if (info != 0)
      printf( " Error info = %d\n", info );

    if ( is_flame )
      FLA_Set_diagonal_matrix( d_flame, D_lapack );
    else
      FLA_Set_diagonal_matrix( d_lapack, D_lapack );

    if ( is_flame ) {
      fprintf( stdout, " - FLAME ----------\n");
      FLA_Obj_fshow( stdout, " - U - ", U_flame, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - V - ", V_flame, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - d - ", d_flame, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - e - ", e_flame, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - D - ", D_lapack, "% 6.4e", "------");
    } else {
      fprintf( stdout, " - LAPACK ----------\n");
      FLA_Obj_fshow( stdout, " - U - ", U_lapack, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - V - ", V_lapack, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - d - ", d_lapack, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - e - ", e_lapack, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - D - ", D_lapack, "% 6.4e", "------");
    }

    FLA_Obj_free( &D_lapack );
  }

  if ( is_flame ) {
    FLA_Obj_free( &TU );
    FLA_Obj_free( &TV );
    FLA_Obj_free( &U_flame );
    FLA_Obj_free( &V_flame );
    FLA_Obj_free( &d_flame );
    FLA_Obj_free( &e_flame );
    FLA_Obj_free( &B_flame );
  } else {
    FLA_Obj_free( &tauq );
    FLA_Obj_free( &taup );
    FLA_Obj_free( &d_lapack );
    FLA_Obj_free( &e_lapack );
    FLA_Obj_free( &U_lapack );
    FLA_Obj_free( &V_lapack );
    FLA_Obj_free( &B_lapack );
  }
  FLA_Obj_free( &A );
  FLA_Obj_free( &A_flame );
  FLA_Obj_free( &A_lapack );

  FLA_Obj_free( &C );

  FLA_Finalize_safe( init_result );     
}
// ============================================================================
void compute_case4b( int size_a, int size_b, int size_c, int size_d,
                     int size_i, int size_j, FLA_Obj cb_A, FLA_Obj cb_B, FLA_Obj cb_C,
                     int print_data ) {
    FLA_Obj  slice_C;
    int      datatype, size_ab, size_abc, size_ia, size_iaj, size_jc, size_jci,
             iter_a, iter_b, iter_c, iter_d, iter_i, iter_j, ldim_slice_C;
    size_t   idx_A, idx_B, idx_C;
    double   * buff_cb_A, * buff_cb_B, * buff_cb_C, * buff_slice_C, d_one = 1.0;

    // Some initializations.
    datatype  = FLA_Obj_datatype( cb_A );
    buff_cb_A = ( double * ) FLA_Obj_buffer_at_view( cb_A );
    buff_cb_B = ( double * ) FLA_Obj_buffer_at_view( cb_B );
    buff_cb_C = ( double * ) FLA_Obj_buffer_at_view( cb_C );

    size_ab  = size_a * size_b;
    size_abc = size_a * size_b * size_c;

    size_ia  = size_i * size_a;
    size_iaj = size_i * size_a * size_j;

    size_jc  = size_j * size_c;
    size_jci = size_j * size_c * size_i;

    // Show data.
    if( print_data == 1 ) {
        FLA_Obj_show( " cb_A_i = [ ", cb_A, "%le", " ];" );
        FLA_Obj_show( " cb_B_i = [ ", cb_B, "%le", " ];" );
        FLA_Obj_show( " cb_C_i = [ ", cb_C, "%le", " ];" );
    }

    // Prepare temporal slices without buffer.
    FLA_Obj_create_without_buffer( datatype, size_a, size_c, & slice_C );
#if 0
    FLA_Obj_create_without_buffer( datatype, size_a, 1, & slice_A );
    FLA_Obj_create_without_buffer( datatype, size_c, 1, & slice_B );
#endif

    // Perform computation.
    for( iter_b = 0; iter_b < size_b; iter_b++ ) {

        for( iter_d = 0; iter_d < size_d; iter_d++ ) {

            // Define slice_C.
            iter_a = 0;
            iter_c = 0;
            idx_C = ( ( size_t ) iter_a ) +
                    ( ( size_t ) iter_b * size_a ) +
                    ( ( size_t ) iter_c * size_ab ) +
                    ( ( size_t ) iter_d * size_abc );
            FLA_Obj_attach_buffer( & buff_cb_C[ idx_C ], 1, size_ab, & slice_C );
            buff_slice_C = ( double * ) FLA_Obj_buffer_at_view( slice_C );
            ldim_slice_C = FLA_Obj_col_stride( slice_C );

            // Initialize slice_C.
            MyFLA_Obj_set_to_zero( slice_C );

            for( iter_i = 0; iter_i < size_i; iter_i++ ) {

                for( iter_j = 0; iter_j < size_j; iter_j++ ) {
#if 0
                    // Define slice_A.
                    FLA_Obj_attach_buffer(
                        & buff_cb_A[ iter_i + 0 * size_i + iter_j * size_ia +
                                     iter_b * size_iaj ],
                        size_i, 1, & slice_A );

                    // Define slice_B.
                    FLA_Obj_attach_buffer(
                        & buff_cb_B[ iter_j + 0 * size_j + iter_i * size_jc +
                                     iter_d * size_jci ],
                        size_j, 1, & slice_B );

                    // Compute DGER operation.
                    FLA_Ger( FLA_ONE, slice_A, slice_B, slice_C );
#endif
                    idx_A = ( ( size_t ) iter_i ) +
                            ( ( size_t ) 0 * size_i ) +
                            ( ( size_t ) iter_j * size_ia ) +
                            ( ( size_t ) iter_b * size_iaj );
                    idx_B = ( ( size_t ) iter_j ) +
                            ( ( size_t ) 0 * size_j ) +
                            ( ( size_t ) iter_i * size_jc ) +
                            ( ( size_t ) iter_d * size_jci );

                    dger_( & size_a, & size_c,
                           & d_one,
                           & buff_cb_A[ idx_A ], & size_i,
                           & buff_cb_B[ idx_B ], & size_j,
                           buff_slice_C, & ldim_slice_C );
                }
            }

        }
    }

    // Show data.
    if( print_data == 1 ) {
        FLA_Obj_show( " cb_A_f = [ ", cb_A, "%le", " ];" );
        FLA_Obj_show( " cb_B_f = [ ", cb_B, "%le", " ];" );
        FLA_Obj_show( " cb_C_f = [ ", cb_C, "%le", " ];" );
    }

    // Remove temporal slices.
    FLA_Obj_free_without_buffer( & slice_C );
#if 0
    FLA_Obj_free_without_buffer( & slice_A );
    FLA_Obj_free_without_buffer( & slice_B );
#endif
}