Exemple #1
0
int main(int argc, char *argv[])
{
  int 
    m_input,
    m,
    p_first, p_last, p_inc,
    p,
    nb_alg,
    variant,
    n_repeats,
    i, j,
    datatype,
    n_variants = 4;

  int  sign;
  
  int  blocksize[16];

  char *colors = "brkgmcbrkg";
  char *ticks  = "o+*xso+*xs";
  char m_dim_desc[14];
  char n_dim_desc[14];
  char m_dim_tag[10];
  char n_dim_tag[10];

  double max_gflops=6.0;

  double
    dtime,
    gflops,
    diff;

  FLA_Obj
    A, C, C_ref, scale, isgn, norm;
  

  FLA_Init();


  fprintf( stdout, "%c number of repeats:", '%' );
  scanf( "%d", &n_repeats );
  fprintf( stdout, "%c %d\n", '%', n_repeats );

  fprintf( stdout, "%c Enter sign (-1 or 1):", '%' );
  scanf( "%d", &sign );
  fprintf( stdout, "%c %d\n", '%', sign );

  fprintf( stdout, "%c Enter blocking size:", '%' );
  scanf( "%d", &nb_alg );
  fprintf( stdout, "%c %d\n", '%', nb_alg );

  fprintf( stdout, "%c enter problem size first, last, inc:", '%' );
  scanf( "%d%d%d", &p_first, &p_last, &p_inc );
  fprintf( stdout, "%c %d %d %d\n", '%', p_first, p_last, p_inc );

  fprintf( stdout, "%c enter m (-1 means bind to problem size): ", '%' );
  scanf( "%d", &m_input );
  fprintf( stdout, "%c %d\n", '%', m_input );


  fprintf( stdout, "\n" );


  if     ( m_input >  0 ) {
    sprintf( m_dim_desc, "m = %d", m_input );
    sprintf( m_dim_tag,  "m%dc", m_input);
  }
  else if( m_input <  -1 ) {
    sprintf( m_dim_desc, "m = p/%d", -m_input );
    sprintf( m_dim_tag,  "m%dp", -m_input );
  }
  else if( m_input == -1 ) {
    sprintf( m_dim_desc, "m = p" );
    sprintf( m_dim_tag,  "m%dp", 1 );
  }


  if ( 0 < sign )
    isgn = FLA_ONE;
  else
    isgn = FLA_MINUS_ONE;


  for ( p = p_first, i = 1; p <= p_last; p += p_inc, i += 1 )
  {

    m = m_input;

    if( m < 0 ) m = p / abs(m_input);

    //datatype = FLA_FLOAT;
    //datatype = FLA_DOUBLE;
    //datatype = FLA_COMPLEX;
    datatype = FLA_DOUBLE_COMPLEX;

    FLA_Obj_create( datatype, m, m, 0, 0, &A );
    FLA_Obj_create( datatype, m, m, 0, 0, &C );
    FLA_Obj_create( datatype, m, m, 0, 0, &C_ref );

    FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &scale );
    FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );

    FLA_Random_tri_matrix( FLA_UPPER_TRIANGULAR, FLA_NONUNIT_DIAG, A );
    FLA_Triangularize( FLA_UPPER_TRIANGULAR, FLA_NONUNIT_DIAG, A );
    FLA_Norm1( A, norm );
    FLA_Shift_diag( FLA_NO_CONJUGATE, norm, A );

    FLA_Random_matrix( C );
    FLA_Hermitianize( FLA_UPPER_TRIANGULAR, C );

/*
    time_Lyap_h( 0, FLA_ALG_REFERENCE, n_repeats, m, nb_alg,
                 isgn, A, C, C_ref, scale, &dtime, &diff, &gflops );

    fprintf( stdout, "data_REF( %d, 1:2 ) = [ %d  %6.3lf ]; \n", i, p, gflops );
    fflush( stdout );
*/
    for ( variant = 1; variant <= n_variants; variant++ ){
      
      fprintf( stdout, "data_var%d( %d, 1:7 ) = [ %d  ", variant, i, p );
      fflush( stdout );

      time_Lyap_h( variant, FLA_ALG_UNBLOCKED, n_repeats, m, nb_alg,
                   isgn, A, C, C_ref, scale, &dtime, &diff, &gflops );

      fprintf( stdout, "%6.3lf %6.2le ", gflops, diff );
      fflush( stdout );

      time_Lyap_h( variant, FLA_ALG_UNB_OPT, n_repeats, m, nb_alg,
                   isgn, A, C, C_ref, scale, &dtime, &diff, &gflops );

      fprintf( stdout, "%6.3lf %6.2le ", gflops, diff );
      fflush( stdout );

      time_Lyap_h( variant, FLA_ALG_BLOCKED, n_repeats, m, nb_alg,
                   isgn, A, C, C_ref, scale, &dtime, &diff, &gflops );

      fprintf( stdout, "%6.3lf %6.2le ", gflops, diff );
      fflush( stdout );

      fprintf( stdout, " ]; \n" );
      fflush( stdout );
    }

    FLA_Obj_free( &A );
    FLA_Obj_free( &C );
    FLA_Obj_free( &C_ref );
    FLA_Obj_free( &scale );
    FLA_Obj_free( &norm );
    fprintf( stdout, "\n" );
  }

/*
  fprintf( stdout, "figure;\n" );

  fprintf( stdout, "plot( data_REF( :,1 ), data_REF( :, 2 ), '-' ); \n" );

  fprintf( stdout, "hold on;\n" );

  for ( i = 1; i <= n_variants; i++ ){
    fprintf( stdout, "plot( data_var%d( :,1 ), data_var%d( :, 2 ), '%c:%c' ); \n", 
             i, i, colors[ i-1 ], ticks[ i-1 ] );
  }

  fprintf( stdout, "legend( ... \n" );
  fprintf( stdout, "'Reference', ... \n" );

  for ( i = 1; i <= n_variants; i++ )
    fprintf( stdout, "'FLAME var%d', ... \n", i );

  fprintf( stdout, "'Location', 'SouthEast' ); \n" );

  fprintf( stdout, "xlabel( 'problem size p' );\n" );
  fprintf( stdout, "ylabel( 'GFLOPS/sec.' );\n" );
  fprintf( stdout, "axis( [ 0 %d 0 %.2f ] ); \n", p_last, max_gflops );
  fprintf( stdout, "title( 'FLAME sylv\\_nn performance (%s)' );\n", 
           m_dim_desc );
  fprintf( stdout, "print -depsc sylv_nn_%s.eps\n", m_dim_tag );
  fprintf( stdout, "hold off;\n");
  fflush( stdout );
*/
  FLA_Finalize( );

  return 0;
}
Exemple #2
0
int main(int argc, char *argv[])
{
  int 
    m_input,
    m,
    p_first, p_last, p_inc,
    p,
    k_accum,
    b_alg,
    n_iter_max,
    variant,
    n_repeats,
    i,
    n_variants = 2;

  char *colors = "brkgmcbrkg";
  char *ticks  = "o+*xso+*xs";
  char m_dim_desc[14];
  char m_dim_tag[10];

  double max_gflops=6.0;

  double
    dtime,
    gflops,
    diff1, diff2;

  FLA_Datatype datatype, dt_real;

  FLA_Obj
    A, l, Q, Ql, TT, r, d, e, A_orig, G, R, W2, de, alpha;

  FLA_Init();


  fprintf( stdout, "%c number of repeats:", '%' );
  scanf( "%d", &n_repeats );
  fprintf( stdout, "%c %d\n", '%', n_repeats );

  fprintf( stdout, "%c enter n_iter_max (per eigenvalue): ", '%' );
  scanf( "%d", &n_iter_max );
  fprintf( stdout, "%c %d\n", '%', n_iter_max );

  fprintf( stdout, "%c enter number of sets of Givens rotations to accumulate:", '%' );
  scanf( "%d", &k_accum );
  fprintf( stdout, "%c %d\n", '%', k_accum );

  fprintf( stdout, "%c enter blocking size for application of G:", '%' );
  scanf( "%d", &b_alg );
  fprintf( stdout, "%c %d\n", '%', b_alg );

  fprintf( stdout, "%c enter problem size first, last, inc:", '%' );
  scanf( "%d%d%d", &p_first, &p_last, &p_inc );
  fprintf( stdout, "%c %d %d %d\n", '%', p_first, p_last, p_inc );

  fprintf( stdout, "%c enter m (-1 means bind to problem size): ", '%' );
  scanf( "%d", &m_input );
  fprintf( stdout, "%c %d\n", '%', m_input );


  fprintf( stdout, "\n" );


  if     ( m_input >  0 ) {
    sprintf( m_dim_desc, "m = %d", m_input );
    sprintf( m_dim_tag,  "m%dc", m_input);
  }
  else if( m_input <  -1 ) {
    sprintf( m_dim_desc, "m = p/%d", -m_input );
    sprintf( m_dim_tag,  "m%dp", -m_input );
  }
  else if( m_input == -1 ) {
    sprintf( m_dim_desc, "m = p" );
    sprintf( m_dim_tag,  "m%dp", 1 );
  }


  for ( p = p_first, i = 1; p <= p_last; p += p_inc, i += 1 )
  {

    m = m_input;

    if( m < 0 ) m = p / abs(m_input);

    //datatype = FLA_FLOAT;
    //datatype = FLA_DOUBLE;
    //datatype = FLA_COMPLEX;
    datatype = FLA_DOUBLE_COMPLEX;

    FLA_Obj_create( datatype,  m,         m, 0, 0, &A );
    FLA_Obj_create( datatype,  m,         m, 0, 0, &A_orig );
    FLA_Obj_create( datatype,  m,         m, 0, 0, &Q );
    FLA_Obj_create( datatype,  m,         m, 0, 0, &Ql );
    FLA_Obj_create( datatype,  m,         1, 0, 0, &r );
    FLA_Obj_create( datatype,  m,         m, 0, 0, &W2 );
    FLA_Obj_create( datatype,  m-1, k_accum, 0, 0, &G );

	dt_real = FLA_Obj_datatype_proj_to_real( A );

    FLA_Obj_create( dt_real, m,      1, 0, 0, &l );
    FLA_Obj_create( dt_real, m,      1, 0, 0, &d );
    FLA_Obj_create( dt_real, m-1,    1, 0, 0, &e );
    FLA_Obj_create( dt_real, m,      m, 0, 0, &R );

	FLA_Obj_create( dt_real, 1,      1, 0, 0, &alpha );

	*FLA_DOUBLE_PTR( alpha ) = 1.0 / ( sqrt( sqrt( (double) m ) ) );

	FLA_Random_unitary_matrix( Q );

	//FLA_Fill_with_uniform_dist( FLA_ONE,   l );
	//FLA_Fill_with_inverse_dist( FLA_ONE,   l );
	FLA_Fill_with_geometric_dist( alpha,   l );


    {
      FLA_Copy( Q, Ql );
      FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, l, Ql );
      FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
                FLA_ONE, Ql, Q, FLA_ZERO, A );
      FLA_Triangularize( FLA_LOWER_TRIANGULAR, FLA_NONUNIT_DIAG, A );
      FLA_Copy( A, A_orig );
    }

    FLA_Set( FLA_ZERO, l );
    FLA_Set( FLA_ZERO, Q );

	FLA_Tridiag_UT_create_T( A, &TT );
	FLA_Tridiag_UT( FLA_LOWER_TRIANGULAR, A, TT );
	FLA_Tridiag_UT_realify( FLA_LOWER_TRIANGULAR, A, r );
	FLA_Tridiag_UT_extract_diagonals( FLA_LOWER_TRIANGULAR, A, d, e );
	FLA_Tridiag_UT_form_Q( FLA_LOWER_TRIANGULAR, A, TT );
	FLA_Apply_diag_matrix( FLA_RIGHT, FLA_CONJUGATE, r, A );
    FLA_Obj_free( &TT );

    time_Tevd_v( 0, FLA_ALG_REFERENCE, n_repeats, m, k_accum, b_alg, n_iter_max,
                 A_orig, d, e, G, R, W2, A, l, &dtime, &diff1, &diff2, &gflops );

    fprintf( stdout, "data_REFq( %d, 1:3 ) = [ %d %6.3lf %9.2e %6.2le %6.2le ]; \n", i, p, gflops, dtime, diff1, diff2 );
    fflush( stdout );

    for ( variant = 1; variant <= n_variants; variant++ ){
      
      fprintf( stdout, "data_var%d( %d, 1:3 ) = [ %d ", variant, i, p );
      fflush( stdout );

      time_Tevd_v( variant, FLA_ALG_UNB_OPT, n_repeats, m, k_accum, b_alg, n_iter_max,
                   A_orig, d, e, G, R, W2, A, l, &dtime, &diff1, &diff2, &gflops );

      fprintf( stdout, "%6.3lf %9.2e %6.2le %6.2le ", gflops, dtime, diff1, diff2 );
      fflush( stdout );

      fprintf( stdout, "];\n" );
      fflush( stdout );
    }

    fprintf( stdout, "\n" );

    FLA_Obj_free( &A );
    FLA_Obj_free( &A_orig );
    FLA_Obj_free( &Q );
    FLA_Obj_free( &Ql );
    FLA_Obj_free( &G );
    FLA_Obj_free( &W2 );
    FLA_Obj_free( &r );
    FLA_Obj_free( &l );
    FLA_Obj_free( &d );
    FLA_Obj_free( &e );
    FLA_Obj_free( &R );
    FLA_Obj_free( &alpha );
  }

/*
  fprintf( stdout, "figure;\n" );

  fprintf( stdout, "plot( data_REF( :,1 ), data_REF( :, 2 ), '-' ); \n" );

  fprintf( stdout, "hold on;\n" );

  for ( i = 1; i <= n_variants; i++ ) {
    fprintf( stdout, "plot( data_var%d( :,1 ), data_var%d( :, 2 ), '%c:%c' ); \n",
            i, i, colors[ i-1 ], ticks[ i-1 ] );
    fprintf( stdout, "plot( data_var%d( :,1 ), data_var%d( :, 4 ), '%c-.%c' ); \n",
            i, i, colors[ i-1 ], ticks[ i-1 ] );
  }

  fprintf( stdout, "legend( ... \n" );
  fprintf( stdout, "'Reference', ... \n" );

  for ( i = 1; i < n_variants; i++ )
    fprintf( stdout, "'unb\\_var%d', 'blk\\_var%d', ... \n", i, i );
  fprintf( stdout, "'unb\\_var%d', 'blk\\_var%d' ); \n", i, i );

  fprintf( stdout, "xlabel( 'problem size p' );\n" );
  fprintf( stdout, "ylabel( 'GFLOPS/sec.' );\n" );
  fprintf( stdout, "axis( [ 0 %d 0 %.2f ] ); \n", p_last, max_gflops );
  fprintf( stdout, "title( 'FLAME Hevd_lv performance (%s, %s)' );\n", 
           m_dim_desc, n_dim_desc );
  fprintf( stdout, "print -depsc tridiag_%s_%s.eps\n", m_dim_tag, n_dim_tag );
  fprintf( stdout, "hold off;\n");
  fflush( stdout );
*/

  FLA_Finalize( );

  return 0;
}
Exemple #3
0
void libfla_test_symm_experiment( test_params_t params,
                                  unsigned int  var,
                                  char*         sc_str,
                                  FLA_Datatype  datatype,
                                  unsigned int  p_cur,
                                  unsigned int  pci,
                                  unsigned int  n_repeats,
                                  signed int    impl,
                                  double*       perf,
                                  double*       residual )
{
	dim_t        b_flash    = params.b_flash;
	dim_t        b_alg_flat = params.b_alg_flat;
	double       time_min   = 1e9;
	double       time;
	unsigned int i;
	unsigned int m;
	signed int   m_input    = -1;
	unsigned int n;
	signed int   n_input    = -1;
	FLA_Side     side;
	FLA_Uplo     uplo;
	FLA_Obj      A, B, C, x, y, z, w, norm;
	FLA_Obj      alpha, beta;
	FLA_Obj      C_save;
	FLA_Obj      A_test, B_test, C_test;

	// Determine the dimensions.
	if ( m_input < 0 ) m = p_cur / abs(m_input);
	else               m = p_cur;
	if ( n_input < 0 ) n = p_cur / abs(n_input);
	else               n = p_cur;

	// Translate parameter characters to libflame constants.
	FLA_Param_map_char_to_flame_side( &pc_str[pci][0], &side );
	FLA_Param_map_char_to_flame_uplo( &pc_str[pci][1], &uplo );

	// Create the matrices for the current operation.
	if ( side == FLA_LEFT )
	{
		libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[0], m, m, &A );

		// Create vectors for use in test.
		FLA_Obj_create( datatype, n, 1, 0, 0, &x );
		FLA_Obj_create( datatype, m, 1, 0, 0, &y );
		FLA_Obj_create( datatype, m, 1, 0, 0, &z );
		FLA_Obj_create( datatype, m, 1, 0, 0, &w );
	}
	else
	{
		libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[0], n, n, &A );

		// Create vectors for use in test.
		FLA_Obj_create( datatype, n, 1, 0, 0, &x );
		FLA_Obj_create( datatype, m, 1, 0, 0, &y );
		FLA_Obj_create( datatype, m, 1, 0, 0, &z );
		FLA_Obj_create( datatype, n, 1, 0, 0, &w );
	}
	libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], m, n, &B );
	libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[2], m, n, &C );

	// Create a norm scalar.
	FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );

	// Initialize the test matrices.
	FLA_Random_symm_matrix( uplo, A );
	FLA_Random_matrix( B );
	FLA_Random_matrix( C );

	// Initialize the test vectors.
	FLA_Random_matrix( x );
    FLA_Set( FLA_ZERO, y );
    FLA_Set( FLA_ZERO, z );
    FLA_Set( FLA_ZERO, w );

	// Set constants.
	alpha = FLA_TWO;
	beta  = FLA_MINUS_ONE;

	// Save the original object contents in a temporary object.
	FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, C, &C_save );

	// Use hierarchical matrices if we're testing the FLASH front-end.
	if ( impl == FLA_TEST_HIER_FRONT_END )
	{
		FLASH_Obj_create_hier_copy_of_flat( A, 1, &b_flash, &A_test );
		FLASH_Obj_create_hier_copy_of_flat( B, 1, &b_flash, &B_test );
		FLASH_Obj_create_hier_copy_of_flat( C, 1, &b_flash, &C_test );
	}
	else
	{
		A_test = A;
		B_test = B;
		C_test = C;
	}

	// Create a control tree for the individual variants.
	if ( impl == FLA_TEST_FLAT_UNB_VAR ||
	     impl == FLA_TEST_FLAT_OPT_VAR ||
	     impl == FLA_TEST_FLAT_BLK_VAR ||
	     impl == FLA_TEST_FLAT_UNB_EXT ||
	     impl == FLA_TEST_FLAT_BLK_EXT )
		libfla_test_symm_cntl_create( var, b_alg_flat );

	// Repeat the experiment n_repeats times and record results.
	for ( i = 0; i < n_repeats; ++i )
	{
		if ( impl == FLA_TEST_HIER_FRONT_END )
			FLASH_Obj_hierarchify( C_save, C_test );
		else
			FLA_Copy_external( C_save, C_test );
		
		time = FLA_Clock();

		libfla_test_symm_impl( impl, side, uplo, alpha, A_test, B_test, beta, C_test );
		
		time = FLA_Clock() - time;
		time_min = min( time_min, time );
	}

	// Copy the solution to flat matrix X.
	if ( impl == FLA_TEST_HIER_FRONT_END )
	{
		FLASH_Obj_flatten( C_test, C );
	}
	else
    {
		// No action needed since C_test and C refer to the same object.
	}

	// Free the hierarchical matrices if we're testing the FLASH front-end.
	if ( impl == FLA_TEST_HIER_FRONT_END )
	{
		FLASH_Obj_free( &A_test );
		FLASH_Obj_free( &B_test );
		FLASH_Obj_free( &C_test );
	}

	// Free the control trees if we're testing the variants.
	if ( impl == FLA_TEST_FLAT_UNB_VAR ||
	     impl == FLA_TEST_FLAT_OPT_VAR ||
	     impl == FLA_TEST_FLAT_BLK_VAR ||
	     impl == FLA_TEST_FLAT_UNB_EXT ||
	     impl == FLA_TEST_FLAT_BLK_EXT )
		libfla_test_symm_cntl_free();

	// Compute the performance of the best experiment repeat.
	if ( side == FLA_LEFT )
		*perf = ( 1 * m * m * n ) / time_min / FLOPS_PER_UNIT_PERF;
	else
		*perf = ( 1 * m * n * n ) / time_min / FLOPS_PER_UNIT_PERF;
	if ( FLA_Obj_is_complex( A ) ) *perf *= 4.0;

	// Compute:
	//   y = C * x
	// and compare to
	//   z = ( beta * C_orig + alpha * A * B ) x      (side = left)
	//   z = ( beta * C_orig + alpha * B * A ) x      (side = right)
	FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_ONE, C, x, FLA_ZERO, y );

	if ( side == FLA_LEFT )
	{
		FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_ONE, B, x, FLA_ZERO, w );
		FLA_Symv_external( uplo,             alpha,   A, w, FLA_ZERO, z );
	}
	else
	{
		FLA_Symv_external( uplo,             FLA_ONE, A, x, FLA_ZERO, w );
		FLA_Gemv_external( FLA_NO_TRANSPOSE, alpha,   B, w, FLA_ZERO, z );
	}
	FLA_Gemv_external( FLA_NO_TRANSPOSE, beta, C_save, x, FLA_ONE, z );

	// Compute || y - z ||.
	//FLA_Axpy_external( FLA_MINUS_ONE, y, z );
	//FLA_Nrm2_external( z, norm );
	//FLA_Obj_extract_real_scalar( norm, residual );
	*residual = FLA_Max_elemwise_diff( y, z );

	// Free the supporting flat objects.
	FLA_Obj_free( &C_save );

	// Free the flat test matrices.
	FLA_Obj_free( &A );
	FLA_Obj_free( &B );
	FLA_Obj_free( &C );
	FLA_Obj_free( &x );
	FLA_Obj_free( &y );
	FLA_Obj_free( &z );
	FLA_Obj_free( &w );
	FLA_Obj_free( &norm );
}
FLA_Error FLA_Bidiag_blk_external( FLA_Obj A, FLA_Obj tu, FLA_Obj tv )
{
  int          info = 0;
#ifdef FLA_ENABLE_EXTERNAL_LAPACK_INTERFACES
  FLA_Datatype datatype;
  int          m_A, n_A, cs_A;
  int          min_m_n, max_m_n;
  int          lwork;
  FLA_Obj      d, e, work_obj;

  if ( FLA_Check_error_level() == FLA_FULL_ERROR_CHECKING )
    FLA_Bidiag_check( A, tu, tv );

  if ( FLA_Obj_has_zero_dim( A ) ) return FLA_SUCCESS;

  datatype = FLA_Obj_datatype( A );

  m_A      = FLA_Obj_length( A );
  n_A      = FLA_Obj_width( A );
  min_m_n  = FLA_Obj_min_dim( A );
  max_m_n  = FLA_Obj_max_dim( A );
  cs_A     = FLA_Obj_col_stride( A );

  FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), min_m_n,     1, 0, 0, &d );
  FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), min_m_n - 1, 1, 0, 0, &e );

  lwork    = (m_A + n_A) * FLA_Query_blocksize( datatype, FLA_DIMENSION_MIN );
  FLA_Obj_create( datatype, lwork, 1, 0, 0, &work_obj );


  switch( datatype ){

  case FLA_FLOAT:
  {
    float* buff_A    = ( float * ) FLA_FLOAT_PTR( A );
    float* buff_d    = ( float * ) FLA_FLOAT_PTR( d );
    float* buff_e    = ( float * ) FLA_FLOAT_PTR( e );
    float* buff_tu   = ( float * ) FLA_FLOAT_PTR( tu );
    float* buff_tv   = ( float * ) FLA_FLOAT_PTR( tv );
    float* buff_work = ( float * ) FLA_FLOAT_PTR( work_obj );

    F77_sgebrd( &m_A,
                &n_A,
                buff_A, &cs_A,
                buff_d,
                buff_e,
                buff_tu,
                buff_tv,
                buff_work,
                &lwork,
                &info );

    break;
  }

  case FLA_DOUBLE:
  {
    double* buff_A    = ( double * ) FLA_DOUBLE_PTR( A );
    double* buff_d    = ( double * ) FLA_DOUBLE_PTR( d );
    double* buff_e    = ( double * ) FLA_DOUBLE_PTR( e );
    double* buff_tu   = ( double * ) FLA_DOUBLE_PTR( tu );
    double* buff_tv   = ( double * ) FLA_DOUBLE_PTR( tv );
    double* buff_work = ( double * ) FLA_DOUBLE_PTR( work_obj );

    F77_dgebrd( &m_A,
                &n_A,
                buff_A, &cs_A,
                buff_d,
                buff_e,
                buff_tu,
                buff_tv,
                buff_work,
                &lwork,
                &info );

    break;
  } 

  case FLA_COMPLEX:
  {
    scomplex* buff_A    = ( scomplex * ) FLA_COMPLEX_PTR( A );
    float*    buff_d    = ( float    * ) FLA_FLOAT_PTR( d );
    float*    buff_e    = ( float    * ) FLA_FLOAT_PTR( e );
    scomplex* buff_tu   = ( scomplex * ) FLA_COMPLEX_PTR( tu );
    scomplex* buff_tv   = ( scomplex * ) FLA_COMPLEX_PTR( tv );
    scomplex* buff_work = ( scomplex * ) FLA_COMPLEX_PTR( work_obj );

    F77_cgebrd( &m_A,
                &n_A,
                buff_A, &cs_A,
                buff_d,
                buff_e,
                buff_tu,
                buff_tv,
                buff_work,
                &lwork,
                &info );

    break;
  } 

  case FLA_DOUBLE_COMPLEX:
  {
    dcomplex* buff_A    = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( A );
    double*   buff_d    = ( double   * ) FLA_DOUBLE_PTR( d );
    double*   buff_e    = ( double   * ) FLA_DOUBLE_PTR( e );
    dcomplex* buff_tu   = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( tu );
    dcomplex* buff_tv   = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( tv );
    dcomplex* buff_work = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( work_obj );

    F77_zgebrd( &m_A,
                &n_A,
                buff_A, &cs_A,
                buff_d,
                buff_e,
                buff_tu,
                buff_tv,
                buff_work,
                &lwork,
                &info );

    break;
  } 

  }

  FLA_Obj_free( &d );
  FLA_Obj_free( &e );
  FLA_Obj_free( &work_obj );
#else
  FLA_Check_error_code( FLA_EXTERNAL_LAPACK_NOT_IMPLEMENTED );
#endif

  return info;
}
FLA_Error FLA_Svd_uv_unb_var1( dim_t n_iter_max, FLA_Obj A, FLA_Obj s, FLA_Obj U, FLA_Obj V, dim_t k_accum, dim_t b_alg )
{
    FLA_Error    r_val = FLA_SUCCESS;
    FLA_Datatype dt;
    FLA_Datatype dt_real;
    FLA_Datatype dt_comp;
    FLA_Obj      scale, T, S, rL, rR, d, e, G, H;
    dim_t        m_A, n_A;
    dim_t        min_m_n;
    dim_t        n_GH;
    double       crossover_ratio = 17.0 / 9.0;

    n_GH    = k_accum;

    m_A     = FLA_Obj_length( A );
    n_A     = FLA_Obj_width( A );
    min_m_n = FLA_Obj_min_dim( A );
    dt      = FLA_Obj_datatype( A );
    dt_real = FLA_Obj_datatype_proj_to_real( A );
    dt_comp = FLA_Obj_datatype_proj_to_complex( A );

    // Create matrices to hold block Householder transformations.
    FLA_Bidiag_UT_create_T( A, &T, &S );

    // Create vectors to hold the realifying scalars.
    FLA_Obj_create( dt,      min_m_n,      1, 0, 0, &rL );
    FLA_Obj_create( dt,      min_m_n,      1, 0, 0, &rR );

    // Create vectors to hold the diagonal and sub-diagonal.
    FLA_Obj_create( dt_real, min_m_n,      1, 0, 0, &d );
    FLA_Obj_create( dt_real, min_m_n-1,    1, 0, 0, &e );

    // Create matrices to hold the left and right Givens scalars.
    FLA_Obj_create( dt_comp, min_m_n-1, n_GH, 0, 0, &G );
    FLA_Obj_create( dt_comp, min_m_n-1, n_GH, 0, 0, &H );

    // Create a real scaling factor.
    FLA_Obj_create( dt_real, 1, 1, 0, 0, &scale );

    // Compute a scaling factor; If none is needed, sigma will be set to one.
    FLA_Svd_compute_scaling( A, scale );

    // Scale the matrix if scale is non-unit.
    if ( !FLA_Obj_equals( scale, FLA_ONE ) )
        FLA_Scal( scale, A );

    if ( m_A < crossover_ratio * n_A )
    {
        // Reduce the matrix to bidiagonal form.
        // Apply scalars to rotate elements on the superdiagonal to the real domain.
        // Extract the diagonal and superdiagonal from A.
        FLA_Bidiag_UT( A, T, S );
        FLA_Bidiag_UT_realify( A, rL, rR );
        FLA_Bidiag_UT_extract_real_diagonals( A, d, e );

        // Form U and V.
        FLA_Bidiag_UT_form_U( A, T, U );
        FLA_Bidiag_UT_form_V( A, S, V );

        // Apply the realifying scalars in rL and rR to U and V, respectively.
        {
            FLA_Obj UL, UR;
            FLA_Obj VL, VR;

            FLA_Part_1x2( U,   &UL, &UR,   min_m_n, FLA_LEFT );
            FLA_Part_1x2( V,   &VL, &VR,   min_m_n, FLA_LEFT );

            FLA_Apply_diag_matrix( FLA_RIGHT, FLA_CONJUGATE,    rL, UL );
            FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, rR, VL );
        }

        // Perform a singular value decomposition on the bidiagonal matrix.
        r_val = FLA_Bsvd_v_opt_var1( n_iter_max, d, e, G, H, U, V, b_alg );
    }
    else // if ( crossover_ratio * n_A <= m_A )
    {
        FLA_Obj TQ, R;
        FLA_Obj AT,
                AB;
        FLA_Obj UL, UR;

        // Perform a QR factorization on A and form Q in U.
        FLA_QR_UT_create_T( A, &TQ );
        FLA_QR_UT( A, TQ );
        FLA_QR_UT_form_Q( A, TQ, U );
        FLA_Obj_free( &TQ );

        // Set the lower triangle of R to zero and then copy the upper
        // triangle of A to R.
        FLA_Part_2x1( A,   &AT,
                           &AB,   n_A, FLA_TOP );
        FLA_Obj_create( dt, n_A, n_A, 0, 0, &R );
        FLA_Setr( FLA_LOWER_TRIANGULAR, FLA_ZERO, R );
        FLA_Copyr( FLA_UPPER_TRIANGULAR, AT, R );

        // Reduce the matrix to bidiagonal form.
        // Apply scalars to rotate elements on the superdiagonal to the real domain.
        // Extract the diagonal and superdiagonal from A.
        FLA_Bidiag_UT( R, T, S );
        FLA_Bidiag_UT_realify( R, rL, rR );
        FLA_Bidiag_UT_extract_real_diagonals( R, d, e );

        // Form V from right Householder vectors in upper triangle of R.
        FLA_Bidiag_UT_form_V( R, S, V );

        // Form U in R.
        FLA_Bidiag_UT_form_U( R, T, R );

        // Apply the realifying scalars in rL and rR to U and V, respectively.
        FLA_Apply_diag_matrix( FLA_RIGHT, FLA_CONJUGATE,    rL, R );
        FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, rR, V );

        // Perform a singular value decomposition on the bidiagonal matrix.
        r_val = FLA_Bsvd_v_opt_var1( n_iter_max, d, e, G, H, R, V, b_alg );

        // Multiply R into U, storing the result in A and then copying back
        // to U.
        FLA_Part_1x2( U,   &UL, &UR,   n_A, FLA_LEFT );
        FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
                  FLA_ONE, UL, R, FLA_ZERO, A );
        FLA_Copy( A, UL );

        FLA_Obj_free( &R );
    }

    // Copy the converged eigenvalues to the output vector.
    FLA_Copy( d, s );

    // Sort the singular values and singular vectors in descending order.
    FLA_Sort_svd( FLA_BACKWARD, s, U, V );

    // If the matrix was scaled, rescale the singular values.
    if ( !FLA_Obj_equals( scale, FLA_ONE ) )
        FLA_Inv_scal( scale, s );

    FLA_Obj_free( &scale );
    FLA_Obj_free( &T );
    FLA_Obj_free( &S );
    FLA_Obj_free( &rL );
    FLA_Obj_free( &rR );
    FLA_Obj_free( &d );
    FLA_Obj_free( &e );
    FLA_Obj_free( &G );
    FLA_Obj_free( &H );

    return r_val;
}
Exemple #6
0
void libfla_test_qrut_experiment( test_params_t params,
                                  unsigned int  var,
                                  char*         sc_str,
                                  FLA_Datatype  datatype,
                                  unsigned int  p_cur,
                                  unsigned int  pci,
                                  unsigned int  n_repeats,
                                  signed int    impl,
                                  double*       perf,
                                  double*       residual )
{
	dim_t        b_flash    = params.b_flash;
	dim_t        b_alg_flat = params.b_alg_flat;
	double       time_min   = 1e9;
	double       time;
	unsigned int i;
	unsigned int m, n;
	unsigned int min_m_n;
	signed int   m_input    = -2;
	signed int   n_input    = -1;
	FLA_Obj      A, T, x, b, y, norm;
	FLA_Obj      A_save;
	FLA_Obj      A_test, T_test, x_test, b_test;

	// Determine the dimensions.
	if ( m_input < 0 ) m = p_cur * abs(m_input);
	else               m = p_cur;
	if ( n_input < 0 ) n = p_cur * abs(n_input);
	else               n = p_cur;

	// Compute the minimum dimension.
	min_m_n = min( m, n );

	// Create the matrices for the current operation.
	libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[0], m, n, &A );

	if ( impl == FLA_TEST_FLAT_FRONT_END ||
	     ( impl == FLA_TEST_FLAT_BLK_VAR && var == 1 ) )
		libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], b_alg_flat, min_m_n, &T );
	else if ( var == 2 )
		libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], min_m_n, min_m_n, &T );
	else
		libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], 1, min_m_n, &T );

	// Initialize the test matrices.
	FLA_Random_matrix( A );

	// Save the original object contents in a temporary object.
	FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &A_save );

	// Create vectors to form a linear system.
	FLA_Obj_create( datatype, n, 1, 0, 0, &x );
	FLA_Obj_create( datatype, m, 1, 0, 0, &b );
	FLA_Obj_create( datatype, n, 1, 0, 0, &y );

	// Create a real scalar object to hold the norm of A.
	FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );

	// Create a random right-hand side vector.
	FLA_Random_matrix( b );

	// Use hierarchical matrices if we're testing the FLASH front-end.
	if ( impl == FLA_TEST_HIER_FRONT_END )
	{
		FLASH_QR_UT_create_hier_matrices( A, 1, &b_flash, &A_test, &T_test );
		FLASH_Obj_create_hier_copy_of_flat( b, 1, &b_flash, &b_test );
		FLASH_Obj_create_hier_copy_of_flat( x, 1, &b_flash, &x_test );
	}
	else
	{
		A_test = A;
		T_test = T;
	}

	// Create a control tree for the individual variants.
	if ( impl == FLA_TEST_FLAT_UNB_VAR ||
	     impl == FLA_TEST_FLAT_OPT_VAR ||
	     impl == FLA_TEST_FLAT_BLK_VAR )
		libfla_test_qrut_cntl_create( var, b_alg_flat );

	// Repeat the experiment n_repeats times and record results.
	for ( i = 0; i < n_repeats; ++i )
	{
		if ( impl == FLA_TEST_HIER_FRONT_END )
			FLASH_Obj_hierarchify( A_save, A_test );
		else
			FLA_Copy_external( A_save, A_test );
		
		time = FLA_Clock();

		libfla_test_qrut_impl( impl, A_test, T_test );
		
		time = FLA_Clock() - time;
		time_min = min( time_min, time );
	}

	// Perform a linear solve with the result.
	if ( impl == FLA_TEST_HIER_FRONT_END )
	{
		FLASH_QR_UT_solve( A_test, T_test, b_test, x_test );
		FLASH_Obj_flatten( x_test, x );
	}
	else
    {
		FLA_QR_UT_solve( A_test, T_test, b, x );
	}

	// Free the hierarchical matrices if we're testing the FLASH front-end.
	if ( impl == FLA_TEST_HIER_FRONT_END )
	{
		FLASH_Obj_free( &A_test );
		FLASH_Obj_free( &T_test );
		FLASH_Obj_free( &b_test );
		FLASH_Obj_free( &x_test );
	}

	// Free the control trees if we're testing the variants.
	if ( impl == FLA_TEST_FLAT_UNB_VAR ||
	     impl == FLA_TEST_FLAT_OPT_VAR ||
	     impl == FLA_TEST_FLAT_BLK_VAR )
		libfla_test_qrut_cntl_free();

	// Compute the performance of the best experiment repeat.
	*perf = (         2.0   * m * n * n - 
	          ( 2.0 / 3.0 ) * n * n * n ) / time_min / FLOPS_PER_UNIT_PERF;
	if ( FLA_Obj_is_complex( A ) ) *perf *= 4.0;

	// Compute the residual.
	FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_ONE, A_save, x, FLA_MINUS_ONE, b );
	FLA_Gemv_external( FLA_CONJ_TRANSPOSE, FLA_ONE, A_save, b, FLA_ZERO, y );
	FLA_Nrm2_external( y, norm );
	FLA_Obj_extract_real_scalar( norm, residual );

	// Free the supporting flat objects.
	FLA_Obj_free( &x );
	FLA_Obj_free( &b );
	FLA_Obj_free( &y );
	FLA_Obj_free( &norm );
	FLA_Obj_free( &A_save );

	// Free the flat test matrices.
	FLA_Obj_free( &A );
	FLA_Obj_free( &T );
}
Exemple #7
0
void time_Apply_G_rf(
               int variant, int type, int n_repeats, int m, int k, int n, int b_alg,
               FLA_Obj A, FLA_Obj A_ref, FLA_Obj G, FLA_Obj P,
               double *dtime, double *diff, double *gflops )
{
  int irep;

  double
    dtime_old = 1.0e9;

  FLA_Obj
    A_save, G_save, norm;

  if ( FLA_Obj_is_real( A ) )
  {
    if (
       //( variant == 1 && type == FLA_ALG_UNB_OPT ) ||
       //( variant == 1 && type == FLA_ALG_UNB_ASM ) ||
       //( variant == 1 && type == FLA_ALG_BLOCKED ) ||
       //( variant == 2 && type == FLA_ALG_UNB_OPT ) ||
       //( variant == 2 && type == FLA_ALG_UNB_ASM ) ||
       //( variant == 2 && type == FLA_ALG_BLOCKED ) ||

       //( variant == 3 && type == FLA_ALG_UNB_OPT ) ||
       //( variant == 3 && type == FLA_ALG_UNB_ASM ) ||
       //( variant == 3 && type == FLA_ALG_BLOCKED ) ||
       //( variant == 6 && type == FLA_ALG_UNB_OPT ) ||
       //( variant == 6 && type == FLA_ALG_UNB_ASM ) ||
       //( variant == 6 && type == FLA_ALG_BLOCKED ) ||
       //( variant == 9 && type == FLA_ALG_UNB_OPT ) ||
       //( variant == 9 && type == FLA_ALG_UNB_ASM ) ||
       //( variant == 9 && type == FLA_ALG_BLOCKED ) ||

       ( variant == 4 ) ||
       ( variant == 5 ) ||
       ( variant == 7 ) ||
       ( variant == 8 ) ||
       FALSE
    ) 
    {
      *gflops = 0.0;
      *diff   = 0.0;
      return;
    }
  }
  else if ( FLA_Obj_is_complex( A ) )
  {
    if (
       //( variant == 1 && type == FLA_ALG_UNB_OPT ) ||
       //( variant == 1 && type == FLA_ALG_UNB_ASM ) ||
       //( variant == 1 && type == FLA_ALG_BLOCKED ) ||
       //( variant == 2 && type == FLA_ALG_UNB_OPT ) ||
       //( variant == 2 && type == FLA_ALG_UNB_ASM ) ||
       //( variant == 2 && type == FLA_ALG_BLOCKED ) ||

       //( variant == 3 && type == FLA_ALG_UNB_OPT ) ||
       //( variant == 3 && type == FLA_ALG_UNB_ASM ) ||
       //( variant == 3 && type == FLA_ALG_BLOCKED ) ||
       //( variant == 6 && type == FLA_ALG_UNB_OPT ) ||
       //( variant == 6 && type == FLA_ALG_UNB_ASM ) ||
       //( variant == 6 && type == FLA_ALG_BLOCKED ) ||
       //( variant == 9 && type == FLA_ALG_UNB_OPT ) ||
       //( variant == 9 && type == FLA_ALG_UNB_ASM ) ||
       //( variant == 9 && type == FLA_ALG_BLOCKED ) ||

       ( variant == 4 ) ||
       ( variant == 5 ) ||
       ( variant == 7 ) ||
       ( variant == 8 ) ||
       FALSE
    )
    {
      *gflops = 0.0;
      *diff   = 0.0;
      return;
    }
  }

  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_save );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, G, &G_save );
  FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );

  //dim_t b_flash_m = b_alg;
  //dim_t b_flash_n = n;
  //FLASH_Obj_create_hier_copy_of_flat_ext( A, 1, &b_flash_m, &b_flash_n, &AH ); 

//printf ( "flash dims: %d x %d\n", FLA_Obj_length( AH ), FLA_Obj_width( AH ) );

  FLA_Copy_external( A, A_save );
  FLA_Copy_external( G, G_save );

  for ( irep = 0 ; irep < n_repeats; irep++ ){

    FLA_Copy_external( A_save, A );
    FLA_Copy_external( G_save, G );
    //FLASH_Obj_hierarchify( A_save, AH );

    *dtime = FLA_Clock();

    switch( variant ){

    case 0:
      break;

    // Time variant 1
    case 1:
    {
      switch( type ){
      case FLA_ALG_UNB_OPT:
        FLA_Apply_G_rf_opt_var1( G, A );
        break;
      case FLA_ALG_UNB_ASM:
        FLA_Apply_G_rf_asm_var1( G, A );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Apply_G_rf_blk_var1( G, A, b_alg );
        break;
      }
      break;
    }

    // Time variant 2
    case 2:
    {
      switch( type ){
      case FLA_ALG_UNB_OPT:
        FLA_Apply_G_rf_opt_var2( G, A );
        break;
      case FLA_ALG_UNB_ASM:
        FLA_Apply_G_rf_asm_var2( G, A );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Apply_G_rf_blk_var2( G, A, b_alg );
        break;
      }
      break;
    }

    // Time variant 3
    case 3:
    {
      switch( type ){
      case FLA_ALG_UNB_OPT:
        FLA_Apply_G_rf_opt_var3( G, A );
        break;
      case FLA_ALG_UNB_ASM:
        FLA_Apply_G_rf_asm_var3( G, A );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Apply_G_rf_blk_var3( G, A, b_alg );
        break;
      }
      break;
    }

    // Time variant 6
    case 6:
    {
      switch( type ){
      case FLA_ALG_UNB_OPT:
        FLA_Apply_G_rf_opt_var6( G, A );
        break;
      case FLA_ALG_UNB_ASM:
        FLA_Apply_G_rf_asm_var6( G, A );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Apply_G_rf_blk_var6( G, A, b_alg );
        break;
      }
      break;
    }

    // Time variant 9
    case 9:
    {
      switch( type ){
      case FLA_ALG_UNB_OPT:
        FLA_Apply_G_rf_opt_var9( G, A );
        break;
      case FLA_ALG_UNB_ASM:
        FLA_Apply_G_rf_asm_var9( G, A );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Apply_G_rf_blk_var9( G, A, b_alg );
        break;
      }
      break;
    }


    }

    *dtime = FLA_Clock() - *dtime;
    dtime_old = min( *dtime, dtime_old );

  }

  if ( variant == 1 && type == FLA_ALG_UNB_OPT )
  {
    //FLA_Obj_show( "A_ref", A, "%9.2e + %9.2e ", "" );
    //FLA_Obj_show( "A", A, "%9.2e ", "" );

    FLA_Copy( A, A_ref );
    *diff = 0.0;
  }
  else
  {
    //FLA_Obj_show( "A", A, "%9.2e + %9.2e ", "" );

//if ( variant == 7 && type == FLA_ALG_UNB_ASM )
    //FLA_Obj_show( "A", A, "%9.2e", "" );

    //if ( variant == 9 ) FLASH_Obj_flatten( AH, A );

    FLA_Axpy( FLA_MINUS_ONE, A_ref, A );
    FLA_Norm_frob( A, norm );
    FLA_Obj_extract_real_scalar( norm, diff );

    //*diff = FLA_Max_elemwise_diff( A_ref, A );
  }


  *gflops = 6.0 * k * m * ( n - 1 ) /
            dtime_old / 1e9;

  if ( FLA_Obj_is_complex( A ) )
    *gflops *= 2.0;

  *dtime = dtime_old;

  FLA_Copy_external( A_save, A );
  FLA_Copy_external( G_save, G );

  //FLASH_Obj_free( &AH );

  FLA_Obj_free( &A_save );
  FLA_Obj_free( &G_save );
  FLA_Obj_free( &norm );
}
void time_Eig_gest_nu(
               int variant, int type, int n_repeats, int n, int b_alg,
               FLA_Inv inv, FLA_Uplo uplo, FLA_Obj A, FLA_Obj Y, FLA_Obj B,
               double *dtime, double *diff, double *gflops )
{
  int
    irep;

  double
    dtime_save = 1.0e9;

  FLA_Obj
    A_save, B_save, norm;

  fla_blocksize_t* bp;
  fla_eig_gest_t*  cntl_eig_gest_var;
  fla_eig_gest_t*  cntl_eig_gest_unb;


  if ( ( type == FLA_ALG_UNBLOCKED || type == FLA_ALG_UNB_OPT ) &&
       n > 300 )
  {
    *gflops = 0.0;
    *diff   = 0.0;
    return;
  }

  if ( variant == 3 )
  {
    *gflops = 0.0;
    *diff   = 0.0;
    return;
  }


  bp                = FLA_Blocksize_create( b_alg, b_alg, b_alg, b_alg );
  cntl_eig_gest_unb = FLA_Cntl_eig_gest_obj_create( FLA_FLAT,
                                                    //FLA_UNBLOCKED_VARIANT1,
                                                    FLA_UNB_OPT_VARIANT1,
                                                    NULL,
                                                    NULL,
                                                    NULL,
                                                    NULL,
                                                    NULL,
                                                    NULL,
                                                    NULL,
                                                    NULL,
                                                    NULL,
                                                    NULL,
                                                    NULL,
                                                    NULL,
                                                    NULL );
  cntl_eig_gest_var = FLA_Cntl_eig_gest_obj_create( FLA_FLAT,
                                                    variant,
                                                    bp,
                                                    cntl_eig_gest_unb,
                                                    fla_axpy_cntl_blas,
                                                    fla_axpy_cntl_blas,
                                                    fla_gemm_cntl_blas,
                                                    fla_gemm_cntl_blas,
                                                    fla_gemm_cntl_blas,
                                                    fla_hemm_cntl_blas,
                                                    fla_her2k_cntl_blas,
                                                    fla_trmm_cntl_blas,
                                                    fla_trmm_cntl_blas,
                                                    fla_trsm_cntl_blas,
                                                    fla_trsm_cntl_blas );

  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_save );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, B, &B_save );

  FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );

  FLA_Copy_external( A, A_save );
  FLA_Copy_external( B, B_save );


  for ( irep = 0 ; irep < n_repeats; irep++ ){

    FLA_Copy_external( A_save, A );
    FLA_Copy_external( B_save, B );

    *dtime = FLA_Clock();

    switch( variant ){

    case 0:
      REF_Eig_gest_nu( A, B );
      break;

    case 1:
    {
      // Time variant 1
      switch( type )
      {
      case FLA_ALG_UNBLOCKED:
        FLA_Eig_gest_nu_unb_var1( A, Y, B );
        break;
      case FLA_ALG_UNB_OPT:
        FLA_Eig_gest_nu_opt_var1( A, Y, B );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Eig_gest_nu_blk_var1( A, Y, B, cntl_eig_gest_var );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    case 2:
    {
      // Time variant 2
      switch( type )
      {
      case FLA_ALG_UNBLOCKED:
        FLA_Eig_gest_nu_unb_var2( A, Y, B );
        break;
      case FLA_ALG_UNB_OPT:
        FLA_Eig_gest_nu_opt_var2( A, Y, B );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Eig_gest_nu_blk_var2( A, Y, B, cntl_eig_gest_var );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    case 3:
    {
      // Time variant 3
      switch( type )
      {
      case FLA_ALG_UNBLOCKED:
        //FLA_Eig_gest_nu_unb_var3( A, Y, B );
        break;
      case FLA_ALG_UNB_OPT:
        //FLA_Eig_gest_nu_opt_var3( A, Y, B );
        break;
      case FLA_ALG_BLOCKED:
        //FLA_Eig_gest_nu_blk_var3( A, Y, B, cntl_eig_gest_var );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    case 4:
    {
      // Time variant 4
      switch( type )
      {
      case FLA_ALG_UNBLOCKED:
        FLA_Eig_gest_nu_unb_var4( A, Y, B );
        break;
      case FLA_ALG_UNB_OPT:
        FLA_Eig_gest_nu_opt_var4( A, Y, B );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Eig_gest_nu_blk_var4( A, Y, B, cntl_eig_gest_var );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    case 5:
    {
      // Time variant 5
      switch( type )
      {
      case FLA_ALG_UNBLOCKED:
        FLA_Eig_gest_nu_unb_var5( A, Y, B );
        break;
      case FLA_ALG_UNB_OPT:
        FLA_Eig_gest_nu_opt_var5( A, Y, B );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Eig_gest_nu_blk_var5( A, Y, B, cntl_eig_gest_var );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    }

    *dtime = FLA_Clock() - *dtime;
    dtime_save = min( *dtime, dtime_save );
  }

  FLA_Cntl_obj_free( cntl_eig_gest_var );
  FLA_Cntl_obj_free( cntl_eig_gest_unb );
  FLA_Blocksize_free( bp );

  // Recover A.
  if ( inv == FLA_NO_INVERSE )
  {
    if ( uplo == FLA_LOWER_TRIANGULAR )
    {
      // A = L' * A_orig * L
      // A_orig = inv(L') * A * inv(L)
      FLA_Hermitianize( FLA_LOWER_TRIANGULAR, A );
      FLA_Trsm_external( FLA_LEFT, FLA_LOWER_TRIANGULAR,
                         FLA_CONJ_TRANSPOSE, FLA_NONUNIT_DIAG,
                         FLA_ONE, B, A );
      FLA_Trsm_external( FLA_RIGHT, FLA_LOWER_TRIANGULAR,
                         FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG,
                         FLA_ONE, B, A );
    }
    else // if ( uplo == FLA_UPPER_TRIANGULAR )
    {
      // A = U * A_orig * U'
      // A_orig = inv(U) * A * inv(U')
      FLA_Hermitianize( FLA_UPPER_TRIANGULAR, A );
      FLA_Trsm_external( FLA_LEFT, FLA_UPPER_TRIANGULAR,
                         FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG,
                         FLA_ONE, B, A );
      FLA_Trsm_external( FLA_RIGHT, FLA_UPPER_TRIANGULAR,
                         FLA_CONJ_TRANSPOSE, FLA_NONUNIT_DIAG,
                         FLA_ONE, B, A );
    }
  }
  else // if ( inv == FLA_INVERSE )
  {
    if ( uplo == FLA_LOWER_TRIANGULAR )
    {
      // A = inv(L) * A_orig * inv(L')
      // A_orig = L * A * L'
      FLA_Hermitianize( FLA_LOWER_TRIANGULAR, A );
      FLA_Trmm_external( FLA_LEFT, FLA_LOWER_TRIANGULAR,
                         FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG,
                         FLA_ONE, B, A );
      FLA_Trmm_external( FLA_RIGHT, FLA_LOWER_TRIANGULAR,
                         FLA_CONJ_TRANSPOSE, FLA_NONUNIT_DIAG,
                         FLA_ONE, B, A );
    }
    else // if ( uplo == FLA_UPPER_TRIANGULAR )
    {
      // A = inv(U') * A_orig * inv(U)
      // A_orig = U' * A * U
      FLA_Hermitianize( FLA_UPPER_TRIANGULAR, A );
      FLA_Trmm_external( FLA_LEFT, FLA_UPPER_TRIANGULAR,
                         FLA_CONJ_TRANSPOSE, FLA_NONUNIT_DIAG,
                         FLA_ONE, B, A );
      FLA_Trmm_external( FLA_RIGHT, FLA_UPPER_TRIANGULAR,
                         FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG,
                         FLA_ONE, B, A );
    }
  }

  *diff = FLA_Max_elemwise_diff( A, A_save );

/*
if ( type == FLA_ALG_UNBLOCKED )
{
FLA_Obj_show( "A", A, "%10.3e", "" );
FLA_Obj_show( "A_orig", A_save, "%10.3e", "" );
}
*/

  *gflops = 1.0 * 
            FLA_Obj_length( A ) * 
            FLA_Obj_length( A ) * 
            FLA_Obj_length( A ) / 
            dtime_save / 1e9;

  if ( FLA_Obj_is_complex( A ) )
    *gflops *= 4.0;

  *dtime = dtime_save;

  FLA_Copy_external( A_save, A );
  FLA_Copy_external( B_save, B );

  FLA_Obj_free( &A_save );
  FLA_Obj_free( &B_save );
  FLA_Obj_free( &norm );
}
Exemple #9
0
void time_LQ_UT(
               int param_combo, int type, int nrepeats, int m, int n, int b_flash,
               FLA_Obj A, FLA_Obj TW, FLA_Obj b, FLA_Obj x,
               double *dtime, double *diff, double *gflops )
{
  int
    irep;

  double
    dtime_old = 1.0e9;

  FLA_Obj A_save;

  FLASH_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &A_save );

  for ( irep = 0 ; irep < nrepeats; irep++ )
  {
    FLASH_Copy( A_save, A );

    *dtime = FLA_Clock();

    switch( param_combo ){

    // Time parameter combination 0
    case 0:{
      switch( type ){
      case FLA_ALG_FRONT:
        FLASH_LQ_UT( A, TW );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    }
	
    *dtime = FLA_Clock() - *dtime;
    dtime_old = min( *dtime, dtime_old );
  }

  {
    FLA_Obj A_save_flat, x_flat, b_flat, y_flat;
    FLA_Obj norm;

    FLASH_Obj_create_flat_copy_of_hier( A_save, &A_save_flat );
    FLASH_Obj_create_flat_copy_of_hier( b, &b_flat );
    FLASH_Obj_create_flat_conf_to_hier( FLA_NO_TRANSPOSE, x, &x_flat );
    FLASH_Obj_create_flat_conf_to_hier( FLA_NO_TRANSPOSE, x, &y_flat );
    FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );

/*
{
FLA_Obj At, Tt;

FLASH_Obj_create_flat_copy_of_hier( A_save, &At );
FLA_Obj_create( FLA_Obj_datatype( A_save ), b_flash, FLA_Obj_min_dim( At ), 0, 0, &Tt );

FLA_LQ_UT( At, Tt );

FLASH_Obj_show( "A_save", A_save, "%9.1e", "" );
FLASH_Obj_show( "A", A, "%9.1e", "" );
FLA_Obj_show( "At", At, "%9.1e", "" );

FLA_Obj_free( &At );
FLA_Obj_free( &Tt );
}
*/

    FLASH_LQ_UT_solve( A, TW, b, x );

    FLASH_Obj_flatten( x, x_flat );

    FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_ONE, A_save_flat, x_flat, FLA_MINUS_ONE, b_flat );
    FLA_Gemv_external( FLA_CONJ_TRANSPOSE, FLA_ONE, A_save_flat, b_flat, FLA_ZERO, y_flat );
    FLA_Nrm2_external( y_flat, norm );
    FLA_Obj_extract_real_scalar( norm, diff );

    FLA_Obj_free( &A_save_flat );
    FLA_Obj_free( &b_flat );
    FLA_Obj_free( &x_flat );
    FLA_Obj_free( &y_flat );
    FLA_Obj_free( &norm );
  }

  *gflops = (         2.0   * m * n * n -
              ( 2.0 / 3.0 ) * n * n * n ) /
            dtime_old / 
            1.0e9;

  if ( FLA_Obj_is_complex( A ) )
    *gflops *= 4.0;

  *dtime = dtime_old;

  FLASH_Obj_free( &A_save );
}
FLA_Error FLA_Tridiag_unb_external( FLA_Uplo uplo, FLA_Obj A, FLA_Obj t )
{
  int          info = 0;
#ifdef FLA_ENABLE_EXTERNAL_LAPACK_INTERFACES
  FLA_Datatype datatype;
  int          n_A, cs_A;
  FLA_Obj      d, e;
  char         blas_uplo;

  if ( FLA_Check_error_level() == FLA_FULL_ERROR_CHECKING )
    FLA_Tridiag_check( uplo, A, t );

  if ( FLA_Obj_has_zero_dim( A ) ) return FLA_SUCCESS;

  datatype = FLA_Obj_datatype( A );

  n_A      = FLA_Obj_width( A );
  cs_A     = FLA_Obj_col_stride( A );

  FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), n_A,     1, 0, 0, &d );
  FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), n_A - 1, 1, 0, 0, &e );

  FLA_Param_map_flame_to_netlib_uplo( uplo, &blas_uplo );


  switch( datatype ){

  case FLA_FLOAT:
  {
    float* buff_A    = ( float * ) FLA_FLOAT_PTR( A );
    float* buff_d    = ( float * ) FLA_FLOAT_PTR( d );
    float* buff_e    = ( float * ) FLA_FLOAT_PTR( e );
    float* buff_t    = ( float * ) FLA_FLOAT_PTR( t );

    F77_ssytd2( &blas_uplo,
                &n_A,
                buff_A, &cs_A,
                buff_d,
                buff_e,
                buff_t,
                &info );

    break;
  }

  case FLA_DOUBLE:
  {
    double* buff_A    = ( double * ) FLA_DOUBLE_PTR( A );
    double* buff_d    = ( double * ) FLA_DOUBLE_PTR( d );
    double* buff_e    = ( double * ) FLA_DOUBLE_PTR( e );
    double* buff_t    = ( double * ) FLA_DOUBLE_PTR( t );

    F77_dsytd2( &blas_uplo,
                &n_A,
                buff_A, &cs_A,
                buff_d,
                buff_e,
                buff_t,
                &info );

    break;
  } 

  case FLA_COMPLEX:
  {
    scomplex* buff_A    = ( scomplex * ) FLA_COMPLEX_PTR( A );
    float*    buff_d    = ( float    * ) FLA_FLOAT_PTR( d );
    float*    buff_e    = ( float    * ) FLA_FLOAT_PTR( e );
    scomplex* buff_t    = ( scomplex * ) FLA_COMPLEX_PTR( t );

    F77_chetd2( &blas_uplo,
                &n_A,
                buff_A, &cs_A,
                buff_d,
                buff_e,
                buff_t,
                &info );

    break;
  } 

  case FLA_DOUBLE_COMPLEX:
  {
    dcomplex* buff_A    = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( A );
    double*   buff_d    = ( double   * ) FLA_DOUBLE_PTR( d );
    double*   buff_e    = ( double   * ) FLA_DOUBLE_PTR( e );
    dcomplex* buff_t    = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( t );

    F77_zhetd2( &blas_uplo,
                &n_A,
                buff_A, &cs_A,
                buff_d,
                buff_e,
                buff_t,
                &info );

    break;
  } 

  }

  FLA_Obj_free( &d );
  FLA_Obj_free( &e );
#else
  FLA_Check_error_code( FLA_EXTERNAL_LAPACK_NOT_IMPLEMENTED );
#endif

  return info;
}
FLA_Error FLA_Tevdr_external( FLA_Evd_type jobz, FLA_Obj d, FLA_Obj e, FLA_Obj l, FLA_Obj A )
{
  int          info = 0;
#ifdef FLA_ENABLE_EXTERNAL_LAPACK_INTERFACES
  FLA_Datatype datatype;
  FLA_Datatype dt_real;
  int          n_A, cs_A;
  int          lisuppz, lwork, liwork;
  FLA_Obj      isuppz, work, iwork;
  char         blas_jobz;
  char         blas_range;
  int          i;
  int          vl, vu;
  int          il, iu;
  int          nzc;
  int          try_rac;
  int          n_eig_found;

  //if ( FLA_Check_error_level() == FLA_FULL_ERROR_CHECKING )
  //  FLA_Tevdd_check( jobz, d, e, A );

  if ( FLA_Obj_has_zero_dim( A ) ) return FLA_SUCCESS;

  datatype = FLA_Obj_datatype( A );
  dt_real  = FLA_Obj_datatype_proj_to_real( A );

  n_A      = FLA_Obj_width( A );
  cs_A     = FLA_Obj_col_stride( A );

  FLA_Param_map_flame_to_netlib_evd_type( jobz, &blas_jobz );

  // Hard-code some parameters.
  blas_range = 'A';
  nzc        = n_A;
  try_rac    = TRUE;

  // Allocate space for the isuppz array.
  lisuppz = 2 * n_A;
  FLA_Obj_create( FLA_INT, lisuppz, 1, 0, 0, &isuppz );

  // Make a workspace query the first time through. This will provide us with
  // and ideal workspace size.
  lwork = -1;
  liwork = -1;
  FLA_Obj_create( dt_real, 1, 1, 0, 0, &work );
  FLA_Obj_create( FLA_INT, 1, 1, 0, 0, &iwork );

  for ( i = 0; i < 2; ++i )
  {
    if ( i == 1 )
    {
      // Grab the queried ideal workspace size from the work arrays, free the
      // work object, and then re-allocate the workspace with the ideal size.
      if      ( datatype == FLA_FLOAT || datatype == FLA_COMPLEX )
      {
        lwork  = ( int ) *FLA_FLOAT_PTR( work );
        liwork = ( int ) *FLA_INT_PTR( iwork );
      }
      else if ( datatype == FLA_DOUBLE || datatype == FLA_DOUBLE_COMPLEX )
      {
        lwork  = ( int ) *FLA_DOUBLE_PTR( work );
        liwork = ( int ) *FLA_INT_PTR( iwork );
      }
//printf( "ideal workspace for n = %d\n", n_A );
//printf( "                lwork = %d\n", lwork );
//printf( "               liwork = %d\n", liwork );
      FLA_Obj_free( &work );
      FLA_Obj_free( &iwork );
      FLA_Obj_create( dt_real, lwork,  1, 0, 0, &work );
      FLA_Obj_create( FLA_INT, liwork, 1, 0, 0, &iwork );
    }

    switch( datatype ) {

    case FLA_FLOAT:
    {
      float*    buff_d      = ( float*    ) FLA_FLOAT_PTR( d );
      float*    buff_e      = ( float*    ) FLA_FLOAT_PTR( e );
      float*    buff_l      = ( float*    ) FLA_FLOAT_PTR( l );
      float*    buff_A      = ( float*    ) FLA_FLOAT_PTR( A );
      int*      buff_isuppz = ( int*      ) FLA_INT_PTR( isuppz );
      float*    buff_work   = ( float*    ) FLA_FLOAT_PTR( work );
      int*      buff_iwork  = ( int*      ) FLA_INT_PTR( iwork );
 
      F77_sstemr( &blas_jobz,
                  &blas_range,
                  &n_A,
                  buff_d,
                  buff_e,
                  &vl, &vu,
                  &il, &iu,
                  &n_eig_found,
                  buff_l,
                  buff_A,     &cs_A,
                  &nzc,
                  buff_isuppz,
                  &try_rac,
                  buff_work,  &lwork,
                  buff_iwork, &liwork,
                  &info );

      break;
    }

    case FLA_DOUBLE:
    {
      double*   buff_d      = ( double*   ) FLA_DOUBLE_PTR( d );
      double*   buff_e      = ( double*   ) FLA_DOUBLE_PTR( e );
      double*   buff_l      = ( double*   ) FLA_DOUBLE_PTR( l );
      double*   buff_A      = ( double*   ) FLA_DOUBLE_PTR( A );
      int*      buff_isuppz = ( int*      ) FLA_INT_PTR( isuppz );
      double*   buff_work   = ( double*   ) FLA_DOUBLE_PTR( work );
      int*      buff_iwork  = ( int*      ) FLA_INT_PTR( iwork );

      F77_dstemr( &blas_jobz,
                  &blas_range,
                  &n_A,
                  buff_d,
                  buff_e,
                  &vl, &vu,
                  &il, &iu,
                  &n_eig_found,
                  buff_l,
                  buff_A,     &cs_A,
                  &nzc,
                  buff_isuppz,
                  &try_rac,
                  buff_work,  &lwork,
                  buff_iwork, &liwork,
                  &info );
  
      break;
    } 
  
    case FLA_COMPLEX:
    {
      float*    buff_d      = ( float*    ) FLA_FLOAT_PTR( d );
      float*    buff_e      = ( float*    ) FLA_FLOAT_PTR( e );
      float*    buff_l      = ( float*    ) FLA_FLOAT_PTR( l );
      scomplex* buff_A      = ( scomplex* ) FLA_COMPLEX_PTR( A );
      int*      buff_isuppz = ( int*      ) FLA_INT_PTR( isuppz );
      float*    buff_work   = ( float*    ) FLA_FLOAT_PTR( work );
      int*      buff_iwork  = ( int*      ) FLA_INT_PTR( iwork );

      F77_cstemr( &blas_jobz,
                  &blas_range,
                  &n_A,
                  buff_d,
                  buff_e,
                  &vl, &vu,
                  &il, &iu,
                  &n_eig_found,
                  buff_l,
                  buff_A,     &cs_A,
                  &nzc,
                  buff_isuppz,
                  &try_rac,
                  buff_work,  &lwork,
                  buff_iwork, &liwork,
                  &info );
  
      break;
    } 
  
    case FLA_DOUBLE_COMPLEX:
    {
      double*   buff_d      = ( double*   ) FLA_DOUBLE_PTR( d );
      double*   buff_e      = ( double*   ) FLA_DOUBLE_PTR( e );
      double*   buff_l      = ( double*   ) FLA_DOUBLE_PTR( l );
      dcomplex* buff_A      = ( dcomplex* ) FLA_DOUBLE_COMPLEX_PTR( A );
      int*      buff_isuppz = ( int*      ) FLA_INT_PTR( isuppz );
      double*   buff_work   = ( double*   ) FLA_DOUBLE_PTR( work );
      int*      buff_iwork  = ( int*      ) FLA_INT_PTR( iwork );

      F77_zstemr( &blas_jobz,
                  &blas_range,
                  &n_A,
                  buff_d,
                  buff_e,
                  &vl, &vu,
                  &il, &iu,
                  &n_eig_found,
                  buff_l,
                  buff_A,     &cs_A,
                  &nzc,
                  buff_isuppz,
                  &try_rac,
                  buff_work,  &lwork,
                  buff_iwork, &liwork,
                  &info );
  
      break;
    } 

    }
  }

  FLA_Obj_free( &isuppz );
  FLA_Obj_free( &work );
  FLA_Obj_free( &iwork );
#else
  FLA_Check_error_code( FLA_EXTERNAL_LAPACK_NOT_IMPLEMENTED );
#endif

  return info;
}
FLA_Error FLA_Svdd_external( FLA_Svd_type jobz, FLA_Obj A, FLA_Obj s, FLA_Obj U, FLA_Obj V )
{
  int          info = 0;
#ifdef FLA_ENABLE_EXTERNAL_LAPACK_INTERFACES
  FLA_Datatype datatype;
  FLA_Datatype dt_real;
  FLA_Datatype dt_int;
  int          m_A, n_A, cs_A;
  int          cs_U;
  int          cs_V;
  int          min_m_n;
  int          lwork, lrwork, liwork;
  FLA_Obj      work, rwork, iwork;
  char         blas_jobz;
  int          i;

  if ( FLA_Check_error_level() == FLA_FULL_ERROR_CHECKING )
    FLA_Svdd_check( jobz, A, s, U, V );

  if ( FLA_Obj_has_zero_dim( A ) ) return FLA_SUCCESS;

  datatype = FLA_Obj_datatype( A );
  dt_real  = FLA_Obj_datatype_proj_to_real( A );
  dt_int   = FLA_INT;

  m_A      = FLA_Obj_length( A );
  n_A      = FLA_Obj_width( A );
  cs_A     = FLA_Obj_col_stride( A );

  cs_U     = FLA_Obj_col_stride( U );

  cs_V     = FLA_Obj_col_stride( V );

  min_m_n  = min( m_A, n_A );

  // Allocate the rwork and iwork arrays up front.
  if ( jobz == FLA_SVD_VECTORS_NONE ) lrwork   = 5 * min_m_n;
  else                                lrwork   = 5 * min_m_n * min_m_n +
                                                 7 * min_m_n;
  liwork = 8 * min_m_n;

  FLA_Obj_create( dt_int,  liwork, 1, 0, 0, &iwork );
  if ( FLA_Obj_is_complex( A ) )
    FLA_Obj_create( dt_real, lrwork, 1, 0, 0, &rwork );

  FLA_Param_map_flame_to_netlib_svd_type( jobz, &blas_jobz );

  // Make a workspace query the first time through. This will provide us with
  // and ideal workspace size based on an internal block size.
  lwork = -1;
  FLA_Obj_create( datatype, 1, 1, 0, 0, &work );

  for ( i = 0; i < 2; ++i )
  {
    if ( i == 1 )
    {
      // Grab the queried ideal workspace size from the work array, free the
      // work object, and then re-allocate the workspace with the ideal size.
      if      ( datatype == FLA_FLOAT || datatype == FLA_COMPLEX )
        lwork = ( int ) *FLA_FLOAT_PTR( work );
      else if ( datatype == FLA_DOUBLE || datatype == FLA_DOUBLE_COMPLEX )
        lwork = ( int ) *FLA_DOUBLE_PTR( work );

      FLA_Obj_free( &work );
      FLA_Obj_create( datatype, lwork, 1, 0, 0, &work );
    }

    switch( datatype ) {

    case FLA_FLOAT:
    {
      float*    buff_A     = ( float*    ) FLA_FLOAT_PTR( A );
      float*    buff_s     = ( float*    ) FLA_FLOAT_PTR( s );
      float*    buff_U     = ( float*    ) FLA_FLOAT_PTR( U );
      float*    buff_V     = ( float*    ) FLA_FLOAT_PTR( V );
      float*    buff_work  = ( float*    ) FLA_FLOAT_PTR( work );
      int*      buff_iwork = ( int*      ) FLA_INT_PTR( iwork );
  
      F77_sgesdd( &blas_jobz,
                  &m_A,
                  &n_A,
                  buff_A,    &cs_A,
                  buff_s,
                  buff_U,    &cs_U,
                  buff_V,    &cs_V,
                  buff_work, &lwork,
                  buff_iwork,
                  &info );

      break;
    }

    case FLA_DOUBLE:
    {
      double*   buff_A     = ( double*   ) FLA_DOUBLE_PTR( A );
      double*   buff_s     = ( double*   ) FLA_DOUBLE_PTR( s );
      double*   buff_U     = ( double*   ) FLA_DOUBLE_PTR( U );
      double*   buff_V     = ( double*   ) FLA_DOUBLE_PTR( V );
      double*   buff_work  = ( double*   ) FLA_DOUBLE_PTR( work );
      int*      buff_iwork = ( int*      ) FLA_INT_PTR( iwork );
  
      F77_dgesdd( &blas_jobz,
                  &m_A,
                  &n_A,
                  buff_A,    &cs_A,
                  buff_s,
                  buff_U,    &cs_U,
                  buff_V,    &cs_V,
                  buff_work, &lwork,
                  buff_iwork,
                  &info );
  
      break;
    } 
  
    case FLA_COMPLEX:
    {
      scomplex* buff_A     = ( scomplex* ) FLA_COMPLEX_PTR( A );
      float*    buff_s     = ( float*    ) FLA_FLOAT_PTR( s );
      scomplex* buff_U     = ( scomplex* ) FLA_COMPLEX_PTR( U );
      scomplex* buff_V     = ( scomplex* ) FLA_COMPLEX_PTR( V );
      scomplex* buff_work  = ( scomplex* ) FLA_COMPLEX_PTR( work );
      float*    buff_rwork = ( float*    ) FLA_FLOAT_PTR( rwork );
      int*      buff_iwork = ( int*      ) FLA_INT_PTR( iwork );
  
      F77_cgesdd( &blas_jobz,
                  &m_A,
                  &n_A,
                  buff_A,    &cs_A,
                  buff_s,
                  buff_U,    &cs_U,
                  buff_V,    &cs_V,
                  buff_work, &lwork,
                  buff_rwork,
                  buff_iwork,
                  &info );
  
      break;
    } 
  
    case FLA_DOUBLE_COMPLEX:
    {
      dcomplex* buff_A     = ( dcomplex* ) FLA_DOUBLE_COMPLEX_PTR( A );
      double*   buff_s     = ( double*   ) FLA_DOUBLE_PTR( s );
      dcomplex* buff_U     = ( dcomplex* ) FLA_DOUBLE_COMPLEX_PTR( U );
      dcomplex* buff_V     = ( dcomplex* ) FLA_DOUBLE_COMPLEX_PTR( V );
      dcomplex* buff_work  = ( dcomplex* ) FLA_DOUBLE_COMPLEX_PTR( work );
      double*   buff_rwork = ( double*   ) FLA_DOUBLE_PTR( rwork );
      int*      buff_iwork = ( int*      ) FLA_INT_PTR( iwork );
  
      F77_zgesdd( &blas_jobz,
                  &m_A,
                  &n_A,
                  buff_A,    &cs_A,
                  buff_s,
                  buff_U,    &cs_U,
                  buff_V,    &cs_V,
                  buff_work, &lwork,
                  buff_rwork,
                  buff_iwork,
                  &info );
  
      break;
    } 

    }
  }

  FLA_Obj_free( &work );
  FLA_Obj_free( &iwork );
  if ( FLA_Obj_is_complex( A ) )
    FLA_Obj_free( &rwork );
#else
  FLA_Check_error_code( FLA_EXTERNAL_LAPACK_NOT_IMPLEMENTED );
#endif

  return info;
}
Exemple #13
0
void libfla_test_eig_gest_experiment( test_params_t params,
                                      unsigned int  var,
                                      char*         sc_str,
                                      FLA_Datatype  datatype,
                                      unsigned int  p_cur,
                                      unsigned int  pci,
                                      unsigned int  n_repeats,
                                      signed int    impl,
                                      double*       perf,
                                      double*       residual )
{
	dim_t        b_flash    = params.b_flash;
	dim_t        b_alg_flat = params.b_alg_flat;
	double       time_min   = 1e9;
	double       time;
	unsigned int i;
	unsigned int m;
	signed int   m_input    = -1;
	FLA_Uplo     inv;
	FLA_Uplo     uplo;
	FLA_Obj      A, B, Y, norm;
	FLA_Obj      A_save, B_save;
	FLA_Obj      A_test, B_test, Y_test;

	// Determine the dimensions.
	if ( m_input < 0 ) m = p_cur / abs(m_input);
	else               m = p_cur;

	// Translate parameter characters to libflame constants.
	FLA_Param_map_char_to_flame_inv( &pc_str[pci][0], &inv );
	FLA_Param_map_char_to_flame_uplo( &pc_str[pci][1], &uplo );

	if ( inv == FLA_NO_INVERSE &&
         ( ( impl == FLA_TEST_FLAT_UNB_VAR && var == 3 ) ||
	       ( impl == FLA_TEST_FLAT_OPT_VAR && var == 3 ) ||
	       ( impl == FLA_TEST_FLAT_BLK_VAR && var == 3 ) )
       )
	{
		*perf     = 0.0;
		*residual = 0.0;
		return;
	}

	// Create the matrices for the current operation.
	libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[0], m, m, &A );
	libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], m, m, &Y );
	libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[2], m, m, &B );

	// Initialize the test matrices.
	FLA_Random_spd_matrix( uplo, A );
    FLA_Scalr( uplo, FLA_TWO, A );
	FLA_Hermitianize( uplo, A );

	FLA_Random_spd_matrix( uplo, B );
    FLA_Scalr( uplo, FLA_TWO, B );
	FLA_Chol( uplo, B );

	// Save the original object contents in a temporary object.
	FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &A_save );
	FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, B, &B_save );

	// Create a real scalar object to hold the norm of A.
	FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );

	// Use hierarchical matrices if we're testing the FLASH front-end.
	if ( impl == FLA_TEST_HIER_FRONT_END )
	{
		FLASH_Obj_create_hier_copy_of_flat( A, 1, &b_flash, &A_test );
		FLASH_Obj_create_hier_copy_of_flat( Y, 1, &b_flash, &Y_test );
		FLASH_Obj_create_hier_copy_of_flat( B, 1, &b_flash, &B_test );
	}
	else
	{
		A_test = A;
		Y_test = Y;
		B_test = B;
	}

	// Create a control tree for the individual variants.
	if ( impl == FLA_TEST_FLAT_UNB_VAR ||
	     impl == FLA_TEST_FLAT_OPT_VAR ||
	     impl == FLA_TEST_FLAT_BLK_VAR )
		libfla_test_eig_gest_cntl_create( var, b_alg_flat );

	// Repeat the experiment n_repeats times and record results.
	for ( i = 0; i < n_repeats; ++i )
	{
		if ( impl == FLA_TEST_HIER_FRONT_END )
		{
			FLASH_Obj_hierarchify( A_save, A_test );
			FLASH_Obj_hierarchify( B_save, B_test );
		}
		else
		{
			FLA_Copy_external( A_save, A_test );
			FLA_Copy_external( B_save, B_test );
		}
		
		time = FLA_Clock();
		
		libfla_test_eig_gest_impl( impl, inv, uplo, A_test, Y_test, B_test );
		
		time = FLA_Clock() - time;
		time_min = min( time_min, time );
	}

	// Check our solution.
	if ( impl == FLA_TEST_HIER_FRONT_END )
	{
		FLA_Trans trans_left, trans_right;
		
		FLASH_Hermitianize( uplo, A_test );
		
		if ( ( inv == FLA_NO_INVERSE && uplo == FLA_LOWER_TRIANGULAR ) ||
		     ( inv == FLA_INVERSE    && uplo == FLA_UPPER_TRIANGULAR ) )
		{
			trans_left  = FLA_CONJ_TRANSPOSE;
			trans_right = FLA_NO_TRANSPOSE;
		}
		else
		{
			trans_left  = FLA_NO_TRANSPOSE;
			trans_right = FLA_CONJ_TRANSPOSE;
		}

		if ( inv == FLA_NO_INVERSE )
		{
			FLASH_Trsm( FLA_LEFT, uplo, trans_left, FLA_NONUNIT_DIAG,
			            FLA_ONE, B_test, A_test );
			FLASH_Trsm( FLA_RIGHT, uplo, trans_right, FLA_NONUNIT_DIAG,
			            FLA_ONE, B_test, A_test );
		}
		else // if ( inv == FLA_INVERSE )
		{
			FLASH_Trmm( FLA_LEFT, uplo, trans_left, FLA_NONUNIT_DIAG,
			            FLA_ONE, B_test, A_test );
			FLASH_Trmm( FLA_RIGHT, uplo, trans_right, FLA_NONUNIT_DIAG,
			            FLA_ONE, B_test, A_test );
		}
		FLASH_Obj_flatten( A_test, A );
	}
	else
	{
		FLA_Trans trans_left, trans_right;

		FLA_Hermitianize( uplo, A_test );

		if ( ( inv == FLA_NO_INVERSE && uplo == FLA_LOWER_TRIANGULAR ) ||
		     ( inv == FLA_INVERSE    && uplo == FLA_UPPER_TRIANGULAR ) )
		{
			trans_left  = FLA_CONJ_TRANSPOSE;
			trans_right = FLA_NO_TRANSPOSE;
		}
		else
		{
			trans_left  = FLA_NO_TRANSPOSE;
			trans_right = FLA_CONJ_TRANSPOSE;
		}

		if ( inv == FLA_NO_INVERSE )
		{
			FLA_Trsm( FLA_LEFT, uplo, trans_left, FLA_NONUNIT_DIAG,
			          FLA_ONE, B_test, A_test );
			FLA_Trsm( FLA_RIGHT, uplo, trans_right, FLA_NONUNIT_DIAG,
		              FLA_ONE, B_test, A_test );
		}
		else // if ( inv == FLA_INVERSE )
		{
			FLA_Trmm( FLA_LEFT, uplo, trans_left, FLA_NONUNIT_DIAG,
			          FLA_ONE, B_test, A_test );
			FLA_Trmm( FLA_RIGHT, uplo, trans_right, FLA_NONUNIT_DIAG,
		              FLA_ONE, B_test, A_test );
		}
	}

	// Free the hierarchical matrices if we're testing the FLASH front-end.
	if ( impl == FLA_TEST_HIER_FRONT_END )
	{
		FLASH_Obj_free( &A_test );
		FLASH_Obj_free( &Y_test );
		FLASH_Obj_free( &B_test );
	}

	// Free the control trees if we're testing the variants.
	if ( impl == FLA_TEST_FLAT_UNB_VAR ||
	     impl == FLA_TEST_FLAT_OPT_VAR ||
	     impl == FLA_TEST_FLAT_BLK_VAR )
		libfla_test_eig_gest_cntl_free();

	// Compute the performance of the best experiment repeat.
	*perf = 1.0 * m * m * m / time_min / FLOPS_PER_UNIT_PERF;
	if ( FLA_Obj_is_complex( A ) ) *perf *= 4.0;

	// Compute the residual.
	FLA_Axpy_external( FLA_MINUS_ONE, A_save, A );
	FLA_Norm1( A, norm );
	FLA_Obj_extract_real_scalar( norm, residual );

	// Free the supporting flat objects.
	FLA_Obj_free( &norm );
	FLA_Obj_free( &A_save );
	FLA_Obj_free( &B_save );

	// Free the flat test matrices.
	FLA_Obj_free( &A );
	FLA_Obj_free( &Y );
	FLA_Obj_free( &B );
}
void time_Apply_Q_UT_lnfc(
               int variant, int type, int n_repeats, int m, int n, int nb_alg,
               FLA_Obj A, FLA_Obj A_orig, FLA_Obj t, FLA_Obj T, FLA_Obj s, FLA_Obj S, FLA_Obj B,
               double *dtime, double *diff, double *gflops )
{
  int
    irep;

  double
    dtime_old = 1.0e9;

  FLA_Obj
    A_save, A_orig_save, B_save, norm;

  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_save );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_orig_save );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, B, &B_save );

  FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );

  FLA_Copy_external( A, A_save );
  FLA_Copy_external( A, A_orig_save );
  FLA_Copy_external( B, B_save );

  for ( irep = 0 ; irep < n_repeats; irep++ ){

    FLA_Copy_external( A_save, A );
    FLA_Copy_external( A_orig_save, A_orig );
    FLA_Copy_external( B_save, B );

    *dtime = FLA_Clock();

    switch( variant )
    {

    case 0:
      REF_Apply_Q_UT_lnfc( A, t, B );
      //REF_Bidiag_form_U_blk_external( FLA_LEFT, FLA_NO_TRANSPOSE, A, t, B );
      //FLA_Bidiag_blk_external( A_orig, t, s );
      //REF_Bidiag_form_U_blk_external( FLA_LEFT, FLA_NO_TRANSPOSE, A_orig, t, B );
      break;

    case 1:
    {
      // Time variant 1 
      switch( type ){
      case FLA_ALG_BLOCKED:
        //FLA_Apply_Q_UT( FLA_LEFT, FLA_NO_TRANSPOSE, FLA_FORWARD, FLA_COLUMNWISE, A, T, W, B );
        FLA_QR_UT_form_Q( A, T, B );
        //FLA_Bidiag_UT_form_U( A, T, B );
        //FLA_Bidiag_UT( A_orig, T, S );
        //FLA_Bidiag_UT_form_U( A_orig, T, B );
        break;
      }

      break;
    }


    }

    *dtime = FLA_Clock() - *dtime;
    dtime_old = min( *dtime, dtime_old );

  }



/*
  if ( variant == 0 )
  {
    FLA_Copy_external( b, b_ref );
    if ( FLA_Obj_is_real( A ) )
      FLA_Apply_Q_blk_external( FLA_LEFT, FLA_TRANSPOSE, FLA_COLUMNWISE, A, t, b );
    else
      FLA_Apply_Q_blk_external( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_COLUMNWISE, A, t, b );
    FLA_Trsm_external( FLA_LEFT, FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE,
                       FLA_NONUNIT_DIAG, FLA_ONE, A, b );
    FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, A_save, b, FLA_ONE, b_ref );
    FLA_Nrm2_external( b_ref, norm );
    if ( FLA_Obj_is_single_precision( A ) )
      *diff = *(FLA_FLOAT_PTR(norm));
    else
      *diff = *(FLA_DOUBLE_PTR(norm));
  }
  else
*/
  {
    FLA_Obj_set_to_identity( A );
//FLA_Obj_show( "B", B, "%8.1e %8.1e ", "" );
    FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
                       FLA_ONE, B, B, FLA_MINUS_ONE, A );
    FLA_Norm_frob( A, norm );
    FLA_Obj_extract_real_scalar( norm, diff );

  }
/*
  *gflops = 2.0 * n * n * ( m - n / 3.0 ) /
            dtime_old / 1e9;
  if ( FLA_Obj_is_complex( A ) )
    *gflops *= 4.0;
*/

    *gflops = ( 4.0 * ( 2.0           * m * n * n - 2.0 / 3.0 * n * n * n ) +
                4.0 * ( 4.0 / 3.0     * m * m * m ) +
                4.0 * ( 4.0 / 3.0     * n * n * n ) +
                      ( 13.0      * 2 * m * m     ) +
                2.0 * (       3.0 * 2 * m * m * m ) +
                2.0 * (       3.0 * 2 * n * n * n ) ) /
              dtime_old / 1e9;

  *dtime = dtime_old;

  FLA_Copy_external( A_save, A );
  FLA_Copy_external( A_orig_save, A_orig );
  FLA_Copy_external( B_save, B );

  FLA_Obj_free( &A_save );
  FLA_Obj_free( &A_orig_save );
  FLA_Obj_free( &B_save );
  FLA_Obj_free( &norm );
}
Exemple #15
0
void time_Lyap_h(
               int variant, int type, int n_repeats, int m, int nb_alg,
               FLA_Obj isgn, FLA_Obj A, FLA_Obj C, FLA_Obj C_ref, FLA_Obj scale,
               double *dtime, double *diff, double *gflops )
{
  int irep;

  double dtime_old = 1.0e9;

  FLA_Obj C_save, norm;

  fla_blocksize_t* bp;
  fla_lyap_t*      cntl_lyap_unb;
  fla_lyap_t*      cntl_lyap_opt;
  fla_lyap_t*      cntl_lyap_blk;

  if ( type == FLA_ALG_UNB_OPT && variant > 4 )
  {
    *gflops = 0.0;
    *diff   = 0.0;
    return;
  }

  bp               = FLA_Blocksize_create( nb_alg, nb_alg, nb_alg, nb_alg );
  cntl_lyap_unb    = FLA_Cntl_lyap_obj_create( FLA_FLAT,
                                               FLA_UNB_VAR_OFFSET + variant,
                                               NULL,
                                               NULL,
                                               NULL,
                                               NULL,
                                               NULL,
                                               NULL,
                                               NULL,
                                               NULL );
  cntl_lyap_opt    = FLA_Cntl_lyap_obj_create( FLA_FLAT,
                                               FLA_OPT_VAR_OFFSET + variant,
                                               NULL,
                                               NULL,
                                               NULL,
                                               NULL,
                                               NULL,
                                               NULL,
                                               NULL,
                                               NULL );
  cntl_lyap_blk    = FLA_Cntl_lyap_obj_create( FLA_FLAT,
                                               FLA_BLK_VAR_OFFSET + variant,
                                               bp,
                                               fla_scal_cntl_blas,
                                               fla_lyap_cntl_leaf,
                                               fla_sylv_cntl,
                                               fla_gemm_cntl_blas,
                                               fla_gemm_cntl_blas,
                                               fla_hemm_cntl_blas,
                                               fla_her2k_cntl_blas );

  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &C_save );
  FLA_Obj_create( FLA_Obj_datatype_proj_to_real( C ), 1, 1, 0, 0, &norm );

  FLA_Copy_external( C, C_save );


  for ( irep = 0 ; irep < n_repeats; irep++ )
  {
    FLA_Copy_external( C_save, C );

    *dtime = FLA_Clock();

    switch( variant )
    {

    case 0:
      REF_Lyap_h( isgn, A, C, scale );
      break;

    case 1:
    {
      switch( type )
      {
        case FLA_ALG_UNBLOCKED:
          FLA_Lyap_h_unb_var1( isgn, A, C );
          break;
        case FLA_ALG_UNB_OPT:
          FLA_Lyap_h_opt_var1( isgn, A, C );
          break;
        case FLA_ALG_BLOCKED:
          FLA_Lyap_h_blk_var1( isgn, A, C, scale, cntl_lyap_blk );
          break;
      }
      break;
    }

    case 2:
    {
      switch( type )
      {
        case FLA_ALG_UNBLOCKED:
          FLA_Lyap_h_unb_var2( isgn, A, C );
          break;
        case FLA_ALG_UNB_OPT:
          FLA_Lyap_h_opt_var2( isgn, A, C );
          break;
        case FLA_ALG_BLOCKED:
          FLA_Lyap_h_blk_var2( isgn, A, C, scale, cntl_lyap_blk );
          break;
      }
      break;
    }

    case 3:
    {
      switch( type )
      {
        case FLA_ALG_UNBLOCKED:
          FLA_Lyap_h_unb_var3( isgn, A, C );
          break;
        case FLA_ALG_UNB_OPT:
          FLA_Lyap_h_opt_var3( isgn, A, C );
          break;
        case FLA_ALG_BLOCKED:
          FLA_Lyap_h_blk_var3( isgn, A, C, scale, cntl_lyap_blk );
          break;
      }
      break;
    }

    case 4:
    {
      switch( type )
      {
        case FLA_ALG_UNBLOCKED:
          FLA_Lyap_h_unb_var4( isgn, A, C );
          break;
        case FLA_ALG_UNB_OPT:
          FLA_Lyap_h_opt_var4( isgn, A, C );
          break;
        case FLA_ALG_BLOCKED:
          FLA_Lyap_h_blk_var4( isgn, A, C, scale, cntl_lyap_blk );
          break;
      }
      break;
    }

    }

    *dtime = FLA_Clock() - *dtime;
    dtime_old = min( *dtime, dtime_old );
  }


  FLA_Blocksize_free( bp );
  FLA_Cntl_obj_free( cntl_lyap_unb );
  FLA_Cntl_obj_free( cntl_lyap_opt );
  FLA_Cntl_obj_free( cntl_lyap_blk );


/*
  if ( variant == 0 )
  {
    FLA_Copy_external( C, C_ref );
    *diff = 0.0;
  }
  else
  {
    FLA_Hermitianize( FLA_UPPER_TRIANGULAR, C );
    *diff = FLA_Max_elemwise_diff( C, C_ref );
  }
*/
  {
    FLA_Obj X, W;

    FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &X );
    FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &W );

    FLA_Copy( C, X );
    FLA_Hermitianize( FLA_UPPER_TRIANGULAR, X );

    FLA_Gemm( FLA_CONJ_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A, X, FLA_ZERO, W );
    FLA_Gemm( FLA_NO_TRANSPOSE,   FLA_NO_TRANSPOSE, FLA_ONE, X, A, FLA_ONE,  W );
    FLA_Scal( isgn, W );
/*
    if ( variant == 3 && type == FLA_ALG_UNBLOCKED )
    {
      FLA_Obj_show( "W", W, "%10.3e + %10.3e ", "" );
      FLA_Obj_show( "C_save", C_save, "%10.3e + %10.3e ", "" );
    }
*/
    FLA_Axpy( FLA_MINUS_ONE, C_save, W );
    FLA_Norm1( W, norm );
    FLA_Obj_extract_real_scalar( norm, diff );

    FLA_Obj_free( &X );
    FLA_Obj_free( &W );
  }

  *gflops = ( 2.0 / 3.0 ) * ( m * m * m ) / 
            dtime_old / 1e9;

  if ( FLA_Obj_is_complex( C ) )
    *gflops *= 4.0;

  *dtime = dtime_old;

  FLA_Copy_external( C_save, C );

  FLA_Obj_free( &C_save );
  FLA_Obj_free( &norm );
}
Exemple #16
0
void time_Bidiag_UT(
                 int param_combo, int type, int nrepeats, int m, int n,
                 FLA_Obj A, FLA_Obj tu, FLA_Obj tv, FLA_Obj TU, FLA_Obj TV,
                 double *dtime, double *diff, double *gflops )
{
  int
    irep;

  double
    dtime_old = 1.0e9;

  FLA_Obj
    A_save, norm;


  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_save );

  FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );

  FLA_Copy_external( A, A_save );

  for ( irep = 0 ; irep < nrepeats; irep++ ){

    FLA_Copy_external( A_save, A );

    *dtime = FLA_Clock();

    switch( param_combo ){

    case 0:
    {
      switch( type )
      {
      case FLA_ALG_REFERENCE:
        REF_Bidiag_UT( A, tu, tv );
        break;
      case FLA_ALG_FRONT:
        FLA_Bidiag_UT( A, TU, TV );
        break;
      }

      break;
    }

    }

    *dtime = FLA_Clock() - *dtime;
    dtime_old = min( *dtime, dtime_old );

  }

  {
    FLA_Obj AL, AR;
    FLA_Obj ATL, ATR,
            ABL, ABR;
    FLA_Obj QU;
    FLA_Obj QV, QVL, QVR;
    FLA_Obj E, EL, ER;
    FLA_Obj F;
    FLA_Obj WU, WV, eye;
    FLA_Obj tvT,
            tvB;
    dim_t   m_A, n_A, m_TU;


//FLA_Obj_show( "A_save", A_save, "%10.3e", "" );

    m_A = FLA_Obj_length( A );
    n_A = FLA_Obj_width( A );
    m_TU = FLA_Obj_length( TU );

    FLA_Obj_create( FLA_Obj_datatype( A ), m_A,  m_A, 0, 0, &QU );
    FLA_Obj_create( FLA_Obj_datatype( A ), n_A,  n_A, 0, 0, &QV );

    FLA_Obj_create( FLA_Obj_datatype( A ), m_TU,  m_A, 0, 0, &WU );
    FLA_Obj_create( FLA_Obj_datatype( A ), m_TU,  n_A, 0, 0, &WV );

    FLA_Set_to_identity( QU );
    FLA_Set_to_identity( QV );

    FLA_Part_1x2( QV,   &QVL, &QVR,   1, FLA_LEFT );
    FLA_Part_1x2( A,    &AL,  &AR,    1, FLA_LEFT );
    FLA_Part_2x2( A,    &ATL, &ATR,
                        &ABL, &ABR,   1, 1, FLA_BL );
    FLA_Part_2x1( tv,   &tvT,
                        &tvB,   1, FLA_BOTTOM );

    if ( type == FLA_ALG_REFERENCE )
    {
      if ( FLA_Obj_is_real( A ) )
        FLA_Apply_Q_blk_external( FLA_LEFT, FLA_TRANSPOSE, FLA_COLUMNWISE, A, tu, QU );
      else
        FLA_Apply_Q_blk_external( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_COLUMNWISE, A, tu, QU );
      //FLA_Apply_Q_blk_external( FLA_RIGHT, FLA_NO_TRANSPOSE, FLA_ROWWISE, AR, tv, QVR );
      //
      // Need to apply backwards transformation, since vectors are stored columnwise.
      // QL? RQ?
      //
      //FLA_Apply_Q_blk_external( FLA_RIGHT, FLA_NO_TRANSPOSE, FLA_ROWWISE, ATR, tvT, QVR );
      //FLA_Apply_Q_blk_external( FLA_RIGHT, FLA_CONJ_TRANSPOSE, FLA_ROWWISE, ATR, tvT, QVR );
      //FLA_Apply_Q_blk_external( FLA_RIGHT, FLA_CONJ_TRANSPOSE, FLA_ROWWISE, AR, tvT, QVR );
    }
    else
    {
      FLA_Apply_Q_UT( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_FORWARD, FLA_COLUMNWISE, A, TU, WU, QU );
      FLA_Apply_Q_UT( FLA_RIGHT, FLA_NO_TRANSPOSE, FLA_FORWARD, FLA_ROWWISE, AR, TV, WV, QVR );
    }

/*
    FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &eye );     
    FLA_Set_to_identity( eye );

    //FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
    //          FLA_ONE, QV, QV, FLA_MINUS_ONE, eye );
    FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
              FLA_ONE, QU, QU, FLA_MINUS_ONE, eye );

FLA_Obj_show( "eye", eye, "%10.3e", "" );
    FLA_Norm_frob( eye, norm );
    FLA_Obj_extract_real_scalar( norm, diff );
    FLA_Obj_free( &eye );
*/

    FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &E );
    FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &F );

    FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
              FLA_ONE, A_save, QV, FLA_ZERO, E );
    FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
              FLA_ONE, QU, E, FLA_ZERO, F );

//FLA_Obj_show( "A_save", A_save, "%10.3e", "" );

    FLA_Copy( A, E );
    FLA_Triangularize( FLA_UPPER_TRIANGULAR, FLA_NONUNIT_DIAG, E );
    FLA_Part_1x2( E,    &EL, &ER,      1, FLA_LEFT );
    FLA_Triangularize( FLA_LOWER_TRIANGULAR, FLA_NONUNIT_DIAG, ER );

//FLA_Obj_show( "B", E, "%10.3e", "" );
//FLA_Obj_show( "Q'AV", F, "%10.3e", "" );
//FLA_Obj_show( "B", E, "%10.3e + %10.3e ", "" );
//FLA_Obj_show( "Q'AV", F, "%10.3e + %10.3e ", "" );

    *diff = FLA_Max_elemwise_diff( E, F );
    FLA_Obj_free( &E );
    FLA_Obj_free( &F );

    FLA_Obj_free( &QU );
    FLA_Obj_free( &QV );
    FLA_Obj_free( &WU );
    FLA_Obj_free( &WV );
  }

  *gflops = 4.0 * n * n * ( m - n / 3.0 ) /
            dtime_old / 1e9;
  if ( FLA_Obj_is_complex( A ) )
    *gflops *= 4.0;

  *dtime = dtime_old;

  FLA_Copy_external( A_save, A );

  FLA_Obj_free( &A_save );
  FLA_Obj_free( &norm );
}
Exemple #17
0
void time_Tevd_v(
               int variant, int type, int n_repeats, int m, int k_accum, int b_alg, int n_iter_max,
               FLA_Obj A_orig, FLA_Obj d, FLA_Obj e, FLA_Obj G, FLA_Obj R, FLA_Obj W, FLA_Obj A, FLA_Obj l,
               double *dtime, double *diff1, double* diff2, double *gflops )
{
  int irep;

  double
    k, dtime_old = 1.0e9;

  FLA_Obj
    A_save, G_save, d_save, e_save;

  if (
       //( variant == 0 ) ||
       //( variant == 1 && type == FLA_ALG_UNB_OPT ) ||
       //( variant == 2 && type == FLA_ALG_UNB_OPT ) ||
       FALSE
     )
  {
    *dtime  = 0.0;
    *gflops = 0.0;
    *diff1  = 0.0;
    *diff2  = 0.0;
    return;
  }

  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_save );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, G, &G_save );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, d, &d_save );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, e, &e_save );

  FLA_Copy_external( A, A_save );
  FLA_Copy_external( G, G_save );
  FLA_Copy_external( d, d_save );
  FLA_Copy_external( e, e_save );

  for ( irep = 0 ; irep < n_repeats; irep++ ){

    FLA_Copy_external( A_save, A );
    FLA_Copy_external( G_save, G );
    FLA_Copy_external( d_save, d );
    FLA_Copy_external( e_save, e );

    *dtime = FLA_Clock();

    switch( variant ){

    case 0:
      REF_Tevd_v( d, e, A );
      break;

    // Time variant 1
    case 1:
    {
      switch( type ){
      case FLA_ALG_UNB_OPT:
        FLA_Tevd_v_opt_var1( n_iter_max, d, e, G, A, b_alg );
        break;
      }
      break;
    }

    // Time variant 2
    case 2:
    {
      switch( type ){
      case FLA_ALG_UNB_OPT:
        FLA_Tevd_v_opt_var2( n_iter_max, d, e, G, R, W, A, b_alg );
        break;
      }
      break;
    }

    }

    *dtime = FLA_Clock() - *dtime;
    dtime_old = min( *dtime, dtime_old );

  }
  {
    FLA_Obj V, A_rev_evd, norm, eye;

	FLA_Copy( d, l );

//FLA_Obj_show( "A_save", A_save, "%9.2e + %9.2e ", "" );
//FLA_Obj_show( "A_evd", A, "%9.2e + %9.2e ", "" );
	FLA_Sort_evd( FLA_FORWARD, l, A );

    FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &V ); 
    FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_rev_evd ); 
    FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &eye ); 
    FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );


    FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, l, A );

    FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
              FLA_ONE, A, V, FLA_ZERO, A_rev_evd );
    FLA_Triangularize( FLA_LOWER_TRIANGULAR, FLA_NONUNIT_DIAG, A_rev_evd );

/*
    FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
              FLA_ONE, A, D, FLA_ZERO, A_rev_evd );
    FLA_Copy( A_rev_evd, D );
    FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
              FLA_ONE, D, V, FLA_ZERO, A_rev_evd );
    FLA_Triangularize( FLA_LOWER_TRIANGULAR, FLA_NONUNIT_DIAG, A_rev_evd );
*/
//FLA_Obj_show( "A_rev_evd", A_rev_evd, "%9.2e + %9.2e ", "" );
 
    FLA_Axpy( FLA_MINUS_ONE, A_orig, A_rev_evd );
    FLA_Norm_frob( A_rev_evd, norm );
    FLA_Obj_extract_real_scalar( norm, diff1 );
    //*diff = FLA_Max_elemwise_diff( A_orig, A_rev_evd );

    FLA_Set_to_identity( eye );
	FLA_Copy( V, A_rev_evd );
    FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
              FLA_ONE, V, A_rev_evd, FLA_MINUS_ONE, eye );
    FLA_Norm_frob( eye, norm );
    FLA_Obj_extract_real_scalar( norm, diff2 );

/*
FLA_Obj_free( &EL );
FLA_Obj_free( &EU );
FLA_Obj_free( &D );
FLA_Obj_free( &dc );
FLA_Obj_free( &ec );
*/

    FLA_Obj_free( &V );
    FLA_Obj_free( &A_rev_evd );
    FLA_Obj_free( &eye );
    FLA_Obj_free( &norm );
  }

  k = 2.00;

  if ( FLA_Obj_is_complex( A ) )
  {
    *gflops = (
                      (       4.5 * k * m * m     ) +
                2.0 * (       3.0 * k * m * m * m ) ) / 
              dtime_old / 1e9;
  }
  else 
  {
    *gflops = (
                      (       4.5 * k * m * m     ) +
                1.0 * (       3.0 * k * m * m * m ) ) / 
              dtime_old / 1e9;
  }

  *dtime = dtime_old;

  FLA_Copy_external( A_save, A );
  FLA_Copy_external( G_save, G );
  FLA_Copy_external( d_save, d );
  FLA_Copy_external( e_save, e );

  FLA_Obj_free( &A_save );
  FLA_Obj_free( &G_save );
  FLA_Obj_free( &d_save );
  FLA_Obj_free( &e_save );
}
Exemple #18
0
int main(int argc, char *argv[])
{
  int 
    m_input,
    m,
    p_first, p_last, p_inc,
    p,
    b_alg,
    variant,
    n_repeats,
    i,
    datatype,
    n_variants = 1;
  
  char *colors = "brkgmcbrkg";
  char *ticks  = "o+*xso+*xs";
  char m_dim_desc[14];
  char m_dim_tag[10];

  double max_gflops=6.0;
  double safemin;

  double
    dtime,
    gflops,
    diff;

  FLA_Obj
    A, l, Q, T, W;
  

  FLA_Init();


  fprintf( stdout, "%c number of repeats:", '%' );
  scanf( "%d", &n_repeats );
  fprintf( stdout, "%c %d\n", '%', n_repeats );

  fprintf( stdout, "%c Enter blocking size:", '%' );
  scanf( "%d", &b_alg );
  fprintf( stdout, "%c %d\n", '%', b_alg );

  fprintf( stdout, "%c enter problem size first, last, inc:", '%' );
  scanf( "%d%d%d", &p_first, &p_last, &p_inc );
  fprintf( stdout, "%c %d %d %d\n", '%', p_first, p_last, p_inc );

  fprintf( stdout, "%c enter m (-1 means bind to problem size): ", '%' );
  scanf( "%d", &m_input );
  fprintf( stdout, "%c %d\n", '%', m_input );


  fprintf( stdout, "\n" );


  if     ( m_input >  0 ) {
    sprintf( m_dim_desc, "m = %d", m_input );
    sprintf( m_dim_tag,  "m%dc", m_input);
  }
  else if( m_input <  -1 ) {
    sprintf( m_dim_desc, "m = p/%d", -m_input );
    sprintf( m_dim_tag,  "m%dp", -m_input );
  }
  else if( m_input == -1 ) {
    sprintf( m_dim_desc, "m = p" );
    sprintf( m_dim_tag,  "m%dp", 1 );
  }
/*
char ch = 's';
safemin = dlamch_( &ch );
printf( "safemin = %23.15e\n", safemin );
ch = 'e';
double eps = dlamch_( &ch );
printf( "eps dla = %23.15e\n", eps );
printf( "eps fla = %23.15e\n", FLA_EPSILON_D );
*/

  for ( p = p_first, i = 1; p <= p_last; p += p_inc, i += 1 )
  {

    m = m_input;

    if( m < 0 ) m = p / f2c_abs(m_input);

    //datatype = FLA_FLOAT;
    //datatype = FLA_DOUBLE;
    //datatype = FLA_COMPLEX;
    datatype = FLA_DOUBLE_COMPLEX;

    FLA_Obj_create( datatype,                           m,      m, 0, 0, &A );
    FLA_Obj_create( datatype,                           m,      m, 0, 0, &Q );
    FLA_Obj_create( datatype,                           32,     m, 0, 0, &T );
    FLA_Obj_create( datatype,                           32,     m, 0, 0, &W );
    FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), m,      1, 0, 0, &l );

    //FLA_Random_herm_matrix( FLA_LOWER_TRIANGULAR, A );
    //FLA_Random_spd_matrix( FLA_LOWER_TRIANGULAR, A );

    FLA_Random_matrix( A );
    FLA_Obj_set_to_identity( Q );
    FLA_QR_UT( A, T );
    FLA_Apply_Q_UT( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_FORWARD, FLA_COLUMNWISE, A, T, W, Q );
    fill_eigenvalues( l );
    //FLA_Obj_show( "eig", l, "%9.2e ", "" );
    FLA_Apply_diag_matrix( FLA_LEFT, FLA_NO_CONJUGATE, l, Q );
    FLA_Apply_Q_UT( FLA_LEFT, FLA_NO_TRANSPOSE, FLA_FORWARD, FLA_COLUMNWISE, A, T, W, Q );
    FLA_Triangularize( FLA_LOWER_TRIANGULAR, FLA_NONUNIT_DIAG, Q );
    FLA_Copy( Q, A );

    time_Hevd_ln( 0, FLA_ALG_REFERENCE, n_repeats, m, b_alg,
                  A, l, &dtime, &diff, &gflops );

    fprintf( stdout, "data_REFs( %d, 1:2 ) = [ %d %6.3lf %6.2le ]; \n", i, p, gflops, diff );
    fflush( stdout );

    time_Hevd_ln( -1, FLA_ALG_REFERENCE, n_repeats, m, b_alg,
                  A, l, &dtime, &diff, &gflops );

    fprintf( stdout, "data_REFd( %d, 1:2 ) = [ %d %6.3lf %6.2le ]; \n", i, p, gflops, diff );
    fflush( stdout );


    for ( variant = 1; variant <= n_variants; variant++ ){
      
      fprintf( stdout, "data_var%d( %d, 1:9 ) = [ %d ", variant, i, p );
      fflush( stdout );

      time_Hevd_ln( variant, FLA_ALG_UNBLOCKED, n_repeats, m, b_alg,
                    A, l, &dtime, &diff, &gflops );

      fprintf( stdout, "%6.3lf %6.2le ", gflops, diff );
      fflush( stdout );

      //time_Hevd_ln( variant, FLA_ALG_UNB_OPT, n_repeats, m, b_alg,
      //              A, l, &dtime, &diff, &gflops );

      //fprintf( stdout, "%6.3lf %6.2le ", gflops, diff );
      //fflush( stdout );

      fprintf( stdout, "];\n" );
      fflush( stdout );
    }

    fprintf( stdout, "\n" );

    FLA_Obj_free( &A );
    FLA_Obj_free( &T );
    FLA_Obj_free( &W );
    FLA_Obj_free( &Q );
    FLA_Obj_free( &l );
  }

/*
  fprintf( stdout, "figure;\n" );

  fprintf( stdout, "plot( data_REF( :,1 ), data_REF( :, 2 ), '-' ); \n" );

  fprintf( stdout, "hold on;\n" );

  for ( i = 1; i <= n_variants; i++ ) {
    fprintf( stdout, "plot( data_var%d( :,1 ), data_var%d( :, 2 ), '%c:%c' ); \n",
            i, i, colors[ i-1 ], ticks[ i-1 ] );
    fprintf( stdout, "plot( data_var%d( :,1 ), data_var%d( :, 4 ), '%c-.%c' ); \n",
            i, i, colors[ i-1 ], ticks[ i-1 ] );
  }

  fprintf( stdout, "legend( ... \n" );
  fprintf( stdout, "'Reference', ... \n" );

  for ( i = 1; i < n_variants; i++ )
    fprintf( stdout, "'unb\\_var%d', 'blk\\_var%d', ... \n", i, i );
  fprintf( stdout, "'unb\\_var%d', 'blk\\_var%d' ); \n", i, i );

  fprintf( stdout, "xlabel( 'problem size p' );\n" );
  fprintf( stdout, "ylabel( 'GFLOPS/sec.' );\n" );
  fprintf( stdout, "axis( [ 0 %d 0 %.2f ] ); \n", p_last, max_gflops );
  fprintf( stdout, "title( 'FLAME Hevd_ln performance (%s, %s)' );\n", 
           m_dim_desc, n_dim_desc );
  fprintf( stdout, "print -depsc tridiag_%s_%s.eps\n", m_dim_tag, n_dim_tag );
  fprintf( stdout, "hold off;\n");
  fflush( stdout );
*/

  FLA_Finalize( );

  return 0;
}
FLA_Error REF_Svdd_uv_components( FLA_Obj A, FLA_Obj s, FLA_Obj U, FLA_Obj V,
                                  double* dtime_bred, double* dtime_bsvd, double* dtime_appq,
                                  double* dtime_qrfa, double* dtime_gemm )
/*
{
  *dtime_bred = 1;
  *dtime_bsvd = 1;
  *dtime_appq = 1;
  *dtime_qrfa = 1;
  *dtime_gemm = 1;

  return FLA_Svdd_external( FLA_SVD_VECTORS_ALL, A, s, U, V );
}
*/

{
  FLA_Datatype dt_A;
  FLA_Datatype dt_A_real;
  dim_t        m_A, n_A;
  dim_t        min_m_n;
  FLA_Obj      tq, tu, tv, d, e, Ur, Vr, W;
  FLA_Obj      eT, epsilonB;
  FLA_Uplo     uplo = FLA_UPPER_TRIANGULAR;
  double       crossover_ratio = 16.0 / 10.0;
  double       dtime_temp;

  dt_A      = FLA_Obj_datatype( A );
  dt_A_real = FLA_Obj_datatype_proj_to_real( A );
  m_A       = FLA_Obj_length( A );
  n_A       = FLA_Obj_width( A );

  min_m_n   = FLA_Obj_min_dim( A );

  FLA_Obj_create( dt_A,      min_m_n, 1,   0, 0, &tq );
  FLA_Obj_create( dt_A,      min_m_n, 1,   0, 0, &tu );
  FLA_Obj_create( dt_A,      min_m_n, 1,   0, 0, &tv );
  FLA_Obj_create( dt_A_real, min_m_n, 1,   0, 0, &d );
  FLA_Obj_create( dt_A_real, min_m_n, 1,   0, 0, &e );
  FLA_Obj_create( dt_A_real, n_A,     n_A, 0, 0, &Ur );
  FLA_Obj_create( dt_A_real, n_A,     n_A, 0, 0, &Vr );


  FLA_Part_2x1( e,   &eT,
                     &epsilonB,    1, FLA_BOTTOM );

  if ( m_A >= n_A )
  {
    if ( m_A < crossover_ratio * n_A )
    {
      dtime_temp = FLA_Clock();
      {
        // Reduce to bidiagonal form.
        FLA_Bidiag_blk_external( A, tu, tv );
        FLA_Bidiag_UT_extract_diagonals( A, d, eT );
      }
      *dtime_bred = FLA_Clock() - dtime_temp;


      dtime_temp = FLA_Clock();
      {
        // Divide-and-conquor algorithm.
        FLA_Bsvdd_external( uplo, d, e, Ur, Vr );
      }
      *dtime_bsvd = FLA_Clock() - dtime_temp;


      dtime_temp = FLA_Clock();
      {
        // Form U.
        FLA_Copy_external( Ur, U );
        FLA_Bidiag_apply_U_external( FLA_LEFT, FLA_NO_TRANSPOSE, A, tu, U );

        // Form V.
        FLA_Copy_external( Vr, V );
        FLA_Bidiag_apply_V_external( FLA_RIGHT, FLA_CONJ_TRANSPOSE, A, tv, V );
      }
      *dtime_appq = FLA_Clock() - dtime_temp;


      *dtime_qrfa = 0.0;
      *dtime_gemm = 0.0;
    }
    else
    {
      FLA_Obj AT,
              AB;
      FLA_Obj UL, UR;

      FLA_Part_2x1( A,   &AT,
                         &AB,        n_A, FLA_TOP );
      FLA_Part_1x2( U,   &UL, &UR,   n_A, FLA_LEFT );

      // Create a temporary n-by-n matrix R.
      FLA_Obj_create( dt_A, n_A, n_A, 0, 0, &W );

      dtime_temp = FLA_Clock();
      {
        // Perform a QR factorization.
        FLA_QR_blk_external( A, tq );
        FLA_Copyr_external( FLA_LOWER_TRIANGULAR, A, UL );
        FLA_Setr( FLA_LOWER_TRIANGULAR, FLA_ZERO, A );
      }
      *dtime_qrfa = FLA_Clock() - dtime_temp;


      dtime_temp = FLA_Clock();
      {
        // Form Q.
        FLA_QR_form_Q_external( U, tq );
      }
      *dtime_appq = FLA_Clock() - dtime_temp;


      dtime_temp = FLA_Clock();
      {
        // Reduce R to bidiagonal form.
        FLA_Bidiag_blk_external( AT, tu, tv );
        FLA_Bidiag_UT_extract_diagonals( A, d, eT );
      }
      *dtime_bred = FLA_Clock() - dtime_temp;


      dtime_temp = FLA_Clock();
      {
        // Divide-and-conquor algorithm.
        FLA_Bsvdd_external( uplo, d, e, Ur, Vr );
      }
      *dtime_bsvd = FLA_Clock() - dtime_temp;


      dtime_temp = FLA_Clock();
      {
        // Form U in W.
        FLA_Copy_external( Ur, W );
        FLA_Bidiag_apply_U_external( FLA_LEFT, FLA_NO_TRANSPOSE, AT, tu, W );

        // Form V.
        FLA_Copy_external( Vr, V );
        FLA_Bidiag_apply_V_external( FLA_RIGHT, FLA_CONJ_TRANSPOSE, AT, tv, V );
      }
      *dtime_appq += FLA_Clock() - dtime_temp;


      dtime_temp = FLA_Clock();
      {
        // Multiply R into U, storing the result in A and then copying
        // back to U.
        FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
                           FLA_ONE, UL, W, FLA_ZERO, A );
        FLA_Copy( A, UL );
      }
      *dtime_gemm = FLA_Clock() - dtime_temp;


      // Free R.
      FLA_Obj_free( &W );
    }
  }
  else
  {
    FLA_Check_error_code( FLA_NOT_YET_IMPLEMENTED );
  }

  // Copy singular values to output vector.
  FLA_Copy( d, s );

  // Sort singular values and vectors.
  FLA_Sort_svd( FLA_BACKWARD, s, U, V );

  FLA_Obj_free( &tq );
  FLA_Obj_free( &tu );
  FLA_Obj_free( &tv );
  FLA_Obj_free( &d );
  FLA_Obj_free( &e );
  FLA_Obj_free( &Ur );
  FLA_Obj_free( &Vr );

  return FLA_SUCCESS;
}
Exemple #20
0
void time_Hess_UT(
                 int variant, int type, int nrepeats, int m,
                 FLA_Obj A, FLA_Obj A_ref, FLA_Obj t, FLA_Obj T, FLA_Obj W,
                 double *dtime, double *diff, double *gflops )
{
  int
    irep;

  double
    dtime_old = 1.0e9;

  FLA_Obj
    A_save, norm;


  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_save );

  FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );

  FLA_Copy_external( A, A_save );

  for ( irep = 0 ; irep < nrepeats; irep++ ){

    FLA_Copy_external( A_save, A );

    *dtime = FLA_Clock();

    switch( variant ){

    case 0:{
      switch( type ){
      case FLA_ALG_REFERENCE:
        REF_Hess_UT( A, t );
        break;
      case FLA_ALG_FRONT:
        FLA_Hess_UT( A, T );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    }

    *dtime = FLA_Clock() - *dtime;
    dtime_old = min( *dtime, dtime_old );

  }

  //if ( type == FLA_ALG_REFERENCE )
  //{
  //  ;
  //}
  //else
  {
    FLA_Obj    AT, AB;
    FLA_Obj Q, QT, QB;
    FLA_Obj E, ET, EB;
    FLA_Obj F;
    dim_t   m_A, m_T;

    m_A = FLA_Obj_length( A );
    m_T = FLA_Obj_length( T );

    FLA_Obj_create( FLA_Obj_datatype( A ), m_A, m_A, 0, 0, &Q );
    FLA_Set_to_identity( Q );

    FLA_Part_2x1( Q,    &QT,
                        &QB,    1, FLA_TOP );
    FLA_Part_2x1( A,    &AT,
                        &AB,    1, FLA_TOP );

    if ( type == FLA_ALG_REFERENCE )
    {
      if ( FLA_Obj_is_real( A ) )
        FLA_Apply_Q_blk_external( FLA_LEFT, FLA_TRANSPOSE, FLA_COLUMNWISE, AB, t, QB );
      else
        FLA_Apply_Q_blk_external( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_COLUMNWISE, AB, t, QB );
    }
    else
      FLA_Apply_Q_UT( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_FORWARD, FLA_COLUMNWISE, AB, T, W, QB );

    FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &E );
    FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &F );

    FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
                       FLA_ONE, A_save, Q, FLA_ZERO, E );
    FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
                       FLA_ONE, Q, E, FLA_ZERO, F );

    FLA_Copy( A, E );
    FLA_Part_2x1( E,    &ET,
                        &EB,    1, FLA_TOP );
    FLA_Triangularize( FLA_UPPER_TRIANGULAR, FLA_NONUNIT_DIAG, EB );

    *diff = FLA_Max_elemwise_diff( E, F );

    FLA_Obj_free( &Q );
    FLA_Obj_free( &E );
    FLA_Obj_free( &F );
  }

  *gflops = ( 10.0 / 3.0 * m * m * m ) /
            dtime_old / 1e9;
  if ( FLA_Obj_is_complex( A ) )
    *gflops *= 4.0;

  *dtime = dtime_old;

  FLA_Copy_external( A_save, A );

  FLA_Obj_free( &A_save );
  FLA_Obj_free( &norm );
}
FLA_Error FLA_Svd_compute_scaling( FLA_Obj A, FLA_Obj sigma )
{
	FLA_Datatype dt_real;
	FLA_Obj      norm;
	FLA_Obj      safmin;
	FLA_Obj      prec;
	FLA_Obj      rmin;
	FLA_Obj      rmax;

	if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING )
		FLA_Svd_compute_scaling_check( A, sigma );

	dt_real = FLA_Obj_datatype_proj_to_real( A );

	FLA_Obj_create( dt_real, 1, 1, 0, 0, &norm );
	FLA_Obj_create( dt_real, 1, 1, 0, 0, &prec );
	FLA_Obj_create( dt_real, 1, 1, 0, 0, &safmin );
	FLA_Obj_create( dt_real, 1, 1, 0, 0, &rmin );
	FLA_Obj_create( dt_real, 1, 1, 0, 0, &rmax );

	// Query safmin, precision.
	FLA_Mach_params( FLA_MACH_PREC,  prec );
	FLA_Mach_params( FLA_MACH_SFMIN, safmin );

//FLA_Obj_show( "safmin", safmin, "%20.12e", "" );
//FLA_Obj_show( "prec", prec, "%20.12e", "" );

	// rmin = sqrt( safmin ) / prec;
	FLA_Copy( safmin, rmin );
	FLA_Sqrt( rmin );
	FLA_Inv_scal( prec, rmin );

	// rmax = 1 / rmin;
	FLA_Copy( rmin, rmax );
	FLA_Invert( FLA_NO_CONJUGATE, rmax );

//FLA_Obj_show( "rmin", rmin, "%20.12e", "" );
//FLA_Obj_show( "rmax", rmax, "%20.12e", "" );

	// Find the maximum absolute value of A.
	FLA_Max_abs_value( A, norm );

	if ( FLA_Obj_gt( norm, FLA_ZERO ) && FLA_Obj_lt( norm, rmin ) )
	{
		// sigma = rmin / norm;
		FLA_Copy( rmin, sigma );
		FLA_Inv_scal( norm, sigma );
	}
	else if ( FLA_Obj_gt( norm, rmax ) )
	{
		// sigma = rmax / norm;
		FLA_Copy( rmax, sigma );
		FLA_Inv_scal( norm, sigma );
	}
	else
	{
		// sigma = 1.0;
		FLA_Copy( FLA_ONE, sigma );
	}

	FLA_Obj_free( &norm );
	FLA_Obj_free( &prec );
	FLA_Obj_free( &safmin );
	FLA_Obj_free( &rmin );
	FLA_Obj_free( &rmax );

	return FLA_SUCCESS;
}
FLA_Error FLA_Fill_with_logarithmic_dist( FLA_Obj alpha, FLA_Obj x )
{
	FLA_Obj      lT,              l0,
	             lB,              lambda1,
	                              l2;
	FLA_Obj      l, k, alpha2;
	FLA_Datatype dt_real;
	dim_t        n_x;


	if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING )
		FLA_Fill_with_logarithmic_dist_check( alpha, x );

	dt_real = FLA_Obj_datatype_proj_to_real( x );
	n_x     = FLA_Obj_vector_dim( x );

	// Create a local counter to increment as we create the distribution.
	FLA_Obj_create( dt_real, 1,   1, 0, 0, &k );

	// Create a local vector l. We will work with this vector, which is
	// the same length as x, so that we can use vertical partitioning.
	FLA_Obj_create( dt_real, n_x, 1, 0, 0, &l );

	// Create a local real scalar alpha2 of the same precision as
	// alpha. Then copy alpha to alpha2, which will convert the
	// complex value to real, if necessary (ie: if alpha is complex).
	FLA_Obj_create( dt_real, 1,   1, 0, 0, &alpha2 );
	FLA_Copy( alpha, alpha2 );

	// Initialize k to 0.
	FLA_Set( FLA_ZERO, k );

	FLA_Part_2x1( l,    &lT,
	                    &lB,            0, FLA_TOP );

	while ( FLA_Obj_length( lB ) > 0 )
	{
		FLA_Repart_2x1_to_3x1( lT,                &l0,
		                    /* ** */            /* ******* */
		                                          &lambda1,
		                       lB,                &l2,        1, FLA_BOTTOM );

		/*------------------------------------------------------------*/

		// lambda1 = alpha^k;
		FLA_Pow( alpha2, k, lambda1 );

		// k = k + 1;
		FLA_Mult_add( FLA_ONE, FLA_ONE, k );

		/*------------------------------------------------------------*/

		FLA_Cont_with_3x1_to_2x1( &lT,                l0,
		                                              lambda1,
		                        /* ** */           /* ******* */
		                          &lB,                l2,     FLA_TOP );
	}

	// Normalize by last element.
	FLA_Part_2x1( l,    &lT,
	                    &lB,            1, FLA_BOTTOM );
	FLA_Inv_scal( lB, l );

	// Overwrite x with the distribution we created in l.
	FLA_Copy( l, x );

	FLA_Obj_free( &l );
	FLA_Obj_free( &k );
	FLA_Obj_free( &alpha2 );

	return FLA_SUCCESS;
}
FLA_Error FLA_Svd_uv_var2_components( dim_t n_iter_max, dim_t k_accum, dim_t b_alg,
                                      FLA_Obj A, FLA_Obj s, FLA_Obj U, FLA_Obj V,
                                      double* dtime_bred, double* dtime_bsvd, double* dtime_appq,
                                      double* dtime_qrfa, double* dtime_gemm )
{
	FLA_Error    r_val = FLA_SUCCESS;
	FLA_Datatype dt;
	FLA_Datatype dt_real;
	FLA_Datatype dt_comp;
	FLA_Obj      T, S, rL, rR, d, e, G, H, RG, RH, W;
	dim_t        m_A, n_A;
	dim_t        min_m_n;
	dim_t        n_GH;
	double       crossover_ratio = 17.0 / 9.0;
	double       dtime_temp;

	n_GH    = k_accum;

	m_A     = FLA_Obj_length( A );
	n_A     = FLA_Obj_width( A );
	min_m_n = FLA_Obj_min_dim( A );
	dt      = FLA_Obj_datatype( A );
	dt_real = FLA_Obj_datatype_proj_to_real( A );
	dt_comp = FLA_Obj_datatype_proj_to_complex( A );

	// If the matrix is a scalar, then the SVD is easy.
	if ( min_m_n == 1 )
	{
		FLA_Copy( A, s );
		FLA_Set_to_identity( U );
		FLA_Set_to_identity( V );

		return FLA_SUCCESS;
	}

	// Create matrices to hold block Householder transformations.
	FLA_Bidiag_UT_create_T( A, &T, &S );

	// Create vectors to hold the realifying scalars.
	FLA_Obj_create( dt,      min_m_n,      1, 0, 0, &rL );
	FLA_Obj_create( dt,      min_m_n,      1, 0, 0, &rR );

	// Create vectors to hold the diagonal and sub-diagonal.
	FLA_Obj_create( dt_real, min_m_n,      1, 0, 0, &d );
	FLA_Obj_create( dt_real, min_m_n-1,    1, 0, 0, &e );

	// Create matrices to hold the left and right Givens scalars.
	FLA_Obj_create( dt_comp, min_m_n-1, n_GH, 0, 0, &G );
	FLA_Obj_create( dt_comp, min_m_n-1, n_GH, 0, 0, &H );

	// Create matrices to hold the left and right Givens matrices.
	FLA_Obj_create( dt_real, min_m_n, min_m_n, 0, 0, &RG );
	FLA_Obj_create( dt_real, min_m_n, min_m_n, 0, 0, &RH );
	FLA_Obj_create( dt,      m_A,     n_A,     0, 0, &W );

	if ( m_A >= n_A )
	{
		if ( m_A < crossover_ratio * n_A )
		{
			dtime_temp = FLA_Clock();
			{
			// Reduce the matrix to bidiagonal form.
			// Apply scalars to rotate elements on the sub-diagonal to the real domain.
			// Extract the diagonal and sub-diagonal from A.
			FLA_Bidiag_UT( A, T, S );
			FLA_Bidiag_UT_realify( A, rL, rR );
			FLA_Bidiag_UT_extract_diagonals( A, d, e );
			}
			*dtime_bred = FLA_Clock() - dtime_temp;

			dtime_temp = FLA_Clock();
			{
			// Form U and V.
			FLA_Bidiag_UT_form_U( A, T, U );
			FLA_Bidiag_UT_form_V( A, S, V );
			}
			*dtime_appq = FLA_Clock() - dtime_temp;

			// Apply the realifying scalars in rL and rR to U and V, respectively.
			{
				FLA_Obj UL, UR;
				FLA_Obj VL, VR;

				FLA_Part_1x2( U,   &UL, &UR,   min_m_n, FLA_LEFT );
				FLA_Part_1x2( V,   &VL, &VR,   min_m_n, FLA_LEFT );

				FLA_Apply_diag_matrix( FLA_RIGHT, FLA_CONJUGATE,    rL, UL );
				FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, rR, VL );
			}

			dtime_temp = FLA_Clock();
			{
			// Perform a singular value decomposition on the bidiagonal matrix.
			r_val = FLA_Bsvd_v_opt_var2( n_iter_max, d, e, G, H, RG, RH, W, U, V, b_alg );
			}
			*dtime_bsvd = FLA_Clock() - dtime_temp;
		}
		else // if ( crossover_ratio * n_A <= m_A )
		{
			FLA_Obj TQ, R;
			FLA_Obj AT,
			        AB;
			FLA_Obj UL, UR;

			//FLA_QR_UT_create_T( A, &TQ );
			FLA_Obj_create( dt, 32, n_A, 0, 0, &TQ );

			dtime_temp = FLA_Clock();
			{
			// Perform a QR factorization on A and form Q in U.
			FLA_QR_UT( A, TQ );
			}
			*dtime_qrfa = FLA_Clock() - dtime_temp;

			dtime_temp = FLA_Clock();
			{
			FLA_QR_UT_form_Q( A, TQ, U );
			}
			*dtime_appq = FLA_Clock() - dtime_temp;

			FLA_Obj_free( &TQ );

			// Set the lower triangle of R to zero and then copy the upper
			// triangle of A to R.
			FLA_Part_2x1( A,   &AT,
			                   &AB,   n_A, FLA_TOP );
			FLA_Obj_create( dt, n_A, n_A, 0, 0, &R );
			FLA_Setr( FLA_LOWER_TRIANGULAR, FLA_ZERO, R );
			FLA_Copyr( FLA_UPPER_TRIANGULAR, AT, R );

			dtime_temp = FLA_Clock();
			{
			// Reduce the matrix to bidiagonal form.
			// Apply scalars to rotate elements on the superdiagonal to the real domain.
			// Extract the diagonal and superdiagonal from A.
			FLA_Bidiag_UT( R, T, S );
			FLA_Bidiag_UT_realify( R, rL, rR );
			FLA_Bidiag_UT_extract_diagonals( R, d, e );
			}
			*dtime_bred = FLA_Clock() - dtime_temp;

			dtime_temp = FLA_Clock();
			{
			// Form V from right Householder vectors in upper triangle of R.
			FLA_Bidiag_UT_form_V( R, S, V );

			// Form U in R.
			FLA_Bidiag_UT_form_U( R, T, R );
			}
			*dtime_appq += FLA_Clock() - dtime_temp;

			// Apply the realifying scalars in rL and rR to U and V, respectively.
			FLA_Apply_diag_matrix( FLA_RIGHT, FLA_CONJUGATE,    rL, R );
			FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, rR, V );

			dtime_temp = FLA_Clock();
			{
			// Perform a singular value decomposition on the bidiagonal matrix.
			r_val = FLA_Bsvd_v_opt_var2( n_iter_max, d, e, G, H, RG, RH, W, R, V, b_alg );
			}
			*dtime_bsvd = FLA_Clock() - dtime_temp;

			dtime_temp = FLA_Clock();
			{
			// Multiply R into U, storing the result in A and then copying back
			// to U.
			FLA_Part_1x2( U,   &UL, &UR,   n_A, FLA_LEFT );
			FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
			          FLA_ONE, UL, R, FLA_ZERO, A );
			FLA_Copy( A, UL );
			}
			*dtime_gemm = FLA_Clock() - dtime_temp;

			FLA_Obj_free( &R );
		}
	}
	else // if ( m_A < n_A )
	{
		FLA_Check_error_code( FLA_NOT_YET_IMPLEMENTED );
	}

	// Copy the converged eigenvalues to the output vector.
	FLA_Copy( d, s );

	// Sort the singular values and singular vectors in descending order.
	FLA_Sort_svd( FLA_BACKWARD, s, U, V );

	FLA_Obj_free( &T );
	FLA_Obj_free( &S );
	FLA_Obj_free( &rL );
	FLA_Obj_free( &rR );
	FLA_Obj_free( &d );
	FLA_Obj_free( &e );
	FLA_Obj_free( &G );
	FLA_Obj_free( &H );
	FLA_Obj_free( &RG );
	FLA_Obj_free( &RH );
	FLA_Obj_free( &W );

	return r_val;
}
Exemple #24
0
void libfla_test_apqut_experiment( test_params_t params,
                                   unsigned int  var,
                                   char*         sc_str,
                                   FLA_Datatype  datatype,
                                   unsigned int  p_cur,
                                   unsigned int  pci,
                                   unsigned int  n_repeats,
                                   signed int    impl,
                                   double*       perf,
                                   double*       residual )
{
	dim_t        b_flash    = params.b_flash;
	dim_t        b_alg_flat = params.b_alg_flat;
	double       time_min   = 1e9;
	double       time;
	unsigned int i;
	unsigned int m, n;
	unsigned int min_m_n;
	signed int   m_input;
	signed int   n_input;
	FLA_Side     side;
	FLA_Trans    trans;
	FLA_Direct   direct;
	FLA_Store    storev;
	FLA_Obj      A, T, W, B, eye, norm;
	FLA_Obj      B_save;
	FLA_Obj      A_test, T_test, W_test, B_test;

	// Translate parameter characters to libflame constants.
	FLA_Param_map_char_to_flame_side( &pc_str[pci][0], &side );
	FLA_Param_map_char_to_flame_trans( &pc_str[pci][1], &trans );
	FLA_Param_map_char_to_flame_direct( &pc_str[pci][2], &direct );
	FLA_Param_map_char_to_flame_storev( &pc_str[pci][3], &storev );

	// We want to make sure the Apply_Q_UT routines work with rectangular
	// matrices. So we use m > n when testing with column-wise storage (via
	// QR factorization) and m < n when testing with row-wise storage (via
	// LQ factorization).
	if ( storev == FLA_COLUMNWISE )
	{
		m_input = -1;
		n_input = -1;
		//m_input = -1;
		//n_input = -1;
	}
	else // if ( storev == FLA_ROWWISE )
	{
		m_input = -1;
		n_input = -1;
		//m_input = -1;
		//n_input = -1;
	}

	// Determine the dimensions.
	if ( m_input < 0 ) m = p_cur * abs(m_input);
	else               m = p_cur;
	if ( n_input < 0 ) n = p_cur * abs(n_input);
	else               n = p_cur;

	// Compute the minimum dimension.
	min_m_n = min( m, n );

	// Create the matrices for the current operation.
	libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[0], m, n, &A );
	libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], b_alg_flat, min_m_n, &T );
	if ( storev == FLA_COLUMNWISE )
		libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[2], m, m, &B );
	else
		libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[2], n, n, &B );

	FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, B, &eye );

	FLA_Apply_Q_UT_create_workspace( T, B, &W );

	// Create a real scalar object to hold the norm of A.
	FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );

	// Initialize the test matrices.
	FLA_Random_matrix( A );
	FLA_Set_to_identity( B );
	FLA_Set_to_identity( eye );

	// Save the original object contents in a temporary object.
	FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, B, &B_save );

	// Use hierarchical matrices if we're testing the FLASH front-end.
	if ( impl == FLA_TEST_HIER_FRONT_END )
	{
		if ( storev == FLA_COLUMNWISE )
			FLASH_QR_UT_create_hier_matrices( A, 1, &b_flash, &A_test, &T_test );
		else // if ( storev == FLA_ROWWISE )
			FLASH_LQ_UT_create_hier_matrices( A, 1, &b_flash, &A_test, &T_test );
		FLASH_Obj_create_hier_copy_of_flat( B, 1, &b_flash, &B_test );
		FLASH_Apply_Q_UT_create_workspace( T_test, B_test, &W_test );
	}
	else // if ( impl == FLA_TEST_FLAT_FRONT_END )
	{
		A_test = A;
		T_test = T;
		W_test = W;
		B_test = B;
	}

	// Compute a Householder factorization.
	if ( impl == FLA_TEST_HIER_FRONT_END )
	{
		if ( storev == FLA_COLUMNWISE ) FLASH_QR_UT( A_test, T_test );
		else                            FLASH_LQ_UT( A_test, T_test );
	}
	else // if ( impl == FLA_TEST_FLAT_FRONT_END )
	{
		if ( storev == FLA_COLUMNWISE ) FLA_QR_UT( A_test, T_test );
		else                            FLA_LQ_UT( A_test, T_test );
	}

	// Repeat the experiment n_repeats times and record results.
	for ( i = 0; i < n_repeats; ++i )
	{
		if ( impl == FLA_TEST_HIER_FRONT_END )
			FLASH_Obj_hierarchify( B_save, B_test );
		else
			FLA_Copy_external( B_save, B_test );

		time = FLA_Clock();

		libfla_test_apqut_impl( impl, side, trans, direct, storev,
		                        A_test, T_test, W_test, B_test );
		
		time = FLA_Clock() - time;
		time_min = min( time_min, time );
	}

	// Multiply by its conjugate-transpose to get what should be (near) identity
	// and then subtract from actual identity to get what should be (near) zero.
	if ( impl == FLA_TEST_HIER_FRONT_END )
	{
		FLASH_Obj_flatten( B_test, B );
		FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
		                   FLA_ONE, B, B, FLA_MINUS_ONE, eye );
	}
	else // if ( impl == FLA_TEST_FLAT_FRONT_END )
	{
		FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
		                   FLA_ONE, B, B, FLA_MINUS_ONE, eye );
	}

	// Free the hierarchical matrices if we're testing the FLASH front-end.
	if ( impl == FLA_TEST_HIER_FRONT_END )
	{
		FLASH_Obj_free( &A_test );
		FLASH_Obj_free( &T_test );
		FLASH_Obj_free( &W_test );
		FLASH_Obj_free( &B_test );
	}

	// Compute the norm of eye, which contains I - Q * Q'.
	FLA_Norm1( eye, norm );
	FLA_Obj_extract_real_scalar( norm, residual );

	// Compute the performance of the best experiment repeat.
	*perf = (  4.0 *       m * min_m_n * n -
	           2.0 * min_m_n * min_m_n * n ) / time_min / FLOPS_PER_UNIT_PERF;
	if ( FLA_Obj_is_complex( A ) ) *perf *= 4.0;

	// Free the supporting flat objects.
	FLA_Obj_free( &B_save );

	// Free the flat test matrices.
	FLA_Obj_free( &A );
	FLA_Obj_free( &T );
	FLA_Obj_free( &W );
	FLA_Obj_free( &B );
	FLA_Obj_free( &eye );
	FLA_Obj_free( &norm );
}
Exemple #25
0
int main(int argc, char *argv[])
{
  int 
    m_input,
    m,
    p_first, p_last, p_inc,
    p,
    n_repeats,
    param_combo,
    i,
    n_param_combos = N_PARAM_COMBOS;

  dim_t b_flash;
  dim_t n_threads;

  FLA_Datatype datatype;
  FLA_Uplo     uplo;
  FLA_Inv      inv;
  
  char *colors = "brkgmcbrkg";
  char *ticks  = "o+*xso+*xs";
  char m_dim_desc[14];
  char m_dim_tag[10];

  double max_gflops=6.0;

  double
    dtime,
    gflops,
    diff;

  FLA_Obj
    A, B, norm;
  

  FLA_Init();


  fprintf( stdout, "%c number of repeats: ", '%' );
  scanf( "%d", &n_repeats );
  fprintf( stdout, "%c %d\n", '%', n_repeats );

  fprintf( stdout, "%c enter FLASH blocksize: ", '%' );
  scanf( "%u", &b_flash );
  fprintf( stdout, "%c %u\n", '%', b_flash );

  fprintf( stdout, "%c enter problem size first, last, inc: ", '%' );
  scanf( "%d%d%d", &p_first, &p_last, &p_inc );
  fprintf( stdout, "%c %d %d %d\n", '%', p_first, p_last, p_inc );

  fprintf( stdout, "%c enter m (-1 means bind to problem size): ", '%' );
  scanf( "%d", &m_input );
  fprintf( stdout, "%c %d\n", '%', m_input );

  fprintf( stdout, "%c enter the number of SuperMatrix threads: ", '%' );
  scanf( "%d", &n_threads );
  fprintf( stdout, "%c %d\n", '%', n_threads );

  fprintf( stdout, "\n" );


  if     ( m_input >  0 ) {
    sprintf( m_dim_desc, "m = %d", m_input );
    sprintf( m_dim_tag,  "m%dc", m_input);
  }
  else if( m_input <  -1 ) {
    sprintf( m_dim_desc, "m = p/%d", -m_input );
    sprintf( m_dim_tag,  "m%dp", -m_input );
  }
  else if( m_input == -1 ) {
    sprintf( m_dim_desc, "m = p" );
    sprintf( m_dim_tag,  "m%dp", 1 );
  }


  //datatype = FLA_FLOAT;
  //datatype = FLA_DOUBLE;
  //datatype = FLA_COMPLEX;
  datatype = FLA_DOUBLE_COMPLEX;

  FLASH_Queue_set_num_threads( n_threads );

  for ( p = p_first, i = 1; p <= p_last; p += p_inc, i += 1 )
  {
    m = m_input;

    if( m < 0 ) m = p / abs(m_input);

    for ( param_combo = 0; param_combo < n_param_combos; param_combo++ ){

      if ( pc_str[param_combo][0] == 'i' ) inv = FLA_INVERSE;
      else                                 inv = FLA_NO_INVERSE;

      if ( pc_str[param_combo][1] == 'l' ) uplo = FLA_LOWER_TRIANGULAR;
      else                                 uplo = FLA_UPPER_TRIANGULAR;

      FLASH_Obj_create( datatype, m, m, 1, &b_flash, &A );
      FLASH_Obj_create( datatype, m, m, 1, &b_flash, &B );

      FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );

      FLASH_Random_spd_matrix( uplo, A );
      FLASH_Hermitianize( uplo, A );
      
      FLASH_Random_spd_matrix( uplo, B );
      FLASH_Chol( uplo, B );

      
      fprintf( stdout, "data_eig_gest_%s( %d, 1:3 ) = [ %d  ", pc_str[param_combo], i, p );
      fflush( stdout );

      time_Eig_gest( param_combo, FLA_ALG_FRONT, n_repeats, m,
                     inv, uplo, A, B, norm, &dtime, &diff, &gflops );

      fprintf( stdout, "%6.3lf %6.2le ", gflops, diff );
      fflush( stdout );


      fprintf( stdout, " ]; \n" );
      fflush( stdout );

      FLASH_Obj_free( &A );
      FLASH_Obj_free( &B );
      FLA_Obj_free( &norm );
    }

    fprintf( stdout, "\n" );
  }

/*
  fprintf( stdout, "figure;\n" );

  fprintf( stdout, "hold on;\n" );

  for ( i = 0; i < n_param_combos; i++ ) {
    fprintf( stdout, "plot( data_eig_gest_%s( :,1 ), data_eig_gest_%s( :, 2 ), '%c:%c' ); \n",
            pc_str[i], pc_str[i], colors[ i ], ticks[ i ] );
    fprintf( stdout, "plot( data_eig_gest_%s( :,1 ), data_eig_gest_%s( :, 4 ), '%c-.%c' ); \n",
            pc_str[i], pc_str[i], colors[ i ], ticks[ i ] );
  }

  fprintf( stdout, "legend( ... \n" );

  for ( i = 0; i < n_param_combos; i++ )
    fprintf( stdout, "'ref\\_eig_gest\\_%s', 'fla\\_eig_gest\\_%s', ... \n", pc_str[i], pc_str[i] );

  fprintf( stdout, "'Location', 'SouthEast' ); \n" );

  fprintf( stdout, "xlabel( 'problem size p' );\n" );
  fprintf( stdout, "ylabel( 'GFLOPS/sec.' );\n" );
  fprintf( stdout, "axis( [ 0 %d 0 %.2f ] ); \n", p_last, max_gflops );
  fprintf( stdout, "title( 'FLAME eig_gest front-end performance (%s)' );\n", m_dim_desc );
  fprintf( stdout, "print -depsc eig_gest_front_%s.eps\n", m_dim_tag );
  fprintf( stdout, "hold off;\n");
  fflush( stdout );
*/

  FLA_Finalize();

  return 0;
}
void time_Hevd_lv_components(
               int variant, int type, int n_repeats, int m, int n_iter_max, int k_accum, int b_alg,
               FLA_Obj A, FLA_Obj l,
               double* dtime, double* diff1, double* diff2, double* gflops,
               double* dtime_tred, double* gflops_tred,
               double* dtime_tevd, double* gflops_tevd,
               double* dtime_appq, double* gflops_appq, int* k_perf )
{
  int     i;
  double  k;
  double  dtime_save      = 1.0e9;
  double  dtime_tred_save = 1.0e9;
  double  dtime_tevd_save = 1.0e9;
  double  dtime_appq_save = 1.0e9;
  double  flops_tred;
  double  flops_tevd;
  double  flops_appq;
  double  mult_tred;
  double  mult_tevd;
  double  mult_appq;

  FLA_Obj A_save, Z;

  if (
       ( variant == -3 ) ||
       ( variant == -4 ) ||
       ( variant == -5 ) ||
       //( variant == 0 ) ||
       //( variant == -1 ) ||
       //( variant == -2 ) ||
       //( variant == 1 ) ||
       //( variant == 2 ) ||
       //( variant == 3 ) ||
       //( variant == 4 ) ||
       FALSE
     )
  {
    *gflops      = 0.0;
    *dtime       = 0.0;
    *diff1       = 0.0;
    *diff2       = 0.0;
    *dtime_tred  = 0.0;
    *dtime_tevd  = 0.0;
    *dtime_appq  = 0.0;
    *gflops_tred = 0.0;
    *gflops_tevd = 0.0;
    *gflops_appq = 0.0;
    *k_perf      = 0;
    return;
  }

  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_save );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &Z );

  FLA_Copy_external( A, A_save );

  for ( i = 0 ; i < n_repeats; i++ ){

    FLA_Copy_external( A_save, A );

    *dtime = FLA_Clock();

    switch( variant ){

    case -3:
    {
      *k_perf = 0;
      REF_Hevd_lv( A, l,
                   dtime_tred, dtime_tevd, dtime_appq );
      break;
    }

    case -4:
    {
      *k_perf = 0;
      REF_Hevdd_lv( A, l,
                    dtime_tred, dtime_tevd, dtime_appq );
      break;
    }

    case -5:
    {
      *k_perf = 0;
      REF_Hevdr_lv( A, l, Z,
                    dtime_tred, dtime_tevd, dtime_appq );
      break;
    }

    case 0:
    {
      *k_perf = 0;
      REF_Hevd_lv_components( A, l,
                              dtime_tred, dtime_tevd, dtime_appq );
      break;
    }

    case -1:
    {
      *k_perf = 0;
      REF_Hevdd_lv_components( A, l,
                               dtime_tred, dtime_tevd, dtime_appq );
      break;
    }

    case -2:
    {
      *k_perf = 0;
      REF_Hevdr_lv_components( A, l, Z,
                               dtime_tred, dtime_tevd, dtime_appq );
      break;
    }

    // Time variant 1
    case 1:
    {
      *k_perf = FLA_Hevd_lv_var1_components( n_iter_max, A, l, k_accum, b_alg,
                                             dtime_tred, dtime_tevd, dtime_appq );
      break;
    }

    // Time variant 2
    case 2:
    {
      *k_perf = FLA_Hevd_lv_var2_components( n_iter_max, A, l, k_accum, b_alg,
                                             dtime_tred, dtime_tevd, dtime_appq );
      break;
    }

    }

    *dtime = FLA_Clock() - *dtime;
    if ( *dtime < dtime_save )
    {
      dtime_save      = *dtime;
      dtime_tred_save = *dtime_tred;
      dtime_tevd_save = *dtime_tevd;
      dtime_appq_save = *dtime_appq;
    }
  }

  *dtime      = dtime_save;
  *dtime_tred = dtime_tred_save;
  *dtime_tevd = dtime_tevd_save;
  *dtime_appq = dtime_appq_save;

//if ( variant == -3 || variant == 0 )
//printf( "\ndtime is %9.3e\n", *dtime );

  {
    FLA_Obj V, A_rev_evd, norm, eye;

    if ( variant == -2 || variant == -5 ) FLA_Copy( Z, A );

    FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &V ); 
    FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_rev_evd ); 
    FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &eye ); 
    FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );

    FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, l, A );

    FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
              FLA_ONE, A, V, FLA_ZERO, A_rev_evd );
    FLA_Triangularize( FLA_LOWER_TRIANGULAR, FLA_NONUNIT_DIAG, A_rev_evd );

//FLA_Obj_show( "A_rev_evd", A_rev_evd, "%9.2e + %9.2e ", "" );
 
    FLA_Axpy( FLA_MINUS_ONE, A_save, A_rev_evd );
    FLA_Norm_frob( A_rev_evd, norm );
    FLA_Obj_extract_real_scalar( norm, diff1 );

    FLA_Set_to_identity( eye );
	FLA_Copy( V, A_rev_evd );
    FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
              FLA_ONE, V, A_rev_evd, FLA_MINUS_ONE, eye );
    FLA_Norm_frob( eye, norm );
    FLA_Obj_extract_real_scalar( norm, diff2 );

    FLA_Obj_free( &V );
    FLA_Obj_free( &A_rev_evd );
    FLA_Obj_free( &eye );
    FLA_Obj_free( &norm );
  }

  k = 2.00;

  flops_tred = ( ( 4.0 / 3.0 )   * m * m * m );
  flops_tevd = (   4.5           * k * m * m     +
                   3.0           * k * m * m * m );

  if ( variant == -1 || variant == -2 ||
       variant == -4 || variant == -5 )
    flops_appq = ( 2.0           * m * m * m );
  else
    flops_appq = ( 4.0 / 3.0     * m * m * m );

/*
  if ( FLA_Obj_is_complex( A ) )
  {
    *gflops      = ( 4.0 * flops_tred + 
                     2.0 * flops_tevd + 
                     4.0 * flops_appq ) / *dtime      / 1e9;

    *gflops_tred = ( 4.0 * flops_tred ) / *dtime_tred / 1e9;
    *gflops_tevd = ( 2.0 * flops_tevd ) / *dtime_tevd / 1e9;
    *gflops_appq = ( 4.0 * flops_appq ) / *dtime_appq / 1e9;
  }
  else
  {
    *gflops      = ( 1.0 * flops_tred + 
                     1.0 * flops_tevd + 
                     1.0 * flops_appq ) / *dtime      / 1e9;

    *gflops_tred = ( 1.0 * flops_tred ) / *dtime_tred / 1e9;
    *gflops_tevd = ( 1.0 * flops_tevd ) / *dtime_tevd / 1e9;
    *gflops_appq = ( 1.0 * flops_appq ) / *dtime_appq / 1e9;
  }
*/

  if ( FLA_Obj_is_complex( A ) )
  {
    mult_tred = 4.0;
    mult_tevd = 2.0;
    mult_appq = 4.0;
  }
  else
  {
    mult_tred = 1.0;
    mult_tevd = 1.0;
    mult_appq = 1.0;
  }

  *gflops = ( mult_tred * flops_tred + 
              mult_tevd * flops_tevd + 
              mult_appq * flops_appq ) / *dtime / 1e9;

  *gflops_tred = ( mult_tred * flops_tred ) / *dtime_tred / 1e9;
  *gflops_tevd = ( mult_tevd * flops_tevd ) / *dtime_tevd / 1e9;
  *gflops_appq = ( mult_appq * flops_appq ) / *dtime_appq / 1e9;

  FLA_Copy_external( A_save, A );

  FLA_Obj_free( &A_save );
  FLA_Obj_free( &Z );
}
FLA_Error FLA_Hevd_lv_var4_components( dim_t n_iter_max, FLA_Obj A, FLA_Obj l, dim_t k_accum, dim_t b_alg,
                                       double* dtime_tred, double* dtime_tevd, double* dtime_appq )
{
	FLA_Error    r_val = FLA_SUCCESS;
	FLA_Uplo     uplo = FLA_LOWER_TRIANGULAR;
	FLA_Datatype dt;
	FLA_Datatype dt_real;
	FLA_Datatype dt_comp;
	FLA_Obj      T, r, d, e, G, R, W;
	FLA_Obj      d0, e0, ls, pu;
	dim_t        mn_A;
	dim_t        n_G = k_accum;
	double       dtime_temp;

	mn_A    = FLA_Obj_length( A );
	dt      = FLA_Obj_datatype( A );
	dt_real = FLA_Obj_datatype_proj_to_real( A );
	dt_comp = FLA_Obj_datatype_proj_to_complex( A );

	*dtime_tred = 1;
	*dtime_tevd = 1;
	*dtime_appq = 1;

	// If the matrix is a scalar, then the EVD is easy.
	if ( mn_A == 1 )
	{
		FLA_Copy( A, l );
		FLA_Set( FLA_ONE, A );

		return FLA_SUCCESS;
	}

	// Create a matrix to hold block Householder transformations.
	FLA_Tridiag_UT_create_T( A, &T );

	// Create a vector to hold the realifying scalars.
	FLA_Obj_create( dt,      mn_A,     1, 0, 0, &r );

	// Create vectors to hold the diagonal and sub-diagonal.
	FLA_Obj_create( dt_real, mn_A,     1, 0, 0, &d );
	FLA_Obj_create( dt_real, mn_A-1,   1, 0, 0, &e );
	FLA_Obj_create( dt_real, mn_A,     1, 0, 0, &d0 );
	FLA_Obj_create( dt_real, mn_A-1,   1, 0, 0, &e0 );
	FLA_Obj_create( dt_real, mn_A,     1, 0, 0, &pu );
	FLA_Obj_create( FLA_INT, mn_A,     1, 0, 0, &ls );
	FLA_Obj_create( dt_comp, mn_A-1, n_G, 0, 0, &G );
	FLA_Obj_create( dt_real, mn_A,  mn_A, 0, 0, &R );
	FLA_Obj_create( dt,      mn_A,  mn_A, 0, 0, &W );

  dtime_temp = FLA_Clock();
  {
	// Reduce the matrix to tridiagonal form.
	FLA_Tridiag_UT( uplo, A, T );
  }
  *dtime_tred = FLA_Clock() - dtime_temp;

	// Apply scalars to rotate elements on the sub-diagonal to the real domain.
	FLA_Tridiag_UT_realify( uplo, A, r );

	// Extract the diagonal and sub-diagonal from A.
	FLA_Tridiag_UT_extract_diagonals( uplo, A, d, e );

  dtime_temp = FLA_Clock();
  {
	// Form Q, overwriting A.
	FLA_Tridiag_UT_form_Q( uplo, A, T );
  }
  *dtime_appq = FLA_Clock() - dtime_temp;

	// Apply the scalars in r to Q.
	FLA_Apply_diag_matrix( FLA_RIGHT, FLA_CONJUGATE, r, A );

	// Find the eigenvalues only.
	FLA_Copy( d, d0 ); FLA_Copy( e, e0 );
	//r_val = FLA_Tevd_n_opt_var1( n_iter_max, d0, e0, G, A );
{
	int info;
	double* buff_d = FLA_DOUBLE_PTR( d0 );
	double* buff_e = FLA_DOUBLE_PTR( e0 );
	dsterf_( &mn_A, buff_d, buff_e, &info );
}
	FLA_Sort( FLA_FORWARD, d0 );
	FLA_Set( FLA_ZERO, ls );
	FLA_Set( FLA_ZERO, pu );

  dtime_temp = FLA_Clock();
  {
	// Perform an eigenvalue decomposition on the tridiagonal matrix.
	r_val = FLA_Tevd_v_opt_var4( n_iter_max, d, e, d0, ls, pu, G, R, W, A, b_alg );
  }
  *dtime_tevd = FLA_Clock() - dtime_temp;

	// Copy the converged eigenvalues to the output vector.
	FLA_Copy( d, l );

	// Sort the eigenvalues and eigenvectors in ascending order.
	FLA_Sort_evd( FLA_FORWARD, l, A );

	FLA_Obj_free( &T );
	FLA_Obj_free( &r );
	FLA_Obj_free( &d );
	FLA_Obj_free( &e );
	FLA_Obj_free( &d0 );
	FLA_Obj_free( &pu );
	FLA_Obj_free( &e0 );
	FLA_Obj_free( &ls );
	FLA_Obj_free( &G );
	FLA_Obj_free( &R );
	FLA_Obj_free( &W );

	return r_val;
}
Exemple #28
0
int main(int argc, char *argv[])
{
  int 
    datatype,
    m_input,
    m,
    p_first, p_last, p_inc,
    p,
    n_repeats,
    param_combo,
    i, j,
    n_param_combos = N_PARAM_COMBOS;

  int sign;
  
  char *colors = "brkgmcbrkg";
  char *ticks  = "o+*xso+*xs";
  char m_dim_desc[14];
  char n_dim_desc[14];
  char m_dim_tag[10];
  char n_dim_tag[10];

  double max_gflops=6.0;

  double
    dtime,
    gflops,
    diff;

  FLA_Obj
    A, C, C_ref, scale, isgn, norm;
  

  FLA_Init();


  fprintf( stdout, "%c number of repeats:", '%' );
  scanf( "%d", &n_repeats );
  fprintf( stdout, "%c %d\n", '%', n_repeats );

  fprintf( stdout, "%c Enter sign (-1 or 1):", '%' );
  scanf( "%d", &sign );
  fprintf( stdout, "%c %d\n", '%', sign );

  fprintf( stdout, "%c enter problem size first, last, inc:", '%' );
  scanf( "%d%d%d", &p_first, &p_last, &p_inc );
  fprintf( stdout, "%c %d %d %d\n", '%', p_first, p_last, p_inc );

  fprintf( stdout, "%c enter m (-1 means bind to problem size): ", '%' );
  scanf( "%d", &m_input );
  fprintf( stdout, "%c %d\n", '%', m_input );


  fprintf( stdout, "\n" );


  if     ( m_input >  0 ) {
    sprintf( m_dim_desc, "m = %d", m_input );
    sprintf( m_dim_tag,  "m%dc", m_input);
  }
  else if( m_input <  -1 ) {
    sprintf( m_dim_desc, "m = p/%d", -m_input );
    sprintf( m_dim_tag,  "m%dp", -m_input );
  }
  else if( m_input == -1 ) {
    sprintf( m_dim_desc, "m = p" );
    sprintf( m_dim_tag,  "m%dp", 1 );
  }

  if ( 0 < sign )
    isgn = FLA_ONE;
  else
    isgn = FLA_MINUS_ONE;

  //datatype = FLA_FLOAT;
  datatype = FLA_DOUBLE;
  //datatype = FLA_COMPLEX;
  //datatype = FLA_DOUBLE_COMPLEX;


  for ( p = p_first, i = 1; p <= p_last; p += p_inc, i += 1 )
  {
    m = m_input;

    if( m < 0 ) m = p / abs(m_input);

    for ( param_combo = 0; param_combo < n_param_combos; param_combo++ ){

      FLA_Obj_create( datatype, m, m, 0, 0, &A );
      FLA_Obj_create( datatype, m, m, 0, 0, &C );
      FLA_Obj_create( datatype, m, m, 0, 0, &C_ref );

      FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &scale );
      FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );

      FLA_Random_tri_matrix( FLA_UPPER_TRIANGULAR, FLA_NONUNIT_DIAG, A );
      FLA_Triangularize( FLA_UPPER_TRIANGULAR, FLA_NONUNIT_DIAG, A );
      FLA_Norm1( A, norm );
      FLA_Shift_diag( FLA_NO_CONJUGATE, norm, A );

      FLA_Random_matrix( C );
      FLA_Hermitianize( FLA_UPPER_TRIANGULAR, C );

      fprintf( stdout, "data_lyap_%s( %d, 1:5 ) = [ %d  ", pc_str[param_combo], i, p );
      fflush( stdout );


      time_Lyap( param_combo, FLA_ALG_REFERENCE, n_repeats, m,
                 isgn, A, C, C_ref, scale, &dtime, &diff, &gflops );

      fprintf( stdout, "%6.3lf %6.2le ", gflops, diff );
      fflush( stdout );

      time_Lyap( param_combo, FLA_ALG_FRONT, n_repeats, m,
                 isgn, A, C, C_ref, scale, &dtime, &diff, &gflops );

      fprintf( stdout, "%6.3lf %6.2le ", gflops, diff );
      fflush( stdout );


      fprintf( stdout, " ]; \n" );
      fflush( stdout );

      FLA_Obj_free( &A );
      FLA_Obj_free( &C );
      FLA_Obj_free( &C_ref );
      FLA_Obj_free( &scale );
      FLA_Obj_free( &norm );
    }

    fprintf( stdout, "\n" );
  }


/*
  fprintf( stdout, "figure;\n" );

  fprintf( stdout, "hold on;\n" );

  for ( i = 0; i < n_param_combos; i++ ) {
    fprintf( stdout, "plot( data_lyap_%s( :,1 ), data_lyap_%s( :, 2 ), '%c:%c' ); \n",
            pc_str[i], pc_str[i], colors[ i ], ticks[ i ] );
    fprintf( stdout, "plot( data_lyap_%s( :,1 ), data_lyap_%s( :, 4 ), '%c-.%c' ); \n",
            pc_str[i], pc_str[i], colors[ i ], ticks[ i ] );
  }

  fprintf( stdout, "legend( ... \n" );

  for ( i = 0; i < n_param_combos; i++ )
    fprintf( stdout, "'ref\\_lyap\\_%s', 'fla\\_lyap\\_%s', ... \n", pc_str[i], pc_str[i] );

  fprintf( stdout, "'Location', 'SouthEast' ); \n" );

  fprintf( stdout, "xlabel( 'problem size p' );\n" );
  fprintf( stdout, "ylabel( 'GFLOPS/sec.' );\n" );
  fprintf( stdout, "axis( [ 0 %d 0 %.2f ] ); \n", p_last, max_gflops );
  fprintf( stdout, "title( 'FLAME lyap front-end performance (%s)' );\n", m_dim_desc );
  fprintf( stdout, "print -depsc lyap_front_%s.eps\n", m_dim_tag );
  fprintf( stdout, "hold off;\n");
  fflush( stdout );
*/

  FLA_Finalize( );

  return 0;
}
Exemple #29
0
void time_Lyap(
                int param_combo, int type, int nrepeats, int m,
                FLA_Obj isgn, FLA_Obj A, FLA_Obj C, FLA_Obj scale,
                double *dtime, double *diff, double *gflops )
{
  int
    irep;

  double
    dtime_old = 1.0e9;

  FLA_Obj
    C_save, norm;

  if ( param_combo == 0 && type == FLA_ALG_FRONT )
  {
    *gflops = 0.0;
    *diff   = 0.0;
    return;
  }

  FLASH_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &C_save );
  FLA_Obj_create( FLA_Obj_datatype_proj_to_real( C ), 1, 1, 0, 0, &norm );

  FLASH_Copy( C, C_save );

  for ( irep = 0 ; irep < nrepeats; irep++ )
  {
    FLASH_Copy( C_save, C );

    *dtime = FLA_Clock();

    switch( param_combo ){

    case 0:{
      switch( type ){
      //case FLA_ALG_REFERENCE:
      //  REF_Lyap( FLA_NO_TRANSPOSE, isgn, A_flat, C_flat, scale );
      //  break;
      case FLA_ALG_FRONT:
        FLASH_Lyap( FLA_NO_TRANSPOSE, isgn, A, C, scale );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    case 1:{
      switch( type ){
      //case FLA_ALG_REFERENCE:
      //  REF_Lyap( FLA_CONJ_TRANSPOSE, isgn, A_flat, C_flat, scale );
      //  break;
      case FLA_ALG_FRONT:
        FLASH_Lyap( FLA_CONJ_TRANSPOSE, isgn, A, C, scale );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    }

    *dtime = FLA_Clock() - *dtime;
    dtime_old = min( *dtime, dtime_old );
  }

/*
  if ( type == FLA_ALG_REFERENCE )
  {
    FLASH_Obj_hierarchify( C_flat, C_ref );
    *diff = 0.0;
  }
  else
  {
    *diff = FLASH_Max_elemwise_diff( C, C_ref );
  }
*/

  {
    FLA_Obj X, W;

    FLASH_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &X );
    FLASH_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &W );

    FLASH_Copy( C, X );
    FLASH_Hermitianize( FLA_UPPER_TRIANGULAR, X );

    if ( param_combo == 0 )
    {
      FLASH_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,   FLA_ONE, A, X, FLA_ZERO, W );
      FLASH_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, X, A, FLA_ONE,  W );
    }
    else if ( param_combo == 1 )
    {
      FLASH_Gemm( FLA_CONJ_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A, X, FLA_ZERO, W );
      FLASH_Gemm( FLA_NO_TRANSPOSE,   FLA_NO_TRANSPOSE, FLA_ONE, X, A, FLA_ONE,  W );
    }

    FLASH_Scal( isgn, W );

    FLASH_Axpy( FLA_MINUS_ONE, C_save, W );
    FLASH_Norm1( W, norm );
    FLA_Obj_extract_real_scalar( norm, diff );

    FLASH_Obj_free( &X );
    FLASH_Obj_free( &W );
  }

  *gflops = ( 2.0 / 3.0 ) * ( m * m * m ) / 
            dtime_old / 1e9;

  if ( FLA_Obj_is_complex( C ) )
    *gflops *= 4.0;

  *dtime = dtime_old;

  FLASH_Copy( C_save, C );

  FLASH_Obj_free( &C_save );
  FLA_Obj_free( &norm );
}
Exemple #30
0
void libfla_test_hessut_experiment( test_params_t params,
                                    unsigned int  var,
                                    char*         sc_str,
                                    FLA_Datatype  datatype,
                                    unsigned int  p_cur,
                                    unsigned int  pci,
                                    unsigned int  n_repeats,
                                    signed int    impl,
                                    double*       perf,
                                    double*       residual )
{
	dim_t        b_alg_flat = params.b_alg_flat;
	double       time_min   = 1e9;
	double       time;
	unsigned int i;
	unsigned int m;
	signed int   m_input    = -1;
	FLA_Obj      A, T, W, Qh, AQ, QhAQ, norm;
	FLA_Obj      AT, AB;
	FLA_Obj      QhT, QhB;
	FLA_Obj      A_save;

	// Determine the dimensions.
	if ( m_input < 0 ) m = p_cur * abs(m_input);
	else               m = p_cur;

	// Create the matrices for the current operation.
	libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[0], m, m, &A );

	if ( impl == FLA_TEST_FLAT_FRONT_END ||
	     impl == FLA_TEST_FLAT_BLK_VAR )
	{
		libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], b_alg_flat, m, &T );
		libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], b_alg_flat, m, &W );
	}
	else
	{
		libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], m, m, &T );
		libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], m, m, &W );
	}

	// Initialize the test matrices.
	FLA_Random_matrix( A );

	// Save the original object contents in a temporary object.
	FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &A_save );

	// Create auxiliary matrices to be used when checking the result.
	FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &Qh );
	FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &AQ );
	FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &QhAQ );

	// Create a real scalar object to hold the norm of A.
	FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );

	// Create a control tree for the individual variants.
	if ( impl == FLA_TEST_FLAT_UNB_VAR ||
	     impl == FLA_TEST_FLAT_OPT_VAR ||
	     impl == FLA_TEST_FLAT_BLK_VAR )
		libfla_test_hessut_cntl_create( var, b_alg_flat );

	// Repeat the experiment n_repeats times and record results.
	for ( i = 0; i < n_repeats; ++i )
	{
		FLA_Copy_external( A_save, A );
		
		time = FLA_Clock();

		libfla_test_hessut_impl( impl, A, T );
		
		time = FLA_Clock() - time;
		time_min = min( time_min, time );
	}

	// Free the control trees if we're testing the variants.
	if ( impl == FLA_TEST_FLAT_UNB_VAR ||
	     impl == FLA_TEST_FLAT_OPT_VAR ||
	     impl == FLA_TEST_FLAT_BLK_VAR )
		libfla_test_hessut_cntl_free();

	// Compute the performance of the best experiment repeat.
	*perf = ( 10.0 / 3.0 * m * m * m ) / time_min / FLOPS_PER_UNIT_PERF;
	if ( FLA_Obj_is_complex( A ) ) *perf *= 4.0;

	// Check the result by computing R - Q' A_orig Q.
	FLA_Set_to_identity( Qh );
	FLA_Part_2x1( Qh,   &QhT,
	                    &QhB,   1, FLA_TOP );
	FLA_Part_2x1( A,    &AT,
	                    &AB,    1, FLA_TOP );
	FLA_Apply_Q_UT( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_FORWARD, FLA_COLUMNWISE,
	                AB, T, W, QhB );
	FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
	          FLA_ONE, A_save, Qh, FLA_ZERO, AQ );
	FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
	          FLA_ONE, Qh, AQ, FLA_ZERO, QhAQ );
	FLA_Triangularize( FLA_UPPER_TRIANGULAR, FLA_NONUNIT_DIAG, AB );
	*residual = FLA_Max_elemwise_diff( A, QhAQ );

	// Free the supporting flat objects.
	FLA_Obj_free( &W );
	FLA_Obj_free( &Qh );
	FLA_Obj_free( &AQ );
	FLA_Obj_free( &QhAQ );
	FLA_Obj_free( &norm );
	FLA_Obj_free( &A_save );

	// Free the flat test matrices.
	FLA_Obj_free( &A );
	FLA_Obj_free( &T );
}