FLA_Error FLA_Obj_create_buffer_task( dim_t rs, dim_t cs, FLA_Obj obj, void* cntl )
{
  FLA_Error r_val;

  r_val = FLA_Obj_create_buffer( rs, cs, &obj );

  FLA_Set( FLA_ZERO, obj );

  return r_val;
}
예제 #2
0
FLA_Error FLA_Set_to_identity( FLA_Obj A )
{
  if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING )
    FLA_Set_to_identity_check( A );

  FLA_Set( FLA_ZERO, A );

  FLA_Set_diag( FLA_ONE, A );

  return FLA_SUCCESS;
}
예제 #3
0
FLA_Error FLA_Hess_UT_blk_var4( FLA_Obj A, FLA_Obj T )
{
  FLA_Obj  ATL,   ATR,      A00, A01, A02, 
           ABL,   ABR,      A10, A11, A12,
                            A20, A21, A22;
  FLA_Obj  UT,              U0,
           UB,              U1,
                            U2;
  FLA_Obj  YT,              Y0,
           YB,              Y1,
                            Y2;
  FLA_Obj  ZT,              Z0,
           ZB,              Z1,
                            Z2;
  FLA_Obj  TL,    TR,       T0, T1, T2; 

  FLA_Obj  U, Y, Z;
  FLA_Obj  ABR_l;
  FLA_Obj  UB_l, U2_l;
  FLA_Obj  YB_l, Y2_l;
  FLA_Obj  ZB_l, Z2_l;
  FLA_Obj  WT_l;
  FLA_Obj  T1_tl;
  FLA_Obj  none, none2, none3;
  FLA_Obj  UB_tl,
           UB_bl;
  FLA_Datatype datatype_A;
  dim_t        m_A;
  dim_t        b_alg, b, bb;

  b_alg      = FLA_Obj_length( T );

  datatype_A = FLA_Obj_datatype( A );
  m_A        = FLA_Obj_length( A );

  FLA_Obj_create( datatype_A, m_A,    b_alg, 0, 0, &U );
  FLA_Obj_create( datatype_A, m_A,    b_alg, 0, 0, &Y );
  FLA_Obj_create( datatype_A, m_A,    b_alg, 0, 0, &Z );

  FLA_Part_2x2( A,    &ATL, &ATR,
                      &ABL, &ABR,     0, 0, FLA_TL );
  FLA_Part_2x1( U,    &UT, 
                      &UB,            0, FLA_TOP );
  FLA_Part_2x1( Y,    &YT, 
                      &YB,            0, FLA_TOP );
  FLA_Part_2x1( Z,    &ZT, 
                      &ZB,            0, FLA_TOP );
  FLA_Part_1x2( T,    &TL,  &TR,      0, FLA_LEFT ); 

  while ( FLA_Obj_length( ATL ) < FLA_Obj_length( A ) )
  {
    b = min( FLA_Obj_length( ABR ), b_alg );

    FLA_Repart_2x2_to_3x3( ATL, /**/ ATR,       &A00, /**/ &A01, &A02,
                        /* ************* */   /* ******************** */
                                                &A10, /**/ &A11, &A12,
                           ABL, /**/ ABR,       &A20, /**/ &A21, &A22,
                           b, b, FLA_BR );
    FLA_Repart_2x1_to_3x1( UT,                &U0, 
                        /* ** */            /* ** */
                                              &U1, 
                           UB,                &U2,        b, FLA_BOTTOM );
    FLA_Repart_2x1_to_3x1( YT,                &Y0, 
                        /* ** */            /* ** */
                                              &Y1, 
                           YB,                &Y2,        b, FLA_BOTTOM );
    FLA_Repart_2x1_to_3x1( ZT,                &Z0, 
                        /* ** */            /* ** */
                                              &Z1, 
                           ZB,                &Z2,        b, FLA_BOTTOM );
    FLA_Repart_1x2_to_1x3( TL,  /**/ TR,        &T0, /**/ &T1, &T2,
                           b, FLA_RIGHT );

    /*------------------------------------------------------------*/

    FLA_Part_2x2( T1,     &T1_tl, &none,   
                          &none2, &none3,   b, b, FLA_TL ); 

    bb = min( FLA_Obj_length( ABR ) - 1, b_alg );

    FLA_Part_1x2( ABR,    &ABR_l, &none,    bb, FLA_LEFT ); 
    FLA_Part_1x2( UB,     &UB_l,  &none,    bb, FLA_LEFT ); 
    FLA_Part_1x2( YB,     &YB_l,  &none,    bb, FLA_LEFT ); 
    FLA_Part_1x2( ZB,     &ZB_l,  &none,    bb, FLA_LEFT ); 

    FLA_Part_2x1( UB_l,   &none,
                          &U2_l,             b, FLA_TOP );
    FLA_Part_2x1( YB_l,   &none,
                          &Y2_l,             b, FLA_TOP );
    FLA_Part_2x1( ZB_l,   &none,
                          &Z2_l,             b, FLA_TOP );

    // [ ABR, YB, ZB, T1 ] = FLA_Hess_UT_step_unb_var4( ABR, YB, ZB, T1, b );
    //FLA_Hess_UT_step_unb_var4( ABR, YB, ZB, T1_tl );
    //FLA_Hess_UT_step_ofu_var4( ABR, YB, ZB, T1_tl );
    FLA_Hess_UT_step_opt_var4( ABR, YB, ZB, T1_tl );

    // Build UB from ABR, with explicit unit subdiagonal and zeros.
    FLA_Copy_external( ABR_l, UB_l );
    FLA_Part_2x1( UB_l,   &UB_tl, 
                          &UB_bl,            1, FLA_TOP );
    FLA_Triangularize( FLA_LOWER_TRIANGULAR, FLA_UNIT_DIAG, UB_bl );
    FLA_Set( FLA_ZERO, UB_tl );

    // ATR = ATR - ATR * UB * inv( triu( T ) ) * UB' );
    if ( FLA_Obj_length( ATR ) > 0 )
    {
      // NOTE: We use ZT as temporary workspace.
      FLA_Part_1x2( ZT,     &WT_l,  &none,    bb, FLA_LEFT ); 
      FLA_Part_2x2( T1,     &T1_tl, &none,   
                            &none2, &none3,   bb, bb, FLA_TL ); 

      // WT_l = ATR * UB_l * inv( triu( T ) ).
      FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
                         FLA_ONE, ATR, UB_l, FLA_ZERO, WT_l );
      FLA_Trsm_external( FLA_RIGHT, FLA_UPPER_TRIANGULAR, 
                         FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_ONE, T1_tl, WT_l );

      // ATR = ATR - WT_l * UB_l'
      FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
                         FLA_MINUS_ONE, WT_l, UB_l, FLA_ONE, ATR );
    }

    // A22 = A22 - U2 * Y2' - Z2 * U2';
    FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
                       FLA_MINUS_ONE, U2_l, Y2_l, FLA_ONE, A22 );
    FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
                       FLA_MINUS_ONE, Z2_l, U2_l, FLA_ONE, A22 );

    /*------------------------------------------------------------*/

    FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR,       A00, A01, /**/ A02,
                                                     A10, A11, /**/ A12,
                            /* ************** */  /* ****************** */
                              &ABL, /**/ &ABR,       A20, A21, /**/ A22,
                              FLA_TL );
    FLA_Cont_with_3x1_to_2x1( &UT,                U0, 
                                                  U1, 
                            /* ** */           /* ** */
                              &UB,                U2,     FLA_TOP );
    FLA_Cont_with_3x1_to_2x1( &YT,                Y0, 
                                                  Y1, 
                            /* ** */           /* ** */
                              &YB,                Y2,     FLA_TOP );
    FLA_Cont_with_3x1_to_2x1( &ZT,                Z0, 
                                                  Z1, 
                            /* ** */           /* ** */
                              &ZB,                Z2,     FLA_TOP );
    FLA_Cont_with_1x3_to_1x2( &TL,  /**/ &TR,        T0, T1, /**/ T2,
                              FLA_LEFT );
  }

  FLA_Obj_free( &U );
  FLA_Obj_free( &Y );
  FLA_Obj_free( &Z );

  return FLA_SUCCESS;
}
예제 #4
0
파일: test_symm.c 프로젝트: flame/libflame
void libfla_test_symm_experiment( test_params_t params,
                                  unsigned int  var,
                                  char*         sc_str,
                                  FLA_Datatype  datatype,
                                  unsigned int  p_cur,
                                  unsigned int  pci,
                                  unsigned int  n_repeats,
                                  signed int    impl,
                                  double*       perf,
                                  double*       residual )
{
	dim_t        b_flash    = params.b_flash;
	dim_t        b_alg_flat = params.b_alg_flat;
	double       time_min   = 1e9;
	double       time;
	unsigned int i;
	unsigned int m;
	signed int   m_input    = -1;
	unsigned int n;
	signed int   n_input    = -1;
	FLA_Side     side;
	FLA_Uplo     uplo;
	FLA_Obj      A, B, C, x, y, z, w, norm;
	FLA_Obj      alpha, beta;
	FLA_Obj      C_save;
	FLA_Obj      A_test, B_test, C_test;

	// Determine the dimensions.
	if ( m_input < 0 ) m = p_cur / abs(m_input);
	else               m = p_cur;
	if ( n_input < 0 ) n = p_cur / abs(n_input);
	else               n = p_cur;

	// Translate parameter characters to libflame constants.
	FLA_Param_map_char_to_flame_side( &pc_str[pci][0], &side );
	FLA_Param_map_char_to_flame_uplo( &pc_str[pci][1], &uplo );

	// Create the matrices for the current operation.
	if ( side == FLA_LEFT )
	{
		libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[0], m, m, &A );

		// Create vectors for use in test.
		FLA_Obj_create( datatype, n, 1, 0, 0, &x );
		FLA_Obj_create( datatype, m, 1, 0, 0, &y );
		FLA_Obj_create( datatype, m, 1, 0, 0, &z );
		FLA_Obj_create( datatype, m, 1, 0, 0, &w );
	}
	else
	{
		libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[0], n, n, &A );

		// Create vectors for use in test.
		FLA_Obj_create( datatype, n, 1, 0, 0, &x );
		FLA_Obj_create( datatype, m, 1, 0, 0, &y );
		FLA_Obj_create( datatype, m, 1, 0, 0, &z );
		FLA_Obj_create( datatype, n, 1, 0, 0, &w );
	}
	libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], m, n, &B );
	libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[2], m, n, &C );

	// Create a norm scalar.
	FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );

	// Initialize the test matrices.
	FLA_Random_symm_matrix( uplo, A );
	FLA_Random_matrix( B );
	FLA_Random_matrix( C );

	// Initialize the test vectors.
	FLA_Random_matrix( x );
    FLA_Set( FLA_ZERO, y );
    FLA_Set( FLA_ZERO, z );
    FLA_Set( FLA_ZERO, w );

	// Set constants.
	alpha = FLA_TWO;
	beta  = FLA_MINUS_ONE;

	// Save the original object contents in a temporary object.
	FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, C, &C_save );

	// Use hierarchical matrices if we're testing the FLASH front-end.
	if ( impl == FLA_TEST_HIER_FRONT_END )
	{
		FLASH_Obj_create_hier_copy_of_flat( A, 1, &b_flash, &A_test );
		FLASH_Obj_create_hier_copy_of_flat( B, 1, &b_flash, &B_test );
		FLASH_Obj_create_hier_copy_of_flat( C, 1, &b_flash, &C_test );
	}
	else
	{
		A_test = A;
		B_test = B;
		C_test = C;
	}

	// Create a control tree for the individual variants.
	if ( impl == FLA_TEST_FLAT_UNB_VAR ||
	     impl == FLA_TEST_FLAT_OPT_VAR ||
	     impl == FLA_TEST_FLAT_BLK_VAR ||
	     impl == FLA_TEST_FLAT_UNB_EXT ||
	     impl == FLA_TEST_FLAT_BLK_EXT )
		libfla_test_symm_cntl_create( var, b_alg_flat );

	// Repeat the experiment n_repeats times and record results.
	for ( i = 0; i < n_repeats; ++i )
	{
		if ( impl == FLA_TEST_HIER_FRONT_END )
			FLASH_Obj_hierarchify( C_save, C_test );
		else
			FLA_Copy_external( C_save, C_test );
		
		time = FLA_Clock();

		libfla_test_symm_impl( impl, side, uplo, alpha, A_test, B_test, beta, C_test );
		
		time = FLA_Clock() - time;
		time_min = min( time_min, time );
	}

	// Copy the solution to flat matrix X.
	if ( impl == FLA_TEST_HIER_FRONT_END )
	{
		FLASH_Obj_flatten( C_test, C );
	}
	else
    {
		// No action needed since C_test and C refer to the same object.
	}

	// Free the hierarchical matrices if we're testing the FLASH front-end.
	if ( impl == FLA_TEST_HIER_FRONT_END )
	{
		FLASH_Obj_free( &A_test );
		FLASH_Obj_free( &B_test );
		FLASH_Obj_free( &C_test );
	}

	// Free the control trees if we're testing the variants.
	if ( impl == FLA_TEST_FLAT_UNB_VAR ||
	     impl == FLA_TEST_FLAT_OPT_VAR ||
	     impl == FLA_TEST_FLAT_BLK_VAR ||
	     impl == FLA_TEST_FLAT_UNB_EXT ||
	     impl == FLA_TEST_FLAT_BLK_EXT )
		libfla_test_symm_cntl_free();

	// Compute the performance of the best experiment repeat.
	if ( side == FLA_LEFT )
		*perf = ( 1 * m * m * n ) / time_min / FLOPS_PER_UNIT_PERF;
	else
		*perf = ( 1 * m * n * n ) / time_min / FLOPS_PER_UNIT_PERF;
	if ( FLA_Obj_is_complex( A ) ) *perf *= 4.0;

	// Compute:
	//   y = C * x
	// and compare to
	//   z = ( beta * C_orig + alpha * A * B ) x      (side = left)
	//   z = ( beta * C_orig + alpha * B * A ) x      (side = right)
	FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_ONE, C, x, FLA_ZERO, y );

	if ( side == FLA_LEFT )
	{
		FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_ONE, B, x, FLA_ZERO, w );
		FLA_Symv_external( uplo,             alpha,   A, w, FLA_ZERO, z );
	}
	else
	{
		FLA_Symv_external( uplo,             FLA_ONE, A, x, FLA_ZERO, w );
		FLA_Gemv_external( FLA_NO_TRANSPOSE, alpha,   B, w, FLA_ZERO, z );
	}
	FLA_Gemv_external( FLA_NO_TRANSPOSE, beta, C_save, x, FLA_ONE, z );

	// Compute || y - z ||.
	//FLA_Axpy_external( FLA_MINUS_ONE, y, z );
	//FLA_Nrm2_external( z, norm );
	//FLA_Obj_extract_real_scalar( norm, residual );
	*residual = FLA_Max_elemwise_diff( y, z );

	// Free the supporting flat objects.
	FLA_Obj_free( &C_save );

	// Free the flat test matrices.
	FLA_Obj_free( &A );
	FLA_Obj_free( &B );
	FLA_Obj_free( &C );
	FLA_Obj_free( &x );
	FLA_Obj_free( &y );
	FLA_Obj_free( &z );
	FLA_Obj_free( &w );
	FLA_Obj_free( &norm );
}
예제 #5
0
FLA_Error FLA_Hevd_lv_var4_components( dim_t n_iter_max, FLA_Obj A, FLA_Obj l, dim_t k_accum, dim_t b_alg,
                                       double* dtime_tred, double* dtime_tevd, double* dtime_appq )
{
	FLA_Error    r_val = FLA_SUCCESS;
	FLA_Uplo     uplo = FLA_LOWER_TRIANGULAR;
	FLA_Datatype dt;
	FLA_Datatype dt_real;
	FLA_Datatype dt_comp;
	FLA_Obj      T, r, d, e, G, R, W;
	FLA_Obj      d0, e0, ls, pu;
	dim_t        mn_A;
	dim_t        n_G = k_accum;
	double       dtime_temp;

	mn_A    = FLA_Obj_length( A );
	dt      = FLA_Obj_datatype( A );
	dt_real = FLA_Obj_datatype_proj_to_real( A );
	dt_comp = FLA_Obj_datatype_proj_to_complex( A );

	*dtime_tred = 1;
	*dtime_tevd = 1;
	*dtime_appq = 1;

	// If the matrix is a scalar, then the EVD is easy.
	if ( mn_A == 1 )
	{
		FLA_Copy( A, l );
		FLA_Set( FLA_ONE, A );

		return FLA_SUCCESS;
	}

	// Create a matrix to hold block Householder transformations.
	FLA_Tridiag_UT_create_T( A, &T );

	// Create a vector to hold the realifying scalars.
	FLA_Obj_create( dt,      mn_A,     1, 0, 0, &r );

	// Create vectors to hold the diagonal and sub-diagonal.
	FLA_Obj_create( dt_real, mn_A,     1, 0, 0, &d );
	FLA_Obj_create( dt_real, mn_A-1,   1, 0, 0, &e );
	FLA_Obj_create( dt_real, mn_A,     1, 0, 0, &d0 );
	FLA_Obj_create( dt_real, mn_A-1,   1, 0, 0, &e0 );
	FLA_Obj_create( dt_real, mn_A,     1, 0, 0, &pu );
	FLA_Obj_create( FLA_INT, mn_A,     1, 0, 0, &ls );
	FLA_Obj_create( dt_comp, mn_A-1, n_G, 0, 0, &G );
	FLA_Obj_create( dt_real, mn_A,  mn_A, 0, 0, &R );
	FLA_Obj_create( dt,      mn_A,  mn_A, 0, 0, &W );

  dtime_temp = FLA_Clock();
  {
	// Reduce the matrix to tridiagonal form.
	FLA_Tridiag_UT( uplo, A, T );
  }
  *dtime_tred = FLA_Clock() - dtime_temp;

	// Apply scalars to rotate elements on the sub-diagonal to the real domain.
	FLA_Tridiag_UT_realify( uplo, A, r );

	// Extract the diagonal and sub-diagonal from A.
	FLA_Tridiag_UT_extract_diagonals( uplo, A, d, e );

  dtime_temp = FLA_Clock();
  {
	// Form Q, overwriting A.
	FLA_Tridiag_UT_form_Q( uplo, A, T );
  }
  *dtime_appq = FLA_Clock() - dtime_temp;

	// Apply the scalars in r to Q.
	FLA_Apply_diag_matrix( FLA_RIGHT, FLA_CONJUGATE, r, A );

	// Find the eigenvalues only.
	FLA_Copy( d, d0 ); FLA_Copy( e, e0 );
	//r_val = FLA_Tevd_n_opt_var1( n_iter_max, d0, e0, G, A );
{
	int info;
	double* buff_d = FLA_DOUBLE_PTR( d0 );
	double* buff_e = FLA_DOUBLE_PTR( e0 );
	dsterf_( &mn_A, buff_d, buff_e, &info );
}
	FLA_Sort( FLA_FORWARD, d0 );
	FLA_Set( FLA_ZERO, ls );
	FLA_Set( FLA_ZERO, pu );

  dtime_temp = FLA_Clock();
  {
	// Perform an eigenvalue decomposition on the tridiagonal matrix.
	r_val = FLA_Tevd_v_opt_var4( n_iter_max, d, e, d0, ls, pu, G, R, W, A, b_alg );
  }
  *dtime_tevd = FLA_Clock() - dtime_temp;

	// Copy the converged eigenvalues to the output vector.
	FLA_Copy( d, l );

	// Sort the eigenvalues and eigenvectors in ascending order.
	FLA_Sort_evd( FLA_FORWARD, l, A );

	FLA_Obj_free( &T );
	FLA_Obj_free( &r );
	FLA_Obj_free( &d );
	FLA_Obj_free( &e );
	FLA_Obj_free( &d0 );
	FLA_Obj_free( &pu );
	FLA_Obj_free( &e0 );
	FLA_Obj_free( &ls );
	FLA_Obj_free( &G );
	FLA_Obj_free( &R );
	FLA_Obj_free( &W );

	return r_val;
}
예제 #6
0
int main(int argc, char *argv[])
{
  int 
    m_input,
    m,
    p_first, p_last, p_inc,
    p,
    k_accum,
    b_alg,
    n_iter_max,
    variant,
    n_repeats,
    i,
    n_variants = 2;

  char *colors = "brkgmcbrkg";
  char *ticks  = "o+*xso+*xs";
  char m_dim_desc[14];
  char m_dim_tag[10];

  double max_gflops=6.0;

  double
    dtime,
    gflops,
    diff1, diff2;

  FLA_Datatype datatype, dt_real;

  FLA_Obj
    A, l, Q, Ql, TT, r, d, e, A_orig, G, R, W2, de, alpha;

  FLA_Init();


  fprintf( stdout, "%c number of repeats:", '%' );
  scanf( "%d", &n_repeats );
  fprintf( stdout, "%c %d\n", '%', n_repeats );

  fprintf( stdout, "%c enter n_iter_max (per eigenvalue): ", '%' );
  scanf( "%d", &n_iter_max );
  fprintf( stdout, "%c %d\n", '%', n_iter_max );

  fprintf( stdout, "%c enter number of sets of Givens rotations to accumulate:", '%' );
  scanf( "%d", &k_accum );
  fprintf( stdout, "%c %d\n", '%', k_accum );

  fprintf( stdout, "%c enter blocking size for application of G:", '%' );
  scanf( "%d", &b_alg );
  fprintf( stdout, "%c %d\n", '%', b_alg );

  fprintf( stdout, "%c enter problem size first, last, inc:", '%' );
  scanf( "%d%d%d", &p_first, &p_last, &p_inc );
  fprintf( stdout, "%c %d %d %d\n", '%', p_first, p_last, p_inc );

  fprintf( stdout, "%c enter m (-1 means bind to problem size): ", '%' );
  scanf( "%d", &m_input );
  fprintf( stdout, "%c %d\n", '%', m_input );


  fprintf( stdout, "\n" );


  if     ( m_input >  0 ) {
    sprintf( m_dim_desc, "m = %d", m_input );
    sprintf( m_dim_tag,  "m%dc", m_input);
  }
  else if( m_input <  -1 ) {
    sprintf( m_dim_desc, "m = p/%d", -m_input );
    sprintf( m_dim_tag,  "m%dp", -m_input );
  }
  else if( m_input == -1 ) {
    sprintf( m_dim_desc, "m = p" );
    sprintf( m_dim_tag,  "m%dp", 1 );
  }


  for ( p = p_first, i = 1; p <= p_last; p += p_inc, i += 1 )
  {

    m = m_input;

    if( m < 0 ) m = p / abs(m_input);

    //datatype = FLA_FLOAT;
    //datatype = FLA_DOUBLE;
    //datatype = FLA_COMPLEX;
    datatype = FLA_DOUBLE_COMPLEX;

    FLA_Obj_create( datatype,  m,         m, 0, 0, &A );
    FLA_Obj_create( datatype,  m,         m, 0, 0, &A_orig );
    FLA_Obj_create( datatype,  m,         m, 0, 0, &Q );
    FLA_Obj_create( datatype,  m,         m, 0, 0, &Ql );
    FLA_Obj_create( datatype,  m,         1, 0, 0, &r );
    FLA_Obj_create( datatype,  m,         m, 0, 0, &W2 );
    FLA_Obj_create( datatype,  m-1, k_accum, 0, 0, &G );

	dt_real = FLA_Obj_datatype_proj_to_real( A );

    FLA_Obj_create( dt_real, m,      1, 0, 0, &l );
    FLA_Obj_create( dt_real, m,      1, 0, 0, &d );
    FLA_Obj_create( dt_real, m-1,    1, 0, 0, &e );
    FLA_Obj_create( dt_real, m,      m, 0, 0, &R );

	FLA_Obj_create( dt_real, 1,      1, 0, 0, &alpha );

	*FLA_DOUBLE_PTR( alpha ) = 1.0 / ( sqrt( sqrt( (double) m ) ) );

	FLA_Random_unitary_matrix( Q );

	//FLA_Fill_with_uniform_dist( FLA_ONE,   l );
	//FLA_Fill_with_inverse_dist( FLA_ONE,   l );
	FLA_Fill_with_geometric_dist( alpha,   l );


    {
      FLA_Copy( Q, Ql );
      FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, l, Ql );
      FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
                FLA_ONE, Ql, Q, FLA_ZERO, A );
      FLA_Triangularize( FLA_LOWER_TRIANGULAR, FLA_NONUNIT_DIAG, A );
      FLA_Copy( A, A_orig );
    }

    FLA_Set( FLA_ZERO, l );
    FLA_Set( FLA_ZERO, Q );

	FLA_Tridiag_UT_create_T( A, &TT );
	FLA_Tridiag_UT( FLA_LOWER_TRIANGULAR, A, TT );
	FLA_Tridiag_UT_realify( FLA_LOWER_TRIANGULAR, A, r );
	FLA_Tridiag_UT_extract_diagonals( FLA_LOWER_TRIANGULAR, A, d, e );
	FLA_Tridiag_UT_form_Q( FLA_LOWER_TRIANGULAR, A, TT );
	FLA_Apply_diag_matrix( FLA_RIGHT, FLA_CONJUGATE, r, A );
    FLA_Obj_free( &TT );

    time_Tevd_v( 0, FLA_ALG_REFERENCE, n_repeats, m, k_accum, b_alg, n_iter_max,
                 A_orig, d, e, G, R, W2, A, l, &dtime, &diff1, &diff2, &gflops );

    fprintf( stdout, "data_REFq( %d, 1:3 ) = [ %d %6.3lf %9.2e %6.2le %6.2le ]; \n", i, p, gflops, dtime, diff1, diff2 );
    fflush( stdout );

    for ( variant = 1; variant <= n_variants; variant++ ){
      
      fprintf( stdout, "data_var%d( %d, 1:3 ) = [ %d ", variant, i, p );
      fflush( stdout );

      time_Tevd_v( variant, FLA_ALG_UNB_OPT, n_repeats, m, k_accum, b_alg, n_iter_max,
                   A_orig, d, e, G, R, W2, A, l, &dtime, &diff1, &diff2, &gflops );

      fprintf( stdout, "%6.3lf %9.2e %6.2le %6.2le ", gflops, dtime, diff1, diff2 );
      fflush( stdout );

      fprintf( stdout, "];\n" );
      fflush( stdout );
    }

    fprintf( stdout, "\n" );

    FLA_Obj_free( &A );
    FLA_Obj_free( &A_orig );
    FLA_Obj_free( &Q );
    FLA_Obj_free( &Ql );
    FLA_Obj_free( &G );
    FLA_Obj_free( &W2 );
    FLA_Obj_free( &r );
    FLA_Obj_free( &l );
    FLA_Obj_free( &d );
    FLA_Obj_free( &e );
    FLA_Obj_free( &R );
    FLA_Obj_free( &alpha );
  }

/*
  fprintf( stdout, "figure;\n" );

  fprintf( stdout, "plot( data_REF( :,1 ), data_REF( :, 2 ), '-' ); \n" );

  fprintf( stdout, "hold on;\n" );

  for ( i = 1; i <= n_variants; i++ ) {
    fprintf( stdout, "plot( data_var%d( :,1 ), data_var%d( :, 2 ), '%c:%c' ); \n",
            i, i, colors[ i-1 ], ticks[ i-1 ] );
    fprintf( stdout, "plot( data_var%d( :,1 ), data_var%d( :, 4 ), '%c-.%c' ); \n",
            i, i, colors[ i-1 ], ticks[ i-1 ] );
  }

  fprintf( stdout, "legend( ... \n" );
  fprintf( stdout, "'Reference', ... \n" );

  for ( i = 1; i < n_variants; i++ )
    fprintf( stdout, "'unb\\_var%d', 'blk\\_var%d', ... \n", i, i );
  fprintf( stdout, "'unb\\_var%d', 'blk\\_var%d' ); \n", i, i );

  fprintf( stdout, "xlabel( 'problem size p' );\n" );
  fprintf( stdout, "ylabel( 'GFLOPS/sec.' );\n" );
  fprintf( stdout, "axis( [ 0 %d 0 %.2f ] ); \n", p_last, max_gflops );
  fprintf( stdout, "title( 'FLAME Hevd_lv performance (%s, %s)' );\n", 
           m_dim_desc, n_dim_desc );
  fprintf( stdout, "print -depsc tridiag_%s_%s.eps\n", m_dim_tag, n_dim_tag );
  fprintf( stdout, "hold off;\n");
  fflush( stdout );
*/

  FLA_Finalize( );

  return 0;
}
예제 #7
0
FLA_Error FLA_Hess_UT_step_unb_var2( FLA_Obj A, FLA_Obj T )
{
  FLA_Obj  ATL,   ATR,      A00,  a01,     A02, 
           ABL,   ABR,      a10t, alpha11, a12t,
                            A20,  a21,     A22;
  FLA_Obj  TTL,   TTR,      T00,  t01,   T02, 
           TBL,   TBR,      t10t, tau11, t12t,
                            T20,  t21,   T22;
  FLA_Obj  yT,              y0,
           yB,              psi1,
                            y2;
  FLA_Obj  zT,              z0,
           zB,              zeta1,
                            z2;
  FLA_Obj  y, z;
           
  FLA_Obj  inv_tau11;
  FLA_Obj  minus_inv_tau11;
  FLA_Obj  first_elem;
  FLA_Obj  beta;
  FLA_Obj  conj_beta;
  FLA_Obj  dot_product;

  FLA_Obj  a21_t,
           a21_b;

  FLA_Datatype datatype_A;
  dim_t        m_A;
  dim_t        b_alg;


  b_alg      = FLA_Obj_length( T );

  datatype_A = FLA_Obj_datatype( A );
  m_A        = FLA_Obj_length( A );

  FLA_Obj_create( datatype_A, 1,   1, 0, 0, &inv_tau11 );
  FLA_Obj_create( datatype_A, 1,   1, 0, 0, &minus_inv_tau11 );
  FLA_Obj_create( datatype_A, 1,   1, 0, 0, &first_elem );
  FLA_Obj_create( datatype_A, 1,   1, 0, 0, &beta );
  FLA_Obj_create( datatype_A, 1,   1, 0, 0, &conj_beta );
  FLA_Obj_create( datatype_A, 1,   1, 0, 0, &dot_product );
  FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &y );
  FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &z );

  FLA_Part_2x2( A,    &ATL, &ATR,
                      &ABL, &ABR,     0, 0, FLA_TL );
  FLA_Part_2x2( T,    &TTL, &TTR,
                      &TBL, &TBR,     0, 0, FLA_TL );
  FLA_Part_2x1( y,    &yT, 
                      &yB,            0, FLA_TOP );
  FLA_Part_2x1( z,    &zT, 
                      &zB,            0, FLA_TOP );

  while ( FLA_Obj_length( ATL ) < b_alg )
  {
    FLA_Repart_2x2_to_3x3( ATL, /**/ ATR,       &A00,  /**/ &a01,     &A02,
                        /* ************* */   /* ************************** */
                                                &a10t, /**/ &alpha11, &a12t,
                           ABL, /**/ ABR,       &A20,  /**/ &a21,     &A22,
                           1, 1, FLA_BR );
    FLA_Repart_2x2_to_3x3( TTL, /**/ TTR,       &T00,  /**/ &t01,   &T02,
                        /* ************* */   /* ************************** */
                                                &t10t, /**/ &tau11, &t12t,
                           TBL, /**/ TBR,       &T20,  /**/ &t21,   &T22,
                           1, 1, FLA_BR );
    FLA_Repart_2x1_to_3x1( yT,                &y0, 
                        /* ** */            /* **** */
                                              &psi1, 
                           yB,                &y2,        1, FLA_BOTTOM );
    FLA_Repart_2x1_to_3x1( zT,                &z0, 
                        /* ** */            /* ***** */
                                              &zeta1, 
                           zB,                &z2,        1, FLA_BOTTOM );

    /*------------------------------------------------------------*/

    if ( FLA_Obj_length( A22 ) > 0 )
    {
      FLA_Part_2x1( a21,    &a21_t,
                            &a21_b,        1, FLA_TOP );

      // [ u21, tau11, a21 ] = House( a21 );
      FLA_Househ2_UT( FLA_LEFT,
                      a21_t,
                      a21_b, tau11 );

      // inv_tau11            =  1 / tau11;
      // minus_inv_tau11      = -1 / tau11;
      FLA_Set( FLA_ONE, inv_tau11 );
      FLA_Inv_scalc( FLA_NO_CONJUGATE, tau11, inv_tau11 );
      FLA_Copy( inv_tau11, minus_inv_tau11 );
      FLA_Scal( FLA_MINUS_ONE, minus_inv_tau11 );

      // Save first element of a21_t and set it to one so we can use a21 as
      // u21 in subsequent computations. We will restore a21_t later on.
      FLA_Copy( a21_t, first_elem );
      FLA_Set( FLA_ONE, a21_t );

      // y21 = A22' * u21;
      FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, y2 );

      // z21 = A22 * u21;
      FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, z2 );

      // beta      = u21' * z21 / 2;
      // conj_beta = conj(beta);
      FLA_Dotc( FLA_CONJUGATE, a21, z2, beta );
      FLA_Inv_scal( FLA_TWO, beta );
      FLA_Copyt( FLA_CONJ_NO_TRANSPOSE, beta, conj_beta );

      // y21' = ( y21' - beta / tau * u21' ) / tau;
      // y21  = ( y21 - conj(beta) / tau * u21 ) / tau;
      FLA_Scal( minus_inv_tau11, conj_beta );
      FLA_Axpy( conj_beta, a21, y2 );
      FLA_Scal( inv_tau11, y2 );

      // z21 = ( z21 - beta / tau * u21 ) / tau;
      FLA_Scal( minus_inv_tau11, beta );
      FLA_Axpy( beta, a21, z2 );
      FLA_Scal( inv_tau11, z2 );

      // a12t = a12t * ( I - u21 * u21' / tau );
      //      = a12t - ( a12t * u21 ) * u21' / tau;
      FLA_Dot( a12t, a21, dot_product );
      FLA_Scal( minus_inv_tau11, dot_product );
      FLA_Axpyt( FLA_CONJ_TRANSPOSE, dot_product, a21, a12t );

      // A02 = A02 * ( I - u21 * u21' / tau );
      //     = A02 - ( A02 * u21 ) * u21' / tau;
      FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A02, a21, FLA_ZERO, y0 );
      FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, minus_inv_tau11, y0, a21, A02 );

      // A22 = A22 - u21 * y21' - z21 * u21';
      FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, a21, y2, A22 );
      FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, z2, a21, A22 );

      // t01 = U20' * u21;
      FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A20, a21, FLA_ZERO, t01 );

      // Restore first element of a21.
      FLA_Copy( first_elem, a21_t );
    }

    /*------------------------------------------------------------*/

    FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR,       A00,  a01,     /**/ A02,
                                                     a10t, alpha11, /**/ a12t,
                            /* ************** */  /* ************************ */
                              &ABL, /**/ &ABR,       A20,  a21,     /**/ A22,
                              FLA_TL );
    FLA_Cont_with_3x3_to_2x2( &TTL, /**/ &TTR,       T00,  t01,   /**/ T02,
                                                     t10t, tau11, /**/ t12t,
                            /* ************** */  /* ************************ */
                              &TBL, /**/ &TBR,       T20,  t21,   /**/ T22,
                              FLA_TL );
    FLA_Cont_with_3x1_to_2x1( &yT,                y0, 
                                                  psi1, 
                            /* ** */           /* **** */
                              &yB,                y2,     FLA_TOP );
    FLA_Cont_with_3x1_to_2x1( &zT,                z0, 
                                                  zeta1, 
                            /* ** */           /* ***** */
                              &zB,                z2,     FLA_TOP );
  }

  FLA_Obj_free( &inv_tau11 );
  FLA_Obj_free( &minus_inv_tau11 );
  FLA_Obj_free( &first_elem );
  FLA_Obj_free( &beta );
  FLA_Obj_free( &conj_beta );
  FLA_Obj_free( &dot_product );
  FLA_Obj_free( &y );
  FLA_Obj_free( &z );

  return FLA_SUCCESS;
}
예제 #8
0
int main( int argc, char** argv ) {
  FLA_Datatype datatype = TESTTYPE;
  FLA_Obj      A, A_flame, A_lapack, C;
  int          m;
  FLA_Error    init_result; 

  FLA_Obj TU, TV, U_flame, V_flame, d_flame, e_flame, B_flame;
  FLA_Obj tauq, taup, d_lapack, e_lapack, U_lapack, V_lapack, W, B_lapack;
  testtype *buff_tauq, *buff_taup, *buff_d_lapack, *buff_e_lapack, 
    *buff_W, *buff_A_lapack, *buff_U_lapack, *buff_V_lapack;
  int lwork, info, is_flame;
  
  if ( argc == 3 ) {
    m = atoi(argv[1]);
    is_flame = atoi(argv[2]);
  } else {
    fprintf(stderr, "       \n");
    fprintf(stderr, "Usage: %s m is_flame\n", argv[0]);
    fprintf(stderr, "       m : matrix length\n");
    fprintf(stderr, "       is_flame : 1 yes, 0 no\n");
    fprintf(stderr, "       \n");
    return -1;
  }
  if ( m == 0 )
    return 0;

  FLA_Init_safe( &init_result );          

  fprintf( stdout, "lapack2flame: %d x %d: \n", m, m);

  FLA_Obj_create( datatype, m, m, 0, 0, &A );
  FLA_Random_matrix( A ); 
  FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &A_flame  );
  FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &A_lapack );

  FLA_Obj_create( datatype, m, m, 0, 0, &C );
  FLA_Random_matrix( C ); 


  if ( is_flame ) {
    fprintf( stdout, " flame executed\n");
    FLA_Bidiag_UT_create_T( A_flame, &TU, &TV );

    FLA_Bidiag_UT( A_flame, TU, TV );
    FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A_flame, &U_flame );
    FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A_flame, &V_flame );

    FLA_Bidiag_UT_form_U( U_flame, TU, U_flame );
    FLA_Bidiag_UT_form_V( V_flame, TV, V_flame );
    
    FLA_Obj_create( datatype, m,      1, 0, 0, &d_flame );
    FLA_Obj_create( datatype, m - 1,  1, 0, 0, &e_flame );
    FLA_Bidiag_UT_extract_diagonals( A_flame, d_flame, e_flame );

    FLA_Obj_create( datatype, m, m, 0, 0, &B_flame ); FLA_Set( FLA_ZERO, B_flame );
    {
      FLA_Obj BTL, BTR, BBL, BBR;
      FLA_Part_2x2( B_flame, &BTL, &BTR, &BBL, &BBR, 1,1, FLA_BL );
      FLA_Set_diagonal_matrix( d_flame, B_flame );
      FLA_Set_diagonal_matrix( e_flame, BTR );
    }

    if (1) {
      fprintf( stdout, " - FLAME ----------\n");
      FLA_Obj_fshow( stdout, " - Given A - ", A, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - A - ", A_flame, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - U - ", U_flame, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - V - ", V_flame, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - d - ", d_flame, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - e - ", e_flame, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - B - ", B_flame, "% 6.4e", "------");
    }
  } else {
    fprintf( stdout, " lapack executed\n");

    FLA_Obj_create( datatype, m, 1, 0, 0, &tauq );
    FLA_Obj_create( datatype, m, 1, 0, 0, &taup );
    FLA_Obj_create( datatype, m,      1, 0, 0, &d_lapack );
    FLA_Obj_create( datatype, m - 1,  1, 0, 0, &e_lapack );

    buff_A_lapack = (testtype*)FLA_Obj_buffer_at_view( A_lapack );
    buff_tauq     = (testtype*)FLA_Obj_buffer_at_view( tauq );
    buff_taup     = (testtype*)FLA_Obj_buffer_at_view( taup );
    buff_d_lapack = (testtype*)FLA_Obj_buffer_at_view( d_lapack );
    buff_e_lapack = (testtype*)FLA_Obj_buffer_at_view( e_lapack );

    lwork = 32*m;
    
    FLA_Obj_create( datatype, lwork, 1, 0, 0, &W );
    buff_W = (testtype*)FLA_Obj_buffer_at_view( W );
    sgebrd_( &m, &m, 
             buff_A_lapack, &m,
             buff_d_lapack,
             buff_e_lapack,
             buff_tauq,
             buff_taup,
             buff_W,
             &lwork,
             &info );

    FLA_Obj_create( datatype, m, m, 0, 0, &U_lapack );
    FLA_Obj_create( datatype, m, m, 0, 0, &V_lapack );
    
    FLA_Copy( A_lapack, U_lapack );
    FLA_Copy( A_lapack, V_lapack );

    buff_U_lapack = (testtype*)FLA_Obj_buffer_at_view( U_lapack );
    buff_V_lapack = (testtype*)FLA_Obj_buffer_at_view( V_lapack );

    sorgbr_( "Q", &m, &m, &m,
             buff_U_lapack, &m,
             buff_tauq, 
             buff_W,
             &lwork,
             &info );
    
    sorgbr_( "P", &m, &m, &m,
             buff_V_lapack, &m,
             buff_taup,
             buff_W,
             &lwork,
             &info );

    FLA_Obj_create( datatype, m, m, 0, 0, &B_lapack ); FLA_Set( FLA_ZERO, B_lapack );
    {
      FLA_Obj BTL, BTR, BBL, BBR;
      FLA_Part_2x2( B_lapack, &BTL, &BTR, &BBL, &BBR, 1,1, FLA_BL );
      FLA_Set_diagonal_matrix( d_lapack, B_lapack );
      FLA_Set_diagonal_matrix( e_lapack, BTR );
    }
    
    FLA_Obj_free( &W );    


    if (1) {
      fprintf( stdout, " - LAPACK ----------\n");
      FLA_Obj_fshow( stdout, " - Given A - ", A, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - A - ", A_lapack, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - U - ", U_lapack, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - V - ", V_lapack, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - d - ", d_lapack, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - e - ", e_lapack, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - B - ", B_lapack, "% 6.4e", "------");
    }
  }

  {
    testtype     dummy;
    int          zero = 0, one = 1;
    FLA_Obj      D_lapack;

    FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &D_lapack ); FLA_Set( FLA_ZERO, D_lapack );

    if ( is_flame ) {
      buff_d_lapack = (testtype*)FLA_Obj_buffer_at_view( d_flame );
      buff_e_lapack = (testtype*)FLA_Obj_buffer_at_view( e_flame );
      buff_U_lapack = (testtype*)FLA_Obj_buffer_at_view( U_flame );
      buff_V_lapack = (testtype*)FLA_Obj_buffer_at_view( V_flame );
    }

    FLA_Obj_create( datatype, 4*m, 1, 0, 0, &W );
    buff_W = (testtype*)FLA_Obj_buffer_at_view( W );
    sbdsqr_( "U", &m, &m, &m, &zero, 
             buff_d_lapack, buff_e_lapack, 
             buff_V_lapack, &m, 
             buff_U_lapack, &m, 
             &dummy, &one, 
             buff_W, &info );
    FLA_Obj_free( &W );
    if (info != 0)
      printf( " Error info = %d\n", info );

    if ( is_flame )
      FLA_Set_diagonal_matrix( d_flame, D_lapack );
    else
      FLA_Set_diagonal_matrix( d_lapack, D_lapack );

    if ( is_flame ) {
      fprintf( stdout, " - FLAME ----------\n");
      FLA_Obj_fshow( stdout, " - U - ", U_flame, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - V - ", V_flame, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - d - ", d_flame, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - e - ", e_flame, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - D - ", D_lapack, "% 6.4e", "------");
    } else {
      fprintf( stdout, " - LAPACK ----------\n");
      FLA_Obj_fshow( stdout, " - U - ", U_lapack, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - V - ", V_lapack, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - d - ", d_lapack, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - e - ", e_lapack, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - D - ", D_lapack, "% 6.4e", "------");
    }

    FLA_Obj_free( &D_lapack );
  }

  if ( is_flame ) {
    FLA_Obj_free( &TU );
    FLA_Obj_free( &TV );
    FLA_Obj_free( &U_flame );
    FLA_Obj_free( &V_flame );
    FLA_Obj_free( &d_flame );
    FLA_Obj_free( &e_flame );
    FLA_Obj_free( &B_flame );
  } else {
    FLA_Obj_free( &tauq );
    FLA_Obj_free( &taup );
    FLA_Obj_free( &d_lapack );
    FLA_Obj_free( &e_lapack );
    FLA_Obj_free( &U_lapack );
    FLA_Obj_free( &V_lapack );
    FLA_Obj_free( &B_lapack );
  }
  FLA_Obj_free( &A );
  FLA_Obj_free( &A_flame );
  FLA_Obj_free( &A_lapack );

  FLA_Obj_free( &C );

  FLA_Finalize_safe( init_result );     
}
예제 #9
0
int main(int argc, char *argv[])
{
  int 
    datatype,
    n_input,
    mB_input, mC_input, mD_input,
    mB, mC, mD, n,
    p_first, p_last, p_inc,
    p,
    b_alg,
    variant,
    n_repeats,
    i,
    n_variants = 1;
  
  double max_gflops=6.0;

  double
    dtime,
    gflops,
    diff;

  FLA_Obj
    B, C, D, T, R, E;
  

  FLA_Init();


  fprintf( stdout, "%c number of repeats:", '%' );
  scanf( "%d", &n_repeats );
  fprintf( stdout, "%c %d\n", '%', n_repeats );

  fprintf( stdout, "%c enter algorithmic blocksize:", '%' );
  scanf( "%d", &b_alg );
  fprintf( stdout, "%c %d\n", '%', b_alg );

  fprintf( stdout, "%c enter problem size first, last, inc:", '%' );
  scanf( "%d%d%d", &p_first, &p_last, &p_inc );
  fprintf( stdout, "%c %d %d %d\n", '%', p_first, p_last, p_inc );

  fprintf( stdout, "%c enter n (-1 means bind to problem size): ", '%' );
  scanf( "%d", &n_input );
  fprintf( stdout, "%c %d\n", '%', n_input );

  fprintf( stdout, "%c enter mB mC mD (-1 means bind to problem size): ", '%' );
  scanf( "%d %d %d", &mB_input, &mC_input, &mD_input );
  fprintf( stdout, "%c %d %d %d\n", '%', mB_input, mC_input, mD_input );


  fprintf( stdout, "\nclear all;\n\n" );



  //datatype = FLA_FLOAT;
  //datatype = FLA_DOUBLE;
  //datatype = FLA_COMPLEX;
  datatype = FLA_DOUBLE_COMPLEX;

  for ( p = p_first, i = 1; p <= p_last; p += p_inc, i += 1 )
  {
    mB = mB_input;
    mC = mC_input;
    mD = mD_input;
    n  = n_input;

    if( mB < 0 ) mB = p / abs(mB_input);
    if( mC < 0 ) mC = p / abs(mC_input);
    if( mD < 0 ) mD = p / abs(mD_input);
    if( n  < 0 ) n  = p / abs(n_input);

    for ( variant = 0; variant < n_variants; variant++ ){
      
      FLA_Obj_create( datatype, mB, n, 0, 0, &B );
      FLA_Obj_create( datatype, mC, n, 0, 0, &C );
      FLA_Obj_create( datatype, mD, n, 0, 0, &D );
      FLA_Obj_create( datatype, b_alg, n, 0, 0, &T );
      FLA_Obj_create( datatype, n,  n, 0, 0, &R );
      FLA_Obj_create( datatype, n,  n, 0, 0, &E );

      FLA_Random_matrix( B );
      FLA_Random_matrix( C );
      FLA_Random_matrix( D );

      FLA_Set( FLA_ZERO, R );
      FLA_Herk_external( FLA_UPPER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_ONE, B, FLA_ONE, R );
      FLA_Herk_external( FLA_UPPER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_ONE, D, FLA_ONE, R );
      FLA_Chol( FLA_UPPER_TRIANGULAR, R );

      FLA_Set( FLA_ZERO, E );
      FLA_Herk_external( FLA_UPPER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_ONE, B, FLA_ONE, E );
      FLA_Herk_external( FLA_UPPER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_ONE, C, FLA_ONE, E );
      FLA_Chol( FLA_UPPER_TRIANGULAR, E );

      fprintf( stdout, "data_uddate_ut( %d, 1:5 ) = [ %d  ", i, p );
      fflush( stdout );

      time_UDdate_UT( variant, FLA_ALG_FRONT, n_repeats, mB, mC, mD, n,
                      B, C, D, T, R, E, &dtime, &diff, &gflops );

      fprintf( stdout, "%6.3lf %6.2le ", gflops, diff );
      fflush( stdout );

      fprintf( stdout, " ]; \n" );
      fflush( stdout );

      FLA_Obj_free( &B );
      FLA_Obj_free( &C );
      FLA_Obj_free( &D );
      FLA_Obj_free( &T );
      FLA_Obj_free( &R );
      FLA_Obj_free( &E );
    }

    fprintf( stdout, "\n" );
  }

/*
  fprintf( stdout, "figure;\n" );

  fprintf( stdout, "hold on;\n" );

  for ( i = 0; i < n_variants; i++ ) {
    fprintf( stdout, "plot( data_qr_ut( :,1 ), data_qr_ut( :, 2 ), '%c:%c' ); \n",
            colors[ i ], ticks[ i ] );
    fprintf( stdout, "plot( data_qr_ut( :,1 ), data_qr_ut( :, 4 ), '%c-.%c' ); \n",
            colors[ i ], ticks[ i ] );
  }

  fprintf( stdout, "legend( ... \n" );

  for ( i = 0; i < n_variants; i++ )
    fprintf( stdout, "'ref\\_qr\\_ut', 'fla\\_qr\\_ut', ... \n" );

  fprintf( stdout, "'Location', 'SouthEast' ); \n" );

  fprintf( stdout, "xlabel( 'problem size p' );\n" );
  fprintf( stdout, "ylabel( 'GFLOPS/sec.' );\n" );
  fprintf( stdout, "axis( [ 0 %d 0 %.2f ] ); \n", p_last, max_gflops );
  fprintf( stdout, "title( 'FLAME UDdate_UT front-end performance (%s, %s)' );\n", 
           m_dim_desc, n_dim_desc );
  fprintf( stdout, "print -depsc qr_ut_front_%s_%s.eps\n", m_dim_tag, n_dim_tag );
  fprintf( stdout, "hold off;\n");
  fflush( stdout );
*/

  FLA_Finalize( );

  return 0;
}
예제 #10
0
파일: lq.c 프로젝트: anaptyxis/libflame
int main( int argc, char** argv ) {
  FLA_Datatype datatype = TESTTYPE;
  FLA_Obj      A, Ak, T, Tk, D, Dk, A_copy, A_recovered, L, Q, Qk, W, x, y, z;
  dim_t        m, n, k;
  dim_t        min_m_n;
  FLA_Error    init_result; 
  double       residual_A, residual_Axy;
  int          use_form_q = 1;

  if ( argc == 4 ) {
    m = atoi(argv[1]);
    n = atoi(argv[2]);
    k = atoi(argv[3]);
    min_m_n = min(m,n);
  } else {
    fprintf(stderr, "       \n");
    fprintf(stderr, "Usage: %s m n k\n", argv[0]);
    fprintf(stderr, "       m : matrix length\n");
    fprintf(stderr, "       n : matrix width\n");
    fprintf(stderr, "       k : number of house holder vectors applied for testing\n");
    fprintf(stderr, "       \n");
    return -1;
  }
  if ( m == 0 || n == 0 )
    return 0;

  FLA_Init_safe( &init_result );          

  // FLAME LQ^H setup
  FLA_Obj_create( datatype, m, n, 0, 0, &A );
  FLA_LQ_UT_create_T( A, &T );
  
  // Rand A and create A_copy.
  FLA_Random_matrix( A ); 

  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_copy );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_recovered );
  FLA_Copy( A, A_copy );

  // LQ test ( A = L Q^H )
  FLA_LQ_UT( A, T );

  // Create Q (identity), L (A_copy)
  FLA_Obj_create( datatype, m, n, 0, 0, &Q  ); FLA_Set_to_identity( Q  );
  FLA_Obj_create( datatype, m, m, 0, 0, &D  );

  FLA_Obj_create( datatype, k, n, 0, 0, &Qk ); FLA_Set_to_identity( Qk );
  FLA_Obj_create( datatype, k, k, 0, 0, &Dk  );

  
  FLA_Obj_create( datatype, m, m, 0, 0, &L );

  // Q^H := I H_{0}^H ... H_{k-1}^H
  if ( use_form_q ) {
    FLA_LQ_UT_form_Q( A, T, Q );
  } else {
    FLA_Apply_Q_UT_create_workspace_side( FLA_RIGHT, T, Q, &W );
    FLA_Apply_Q_UT( FLA_RIGHT, FLA_CONJ_TRANSPOSE, FLA_FORWARD, FLA_ROWWISE,
                    A, T, W, Q );
    FLA_Obj_free( &W );
  }

  // D := Q^T Q
  FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, 
                     FLA_ONE, Q, Q, FLA_ZERO, D ); 

  // Qk := I H0 ... Hk
  FLA_Part_1x2( T, &Tk, &W, k, FLA_LEFT );
  FLA_Part_2x1( A, &Ak, &W, k, FLA_TOP );

  if ( use_form_q ) {
    // Overwrite the result to test FLAME API
    FLA_Set( FLA_ZERO, Qk );
    FLA_Copy( Ak, Qk );
    FLA_LQ_UT_form_Q( Ak, Tk, Qk );
  } else {
    FLA_Apply_Q_UT_create_workspace( Tk, Qk, &W );
    FLA_Apply_Q_UT( FLA_LEFT, FLA_NO_TRANSPOSE, FLA_FORWARD, FLA_ROWWISE,
                    Ak, Tk, W, Qk );
    FLA_Obj_free( &W );
  }

  // Dk := Qk^T Qk
  FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, 
                     FLA_ONE, Qk, Qk, FLA_ZERO, Dk ); 

  // L := A (Q^H)^H
  if ( use_form_q ) {
    // Note that the formed Q is actually Q^H; transb should be carefully assigned.
    FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, 
                       FLA_ONE, A_copy, Q, FLA_ZERO, L );
  } else {
    FLA_Apply_Q_UT_create_workspace( T, L, &W );
    FLA_Apply_Q_UT( FLA_RIGHT, FLA_NO_TRANSPOSE, FLA_FORWARD, FLA_ROWWISE,
                    A, T, W, L );
    FLA_Obj_free( &W );
  }
  FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, 
                     FLA_ONE, L, Q, FLA_ZERO, A_recovered ); 

  // Create vectors for testing
  FLA_Obj_create( datatype, n, 1, 0, 0, &x ); FLA_Set( FLA_ZERO, x );
  FLA_Obj_create( datatype, m, 1, 0, 0, &y ); FLA_Set( FLA_ZERO, y );
  FLA_Obj_create( datatype, m, 1, 0, 0, &z ); FLA_Set( FLA_ZERO, z );

  // x is given
  FLA_Set( FLA_ONE, x );

  // y := Ax
  FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_ONE, A_copy, x, FLA_ZERO, y );
  
  // z := L (Q^H) x , libflame
  FLA_Apply_Q_UT_create_workspace( T, x, &W );
  FLA_Apply_Q_UT( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_FORWARD, FLA_ROWWISE,
                  A, T, W, x );
  FLA_Obj_free( &W );

  if ( m < n )
    FLA_Part_2x1( x, &x, &W, m, FLA_TOP );
  else 
    FLA_Part_1x2( L, &L, &W, n, FLA_LEFT );    

  FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_ONE, L, x, FLA_ZERO, z );
  
  // Comapre (A_copy, A_recovered), (y,z) and (y,w)
  residual_A    = FLA_Max_elemwise_diff( A_copy, A_recovered );
  residual_Axy  = FLA_Max_elemwise_diff( y, z );

  if ( 1 || residual_A > EPS || residual_Axy > EPS ) {
    FLA_Obj_fshow( stdout, " - Given - ", A_copy, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - Factor - ", A, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - T - ", T, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - Q - ", Q, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - D = Q^T Q - ", D, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - Qk - ", Qk, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - Dk = Qk^T Qk - ", Dk, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - L - ", L, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - Recovered A - ", A_recovered, "% 6.4e", "------");
    fprintf( stdout, "lapack2flame: %lu x %lu, %lu: ", m, n, k);
    fprintf( stdout, "| A - A_recovered | = %12.10e, | Ax - y | = %12.10e\n\n",
             residual_A, residual_Axy ) ;
  }
  
  FLA_Obj_free( &A );
  FLA_Obj_free( &T );

  FLA_Obj_free( &A_copy );
  FLA_Obj_free( &A_recovered );
  FLA_Obj_free( &L );
  FLA_Obj_free( &Q );
  FLA_Obj_free( &Qk );
  FLA_Obj_free( &D );
  FLA_Obj_free( &Dk );

  FLA_Obj_free( &x );
  FLA_Obj_free( &y );
  FLA_Obj_free( &z );
                   
  FLA_Finalize_safe( init_result );     
}
예제 #11
0
int main( int argc, char** argv ) {
  FLA_Datatype comptype = COMPTYPE;
  FLA_Datatype realtype = REALTYPE;
  dim_t        m;
  FLA_Obj      a, aT, aB, a0, a1, a2;
  FLA_Obj      v, vT, vB, v0, v1, v2;
  FLA_Error    init_result; 
  int          use_abs = 1;

  if ( argc == 3 ) {
    m = atoi(argv[1]);
    use_abs = atoi(argv[2]);
  } else {
    fprintf(stderr, "       \n");
    fprintf(stderr, "Usage: %s m use_abs\n", argv[0]);
    fprintf(stderr, "       m       : test vector length\n");
    fprintf(stderr, "       use_abs : 0 - norm (realtype), 1 - abs (complex type)\n");
    fprintf(stderr, "       \n");
    return -1;
  }
  if ( m == 0 )
    return 0;

  FLA_Init_safe( &init_result );          
  
  FLA_Obj_create( comptype, m, 1, 0, 0, &a );
  FLA_Obj_create( use_abs ? comptype : realtype, m, 1, 0, 0, &v );

  FLA_Random_matrix( a );
  FLA_Set( FLA_ZERO, v );

  FLA_Obj_fshow( stdout,  "- a -", a, "% 6.4e", "--" );

  // Normalize a vector
  FLA_Part_2x1( a,    &aT,  
                      &aB,      0, FLA_TOP );
  FLA_Part_2x1( v,    &vT,  
                      &vB,      0, FLA_TOP );
  while ( FLA_Obj_length( aB ) > 0 ) {
    FLA_Repart_2x1_to_3x1( aT, &a0,
                               &a1,
                           aB, &a2, 1, FLA_BOTTOM );
    FLA_Repart_2x1_to_3x1( vT, &v0,
                               &v1,
                           vB, &v2, 1, FLA_BOTTOM );
    // --------------------------------------------
    if ( use_abs ) { // a and v are complex datatype
      FLA_Copy( a1, v1 );
      FLA_Absolute_value( v1 );
    } else {         // v is real datatype 
      FLA_Nrm2( a1, v1 );
    }
    if ( FLA_Obj_equals( v1, FLA_ZERO ) )
      printf( " ZERO DETECTED\n" );
    else
      FLA_Inv_scal( v1, a1 ); // Normalize the scalar
    // --------------------------------------------
    FLA_Cont_with_3x1_to_2x1( &aT, a0,
                                   a1,
                              &aB, a2, FLA_TOP );
    FLA_Cont_with_3x1_to_2x1( &vT, v0,
                                   v1,
                              &vB, v2, FLA_TOP );
  }

  FLA_Obj_fshow( stdout,  "- a -", a, "% 6.4e", "--" );
  FLA_Obj_fshow( stdout,  "- v -", v, "% 6.4e", "--" );

  // Check whether it is normalized
  FLA_Part_2x1( a,    &aT,  
                      &aB,      0, FLA_TOP );
  FLA_Part_2x1( v,    &vT,  
                      &vB,      0, FLA_TOP );
  while ( FLA_Obj_length( aB ) > 0 ) {
    FLA_Repart_2x1_to_3x1( aT, &a0,
                               &a1,
                           aB, &a2, 1, FLA_BOTTOM );
    FLA_Repart_2x1_to_3x1( vT, &v0,
                               &v1,
                           vB, &v2, 1, FLA_BOTTOM );
    // --------------------------------------------
    if ( use_abs ) { // a and v are same datatype
      FLA_Copy( a1, v1 );
      FLA_Absolute_value( v1 );
    } else {         // v is realdatatype 
      FLA_Nrm2( a1, v1 );
    }
    // --------------------------------------------
    FLA_Cont_with_3x1_to_2x1( &aT, a0,
                                   a1,
                              &aB, a2, FLA_TOP );
    FLA_Cont_with_3x1_to_2x1( &vT, v0,
                                   v1,
                              &vB, v2, FLA_TOP );
  }

  FLA_Obj_fshow( stdout, " - all should be one - ", v, "% 6.4e", "--");

  FLA_Obj_free( &a );
  FLA_Obj_free( &v );

  FLA_Finalize_safe( init_result );     
}
FLA_Error FLA_Fill_with_logarithmic_dist( FLA_Obj alpha, FLA_Obj x )
{
	FLA_Obj      lT,              l0,
	             lB,              lambda1,
	                              l2;
	FLA_Obj      l, k, alpha2;
	FLA_Datatype dt_real;
	dim_t        n_x;


	if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING )
		FLA_Fill_with_logarithmic_dist_check( alpha, x );

	dt_real = FLA_Obj_datatype_proj_to_real( x );
	n_x     = FLA_Obj_vector_dim( x );

	// Create a local counter to increment as we create the distribution.
	FLA_Obj_create( dt_real, 1,   1, 0, 0, &k );

	// Create a local vector l. We will work with this vector, which is
	// the same length as x, so that we can use vertical partitioning.
	FLA_Obj_create( dt_real, n_x, 1, 0, 0, &l );

	// Create a local real scalar alpha2 of the same precision as
	// alpha. Then copy alpha to alpha2, which will convert the
	// complex value to real, if necessary (ie: if alpha is complex).
	FLA_Obj_create( dt_real, 1,   1, 0, 0, &alpha2 );
	FLA_Copy( alpha, alpha2 );

	// Initialize k to 0.
	FLA_Set( FLA_ZERO, k );

	FLA_Part_2x1( l,    &lT,
	                    &lB,            0, FLA_TOP );

	while ( FLA_Obj_length( lB ) > 0 )
	{
		FLA_Repart_2x1_to_3x1( lT,                &l0,
		                    /* ** */            /* ******* */
		                                          &lambda1,
		                       lB,                &l2,        1, FLA_BOTTOM );

		/*------------------------------------------------------------*/

		// lambda1 = alpha^k;
		FLA_Pow( alpha2, k, lambda1 );

		// k = k + 1;
		FLA_Mult_add( FLA_ONE, FLA_ONE, k );

		/*------------------------------------------------------------*/

		FLA_Cont_with_3x1_to_2x1( &lT,                l0,
		                                              lambda1,
		                        /* ** */           /* ******* */
		                          &lB,                l2,     FLA_TOP );
	}

	// Normalize by last element.
	FLA_Part_2x1( l,    &lT,
	                    &lB,            1, FLA_BOTTOM );
	FLA_Inv_scal( lB, l );

	// Overwrite x with the distribution we created in l.
	FLA_Copy( l, x );

	FLA_Obj_free( &l );
	FLA_Obj_free( &k );
	FLA_Obj_free( &alpha2 );

	return FLA_SUCCESS;
}
예제 #13
0
FLA_Error FLA_Nrm2_external( FLA_Obj x, FLA_Obj norm_x )
{
  FLA_Datatype datatype;
  int          num_elem;
  int          inc_x;

  if ( FLA_Check_error_level() == FLA_FULL_ERROR_CHECKING ) 
    FLA_Nrm2_check( x, norm_x );

  if ( FLA_Obj_has_zero_dim( x ) )
  {
    FLA_Set( FLA_ZERO, norm_x );
    return FLA_SUCCESS;
  }

  datatype = FLA_Obj_datatype( x );

  inc_x    = FLA_Obj_vector_inc( x );
  num_elem = FLA_Obj_vector_dim( x );


  switch ( datatype ){

  case FLA_FLOAT:
  {
    float *buff_x      = ( float * ) FLA_FLOAT_PTR( x );
    float *buff_norm_x = ( float * ) FLA_FLOAT_PTR( norm_x );

    bli_snrm2( num_elem,
               buff_x, inc_x,
               buff_norm_x );

    break;
  }

  case FLA_DOUBLE:
  {
    double *buff_x      = ( double * ) FLA_DOUBLE_PTR( x );
    double *buff_norm_x = ( double * ) FLA_DOUBLE_PTR( norm_x );

    bli_dnrm2( num_elem,
               buff_x, inc_x,
               buff_norm_x );

    break;
  }

  case FLA_COMPLEX:
  {
    scomplex *buff_x      = ( scomplex * ) FLA_COMPLEX_PTR( x );
    float    *buff_norm_x = ( float    * ) FLA_COMPLEX_PTR( norm_x );

    bli_cnrm2( num_elem,
               buff_x, inc_x,
               buff_norm_x );

    break;
  }

  case FLA_DOUBLE_COMPLEX:
  {
    dcomplex *buff_x      = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( x );
    double   *buff_norm_x = ( double   * ) FLA_DOUBLE_COMPLEX_PTR( norm_x );

    bli_znrm2( num_elem,
               buff_x, inc_x,
               buff_norm_x );

    break;
  }

  }
  
  return FLA_SUCCESS;
}
예제 #14
0
int main( int argc, char** argv ) {
  FLA_Datatype datatype = TESTTYPE;
  FLA_Datatype realtype = REALTYPE;
  FLA_Obj      
    A, TU, TV, 
    A_copy, A_recovered,
    U, V, Vb, B, Be, d, e, 
    DU, DV;

  FLA_Obj     
    ATL, ATR,
    ABL, ABR, Ae;

  FLA_Uplo     uplo;
  dim_t        m, n, min_m_n;
  FLA_Error    init_result; 

  double       residual_A = 0.0;

  if ( argc == 3 ) {
    m = atoi(argv[1]);
    n = atoi(argv[2]);
    min_m_n = min(m,n);
  } else {
    fprintf(stderr, "       \n");
    fprintf(stderr, "Usage: %s m n\n", argv[0]);
    fprintf(stderr, "       m : matrix length\n");
    fprintf(stderr, "       n : matrix width\n");
    fprintf(stderr, "       \n");
    return -1;
  }
  if ( m == 0 || n == 0 )
    return 0;

  FLA_Init_safe( &init_result );          

  // FLAME Bidiag setup
  FLA_Obj_create( datatype, m, n, 0, 0, &A );
  FLA_Bidiag_UT_create_T( A, &TU, &TV );

  // Rand A and create A_copy.
  FLA_Random_matrix( A ); 
  {
    scomplex *buff_A = FLA_Obj_buffer_at_view( A );
    buff_A[0].real = 4.4011e-01; buff_A[0].imag = -4.0150e-09; buff_A[2].real = -2.2385e-01; buff_A[2].imag = -1.5546e-01; buff_A[4].real = -6.3461e-02; buff_A[4].imag = 2.7892e-01; buff_A[6].real = -1.3197e-01; buff_A[6].imag = 5.0888e-01;  
    buff_A[1].real = 3.3352e-01; buff_A[1].imag = -6.6346e-02; buff_A[3].real = -1.9307e-01; buff_A[3].imag = -8.4066e-02; buff_A[5].real = -6.0446e-03; buff_A[5].imag = 2.2094e-01; buff_A[7].real = -2.3299e-02; buff_A[7].imag = 4.0553e-01;
  }

  //FLA_Set_to_identity( A );
  //FLA_Scal( FLA_MINUS_ONE, A );

  if ( m >= n ) {
    uplo = FLA_UPPER_TRIANGULAR;
    FLA_Part_2x2( A, &ATL, &ATR,
                     &ABL, &ABR, min_m_n - 1, 1, FLA_TL );
    Ae = ATR; 
  } else {
    uplo = FLA_LOWER_TRIANGULAR;
    FLA_Part_2x2( A, &ATL, &ATR,
                     &ABL, &ABR, 1, min_m_n - 1, FLA_TL );
    Ae = ABL;
  }

  FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &A_copy );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_recovered );

  // Bidiag test
  {
    FLA_Obj      norm;
    FLA_Bool     apply_scale;

    FLA_Obj_create( realtype, 1,1, 0,0, &norm );

    FLA_Max_abs_value( A, norm );
    apply_scale = FLA_Obj_gt( norm, FLA_OVERFLOW_SQUARE_THRES ); 

    if ( apply_scale ) FLA_Scal( FLA_SAFE_MIN, A );
    FLA_Bidiag_UT( A, TU, TV );
    if ( apply_scale ) FLA_Bidiag_UT_scale_diagonals( FLA_SAFE_INV_MIN, A ); 

    FLA_Obj_free( &norm );
  }


  // Orthonomal basis U, V. 
  FLA_Obj_create( datatype, m, min_m_n, 0, 0, &U ); FLA_Set( FLA_ZERO, U );
  FLA_Obj_create( datatype, min_m_n, n, 0, 0, &V ); FLA_Set( FLA_ZERO, V );

  FLA_Bidiag_UT_form_U_ext( uplo, A, TU, FLA_NO_TRANSPOSE,   U );
  FLA_Bidiag_UT_form_V_ext( uplo, A, TV, FLA_CONJ_TRANSPOSE, V ); 

  if ( FLA_Obj_is_complex( A ) ){
    FLA_Obj rL, rR;
    
    FLA_Obj_create( datatype, min_m_n, 1, 0, 0, &rL );
    FLA_Obj_create( datatype, min_m_n, 1, 0, 0, &rR );

    FLA_Obj_fshow( stdout, " - Factor no realified - ", A, "% 6.4e", "------");
    FLA_Bidiag_UT_realify( A, rL, rR );
    FLA_Obj_fshow( stdout, " - Factor    realified - ", A, "% 6.4e", "------");

    FLA_Obj_fshow( stdout, " - rL - ", rL, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - rR - ", rR, "% 6.4e", "------");

    FLA_Apply_diag_matrix( FLA_RIGHT, FLA_CONJUGATE, rL, U );
    FLA_Apply_diag_matrix( FLA_LEFT,  FLA_CONJUGATE, rR, V );

    FLA_Obj_free( &rL );
    FLA_Obj_free( &rR );
  }

  // U^H U
  FLA_Obj_create( datatype, min_m_n, min_m_n, 0, 0, &DU );
  FLA_Gemm_external( FLA_CONJ_TRANSPOSE, FLA_NO_TRANSPOSE, 
                     FLA_ONE, U, U, FLA_ZERO, DU );

  // V^H V
  FLA_Obj_create( datatype, min_m_n, min_m_n, 0, 0, &DV );
  FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, 
                     FLA_ONE, V, V, FLA_ZERO, DV );
  
  // Recover the matrix
  FLA_Obj_create( datatype, min_m_n, min_m_n, 0, 0, &B );
  FLA_Set( FLA_ZERO, B );

  // Set B
  FLA_Obj_create( datatype, min_m_n, 1, 0, 0, &d );  
  FLA_Set_diagonal_vector( A, d );
  FLA_Set_diagonal_matrix( d, B );
  FLA_Obj_free( &d );

  if ( min_m_n > 1 ) {
    FLA_Obj_create( datatype, min_m_n - 1 , 1, 0, 0, &e );  
    FLA_Set_diagonal_vector( Ae, e );
    if ( uplo == FLA_UPPER_TRIANGULAR ) {
      FLA_Part_2x2( B, &ATL, &ATR,
                    &ABL, &ABR, min_m_n - 1, 1, FLA_TL );
      Be = ATR;
    } else {
      FLA_Part_2x2( B, &ATL, &ATR,
                    &ABL, &ABR, 1, min_m_n - 1, FLA_TL );
      Be = ABL;
    }
    FLA_Set_diagonal_matrix( e, Be );
    FLA_Obj_free( &e );
  }

  // Vb := B (V^H)
  FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, V, &Vb );
  FLA_Trmm_external( FLA_LEFT, uplo, FLA_NO_TRANSPOSE,
                     FLA_NONUNIT_DIAG, FLA_ONE, B, Vb );

  // A := U Vb
  FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
                     FLA_ONE, U, Vb, FLA_ZERO, A_recovered );

  residual_A    = FLA_Max_elemwise_diff( A_copy, A_recovered );

  if (1) {
    FLA_Obj_fshow( stdout, " - Given - ", A_copy, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - Factor - ", A, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - TU - ", TU, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - TV - ", TV, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - B - ", B, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - U - ", U, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - V - ", V, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - Vb - ", Vb, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - U'U - ", DU,  "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - VV' - ", DV,  "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - Recovered A - ", A_recovered, "% 6.4e", "------");
    fprintf( stdout, "lapack2flame: %lu x %lu: ", m, n);
    fprintf( stdout, "recovery A = %12.10e\n\n", residual_A ) ;
  }
  
  FLA_Obj_free( &A );
  FLA_Obj_free( &TU );
  FLA_Obj_free( &TV );

  FLA_Obj_free( &B );

  FLA_Obj_free( &U );
  FLA_Obj_free( &V );
  FLA_Obj_free( &Vb );

  FLA_Obj_free( &DU );
  FLA_Obj_free( &DV );

  FLA_Obj_free( &A_copy );
  FLA_Obj_free( &A_recovered );


  FLA_Finalize_safe( init_result );     
}
예제 #15
0
int test_gemm( FILE* stream, param_t param, result_t *result)
{
    FLA_Datatype datatype = param.datatype;
    FLA_Trans    transa = param.trans[0], transb = param.trans[1];
    FLA_Obj      A, B, C, x, y, z, w;
    FLA_Obj      alpha, beta;
    double       time, time_min =  MAX_TIME_VALUE;
    unsigned int i,
             m = param.dims[0],
             n = param.dims[1],
             k = param.dims[2],
             repeat = param.repeat;
    int          is_trans, is_complex;

    // Create matrices.
    is_trans = (transa == FLA_NO_TRANSPOSE);
    FLA_Obj_create( datatype, (is_trans ? m:k), (is_trans ? k:m), 0,0, &A );
    is_trans = (transb == FLA_NO_TRANSPOSE);
    FLA_Obj_create( datatype, (is_trans ? k:n), (is_trans ? n:k), 0,0, &B );
    FLA_Obj_create( datatype,                m,                n, 0,0, &C );

    FLA_Obj_create( datatype, n, 1, 0, 0, &x );
    FLA_Obj_create( datatype, m, 1, 0, 0, &y );
    FLA_Obj_create( datatype, m, 1, 0, 0, &z );
    FLA_Obj_create( datatype, k, 1, 0, 0, &w );

    // Initialize the test matrices.
    FLA_Random_matrix( A );
    FLA_Random_matrix( B );
    FLA_Random_matrix( C );

    FLA_Random_matrix( x );
    FLA_Set( FLA_ZERO, y );
    FLA_Set( FLA_ZERO, w );
    FLA_Set( FLA_ZERO, z );

    // Constants.
    alpha = FLA_MINUS_ONE;
    beta  = FLA_ZERO;

    // Repeat the experiment repeat times and record results.
    for ( i = 0; i < repeat; ++i )
    {
        time = FLA_Clock();
        FLA_Gemm_external( transa, transb, alpha, A, B, beta, C );
        time = FLA_Clock() - time;

        time_min = min( time_min, time );
    }

    is_complex = FLA_Obj_is_complex( C );
    result->performance = ( FMULS * FP_PER_MUL(is_complex) +
                            FADDS * FP_PER_ADD(is_complex) )/time_min/FLOPS_PER_UNIT_PERF;

    FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_ONE, C, x, FLA_ZERO, y );
    FLA_Gemv_external( transb,           FLA_ONE, B, x, FLA_ZERO, w );
    FLA_Gemv_external( transa,           alpha,   A, w, FLA_ZERO, z );

    result->residual = FLA_Max_elemwise_diff( y, z );

    FLA_Obj_free( &A );
    FLA_Obj_free( &B );
    FLA_Obj_free( &C );
    FLA_Obj_free( &x );
    FLA_Obj_free( &y );
    FLA_Obj_free( &z );
    FLA_Obj_free( &w );

    return 0;
}
예제 #16
0
int main(int argc, char *argv[])
{
  int 
    datatype,
    n_blocks_m,
    n_threads,
    m_input, n_input,
    m, n,
    p_first, p_last, p_inc,
    p,
    n_repeats,
    param_combo,
    i,
    n_param_combos = N_PARAM_COMBOS;
  
  dim_t
    nb_flash, nb_alg;

  char *colors = "brkgmcbrkgmcbrkgmc";
  char *ticks  = "o+*xso+*xso+*xso+*xs";
  char m_dim_desc[14];
  char n_dim_desc[14];
  char m_dim_tag[10];
  char n_dim_tag[10];

  double max_gflops=6.0;

  double
    dtime,
    gflops,
    diff;

  FLA_Obj
    A, A_flat_ref, A_flat, B, B_flat, D, D_flat, t, T, T_flat;
  
  FLA_Init( );


  fprintf( stdout, "%c number of repeats: ", '%' );
  scanf( "%d", &n_repeats );
  fprintf( stdout, "%c %d\n", '%', n_repeats );

  fprintf( stdout, "%c enter algorithmic blocksize: ", '%' );
  scanf( "%u", &nb_alg );
  fprintf( stdout, "%c %u\n", '%', nb_alg );

  fprintf( stdout, "%c enter problem size first, last, inc: ", '%' );
  scanf( "%d%d%d", &p_first, &p_last, &p_inc );
  fprintf( stdout, "%c %d %d %d\n", '%', p_first, p_last, p_inc );

  fprintf( stdout, "%c enter m n (-1 means bind to problem size): ", '%' );
  scanf( "%d%d", &m_input, &n_input );
  fprintf( stdout, "%c %d %d\n", '%', m_input, n_input );

  fprintf( stdout, "%c enter the number of SuperMatrix threads: ", '%' );
  scanf( "%d", &n_threads );
  fprintf( stdout, "%c %d\n", '%', n_threads );


  fprintf( stdout, "\nclear all;\n\n" );


  if     ( m_input >  0 ) {
    sprintf( m_dim_desc, "m = %d", m_input );
    sprintf( m_dim_tag,  "m%dc", m_input);
  }
  else if( m_input <  -1 ) {
    sprintf( m_dim_desc, "m = p/%d", -m_input );
    sprintf( m_dim_tag,  "m%dp", -m_input );
  }
  else if( m_input == -1 ) {
    sprintf( m_dim_desc, "m = p" );
    sprintf( m_dim_tag,  "m%dp", 1 );
  }
  if     ( n_input >  0 ) {
    sprintf( n_dim_desc, "n = %d", n_input );
    sprintf( n_dim_tag,  "n%dc", n_input);
  }
  else if( n_input <  -1 ) {
    sprintf( n_dim_desc, "n = p/%d", -n_input );
    sprintf( n_dim_tag,  "n%dp", -n_input );
  }
  else if( n_input == -1 ) {
    sprintf( n_dim_desc, "n = p" );
    sprintf( n_dim_tag,  "n%dp", 1 );
  }

  //datatype = FLA_FLOAT;
  //datatype = FLA_DOUBLE;
  //datatype = FLA_COMPLEX;
  datatype = FLA_DOUBLE_COMPLEX;

  FLASH_Queue_set_num_threads( n_threads );

  for ( p = p_first, i = 1; p <= p_last; p += p_inc, i += 1 )
  {
    m = m_input;
    n = n_input;

    if( m < 0 ) m = p / abs(m_input);
    if( n < 0 ) n = p / abs(n_input);

    nb_flash = n;

    for ( param_combo = 0; param_combo < n_param_combos; param_combo++ )
    {

      FLA_Obj_create( datatype, m,          nb_flash, &A_flat );
      FLA_Obj_create( datatype, m,          nb_flash, &A_flat_ref );
      FLA_Obj_create( datatype, m,          nb_flash, &T_flat );
      FLA_Obj_create( datatype, nb_flash,   1,        &t );

      FLASH_Obj_create( datatype, m, nb_flash, 1, &nb_flash, &A );
      n_blocks_m = FLA_Obj_length( A );
      FLASH_Obj_create_ext( datatype, nb_alg * n_blocks_m, nb_flash, 1, &nb_alg, &nb_flash, &T );

      FLA_Set( FLA_ZERO, T_flat );
      FLASH_Set( FLA_ZERO, T );

      FLASH_Random_matrix( A );
      FLASH_Obj_flatten( A, A_flat );

      FLA_Part_2x1( A,   &B,
                         &D,   1, FLA_TOP );

      FLA_Part_2x1( A_flat,   &B_flat,
                              &D_flat,   FLA_Obj_width( A_flat ), FLA_TOP );

      FLA_Triangularize( FLA_UPPER_TRIANGULAR, FLA_NONUNIT_DIAG, *(FLASH_OBJ_PTR_AT(B)) );
      FLA_Triangularize( FLA_UPPER_TRIANGULAR, FLA_NONUNIT_DIAG, B_flat );


      fprintf( stdout, "data_qr2ut_%s( %d, 1:5 ) = [ %d  ", pc_str[param_combo], i, p );
      fflush( stdout );

      time_QR2_UT( param_combo, FLA_ALG_REFERENCE, n_repeats, m, n,
                    A, A_flat_ref, B, B_flat, D, D_flat, A_flat, t, T, T_flat, &dtime, &diff, &gflops );

      fprintf( stdout, "%6.3lf %6.2le ", gflops, diff );
      fflush( stdout );

      time_QR2_UT( param_combo, FLA_ALG_FRONT, n_repeats, m, n,
                    A, A_flat_ref, B, B_flat, D, D_flat, A_flat, t, T, T_flat, &dtime, &diff, &gflops );

      fprintf( stdout, "%6.3lf %6.2le ", gflops, diff );
      fflush( stdout );


      fprintf( stdout, " ]; \n" );
      fflush( stdout );

      FLA_Obj_free( &A_flat );
      FLA_Obj_free( &A_flat_ref );
      FLA_Obj_free( &T_flat );
      FLA_Obj_free( &t );

      FLASH_Obj_free( &A );
      FLASH_Obj_free( &T );
    }

    fprintf( stdout, "\n" );
  }

  fprintf( stdout, "figure;\n" );

  fprintf( stdout, "hold on;\n" );

  for ( i = 0; i < n_param_combos; i++ ) {
    fprintf( stdout, "plot( data_qr2ut_%s( :,1 ), data_qr2ut_%s( :, 2 ), '%c:%c' ); \n",
            pc_str[i], pc_str[i], colors[ i ], ticks[ i ] );
    fprintf( stdout, "plot( data_qr2ut_%s( :,1 ), data_qr2ut_%s( :, 4 ), '%c-.%c' ); \n",
            pc_str[i], pc_str[i], colors[ i ], ticks[ i ] );
  }

  fprintf( stdout, "legend( ... \n" );

  for ( i = 0; i < n_param_combos; i++ )
    fprintf( stdout, "'ref\\_qr2ut\\_%s', 'fla\\_qr2ut\\_%s', ... \n", pc_str[i], pc_str[i] );

  fprintf( stdout, "'Location', 'SouthEast' ); \n" );


  fprintf( stdout, "xlabel( 'problem size p' );\n" );
  fprintf( stdout, "ylabel( 'GFLOPS/sec.' );\n" );
  fprintf( stdout, "axis( [ 0 %d 0 %.2f ] ); \n", p_last, max_gflops );
  fprintf( stdout, "title( 'FLAME qr2ut front-end performance (%s, %s)' );\n",
           m_dim_desc, n_dim_desc );
  fprintf( stdout, "print -depsc qr2ut_front_%s_%s.eps\n", m_dim_tag, n_dim_tag );
  fprintf( stdout, "hold off;\n");
  fflush( stdout );

  FLA_Finalize( );

  return 0;
}