Ejemplo n.º 1
0
// Solve a linear system using a previously computed QR_UT factorization.
//
//   A : the factored matrix (Householder vectors plus the upper triangular
//       factor in its top width(A) rows, as consumed by FLA_Apply_Q_UT and
//       FLA_Trsm_external below).
//   T : the block Householder factors that accompany A.
//   B : the right-hand side(s); left unmodified because a copy is used.
//   X : receives the solution.
//
// Always returns FLA_SUCCESS.
FLA_Error FLA_QR_UT_solve( FLA_Obj A, FLA_Obj T, FLA_Obj B, FLA_Obj X )
{
  FLA_Obj work, rhs;
  FLA_Obj A_top, A_bottom;
  FLA_Obj rhs_top, rhs_bottom;

  // Validate the arguments when error checking is enabled.
  if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING )
    FLA_QR_UT_solve_check( A, T, B, X );

  // Workspace needed to apply Q, sized from T and B.
  FLA_Apply_Q_UT_create_workspace( T, B, &work );

  // Work on a private copy of the right-hand side so B stays intact.
  FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, B, &rhs );

  // rhs := Q' * rhs
  FLA_Apply_Q_UT( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_FORWARD, FLA_COLUMNWISE,
                  A, T, work, rhs );

  // Isolate the top width(A) rows of both A and the transformed
  // right-hand side; only these top partitions are used below.
  FLA_Part_2x1( A,     &A_top,
                       &A_bottom,      FLA_Obj_width( A ), FLA_TOP );
  FLA_Part_2x1( rhs,   &rhs_top,
                       &rhs_bottom,    FLA_Obj_width( A ), FLA_TOP );

  // Triangular solve against the upper triangle stored in A_top.
  FLA_Trsm_external( FLA_LEFT, FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE,
                     FLA_NONUNIT_DIAG, FLA_ONE, A_top, rhs_top );

  // Hand the result back to the caller.
  FLA_Copy_external( rhs_top, X );

  // Release the temporaries.
  FLA_Obj_free( &rhs );
  FLA_Obj_free( &work );

  return FLA_SUCCESS;
}
Ejemplo n.º 2
0
/*
 * Overwrite `matrix` with a pseudo-inverse built from its eigendecomposition,
 * using libflame.
 *
 *   matrix  : buffer of a size x size matrix; attached to a libflame object
 *             and overwritten with the result.
 *   eigvals : buffer of length `size`; receives the eigenvalues computed by
 *             FLA_Hevd.
 *   size    : matrix dimension.
 *   cutoff  : eigenvalues strictly below this value are treated as zero.
 *
 * Returns 0 on success, or -1 if the temporary buffer cannot be allocated
 * (in which case `matrix` and `eigvals` are left untouched).
 *
 * Method: with eigenpairs (lambda_i, v_i), the eigenvector columns are scaled
 * by 1/sqrt(lambda_i) (or 0 when lambda_i < cutoff) and the result is formed
 * as B * B^T, which equals sum_i v_i v_i^T / lambda_i over the retained i.
 */
int phonopy_pinv_libflame(double *matrix,
			  double *eigvals,
			  const int size,
			  const double cutoff)
{
  FLA_Obj A, B, l;
  double *inv_eigvals;
  int i;

  inv_eigvals = (double*)malloc(sizeof(double) * size);
  /* Bail out before touching libflame state: the loop below writes into
     this buffer, so a failed allocation must not be ignored. */
  if (inv_eigvals == NULL && size > 0) {
    return -1;
  }
  
  FLA_Init();
  FLA_Obj_create_without_buffer(FLA_DOUBLE, size, size, &A);
  FLA_Obj_attach_buffer(matrix, 0, 0, &A);
  
  FLA_Obj_create_without_buffer(FLA_DOUBLE, size, 1, &l);
  FLA_Obj_attach_buffer(eigvals, 0, 0, &l);

  /* Eigensolver: run on a copy B so that A can later receive the result.
     NOTE(review): relies on FLA_Hevd leaving the eigenvectors in B and the
     eigenvalues in l -- confirm against the libflame documentation. */
  FLA_Obj_create_copy_of(FLA_NO_TRANSPOSE, A, &B);
  FLA_Hevd(FLA_EVD_WITH_VECTORS, FLA_LOWER_TRIANGULAR, B, l);
  
  /* Detach l from eigvals; it is re-created on inv_eigvals below. */
  FLA_Obj_free_without_buffer(&l);
  
  /* Per-eigenvector scale factors; small eigenvalues are discarded. */
  for (i = 0; i < size; i++) {
    if (eigvals[i] < cutoff) {
      inv_eigvals[i] = 0;
    } else {
      inv_eigvals[i] = 1.0 / sqrt(eigvals[i]);
    }
  }
  
  FLA_Obj_create_without_buffer(FLA_DOUBLE, size, 1, &l);
  FLA_Obj_attach_buffer(inv_eigvals, 0, 0, &l);
  
  /* B := B * diag(l), then A := B * B^T (lower triangle), then fill in the
     other triangle so the caller sees a full symmetric matrix. */
  FLA_Apply_diag_matrix(FLA_RIGHT, FLA_NO_CONJUGATE, l, B);
  FLA_Syrk(FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_ONE, B, FLA_ZERO, A);
  FLA_Symmetrize(FLA_LOWER_TRIANGULAR, A);
  
  /* A and l only wrap caller/local buffers; B owns its storage. */
  FLA_Obj_free_without_buffer(&A);
  FLA_Obj_free_without_buffer(&l);
  FLA_Obj_free(&B);

  FLA_Finalize();

  free(inv_eigvals);
  
  return 0;
}
Ejemplo n.º 3
0
// Apply the up-and-downdating transformation Q' to the right-hand side bR.
//
// bC and bD are passed to FLA_Apply_QUD_UT only through temporary copies, so
// the caller's bC and bD are never handed to it directly. T, C and D are
// passed through as given.
//
// Always returns FLA_SUCCESS.
FLA_Error FLA_UDdate_UT_update_rhs( FLA_Obj T, FLA_Obj bR,
                                    FLA_Obj C, FLA_Obj bC,
                                    FLA_Obj D, FLA_Obj bD )
{
	FLA_Obj workspace;
	FLA_Obj bC_tmp;
	FLA_Obj bD_tmp;

	// Validate the arguments when error checking is enabled.
	if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING )
		FLA_UDdate_UT_update_rhs_check( T, bR, C, bC, D, bD );

	// The workspace is sized from the algorithmic blocksize (length of T)
	// and the number of right-hand sides.
	FLA_Apply_QUD_UT_create_workspace( T, bR, &workspace );

	// Duplicate bC and bD so that their original contents are not
	// destroyed by the application of Q'.
	FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, bC, &bC_tmp );
	FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, bD, &bD_tmp );

	// Apply Q' from the left to the stacked right-hand sides.
	FLA_Apply_QUD_UT( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_FORWARD, FLA_COLUMNWISE,
	                  T, workspace,
	                     bR,
	                  C, bC_tmp,
	                  D, bD_tmp );

	// Release the duplicates, then the workspace.
	FLA_Obj_free( &bC_tmp );
	FLA_Obj_free( &bD_tmp );
	FLA_Obj_free( &workspace );

	return FLA_SUCCESS;
}
Ejemplo n.º 4
0
// Run one SYMM experiment: time the selected implementation n_repeats times,
// report the best performance in *perf, and report a residual in *residual.
//
//   params    : supplies the FLASH blocksize (b_flash) and the flat
//               algorithmic blocksize (b_alg_flat).
//   var       : variant number, forwarded to the control tree constructor.
//   sc_str    : storage characters for A, B and C (indices 0, 1, 2).
//   datatype  : datatype of all matrices and vectors.
//   p_cur     : current problem size; m and n are derived from it.
//   pci       : index into pc_str selecting the side/uplo parameter combo.
//   n_repeats : number of timed repetitions; the minimum time is kept.
//   impl      : which implementation (front-end, variant, ...) to test.
//   perf      : out, flops of the operation / best time / FLOPS_PER_UNIT_PERF.
//   residual  : out, max elementwise difference between C*x and the
//               independently computed reference (see below).
void libfla_test_symm_experiment( test_params_t params,
                                  unsigned int  var,
                                  char*         sc_str,
                                  FLA_Datatype  datatype,
                                  unsigned int  p_cur,
                                  unsigned int  pci,
                                  unsigned int  n_repeats,
                                  signed int    impl,
                                  double*       perf,
                                  double*       residual )
{
	dim_t        b_flash    = params.b_flash;
	dim_t        b_alg_flat = params.b_alg_flat;
	double       time_min   = 1e9;
	double       time;
	unsigned int i;
	unsigned int m;
	signed int   m_input    = -1;
	unsigned int n;
	signed int   n_input    = -1;
	FLA_Side     side;
	FLA_Uplo     uplo;
	FLA_Obj      A, B, C, x, y, z, w;
	FLA_Obj      alpha, beta;
	FLA_Obj      C_save;
	FLA_Obj      A_test, B_test, C_test;

	// Determine the dimensions. A negative *_input scales p_cur down by
	// its magnitude; a non-negative one uses p_cur as is.
	if ( m_input < 0 ) m = p_cur / abs(m_input);
	else               m = p_cur;
	if ( n_input < 0 ) n = p_cur / abs(n_input);
	else               n = p_cur;

	// Translate parameter characters to libflame constants.
	FLA_Param_map_char_to_flame_side( &pc_str[pci][0], &side );
	FLA_Param_map_char_to_flame_uplo( &pc_str[pci][1], &uplo );

	// Create the matrices for the current operation. A is m x m when it
	// multiplies from the left and n x n when it multiplies from the right;
	// the intermediate vector w is sized to match.
	if ( side == FLA_LEFT )
	{
		libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[0], m, m, &A );

		// Create vectors for use in test.
		FLA_Obj_create( datatype, n, 1, 0, 0, &x );
		FLA_Obj_create( datatype, m, 1, 0, 0, &y );
		FLA_Obj_create( datatype, m, 1, 0, 0, &z );
		FLA_Obj_create( datatype, m, 1, 0, 0, &w );
	}
	else
	{
		libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[0], n, n, &A );

		// Create vectors for use in test.
		FLA_Obj_create( datatype, n, 1, 0, 0, &x );
		FLA_Obj_create( datatype, m, 1, 0, 0, &y );
		FLA_Obj_create( datatype, m, 1, 0, 0, &z );
		FLA_Obj_create( datatype, n, 1, 0, 0, &w );
	}
	libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], m, n, &B );
	libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[2], m, n, &C );

	// Initialize the test matrices.
	FLA_Random_symm_matrix( uplo, A );
	FLA_Random_matrix( B );
	FLA_Random_matrix( C );

	// Initialize the test vectors.
	FLA_Random_matrix( x );
	FLA_Set( FLA_ZERO, y );
	FLA_Set( FLA_ZERO, z );
	FLA_Set( FLA_ZERO, w );

	// Set constants.
	alpha = FLA_TWO;
	beta  = FLA_MINUS_ONE;

	// Save the original object contents in a temporary object.
	FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, C, &C_save );

	// Use hierarchical matrices if we're testing the FLASH front-end.
	if ( impl == FLA_TEST_HIER_FRONT_END )
	{
		FLASH_Obj_create_hier_copy_of_flat( A, 1, &b_flash, &A_test );
		FLASH_Obj_create_hier_copy_of_flat( B, 1, &b_flash, &B_test );
		FLASH_Obj_create_hier_copy_of_flat( C, 1, &b_flash, &C_test );
	}
	else
	{
		A_test = A;
		B_test = B;
		C_test = C;
	}

	// Create a control tree for the individual variants.
	if ( impl == FLA_TEST_FLAT_UNB_VAR ||
	     impl == FLA_TEST_FLAT_OPT_VAR ||
	     impl == FLA_TEST_FLAT_BLK_VAR ||
	     impl == FLA_TEST_FLAT_UNB_EXT ||
	     impl == FLA_TEST_FLAT_BLK_EXT )
		libfla_test_symm_cntl_create( var, b_alg_flat );

	// Repeat the experiment n_repeats times and record results.
	for ( i = 0; i < n_repeats; ++i )
	{
		// Restore C before every repetition so each run starts from the
		// same input.
		if ( impl == FLA_TEST_HIER_FRONT_END )
			FLASH_Obj_hierarchify( C_save, C_test );
		else
			FLA_Copy_external( C_save, C_test );
		
		time = FLA_Clock();

		libfla_test_symm_impl( impl, side, uplo, alpha, A_test, B_test, beta, C_test );
		
		time = FLA_Clock() - time;
		time_min = min( time_min, time );
	}

	// Copy the result to flat matrix C.
	if ( impl == FLA_TEST_HIER_FRONT_END )
	{
		FLASH_Obj_flatten( C_test, C );
	}
	else
	{
		// No action needed since C_test and C refer to the same object.
	}

	// Free the hierarchical matrices if we're testing the FLASH front-end.
	if ( impl == FLA_TEST_HIER_FRONT_END )
	{
		FLASH_Obj_free( &A_test );
		FLASH_Obj_free( &B_test );
		FLASH_Obj_free( &C_test );
	}

	// Free the control trees if we're testing the variants.
	if ( impl == FLA_TEST_FLAT_UNB_VAR ||
	     impl == FLA_TEST_FLAT_OPT_VAR ||
	     impl == FLA_TEST_FLAT_BLK_VAR ||
	     impl == FLA_TEST_FLAT_UNB_EXT ||
	     impl == FLA_TEST_FLAT_BLK_EXT )
		libfla_test_symm_cntl_free();

	// Compute the performance of the best experiment repeat. A symmetric
	// matrix-matrix multiply costs 2*m*m*n flops (left) or 2*m*n*n flops
	// (right). The leading 2.0 also forces the product into double so it
	// cannot wrap around in unsigned arithmetic.
	if ( side == FLA_LEFT )
		*perf = ( 2.0 * m * m * n ) / time_min / FLOPS_PER_UNIT_PERF;
	else
		*perf = ( 2.0 * m * n * n ) / time_min / FLOPS_PER_UNIT_PERF;
	if ( FLA_Obj_is_complex( A ) ) *perf *= 4.0;

	// Compute:
	//   y = C * x
	// and compare to
	//   z = ( beta * C_orig + alpha * A * B ) x      (side = left)
	//   z = ( beta * C_orig + alpha * B * A ) x      (side = right)
	FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_ONE, C, x, FLA_ZERO, y );

	if ( side == FLA_LEFT )
	{
		FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_ONE, B, x, FLA_ZERO, w );
		FLA_Symv_external( uplo,             alpha,   A, w, FLA_ZERO, z );
	}
	else
	{
		FLA_Symv_external( uplo,             FLA_ONE, A, x, FLA_ZERO, w );
		FLA_Gemv_external( FLA_NO_TRANSPOSE, alpha,   B, w, FLA_ZERO, z );
	}
	FLA_Gemv_external( FLA_NO_TRANSPOSE, beta, C_save, x, FLA_ONE, z );

	// Report the largest elementwise difference between y and z.
	*residual = FLA_Max_elemwise_diff( y, z );

	// Free the supporting flat objects.
	FLA_Obj_free( &C_save );

	// Free the flat test matrices.
	FLA_Obj_free( &A );
	FLA_Obj_free( &B );
	FLA_Obj_free( &C );
	FLA_Obj_free( &x );
	FLA_Obj_free( &y );
	FLA_Obj_free( &z );
	FLA_Obj_free( &w );
}
Ejemplo n.º 5
0
// Sort the elements of vector x in place.
//
//   direct : FLA_FORWARD selects the forward sort kernels; any other value
//            selects the backward kernels.
//   x      : the vector to sort. Only FLA_FLOAT and FLA_DOUBLE data are
//            handled here; other datatypes fall through the switch unsorted.
//
// When x does not have unit stride it is sorted through a temporary copy,
// which is then copied back into x and freed.
//
// Always returns FLA_SUCCESS.
FLA_Error FLA_Sort( FLA_Direct direct, FLA_Obj x )
{
	FLA_Datatype datatype;
	FLA_Obj      x_use;
	dim_t        m_x;
	dim_t        inc_x;
	int          x_is_copied = 0;

	if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING )
		FLA_Sort_check( direct, x );

	datatype = FLA_Obj_datatype( x );

	m_x      = FLA_Obj_vector_dim( x );
	inc_x    = FLA_Obj_vector_inc( x );

	// If the vector does not have unit stride, copy it to a temporary vector
	// that does have unit stride.
	if ( inc_x != 1 )
	{
		FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, x, &x_use );
		inc_x = FLA_Obj_vector_inc( x_use );

		// Remember that a temporary was made. inc_x now describes the
		// copy, so it can no longer be used to decide whether to copy
		// back and free below.
		x_is_copied = 1;
	}
	else
	{
		x_use = x;
	}

	switch ( datatype )
	{
		case FLA_FLOAT:
		{
			float* x_p = ( float* ) FLA_FLOAT_PTR( x_use );

			if ( direct == FLA_FORWARD )
				FLA_Sort_f_ops( m_x,
				                x_p, inc_x );
			else // if ( direct == FLA_BACKWARD )
				FLA_Sort_b_ops( m_x,
				                x_p, inc_x );

			break;
		}

		case FLA_DOUBLE:
		{
			double* x_p = ( double* ) FLA_DOUBLE_PTR( x_use );

			if ( direct == FLA_FORWARD )
				FLA_Sort_f_opd( m_x,
				                x_p, inc_x );
			else // if ( direct == FLA_BACKWARD )
				FLA_Sort_b_opd( m_x,
				                x_p, inc_x );

			break;
		}

	}

	// Propagate the sorted data back to the caller's vector and release
	// the temporary.
	if ( x_is_copied )
	{
		FLA_Copy( x_use, x );
		FLA_Obj_free( &x_use );
	}

	return FLA_SUCCESS;
}
Ejemplo n.º 6
0
// Run one QR_UT experiment: factor an m x n matrix n_repeats times with the
// selected implementation, keep the best time, then use the factorization to
// solve a system and report a residual.
//
//   params    : supplies the FLASH blocksize (b_flash) and the flat
//               algorithmic blocksize (b_alg_flat).
//   var       : variant number; also influences the shape of T.
//   sc_str    : storage characters for A (index 0) and T (index 1).
//   datatype  : datatype of all matrices and vectors.
//   p_cur     : current problem size; m and n are derived from it.
//   pci       : parameter combination index (not used by this operation).
//   n_repeats : number of timed repetitions.
//   impl      : which implementation to test.
//   perf      : out, flop count / best time / FLOPS_PER_UNIT_PERF.
//   residual  : out, || A' * ( A * x - b ) || computed from the saved A.
void libfla_test_qrut_experiment( test_params_t params,
                                  unsigned int  var,
                                  char*         sc_str,
                                  FLA_Datatype  datatype,
                                  unsigned int  p_cur,
                                  unsigned int  pci,
                                  unsigned int  n_repeats,
                                  signed int    impl,
                                  double*       perf,
                                  double*       residual )
{
	// impl never changes, so classify it once up front.
	const int    is_hier         = ( impl == FLA_TEST_HIER_FRONT_END );
	const int    is_flat_variant = ( impl == FLA_TEST_FLAT_UNB_VAR ||
	                                 impl == FLA_TEST_FLAT_OPT_VAR ||
	                                 impl == FLA_TEST_FLAT_BLK_VAR );

	dim_t        b_flash    = params.b_flash;
	dim_t        b_alg_flat = params.b_alg_flat;
	dim_t        m_T;
	double       time_min   = 1e9;
	double       t_start;
	double       elapsed;
	unsigned int rep;
	unsigned int m, n;
	unsigned int min_m_n;
	signed int   m_input    = -2;
	signed int   n_input    = -1;
	FLA_Obj      A, T, x, b, y, norm;
	FLA_Obj      A_save;
	FLA_Obj      A_test, T_test, x_test, b_test;

	// A negative *_input scales p_cur up by its magnitude; otherwise p_cur
	// is used directly.
	m = ( m_input < 0 ) ? p_cur * abs(m_input) : p_cur;
	n = ( n_input < 0 ) ? p_cur * abs(n_input) : p_cur;

	// Smaller of the two dimensions.
	min_m_n = min( m, n );

	// Create the matrix to be factored.
	libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[0], m, n, &A );

	// Pick the number of rows of T, which depends on the implementation
	// and variant; its width is always min( m, n ).
	if ( impl == FLA_TEST_FLAT_FRONT_END ||
	     ( impl == FLA_TEST_FLAT_BLK_VAR && var == 1 ) )
		m_T = b_alg_flat;
	else if ( var == 2 )
		m_T = min_m_n;
	else
		m_T = 1;

	libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], m_T, min_m_n, &T );

	// Fill A with random data and keep a pristine copy, since the
	// factorization overwrites its input.
	FLA_Random_matrix( A );
	FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &A_save );

	// Vectors of the linear system used for the residual check.
	FLA_Obj_create( datatype, n, 1, 0, 0, &x );
	FLA_Obj_create( datatype, m, 1, 0, 0, &b );
	FLA_Obj_create( datatype, n, 1, 0, 0, &y );

	// Real-valued scalar that receives the residual norm.
	FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );

	// Random right-hand side.
	FLA_Random_matrix( b );

	// The FLASH front-end works on hierarchical objects; every other
	// implementation works directly on the flat ones.
	if ( is_hier )
	{
		FLASH_QR_UT_create_hier_matrices( A, 1, &b_flash, &A_test, &T_test );
		FLASH_Obj_create_hier_copy_of_flat( b, 1, &b_flash, &b_test );
		FLASH_Obj_create_hier_copy_of_flat( x, 1, &b_flash, &x_test );
	}
	else
	{
		A_test = A;
		T_test = T;
	}

	// The individual variants need a control tree.
	if ( is_flat_variant )
		libfla_test_qrut_cntl_create( var, b_alg_flat );

	// Timed repetitions; the minimum is kept.
	for ( rep = 0; rep < n_repeats; ++rep )
	{
		// Start each repetition from the original matrix.
		if ( is_hier )
			FLASH_Obj_hierarchify( A_save, A_test );
		else
			FLA_Copy_external( A_save, A_test );

		t_start = FLA_Clock();

		libfla_test_qrut_impl( impl, A_test, T_test );

		elapsed  = FLA_Clock() - t_start;
		time_min = min( time_min, elapsed );
	}

	// Solve with the factorization left by the last repetition.
	if ( is_hier )
	{
		FLASH_QR_UT_solve( A_test, T_test, b_test, x_test );
		FLASH_Obj_flatten( x_test, x );
	}
	else
	{
		FLA_QR_UT_solve( A_test, T_test, b, x );
	}

	// Hierarchical objects are only allocated for the FLASH front-end.
	if ( is_hier )
	{
		FLASH_Obj_free( &A_test );
		FLASH_Obj_free( &T_test );
		FLASH_Obj_free( &b_test );
		FLASH_Obj_free( &x_test );
	}

	// Tear down the control tree created above.
	if ( is_flat_variant )
		libfla_test_qrut_cntl_free();

	// Performance of the fastest repetition.
	*perf = (         2.0   * m * n * n - 
	          ( 2.0 / 3.0 ) * n * n * n ) / time_min / FLOPS_PER_UNIT_PERF;
	if ( FLA_Obj_is_complex( A ) ) *perf *= 4.0;

	// Residual: b := A * x - b, then y := A' * b, then || y ||.
	FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_ONE, A_save, x, FLA_MINUS_ONE, b );
	FLA_Gemv_external( FLA_CONJ_TRANSPOSE, FLA_ONE, A_save, b, FLA_ZERO, y );
	FLA_Nrm2_external( y, norm );
	FLA_Obj_extract_real_scalar( norm, residual );

	// Release the supporting flat objects.
	FLA_Obj_free( &x );
	FLA_Obj_free( &b );
	FLA_Obj_free( &y );
	FLA_Obj_free( &norm );
	FLA_Obj_free( &A_save );

	// Release the flat test matrices.
	FLA_Obj_free( &A );
	FLA_Obj_free( &T );
}
Ejemplo n.º 7
0
// Time a tridiagonal eigenvalue decomposition (with vectors) and check it.
//
// The selected variant/type is run n_repeats times on fresh copies of A, G,
// d and e; the fastest run is reported.
//
//   *dtime  : best wall-clock time over the repetitions.
//   *gflops : estimated flop count divided by *dtime and 1e9.
//   *diff1  : Frobenius norm of tril( V * diag(l) * V' ) - A_orig.
//   *diff2  : Frobenius norm of V * V' - I.
//
// On return A, G, d and e hold their original contents again; l holds the
// sorted eigenvalues from the last repetition. k_accum is accepted for
// interface compatibility and is not used here.
void time_Tevd_v(
               int variant, int type, int n_repeats, int m, int k_accum, int b_alg, int n_iter_max,
               FLA_Obj A_orig, FLA_Obj d, FLA_Obj e, FLA_Obj G, FLA_Obj R, FLA_Obj W, FLA_Obj A, FLA_Obj l,
               double *dtime, double *diff1, double* diff2, double *gflops )
{
  const double k         = 2.00;
  double       best_time = 1.0e9;
  double       cubic_mult;
  int          rep;

  FLA_Obj A_backup, G_backup, d_backup, e_backup;
  FLA_Obj V, A_rebuilt, nrm, ident;

  // Skip list: prepend "( variant == ... ) ||" terms to FALSE to disable
  // particular variant/type combinations without running them.
  if ( FALSE )
  {
    *dtime  = 0.0;
    *gflops = 0.0;
    *diff1  = 0.0;
    *diff2  = 0.0;
    return;
  }

  // Snapshot every object that the algorithm overwrites.
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_backup );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, G, &G_backup );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, d, &d_backup );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, e, &e_backup );

  FLA_Copy_external( A, A_backup );
  FLA_Copy_external( G, G_backup );
  FLA_Copy_external( d, d_backup );
  FLA_Copy_external( e, e_backup );

  for ( rep = 0; rep < n_repeats; rep++ )
  {
    // Every repetition starts from the same inputs.
    FLA_Copy_external( A_backup, A );
    FLA_Copy_external( G_backup, G );
    FLA_Copy_external( d_backup, d );
    FLA_Copy_external( e_backup, e );

    *dtime = FLA_Clock();

    if ( variant == 0 )
    {
      // Reference implementation.
      REF_Tevd_v( d, e, A );
    }
    else if ( variant == 1 )
    {
      if ( type == FLA_ALG_UNB_OPT )
        FLA_Tevd_v_opt_var1( n_iter_max, d, e, G, A, b_alg );
    }
    else if ( variant == 2 )
    {
      if ( type == FLA_ALG_UNB_OPT )
        FLA_Tevd_v_opt_var2( n_iter_max, d, e, G, R, W, A, b_alg );
    }

    *dtime = FLA_Clock() - *dtime;
    best_time = min( *dtime, best_time );
  }

  // ---- Accuracy check on the result of the last repetition ----

  // Eigenvalues go to l and are sorted together with the vectors in A.
  FLA_Copy( d, l );
  FLA_Sort_evd( FLA_FORWARD, l, A );

  FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &V );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_rebuilt );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &ident );
  FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &nrm );

  // A_rebuilt := tril( ( V * diag(l) ) * V' )
  FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, l, A );
  FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
            FLA_ONE, A, V, FLA_ZERO, A_rebuilt );
  FLA_Triangularize( FLA_LOWER_TRIANGULAR, FLA_NONUNIT_DIAG, A_rebuilt );

  // diff1 := || A_rebuilt - A_orig ||_F
  FLA_Axpy( FLA_MINUS_ONE, A_orig, A_rebuilt );
  FLA_Norm_frob( A_rebuilt, nrm );
  FLA_Obj_extract_real_scalar( nrm, diff1 );

  // diff2 := || V * V' - I ||_F  (A_rebuilt is reused as a second copy of V)
  FLA_Set_to_identity( ident );
  FLA_Copy( V, A_rebuilt );
  FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
            FLA_ONE, V, A_rebuilt, FLA_MINUS_ONE, ident );
  FLA_Norm_frob( ident, nrm );
  FLA_Obj_extract_real_scalar( nrm, diff2 );

  FLA_Obj_free( &V );
  FLA_Obj_free( &A_rebuilt );
  FLA_Obj_free( &ident );
  FLA_Obj_free( &nrm );

  // ---- Performance ----

  // The cubic term is weighted twice as heavily for complex data.
  if ( FLA_Obj_is_complex( A ) )
    cubic_mult = 2.0;
  else
    cubic_mult = 1.0;

  *gflops = (
                           (       4.5 * k * m * m     ) +
              cubic_mult * (       3.0 * k * m * m * m ) ) /
            best_time / 1e9;

  *dtime = best_time;

  // Leave the caller's objects as they were on entry.
  FLA_Copy_external( A_backup, A );
  FLA_Copy_external( G_backup, G );
  FLA_Copy_external( d_backup, d );
  FLA_Copy_external( e_backup, e );

  FLA_Obj_free( &A_backup );
  FLA_Obj_free( &G_backup );
  FLA_Obj_free( &d_backup );
  FLA_Obj_free( &e_backup );
}
Ejemplo n.º 8
0
// Time a Hermitian eigenvalue decomposition (lower triangle, with vectors),
// broken down into its three stages, and check the result.
//
// The selected variant is run n_repeats times on a fresh copy of A. The
// repetition with the smallest total time supplies all four reported times.
//
//   *dtime, *gflops            : total time and rate.
//   *dtime_tred, *gflops_tred  : tridiagonal reduction stage.
//   *dtime_tevd, *gflops_tevd  : tridiagonal eigensolver stage.
//   *dtime_appq, *gflops_appq  : apply-Q stage.
//   *diff1 : Frobenius norm of tril( V * diag(l) * V' ) - A (original).
//   *diff2 : Frobenius norm of V * V' - I.
//   *k_perf: value returned by the FLA variants; 0 for the reference ones.
//
// On return A holds its original contents again.
void time_Hevd_lv_components(
               int variant, int type, int n_repeats, int m, int n_iter_max, int k_accum, int b_alg,
               FLA_Obj A, FLA_Obj l,
               double* dtime, double* diff1, double* diff2, double* gflops,
               double* dtime_tred, double* gflops_tred,
               double* dtime_tevd, double* gflops_tevd,
               double* dtime_appq, double* gflops_appq, int* k_perf )
{
  int     rep;
  double  k;
  double  best_total = 1.0e9;
  double  best_tred  = 1.0e9;
  double  best_tevd  = 1.0e9;
  double  best_appq  = 1.0e9;
  double  flops_tred;
  double  flops_tevd;
  double  flops_appq;
  double  mult_tred;
  double  mult_tevd;
  double  mult_appq;

  FLA_Obj A_backup, Z;
  FLA_Obj V, A_rebuilt, nrm, ident;

  // Variants on this list are disabled: report zeros and do no work.
  if ( variant == -3 || variant == -4 || variant == -5 )
  {
    *gflops      = 0.0;
    *dtime       = 0.0;
    *diff1       = 0.0;
    *diff2       = 0.0;
    *dtime_tred  = 0.0;
    *dtime_tevd  = 0.0;
    *dtime_appq  = 0.0;
    *gflops_tred = 0.0;
    *gflops_tevd = 0.0;
    *gflops_appq = 0.0;
    *k_perf      = 0;
    return;
  }

  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_backup );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &Z );

  FLA_Copy_external( A, A_backup );

  for ( rep = 0; rep < n_repeats; rep++ )
  {
    // Every repetition starts from the original matrix.
    FLA_Copy_external( A_backup, A );

    *dtime = FLA_Clock();

    switch ( variant )
    {
      // Reference drivers.
      case -3:
        *k_perf = 0;
        REF_Hevd_lv( A, l,
                     dtime_tred, dtime_tevd, dtime_appq );
        break;

      case -4:
        *k_perf = 0;
        REF_Hevdd_lv( A, l,
                      dtime_tred, dtime_tevd, dtime_appq );
        break;

      case -5:
        *k_perf = 0;
        REF_Hevdr_lv( A, l, Z,
                      dtime_tred, dtime_tevd, dtime_appq );
        break;

      // Reference drivers, stage by stage.
      case 0:
        *k_perf = 0;
        REF_Hevd_lv_components( A, l,
                                dtime_tred, dtime_tevd, dtime_appq );
        break;

      case -1:
        *k_perf = 0;
        REF_Hevdd_lv_components( A, l,
                                 dtime_tred, dtime_tevd, dtime_appq );
        break;

      case -2:
        *k_perf = 0;
        REF_Hevdr_lv_components( A, l, Z,
                                 dtime_tred, dtime_tevd, dtime_appq );
        break;

      // libflame variant 1.
      case 1:
        *k_perf = FLA_Hevd_lv_var1_components( n_iter_max, A, l, k_accum, b_alg,
                                               dtime_tred, dtime_tevd, dtime_appq );
        break;

      // libflame variant 2.
      case 2:
        *k_perf = FLA_Hevd_lv_var2_components( n_iter_max, A, l, k_accum, b_alg,
                                               dtime_tred, dtime_tevd, dtime_appq );
        break;
    }

    *dtime = FLA_Clock() - *dtime;

    // Keep the stage times that belong to the fastest overall run.
    if ( *dtime < best_total )
    {
      best_total = *dtime;
      best_tred  = *dtime_tred;
      best_tevd  = *dtime_tevd;
      best_appq  = *dtime_appq;
    }
  }

  *dtime      = best_total;
  *dtime_tred = best_tred;
  *dtime_tevd = best_tevd;
  *dtime_appq = best_appq;

  // ---- Accuracy check on the result of the last repetition ----

  // These variants return their eigenvectors in Z rather than in A.
  if ( variant == -2 || variant == -5 ) FLA_Copy( Z, A );

  FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &V );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_rebuilt );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &ident );
  FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &nrm );

  // A_rebuilt := tril( ( V * diag(l) ) * V' )
  FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, l, A );
  FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
            FLA_ONE, A, V, FLA_ZERO, A_rebuilt );
  FLA_Triangularize( FLA_LOWER_TRIANGULAR, FLA_NONUNIT_DIAG, A_rebuilt );

  // diff1 := || A_rebuilt - A_backup ||_F
  FLA_Axpy( FLA_MINUS_ONE, A_backup, A_rebuilt );
  FLA_Norm_frob( A_rebuilt, nrm );
  FLA_Obj_extract_real_scalar( nrm, diff1 );

  // diff2 := || V * V' - I ||_F  (A_rebuilt is reused as a second copy of V)
  FLA_Set_to_identity( ident );
  FLA_Copy( V, A_rebuilt );
  FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
            FLA_ONE, V, A_rebuilt, FLA_MINUS_ONE, ident );
  FLA_Norm_frob( ident, nrm );
  FLA_Obj_extract_real_scalar( nrm, diff2 );

  FLA_Obj_free( &V );
  FLA_Obj_free( &A_rebuilt );
  FLA_Obj_free( &ident );
  FLA_Obj_free( &nrm );

  // ---- Performance ----

  k = 2.00;

  flops_tred = ( ( 4.0 / 3.0 )   * m * m * m );
  flops_tevd = (   4.5           * k * m * m     +
                   3.0           * k * m * m * m );

  // The apply-Q stage is charged more for variants -1, -2, -4 and -5.
  if ( variant == -1 || variant == -2 ||
       variant == -4 || variant == -5 )
    flops_appq = ( 2.0           * m * m * m );
  else
    flops_appq = ( 4.0 / 3.0     * m * m * m );

  // Per-stage weights for complex versus real data.
  if ( FLA_Obj_is_complex( A ) )
  {
    mult_tred = 4.0;
    mult_tevd = 2.0;
    mult_appq = 4.0;
  }
  else
  {
    mult_tred = 1.0;
    mult_tevd = 1.0;
    mult_appq = 1.0;
  }

  *gflops = ( mult_tred * flops_tred + 
              mult_tevd * flops_tevd + 
              mult_appq * flops_appq ) / *dtime / 1e9;

  *gflops_tred = ( mult_tred * flops_tred ) / *dtime_tred / 1e9;
  *gflops_tevd = ( mult_tevd * flops_tevd ) / *dtime_tevd / 1e9;
  *gflops_appq = ( mult_appq * flops_appq ) / *dtime_appq / 1e9;

  // Leave the caller's matrix as it was on entry.
  FLA_Copy_external( A_backup, A );

  FLA_Obj_free( &A_backup );
  FLA_Obj_free( &Z );
}
Ejemplo n.º 9
0
/*
 * Driver: reduce a random m x m matrix to bidiagonal form with either
 * libflame (is_flame != 0) or LAPACK (is_flame == 0), print the factors,
 * then run LAPACK's bidiagonal QR iteration (sbdsqr_) on the result and
 * print the updated U, V, d and e.
 *
 * Usage: prog m is_flame
 *
 * Returns 0 on success (including m == 0, which is a no-op) and -1 on a
 * usage error or a negative m. Non-zero `info` values from the LAPACK
 * routines are reported on stdout but do not stop the run.
 */
int main( int argc, char** argv ) {
  FLA_Datatype datatype = TESTTYPE;
  FLA_Obj      A, A_flame, A_lapack, C;
  int          m;
  FLA_Error    init_result; 

  FLA_Obj TU, TV, U_flame, V_flame, d_flame, e_flame, B_flame;
  FLA_Obj tauq, taup, d_lapack, e_lapack, U_lapack, V_lapack, W, B_lapack;
  testtype *buff_tauq, *buff_taup, *buff_d_lapack, *buff_e_lapack, 
    *buff_W, *buff_A_lapack, *buff_U_lapack, *buff_V_lapack;
  int lwork, info, is_flame;
  
  if ( argc == 3 ) {
    m = atoi(argv[1]);
    is_flame = atoi(argv[2]);
  } else {
    fprintf(stderr, "       \n");
    fprintf(stderr, "Usage: %s m is_flame\n", argv[0]);
    fprintf(stderr, "       m : matrix length\n");
    fprintf(stderr, "       is_flame : 1 yes, 0 no\n");
    fprintf(stderr, "       \n");
    return -1;
  }
  if ( m == 0 )
    return 0;

  /* A negative length would be handed to the object constructors and to
     LAPACK as a dimension; reject it before anything is allocated. */
  if ( m < 0 ) {
    fprintf(stderr, "Error: m must be non-negative (got %d)\n", m);
    return -1;
  }

  FLA_Init_safe( &init_result );          

  fprintf( stdout, "lapack2flame: %d x %d: \n", m, m);

  /* A is the reference input; each code path factors its own copy. */
  FLA_Obj_create( datatype, m, m, 0, 0, &A );
  FLA_Random_matrix( A ); 
  FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &A_flame  );
  FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &A_lapack );

  /* C is created and randomized but not used by the computation below. */
  FLA_Obj_create( datatype, m, m, 0, 0, &C );
  FLA_Random_matrix( C ); 


  if ( is_flame ) {
    fprintf( stdout, " flame executed\n");
    FLA_Bidiag_UT_create_T( A_flame, &TU, &TV );

    /* Bidiagonal reduction, then form U and V explicitly from copies of
       the factored matrix. */
    FLA_Bidiag_UT( A_flame, TU, TV );
    FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A_flame, &U_flame );
    FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A_flame, &V_flame );

    FLA_Bidiag_UT_form_U( U_flame, TU, U_flame );
    FLA_Bidiag_UT_form_V( V_flame, TV, V_flame );
    
    FLA_Obj_create( datatype, m,      1, 0, 0, &d_flame );
    FLA_Obj_create( datatype, m - 1,  1, 0, 0, &e_flame );
    FLA_Bidiag_UT_extract_diagonals( A_flame, d_flame, e_flame );

    /* Assemble the bidiagonal matrix B for display: d on the diagonal and
       e on the diagonal of the top-right partition BTR. */
    FLA_Obj_create( datatype, m, m, 0, 0, &B_flame ); FLA_Set( FLA_ZERO, B_flame );
    {
      FLA_Obj BTL, BTR, BBL, BBR;
      FLA_Part_2x2( B_flame, &BTL, &BTR, &BBL, &BBR, 1,1, FLA_BL );
      FLA_Set_diagonal_matrix( d_flame, B_flame );
      FLA_Set_diagonal_matrix( e_flame, BTR );
    }

    if (1) {
      fprintf( stdout, " - FLAME ----------\n");
      FLA_Obj_fshow( stdout, " - Given A - ", A, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - A - ", A_flame, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - U - ", U_flame, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - V - ", V_flame, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - d - ", d_flame, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - e - ", e_flame, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - B - ", B_flame, "% 6.4e", "------");
    }
  } else {
    fprintf( stdout, " lapack executed\n");

    FLA_Obj_create( datatype, m, 1, 0, 0, &tauq );
    FLA_Obj_create( datatype, m, 1, 0, 0, &taup );
    FLA_Obj_create( datatype, m,      1, 0, 0, &d_lapack );
    FLA_Obj_create( datatype, m - 1,  1, 0, 0, &e_lapack );

    buff_A_lapack = (testtype*)FLA_Obj_buffer_at_view( A_lapack );
    buff_tauq     = (testtype*)FLA_Obj_buffer_at_view( tauq );
    buff_taup     = (testtype*)FLA_Obj_buffer_at_view( taup );
    buff_d_lapack = (testtype*)FLA_Obj_buffer_at_view( d_lapack );
    buff_e_lapack = (testtype*)FLA_Obj_buffer_at_view( e_lapack );

    lwork = 32*m;
    
    FLA_Obj_create( datatype, lwork, 1, 0, 0, &W );
    buff_W = (testtype*)FLA_Obj_buffer_at_view( W );
    sgebrd_( &m, &m, 
             buff_A_lapack, &m,
             buff_d_lapack,
             buff_e_lapack,
             buff_tauq,
             buff_taup,
             buff_W,
             &lwork,
             &info );
    if (info != 0)
      printf( " Error sgebrd info = %d\n", info );

    FLA_Obj_create( datatype, m, m, 0, 0, &U_lapack );
    FLA_Obj_create( datatype, m, m, 0, 0, &V_lapack );
    
    FLA_Copy( A_lapack, U_lapack );
    FLA_Copy( A_lapack, V_lapack );

    buff_U_lapack = (testtype*)FLA_Obj_buffer_at_view( U_lapack );
    buff_V_lapack = (testtype*)FLA_Obj_buffer_at_view( V_lapack );

    sorgbr_( "Q", &m, &m, &m,
             buff_U_lapack, &m,
             buff_tauq, 
             buff_W,
             &lwork,
             &info );
    if (info != 0)
      printf( " Error sorgbr(Q) info = %d\n", info );
    
    sorgbr_( "P", &m, &m, &m,
             buff_V_lapack, &m,
             buff_taup,
             buff_W,
             &lwork,
             &info );
    if (info != 0)
      printf( " Error sorgbr(P) info = %d\n", info );

    /* Assemble the bidiagonal matrix B for display, as in the FLAME path. */
    FLA_Obj_create( datatype, m, m, 0, 0, &B_lapack ); FLA_Set( FLA_ZERO, B_lapack );
    {
      FLA_Obj BTL, BTR, BBL, BBR;
      FLA_Part_2x2( B_lapack, &BTL, &BTR, &BBL, &BBR, 1,1, FLA_BL );
      FLA_Set_diagonal_matrix( d_lapack, B_lapack );
      FLA_Set_diagonal_matrix( e_lapack, BTR );
    }
    
    FLA_Obj_free( &W );    


    if (1) {
      fprintf( stdout, " - LAPACK ----------\n");
      FLA_Obj_fshow( stdout, " - Given A - ", A, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - A - ", A_lapack, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - U - ", U_lapack, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - V - ", V_lapack, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - d - ", d_lapack, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - e - ", e_lapack, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - B - ", B_lapack, "% 6.4e", "------");
    }
  }

  {
    testtype     dummy;
    int          zero = 0, one = 1;
    FLA_Obj      D_lapack;

    FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &D_lapack ); FLA_Set( FLA_ZERO, D_lapack );

    /* In the FLAME path the buff_* pointers were never set; point them at
       the libflame results so the same sbdsqr_ call serves both paths. */
    if ( is_flame ) {
      buff_d_lapack = (testtype*)FLA_Obj_buffer_at_view( d_flame );
      buff_e_lapack = (testtype*)FLA_Obj_buffer_at_view( e_flame );
      buff_U_lapack = (testtype*)FLA_Obj_buffer_at_view( U_flame );
      buff_V_lapack = (testtype*)FLA_Obj_buffer_at_view( V_flame );
    }

    FLA_Obj_create( datatype, 4*m, 1, 0, 0, &W );
    buff_W = (testtype*)FLA_Obj_buffer_at_view( W );
    sbdsqr_( "U", &m, &m, &m, &zero, 
             buff_d_lapack, buff_e_lapack, 
             buff_V_lapack, &m, 
             buff_U_lapack, &m, 
             &dummy, &one, 
             buff_W, &info );
    FLA_Obj_free( &W );
    if (info != 0)
      printf( " Error info = %d\n", info );

    if ( is_flame )
      FLA_Set_diagonal_matrix( d_flame, D_lapack );
    else
      FLA_Set_diagonal_matrix( d_lapack, D_lapack );

    if ( is_flame ) {
      fprintf( stdout, " - FLAME ----------\n");
      FLA_Obj_fshow( stdout, " - U - ", U_flame, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - V - ", V_flame, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - d - ", d_flame, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - e - ", e_flame, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - D - ", D_lapack, "% 6.4e", "------");
    } else {
      fprintf( stdout, " - LAPACK ----------\n");
      FLA_Obj_fshow( stdout, " - U - ", U_lapack, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - V - ", V_lapack, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - d - ", d_lapack, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - e - ", e_lapack, "% 6.4e", "------");
      FLA_Obj_fshow( stdout, " - D - ", D_lapack, "% 6.4e", "------");
    }

    FLA_Obj_free( &D_lapack );
  }

  /* Free only the objects that the chosen path actually created. */
  if ( is_flame ) {
    FLA_Obj_free( &TU );
    FLA_Obj_free( &TV );
    FLA_Obj_free( &U_flame );
    FLA_Obj_free( &V_flame );
    FLA_Obj_free( &d_flame );
    FLA_Obj_free( &e_flame );
    FLA_Obj_free( &B_flame );
  } else {
    FLA_Obj_free( &tauq );
    FLA_Obj_free( &taup );
    FLA_Obj_free( &d_lapack );
    FLA_Obj_free( &e_lapack );
    FLA_Obj_free( &U_lapack );
    FLA_Obj_free( &V_lapack );
    FLA_Obj_free( &B_lapack );
  }
  FLA_Obj_free( &A );
  FLA_Obj_free( &A_flame );
  FLA_Obj_free( &A_lapack );

  FLA_Obj_free( &C );

  FLA_Finalize_safe( init_result );     

  return 0;
}
Ejemplo n.º 10
0
// Runs the Apply_Q_UT test experiment for one problem size and one
// parameter combination.
//
// A random matrix A is factored (QR for column-wise storage, LQ for row-wise
// storage), the resulting Householder transforms are applied to an identity
// matrix B n_repeats times, and the fastest repeat is used for *perf. The
// 1-norm of eye, after the Gemm that combines B * B' with the identity held
// in eye, is stored in *residual.
//
//   params    - source of the FLASH blocksize (b_flash) and the flat
//               algorithmic blocksize (b_alg_flat).
//   var       - not referenced by this experiment.
//   sc_str    - storage characters handed to libfla_test_obj_create() for
//               A ([0]), T ([1]) and B ([2]).
//   datatype  - datatype of the test matrices.
//   p_cur     - current problem size.
//   pci       - row of pc_str whose four characters select side, trans,
//               direct and storev.
//   n_repeats - number of timed repetitions.
//   impl      - FLA_TEST_HIER_FRONT_END selects the FLASH path; any other
//               value takes the flat path.
//   perf      - out: performance of the best repeat.
//   residual  - out: norm of the orthogonality check.
void libfla_test_apqut_experiment( test_params_t params,
                                   unsigned int  var,
                                   char*         sc_str,
                                   FLA_Datatype  datatype,
                                   unsigned int  p_cur,
                                   unsigned int  pci,
                                   unsigned int  n_repeats,
                                   signed int    impl,
                                   double*       perf,
                                   double*       residual )
{
	dim_t        b_flash    = params.b_flash;
	dim_t        b_alg_flat = params.b_alg_flat;
	double       time_min   = 1e9;
	double       elapsed;
	double       flops;
	unsigned int rep;
	unsigned int m, n;
	unsigned int min_m_n;
	unsigned int b_dim;
	signed int   m_input;
	signed int   n_input;
	int          use_hier   = ( impl == FLA_TEST_HIER_FRONT_END );
	int          colwise;
	FLA_Side     side;
	FLA_Trans    trans;
	FLA_Direct   direct;
	FLA_Store    storev;
	FLA_Obj      A, T, W, B, eye, norm;
	FLA_Obj      B_save;
	FLA_Obj      A_test, T_test, W_test, B_test;

	// Decode the four parameter characters into libflame constants.
	FLA_Param_map_char_to_flame_side( &pc_str[pci][0], &side );
	FLA_Param_map_char_to_flame_trans( &pc_str[pci][1], &trans );
	FLA_Param_map_char_to_flame_direct( &pc_str[pci][2], &direct );
	FLA_Param_map_char_to_flame_storev( &pc_str[pci][3], &storev );

	colwise = ( storev == FLA_COLUMNWISE );

	// Dimension specs. Both storage directions currently use -1 for both
	// dimensions, which makes m and n equal to p_cur.
	m_input = -1;
	n_input = -1;

	// A negative spec scales p_cur by its magnitude; otherwise p_cur is used
	// directly.
	m = ( m_input < 0 ) ? p_cur * abs(m_input) : p_cur;
	n = ( n_input < 0 ) ? p_cur * abs(n_input) : p_cur;

	min_m_n = min( m, n );

	// B is square; its order follows the side of A the transforms act on
	// (m for column-wise storage, n for row-wise storage).
	b_dim = colwise ? m : n;

	// Allocate the flat operands.
	libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[0], m, n, &A );
	libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], b_alg_flat, min_m_n, &T );
	libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[2], b_dim, b_dim, &B );

	FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, B, &eye );

	FLA_Apply_Q_UT_create_workspace( T, B, &W );

	// Real 1x1 object that receives the norm.
	FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );

	// Fill the operands: A is random, B and eye start out as identity.
	FLA_Random_matrix( A );
	FLA_Set_to_identity( B );
	FLA_Set_to_identity( eye );

	// Keep a pristine copy of B so every repeat starts from the same input.
	FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, B, &B_save );

	// Pick the objects the experiment runs on and factor A.
	if ( use_hier )
	{
		// FLASH front-end: build hierarchical counterparts first.
		if ( colwise )
			FLASH_QR_UT_create_hier_matrices( A, 1, &b_flash, &A_test, &T_test );
		else
			FLASH_LQ_UT_create_hier_matrices( A, 1, &b_flash, &A_test, &T_test );
		FLASH_Obj_create_hier_copy_of_flat( B, 1, &b_flash, &B_test );
		FLASH_Apply_Q_UT_create_workspace( T_test, B_test, &W_test );

		if ( colwise ) FLASH_QR_UT( A_test, T_test );
		else           FLASH_LQ_UT( A_test, T_test );
	}
	else
	{
		// Flat front-end: operate on the flat objects themselves.
		A_test = A;
		T_test = T;
		W_test = W;
		B_test = B;

		if ( colwise ) FLA_QR_UT( A_test, T_test );
		else           FLA_LQ_UT( A_test, T_test );
	}

	// Timed repeats; only the fastest one is kept.
	for ( rep = 0; rep < n_repeats; ++rep )
	{
		// Restore the input before each run.
		if ( use_hier )
			FLASH_Obj_hierarchify( B_save, B_test );
		else
			FLA_Copy_external( B_save, B_test );

		elapsed = FLA_Clock();

		libfla_test_apqut_impl( impl, side, trans, direct, storev,
		                        A_test, T_test, W_test, B_test );

		elapsed  = FLA_Clock() - elapsed;
		time_min = min( time_min, elapsed );
	}

	// Bring the hierarchical result back into the flat B when needed, then
	// form B * B' against the identity in eye; the result should be near zero.
	if ( use_hier )
		FLASH_Obj_flatten( B_test, B );

	FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
	                   FLA_ONE, B, B, FLA_MINUS_ONE, eye );

	// The hierarchical objects are no longer needed.
	if ( use_hier )
	{
		FLASH_Obj_free( &A_test );
		FLASH_Obj_free( &T_test );
		FLASH_Obj_free( &W_test );
		FLASH_Obj_free( &B_test );
	}

	// Residual: 1-norm of the orthogonality defect stored in eye.
	FLA_Norm1( eye, norm );
	FLA_Obj_extract_real_scalar( norm, residual );

	// Performance of the best repeat; complex data counts four times.
	flops = (  4.0 *       m * min_m_n * n -
	           2.0 * min_m_n * min_m_n * n );
	*perf = flops / time_min / FLOPS_PER_UNIT_PERF;
	if ( FLA_Obj_is_complex( A ) ) *perf *= 4.0;

	// Release the saved copy of B.
	FLA_Obj_free( &B_save );

	// Release the flat operands.
	FLA_Obj_free( &A );
	FLA_Obj_free( &T );
	FLA_Obj_free( &W );
	FLA_Obj_free( &B );
	FLA_Obj_free( &eye );
	FLA_Obj_free( &norm );
}
Ejemplo n.º 11
0
// Runs the Eig_gest test experiment for one problem size and one parameter
// combination.
//
// A and B are built from random SPD matrices (B is additionally Cholesky
// factored), the operation under test is timed n_repeats times starting
// from saved copies, and the result is then undone with two triangular
// solves (no-inverse case) or two triangular multiplies (otherwise) so it
// can be compared against the saved A.
//
//   params    - source of b_flash and b_alg_flat.
//   var       - variant number; forwarded to the control-tree setup.
//   sc_str    - storage characters for A ([0]), Y ([1]) and B ([2]).
//   datatype  - datatype of the test matrices.
//   p_cur     - current problem size.
//   pci       - row of pc_str whose first two characters select inv and uplo.
//   n_repeats - number of timed repetitions.
//   impl      - implementation selector (FLASH front-end, flat variants, ...).
//   perf      - out: performance of the best repeat.
//   residual  - out: 1-norm of (recovered A - saved A).
//
// For the no-inverse case with variant 3 of the flat unblocked, optimized
// or blocked implementations, nothing is run and both outputs are set to 0.
void libfla_test_eig_gest_experiment( test_params_t params,
                                      unsigned int  var,
                                      char*         sc_str,
                                      FLA_Datatype  datatype,
                                      unsigned int  p_cur,
                                      unsigned int  pci,
                                      unsigned int  n_repeats,
                                      signed int    impl,
                                      double*       perf,
                                      double*       residual )
{
	dim_t        b_flash    = params.b_flash;
	dim_t        b_alg_flat = params.b_alg_flat;
	double       time_min   = 1e9;
	double       elapsed;
	unsigned int rep;
	unsigned int m;
	signed int   m_input    = -1;
	int          use_hier   = ( impl == FLA_TEST_HIER_FRONT_END );
	int          is_variant = ( impl == FLA_TEST_FLAT_UNB_VAR ||
	                            impl == FLA_TEST_FLAT_OPT_VAR ||
	                            impl == FLA_TEST_FLAT_BLK_VAR );
	FLA_Uplo     inv;
	FLA_Uplo     uplo;
	FLA_Trans    trans_left, trans_right;
	FLA_Obj      A, B, Y, norm;
	FLA_Obj      A_save, B_save;
	FLA_Obj      A_test, B_test, Y_test;

	// A negative spec divides p_cur by its magnitude; otherwise p_cur is
	// used directly.
	m = ( m_input < 0 ) ? p_cur / abs(m_input) : p_cur;

	// Decode the parameter characters into libflame constants.
	FLA_Param_map_char_to_flame_inv( &pc_str[pci][0], &inv );
	FLA_Param_map_char_to_flame_uplo( &pc_str[pci][1], &uplo );

	// This combination is skipped: report zeros and leave before anything
	// is allocated.
	if ( inv == FLA_NO_INVERSE && is_variant && var == 3 )
	{
		*perf     = 0.0;
		*residual = 0.0;
		return;
	}

	// Allocate the flat operands.
	libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[0], m, m, &A );
	libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], m, m, &Y );
	libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[2], m, m, &B );

	// A: random SPD, scaled by two in the uplo triangle, then made Hermitian.
	FLA_Random_spd_matrix( uplo, A );
	FLA_Scalr( uplo, FLA_TWO, A );
	FLA_Hermitianize( uplo, A );

	// B: random SPD, scaled by two in the uplo triangle, then factored.
	FLA_Random_spd_matrix( uplo, B );
	FLA_Scalr( uplo, FLA_TWO, B );
	FLA_Chol( uplo, B );

	// Pristine copies used to reset the inputs and to check the result.
	FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &A_save );
	FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, B, &B_save );

	// Real 1x1 object that receives the norm.
	FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );

	// Choose the objects the experiment operates on.
	if ( use_hier )
	{
		FLASH_Obj_create_hier_copy_of_flat( A, 1, &b_flash, &A_test );
		FLASH_Obj_create_hier_copy_of_flat( Y, 1, &b_flash, &Y_test );
		FLASH_Obj_create_hier_copy_of_flat( B, 1, &b_flash, &B_test );
	}
	else
	{
		A_test = A;
		Y_test = Y;
		B_test = B;
	}

	// The individual variants need a control tree.
	if ( is_variant )
		libfla_test_eig_gest_cntl_create( var, b_alg_flat );

	// Timed repeats; only the fastest one is kept.
	for ( rep = 0; rep < n_repeats; ++rep )
	{
		// Reset both inputs before each run.
		if ( use_hier )
		{
			FLASH_Obj_hierarchify( A_save, A_test );
			FLASH_Obj_hierarchify( B_save, B_test );
		}
		else
		{
			FLA_Copy_external( A_save, A_test );
			FLA_Copy_external( B_save, B_test );
		}

		elapsed = FLA_Clock();

		libfla_test_eig_gest_impl( impl, inv, uplo, A_test, Y_test, B_test );

		elapsed  = FLA_Clock() - elapsed;
		time_min = min( time_min, elapsed );
	}

	// Select which side of the undo step carries the conjugate-transpose.
	if ( ( inv == FLA_NO_INVERSE && uplo == FLA_LOWER_TRIANGULAR ) ||
	     ( inv == FLA_INVERSE    && uplo == FLA_UPPER_TRIANGULAR ) )
	{
		trans_left  = FLA_CONJ_TRANSPOSE;
		trans_right = FLA_NO_TRANSPOSE;
	}
	else
	{
		trans_left  = FLA_NO_TRANSPOSE;
		trans_right = FLA_CONJ_TRANSPOSE;
	}

	// Undo the operation so the outcome can be compared with the saved A.
	if ( use_hier )
	{
		FLASH_Hermitianize( uplo, A_test );

		if ( inv == FLA_NO_INVERSE )
		{
			FLASH_Trsm( FLA_LEFT, uplo, trans_left, FLA_NONUNIT_DIAG,
			            FLA_ONE, B_test, A_test );
			FLASH_Trsm( FLA_RIGHT, uplo, trans_right, FLA_NONUNIT_DIAG,
			            FLA_ONE, B_test, A_test );
		}
		else
		{
			FLASH_Trmm( FLA_LEFT, uplo, trans_left, FLA_NONUNIT_DIAG,
			            FLA_ONE, B_test, A_test );
			FLASH_Trmm( FLA_RIGHT, uplo, trans_right, FLA_NONUNIT_DIAG,
			            FLA_ONE, B_test, A_test );
		}

		// Copy the hierarchical result back into the flat A.
		FLASH_Obj_flatten( A_test, A );
	}
	else
	{
		FLA_Hermitianize( uplo, A_test );

		if ( inv == FLA_NO_INVERSE )
		{
			FLA_Trsm( FLA_LEFT, uplo, trans_left, FLA_NONUNIT_DIAG,
			          FLA_ONE, B_test, A_test );
			FLA_Trsm( FLA_RIGHT, uplo, trans_right, FLA_NONUNIT_DIAG,
			          FLA_ONE, B_test, A_test );
		}
		else
		{
			FLA_Trmm( FLA_LEFT, uplo, trans_left, FLA_NONUNIT_DIAG,
			          FLA_ONE, B_test, A_test );
			FLA_Trmm( FLA_RIGHT, uplo, trans_right, FLA_NONUNIT_DIAG,
			          FLA_ONE, B_test, A_test );
		}
	}

	// The hierarchical objects are no longer needed.
	if ( use_hier )
	{
		FLASH_Obj_free( &A_test );
		FLASH_Obj_free( &Y_test );
		FLASH_Obj_free( &B_test );
	}

	// Neither is the control tree.
	if ( is_variant )
		libfla_test_eig_gest_cntl_free();

	// Performance of the best repeat; complex data counts four times.
	*perf = 1.0 * m * m * m / time_min / FLOPS_PER_UNIT_PERF;
	if ( FLA_Obj_is_complex( A ) ) *perf *= 4.0;

	// Residual: 1-norm of (A - A_save).
	FLA_Axpy_external( FLA_MINUS_ONE, A_save, A );
	FLA_Norm1( A, norm );
	FLA_Obj_extract_real_scalar( norm, residual );

	// Release the supporting objects.
	FLA_Obj_free( &norm );
	FLA_Obj_free( &A_save );
	FLA_Obj_free( &B_save );

	// Release the flat operands.
	FLA_Obj_free( &A );
	FLA_Obj_free( &Y );
	FLA_Obj_free( &B );
}
Ejemplo n.º 12
0
// Times the FLASH incremental-pivoting LU factorization of A and checks it
// by solving A x = b.
//
//   pivot_combo - only the value 0 triggers a factorization.
//   type        - FLA_ALG_FRONT_OPT0 or FLA_ALG_FRONT_OPT1 selects the
//                 routine; anything else prints "trouble".
//   nrepeats    - number of timed repetitions.
//   m, n        - dimensions used in the flop count.
//   nb_alg      - algorithmic blocksize handed to the hierarchy builder.
//   nb_flash    - FLASH blocksize.
//   A, b        - flat system matrix and right-hand side; b is used as
//                 scratch for the residual and restored before returning.
//   p           - not referenced here.
//   x           - receives the flattened solution.
//   norm        - scalar object that receives the residual norm.
//   dtime       - out: best time over all repeats.
//   diff        - out: 2-norm of (A x - b).
//   gflops      - out: rate derived from the best time.
void time_LU(
              int pivot_combo, int type, int nrepeats, int m, int n, dim_t nb_alg, dim_t nb_flash,
              FLA_Obj A, FLA_Obj p, FLA_Obj x, FLA_Obj b, FLA_Obj norm,
              double *dtime, double *diff, double *gflops )
{
  int     rep;
  double  best = 1.0e9;
  double  elapsed;
  FLA_Obj AH_save, b_save;
  FLA_Obj AH, pH, bH, LH;

  // Hierarchical operands plus copies that let every repeat start fresh.
  FLASH_LU_incpiv_create_hier_matrices( A, 1, &nb_flash, nb_alg,
                                        &AH, &pH, &LH );
  FLASH_Obj_create_hier_copy_of_flat( b, 1, &nb_flash, &bH );
  FLASH_Obj_create_copy_of( FLA_NO_TRANSPOSE, AH, &AH_save );
  FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, b, &b_save );

  for ( rep = 0; rep < nrepeats; rep++ )
  {
    // Restore the unfactored matrix.
    FLASH_Copy( AH_save, AH );

    elapsed = FLA_Clock();

    if ( pivot_combo == 0 )
    {
      if ( type == FLA_ALG_FRONT_OPT0 )
        FLASH_LU_incpiv_noopt( AH, pH, LH );
      else if ( type == FLA_ALG_FRONT_OPT1 )
        FLASH_LU_incpiv_opt1( AH, pH, LH );
      else
        printf("trouble\n");
    }

    elapsed = FLA_Clock() - elapsed;
    best    = min( elapsed, best );
  }

  // Solve with the factors: forward substitution, then the upper
  // triangular solve; the solution is flattened into x.
  FLASH_FS_incpiv( AH, pH, LH, bH );
  FLASH_Trsv( FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG,
              AH, bH );
  FLASH_Obj_flatten( bH, x );

  // b := A x - b, whose 2-norm is the reported difference.
  FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_ONE,
                     A, x, FLA_MINUS_ONE, b );
  FLA_Nrm2_external( b, norm );
  FLA_Obj_extract_real_scalar( norm, diff );

  // Rate from the best repeat; complex data counts four times.
  *gflops = 2.0 / 3.0 * m * m * n /
            best / 1e9;
  if ( FLA_Obj_is_complex( A ) ) *gflops *= 4.0;

  *dtime = best;

  // Put the caller's right-hand side back.
  FLA_Copy( b_save, b );

  FLASH_Obj_free( &AH );
  FLASH_Obj_free( &pH );
  FLASH_Obj_free( &bH );
  FLASH_Obj_free( &LH );

  FLA_Obj_free( &b_save );
  FLASH_Obj_free( &AH_save );
}
Ejemplo n.º 13
0
// Runs the Hess_UT test experiment for one problem size.
//
// A random square matrix is reduced n_repeats times (always starting from a
// saved copy), the fastest repeat determines *perf, and the result is
// checked by comparing the triangularized factor against Qh * A_orig * Qh',
// where Qh is obtained by applying the stored transforms to an identity.
//
//   params    - source of the flat algorithmic blocksize (b_alg_flat).
//   var       - variant number; forwarded to the control-tree setup.
//   sc_str    - storage characters for A ([0]) and for T and W ([1]).
//   datatype  - datatype of the test matrices.
//   p_cur     - current problem size.
//   pci       - not referenced by this experiment.
//   n_repeats - number of timed repetitions.
//   impl      - implementation selector.
//   perf      - out: performance of the best repeat.
//   residual  - out: maximum element-wise difference of the check.
void libfla_test_hessut_experiment( test_params_t params,
                                    unsigned int  var,
                                    char*         sc_str,
                                    FLA_Datatype  datatype,
                                    unsigned int  p_cur,
                                    unsigned int  pci,
                                    unsigned int  n_repeats,
                                    signed int    impl,
                                    double*       perf,
                                    double*       residual )
{
	dim_t        b_alg_flat = params.b_alg_flat;
	dim_t        t_rows;
	double       time_min   = 1e9;
	double       elapsed;
	unsigned int rep;
	unsigned int m;
	signed int   m_input    = -1;
	int          is_variant = ( impl == FLA_TEST_FLAT_UNB_VAR ||
	                            impl == FLA_TEST_FLAT_OPT_VAR ||
	                            impl == FLA_TEST_FLAT_BLK_VAR );
	FLA_Obj      A, T, W, Qh, AQ, QhAQ, norm;
	FLA_Obj      AT, AB;
	FLA_Obj      QhT, QhB;
	FLA_Obj      A_save;

	// A negative spec scales p_cur by its magnitude; otherwise p_cur is used
	// directly.
	m = ( m_input < 0 ) ? p_cur * abs(m_input) : p_cur;

	// Allocate A.
	libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[0], m, m, &A );

	// T and W have b_alg_flat rows for the flat front-end and the blocked
	// variant, and m rows for everything else.
	if ( impl == FLA_TEST_FLAT_FRONT_END ||
	     impl == FLA_TEST_FLAT_BLK_VAR )
		t_rows = b_alg_flat;
	else
		t_rows = m;

	libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], t_rows, m, &T );
	libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], t_rows, m, &W );

	// Random input, with a pristine copy for resets and for the check.
	FLA_Random_matrix( A );
	FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &A_save );

	// Scratch matrices for the check, all shaped like A.
	FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &Qh );
	FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &AQ );
	FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &QhAQ );

	// Real 1x1 object; it is created and released here but the residual
	// below does not go through it.
	FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );

	// The individual variants need a control tree.
	if ( is_variant )
		libfla_test_hessut_cntl_create( var, b_alg_flat );

	// Timed repeats; only the fastest one is kept.
	for ( rep = 0; rep < n_repeats; ++rep )
	{
		// Restore the input before each run.
		FLA_Copy_external( A_save, A );

		elapsed = FLA_Clock();

		libfla_test_hessut_impl( impl, A, T );

		elapsed  = FLA_Clock() - elapsed;
		time_min = min( time_min, elapsed );
	}

	// The control tree is no longer needed.
	if ( is_variant )
		libfla_test_hessut_cntl_free();

	// Performance of the best repeat; complex data counts four times.
	*perf = ( 10.0 / 3.0 * m * m * m ) / time_min / FLOPS_PER_UNIT_PERF;
	if ( FLA_Obj_is_complex( A ) ) *perf *= 4.0;

	// Build Qh by applying the transforms stored below the first row of A
	// to the matching rows of an identity matrix.
	FLA_Set_to_identity( Qh );
	FLA_Part_2x1( Qh,   &QhT,
	                    &QhB,   1, FLA_TOP );
	FLA_Part_2x1( A,    &AT,
	                    &AB,    1, FLA_TOP );
	FLA_Apply_Q_UT( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_FORWARD, FLA_COLUMNWISE,
	                AB, T, W, QhB );

	// AQ := A_orig * Qh', then QhAQ := Qh * AQ.
	FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
	          FLA_ONE, A_save, Qh, FLA_ZERO, AQ );
	FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
	          FLA_ONE, Qh, AQ, FLA_ZERO, QhAQ );

	// Triangularize the part of A below its first row, then compare.
	FLA_Triangularize( FLA_UPPER_TRIANGULAR, FLA_NONUNIT_DIAG, AB );
	*residual = FLA_Max_elemwise_diff( A, QhAQ );

	// Release the supporting objects.
	FLA_Obj_free( &W );
	FLA_Obj_free( &Qh );
	FLA_Obj_free( &AQ );
	FLA_Obj_free( &QhAQ );
	FLA_Obj_free( &norm );
	FLA_Obj_free( &A_save );

	// Release the flat test matrices.
	FLA_Obj_free( &A );
	FLA_Obj_free( &T );
}
Ejemplo n.º 14
0
// Driver that reduces an m x n matrix to bidiagonal form with
// FLA_Bidiag_UT, forms the U and V factors, rebuilds the matrix as
// U * B * V and prints every intermediate object together with the maximum
// element-wise difference between the original and the rebuilt matrix.
//
// Usage: <prog> m n
//
// Returns -1 when the command line is malformed (wrong argument count or a
// negative dimension) and 0 otherwise. A zero dimension is accepted and
// exits immediately without doing any work.
int main( int argc, char** argv ) {
  FLA_Datatype datatype = TESTTYPE;
  FLA_Datatype realtype = REALTYPE;
  FLA_Obj      
    A, TU, TV, 
    A_copy, A_recovered,
    U, V, Vb, B, Be, d, e, 
    DU, DV;

  FLA_Obj     
    ATL, ATR,
    ABL, ABR, Ae;

  FLA_Uplo     uplo;
  dim_t        m, n, min_m_n;
  FLA_Error    init_result; 

  double       residual_A = 0.0;

  int          m_arg   = 0;
  int          n_arg   = 0;
  int          args_ok = ( argc == 3 );

  // Parse into signed ints first: storing a negative atoi() result straight
  // into the unsigned dim_t would wrap around to a huge dimension.
  if ( args_ok ) {
    m_arg   = atoi(argv[1]);
    n_arg   = atoi(argv[2]);
    args_ok = ( m_arg >= 0 && n_arg >= 0 );
  }

  if ( !args_ok ) {
    fprintf(stderr, "       \n");
    fprintf(stderr, "Usage: %s m n\n", argv[0]);
    fprintf(stderr, "       m : matrix length\n");
    fprintf(stderr, "       n : matrix width\n");
    fprintf(stderr, "       \n");
    return -1;
  }
  if ( m_arg == 0 || n_arg == 0 )
    return 0;

  m       = m_arg;
  n       = n_arg;
  min_m_n = min(m,n);

  FLA_Init_safe( &init_result );          

  // FLAME Bidiag setup
  FLA_Obj_create( datatype, m, n, 0, 0, &A );
  FLA_Bidiag_UT_create_T( A, &TU, &TV );

  // Rand A and create A_copy.
  FLA_Random_matrix( A ); 

  // Overwrite the first eight entries with fixed values. The entries are
  // written as scomplex, so this is only done when A is complex and holds
  // at least eight elements; otherwise the writes would run past the end of
  // the buffer and the random contents are kept instead.
  // NOTE(review): assumes TESTTYPE is the single-precision complex type when
  // it is complex -- confirm against the build configuration.
  if ( FLA_Obj_is_complex( A ) && m * n >= 8 ) {
    scomplex *buff_A = FLA_Obj_buffer_at_view( A );
    buff_A[0].real = 4.4011e-01; buff_A[0].imag = -4.0150e-09; buff_A[2].real = -2.2385e-01; buff_A[2].imag = -1.5546e-01; buff_A[4].real = -6.3461e-02; buff_A[4].imag = 2.7892e-01; buff_A[6].real = -1.3197e-01; buff_A[6].imag = 5.0888e-01;  
    buff_A[1].real = 3.3352e-01; buff_A[1].imag = -6.6346e-02; buff_A[3].real = -1.9307e-01; buff_A[3].imag = -8.4066e-02; buff_A[5].real = -6.0446e-03; buff_A[5].imag = 2.2094e-01; buff_A[7].real = -2.3299e-02; buff_A[7].imag = 4.0553e-01;
  }

  // Ae is a view of A whose main diagonal is the off-diagonal of the
  // bidiagonal factor: above the diagonal when m >= n, below it otherwise.
  if ( m >= n ) {
    uplo = FLA_UPPER_TRIANGULAR;
    FLA_Part_2x2( A, &ATL, &ATR,
                     &ABL, &ABR, min_m_n - 1, 1, FLA_TL );
    Ae = ATR; 
  } else {
    uplo = FLA_LOWER_TRIANGULAR;
    FLA_Part_2x2( A, &ATL, &ATR,
                     &ABL, &ABR, 1, min_m_n - 1, FLA_TL );
    Ae = ABL;
  }

  FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &A_copy );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_recovered );

  // Bidiag test
  {
    FLA_Obj      norm;
    FLA_Bool     apply_scale;

    FLA_Obj_create( realtype, 1,1, 0,0, &norm );

    // Scale A down before the reduction when its largest entry exceeds the
    // overflow threshold, and scale the diagonals back afterwards.
    FLA_Max_abs_value( A, norm );
    apply_scale = FLA_Obj_gt( norm, FLA_OVERFLOW_SQUARE_THRES ); 

    if ( apply_scale ) FLA_Scal( FLA_SAFE_MIN, A );
    FLA_Bidiag_UT( A, TU, TV );
    if ( apply_scale ) FLA_Bidiag_UT_scale_diagonals( FLA_SAFE_INV_MIN, A ); 

    FLA_Obj_free( &norm );
  }

  // Orthonormal basis U, V.
  FLA_Obj_create( datatype, m, min_m_n, 0, 0, &U ); FLA_Set( FLA_ZERO, U );
  FLA_Obj_create( datatype, min_m_n, n, 0, 0, &V ); FLA_Set( FLA_ZERO, V );

  FLA_Bidiag_UT_form_U_ext( uplo, A, TU, FLA_NO_TRANSPOSE,   U );
  FLA_Bidiag_UT_form_V_ext( uplo, A, TV, FLA_CONJ_TRANSPOSE, V ); 

  // For complex data, make the bidiagonal factor real and fold the
  // resulting diagonal scalings rL and rR into U and V.
  if ( FLA_Obj_is_complex( A ) ){
    FLA_Obj rL, rR;
    
    FLA_Obj_create( datatype, min_m_n, 1, 0, 0, &rL );
    FLA_Obj_create( datatype, min_m_n, 1, 0, 0, &rR );

    FLA_Obj_fshow( stdout, " - Factor no realified - ", A, "% 6.4e", "------");
    FLA_Bidiag_UT_realify( A, rL, rR );
    FLA_Obj_fshow( stdout, " - Factor    realified - ", A, "% 6.4e", "------");

    FLA_Obj_fshow( stdout, " - rL - ", rL, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - rR - ", rR, "% 6.4e", "------");

    FLA_Apply_diag_matrix( FLA_RIGHT, FLA_CONJUGATE, rL, U );
    FLA_Apply_diag_matrix( FLA_LEFT,  FLA_CONJUGATE, rR, V );

    FLA_Obj_free( &rL );
    FLA_Obj_free( &rR );
  }

  // U^H U
  FLA_Obj_create( datatype, min_m_n, min_m_n, 0, 0, &DU );
  FLA_Gemm_external( FLA_CONJ_TRANSPOSE, FLA_NO_TRANSPOSE, 
                     FLA_ONE, U, U, FLA_ZERO, DU );

  // V V^H
  FLA_Obj_create( datatype, min_m_n, min_m_n, 0, 0, &DV );
  FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, 
                     FLA_ONE, V, V, FLA_ZERO, DV );
  
  // Recover the matrix
  FLA_Obj_create( datatype, min_m_n, min_m_n, 0, 0, &B );
  FLA_Set( FLA_ZERO, B );

  // Set B: copy the main diagonal of the factor.
  FLA_Obj_create( datatype, min_m_n, 1, 0, 0, &d );  
  FLA_Set_diagonal_vector( A, d );
  FLA_Set_diagonal_matrix( d, B );
  FLA_Obj_free( &d );

  // Copy the off-diagonal, which only exists when min_m_n > 1.
  if ( min_m_n > 1 ) {
    FLA_Obj_create( datatype, min_m_n - 1 , 1, 0, 0, &e );  
    FLA_Set_diagonal_vector( Ae, e );
    if ( uplo == FLA_UPPER_TRIANGULAR ) {
      FLA_Part_2x2( B, &ATL, &ATR,
                    &ABL, &ABR, min_m_n - 1, 1, FLA_TL );
      Be = ATR;
    } else {
      FLA_Part_2x2( B, &ATL, &ATR,
                    &ABL, &ABR, 1, min_m_n - 1, FLA_TL );
      Be = ABL;
    }
    FLA_Set_diagonal_matrix( e, Be );
    FLA_Obj_free( &e );
  }

  // Vb := B (V^H)
  FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, V, &Vb );
  FLA_Trmm_external( FLA_LEFT, uplo, FLA_NO_TRANSPOSE,
                     FLA_NONUNIT_DIAG, FLA_ONE, B, Vb );

  // A := U Vb
  FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
                     FLA_ONE, U, Vb, FLA_ZERO, A_recovered );

  residual_A    = FLA_Max_elemwise_diff( A_copy, A_recovered );

  if (1) {
    FLA_Obj_fshow( stdout, " - Given - ", A_copy, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - Factor - ", A, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - TU - ", TU, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - TV - ", TV, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - B - ", B, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - U - ", U, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - V - ", V, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - Vb - ", Vb, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - U'U - ", DU,  "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - VV' - ", DV,  "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - Recovered A - ", A_recovered, "% 6.4e", "------");
    // Cast so the arguments match %lu whatever dim_t is defined as.
    fprintf( stdout, "lapack2flame: %lu x %lu: ", (unsigned long) m, (unsigned long) n);
    fprintf( stdout, "recovery A = %12.10e\n\n", residual_A ) ;
  }
  
  FLA_Obj_free( &A );
  FLA_Obj_free( &TU );
  FLA_Obj_free( &TV );

  FLA_Obj_free( &B );

  FLA_Obj_free( &U );
  FLA_Obj_free( &V );
  FLA_Obj_free( &Vb );

  FLA_Obj_free( &DU );
  FLA_Obj_free( &DV );

  FLA_Obj_free( &A_copy );
  FLA_Obj_free( &A_recovered );


  FLA_Finalize_safe( init_result );     

  return 0;
}