Ejemplo n.º 1
0
FLA_Error FLASH_LU_piv( FLA_Obj A, FLA_Obj p )
{
  FLA_Error r_val = FLA_SUCCESS;

  // Check parameters.
  if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING )
    FLA_LU_piv_check( A, p );

  // *** The current LU_piv algorithm implemented assumes that
  // the matrix has a hierarchical depth of 1. We check for that here, because
  // we anticipate that we'll use a more general algorithm in the future, and
  // we don't want to forget to remove the constraint. ***
  if ( FLASH_Obj_depth( A ) != 1 )
  {
    FLA_Print_message( "FLASH_LU_piv() currently only supports matrices of depth 1",
                       __FILE__, __LINE__ );
    FLA_Abort();
  }

  // Begin a parallel region.
  FLASH_Queue_begin();

  // Invoke FLA_LU_piv_internal() with large control tree.
  FLA_LU_piv_internal( A, p, flash_lu_piv_cntl );

  // End the parallel region.
  FLASH_Queue_end();

  // Check for singularity.
  if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING )
    r_val = FLASH_LU_find_zero_on_diagonal( A );

  return r_val;
}
Ejemplo n.º 2
0
FLA_Error FLA_Check_error_code_helper( int code, char* file, int line )
{
  if ( code == FLA_SUCCESS )
    return code;

  //if ( /* fatal error checking enabled */ )
  if ( TRUE )
  {
    if ( FLA_ERROR_CODE_MAX <= code && code <= FLA_ERROR_CODE_MIN )
    {
      FLA_Print_message( FLA_Error_string_for_code( code ),
                         file, line );
      FLA_Abort();
    }
    else
    {
      FLA_Print_message( FLA_Error_string_for_code( FLA_UNDEFINED_ERROR_CODE ),
                         file, line );
      FLA_Abort();
    }
  }

  return code;
}
Ejemplo n.º 3
0
void F77_fla_obj_show( char* prefix, int* m, int* n, void* buffer, int* ldim )
{
    FLA_Error    init_result;
    FLA_Datatype datatype;
    FLA_Obj      A;

    switch( *prefix ) {
    case 'i':
    case 'I':
        datatype = FLA_INT;
        break;
    case 's':
    case 'S':
        datatype = FLA_FLOAT;
        break;
    case 'd':
    case 'D':
        datatype = FLA_DOUBLE;
        break;
    case 'c':
    case 'C':
        datatype = FLA_COMPLEX;
        break;
    case 'z':
    case 'Z':
        datatype = FLA_DOUBLE_COMPLEX;
        break;
    default:
        fprintf(stderr, "Invalid prefix %c, where i,s,d,c,z are allowed.\n", *prefix);
        FLA_Abort();
    }

    FLA_Init_safe( &init_result );

    FLA_Obj_create_without_buffer( datatype, *m, *n, &A );
    FLA_Obj_attach_buffer( buffer, 1, *ldim, &A );
    FLA_Obj_fshow( stdout,
                   "= F77_FLA_OBJ_SHOW =", A, "% 6.4e",
                   "=-=-=-=-=-=-=-=-=-=-\n");
    FLA_Obj_free_without_buffer( &A );

    FLA_Finalize_safe( init_result );
}
Ejemplo n.º 4
0
FLA_Error FLASH_Apply_Q_UT( FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B )
{
  FLA_Error r_val;
  dim_t     b_alg;

  // Check parameters.
  if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING )
    FLA_Apply_Q_UT_check( side, trans, direct, storev, A, T, W, B );

  // Inspect the length of TTL to get the blocksize used by the QR/LQ
  // factorization, which will be our inner blocksize for Apply_Q_UT.
  b_alg = FLASH_Obj_scalar_length_tl( T );

  // The traditional (non-incremental) Apply_Q_UT algorithm-by-blocks
  // requires that the algorithmic blocksize be equal to the storage
  // blocksize.
  if ( b_alg != FLASH_Obj_scalar_width_tl( T ) )
  {
    FLA_Print_message( "FLASH_Apply_Q_UT() requires that b_alg == b_store",
                       __FILE__, __LINE__ );
    FLA_Abort();
  }

  // Adjust the blocksize of the control tree node for the flat subproblem.
  if ( FLA_Cntl_blocksize( fla_apqut_cntl_leaf ) != NULL )
    FLA_Blocksize_set( FLA_Cntl_blocksize( fla_apqut_cntl_leaf ),
                       b_alg, b_alg, b_alg, b_alg );

  // Begin a parallel region.
  FLASH_Queue_begin();
  
  // Invoke FLA_Apply_Q_UT_internal() with the standard control tree.
  r_val = FLA_Apply_Q_UT_internal( side, trans, direct, storev, A, T, W, B,
                                   flash_apqut_cntl_blas );

  // End the parallel region.
  FLASH_Queue_end();

  return r_val;
}
Ejemplo n.º 5
0
int FLA_task_determine_matrix_size( FLA_Obj A, FLA_Quadrant from )
{
	int r_val = 0;
	
	// Determine the size of the matrix dimension along which we are moving.
	switch( from )
	{
		case FLA_TOP:
		case FLA_BOTTOM:
		{
			r_val = FLA_Obj_length( A );
			break;
		}
		case FLA_LEFT:
		case FLA_RIGHT:
		{
			r_val = FLA_Obj_width( A );
			break;
		}
		case FLA_TL:
		case FLA_TR:
		case FLA_BL:
		case FLA_BR:
		{
			// If A happens to be the full object, we need to use min_dim() here
			// because the matrix might be rectangular. If A is the processed
			// partition, it is very probably square, and min_dim() doesn't hurt.
			r_val = FLA_Obj_min_dim( A );
			break;
		}
		default:
			FLA_Print_message( "Unexpected default in switch statement!", __FILE__, __LINE__ );
			FLA_Abort();
	}

	return r_val;
}
Ejemplo n.º 6
0
int FLA_Task_compute_blocksize( int tag, FLA_Obj A, FLA_Obj A_proc, FLA_Quadrant from )
{
	int n_threads = FLA_Queue_get_num_threads();
	int A_size, A_proc_size;
	int n_part;
	int b;
	
	// Determine the sizes of the matrix partitions.
	A_size      = FLA_task_determine_matrix_size( A, from );
	A_proc_size = FLA_task_determine_matrix_size( A_proc, from );
	
	// Determine the raw blocksize value.
	n_part      = FLA_task_get_num_partitions( n_threads, tag );
	
	// Determine the blocksize based on the sign of the value from
	// _get_num_partitions().
	if( n_part > 0 )
	{
		b = FLA_task_determine_absolute_blocksize( A_size,
	                                               A_proc_size,
	                                               n_part );
	}
	else if( n_part < 0 )
	{
	    b = FLA_task_determine_relative_blocksize( A_size,
	                                               A_proc_size,
	                                               abs(n_part) );
	}
	else
	{
		FLA_Print_message( "Detected blocksize of 0!", __FILE__, __LINE__ );
        FLA_Abort();
	}

	return b;
}
Ejemplo n.º 7
0
FLA_Error FLA_Tevd_v_opz_var2( int       m_A,
                               int       m_U,
                               int       n_G,
                               int       n_iter_max,
                               double*   buff_d, int inc_d, 
                               double*   buff_e, int inc_e,
                               dcomplex* buff_G, int rs_G, int cs_G,
                               double*   buff_R, int rs_R, int cs_R,
                               dcomplex* buff_W, int rs_W, int cs_W,
                               dcomplex* buff_U, int rs_U, int cs_U,
                               int       b_alg )
{
	dcomplex  one   = bl1_z1();
	double    rone  = bl1_d1();
	double    rzero = bl1_d0();

	dcomplex* G;
	double*   d1;
	double*   e1;
	int       r_val;
	int       done;
	int       m_G_sweep_max;
	int       ij_begin;
	int       ijTL, ijBR;
	int       m_A11;
	int       n_iter_perf;
	int       n_U_apply;
	int       total_deflations;
	int       n_deflations;
	int       n_iter_prev;
	int       n_iter_perf_sweep_max;

	// Initialize our completion flag.
	done = FALSE;

	// Initialize a counter that holds the maximum number of rows of G
	// that we would need to initialize for the next sweep.
	m_G_sweep_max = m_A - 1;

	// Initialize a counter for the total number of iterations performed.
	n_iter_prev = 0;

	// Initialize R to identity.
	bl1_dident( m_A,
	            buff_R, rs_R, cs_R );

	// Iterate until the matrix has completely deflated.
	for ( total_deflations = 0; done != TRUE; )
	{

		// Initialize G to contain only identity rotations.
		bl1_zsetm( m_G_sweep_max,
		           n_G,
		           &one,
		           buff_G, rs_G, cs_G );

		// Keep track of the maximum number of iterations performed in the
		// current sweep. This is used when applying the sweep's Givens
		// rotations.
		n_iter_perf_sweep_max = 0;

		// Perform a sweep: Move through the matrix and perform a tridiagonal
		// EVD on each non-zero submatrix that is encountered. During the
		// first time through, ijTL will be 0 and ijBR will be m_A - 1.
		for ( ij_begin = 0; ij_begin < m_A;  )
		{

#ifdef PRINTF
if ( ij_begin == 0 )
printf( "FLA_Tevd_v_opz_var2: beginning new sweep (ij_begin = %d)\n", ij_begin );
#endif

			// Search for the first submatrix along the diagonal that is
			// bounded by zeroes (or endpoints of the matrix). If no
			// submatrix is found (ie: if the entire subdiagonal is zero
			// then FLA_FAILURE is returned. This function also inspects
			// subdiagonal elements for proximity to zero. If a given
			// element is close enough to zero, then it is deemed
			// converged and manually set to zero.
			r_val = FLA_Tevd_find_submatrix_opd( m_A,
			                                     ij_begin,
			                                     buff_d, inc_d,
			                                     buff_e, inc_e,
			                                     &ijTL,
			                                     &ijBR );

			// Verify that a submatrix was found. If one was not found,
			// then we are done with the current sweep. Furthermore, if
			// a submatrix was not found AND we began our search at the
			// beginning of the matrix (ie: ij_begin == 0), then the
			// matrix has completely deflated and so we are done with
			// Francis step iteration.
			if ( r_val == FLA_FAILURE )
			{
				if ( ij_begin == 0 )
				{
#ifdef PRINTF
printf( "FLA_Tevd_v_opz_var2: subdiagonal is completely zero.\n" );
printf( "FLA_Tevd_v_opz_var2: Francis iteration is done!\n" );
#endif
					done = TRUE;
				}

				// Break out of the current sweep so we can apply the last
				// remaining Givens rotations.
				break;
			}

			// If we got this far, then:
			//   (a) ijTL refers to the index of the first non-zero
			//       subdiagonal along the diagonal, and
			//   (b) ijBR refers to either:
			//       - the first zero element that occurs after ijTL, or
			//       - the the last diagonal element.
			// Note that ijTL and ijBR also correspond to the first and
			// last diagonal elements of the submatrix of interest. Thus,
			// we may compute the dimension of this submatrix as:
			m_A11 = ijBR - ijTL + 1;

#ifdef PRINTF
printf( "FLA_Tevd_v_opz_var2: ij_begin = %d\n", ij_begin );
printf( "FLA_Tevd_v_opz_var2: ijTL     = %d\n", ijTL );
printf( "FLA_Tevd_v_opz_var2: ijBR     = %d\n", ijBR );
printf( "FLA_Tevd_v_opz_var2: m_A11    = %d\n", m_A11 );
#endif

			// Adjust ij_begin, which gets us ready for the next subproblem, if
			// there is one.
			ij_begin = ijBR + 1;

			// Index to the submatrices upon which we will operate.
			d1 = buff_d + ijTL * inc_d;
			e1 = buff_e + ijTL * inc_e;
			G  = buff_G + ijTL * rs_G;

			// Search for a batch of eigenvalues, recursing on deflated
			// subproblems whenever a split occurs. Iteration continues
			// as long as
			//   (a) there is still matrix left to operate on, and
			//   (b) the number of iterations performed in this batch is
			//       less than n_G.
			// If/when either of the two above conditions fails to hold,
			// the function returns.
			n_deflations = FLA_Tevd_iteracc_v_opd_var1( m_A11,
			                                            n_G,
			                                            ijTL,
			                                            d1, inc_d,
			                                            e1, inc_e,
			                                            G,  rs_G, cs_G,
			                                            &n_iter_perf );

			// Record the number of deflations that we observed.
			total_deflations += n_deflations;

			// Update the maximum number of iterations performed in the
			// current sweep.
			n_iter_perf_sweep_max = max( n_iter_perf_sweep_max, n_iter_perf );

#ifdef PRINTF
printf( "FLA_Tevd_v_opz_var2: deflations observed       = %d\n", n_deflations );
printf( "FLA_Tevd_v_opz_var2: total deflations observed = %d\n", total_deflations );
printf( "FLA_Tevd_v_opz_var2: num iterations            = %d\n", n_iter_perf );
#endif

			// Store the most recent value of ijBR in m_G_sweep_max.
			// When the sweep is done, this value will contain the minimum
			// number of rows of G we can apply and safely include all
			// non-identity rotations that were computed during the
			// eigenvalue searches.
			m_G_sweep_max = ijBR;

			// Make sure we haven't exceeded our maximum iteration count.
			if ( n_iter_prev >= m_A * n_iter_max )
			{
#ifdef PRINTF
printf( "FLA_Tevd_v_opz_var2: reached maximum total number of iterations: %d\n", n_iter_prev );
#endif
				FLA_Abort();
				//return FLA_FAILURE;
			}
		}

		// The sweep is complete. Now we must apply the Givens rotations
		// that were accumulated during the sweep.


		// Recall that the number of columns of U to which we apply
		// rotations is one more than the number of rotations.
		n_U_apply = m_G_sweep_max + 1;

		// Apply the Givens rotations that were computed as part of
		// the previous batch of iterations.
		//FLA_Apply_G_rf_bld_var8b( n_iter_perf_sweep_max,
		//FLA_Apply_G_rf_bld_var5b( n_iter_perf_sweep_max,
		FLA_Apply_G_rf_bld_var3b( n_iter_perf_sweep_max,
		//FLA_Apply_G_rf_bld_var9b( n_iter_perf_sweep_max,
		//FLA_Apply_G_rf_bld_var6b( n_iter_perf_sweep_max,
		                          m_U,
		                          n_U_apply,
		                          n_iter_prev,
		                          buff_G, rs_G, cs_G,
		                          buff_R, rs_R, cs_R,
		                          b_alg );

#ifdef PRINTF
printf( "FLA_Tevd_v_opz_var2: applying %d sets of Givens rotations\n", n_iter_perf_sweep_max );
#endif

		// Increment the total number of iterations previously performed.
		n_iter_prev += n_iter_perf_sweep_max;
	}

	// Copy the contents of Q to temporary storage.
	bl1_zcopymt( BLIS1_NO_TRANSPOSE,
	             m_A,
	             m_A,
	             buff_U, rs_U, cs_U,
	             buff_W, rs_W, cs_W );


	// Multiply Q by R, overwriting U.
	bl1_dgemm( BLIS1_NO_TRANSPOSE,
	           BLIS1_NO_TRANSPOSE,
	           2*m_A,
	           m_A,
	           m_A,
	           &rone,
	           ( double* )buff_W, rs_W, 2*cs_W,
	                      buff_R, rs_R,   cs_R,
	           &rzero,
	           ( double* )buff_U, rs_U, 2*cs_U );

	return n_iter_prev;
}