示例#1
0
文件: ATL_sprk.c 项目: certik/vendor
void Mjoin(PATL,sprk)
   (const enum PACK_UPLO UA, const enum PACK_TRANS TA,
    const enum ATLAS_UPLO UC, const int CP,
    const int N, const int K, const SCALAR alpha,
    const TYPE *A, const int IA, const int JA, const int lda,
    const SCALAR beta, TYPE *C, const int IC, const int JC, const int ldc)
{
   const enum PACK_UPLO UC2 = ((CP) ? UC : PackGen);
   int j;
   #ifdef CacheEdge
      static const int CE_K = ((ATL_DivBySize(CacheEdge SHIFT)-(NBNB SHIFT)) /
                                           (NB*(NB+NB)))*NB;
   #else
      #define CE_K K
   #endif

   if ((!N) || ((SCALAR_IS_ZERO(alpha) || (!K)) && (SCALAR_IS_ONE(beta))))
      return;
   if (!K || SCALAR_IS_ZERO(alpha))
   {
      if (UC == CblasLower)
      {
         for (j=0; j != N; j++)
            Mjoin(PATL,scal)(N-j, beta, C+MindexP(UC2,IC+j,JC+j,ldc), 1);
      }
      else /* UC == CblasUpper */
      {
         for (j=0; j != N; j++)
            Mjoin(PATL,scal)(j+1, beta, C+MindexP(UC2,IC,JC+j,ldc), 1);
      }
      return;
   }
   Mjoin(PATL,sprk_rK)(UA, TA, UC, CP, N, K, CE_K, alpha, A, lda, beta, C, ldc);
}
示例#2
0
int umf_divcomplex
(
    double ar, double ai,	/* real and imaginary parts of a */
    double br, double bi,	/* real and imaginary parts of b */
    double *cr, double *ci	/* real and imaginary parts of c */
)
{
    double tr, ti, r, den ;
    if (SCALAR_ABS (br) >= SCALAR_ABS (bi))
    {
	r = bi / br ;
	den = br + r * bi ;
	tr = (ar + ai * r) / den ;
	ti = (ai - ar * r) / den ;
    }
    else
    {
	r = br / bi ;
	den = r * br + bi ;
	tr = (ar * r + ai) / den ;
	ti = (ai * r - ar) / den ;
    }
    *cr = tr ;
    *ci = ti ;
    return (SCALAR_IS_ZERO (den)) ;
}
示例#3
0
void Mjoin(Mjoin(Mjoin(PATL,syrk),UploNM),T)
   (const int N, const int K, const void *valpha, const void *A, const int lda,
    const void *vbeta, void *C, const int ldc)
{
   void *vc;
   TYPE *c;
   #ifdef TREAL
      const SCALAR alpha=*( (const SCALAR *)valpha );
      const SCALAR beta =*( (const SCALAR *)vbeta  );
      const SCALAR one=1.0, zero=0.0;
   #else
      #define alpha valpha
      const TYPE *beta=vbeta;
      const TYPE one[2]={1.0,0.0}, zero[2]={0.0,0.0};
   #endif

   if (K > SYRK_Xover)
   {
      vc = malloc(ATL_Cachelen+ATL_MulBySize(N)*N);
      ATL_assert(vc);
      c = ATL_AlignPtr(vc);
      CgemmTN(N, N, K, alpha, A, lda, A, lda, zero, c, N);
      if ( SCALAR_IS_ONE(beta) ) Mjoin(syr_put,_b1)(N, c, beta, C, ldc);
      else if ( SCALAR_IS_ZERO(beta) ) Mjoin(syr_put,_b0)(N, c, beta, C, ldc);
      #ifdef TCPLX
         else if ( SCALAR_IS_NONE(beta) )
            Mjoin(syr_put,_bn1)(N, c, beta, C, ldc);
         else if (beta[1] == *zero) Mjoin(syr_put,_bXi0)(N, c, beta, C, ldc);
      #endif
      else Mjoin(syr_put,_bX)(N, c, beta, C, ldc);
      free(vc);
   }
   else Mjoin(PATL,refsyrk)(Uplo_, AtlasTrans, N, K, alpha, A, lda,
                            beta, C, ldc);
}
示例#4
0
文件: ATL_gpmm.c 项目: apollos/atlas
void Mjoin(PATL,gpmm)
   (const enum PACK_UPLO UA, const enum PACK_TRANS TA,
    const enum PACK_UPLO UB, const enum PACK_TRANS TB, const enum PACK_UPLO UC,
    const int M, const int N, const int K, const SCALAR alpha,
    const TYPE *A, const int IA, const int JA, const int lda,
    const TYPE *B, const int IB, const int JB, const int ldb,
    const SCALAR beta, TYPE *C, const int IC, const int JC, const int ldc)
{
   int j;
   #ifdef CacheEdge
      static const int CE_K = ((ATL_DivBySize(CacheEdge)-(NBNB SHIFT)) /
                                           (NB*(NB+NB)))*NB;
   #else
      #define CE_K K
   #endif
   if (!M || !N) return;
   if (!K || SCALAR_IS_ZERO(alpha))
   {
      for (j=0; j != N; j++)
         Mjoin(PATL,scal)(M, beta, C+MindexP(UC,IC,JC+j,ldc), 1);
      return;
   }
/*
 * Packed gpmm not yet implemented for complex,
 * so die if not really a dense gemm
 */
   #ifdef TCPLX
      ATL_assert (UA == PackGen && UB == PackGen && UC == PackGen);
      Mjoin(PATL,gemm)(TA, TB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
   #else
   Mjoin(PATL,prankK)(UA, TA, UB, TB, M, N, K, CE_K, alpha,
          A+MindexP(UA,IA,JA,lda), Mpld(UA,JA,lda), B+MindexP(UB,IB,JB,ldb),
          Mpld(UB,JB,ldb), beta, UC, C+MindexP(UC,IB,JB,ldc), Mpld(UC,JC,ldc));
   #endif
}
示例#5
0
   int Mjoin(PATL,syr2kLT)
#endif
   (const int N, const int K, const void *valpha, const void *A, const int lda,
    const void *B, const int ldb, const void *vbeta, void *C, const int ldc)
{
   int i;
   void *vc=NULL;
   TYPE *c;
   #ifdef TREAL
      const SCALAR alpha=*( (const SCALAR *)valpha );
      const SCALAR beta =*( (const SCALAR *)vbeta  );
      const SCALAR one=1.0, zero=0.0;
   #else
      #define alpha valpha
      const TYPE *beta=vbeta;
      const TYPE one[2]={1.0,0.0}, zero[2]={0.0,0.0};
   #endif

   i = ATL_MulBySize(N)*N;
   if (i <= ATL_MaxMalloc) vc = malloc(ATL_Cachelen+i);
   if (vc == NULL) return(1);
   c = ATL_AlignPtr(vc);
   CgemmTN(N, N, K, alpha, A, lda, B, ldb, zero, c, N);
   if ( SCALAR_IS_ONE(beta) ) Mjoin(syr2k_put,_b1)(N, c, beta, C, ldc);
   else if ( SCALAR_IS_ZERO(beta) ) Mjoin(syr2k_put,_b0)(N, c, beta, C, ldc);
   #ifdef TCPLX
      else if (SCALAR_IS_NONE(beta)) Mjoin(syr2k_put,_bn1)(N, c, beta, C, ldc);
      else if (beta[1] == *zero) Mjoin(syr2k_put,_bXi0)(N, c, beta, C, ldc);
   #endif
   else Mjoin(syr2k_put,_bX)(N, c, beta, C, ldc);
   free(vc);
   return(0);
}
示例#6
0
void Cgemm(const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
           const int M, const int N, const int K, const SCALAR alpha,
           const TYPE *A, const int lda, const TYPE *B, const int ldb,
           const SCALAR beta, TYPE *C, const int ldc)
/*
 * Error checks have been done by interface routine
 */
{
   if (!M  ||  !N) return;  /* quick return */
   if ( SCALAR_IS_ZERO(alpha) || !K)
   {
      #ifdef TREAL
         if (beta == ATL_rzero) Mjoin(PATL,gezero)(M, N, C, ldc);
         else if (beta != ATL_rone) Mjoin(PATL,gescal_bX)(M, N, beta, C, ldc);
      #else
         if (beta[1] == ATL_rzero)
         {
            if (*beta == ATL_rzero) Mjoin(PATL,gezero)(M, N, C, ldc);
            else if (*beta != ATL_rone)
               Mjoin(PATL,gescal_bXi0)(M, N, beta, C, ldc);
         }
         else Mjoin(PATL,gescal_bX)(M, N, beta, C, ldc);
      #endif
      return;
   }
   if (TA == AtlasNoTrans)
   {
      if (TB == AtlasNoTrans)
         CgemmNN(M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
#ifdef TCPLX
      else if (TB == AtlasConjTrans)
         CgemmNC(M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
#endif
      else
         CgemmNT(M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
   }
#ifdef TCPLX
   else if (TA == AtlasConjTrans)
   {
      if (TB == AtlasNoTrans)
         CgemmCN(M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
      else if (TB == AtlasConjTrans)
         CgemmCC(M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
      else
         CgemmCT(M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
   }
#endif
   else
   {
      if (TB == AtlasNoTrans)
         CgemmTN(M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
#ifdef TCPLX
      else if (TB == AtlasConjTrans)
         CgemmTC(M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
#endif
      else
         CgemmTT(M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
   }
}
示例#7
0
void Mjoin(PATL,mvnk_Mlt16)
   (ATL_CINT M, ATL_CINT N, const SCALAR alpha, const TYPE *A, ATL_CINT lda,
    const TYPE *X, ATL_CINT incX, const SCALAR beta, TYPE *Y, ATL_CINT incY)
/*
 * y = alpha*A*x + beta*y
 */
{
#ifdef TREAL
   const static ATL_MVFUNC mvfunc[15] = {ATL_mvn_Meq1,
                                         ATL_mvn_Meq2,
                                         ATL_mvn_Meq3,
                                         ATL_mvn_Meq4,
                                         ATL_mvn_Meq5,
                                         ATL_mvn_Meq6,
                                         ATL_mvn_Meq7,
                                         ATL_mvn_Meq8,
                                         ATL_mvn_Meq9,
                                         ATL_mvn_Meq10,
                                         ATL_mvn_Meq11,
                                         ATL_mvn_Meq12,
                                         ATL_mvn_Meq13,
                                         ATL_mvn_Meq14,
                                         ATL_mvn_Meq15
                                         };

   if ( M < 1 || N < 1 || (SCALAR_IS_ZERO(alpha) && SCALAR_IS_ONE(beta)) )
      return;
/*
 * Base max unrolling we use on how many regs we think we have
 */
   #ifdef ATL_GAS_x8664
   if (M > 14)
   #elif defined(ATL_GAS_x8632)
   if (M > 6)
   #else
   if (M > 15)
   #endif
   {
      Mjoin(PATL,mvnk_smallN)(M, N, alpha, A, lda, X, incX, beta, Y, incY);
      return;
   }
   mvfunc[M-1](M, N, alpha, A, lda, X, incX, beta, Y, incY);
#else
   #ifndef TUNING
   if (M <= 8)
      Mjoin(PATL,refgemv)(AtlasNoTrans, M, N, alpha, A, lda, X, incX,
                          beta, Y, incY);
   else
   #endif
      Mjoin(PATL,mvnk_smallN)(M, N, alpha, A, lda, X, incX, beta, Y, incY);
#endif
}
示例#8
0
void Mjoin( PATL, gbmv )
(
   const enum ATLAS_TRANS     TRANS,
   const int                  M,
   const int                  N,
   const int                  KL,
   const int                  KU,
   const SCALAR               ALPHA,
   const TYPE                 * A,
   const int                  LDA,
   const TYPE                 * X,
   const int                  INCX,
   const SCALAR               BETA,
   TYPE                       * Y,
   const int                  INCY
)
{
/*
 * .. Local Variables ..
 */
/* ..
 * .. Executable Statements ..
 *
 */
   if( ( M == 0 ) || ( N == 0 ) ||
       ( ( SCALAR_IS_ZERO( ALPHA ) ) && ( SCALAR_IS_ONE( BETA ) ) ) ) return;

   if( SCALAR_IS_ZERO( ALPHA ) )
   {
      if( !( SCALAR_IS_ONE( BETA ) ) ) Mjoin( PATL, scal )( M, BETA, Y, INCY );
      return;
   }

   Mjoin( PATL, refgbmv )( TRANS, M, N, KL, KU, ALPHA, A, LDA, X, INCX, BETA,
                           Y, INCY );
/*
 * End of Mjoin( PATL, gbmv )
 */
}
示例#9
0
void good_axpy(const int N, const SCALAR alpha, const TYPE *X, const int incx,
               TYPE *Y, const int incy)
{
   int i;
   const int incX=incx+incx, incY=incy+incy;
   const register TYPE ra=*alpha, ia=alpha[1];
   register TYPE rx, ix;

   if ( SCALAR_IS_ZERO(alpha) ) return;
   for (i=0; i < N; i++, Y += incY, X += incX)
   {
      rx = *X; ix = X[1];
      *Y   += rx * ra - ix * ia;
      Y[1] += rx * ia + ix * ra;
   }
}
示例#10
0
文件: ATL_tger.c 项目: Leobin7/Kaldi
void Mjoin(Mjoin(PATL,t),MY_GER)
(ATL_CINT M, ATL_CINT N, const SCALAR alpha, const TYPE *X,
 ATL_CINT incX, const TYPE *Y, ATL_CINT incY, TYPE *A, ATL_CINT lda)
{
    ATL_INT mb, nb, mu, nu, nblks, nrblks, ncblks, ldaP;
    ATL_TGER_t pd;
    int P;
    static TYPE *A0=NULL, *A0e=NULL;

    if (M < 1 || N < 1 || SCALAR_IS_ZERO(alpha))  /* quick return if no-op */
        return;

    pd.M = M;
    pd.N = N;
    pd.incX = incX;
    pd.incY = incY;
    pd.lda = lda;
    pd.alpha = alpha;
    pd.X = X;
    pd.Y = Y;
    pd.A = A;
    pd.flg = (A0 == A || A0e == A+(M SHIFT)) ? 1 : 2;
    A0 = A;
    A0e = A+(M SHIFT);

    P = ATL_DivBySize(CacheEdge);
    P = ((size_t)M*N+P-1) / P;   /* add more procs only when cache is full */
    P = (P&1 && P > 1)?P+1 : P;  /* don't use odd P, since it hurts alignment */
//   printf("TGER, P=%d\n", P);
    P = Mmin(ATL_NTHREADS, P);
    /*
     * Make sure we don't overflow 32-bit integer lda
     */
    ldaP = P * lda;
    while ((size_t)ldaP != ((size_t)lda)*P)
    {
        P--;
        ldaP = P * lda;
    }
    if (P > 1)
        ATL_goparallel(P, MY_DOWORK_cols, &pd, NULL);
    else
        MY_GER1(M, N, alpha, X, incX, Y, incY, A, lda);
}
PRIVATE Int rescale_determinant
(
    Entry *d_mantissa,
    double *d_exponent
)
{
    double d_abs ;

    ABS (d_abs, *d_mantissa) ;

    if (SCALAR_IS_ZERO (d_abs))
    {
	/* the determinant is zero */
	*d_exponent = 0 ;
	return (FALSE) ;
    }

    if (SCALAR_IS_NAN (d_abs))
    {
	/* the determinant is NaN */
	return (FALSE) ;
    }

    while (d_abs < 1.)
    {
	SCALE (*d_mantissa, 10.0) ;
	*d_exponent = *d_exponent - 1.0 ;
	ABS (d_abs, *d_mantissa) ;
    }

    while (d_abs >= 10.)
    {
	SCALE (*d_mantissa, 0.1) ;
	*d_exponent = *d_exponent + 1.0 ;
	ABS (d_abs, *d_mantissa) ;
    }

    return (TRUE) ;
}
示例#12
0
void Mjoin(PATL,tgemv)
   (const enum ATLAS_TRANS TA, ATL_CINT M, ATL_CINT N, const SCALAR alpha,
    const TYPE *A, ATL_CINT lda, const TYPE *X, ATL_CINT incX,
    const SCALAR beta, TYPE *Y, ATL_CINT incY)
{
   static size_t ALb=0, ALe=0;
   size_t at = (size_t) A;
   ATL_INT n, P, ldaP;
   ATL_TGEMV_t pd;
/*
 * quick return if possible.
 */
   if (M < 1 || N < 1)
      return;
   if (SCALAR_IS_ZERO(alpha))   /* No contrib from alpha*A*x */
   {
      ATL_CINT NY = (TA == AtlasTrans || TA == AtlasConjTrans) ? N : M;
      if (!SCALAR_IS_ONE(beta))
      {
         if (SCALAR_IS_ZERO(beta))
            Mjoin(PATL,zero)(NY, Y, incY);
         else
            Mjoin(PATL,scal)(NY, beta, Y, incY);
      }
      return;
   }
   pd.flg = (at >= ALb && at <= ALe) ? 1 : 0;
   ALb = (size_t)A;
   ALe = (size_t)(A+(M SHIFT));
   #ifdef TREAL
      pd.flg |= (TA == AtlasTrans || TA == AtlasConjTrans) ? 2 : 0;
   #else
      if (TA != AtlasNoTrans)
      {
         if (TA == AtlasConj)
            pd.flg |= 4;
         else if (TA == AtlasTrans)
            pd.flg |= 2;
         else /* if (TA == AtlasConjTrans) */
            pd.flg |= (2|4);
      }
   #endif
   P = ATL_DivBySize(CacheEdge);
   P = ((size_t)M*N+P-1) / P;   /* add more procs only when cache is full */
   P = (P&1 && P > 1)?P+1 : P;  /* don't use odd P; it hurts alignment */
   P = Mmin(ATL_NTHREADS, P);
if (TA == AtlasNoTrans || TA == AtlasConj)
   P=1;
//fprintf(stderr, "P=%d, TA=%d, M=%d, N=%d\n", P, (TA==AtlasTrans), M, N);
/*
 * Make sure we don't overflow 32-bit integer lda
 */
   ldaP = P * lda;
   while ((size_t)ldaP != ((size_t)lda)*P)
   {
      P--;
      ldaP = P * lda;
   }
   if (P > 1)
   {
      pd.M = M; pd.N = N; pd.incX = incX; pd.incY = incY; pd.lda = lda;
      pd.alpha = alpha; pd.beta = beta;
      pd.X = X; pd.Y = Y; pd.A = A;
      pd.P = P;
      n = N / P;
      pd.n = n;
      pd.nr = N - n*P;
      if (pd.flg & 2)   /* Transpose case */
      {
         ATL_goparallel(P, Mjoin(PATL,DOMVTWORK_cols), &pd, NULL);
         return;
      }
/*
 *    For gemvN, everyone needs a private M-length y.  Don't do this unless
 *    we are sure the combine cost is likely dominated by the parallelism
 */
      else if (n > Mmax(P,8))
      {
         int vrank;
         const TYPE *a;
         TYPE *y, *y0;
         #ifdef TCPLX
            TYPE one[2] = {ATL_rone, ATL_rzero};
            TYPE zero[2] = {ATL_rzero, ATL_rzero};
         #endif

         y0 = y = malloc(P*(ATL_Cachelen+ATL_MulBySize(M)));
         ATL_assert(y);
         pd.Y = y;
         pd.incY = 1;
         #ifdef TREAL
            pd.alpha = ATL_rone;
            pd.beta  = ATL_rzero;
         #else
            pd.alpha = one;
            pd.beta  = zero;
         #endif
         ATL_goparallel(P, Mjoin(PATL,DOMVNWORK_cols), &pd,
                        Mjoin(PATL,CombineMVN));
/*
 *       goparallel reduces all node's Ys to node 0's.  Extract his from the
 *       work array, and combine it with input array, applying both alpha
 *       and beta in the process
 */
         vrank = (!pd.nr || (pd.flg & 1)) ? 0 : pd.nr-1;
         a = A + (lda SHIFT)*vrank;
         y = ATL_Align2Ptr(y, a);
         Mjoin(PATL,axpby)(M, alpha, y, 1, beta, Y, incY);
         free(y0);
         return;
      }
   }
/*
 * If we haven't parallelized this thing, just do it serial
 */
   Mjoin(PATL,gemv)(TA, M, N, alpha, A, lda, X, incX, beta, Y, incY);
}
示例#13
0
GLOBAL void UMF_2by2
(
    /* input, not modified: */
    Int n,		    /* A is n-by-n */
    const Int Ap [ ],	    /* size n+1 */
    const Int Ai [ ],	    /* size nz = Ap [n] */
    const double Ax [ ],    /* size nz if present */
#ifdef COMPLEX
    const double Az [ ],    /* size nz if present */
#endif
    double tol,		/* tolerance for determining whether or not an
			 * entry is numerically acceptable.  If tol <= 0
			 * then all numerical values ignored. */
    Int scale,		/* scaling to perform (none, sum, or max) */
    Int Cperm1 [ ],	/* singleton permutations */
#ifndef NDEBUG
    Int Rperm1 [ ],	/* not needed, since Rperm1 = Cperm1 for submatrix S */
#endif
    Int InvRperm1 [ ],	/* inverse of Rperm1 */
    Int n1,		/* number of singletons */
    Int nempty,		/* number of empty rows/cols */

    /* input, contents undefined on output: */
    Int Degree [ ],	/* Degree [j] is the number of off-diagonal
			 * entries in row/column j of S+S', where
			 * where S = A (Cperm1 [n1..], Rperm1 [n1..]).
			 * Note that S is not used, nor formed. */

    /* output: */
    Int P [ ],		/* P [k] = i means original row i is kth row in S(P,:)
			 * where S = A (Cperm1 [n1..], Rperm1 [n1..]) */
    Int *p_nweak,
    Int *p_unmatched,

    /* workspace (not defined on input or output): */
    Int Ri [ ],		/* of size >= max (nz, n) */
    Int Rp [ ],		/* of size n+1 */
    double Rs [ ],	/* of size n if present.  Rs = sum (abs (A),2) or
			 * max (abs (A),2), the sum or max of each row.  Unused
			 * if scale is equal to UMFPACK_SCALE_NONE. */
    Int Head [ ],	/* of size n.  Head pointers for bucket sort */
    Int Next [ ],	/* of size n.  Next pointers for bucket sort */
    Int Ci [ ],		/* size nz */
    Int Cp [ ]		/* size n+1 */
)
{

    /* ---------------------------------------------------------------------- */
    /* local variables */
    /* ---------------------------------------------------------------------- */

    Entry aij ;
    double cmax, value, rs, ctol, dvalue ;
    Int k, p, row, col, do_values, do_sum, do_max, do_scale, nweak, weak,
	p1, p2, dfound, unmatched, n2, oldrow, newrow, oldcol, newcol, pp ;
#ifdef COMPLEX
    Int split = SPLIT (Az) ;
#endif
#ifndef NRECIPROCAL
    Int do_recip = FALSE ;
#endif

#ifndef NDEBUG
    /* UMF_debug += 99 ; */
    DEBUGm3 (("\n ==================================UMF_2by2: tol %g\n", tol)) ;
    ASSERT (AMD_valid (n, n, Ap, Ai) == AMD_OK) ;
    for (k = n1 ; k < n - nempty ; k++)
    {
	ASSERT (Cperm1 [k] == Rperm1 [k]) ;
    }
#endif

    /* ---------------------------------------------------------------------- */
    /* determine scaling options */
    /* ---------------------------------------------------------------------- */

    /* use the values, but only if they are present */
    /* ignore the values if tol <= 0 */
    do_values = (tol > 0) && (Ax != (double *) NULL) ;
    if (do_values && (Rs != (double *) NULL))
    {
	do_sum = (scale == UMFPACK_SCALE_SUM) ;
	do_max = (scale == UMFPACK_SCALE_MAX) ;
    }
    else
    {
	/* no scaling */
	do_sum = FALSE ;
	do_max = FALSE ;
    }
    do_scale = do_max || do_sum ;
    DEBUGm3 (("do_values "ID" do_sum "ID" do_max "ID" do_scale "ID"\n",
	do_values, do_sum, do_max, do_scale)) ;

    /* ---------------------------------------------------------------------- */
    /* compute the row scaling, if requested */
    /* ---------------------------------------------------------------------- */

    /* see also umf_kernel_init */

    if (do_scale)
    {
#ifndef NRECIPROCAL
	double rsmin ;
#endif
	for (row = 0 ; row < n ; row++)
	{
	    Rs [row] = 0.0 ;
	}
	for (col = 0 ; col < n ; col++)
	{
	    p2 = Ap [col+1] ;
	    for (p = Ap [col] ; p < p2 ; p++)
	    {
		row = Ai [p] ;
		ASSIGN (aij, Ax, Az, p, split) ;
		APPROX_ABS (value, aij) ;
		rs = Rs [row] ;
		if (!SCALAR_IS_NAN (rs))
		{
		    if (SCALAR_IS_NAN (value))
		    {
			/* if any entry in a row is NaN, then the scale factor
			 * for the row is NaN.  It will be set to 1 later. */
			Rs [row] = value ;
		    }
		    else if (do_max)
		    {
			Rs [row] = MAX (rs, value) ;
		    }
		    else
		    {
			Rs [row] += value ;
		    }
		}
	    }
	}
#ifndef NRECIPROCAL
	rsmin = Rs [0] ;
	if (SCALAR_IS_ZERO (rsmin) || SCALAR_IS_NAN (rsmin))
	{
	    rsmin = 1.0 ;
	}
#endif
	for (row = 0 ; row < n ; row++)
	{
	    /* do not scale an empty row, or a row with a NaN */
	    rs = Rs [row] ;
	    if (SCALAR_IS_ZERO (rs) || SCALAR_IS_NAN (rs))
	    {
		Rs [row] = 1.0 ;
	    }
#ifndef NRECIPROCAL
	    rsmin = MIN (rsmin, Rs [row]) ;
#endif
	}

#ifndef NRECIPROCAL
	/* multiply by the reciprocal if Rs is not too small */
	do_recip = (rsmin >= RECIPROCAL_TOLERANCE) ;
	if (do_recip)
	{
	    /* invert the scale factors */
	    for (row = 0 ; row < n ; row++)
	    {
		Rs [row] = 1.0 / Rs [row] ;
	    }
	}
#endif
    }

    /* ---------------------------------------------------------------------- */
    /* compute the max in each column and find diagonal */
    /* ---------------------------------------------------------------------- */

    nweak = 0 ;

#ifndef NDEBUG
    for (k = 0 ; k < n ; k++)
    {
	ASSERT (Rperm1 [k] >= 0 && Rperm1 [k] < n) ;
	ASSERT (InvRperm1 [Rperm1 [k]] == k) ;
    }
#endif

    n2 = n - n1 - nempty ;

    /* use Ri to count the number of strong entries in each row */
    for (row = 0 ; row < n2 ; row++)
    {
	Ri [row] = 0 ;
    }

    pp = 0 ;
    ctol = 0 ;
    dvalue = 1 ;

    /* construct C = pruned submatrix, strong values only, column form */

    for (k = n1 ; k < n - nempty ; k++)
    {
	oldcol = Cperm1 [k] ;
	newcol = k - n1 ;
	Next [newcol] = EMPTY ;
	DEBUGm1 (("Column "ID" newcol "ID" oldcol "ID"\n", k, newcol, oldcol)) ;

	Cp [newcol] = pp ;

	dfound = FALSE ;
	p1 = Ap [oldcol] ;
	p2 = Ap [oldcol+1] ;
	if (do_values)
	{
	    cmax = 0 ;
	    dvalue = 0 ;

	    if (!do_scale)
	    {
		/* no scaling */
		for (p = p1 ; p < p2 ; p++)
		{
		    oldrow = Ai [p] ;
		    ASSERT (oldrow >= 0 && oldrow < n) ;
		    newrow = InvRperm1 [oldrow] - n1 ;
		    ASSERT (newrow >= -n1 && newrow < n2) ;
		    if (newrow < 0) continue ;
		    ASSIGN (aij, Ax, Az, p, split) ;
		    APPROX_ABS (value, aij) ;
		    /* if either cmax or value is NaN, define cmax as NaN */
		    if (!SCALAR_IS_NAN (cmax))
		    {
			if (SCALAR_IS_NAN (value))
			{
			    cmax = value ;
			}
			else
			{
			    cmax = MAX (cmax, value) ;
			}
		    }
		    if (oldrow == oldcol)
		    {
			/* we found the diagonal entry in this column */
			dvalue = value ;
			dfound = TRUE ;
			ASSERT (newrow == newcol) ;
		    }
		}
	    }
#ifndef NRECIPROCAL
	    else if (do_recip)
	    {
		/* multiply by the reciprocal */
		for (p = p1 ; p < p2 ; p++)
		{
		    oldrow = Ai [p] ;
		    ASSERT (oldrow >= 0 && oldrow < n) ;
		    newrow = InvRperm1 [oldrow] - n1 ;
		    ASSERT (newrow >= -n1 && newrow < n2) ;
		    if (newrow < 0) continue ;
		    ASSIGN (aij, Ax, Az, p, split) ;
		    APPROX_ABS (value, aij) ;
		    value *= Rs [oldrow] ;
		    /* if either cmax or value is NaN, define cmax as NaN */
		    if (!SCALAR_IS_NAN (cmax))
		    {
			if (SCALAR_IS_NAN (value))
			{
			    cmax = value ;
			}
			else
			{
			    cmax = MAX (cmax, value) ;
			}
		    }
		    if (oldrow == oldcol)
		    {
			/* we found the diagonal entry in this column */
			dvalue = value ;
			dfound = TRUE ;
			ASSERT (newrow == newcol) ;
		    }
		}
	    }
#endif
	    else
	    {
		/* divide instead */
		for (p = p1 ; p < p2 ; p++)
		{
		    oldrow = Ai [p] ;
		    ASSERT (oldrow >= 0 && oldrow < n) ;
		    newrow = InvRperm1 [oldrow] - n1 ;
		    ASSERT (newrow >= -n1 && newrow < n2) ;
		    if (newrow < 0) continue ;
		    ASSIGN (aij, Ax, Az, p, split) ;
		    APPROX_ABS (value, aij) ;
		    value /= Rs [oldrow] ;
		    /* if either cmax or value is NaN, define cmax as NaN */
		    if (!SCALAR_IS_NAN (cmax))
		    {
			if (SCALAR_IS_NAN (value))
			{
			    cmax = value ;
			}
			else
			{
			    cmax = MAX (cmax, value) ;
			}
		    }
		    if (oldrow == oldcol)
		    {
			/* we found the diagonal entry in this column */
			dvalue = value ;
			dfound = TRUE ;
			ASSERT (newrow == newcol) ;
		    }
		}
	    }

	    ctol = tol * cmax ;
	    DEBUGm1 (("    cmax col "ID" %g  ctol %g\n", oldcol, cmax, ctol)) ;
	}
	else
	{
	    for (p = p1 ; p < p2 ; p++)
	    {
		oldrow = Ai [p] ;
		ASSERT (oldrow >= 0 && oldrow < n) ;
		newrow = InvRperm1 [oldrow] - n1 ;
		ASSERT (newrow >= -n1 && newrow < n2) ;
		if (newrow < 0) continue ;
		Ci [pp++] = newrow ;
		if (oldrow == oldcol)
		{
		    /* we found the diagonal entry in this column */
		    ASSERT (newrow == newcol) ;
		    dfound = TRUE ;
		}
		/* count the entries in each column */
		Ri [newrow]++ ;
	    }
	}

	/* ------------------------------------------------------------------ */
	/* flag the weak diagonals */
	/* ------------------------------------------------------------------ */

	if (!dfound)
	{
	    /* no diagonal entry present */
	    weak = TRUE ;
	}
	else
	{
	    /* diagonal entry is present, check its value */
	    weak = (do_values) ?  WEAK (dvalue, ctol) : FALSE ;
	}
	if (weak)
	{
	    /* flag this column as weak */
	    DEBUG0 (("Weak!\n")) ;
	    Next [newcol] = IS_WEAK ;
	    nweak++ ;
	}

	/* ------------------------------------------------------------------ */
	/* count entries in each row that are not numerically weak */
	/* ------------------------------------------------------------------ */

	if (do_values)
	{
	    if (!do_scale)
	    {
		/* no scaling */
		for (p = p1 ; p < p2 ; p++)
		{
		    oldrow = Ai [p] ;
		    newrow = InvRperm1 [oldrow] - n1 ;
		    if (newrow < 0) continue ;
		    ASSIGN (aij, Ax, Az, p, split) ;
		    APPROX_ABS (value, aij) ;
		    weak = WEAK (value, ctol) ;
		    if (!weak)
		    {
			DEBUG0 (("    strong: row "ID": %g\n", oldrow, value)) ;
			Ci [pp++] = newrow ;
			Ri [newrow]++ ;
		    }
		}
	    }
#ifndef NRECIPROCAL
	    else if (do_recip)
	    {
		/* multiply by the reciprocal */
		for (p = p1 ; p < p2 ; p++)
		{
		    oldrow = Ai [p] ;
		    newrow = InvRperm1 [oldrow] - n1 ;
		    if (newrow < 0) continue ;
		    ASSIGN (aij, Ax, Az, p, split) ;
		    APPROX_ABS (value, aij) ;
		    value *= Rs [oldrow] ;
		    weak = WEAK (value, ctol) ;
		    if (!weak)
		    {
			DEBUG0 (("    strong: row "ID": %g\n", oldrow, value)) ;
			Ci [pp++] = newrow ;
			Ri [newrow]++ ;
		    }
		}
	    }
#endif
	    else
	    {
		/* divide instead */
		for (p = p1 ; p < p2 ; p++)
		{
		    oldrow = Ai [p] ;
		    newrow = InvRperm1 [oldrow] - n1 ;
		    if (newrow < 0) continue ;
		    ASSIGN (aij, Ax, Az, p, split) ;
		    APPROX_ABS (value, aij) ;
		    value /= Rs [oldrow] ;
		    weak = WEAK (value, ctol) ;
		    if (!weak)
		    {
			DEBUG0 (("    strong: row "ID": %g\n", oldrow, value)) ;
			Ci [pp++] = newrow ;
			Ri [newrow]++ ;
		    }
		}
	    }
	}
    }
    Cp [n2] = pp ;
    ASSERT (AMD_valid (n2, n2, Cp, Ci) == AMD_OK) ;

    if (nweak == 0)
    {
	/* nothing to do, quick return */
	DEBUGm2 (("\n =============================UMF_2by2: quick return\n")) ;
	for (k = 0 ; k < n ; k++)
	{
	    P [k] = k ;
	}
	*p_nweak = 0 ;
	*p_unmatched = 0 ;
	return ;
    }

#ifndef NDEBUG
    for (k = 0 ; k < n2 ; k++)
    {
	P [k] = EMPTY ;
    }
    for (k = 0 ; k < n2 ; k++)
    {
	ASSERT (Degree [k] >= 0 && Degree [k] < n2) ;
    }
#endif

    /* ---------------------------------------------------------------------- */
    /* find the 2-by-2 permutation */
    /* ---------------------------------------------------------------------- */

    /* The matrix S is now mapped to the index range 0 to n2-1.  We have
     * S = A (Rperm [n1 .. n-nempty-1], Cperm [n1 .. n-nempty-1]), and then
     * C = pattern of strong entries in S.  A weak diagonal k in S is marked
     * with Next [k] = IS_WEAK. */

    unmatched = two_by_two (n2, Cp, Ci, Degree, Next, Ri, P, Rp, Head) ;

    /* ---------------------------------------------------------------------- */

    *p_nweak = nweak ;
    *p_unmatched = unmatched ;

#ifndef NDEBUG
    DEBUGm4 (("UMF_2by2: weak "ID"  unmatched "ID"\n", nweak, unmatched)) ;
    for (row = 0 ; row < n ; row++)
    {
	DEBUGm2 (("P ["ID"] = "ID"\n", row, P [row])) ;
    }
    DEBUGm2 (("\n =============================UMF_2by2: done\n\n")) ;
#endif
}
GLOBAL void UMF_kernel_wrapup
(
    NumericType *Numeric,
    SymbolicType *Symbolic,
    WorkType *Work
)
{

    /* ---------------------------------------------------------------------- */
    /* local variables */
    /* ---------------------------------------------------------------------- */

    Entry pivot_value ;
    double d ;
    Entry *D ;
    Int i, k, col, row, llen, ulen, *ip, *Rperm, *Cperm, *Lilen, npiv, lp,
	*Uilen, *Lip, *Uip, *Cperm_init, up, pivrow, pivcol, *Lpos, *Upos, *Wr,
	*Wc, *Wp, *Frpos, *Fcpos, *Row_degree, *Col_degree, *Rperm_init,
	n_row, n_col, n_inner, zero_pivot, nan_pivot, n1 ;

#ifndef NDEBUG
    UMF_dump_matrix (Numeric, Work, FALSE) ;
#endif

    DEBUG0 (("Kernel complete, Starting Kernel wrapup\n")) ;
    n_row = Symbolic->n_row ;
    n_col = Symbolic->n_col ;
    n_inner = MIN (n_row, n_col) ;
    Rperm = Numeric->Rperm ;
    Cperm = Numeric->Cperm ;
    Lilen = Numeric->Lilen ;
    Uilen = Numeric->Uilen ;
    Upos = Numeric->Upos ;
    Lpos = Numeric->Lpos ;
    Lip = Numeric->Lip ;
    Uip = Numeric->Uip ;
    D = Numeric->D ;

    npiv = Work->npiv ;
    Numeric->npiv = npiv ;
    Numeric->ulen = Work->ulen ;

    ASSERT (n_row == Numeric->n_row) ;
    ASSERT (n_col == Symbolic->n_col) ;
    DEBUG0 (("Wrap-up: npiv "ID" ulen "ID"\n", npiv, Numeric->ulen)) ;
    ASSERT (npiv <= n_inner) ;

    /* this will be nonzero only if matrix is singular or rectangular */
    ASSERT (IMPLIES (npiv == n_col, Work->ulen == 0)) ;

    /* ---------------------------------------------------------------------- */
    /* find the smallest and largest entries in D */
    /* ---------------------------------------------------------------------- */

    for (k = 0 ; k < npiv ; k++)
    {
	pivot_value = D [k] ;
	ABS (d, pivot_value) ;
	zero_pivot = SCALAR_IS_ZERO (d) ;
	nan_pivot = SCALAR_IS_NAN (d) ;

	if (!zero_pivot)
	{
	    /* the pivot is nonzero, but might be Inf or NaN */
	    Numeric->nnzpiv++ ;
	}

	if (k == 0)
	{
	    Numeric->min_udiag = d ;
	    Numeric->max_udiag = d ;
	}
	else
	{
	    /* min (abs (diag (U))) behaves as follows:  If any entry is zero,
	       then the result is zero (regardless of the presence of NaN's).
	       Otherwise, if any entry is NaN, then the result is NaN.
	       Otherwise, the result is the smallest absolute value on the
	       diagonal of U.
	    */

	    if (SCALAR_IS_NONZERO (Numeric->min_udiag))
	    {
		if (zero_pivot || nan_pivot)
		{
		    Numeric->min_udiag = d ;
		}
		else if (!SCALAR_IS_NAN (Numeric->min_udiag))
		{
		    /* d and min_udiag are both non-NaN */
		    Numeric->min_udiag = MIN (Numeric->min_udiag, d) ;
		}
	    }

	    /*
	       max (abs (diag (U))) behaves as follows:  If any entry is NaN
	       then the result is NaN.  Otherise, the result is the largest
	       absolute value on the diagonal of U.
	    */

	    if (nan_pivot)
	    {
		Numeric->max_udiag = d ;
	    }
	    else if (!SCALAR_IS_NAN (Numeric->max_udiag))
	    {
		/* d and max_udiag are both non-NaN */
		Numeric->max_udiag = MAX (Numeric->max_udiag, d) ;
	    }
	}
    }

    /* ---------------------------------------------------------------------- */
    /* check if matrix is singular or rectangular */
    /* ---------------------------------------------------------------------- */

    Col_degree = Cperm ;	/* for NON_PIVOTAL_COL macro */
    Row_degree = Rperm ;	/* for NON_PIVOTAL_ROW macro */

    if (npiv < n_row)
    {
	/* finalize the row permutation */
	k = npiv ;
	DEBUGm3 (("Singular pivot rows "ID" to "ID"\n", k, n_row-1)) ;
	for (row = 0 ; row < n_row ; row++)
	{
	    if (NON_PIVOTAL_ROW (row))
	    {
		Rperm [row] = ONES_COMPLEMENT (k) ;
		DEBUGm3 (("Singular row "ID" is k: "ID" pivot row\n", row, k)) ;
		ASSERT (!NON_PIVOTAL_ROW (row)) ;
		Lpos [row] = EMPTY ;
		Uip [row] = EMPTY ;
		Uilen [row] = 0 ;
		k++ ;
	    }
	}
	ASSERT (k == n_row) ;
    }

    if (npiv < n_col)
    {
	/* finalize the col permutation */
	k = npiv ;
	DEBUGm3 (("Singular pivot cols "ID" to "ID"\n", k, n_col-1)) ;
	for (col = 0 ; col < n_col ; col++)
	{
	    if (NON_PIVOTAL_COL (col))
	    {
		Cperm [col] = ONES_COMPLEMENT (k) ;
		DEBUGm3 (("Singular col "ID" is k: "ID" pivot row\n", col, k)) ;
		ASSERT (!NON_PIVOTAL_COL (col)) ;
		Upos [col] = EMPTY ;
		Lip [col] = EMPTY ;
		Lilen [col] = 0 ;
		k++ ;
	    }
	}
	ASSERT (k == n_col) ;
    }

    if (npiv < n_inner)
    {
	/* finalize the diagonal of U */
	DEBUGm3 (("Diag of U is zero, "ID" to "ID"\n", npiv, n_inner-1)) ;
	for (k = npiv ; k < n_inner ; k++)
	{
	    CLEAR (D [k]) ;
	}
    }

    /* save the pattern of the last row of U */
    if (Numeric->ulen > 0)
    {
	DEBUGm3 (("Last row of U is not empty\n")) ;
	Numeric->Upattern = Work->Upattern ;
	Work->Upattern = (Int *) NULL ;
    }

    DEBUG2 (("Nnzpiv: "ID"  npiv "ID"\n", Numeric->nnzpiv, npiv)) ;
    ASSERT (Numeric->nnzpiv <= npiv) ;
    if (Numeric->nnzpiv < n_inner && !SCALAR_IS_NAN (Numeric->min_udiag))
    {
	/* the rest of the diagonal is zero, so min_udiag becomes 0,
	 * unless it is already NaN. */
	Numeric->min_udiag = 0.0 ;
    }

    /* ---------------------------------------------------------------------- */
    /* size n_row, n_col workspaces that can be used here: */
    /* ---------------------------------------------------------------------- */

    Frpos = Work->Frpos ;	/* of size n_row+1 */
    Fcpos = Work->Fcpos ;	/* of size n_col+1 */
    Wp = Work->Wp ;		/* of size MAX(n_row,n_col)+1 */
    /* Work->Upattern ;		cannot be used (in Numeric) */
    Wr = Work->Lpattern ;	/* of size n_row+1 */
    Wc = Work->Wrp ;		/* of size n_col+1 or bigger */

    /* ---------------------------------------------------------------------- */
    /* construct Rperm from inverse permutations */
    /* ---------------------------------------------------------------------- */

    /* use Frpos for temporary copy of inverse row permutation [ */

    for (pivrow = 0 ; pivrow < n_row ; pivrow++)
    {
	k = Rperm [pivrow] ;
	ASSERT (k < 0) ;
	k = ONES_COMPLEMENT (k) ;
	ASSERT (k >= 0 && k < n_row) ;
	Wp [k] = pivrow ;
	Frpos [pivrow] = k ;
    }
    for (k = 0 ; k < n_row ; k++)
    {
	Rperm [k] = Wp [k] ;
    }

    /* ---------------------------------------------------------------------- */
    /* construct Cperm from inverse permutation */
    /* ---------------------------------------------------------------------- */

    /* use Fcpos for temporary copy of inverse column permutation [ */

    for (pivcol = 0 ; pivcol < n_col ; pivcol++)
    {
	k = Cperm [pivcol] ;
	ASSERT (k < 0) ;
	k = ONES_COMPLEMENT (k) ;
	ASSERT (k >= 0 && k < n_col) ;
	Wp [k] = pivcol ;
	/* save a copy of the inverse column permutation in Fcpos */
	Fcpos [pivcol] = k ;
    }
    for (k = 0 ; k < n_col ; k++)
    {
	Cperm [k] = Wp [k] ;
    }

#ifndef NDEBUG
    for (k = 0 ; k < n_col ; k++)
    {
	col = Cperm [k] ;
	ASSERT (col >= 0 && col < n_col) ;
	ASSERT (Fcpos [col] == k) ;		/* col is the kth pivot */
    }
    for (k = 0 ; k < n_row ; k++)
    {
	row = Rperm [k] ;
	ASSERT (row >= 0 && row < n_row) ;
	ASSERT (Frpos [row] == k) ;		/* row is the kth pivot */
    }
#endif

#ifndef NDEBUG
    UMF_dump_lu (Numeric) ;
#endif

    /* ---------------------------------------------------------------------- */
    /* permute Lpos, Upos, Lilen, Lip, Uilen, and Uip */
    /* ---------------------------------------------------------------------- */

    for (k = 0 ; k < npiv ; k++)
    {
	pivrow = Rperm [k] ;
	Wr [k] = Uilen [pivrow] ;
	Wp [k] = Uip [pivrow] ;
    }

    for (k = 0 ; k < npiv ; k++)
    {
	Uilen [k] = Wr [k] ;
	Uip [k] = Wp [k] ;
    }

    for (k = 0 ; k < npiv ; k++)
    {
	pivrow = Rperm [k] ;
	Wp [k] = Lpos [pivrow] ;
    }

    for (k = 0 ; k < npiv ; k++)
    {
	Lpos [k] = Wp [k] ;
    }

    for (k = 0 ; k < npiv ; k++)
    {
	pivcol = Cperm [k] ;
	Wc [k] = Lilen [pivcol] ;
	Wp [k] = Lip [pivcol] ;
    }

    for (k = 0 ; k < npiv ; k++)
    {
	Lilen [k] = Wc [k] ;
	Lip [k] = Wp [k] ;
    }

    for (k = 0 ; k < npiv ; k++)
    {
	pivcol = Cperm [k] ;
	Wp [k] = Upos [pivcol] ;
    }

    for (k = 0 ; k < npiv ; k++)
    {
	Upos [k] = Wp [k] ;
    }

    /* ---------------------------------------------------------------------- */
    /* terminate the last Uchain and last Lchain */
    /* ---------------------------------------------------------------------- */

    Upos [npiv] = EMPTY ;
    Lpos [npiv] = EMPTY ;
    Uip [npiv] = EMPTY ;
    Lip [npiv] = EMPTY ;
    Uilen [npiv] = 0 ;
    Lilen [npiv] = 0 ;

    /* ---------------------------------------------------------------------- */
    /* convert U to the new pivot order */
    /* ---------------------------------------------------------------------- */

    n1 = Symbolic->n1 ;

    for (k = 0 ; k < n1 ; k++)
    {
	/* this is a singleton row of U */
	ulen = Uilen [k] ;
	DEBUG4 (("K "ID" New U.  ulen "ID" Singleton 1\n", k, ulen)) ;
	if (ulen > 0)
	{
	    up = Uip [k] ;
	    ip = (Int *) (Numeric->Memory + up) ;
	    for (i = 0 ; i < ulen ; i++)
	    {
		col = *ip ;
		DEBUG4 ((" old col "ID" new col "ID"\n", col, Fcpos [col]));
		ASSERT (col >= 0 && col < n_col) ;
		*ip++ = Fcpos [col] ;
	    }
	}
    }

    for (k = n1 ; k < npiv ; k++)
    {
	up = Uip [k] ;
	if (up < 0)
	{
	    /* this is the start of a new Uchain (with a pattern) */
	    ulen = Uilen [k] ;
	    DEBUG4 (("K "ID" New U.  ulen "ID" End_Uchain 1\n", k, ulen)) ;
	    if (ulen > 0)
	    {
		up = -up ;
		ip = (Int *) (Numeric->Memory + up) ;
		for (i = 0 ; i < ulen ; i++)
		{
		    col = *ip ;
		    DEBUG4 ((" old col "ID" new col "ID"\n", col, Fcpos [col]));
		    ASSERT (col >= 0 && col < n_col) ;
		    *ip++ = Fcpos [col] ;
		}
	    }
	}
    }

    ulen = Numeric->ulen ;
    if (ulen > 0)
    {
	/* convert last pivot row of U to the new pivot order */
	DEBUG4 (("K "ID" (last)\n", k)) ;
	for (i = 0 ; i < ulen ; i++)
	{
	    col = Numeric->Upattern [i] ;
	    DEBUG4 (("    old col "ID" new col "ID"\n", col, Fcpos [col])) ;
	    Numeric->Upattern [i] = Fcpos [col] ;
	}
    }

    /* Fcpos no longer needed ] */

    /* ---------------------------------------------------------------------- */
    /* convert L to the new pivot order */
    /* ---------------------------------------------------------------------- */

    for (k = 0 ; k < n1 ; k++)
    {
	llen = Lilen [k] ;
	DEBUG4 (("K "ID" New L.  llen "ID" Singleton col\n", k, llen)) ;
	if (llen > 0)
	{
	    lp = Lip [k] ;
	    ip = (Int *) (Numeric->Memory + lp) ;
	    for (i = 0 ; i < llen ; i++)
	    {
		row = *ip ;
		DEBUG4 (("    old row "ID" new row "ID"\n", row, Frpos [row])) ;
		ASSERT (row >= 0 && row < n_row) ;
		*ip++ = Frpos [row] ;
	    }
	}
    }

    for (k = n1 ; k < npiv ; k++)
    {
	llen = Lilen [k] ;
	DEBUG4 (("K "ID" New L.  llen "ID" \n", k, llen)) ;
	if (llen > 0)
	{
	    lp = Lip [k] ;
	    if (lp < 0)
	    {
		/* this starts a new Lchain */
		lp = -lp ;
	    }
	    ip = (Int *) (Numeric->Memory + lp) ;
	    for (i = 0 ; i < llen ; i++)
	    {
		row = *ip ;
		DEBUG4 (("    old row "ID" new row "ID"\n", row, Frpos [row])) ;
		ASSERT (row >= 0 && row < n_row) ;
		*ip++ = Frpos [row] ;
	    }
	}
    }

    /* Frpos no longer needed ] */

    /* ---------------------------------------------------------------------- */
    /* combine symbolic and numeric permutations */
    /* ---------------------------------------------------------------------- */

    Cperm_init = Symbolic->Cperm_init ;
    Rperm_init = Symbolic->Rperm_init ;

    for (k = 0 ; k < n_row ; k++)
    {
	Rperm [k] = Rperm_init [Rperm [k]] ;
    }

    for (k = 0 ; k < n_col ; k++)
    {
	Cperm [k] = Cperm_init [Cperm [k]] ;
    }

    /* Work object will be freed immediately upon return (to UMF_kernel */
    /* and then to UMFPACK_numeric). */
}
示例#15
0
Int KLU_rcond           /* return TRUE if successful, FALSE otherwise */
(
    KLU_symbolic *Symbolic,     /* input, not modified */
    KLU_numeric *Numeric,       /* input, not modified */
    KLU_common *Common          /* result in Common->rcond */
)
{
    double ukk, umin = 0, umax = 0 ;
    Entry *Udiag ;
    Int j, n ;

    /* ---------------------------------------------------------------------- */
    /* check inputs */
    /* ---------------------------------------------------------------------- */

    if (Common == NULL)
    {
        return (FALSE) ;
    }
    if (Symbolic == NULL)
    {
        Common->status = KLU_INVALID ;
        return (FALSE) ;
    }
    if (Numeric == NULL)
    {
        Common->rcond = 0 ;
        Common->status = KLU_SINGULAR ;
        return (TRUE) ;
    }
    Common->status = KLU_OK ;

    /* ---------------------------------------------------------------------- */
    /* compute rcond */
    /* ---------------------------------------------------------------------- */

    n = Symbolic->n ;
    Udiag = Numeric->Udiag ;
    for (j = 0 ; j < n ; j++)
    {
        /* get the magnitude of the pivot */
        ABS (ukk, Udiag [j]) ;
        if (SCALAR_IS_NAN (ukk) || SCALAR_IS_ZERO (ukk))
        {
            /* if NaN, or zero, the rcond is zero */
            Common->rcond = 0 ;
            Common->status = KLU_SINGULAR ;
            return (TRUE) ;
        }
        if (j == 0)
        {
            /* first pivot entry */
            umin = ukk ;
            umax = ukk ;
        }
        else
        {
            /* subsequent pivots */
            umin = MIN (umin, ukk) ;
            umax = MAX (umax, ukk) ;
        }
    }

    Common->rcond = umin / umax ;
    if (SCALAR_IS_NAN (Common->rcond) || SCALAR_IS_ZERO (Common->rcond))
    {
        /* this can occur if umin or umax are Inf or NaN */
        Common->rcond = 0 ;
        Common->status = KLU_SINGULAR ;
    }
    return (TRUE) ;
}
示例#16
0
文件: ATL_hpmvU.c 项目: certik/vendor
void Mjoin( PATL, hpmvU )
(
   const int                  N,
   const TYPE                 * A,
   const int                  LDA,
   const TYPE                 * X,
   const SCALAR               BETA,
   TYPE                       * Y
)
{
/*
 * Purpose
 * =======
 *
 * Mjoin( PATL, hpmvU ) performs the matrix-vector operation
 *
 *    y := alpha * A * x + beta * y,
 *
 * where alpha and beta are scalars, x and y are n-element vectors and A
 * is an n by n Hermitian matrix, supplied in packed form.
 *
 * This is a  recursive  version of the  algorithm.  For a more detailed
 * description of  the arguments of this function, see the reference im-
 * plementation in the  ATLAS/src/blas/reference directory.
 *
 * ---------------------------------------------------------------------
 */
/*
 * .. Local Variables ..
 */
   void                       (*gpmvT)( const int, const int, const SCALAR,
                              const TYPE *, const int, const TYPE *, const int,
                              const SCALAR, TYPE *, const int );
   void                       (*gpmvN)( const int, const int, const SCALAR,
                              const TYPE *, const int, const TYPE *, const int,
                              const SCALAR, TYPE *, const int );
#ifdef TREAL
#define    one                ATL_rone
   TYPE                       beta0;
#else
   const TYPE                 * beta0, one[2] = { ATL_rone, ATL_rzero };
#endif
   TYPE                       * A0, * x0, * y0;
   int                        j, jb, jbs, lda = LDA, m, mb, nb;
/* ..
 * .. Executable Statements ..
 *
 */
   ATL_GetPartSPMV( A, N, &mb, &nb );

   beta0 = BETA;
   if(      SCALAR_IS_ZERO( beta0 ) )
   {
      gpmvN = Mjoin( PATL, gpmvUN_a1_x1_b0_y1 );
      gpmvT = Mjoin( PATL, gpmvUC_a1_x1_b0_y1 );
   }
   else if( SCALAR_IS_ONE ( beta0 ) )
   {
      gpmvN = Mjoin( PATL, gpmvUN_a1_x1_b1_y1 );
      gpmvT = Mjoin( PATL, gpmvUC_a1_x1_b1_y1 );
   }
   else
   {
      gpmvN = Mjoin( PATL, gpmvUN_a1_x1_bX_y1 );
      gpmvT = Mjoin( PATL, gpmvUC_a1_x1_bX_y1 );
   }

   MUpnext( N, A, lda );
   x0 = (TYPE *)(X); X += (N SHIFT); y0 = (TYPE *)(Y); Y += (N SHIFT);

   for( j = 0; j < N; j += nb )
   {
      jb = N - j; jb = Mmin( jb, nb ); jbs = (jb SHIFT);
      MUpprev( jb, A, lda ); X -= jbs; Y -= jbs;

      if( ( m = N-j-jb ) != 0 )
      {
         A0 = (TYPE *)(A) - (m SHIFT);
         gpmvT( jb, m, one, A0, lda, x0, 1, beta0, Y,  1 );
         gpmvN( m, jb, one, A0, lda, X,  1, beta0, y0, 1 ); beta0 = one;
      }
      Mjoin( PATL, refhpmvU )( jb, one, A, lda, X, 1, beta0, Y, 1 );
      gpmvN = Mjoin( PATL, gpmvUN_a1_x1_b1_y1 ); beta0 = one;
      gpmvT = Mjoin( PATL, gpmvUC_a1_x1_b1_y1 );
   }
/*
 * End of Mjoin( PATL, hpmvU )
 */
}
示例#17
0
文件: ATL_mmIJK.c 项目: apollos/atlas
int Mjoin(PATL,mmIJK)(const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
                      const int M, const int N0, const int K,
                      const SCALAR alpha, const TYPE *A, const int lda0,
                      const TYPE *B, const int ldb0, const SCALAR beta,
                      TYPE *C, const int ldc0)
{
    size_t incA, incB, incC;
    const size_t lda=lda0, ldb=ldb0, ldc=ldc0;
    const size_t incK = ATL_MulByNB((size_t)K);
    int N = N0;
    int nMb, nNb, nKb, ib, jb, kb, jb2, h, i, j, k, n;
    void *vA=NULL, *vC=NULL;
    TYPE *pA, *pB, *pC;
    MAT2BLK A2blk, B2blk;
    PUTBLK putblk;
    NBMM0 NBmm0;

    nMb = ATL_DivByNB(M);
    nNb = ATL_DivByNB(N);
    nKb = ATL_DivByNB(K);
    ib = M - ATL_MulByNB(nMb);
    jb = N - ATL_MulByNB(nNb);
    kb = K - ATL_MulByNB(nKb);

    /*
     * If K sufficiently large, write to temporary C as safety measure;  otherwise
     * write directly to C
     */
    if (nKb < 12)
    {
        putblk = NULL;
        pC = C;
        if ( SCALAR_IS_ONE(beta) ) NBmm0 = NBmm_b1;
        else if ( SCALAR_IS_ZERO(beta) ) NBmm0 = NBmm_b0;
        else NBmm0 = NBmm_bX;
    }
    else
    {
        NBmm0 = NBmm_b0;
        vC = malloc(ATL_Cachelen + ATL_MulBySize(NBNB));
        if (!vC) return(-1);
        pC = ATL_AlignPtr(vC);
        if ( SCALAR_IS_ONE(beta) ) putblk = Mjoin(PATL,putblk_b1);
        else if ( SCALAR_IS_ZERO(beta) ) putblk = Mjoin(PATL,putblk_b0);
        else if ( SCALAR_IS_NONE(beta) ) putblk = Mjoin(PATL,putblk_bn1);
        else putblk = Mjoin(PATL,putblk_bX);
    }
    /*
     * Special case if we don't need to copy one or more input matrix
     */
    if (K == NB && TB == AtlasNoTrans && ldb == NB && ATL_DataIsMinAligned(B))
    {
        if (lda == NB && TA == AtlasTrans && SCALAR_IS_ONE(alpha) &&
                ATL_DataIsMinAligned(A))
        {
            i = NBNB;
            pA = (TYPE *) A;
            A = NULL;
            A2blk = NULL;
            incA = 0;
        }
        else
        {
            vA = malloc(ATL_Cachelen + ATL_MulBySize(incK));
            if (!vA)
            {
                free(vC);
                return(-1);
            }
            pA = ATL_AlignPtr(vA);
            if (TA == AtlasNoTrans)
            {
                incA = NB;
                if ( SCALAR_IS_ONE(alpha) ) A2blk = Mjoin(PATL,row2blkT_a1);
                else A2blk = Mjoin(PATL,row2blkT_aX);
            }
            else
            {
                incA = ATL_MulByNB(lda);
                if ( SCALAR_IS_ONE(alpha) ) A2blk = Mjoin(PATL,col2blk_a1);
                else A2blk = Mjoin(PATL,col2blk_aX);
            }
        }
        Mjoin(PATL,mmIJK2)(K, nMb, nNb, nKb, ib, jb, kb, alpha, A, lda, pA,
                           incA, A2blk, B, beta, C, ldc, pC, putblk, NBmm0);
        if (vA) free(vA);
        if (vC) free(vC);
        return(0);
    }
    i = ATL_Cachelen + ATL_MulBySize(N*K + incK);
    if (i <= ATL_MaxMalloc) vA = malloc(i);
    if (!vA)
    {
        if (TA == AtlasNoTrans && TB == AtlasNoTrans)
        {
            if (vC) free(vC);
            return(1);
        }
        if (jb) n = nNb + 1;
        else n = nNb;
        for (j=2; !vA; j++)
        {
            k = n / j;
            if (k < 1) break;
            if (k*j < n) k++;
            h = ATL_Cachelen + ATL_MulBySize((k+1)*incK);
            if (h <= ATL_MaxMalloc) vA = malloc(h);
        }
        if (!vA)
        {
            if (vC) free(vC);
            return(-1);
        }
        n = ATL_MulByNB(k);
        jb2 = 0;
    }
    else
    {
        jb2 = jb;
        k = nNb;
        n = N;
    }
    pA = ATL_AlignPtr(vA);
    if (TB == AtlasNoTrans)
    {
        incB = ldb*n;
        if ( SCALAR_IS_ONE(alpha) ) B2blk = Mjoin(PATL,col2blk2_a1);
        else B2blk = Mjoin(PATL,col2blk2_aX);
    }
    else
    {
        incB = n;
        if ( SCALAR_IS_ONE(alpha) ) B2blk = Mjoin(PATL,row2blkT2_a1);
        else B2blk = Mjoin(PATL,row2blkT2_aX);
    }
    if (TA == AtlasNoTrans)
    {
        incA = NB;
        A2blk = Mjoin(PATL,row2blkT_a1);
    }
    else
    {
        incA = ATL_MulByNB(lda);
        A2blk = Mjoin(PATL,col2blk_a1);
    }
    incC = ldc*n;
    pB = pA + incK;

    do
    {
        if (TB == AtlasNoTrans) B2blk(K, n, B, ldb, pB, alpha);
        else B2blk(n, K, B, ldb, pB, alpha);
        Mjoin(PATL,mmIJK2)(K, nMb, k, nKb, ib, jb2, kb, alpha, A, lda, pA,
                           incA, A2blk, pB, beta, C, ldc, pC, putblk, NBmm0);
        N -= n;
        nNb -= k;
        if (N < n)
        {
            jb2 = jb;
            n = N;
            k = nNb;
        }
        C += incC;
        B += incB;
        if (!putblk) pC = C;
    }
    while (N);

    if (vC) free(vC);
    free(vA);
    return(0);
}
示例#18
0
void Mjoin(PATL,pputblk_diag)
   (const int M, const int N, const TYPE *V, const enum ATLAS_UPLO UC,
    TYPE *C, int ldc, int ldcinc, const SCALAR alpha, const SCALAR beta)
/*
 * Copies only the Upper or Lower portion of V to C
 */
{
   int i, j;

   if (UC == AtlasUpper)
   {
      if (SCALAR_IS_ZERO(beta))
      {
         if (SCALAR_IS_ONE(alpha))
         {
            for (j=0; j < N; j++)
            {
               for (i=0; i <= j; i++) C[i] = V[i];
               C += ldc;
               V += M;
               ldc += ldcinc;
            }
         }
         else if (SCALAR_IS_NONE(alpha))
         {
            for (j=0; j < N; j++)
            {
               for (i=0; i <= j; i++) C[i] = -V[i];
               C += ldc;
               V += M;
               ldc += ldcinc;
            }
         }
         else
         {
            for (j=0; j < N; j++)
            {
               for (i=0; i <= j; i++) C[i] = alpha * V[i];
               C += ldc;
               V += M;
               ldc += ldcinc;
            }
         }
      }
      else if (SCALAR_IS_ONE(beta))
      {
         if (SCALAR_IS_ONE(alpha))
         {
            for (j=0; j < N; j++)
            {
               for (i=0; i <= j; i++) C[i] += V[i];
               C += ldc;
               V += M;
               ldc += ldcinc;
            }
         }
         else if (SCALAR_IS_NONE(alpha))
         {
            for (j=0; j < N; j++)
            {
               for (i=0; i <= j; i++) C[i] -= V[i];
               C += ldc;
               V += M;
               ldc += ldcinc;
            }
         }
         else
         {
            for (j=0; j < N; j++)
            {
               for (i=0; i <= j; i++) C[i] += alpha * V[i];
               C += ldc;
               V += M;
               ldc += ldcinc;
            }
         }
      }
      else
      {
         if (SCALAR_IS_ONE(alpha))
         {
            for (j=0; j < N; j++)
            {
               for (i=0; i <= j; i++) C[i] = beta*C[i] + V[i];
               C += ldc;
               V += M;
               ldc += ldcinc;
            }
         }
         else if (SCALAR_IS_NONE(alpha))
         {
            for (j=0; j < N; j++)
            {
               for (i=0; i <= j; i++) C[i] = beta*C[i] - V[i];
               C += ldc;
               V += M;
               ldc += ldcinc;
            }
         }
         else
         {
            for (j=0; j < N; j++)
            {
               for (i=0; i <= j; i++) C[i] = beta*C[i] + alpha * V[i];
               C += ldc;
               V += M;
               ldc += ldcinc;
            }
         }
      }
   }
   else
   {
      if (SCALAR_IS_ZERO(beta))
      {
         if (SCALAR_IS_NONE(alpha))
         {
            for (j=0; j < N; j++)
            {
               ldc += ldcinc;
               for (i=j; i < M; i++) C[i] = -V[i];
               C += ldc;
               V += M;
            }
         }
         else if (SCALAR_IS_ONE(alpha))
         {
            for (j=0; j < N; j++)
            {
               ldc += ldcinc;
               for (i=j; i < M; i++) C[i] = V[i];
               C += ldc;
               V += M;
            }
         }
         else
         {
            for (j=0; j < N; j++)
            {
               ldc += ldcinc;
               for (i=j; i < M; i++) C[i] = alpha * V[i];
               C += ldc;
               V += M;
            }
         }
      }
      else if (SCALAR_IS_ONE(beta))
      {
         if (SCALAR_IS_NONE(alpha))
         {
            for (j=0; j < N; j++)
            {
               ldc += ldcinc;
               for (i=j; i < M; i++) C[i] -= V[i];
               C += ldc;
               V += M;
            }
         }
         else if (SCALAR_IS_ONE(alpha))
         {
            for (j=0; j < N; j++)
            {
               ldc += ldcinc;
               for (i=j; i < M; i++) C[i] += V[i];
               C += ldc;
               V += M;
            }
         }
         else
         {
            for (j=0; j < N; j++)
            {
               ldc += ldcinc;
               for (i=j; i < M; i++) C[i] += alpha * V[i];
               C += ldc;
               V += M;
            }
         }
      }
      else
      {
         if (SCALAR_IS_NONE(alpha))
         {
            for (j=0; j < N; j++)
            {
               ldc += ldcinc;
               for (i=j; i < M; i++) C[i] = beta*C[i] - V[i];
               C += ldc;
               V += M;
            }
         }
         else if (SCALAR_IS_ONE(alpha))
         {
            for (j=0; j < N; j++)
            {
               ldc += ldcinc;
               for (i=j; i < M; i++) C[i] = beta*C[i] + V[i];
               C += ldc;
               V += M;
            }
         }
         else
         {
            for (j=0; j < N; j++)
            {
               ldc += ldcinc;
               for (i=j; i < M; i++) C[i] = beta*C[i] + alpha * V[i];
               C += ldc;
               V += M;
            }
         }
      }
   }
}
示例#19
0
void Mjoin( PATL, ptgeadd )
(
   const int                  M,
   const int                  N,
   const SCALAR               ALPHA,
   const TYPE                 * A,
   const int                  LDA,
   const SCALAR               BETA,
   TYPE                       * C,
   const int                  LDC
)
{
/*
 * Purpose
 * =======
 *
 * Mjoin( PATL, ptgeadd ) adds an m-by-n matrix A to the matrix B.
 *
 * This is a multi-threaded version of the algorithm.
 *
 * Arguments
 * =========
 *
 * PTYPE   (input)                       const PT_MISC_TYPE_T *
 *         On entry, PTYPE  points  to the data structure containing the
 *         type information.
 *
 * NODE    (input)                       const unsigned int
 *         On entry, NODE specifies the current node number.
 *
 * THREADS (input)                       const unsigned int
 *         On entry, THREADS  specifies the number of threads to be used
 *         for the current operation.
 *
 * ATTR    (input)                       pthread_attr_t *
 *         On entry, ATTR  specifies  the  thread attribute object to be
 *         used for the node functions to be threaded.
 *
 * NB      (input)                       const int
 *         On entry, NB  specifies  the  blocksize  to  be  used for the
 *         problem size partitioning.
 *
 * ---------------------------------------------------------------------
 */
/*
 * .. Local Variables ..
 */
   pthread_attr_t             attr;
   PT_TREE_T                  root = NULL;
#ifdef TREAL
   TYPE                       alpha0 = (TYPE)(ALPHA),
                              beta0  = (TYPE)(BETA);
#endif
   void                       * alpha, * beta;
/* ..
 * .. Executable Statements ..
 *
 */
   if( ( M <= 0 ) || ( N <= 0 ) ||
       ( SCALAR_IS_ZERO( ALPHA ) && SCALAR_IS_ONE( BETA ) ) ) return;

#ifdef TREAL
   alpha = (void *)(&alpha0); beta = (void *)(&beta0);
#else
   alpha = (void *)(ALPHA);   beta = (void *)(BETA);
#endif
   ATL_thread_init( &attr );
   root = Mjoin( PATL, ptgeadd_nt )( ATL_NTHREADS, &attr, M, N, alpha,
                                     (void *)(A), LDA, beta, (void *)(C),
                                     LDC );
   ATL_join_tree  ( root );
   ATL_free_tree  ( root );
   ATL_thread_exit( &attr );
/*
 * End of Mjoin( PATL, ptgeadd )
 */
}
示例#20
0
文件: ATL_syr2k.c 项目: AIDman/Kaldi
void Mjoin( PATL, syr2k )
(
   const enum ATLAS_UPLO      UPLO,
   const enum ATLAS_TRANS     TRANS,
   const int                  N,
   const int                  K,
   const SCALAR               ALPHA,
   const TYPE                 * A,
   const int                  LDA,
   const TYPE                 * B,
   const int                  LDB,
   const SCALAR               BETA,
   TYPE                       * C,
   const int                  LDC
)
{
/*
 * Purpose
 * =======
 *
 * Mjoin( PATL, syr2k )  performs one of the @(syhe_comm) rank 2k operations
 *
 *    C := alpha * A * B' + alpha * B * A' + beta * C,
 *
 * or
 *
 *    C := alpha * A' * B + alpha * B' * A + beta * C,
 *
 * where alpha and beta are scalars, C is an n by n @(syhe_comm) matrix and
 * A and B are n by k matrices in the first case and k by n  matrices in
 * the second case.
 *
 * This is a  recursive  version of the  algorithm.  For a more detailed
 * description of  the arguments of this function, see the reference im-
 * plementation in the  ATLAS/src/blas/reference directory.
 *
 * ---------------------------------------------------------------------
 */
/*
 * .. Local Variables ..
 */
#ifdef TREAL
   TYPE                       alpha0 = (TYPE)(ALPHA), beta0 = (TYPE)(BETA);
   TYPE                       one = ATL_rone;
   TYPE                       * alpha, * beta;
#else
   TYPE                       one[2] = { ATL_rone, ATL_rzero };
   TYPE                       * alpha, * beta;
#endif
   RC3_FUN_SYR2K_T            ATL_rsyr2k;
   RC3_SYR2K_T                type;
/* ..
 * .. Executable Statements ..
 *
 */
   if( ( N == 0 ) ||
       ( ( SCALAR_IS_ZERO( ALPHA ) || ( K == 0 ) ) && SCALAR_IS_ONE( BETA ) ) )
      return;

   if( ( SCALAR_IS_ZERO( ALPHA ) ) || ( K == 0 ) )
   { Mjoin( PATL, trscal )( UPLO, N, N, BETA, C, LDC ); return; }
#ifdef TREAL
   type.size = sizeof( TYPE );    type.one = (void *)(&one);
   alpha     = &alpha0;           beta     = &beta0;
#else
   type.size = sizeof( TYPE[2] ); type.one = (void *)one;
   alpha     = (TYPE *)(ALPHA);   beta     = (TYPE *)(BETA);
#endif

   if( TRANS == AtlasNoTrans )
   {
      type.Tgemm = Mjoin( PATL, gemmNT_RB );
      if( UPLO == AtlasUpper )
      { type.Tsyr2k = Mjoin( PATL, syr2kUN ); ATL_rsyr2k = ATL_rsyr2kUN; }
      else
      { type.Tsyr2k = Mjoin( PATL, syr2kLN ); ATL_rsyr2k = ATL_rsyr2kLN; }
   }
   else
   {
      type.Tgemm = Mjoin( PATL, gemmTN_RB );
      if( UPLO == AtlasUpper )
      { type.Tsyr2k = Mjoin( PATL, syr2kUT ); ATL_rsyr2k = ATL_rsyr2kUT; }
      else
      { type.Tsyr2k = Mjoin( PATL, syr2kLT ); ATL_rsyr2k = ATL_rsyr2kLT; }
   }

   ATL_rsyr2k( &type, N, K, (void *)(alpha), (void *)(A), LDA, (void *)(B),
               LDB, (void *)(beta), (void *)(C), LDC, SYR2K_NB );
/*
 * End of Mjoin( PATL, syr2k )
 */
}
示例#21
0
文件: ATL_trmm.c 项目: AIDman/Kaldi
void Mjoin( PATL, trmm )
(
   const enum ATLAS_SIDE      SIDE,
   const enum ATLAS_UPLO      UPLO,
   const enum ATLAS_TRANS     TRANS,
   const enum ATLAS_DIAG      DIAG,
   const int                  M,
   const int                  N,
   const SCALAR               ALPHA,
   const TYPE                 * A,
   const int                  LDA,
   TYPE                       * B,
   const int                  LDB
)
{
/*
 * Purpose
 * =======
 *
 * Mjoin( PATL, trmm )  performs one of the matrix-matrix operations
 *
 *    B := alpha * op( A ) * B,   or    B := alpha * B * op( A ),
 *
 * where alpha is a scalar, B is an m by n matrix, A is a unit,  or non-
 * unit, upper or lower triangular matrix and op( X ) is one of
 *
 *    op( X ) = X   or   op( X ) = X'   or   op( X ) = conjg( X' ).
 *
 * This is a  recursive  version of the  algorithm.  For a more detailed
 * description of  the arguments of this function, see the reference im-
 * plementation in the  ATLAS/src/blas/reference directory.
 *
 * ---------------------------------------------------------------------
 */
/*
 * .. Local Variables ..
 */
#ifdef TREAL
   TYPE                       alpha0 = (TYPE)(ALPHA);
   const TYPE                 one = ATL_rone;
#else
   TYPE                       one[2] = { ATL_rone, ATL_rzero };
#endif
   TYPE                       * alpha;
   RC3_FUN_TRMM_T             ATL_rtrmm;
   RC3_TRMM_T                 type;
/* ..
 * .. Executable Statements ..
 *
 */
   if( ( M == 0 ) || ( N == 0 ) ) return;

   if( SCALAR_IS_ZERO( ALPHA ) )
   { Mjoin( PATL, gescal )( M, N, ALPHA, B, LDB ); return; }

#ifdef TREAL
   type.size   = sizeof( TYPE );    type.one = (void *)(&one);
   alpha       = &alpha0;
#else
   type.size   = sizeof( TYPE[2] ); type.one = (void *)one;
   alpha       = (TYPE *)(ALPHA);
#endif

   if( SIDE == AtlasLeft )
   {
      if( TRANS == AtlasNoTrans )
      {
         type.Tgemm = Mjoin( PATL, gemmNN_RB );
         if( UPLO == AtlasUpper )
         {
            ATL_rtrmm = ATL_rtrmmLUN;
            if( DIAG == AtlasNonUnit ) type.Ttrmm = Mjoin( PATL, trmmLUNN );
            else                       type.Ttrmm = Mjoin( PATL, trmmLUNU );
         }
         else
         {
            ATL_rtrmm = ATL_rtrmmLLN;
            if( DIAG == AtlasNonUnit ) type.Ttrmm = Mjoin( PATL, trmmLLNN );
            else                       type.Ttrmm = Mjoin( PATL, trmmLLNU );
         }
      }
#ifdef TREAL
      else
#else
      else if( TRANS == AtlasTrans )
#endif
      {
         type.Tgemm = Mjoin( PATL, gemmTN_RB );
         if( UPLO == AtlasUpper)
         {
            ATL_rtrmm = ATL_rtrmmLUT;
            if( DIAG == AtlasNonUnit ) type.Ttrmm = Mjoin( PATL, trmmLUTN );
            else                       type.Ttrmm = Mjoin( PATL, trmmLUTU );
         }
         else
         {
            ATL_rtrmm = ATL_rtrmmLLT;
            if( DIAG == AtlasNonUnit ) type.Ttrmm = Mjoin( PATL, trmmLLTN );
            else                       type.Ttrmm = Mjoin( PATL, trmmLLTU );
         }
      }
#ifdef TCPLX
      else
      {
         type.Tgemm = Mjoin( PATL, gemmCN_RB );
         if( UPLO == AtlasUpper )
         {
            ATL_rtrmm = ATL_rtrmmLUC;
            if( DIAG == AtlasNonUnit ) type.Ttrmm = Mjoin( PATL, trmmLUCN );
            else                       type.Ttrmm = Mjoin( PATL, trmmLUCU );
         }
         else
         {
            ATL_rtrmm = ATL_rtrmmLLC;
            if( DIAG == AtlasNonUnit ) type.Ttrmm = Mjoin( PATL, trmmLLCN );
            else                       type.Ttrmm = Mjoin( PATL, trmmLLCU );
         }
      }
#endif
   }
   else
   {
      if( TRANS == AtlasNoTrans )
示例#22
0
文件: ATL_hpmv.c 项目: certik/vendor
void Mjoin( PATL, hpmv )
(
   const enum ATLAS_UPLO      UPLO,
   const int                  N,
   const SCALAR               ALPHA,
   const TYPE                 * A,
   const TYPE                 * X,
   const int                  INCX,
   const SCALAR               BETA,
   TYPE                       * Y,
   const int                  INCY
)
{
/*
 * Purpose
 * =======
 *
 * Mjoin( PATL, hpmv ) performs the matrix-vector operation
 *
 *    y := alpha * A * x + beta * y,
 *
 * where alpha and beta are scalars, x and y are n-element vectors and A
 * is an n by n Hermitian matrix, supplied in packed form.
 *
 * This is a blocked version of the algorithm.  For a more detailed des-
 * cription of  the arguments of this function, see the reference imple-
 * mentation in the ATLAS/src/blas/reference directory.
 *
 * ---------------------------------------------------------------------
 */
/*
 * .. Local Variables ..
 */
   void                       (*gpmv0)( const int, const int, const SCALAR,
                              const TYPE *, const int, const TYPE *, const int,
                              const SCALAR, TYPE *, const int );
   void                       (*gpmv1)( const int, const int, const SCALAR,
                              const TYPE *, const int, const TYPE *, const int,
                              const SCALAR, TYPE *, const int );
   void                       (*gpmvN)( const int, const int, const SCALAR,
                              const TYPE *, const int, const TYPE *, const int,
                              const SCALAR, TYPE *, const int );
#ifdef TREAL
   TYPE                       alphaY, beta0;
#define one                   ATL_rone
#define zero                  ATL_rzero
#else
   const TYPE                 * alphaY, * beta0;
   const TYPE                 one [2] = { ATL_rone,  ATL_rzero },
                              zero[2] = { ATL_rzero, ATL_rzero };
#endif
   void                       * vx = NULL, * vy = NULL;
   TYPE                       * A0, * A1, * x, * x0, * x1, * y, * y00, * y0,
                              * y1;
   int                        incXY, incXY1, j, jb, lda, lda0, lda1, mb, mb1,
                              n, nb;
/* ..
 * .. Executable Statements ..
 *
 */
   if( N == 0 ) return;

   if( SCALAR_IS_ZERO( ALPHA ) )
   {
      if( !( SCALAR_IS_ONE( BETA ) ) ) Mjoin( PATL, scal )( N, BETA, Y, INCY );
      return;
   }

   if( ( INCX != 1 ) || ( ( INCY == 1 ) && !( SCALAR_IS_ONE( ALPHA ) ) ) )
   {
      vx = (void *)malloc( ATL_Cachelen + ATL_MulBySize( N ) );
      ATL_assert( vx ); x = ATL_AlignPtr( vx );
      Mjoin( PATL, cpsc )( N, ALPHA, X, INCX, x, 1 );
      alphaY = one;
   }
   else { x = (TYPE *)(X); alphaY = ALPHA; }

   if( ( INCY != 1 ) || !( SCALAR_IS_ONE( alphaY ) ) )
   {
      vy = malloc( ATL_Cachelen + ATL_MulBySize( N ) );
      ATL_assert( vy ); y00 = y = ATL_AlignPtr( vy );
      beta0 = zero;
   }
   else { y00 = y = (TYPE *)(Y); beta0 = BETA; }

   ATL_GetPartSPMV( A, N, &mb, &nb );

   mb1 = N - ( ( N - 1 ) / mb ) * mb; incXY1 = (nb SHIFT);

   if( UPLO == AtlasUpper )
   {
      if(      SCALAR_IS_ZERO( beta0 ) )
         gpmv0 = Mjoin( PATL, gpmvUC_a1_x1_b0_y1 );
      else if( SCALAR_IS_ONE ( beta0 ) )
         gpmv0 = Mjoin( PATL, gpmvUC_a1_x1_b1_y1 );
      else
         gpmv0 = Mjoin( PATL, gpmvUC_a1_x1_bX_y1 );
      gpmv1 = Mjoin( PATL, gpmvUC_a1_x1_b1_y1 );
      gpmvN = Mjoin( PATL, gpmvUN_a1_x1_b1_y1 );

      lda = 1; lda0 = lda; A0 = (TYPE *)(A); MUpnext( mb, A0, lda0 );
      incXY = (mb SHIFT); x0 = x + incXY; y0 = y + incXY;

      for( n = N - mb; n > 0; n -= mb, x0 += incXY, x += incXY,
           y0 += incXY, y += incXY )
      {
         Mjoin( PATL, hpmvU )( mb, A, lda, x, beta0, y );

         for( j = 0, lda1 = lda0, A1 = A0 - (mb SHIFT), x1 = x0, y1 = y0; j < n;
              j += nb, x1 += incXY1, y1 += incXY1 )
         {
            jb = n - j; jb = Mmin( jb, nb );
            gpmv0( jb, mb, one, A1, lda1, x,  1, beta0, y1, 1 );
            gpmvN( mb, jb, one, A1, lda1, x1, 1, one,   y,  1 );
            MUpnext( jb, A1, lda1 ); A1 -= (jb SHIFT);
         }
         beta0 = one; gpmv0 = gpmv1; lda = lda0; A = A0; MUpnext( mb, A0, lda0 );
      }
      Mjoin( PATL, hpmvU )( mb1, A, lda, x, beta0, y );
   }
   else
   {
      if(      SCALAR_IS_ZERO( beta0 ) )
         gpmv0 = Mjoin( PATL, gpmvLC_a1_x1_b0_y1 );
      else if( SCALAR_IS_ONE ( beta0 ) )
         gpmv0 = Mjoin( PATL, gpmvLC_a1_x1_b1_y1 );
      else
         gpmv0 = Mjoin( PATL, gpmvLC_a1_x1_bX_y1 );
      gpmv1 = Mjoin( PATL, gpmvLC_a1_x1_b1_y1 );
      gpmvN = Mjoin( PATL, gpmvLN_a1_x1_b1_y1 );

      lda = N; lda0 = lda; A0 = (TYPE *)(A); MLpnext( N, A, lda );
      incXY = (mb SHIFT); x0 = x; y0 = y;

      for( n  = N - mb, x += ((N-mb) SHIFT), y += ((N-mb) SHIFT); n > 0;
           n -= mb, x -= incXY, y -= incXY )
      {
         MLpprev( mb, A, lda );
         Mjoin( PATL, hpmvL )( mb, A, lda, x, beta0, y );

         for( j = 0, lda1 = lda0, A1 = A0 + (n SHIFT), x1 = x0, y1 = y0; j < n;
              j += nb, x1 += incXY1, y1 += incXY1 )
         {
            jb = n - j; jb = Mmin( jb, nb );
            gpmv0( jb, mb, one, A1, lda1, x,  1, beta0, y1, 1 );
            gpmvN( mb, jb, one, A1, lda1, x1, 1, one,   y,  1 );
            MLpnext( jb, A1, lda1 ); A1 -= (jb SHIFT);
         }
         beta0 = one; gpmv0 = gpmv1;
      }
      Mjoin( PATL, hpmvL )( mb1, A0, lda0, x0, beta0, y0 );
   }

   if( vx ) free( vx );
   if( vy )
   { Mjoin( PATL, axpby )( N, alphaY, y00, 1, BETA, Y, INCY ); free( vy ); }
/*
 * End of Mjoin( PATL, hpmv )
 */
}
示例#23
0
文件: ATL_mmIJK.c 项目: apollos/atlas
void Mjoin(PATL,mmIJK2)(int K, int nMb, int nNb, int nKb, int ib, int jb,
                        int kb, const SCALAR alpha, const TYPE *A, int lda,
                        TYPE *pA0, int incA, MAT2BLK A2blk, const TYPE *pB0,
                        const SCALAR beta, TYPE *C, int ldc, TYPE *pC,
                        PUTBLK putblk, NBMM0 NBmm0)
/*
 * Outer three loops for matmul with outer loop over rows of A
 */
{
    int i, j, ldpc;
    const int ZEROC = ((putblk == NULL) && SCALAR_IS_ZERO(beta));
    const int incK = ATL_MulByNB(K), incC = ATL_MulByNB(ldc);
    TYPE *pA=pA0, *stA=pA0+ATL_MulByNBNB(nKb);
    const TYPE *pB=pB0;
    const TYPE cubeta = ( (putblk) ? ATL_rzero : beta );
    TYPE *c;

    if (putblk)
    {
        ldpc = NB;
        if (!nKb && kb) Mjoin(PATL,gezero)(MB, NB, pC, MB);
    }
    else ldpc = ldc;
    for (i=nMb; i; i--)    /* loop over full row panels of A */
    {
        if (A)
        {
            A2blk(K, NB, A, lda, pA, alpha);  /* get 1 row panel of A */
            A += incA;
        }
        if (!putblk) pC = C;
        c = C;
        C += NB;
        for (j=nNb; j; j--)  /* full column panels of B */
        {
            if (nKb)
            {
                NBmm0(MB, NB, KB, ATL_rone, pA, KB, pB, KB, beta, pC, ldpc);
                pA += NBNB;
                pB += NBNB;
                if (nKb != 1)
                {
                    do
                    {
                        NBmm(MB, NB, KB, ATL_rone, pA, KB, pB, KB, ATL_rone,
                             pC, ldpc);
                        pA += NBNB;
                        pB += NBNB;
                    }
                    while (pA != stA);
                }
                if (kb)
                {
                    KBmm(MB, NB, kb, ATL_rone, pA, kb, pB, kb, ATL_rone, pC, ldpc);
                    pB += kb*NB;
                }
            }
            else
            {
                if (ZEROC) Mjoin(PATL,gezero)(MB, NB, pC, ldpc);
                if (kb)
                {
                    KBmm(MB, NB, kb, ATL_rone, pA, kb, pB, kb, cubeta, pC, ldpc);
                    pB += kb*NB;
                }
            }
            pA = pA0;
            if (putblk) putblk(NB, NB, pC, c, ldc, beta);
            else pC += incC;
            c += incC;
        }
        if (jb)
        {
            NBJBmm(jb, K, pA, pB, cubeta, pC, ldpc);
            if (putblk) putblk(NB, jb, pC, c, ldc, beta);
        }
        pB = pB0;
        if (!A)
        {
            pA0 += incK;
            pA = pA0;
            stA += incK;
        }
    }
    if (ib)
    {
        c = C;
        if (A) A2blk(K, ib, A, lda, pA, alpha);  /* get last row panel of A */
        for (j=nNb; j; j--)  /* full column panels of B */
        {
            if (putblk)
            {
                IBNBmm(ib, K, pA, pB, ATL_rzero, pC, ib);
                putblk(ib, NB, pC, c, ldc, beta);
            }
            else IBNBmm(ib, K, pA, pB, beta, c, ldc);
            pB += incK;
            c += incC;
        }
        if (jb)
        {
            if (putblk)
            {
                IBJBmm(ib, jb, K, pA, pB, ATL_rzero, pC, ib);
                putblk(ib, jb, pC, c, ldc, beta);
            }
            else IBJBmm(ib, jb, K, pA, pB, beta, c, ldc);
        }
    }
}
示例#24
0
文件: ATL_ger.c 项目: AIDman/Kaldi
void ATL_ger
   (const int M, const int N, const SCALAR alpha0, const TYPE *X,
    const int incX, const TYPE *Y, const int incY, TYPE *A, const int lda)
{
#ifdef ATL_CHOICE
   const size_t opsize = (M*N+M+N)*sizeof(TYPE)SHIFT;

   if (opsize <= ATL_MulBySize(ATL_L1elts))
      Mjoin(MY_GER,_L1)(M, N, alpha0, X, incX, Y, incY, A, lda);
   else if (opsize <= MY_CE)
      Mjoin(MY_GER,_L2)(M, N, alpha0, X, incX, Y, incY, A, lda);
   else
      Mjoin(MY_GER,_OOC)(M, N, alpha0, X, incX, Y, incY, A, lda);
#else
   void (*getX)(const int N, const SCALAR alpha, const TYPE *X,
                const int incX, TYPE *Y, const int incY);
   ATL_r1kern_t gerk;
   void *vp=NULL;
   TYPE *x = (TYPE*)X, *y = (TYPE*)Y;
   size_t t1, t2;
   ATL_INT m, Nm, nr, CacheElts, mb, imb, incy=1;
   int mu, nu, alignX, alignY, ALIGNX2A, ForceNU, COPYX, COPYY, APPLYALPHAX;
   int minM, minN;
   #ifdef TREAL
      #define one ATL_rone
      TYPE alpha = alpha0;
      const int ALPHA_IS_ONE = (alpha0 == ATL_rone);
   #else
      TYPE one[2] = {ATL_rone, ATL_rzero}, *alpha=(TYPE*)alpha0;
      const int ALPHA_IS_ONE = (alpha0[0] == ATL_rone && alpha[1] == ATL_rzero);
   #endif

   if (M < 1 || N < 1 || SCALAR_IS_ZERO(alpha))
      return;
/*
 * Get gerk kernel pointer along with any usage guidelines, and use the
 * optimized CacheElts to compute the correct blocking factor
 */
   gerk = ATL_GetR1Kern(M, N, A, lda, &mu, &nu, &minM, &minN, &alignX,
                         &ALIGNX2A, &alignY, &ForceNU, &CacheElts);
   if (CacheElts)
   {
      mb = (CacheElts - 2*nu) / (2*(nu+1));
      mb = (mb > mu) ? (mb/mu)*mu : M;
      mb = (mb > M) ? M : mb;
   }
   else
      mb = M;
/*
 * Set up to handle case where kernel requres N to be a multiple if NU
 */
   if (ForceNU)
   {
      Nm = (N/nu)*nu;
      nr = N - Nm;
   }
   else
   {
      Nm = N;
      nr = 0;
   }
/*
 * For very small N, we can't afford the data copy, so call AXPY-based routine
 */
   if (N < 4 || Nm < 1)
   {
      MY_GERK_AXPY(M, N, alpha0, X, incX, Y, incY, A, lda);
      return;
   }
/*
 * ATLAS's GER kernels loop over M in inner loop, which is bad news if M is
 * very small.  Call code that requires no copy of A & B for these degenerate
 * cases
 */
   if (M < 16 || M < minM)
   {
      MY_GERK_MLT16(M, N, alpha0, X, incX, Y, incY, A, lda);
      return;
   }
/*
 *****************************************************************************
 Figure out whether vecs need be copied, and which one will be scaled by alpha
 *****************************************************************************
 */
   #ifdef Conj_
      COPYY = 1;
   #else
      COPYY = (incY != 1);
      if (!COPYY && alignY)
      {
         t1 = (size_t) Y;
         COPYY = ((t1/alignY)*alignY != t1);
      }
   #endif
   COPYX = (incX != 1);
   if (!COPYX)  /* may still need to copy due to alignment issues */
   {
/*
 *    ATL_Cachelen is the highest alignment that can be requested, so
 *    make X's % with Cachelen match that of A if you want A & X to have
 *    the same alignment
 */
      if (ALIGNX2A)
      {
         t1 = (size_t) A;
         t2 = (size_t) X;
         COPYX = (t1 - ATL_MulByCachelen(ATL_DivByCachelen(t1))) !=
                 (t2 - ATL_MulByCachelen(ATL_DivByCachelen(t2)));
      }
      else if (alignX)
      {
         t1 = (size_t) X;
         COPYX = ((t1/alignX)*alignX != t1);
      }
   }
   if (COPYX != COPYY)         /* if only one of them is already being copied */
      APPLYALPHAX = COPYX;     /* apply alpha to that one */
   else if (!COPYY && !COPYX)  /* nobody currently being copied means */
   {                           /* we'll need to force a copy to apply alpha */
      APPLYALPHAX = (M < N);   /* apply alpha to shorter vector */
      if (!ALPHA_IS_ONE)       /* force copy if alpha != 1.0 */
      {
         COPYX = APPLYALPHAX;
         COPYY = !APPLYALPHAX;
      }
   }
   else                        /* if both are being copied anyway */
      APPLYALPHAX = (M < N);   /* apply alpha to shorter vector */

   if (COPYX | COPYY)         /* if I need to copy either vector */
   {                          /* allocate & align them */
      vp = malloc(ATL_MulBySize(COPYX*mb+COPYY*N) + 2*ATL_Cachelen);
/*
 *    If we cannot allocate enough space to copy the vectors, give up and
 *    call the simple loop-based implementation
 */
      if (!vp)
      {
         MY_GERK_AXPY(M, N, alpha0, X, incX, Y, incY, A, lda);
         return;
      }
      if (COPYY)
      {
         y = ATL_AlignPtr(vp);
         x = y + (N SHIFT);
         x = (ALIGNX2A) ? ATL_Align2Ptr(x, A) : ATL_AlignPtr(x);
         if (!APPLYALPHAX && !ALPHA_IS_ONE)  /* need to apply alpha to Y */
         {
            #ifdef Conj_
               Mjoin(PATL,moveConj)(N, alpha, Y, incY, y, 1);
            #else
               Mjoin(PATL,cpsc)(N, alpha, Y, incY, y, 1);
            #endif
            alpha = one;
         }
         else  /* do not apply alpha */
         #ifdef Conj_
            Mjoin(PATL,copyConj)(N, Y, incY, y, 1);
         #else
            Mjoin(PATL,copy)(N, Y, incY, y, 1);
         #endif
      }
      else if (ALIGNX2A)
         x = ATL_Align2Ptr(vp, A);
      else
         x = ATL_AlignPtr(vp);
   }
   getX = (COPYX) ? Mjoin(PATL,cpsc) : NULL;
   m = M;
   do
   {
      imb = Mmin(mb, m);
      if (getX)    /* copy X if necessary */
         getX(imb, alpha, X, incX, x, 1);
      else
         x = (TYPE*) X;
/*
 *    Call optimized kernel (can be restricted or general)
 */
      if (imb > minM)
         gerk(imb, Nm, x, y, A, lda);
      else
         Mjoin(PATL,gerk_Mlt16)(imb, Nm, one, x, 1, y, 1, A, lda);
/*
 *    Some kernels require N%NU=0; if so nr is remainder, do cleanup with axpy
 */
      if (nr)
         Mjoin(PATL,gerk_axpy)(imb, nr, one, x, 1, y+(Nm SHIFT), 1,
                               A+((size_t)lda)*(Nm SHIFT), lda);
      A += imb SHIFT;
      X += (imb*incX)SHIFT;
      m -= imb;
      imb = Mmin(m,mb);
   }
   while(m);
   if (vp)
      free(vp);
#endif
}
GLOBAL Int UMFPACK_numeric
(
    const Int Ap [ ],
    const Int Ai [ ],
    const double Ax [ ],
#ifdef COMPLEX
    const double Az [ ],
#endif
    void *SymbolicHandle,
    void **NumericHandle,
    const double Control [UMFPACK_CONTROL],
    double User_Info [UMFPACK_INFO]
)
{

    /* ---------------------------------------------------------------------- */
    /* local variables */
    /* ---------------------------------------------------------------------- */

    double Info2 [UMFPACK_INFO], alloc_init, relpt, relpt2, droptol,
	front_alloc_init, stats [2] ;
    double *Info ;
    WorkType WorkSpace, *Work ;
    NumericType *Numeric ;
    SymbolicType *Symbolic ;
    Int n_row, n_col, n_inner, newsize, i, status, *inew, npiv, ulen, scale ;
    Unit *mnew ;

    /* ---------------------------------------------------------------------- */
    /* get the amount of time used by the process so far */
    /* ---------------------------------------------------------------------- */

    umfpack_tic (stats) ;

    /* ---------------------------------------------------------------------- */
    /* initialize and check inputs */
    /* ---------------------------------------------------------------------- */

#ifndef NDEBUG
    UMF_dump_start ( ) ;
    init_count = UMF_malloc_count ;
    DEBUGm4 (("\nUMFPACK numeric: U transpose version\n")) ;
#endif

    /* If front_alloc_init negative then allocate that size of front in
     * UMF_start_front.  If alloc_init negative, then allocate that initial
     * size of Numeric->Memory. */

    relpt = GET_CONTROL (UMFPACK_PIVOT_TOLERANCE,
	UMFPACK_DEFAULT_PIVOT_TOLERANCE) ;
    relpt2 = GET_CONTROL (UMFPACK_SYM_PIVOT_TOLERANCE,
	UMFPACK_DEFAULT_SYM_PIVOT_TOLERANCE) ;
    alloc_init = GET_CONTROL (UMFPACK_ALLOC_INIT, UMFPACK_DEFAULT_ALLOC_INIT) ;
    front_alloc_init = GET_CONTROL (UMFPACK_FRONT_ALLOC_INIT,
	UMFPACK_DEFAULT_FRONT_ALLOC_INIT) ;
    scale = GET_CONTROL (UMFPACK_SCALE, UMFPACK_DEFAULT_SCALE) ;
    droptol = GET_CONTROL (UMFPACK_DROPTOL, UMFPACK_DEFAULT_DROPTOL) ;

    relpt   = MAX (0.0, MIN (relpt,  1.0)) ;
    relpt2  = MAX (0.0, MIN (relpt2, 1.0)) ;
    droptol = MAX (0.0, droptol) ;
    front_alloc_init = MIN (1.0, front_alloc_init) ;

    if (scale != UMFPACK_SCALE_NONE && scale != UMFPACK_SCALE_MAX)
    {
	scale = UMFPACK_DEFAULT_SCALE ;
    }

    if (User_Info != (double *) NULL)
    {
	/* return Info in user's array */
	Info = User_Info ;
	/* clear the parts of Info that are set by UMFPACK_numeric */
	for (i = UMFPACK_NUMERIC_SIZE ; i <= UMFPACK_MAX_FRONT_NCOLS ; i++)
	{
	    Info [i] = EMPTY ;
	}
	for (i = UMFPACK_NUMERIC_DEFRAG ; i < UMFPACK_IR_TAKEN ; i++)
	{
	    Info [i] = EMPTY ;
	}
    }
    else
    {
	/* no Info array passed - use local one instead */
	Info = Info2 ;
	for (i = 0 ; i < UMFPACK_INFO ; i++)
	{
	    Info [i] = EMPTY ;
	}
    }

    Symbolic = (SymbolicType *) SymbolicHandle ;
    Numeric = (NumericType *) NULL ;
    if (!UMF_valid_symbolic (Symbolic))
    {
	Info [UMFPACK_STATUS] = UMFPACK_ERROR_invalid_Symbolic_object ;
	return (UMFPACK_ERROR_invalid_Symbolic_object) ;
    }

    /* compute alloc_init automatically for AMD or other symmetric ordering */
    if (/* Symbolic->ordering == UMFPACK_ORDERING_AMD */ alloc_init >= 0
        && Symbolic->amd_lunz > 0)
    {
	alloc_init = (Symbolic->nz + Symbolic->amd_lunz) / Symbolic->lunz_bound;
	alloc_init = MIN (1.0, alloc_init) ;
	alloc_init *= UMF_REALLOC_INCREASE ;
    }

    n_row = Symbolic->n_row ;
    n_col = Symbolic->n_col ;
    n_inner = MIN (n_row, n_col) ;

    /* check for integer overflow in Numeric->Memory minimum size */
    if (INT_OVERFLOW (Symbolic->dnum_mem_init_usage * sizeof (Unit)))
    {
	/* :: int overflow, initial Numeric->Memory size :: */
	/* There's no hope to allocate a Numeric object big enough simply to
	 * hold the initial matrix, so return an out-of-memory condition */
	DEBUGm4 (("out of memory: numeric int overflow\n")) ;
	Info [UMFPACK_STATUS] = UMFPACK_ERROR_out_of_memory ;
	return (UMFPACK_ERROR_out_of_memory) ;
    }

    Info [UMFPACK_STATUS] = UMFPACK_OK ;
    Info [UMFPACK_NROW] = n_row ;
    Info [UMFPACK_NCOL] = n_col ;
    Info [UMFPACK_SIZE_OF_UNIT] = (double) (sizeof (Unit)) ;

    if (!Ap || !Ai || !Ax || !NumericHandle)
    {
	Info [UMFPACK_STATUS] = UMFPACK_ERROR_argument_missing ;
	return (UMFPACK_ERROR_argument_missing) ;
    }

    Info [UMFPACK_NZ] = Ap [n_col] ;
    *NumericHandle = (void *) NULL ;

    /* ---------------------------------------------------------------------- */
    /* allocate the Work object */
    /* ---------------------------------------------------------------------- */

    /* (1) calls UMF_malloc 15 or 17 times, to obtain temporary workspace of
     * size c+1 Entry's and 2*(n_row+1) + 3*(n_col+1) + (n_col+n_inner+1) +
     * (nn+1) + * 3*(c+1) + 2*(r+1) + max(r,c) + (nfr+1) integers plus 2*nn
     * more integers if diagonal pivoting is to be done.  r is the maximum
     * number of rows in any frontal matrix, c is the maximum number of columns
     * in any frontal matrix, n_inner is min (n_row,n_col), nn is
     * max (n_row,n_col), and nfr is the number of frontal matrices.  For a
     * square matrix, this is c+1 Entry's and about 8n + 3c + 2r + max(r,c) +
     * nfr integers, plus 2n more for diagonal pivoting.
     */

    Work = &WorkSpace ;
    Work->n_row = n_row ;
    Work->n_col = n_col ;
    Work->nfr = Symbolic->nfr ;
    Work->nb = Symbolic->nb ;
    Work->n1 = Symbolic->n1 ;

    if (!work_alloc (Work, Symbolic))
    {
	DEBUGm4 (("out of memory: numeric work\n")) ;
	Info [UMFPACK_STATUS] = UMFPACK_ERROR_out_of_memory ;
	error (&Numeric, Work) ;
	return (UMFPACK_ERROR_out_of_memory) ;
    }
    ASSERT (UMF_malloc_count == init_count + 16 + 2*Symbolic->prefer_diagonal) ;

    /* ---------------------------------------------------------------------- */
    /* allocate Numeric object */
    /* ---------------------------------------------------------------------- */

    /* (2) calls UMF_malloc 10 or 11 times, for a total space of
     * sizeof (NumericType) bytes, 4*(n_row+1) + 4*(n_row+1) integers, and
     * (n_inner+1) Entry's, plus n_row Entry's if row scaling is to be done.
     * sizeof (NumericType) is a small constant.  Next, it calls UMF_malloc
     * once, for the variable-sized part of the Numeric object
     * (Numeric->Memory).  The size of this object is the larger of
     * (Control [UMFPACK_ALLOC_INIT]) *  (the approximate upper bound computed
     * by UMFPACK_symbolic), and the minimum required to start the numerical
     * factorization.  * This request is reduced if it fails.
     */

    if (!numeric_alloc (&Numeric, Symbolic, alloc_init, scale))
    {
	DEBUGm4 (("out of memory: initial numeric\n")) ;
	Info [UMFPACK_STATUS] = UMFPACK_ERROR_out_of_memory ;
	error (&Numeric, Work) ;
	return (UMFPACK_ERROR_out_of_memory) ;
    }
    DEBUG0 (("malloc: init_count "ID" UMF_malloc_count "ID"\n",
	init_count, UMF_malloc_count)) ;
    ASSERT (UMF_malloc_count == init_count
	+ (16 + 2*Symbolic->prefer_diagonal)
	+ (11 + (scale != UMFPACK_SCALE_NONE))) ;

    /* set control parameters */
    Numeric->relpt = relpt ;
    Numeric->relpt2 = relpt2 ;
    Numeric->droptol = droptol ;
    Numeric->alloc_init = alloc_init ;
    Numeric->front_alloc_init = front_alloc_init ;
    Numeric->scale = scale ;

    DEBUG0 (("umf relpt %g %g init %g %g inc %g red %g\n",
	relpt, relpt2, alloc_init, front_alloc_init,
	UMF_REALLOC_INCREASE, UMF_REALLOC_REDUCTION)) ;

    /* ---------------------------------------------------------------------- */
    /* scale and factorize */
    /* ---------------------------------------------------------------------- */

    /* (3) During numerical factorization (inside UMF_kernel), the variable-size
     * block of memory is increased in size via a call to UMF_realloc if it is
     * found to be too small.  During factorization, this block holds the
     * pattern and values of L and U at the top end, and the elements
     * (contibution blocks) and the current frontal matrix (Work->F*) at the
     * bottom end.  The peak size of the variable-sized object is estimated in
     * UMFPACK_*symbolic (Info [UMFPACK_VARIABLE_PEAK_ESTIMATE]), although this
     * upper bound can be very loose.  The size of the Symbolic object
     * (which is currently allocated) is in Info [UMFPACK_SYMBOLIC_SIZE], and
     * is between 2*n and 13*n integers.
     */

    DEBUG0 (("Calling umf_kernel\n")) ;
    status = UMF_kernel (Ap, Ai, Ax,
#ifdef COMPLEX
	Az,
#endif
	Numeric, Work, Symbolic) ;

    Info [UMFPACK_STATUS] = status ;
    if (status < UMFPACK_OK)
    {
	/* out of memory, or pattern has changed */
	error (&Numeric, Work) ;
	return (status) ;
    }

    Info [UMFPACK_FORCED_UPDATES] = Work->nforced ;
    Info [UMFPACK_VARIABLE_INIT] = Numeric->init_usage ;
    if (Symbolic->prefer_diagonal)
    {
	Info [UMFPACK_NOFF_DIAG] = Work->noff_diagonal ;
    }

    DEBUG0 (("malloc: init_count "ID" UMF_malloc_count "ID"\n",
	init_count, UMF_malloc_count)) ;

    npiv = Numeric->npiv ;	/* = n_inner for nonsingular matrices */
    ulen = Numeric->ulen ;	/* = 0 for square nonsingular matrices */

    /* ---------------------------------------------------------------------- */
    /* free Work object */
    /* ---------------------------------------------------------------------- */

    /* (4) After numerical factorization all of the objects allocated in step
     * (1) are freed via UMF_free, except that one object of size n_col+1 is
     * kept if there are off-diagonal nonzeros in the last pivot row (can only
     * occur for singular or rectangular matrices).  This is Work->Upattern,
     * which is transfered to Numeric->Upattern if ulen > 0.
     */

    DEBUG0 (("malloc: init_count "ID" UMF_malloc_count "ID"\n",
	init_count, UMF_malloc_count)) ;

    free_work (Work) ;

    DEBUG0 (("malloc: init_count "ID" UMF_malloc_count "ID"\n",
	init_count, UMF_malloc_count)) ;
    DEBUG0 (("Numeric->ulen: "ID" scale: "ID"\n", ulen, scale)) ;
    ASSERT (UMF_malloc_count == init_count + (ulen > 0) +
	(11 + (scale != UMFPACK_SCALE_NONE))) ;

    /* ---------------------------------------------------------------------- */
    /* reduce Lpos, Lilen, Lip, Upos, Uilen and Uip to size npiv+1 */
    /* ---------------------------------------------------------------------- */

    /* (5) Six components of the Numeric object are reduced in size if the
     * matrix is singular or rectangular.   The original size is 3*(n_row+1) +
     * 3*(n_col+1) integers.  The new size is 6*(npiv+1) integers.  For
     * square non-singular matrices, these two sizes are the same.
     */

    if (npiv < n_row)
    {
	/* reduce Lpos, Uilen, and Uip from size n_row+1 to size npiv */
	inew = (Int *) UMF_realloc (Numeric->Lpos, npiv+1, sizeof (Int)) ;
	if (inew)
	{
	    Numeric->Lpos = inew ;
	}
	inew = (Int *) UMF_realloc (Numeric->Uilen, npiv+1, sizeof (Int)) ;
	if (inew)
	{
	    Numeric->Uilen = inew ;
	}
	inew = (Int *) UMF_realloc (Numeric->Uip, npiv+1, sizeof (Int)) ;
	if (inew)
	{
	    Numeric->Uip = inew ;
	}
    }

    if (npiv < n_col)
    {
	/* reduce Upos, Lilen, and Lip from size n_col+1 to size npiv */
	inew = (Int *) UMF_realloc (Numeric->Upos, npiv+1, sizeof (Int)) ;
	if (inew)
	{
	    Numeric->Upos = inew ;
	}
	inew = (Int *) UMF_realloc (Numeric->Lilen, npiv+1, sizeof (Int)) ;
	if (inew)
	{
	    Numeric->Lilen = inew ;
	}
	inew = (Int *) UMF_realloc (Numeric->Lip, npiv+1, sizeof (Int)) ;
	if (inew)
	{
	    Numeric->Lip = inew ;
	}
    }

    /* ---------------------------------------------------------------------- */
    /* reduce Numeric->Upattern from size n_col+1 to size ulen+1 */
    /* ---------------------------------------------------------------------- */

    /* (6) The size of Numeric->Upattern (formerly Work->Upattern) is reduced
     * from size n_col+1 to size ulen + 1.  If ulen is zero, the object does
     * not exist. */

    DEBUG4 (("ulen: "ID" Upattern "ID"\n", ulen, (Int) Numeric->Upattern)) ;
    ASSERT (IMPLIES (ulen == 0, Numeric->Upattern == (Int *) NULL)) ;
    if (ulen > 0 && ulen < n_col)
    {
	inew = (Int *) UMF_realloc (Numeric->Upattern, ulen+1, sizeof (Int)) ;
	if (inew)
	{
	    Numeric->Upattern = inew ;
	}
    }

    /* ---------------------------------------------------------------------- */
    /* reduce Numeric->Memory to hold just the LU factors at the head */
    /* ---------------------------------------------------------------------- */

    /* (7) The variable-sized block (Numeric->Memory) is reduced to hold just L
     * and U, via a call to UMF_realloc, since the frontal matrices are no
     * longer needed.
     */

    newsize = Numeric->ihead ;
    if (newsize < Numeric->size)
    {
	mnew = (Unit *) UMF_realloc (Numeric->Memory, newsize, sizeof (Unit)) ;
	if (mnew)
	{
	    /* realloc succeeded (how can it fail since the size is reduced?) */
	    Numeric->Memory = mnew ;
	    Numeric->size = newsize ;
	}
    }
    Numeric->ihead = Numeric->size ;
    Numeric->itail = Numeric->ihead ;
    Numeric->tail_usage = 0 ;
    Numeric->ibig = EMPTY ;
    /* UMF_mem_alloc_tail_block can no longer be called (no tail marker) */

    /* ---------------------------------------------------------------------- */
    /* report the results and return the Numeric object */
    /* ---------------------------------------------------------------------- */

    UMF_set_stats (
	Info,
	Symbolic,
	(double) Numeric->max_usage,	/* actual peak Numeric->Memory */
	(double) Numeric->size,		/* actual final Numeric->Memory */
	Numeric->flops,			/* actual "true flops" */
	(double) Numeric->lnz + n_inner,		/* actual nz in L */
	(double) Numeric->unz + Numeric->nnzpiv,	/* actual nz in U */
	(double) Numeric->maxfrsize,	/* actual largest front size */
	(double) ulen,			/* actual Numeric->Upattern size */
	(double) npiv,			/* actual # pivots found */
	(double) Numeric->maxnrows,	/* actual largest #rows in front */
	(double) Numeric->maxncols,	/* actual largest #cols in front */
	scale != UMFPACK_SCALE_NONE,
	Symbolic->prefer_diagonal,
	ACTUAL) ;

    Info [UMFPACK_ALLOC_INIT_USED] = Numeric->alloc_init ;
    Info [UMFPACK_NUMERIC_DEFRAG] = Numeric->ngarbage ;
    Info [UMFPACK_NUMERIC_REALLOC] = Numeric->nrealloc ;
    Info [UMFPACK_NUMERIC_COSTLY_REALLOC] = Numeric->ncostly ;
    Info [UMFPACK_COMPRESSED_PATTERN] = Numeric->isize ;
    Info [UMFPACK_LU_ENTRIES] = Numeric->nLentries + Numeric->nUentries +
	    Numeric->npiv ;
    Info [UMFPACK_UDIAG_NZ] = Numeric->nnzpiv ;
    Info [UMFPACK_RSMIN] = Numeric->rsmin ;
    Info [UMFPACK_RSMAX] = Numeric->rsmax ;
    Info [UMFPACK_WAS_SCALED] = Numeric->scale ;

    /* nz in L and U with no dropping of small entries */
    Info [UMFPACK_ALL_LNZ] = Numeric->all_lnz + n_inner ;
    Info [UMFPACK_ALL_UNZ] = Numeric->all_unz + Numeric->nnzpiv ;
    Info [UMFPACK_NZDROPPED] =
	  (Numeric->all_lnz - Numeric->lnz)
	+ (Numeric->all_unz - Numeric->unz) ;

    /* estimate of the reciprocal of the condition number. */
    if (SCALAR_IS_ZERO (Numeric->min_udiag)
     || SCALAR_IS_ZERO (Numeric->max_udiag)
     ||	SCALAR_IS_NAN (Numeric->min_udiag)
     ||	SCALAR_IS_NAN (Numeric->max_udiag))
    {
	/* rcond is zero if there is any zero or NaN on the diagonal */
	Numeric->rcond = 0.0 ;
    }
    else
    {
	/* estimate of the recipricol of the condition number. */
	/* This is NaN if diagonal is zero-free, but has one or more NaN's. */
	Numeric->rcond = Numeric->min_udiag / Numeric->max_udiag ;
    }
    Info [UMFPACK_UMIN]  = Numeric->min_udiag ;
    Info [UMFPACK_UMAX]  = Numeric->max_udiag ;
    Info [UMFPACK_RCOND] = Numeric->rcond ;

    if (Numeric->nnzpiv < n_inner
    || SCALAR_IS_ZERO (Numeric->rcond) || SCALAR_IS_NAN (Numeric->rcond))
    {
	/* there are zeros and/or NaN's on the diagonal of U */
	DEBUG0 (("Warning, matrix is singular in umfpack_numeric\n")) ;
	DEBUG0 (("nnzpiv "ID" n_inner "ID" rcond %g\n", Numeric->nnzpiv,
	    n_inner, Numeric->rcond)) ;
	status = UMFPACK_WARNING_singular_matrix ;
	Info [UMFPACK_STATUS] = status ;
    }

    Numeric->valid = NUMERIC_VALID ;
    *NumericHandle = (void *) Numeric ;

    /* Numeric has 11 to 13 objects */
    ASSERT (UMF_malloc_count == init_count + 11 +
	+ (ulen > 0)			    /* Numeric->Upattern */
	+ (scale != UMFPACK_SCALE_NONE)) ;  /* Numeric->Rs */

    /* ---------------------------------------------------------------------- */
    /* get the time used by UMFPACK_numeric */
    /* ---------------------------------------------------------------------- */

    umfpack_toc (stats) ;
    Info [UMFPACK_NUMERIC_WALLTIME] = stats [0] ;
    Info [UMFPACK_NUMERIC_TIME] = stats [1] ;

    /* return UMFPACK_OK or UMFPACK_WARNING_singular_matrix */
    return (status) ;

}
示例#26
0
int Mjoin(PATL,mmJITcp)(const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
                        const int M0, const int N, const int K,
                        const SCALAR alpha, const TYPE *A, const int lda,
                        const TYPE *B, const int ldb, const SCALAR beta,
                        TYPE *C, const int ldc)
/*
 * Copy matmul algorithm, copies A and B on-the-fly
 * If M < 0, allocates only (MB+NB)*KB workspace
 */
{
   void *v=NULL;
   const TYPE *a=A;
   TYPE *pA, *pB, *pB0;
   MAT2BLK2 A2blk, B2blk;
   NBMM0 NBmm0, NBmm1, pNBmm0;
   const int M = (M0 >= 0) ? M0 : -M0;
   int nkblks, nmblks, nnblks, mr, nr, kr, KR, bigK, h, i, j, ZEROC;
   size_t incAk, incBk, incAm, incBn, incAW, incAWp, incBW, incBWp, incW;

/*
 * If both M and N <= NB, and one of them is not full, call BPP, which
 * can sometimes avoid doing cleanup forall cases
 */
   if (M <= MB && N <= NB && (M != MB || N != NB))
      return(Mjoin(PATL,mmBPP)(TA, TB, M, N, K, alpha, A, lda, B, ldb,
                               beta, C, ldc));
/*
 * If these workspace increments are 0, we do JIT NBxNB copies instead of
 * copying entire array/panel.  Don't copy mat if you can't reuse it.
 */
   if (M0 > 0)
   {
      incAW = (N > NB) ? KB*MB : 0;
      incBW = (M > NB) ? KB*NB : 0;
   }
   else /* allocate in minimal space */
      incAW = incBW = 0;
   nmblks = M/MB;
   nnblks = N/NB;
   nkblks = K/KB;
   mr = M - nmblks*MB;
   nr = N - nnblks*NB;
   kr = K - nkblks*KB;
/*
 * K-loop is special, in that we don't call user cleanup, must explicitly zero,
 * and K-cleanup is typically slower even for generated kernels.  Therefore,
 * allow extra leaway for doing extra flops.  Note error is unaffected by
 * any of these extra flops: K-loop has elts zeroed, and multiplying zeros
 * and adding in zeros doesn't add to error
 */
   KR = (kr && kr+4 >= KB) ? KB : kr;
   bigK = nkblks*KB+KR;
   if (incAW)
   {
      i = MB*bigK;
      incAWp = KB*mr;
   }
   else
   {
      i = MB*KB;
      incAWp = 0;
   }
   if (incBW)
   {
      incBWp = KB*nr;
      incW = bigK*NB;
      i += N*bigK;
   }
   else
   {
      incBWp = incW = 0;
      i += NB*KB;
   }
   i *= sizeof(TYPE);
   if (i <= ATL_MaxMalloc || !(incAW | incBW))
      v = malloc(ATL_Cachelen+i);
   if (!v) return(-1);
   pA = ATL_AlignPtr(v);
   pB0 = pA + (incAW ? bigK*MB : KB*MB);
   if (TA == AtlasNoTrans)
   {
      A2blk = Mjoin(PATL,gemoveT);
      incAk = lda*KB;
      incAm = MB;
   }
   else
   {
      A2blk = Mjoin(PATL,gemove);
      incAk = KB;
      incAm = MB*lda;
   }
   if (TB == AtlasNoTrans)
   {
      B2blk = Mjoin(PATL,gemove);
      incBk = KB;
      incBn = NB*ldb;
   }
   else
   {
      B2blk = Mjoin(PATL,gemoveT);
      incBk = ldb*KB;
      incBn = NB;
   }
/*
 * See what kernel we're calling
 */
   if ( SCALAR_IS_ONE(beta) )
   {
      NBmm0 = NBmm_b1;
      pNBmm0 = Mjoin(PATL,pNBmm_b1);
   }
   else if ( SCALAR_IS_ZERO(beta) )
   {
      NBmm0 = NBmm_b0;
      pNBmm0 = Mjoin(PATL,pNBmm_b0);
   }
   else
   {
      NBmm0 = NBmm_bX;
      pNBmm0 = Mjoin(PATL,pNBmm_bX);
   }
   KR = (KR == KB) ? KB : 0;
   ZEROC = !KR && SCALAR_IS_ZERO(beta);

   for (i=0; i < nmblks; i++)
   {
      a = A+i*incAm;
      pB = pB0;       /* foreach row-panel of A, start at B's copy space */
      for (j=nnblks; j; j--)
      {
         Mjoin(PATL,mmK)(MB, MB, NB, NB, nkblks, kr, KR, ATL_rone, alpha, beta,
                         a, lda, incAk, pA, incAW, B, ldb, incBk, pB, incBW,
                         C, ldc, A2blk, B2blk, NBmm0, NBmm_b1);
         B += incBn;             /* copy next col panel of B */
         pB += incW;             /* to next col panel of pB  */
         a = (incAW ? NULL : a); /* reuse row-panel of A if copied */
         C += ldc*NB;
      }
      if (nr)
      {
         if (ZEROC)
            Mjoin(PATL,gezero)(MB, nr, C, ldc);
         Mjoin(PATL,mmK)(MB, MB, nr, nr, nkblks, kr, KR, ATL_rone, alpha, beta,
                         a, lda, incAk, pA, incAW, B, ldb, incBk, pB, incBWp,
                         C, ldc, A2blk, B2blk, pNBmm0, Mjoin(PATL,pNBmm_b1));
      }
      C += MB - nnblks*ldc*NB;
      if (incBW)
      {
         B = NULL;              /* finished copying B */
         incBn = 0;
      }
      else
         B -= nnblks*incBn;
   }
   if (mr)
   {
      a = A + nmblks*incAm;
      pB = pB0;
      if ( SCALAR_IS_ONE(beta) ) NBmm0 = Mjoin(PATL,pMBmm_b1);
      else if ( SCALAR_IS_ZERO(beta) ) NBmm0 = Mjoin(PATL,pMBmm_b0);
      else NBmm0 = Mjoin(PATL,pMBmm_bX);
      for (j=nnblks; j; j--)
      {
         Mjoin(PATL,mmK)(mr, mr, NB, NB, nkblks, kr, KR, ATL_rone, alpha, beta,
                         a, lda, incAk, pA, incAWp, B, ldb, incBk, pB, incBW,
                         C, ldc, A2blk, B2blk, NBmm0, Mjoin(PATL,pMBmm_b1));
         B += incBn;              /* copy next col panel of B */
         pB += incW;              /* to next col panel of pB  */
         a = (incAW ? NULL : a);  /* reuse row-panel of A if copied */
         C += ldc*NB;
      }
      if (nr)
      {
         if ( SCALAR_IS_ZERO(beta) )
            Mjoin(PATL,gezero)(mr, nr, C, ldc);
         Mjoin(PATL,mmK)(mr, mr, nr, nr, nkblks, kr, (incAW | incBW) ? KR:0,
                         ATL_rone, alpha, beta, a, lda, incAk, pA, incAWp,
                         B, ldb, incBk, pB, incBWp, C, ldc, A2blk, B2blk,
                         Mjoin(PATL,pKBmm), Mjoin(PATL,pKBmm));
      }
   }
   free(v);
   return(0);
}
示例#27
0
void Mjoin(PATL,ttrsm)(const enum ATLAS_SIDE side, const enum ATLAS_UPLO uplo,
                       const enum ATLAS_TRANS TA, const enum ATLAS_DIAG diag,
                       ATL_CINT M, ATL_CINT N, const SCALAR alpha,
                       const TYPE *A, ATL_CINT lda, TYPE *B, ATL_CINT ldb)
{
   ATL_TTRSM_t trsms[ATL_NTHREADS];
   TYPE *b;
   ATL_INT n, nblks, minblks;
   double tblks;
   int nr, p, i, j, extrablks;
   static int nb=0;

   if (M < 1 || N < 1)
      return;
   if (SCALAR_IS_ZERO(alpha))
   {
      Mjoin(PATL,gezero)(M, N, B, ldb);
      return;
   }
   #if defined(ATL_ARCH_XeonPHI) && defined(TREAL)
   {
      int Mjoin(PATL,ttrsm_amm)
         (const enum ATLAS_SIDE side, const enum ATLAS_UPLO uplo,
          const enum ATLAS_TRANS TA, const enum ATLAS_DIAG diag,
          ATL_CINT M, ATL_CINT N, const SCALAR alpha,
          const TYPE *A, ATL_CINT lda, TYPE *B, ATL_CINT ldb);
     if (!Mjoin(PATL,ttrsm_amm)(side, uplo, TA, diag, M, N, alpha,
                                A, lda, B, ldb))
        return;
   }
   #endif
/*
 * Distribute RHS over the processors
 */
   if (!nb) nb = Mjoin(PATL,GetNB)();
   if (side == AtlasLeft)
   {
      nblks = N/nb;
      nr = N - nblks*nb;
      tblks = ((double)(M*N)) / ( (double)nb * nb );
      p = (tblks+ATL_TTRSM_XOVER-1)/ATL_TTRSM_XOVER;
      p = Mmin(p, ATL_NTHREADS);
      p = p ? p : 1;

      b = B;
      minblks = nblks / p;
      extrablks = nblks - minblks*p;
      for (i=0; i < p; i++)
      {
         if (i < extrablks)
            n = (minblks+1)*nb;
         else if (i == extrablks)
            n = minblks*nb + nr;
         else
            n = minblks*nb;
         trsms[i].A = A;
         trsms[i].M = M;
         trsms[i].N = n;
         trsms[i].lda = lda;
         trsms[i].ldb = ldb;
         trsms[i].B = b;
         trsms[i].alpha = SADD alpha;
         trsms[i].side = side;
         trsms[i].uplo = uplo;
         trsms[i].TA   = TA;
         trsms[i].diag = diag;
         n *= (ldb << Mjoin(PATL,shift));
         b = MindxT(b, n);
      }
   }
   else /* Side == AtlasRight */
   {
      nblks = M/nb;
      nr = M - nblks*nb;
      tblks = (N/nb)*nblks;
      p = (tblks+ATL_TTRSM_XOVER-1)/ATL_TTRSM_XOVER;
      p = Mmin(p, ATL_NTHREADS);
      p = p ? p : 1;

      b = B;
      minblks = nblks / p;
      extrablks = nblks - minblks*p;
      for (i=0; i < p; i++)
      {
         if (i < extrablks)
            n = (minblks+1)*nb;
         else if (i == extrablks)
            n = minblks*nb + nr;
         else
            n = minblks*nb;
         trsms[i].A = A;
         trsms[i].M = n;
         trsms[i].N = N;
         trsms[i].lda = lda;
         trsms[i].ldb = ldb;
         trsms[i].B = b;
         trsms[i].alpha = SADD alpha;
         trsms[i].side = side;
         trsms[i].uplo = uplo;
         trsms[i].TA   = TA;
         trsms[i].diag = diag;
         n <<= Mjoin(PATL,shift);
         b = MindxT(b, n);
      }
   }
   if (p < 2)
   {
      Mjoin(PATL,trsm)(side, uplo, TA, diag, M, N, alpha, A, lda, B, ldb);
      return;
   }
   for (; i < ATL_NTHREADS; i++)  /* flag rest of struct as uninitialized */
      trsms[i].B = NULL;
   ATL_goparallel(p, Mjoin(PATL,DoWorkTRSM), trsms, NULL);
}
示例#28
0
Int KLU_condest         /* return TRUE if successful, FALSE otherwise */
(
    Int Ap [ ],
    double Ax [ ],
    KLU_symbolic *Symbolic,
    KLU_numeric *Numeric,
    KLU_common *Common
)
{
    double xj, Xmax, csum, anorm, ainv_norm, est_old, est_new, abs_value ;
    Entry *Udiag, *Aentry, *X, *S ;
    Int i, j, jmax, jnew, pend, n ;
#ifndef COMPLEX
    Int unchanged ;
#endif

    /* ---------------------------------------------------------------------- */
    /* check inputs */
    /* ---------------------------------------------------------------------- */

    if (Common == NULL)
    {
        return (FALSE) ;
    }
    if (Symbolic == NULL || Ap == NULL || Ax == NULL)
    {
        Common->status = KLU_INVALID ;
        return (FALSE) ;
    }
    abs_value = 0 ;
    if (Numeric == NULL)
    {
        /* treat this as a singular matrix */
        Common->condest = 1 / abs_value ;
        Common->status = KLU_SINGULAR ;
        return (TRUE) ;
    }
    Common->status = KLU_OK ;

    /* ---------------------------------------------------------------------- */
    /* get inputs */
    /* ---------------------------------------------------------------------- */

    n = Symbolic->n ;
    Udiag = Numeric->Udiag ;

    /* ---------------------------------------------------------------------- */
    /* check if diagonal of U has a zero on it */
    /* ---------------------------------------------------------------------- */

    for (i = 0 ; i < n ; i++)
    {
        ABS (abs_value, Udiag [i]) ;
        if (SCALAR_IS_ZERO (abs_value))
        {
            Common->condest = 1 / abs_value ;
            Common->status = KLU_SINGULAR ;
            return (TRUE) ;
        }
    }

    /* ---------------------------------------------------------------------- */
    /* compute 1-norm (maximum column sum) of the matrix */
    /* ---------------------------------------------------------------------- */

    anorm =  0.0 ;
    Aentry = (Entry *) Ax ;
    for (i = 0 ; i < n ; i++)
    {
        pend = Ap [i + 1] ;
        csum = 0.0 ;
        for (j = Ap [i] ; j < pend ; j++)
        {
            ABS (abs_value, Aentry [j]) ;
            csum += abs_value ;
        }
        if (csum > anorm)
        {
            anorm = csum ;
        }
    }

    /* ---------------------------------------------------------------------- */
    /* compute estimate of 1-norm of inv (A) */
    /* ---------------------------------------------------------------------- */

    /* get workspace (size 2*n Entry's) */
    X = Numeric->Xwork ;            /* size n space used in KLU_solve, tsolve */
    X += n ;                        /* X is size n */
    S = X + n ;                     /* S is size n */

    for (i = 0 ; i < n ; i++)
    {
        CLEAR (S [i]) ;
        CLEAR (X [i]) ;
        REAL (X [i]) = 1.0 / ((double) n) ;
    }
    jmax = 0 ;

    ainv_norm = 0.0 ;
    for (i = 0 ; i < 5 ; i++)
    {
        if (i > 0)
        {
            /* X [jmax] is the largest entry in X */
            for (j = 0 ; j < n ; j++)
            {
                /* X [j] = 0 ;*/
                CLEAR (X [j]) ;
            }
            REAL (X [jmax]) = 1 ;
        }

        KLU_solve (Symbolic, Numeric, n, 1, (double *) X, Common) ;
        est_old = ainv_norm ;
        ainv_norm = 0.0 ;

        for (j = 0 ; j < n ; j++)
        {
            /* ainv_norm += ABS (X [j]) ;*/
            ABS (abs_value, X [j]) ;
            ainv_norm += abs_value ;
        }

#ifndef COMPLEX
        unchanged = TRUE ;

        for (j = 0 ; j < n ; j++)
        {
            double s = (X [j] >= 0) ? 1 : -1 ;
            if (s != (Int) REAL (S [j]))
            {
                S [j] = s ;
                unchanged = FALSE ;
            }
        }

        if (i > 0 && (ainv_norm <= est_old || unchanged))
        {
            break ;
        }
#else
        for (j = 0 ; j < n ; j++)
        {
            if (IS_NONZERO (X [j]))
            {
                ABS (abs_value, X [j]) ;
                SCALE_DIV_ASSIGN (S [j], X [j], abs_value) ;
            }
            else
            {
                CLEAR (S [j]) ;
                REAL (S [j]) = 1 ;
            }
        }

        if (i > 0 && ainv_norm <= est_old)
        {
            break ;
        }
#endif

        for (j = 0 ; j < n ; j++)
        {
            X [j] = S [j] ;
        }

#ifndef COMPLEX
        /* do a transpose solve */
        KLU_tsolve (Symbolic, Numeric, n, 1, X, Common) ;
#else
        /* do a conjugate transpose solve */
        KLU_tsolve (Symbolic, Numeric, n, 1, (double *) X, 1, Common) ;
#endif

        /* jnew = the position of the largest entry in X */
        jnew = 0 ;
        Xmax = 0 ;
        for (j = 0 ; j < n ; j++)
        {
            /* xj = ABS (X [j]) ;*/
            ABS (xj, X [j]) ;
            if (xj > Xmax)
            {
                Xmax = xj ;
                jnew = j ;
            }
        }
        if (i > 0 && jnew == jmax)
        {
            /* the position of the largest entry did not change
             * from the previous iteration */
            break ;
        }
        jmax = jnew ;
    }

    /* ---------------------------------------------------------------------- */
    /* compute another estimate of norm(inv(A),1), and take the largest one */
    /* ---------------------------------------------------------------------- */

    for (j = 0 ; j < n ; j++)
    {
        CLEAR (X [j]) ;
        if (j % 2)
        {
            REAL (X [j]) = 1 + ((double) j) / ((double) (n-1)) ;
        }
        else
        {
            REAL (X [j]) = -1 - ((double) j) / ((double) (n-1)) ;
        }
    }

    KLU_solve (Symbolic, Numeric, n, 1, (double *) X, Common) ;

    est_new = 0.0 ;
    for (j = 0 ; j < n ; j++)
    {
        /* est_new += ABS (X [j]) ;*/
        ABS (abs_value, X [j]) ;
        est_new += abs_value ;
    }
    est_new = 2 * est_new / (3 * n) ;
    ainv_norm = MAX (est_new, ainv_norm) ;

    /* ---------------------------------------------------------------------- */
    /* compute estimate of condition number */
    /* ---------------------------------------------------------------------- */

    Common->condest = ainv_norm * anorm ;
    return (TRUE) ;
}
示例#29
0
文件: ATL_tsymm.c 项目: AIDman/Kaldi
void Mjoin(PATL,tsymm)
   (const enum ATLAS_SIDE Side, const enum ATLAS_UPLO Uplo,
    ATL_CINT M, ATL_CINT N, const SCALAR alpha, const TYPE *A, ATL_CINT lda,
    const TYPE *B, ATL_CINT ldb, const SCALAR beta, TYPE *C, ATL_CINT ldc)
{
   ATL_INT n, nblks, tblks, nr, minblks, extrablks, p, i, j;
   ATL_thread_t tp[ATL_NTHREADS];
   ATL_TSYMM_t symms[ATL_NTHREADS];
   ATL_LAUNCHSTRUCT_t ls;
   const TYPE *b;
   TYPE *c;
   static int nb=0;

   if (M < 1 || N < 1)
      return;
   if (SCALAR_IS_ZERO(alpha))
   {
      if (!SCALAR_IS_ONE(beta))
         Mjoin(PATL,gescal)(M, N, beta, C, ldc);
      return;
   }
   if (!nb) nb = Mjoin(PATL,GetNB());
   if (Side == AtlasLeft)
   {
      nblks = N / nb;
      nr = N - nblks*nb;
      tblks = ((double)(M*N)) / ( (double)nb * nb );
      p = (nblks+ATL_TSYMM_ADDP-1)/ATL_TSYMM_ADDP;
      if (p < ATL_NTHREADS)  /* N not big enough to give blk to each proc */
      {
/*
 *       If I can't split N, and M is the dominant cost, use recursion to
 *       decompose symmetric matrix; parallelism will come from TGEMM calls
 */
         if (M > (N<<(ATL_NTHRPOW2+2)))
         {
            ATL_tsymm_SYsplit(Side, Uplo, M, N, alpha, A, lda, B, ldb,
                              beta, C, ldc, nb);
            return;
         }
      }
      else
         p = ATL_NTHREADS;
      if (p < 2)
         goto SERIAL;
/*
 *    Distribute N over the processors
 */
      b = B;
      c = C;
      minblks = nblks / p;
      extrablks = nblks - minblks*p;
      for (i=0; i < p; i++)
      {
         if (i < extrablks)
            n = (minblks+1)*nb;
         else if (i == extrablks)
            n = minblks*nb + nr;
         else
            n = minblks*nb;
         symms[i].A = A;
         symms[i].B = b;
         symms[i].alpha = SADD alpha;
         symms[i].beta = SADD beta;
         symms[i].C = c;
         symms[i].M = M;
         symms[i].N = n;
         symms[i].lda = lda;
         symms[i].ldb = ldb;
         symms[i].ldc = ldc;
         symms[i].side = Side;
         symms[i].uplo = Uplo;
         b = MindxT(b, ATL_MulBySize((size_t)ldb)*n);
         c = MindxT(c, ATL_MulBySize((size_t)ldc)*n);
      }
      for (; i < ATL_NTHREADS; i++)  /* flag rest of struct as uninitialized */
         symms[i].M = 0;
   }
   else  /* Side == AtlasRight */
   {
      nblks = M / nb;
      nr = M - nblks*nb;
      tblks = ((double)(M*N)) / ( (double)nb * nb );
      p = (nblks+ATL_TSYMM_ADDP-1)/ATL_TSYMM_ADDP;
      if (p < ATL_NTHREADS)  /* N not big enough to give blk to each proc */
      {
/*
 *       If I can't split M, and N is the dominant cost, use recursion to
 *       decompose symmetric matrix; parallelism will come from TGEMM calls
 */
         if (N > (M<<(ATL_NTHRPOW2+2)))
         {
            ATL_tsymm_SYsplit(Side, Uplo, M, N, alpha, A, lda, B, ldb,
                              beta, C, ldc, nb);
            return;
         }
      }
      else
         p = ATL_NTHREADS;
      if (p < 2)
         goto SERIAL;
/*
 *    Distribute M over the processors
 */
      b = B;
      c = C;
      minblks = nblks / p;
      extrablks = nblks - minblks*p;
      for (i=0; i < p; i++)
      {
         if (i < extrablks)
            n = (minblks+1)*nb;
         else if (i == extrablks)
            n = minblks*nb + nr;
         else
            n = minblks*nb;
         symms[i].A = A;
         symms[i].B = b;
         symms[i].alpha = SADD alpha;
         symms[i].beta = SADD beta;
         symms[i].C = c;
         symms[i].M = n;
         symms[i].N = N;
         symms[i].lda = lda;
         symms[i].ldb = ldb;
         symms[i].ldc = ldc;
         symms[i].side = Side;
         symms[i].uplo = Uplo;
         b = MindxT(b, ATL_MulBySize((size_t)n));
         c = MindxT(c, ATL_MulBySize((size_t)n));
      }
      for (; i < ATL_NTHREADS; i++)  /* flag rest of struct as uninitialized */
         symms[i].M = 0;
   }
   if (p < 2)
   {
SERIAL:
      Mjoin(PATL,symm)(Side, Uplo, M, N, alpha, A, lda, B, ldb, beta, C, ldc);
      return;
   }
   ATL_goparallel(p, Mjoin(PATL,DoWorkSYMM), symms, NULL);
}
示例#30
0
Int KLU_rgrowth         /* return TRUE if successful, FALSE otherwise */
(
    Int *Ap,
    Int *Ai,
    double *Ax,
    KLU_symbolic *Symbolic,
    KLU_numeric *Numeric,
    KLU_common *Common
)
{
    double temp, max_ai, max_ui, min_block_rgrowth ;
    Entry aik ;
    Int *Q, *Ui, *Uip, *Ulen, *Pinv ;
    Unit *LU ;
    Entry *Aentry, *Ux, *Ukk ;
    double *Rs ;
    Int i, newrow, oldrow, k1, k2, nk, j, oldcol, k, pend, len ;

    /* ---------------------------------------------------------------------- */
    /* check inputs */
    /* ---------------------------------------------------------------------- */

    if (Common == NULL)
    {
        return (FALSE) ;
    }

    if (Symbolic == NULL || Ap == NULL || Ai == NULL || Ax == NULL)
    {
        Common->status = KLU_INVALID ;
        return (FALSE) ;
    }

    if (Numeric == NULL)
    {
        /* treat this as a singular matrix */
        Common->rgrowth = 0 ;
        Common->status = KLU_SINGULAR ;
        return (TRUE) ;
    }
    Common->status = KLU_OK ;

    /* ---------------------------------------------------------------------- */
    /* compute the reciprocal pivot growth */
    /* ---------------------------------------------------------------------- */

    Aentry = (Entry *) Ax ;
    Pinv = Numeric->Pinv ;
    Rs = Numeric->Rs ;
    Q = Symbolic->Q ;
    Common->rgrowth = 1 ;

    for (i = 0 ; i < Symbolic->nblocks ; i++)
    {
        k1 = Symbolic->R[i] ;
        k2 = Symbolic->R[i+1] ;
        nk = k2 - k1 ;
        if (nk == 1)
        {
            continue ;      /* skip singleton blocks */
        }
        LU = (Unit *) Numeric->LUbx[i] ;
        Uip = Numeric->Uip + k1 ;
        Ulen = Numeric->Ulen + k1 ;
        Ukk = ((Entry *) Numeric->Udiag) + k1 ;
        min_block_rgrowth = 1 ;
        for (j = 0 ; j < nk ; j++)
        {
            max_ai = 0 ;
            max_ui = 0 ;
            oldcol = Q[j + k1] ;
            pend = Ap [oldcol + 1] ;
            for (k = Ap [oldcol] ; k < pend ; k++)
            {
                oldrow = Ai [k] ;
                newrow = Pinv [oldrow] ;
                if (newrow < k1)
                {
                    continue ;  /* skip entry outside the block */
                }
                ASSERT (newrow < k2) ;
                if (Rs != NULL)
                {
                    /* aik = Aentry [k] / Rs [oldrow] */
                    SCALE_DIV_ASSIGN (aik, Aentry [k], Rs [newrow]) ;
                }
                else
                {
                    aik = Aentry [k] ;
                }
                /* temp = ABS (aik) */
                ABS (temp, aik) ;
                if (temp > max_ai)
                {
                    max_ai = temp ;
                }
            }

            /* Ui is set but not used.  This is OK, because otherwise the macro
               would have to be redesigned. */
            GET_POINTER (LU, Uip, Ulen, Ui, Ux, j, len) ;
            for (k = 0 ; k < len ; k++)
            {
                /* temp = ABS (Ux [k]) */
                ABS (temp, Ux [k]) ;
                if (temp > max_ui)
                {
                    max_ui = temp ;
                }
            }
            /* consider the diagonal element */
            ABS (temp, Ukk [j]) ;
            if (temp > max_ui)
            {
                max_ui = temp ;
            }

            /* if max_ui is 0, skip the column */
            if (SCALAR_IS_ZERO (max_ui))
            {
                continue ;
            }
            temp = max_ai / max_ui ;
            if (temp < min_block_rgrowth)
            {
                min_block_rgrowth = temp ;
            }
        }

        if (min_block_rgrowth < Common->rgrowth)
        {
            Common->rgrowth = min_block_rgrowth ;
        }
    }
    return (TRUE) ;
}