Esempio n. 1
0
/*
 * Packed symmetric rank-K update:  C := alpha*op(A)*op(A)' + beta*C, with
 * A in packed storage (UA = packed uplo, TA = transpose setting) and C
 * either packed (CP != 0, triangle UC) or general.  Degenerate cases are
 * resolved here; the blocked work is delegated to sprk_rK.
 */
void Mjoin(PATL,sprk)
   (const enum PACK_UPLO UA, const enum PACK_TRANS TA,
    const enum ATLAS_UPLO UC, const int CP,
    const int N, const int K, const SCALAR alpha,
    const TYPE *A, const int IA, const int JA, const int lda,
    const SCALAR beta, TYPE *C, const int IC, const int JC, const int ldc)
{
   const enum PACK_UPLO UC2 = ((CP) ? UC : PackGen); /* C's storage scheme */
   int j;
   #ifdef CacheEdge
/*
 *    K-blocking factor sized so the working set fits in CacheEdge bytes;
 *    NOTE(review): formula assumes two NB-wide panels plus one NBxNB block
 *    of C -- confirm against sprk_rK's access pattern.
 */
      static const int CE_K = ((ATL_DivBySize(CacheEdge SHIFT)-(NBNB SHIFT)) /
                                           (NB*(NB+NB)))*NB;
   #else
      #define CE_K K          /* no cache info: use all of K at once */
   #endif

/*
 * Quick return when C would be left unchanged
 */
   if ((!N) || ((SCALAR_IS_ZERO(alpha) || (!K)) && (SCALAR_IS_ONE(beta))))
      return;
/*
 * alpha*A*A' contributes nothing: just scale C's stored triangle by beta,
 * one column at a time (column lengths differ for Lower vs Upper)
 */
   if (!K || SCALAR_IS_ZERO(alpha))
   {
      if (UC == CblasLower)
      {
         for (j=0; j != N; j++)   /* col j holds N-j elts from diag down */
            Mjoin(PATL,scal)(N-j, beta, C+MindexP(UC2,IC+j,JC+j,ldc), 1);
      }
      else /* UC == CblasUpper */
      {
         for (j=0; j != N; j++)   /* col j holds j+1 elts down to diag */
            Mjoin(PATL,scal)(j+1, beta, C+MindexP(UC2,IC,JC+j,ldc), 1);
      }
      return;
   }
   Mjoin(PATL,sprk_rK)(UA, TA, UC, CP, N, K, CE_K, alpha, A, lda, beta, C, ldc);
}
Esempio n. 2
0
   int Mjoin(PATL,syr2kLT)
#endif
   (const int N, const int K, const void *valpha, const void *A, const int lda,
    const void *B, const int ldb, const void *vbeta, void *C, const int ldc)
/*
 * SYR2K Lower/Transpose building block: forms an N x N product of the
 * K x N inputs (CgemmTN -- presumably alpha*A'*B) into an aligned
 * workspace, then applies it to C through a beta-specialized syr2k_put
 * routine.  NOTE(review): the symmetric combination W + W' is performed
 * inside syr2k_put* -- confirm there.
 * Returns 0 on success, 1 if the workspace is too large to allocate
 * (caller is expected to fall back, e.g. by recurring).
 */
{
   int i;
   void *vc=NULL;               /* raw (unaligned) workspace pointer */
   TYPE *c;                     /* cache-aligned alias of vc */
   #ifdef TREAL
      const SCALAR alpha=*( (const SCALAR *)valpha );
      const SCALAR beta =*( (const SCALAR *)vbeta  );
      const SCALAR one=1.0, zero=0.0;
   #else
      #define alpha valpha      /* complex scalars remain void* handles */
      const TYPE *beta=vbeta;
      const TYPE one[2]={1.0,0.0}, zero[2]={0.0,0.0};
   #endif

   i = ATL_MulBySize(N)*N;      /* bytes for the N x N workspace */
   if (i <= ATL_MaxMalloc) vc = malloc(ATL_Cachelen+i);
   if (vc == NULL) return(1);   /* too big or OOM: signal caller */
   c = ATL_AlignPtr(vc);
   CgemmTN(N, N, K, alpha, A, lda, B, ldb, zero, c, N);
/*
 * Fold the workspace into C, dispatching on beta's special cases
 */
   if ( SCALAR_IS_ONE(beta) ) Mjoin(syr2k_put,_b1)(N, c, beta, C, ldc);
   else if ( SCALAR_IS_ZERO(beta) ) Mjoin(syr2k_put,_b0)(N, c, beta, C, ldc);
   #ifdef TCPLX
      else if (SCALAR_IS_NONE(beta)) Mjoin(syr2k_put,_bn1)(N, c, beta, C, ldc);
      else if (beta[1] == *zero) Mjoin(syr2k_put,_bXi0)(N, c, beta, C, ldc);
   #endif
   else Mjoin(syr2k_put,_bX)(N, c, beta, C, ldc);
   free(vc);
   return(0);
}
Esempio n. 3
0
/*
 * SYMM, Right-side case:  C := alpha*B*A + beta*C, where A is a symmetric
 * N x N matrix whose stored triangle is selected by the UploNM name
 * component.  When M is past the crossover point, A is expanded into a
 * dense aligned workspace (folding alpha in for the real case) so a single
 * dense matmul can do the work; otherwise the reference code is used.
 */
void Mjoin(Mjoin(PATL,symmR),UploNM)
   (const int M, const int N, const void *valpha, const void *A, const int lda,
    const void *B, const int ldb, const void *vbeta, void *C, const int ldc)
{
   #ifdef TREAL
      const SCALAR alpha=*( (const SCALAR *)valpha );
      const SCALAR beta =*( (const SCALAR *)vbeta  );
      const SCALAR one=1.0;
   #else
      #define alpha valpha
      #define beta  vbeta
   #endif
   void *va;                /* raw workspace pointer */
   TYPE *a;                 /* aligned dense copy of A */

   if (M > SYMM_Xover)      /* copy+GEMM only pays off for large M */
   {
      va = malloc(ATL_Cachelen + ATL_MulBySize(N)*N);
      ATL_assert(va);       /* hard failure on OOM (ATLAS convention) */
      a = ATL_AlignPtr(va);
      #ifdef TREAL
/*
 *       Fold alpha into the symmetric copy so the matmul runs with alpha=1
 */
         if ( SCALAR_IS_ONE(alpha) )
            Mjoin(Mjoin(Mjoin(PATL,sycopy),UploNM),_a1)(N, alpha, A, lda, a);
         else Mjoin(Mjoin(Mjoin(PATL,sycopy),UploNM),_aX)(N, alpha, A, lda, a);
         ATL_ammm(AtlasNoTrans, AtlasNoTrans, M, N, N, one, B, ldb, a, N, beta, C, ldc);
      #else
         Mjoin(Mjoin(PATL,sycopy),UploNM)(N, A, lda, a);
         ATL_ammm(AtlasNoTrans, AtlasNoTrans, M, N, N, valpha, B, ldb, a, N, vbeta, C, ldc);
      #endif
      free(va);
   }
   else Mjoin(PATL,refsymm)(AtlasRight, Uplo_, M, N, alpha, A, lda, B, ldb,
                            beta, C, ldc);
}
Esempio n. 4
0
/*
 * SYRK, Transpose case:  C := alpha*A'*A + beta*C on the triangle of C
 * selected by the UploNM name component.  For K past the crossover the
 * full N x N product is formed in an aligned workspace via a dense matmul
 * and then written into C's triangle by a beta-specialized syr_put routine;
 * small K falls back to the reference implementation.
 */
void Mjoin(Mjoin(Mjoin(PATL,syrk),UploNM),T)
   (const int N, const int K, const void *valpha, const void *A, const int lda,
    const void *vbeta, void *C, const int ldc)
{
   void *vc;                    /* raw workspace pointer */
   TYPE *c;                     /* aligned N x N product buffer */
   #ifdef TREAL
      const SCALAR alpha=*( (const SCALAR *)valpha );
      const SCALAR beta =*( (const SCALAR *)vbeta  );
      const SCALAR one=1.0, zero=0.0;
   #else
      #define alpha valpha      /* complex scalars remain void* handles */
      const TYPE *beta=vbeta;
      const TYPE one[2]={1.0,0.0}, zero[2]={0.0,0.0};
   #endif

   if (K > SYRK_Xover)          /* copy+GEMM only pays off for large K */
   {
      vc = malloc(ATL_Cachelen+ATL_MulBySize(N)*N);
      ATL_assert(vc);           /* hard failure on OOM (ATLAS convention) */
      c = ATL_AlignPtr(vc);
      CgemmTN(N, N, K, alpha, A, lda, A, lda, zero, c, N);
/*
 *    Write the product into C's triangle, dispatching on beta's special cases
 */
      if ( SCALAR_IS_ONE(beta) ) Mjoin(syr_put,_b1)(N, c, beta, C, ldc);
      else if ( SCALAR_IS_ZERO(beta) ) Mjoin(syr_put,_b0)(N, c, beta, C, ldc);
      #ifdef TCPLX
         else if ( SCALAR_IS_NONE(beta) )
            Mjoin(syr_put,_bn1)(N, c, beta, C, ldc);
         else if (beta[1] == *zero) Mjoin(syr_put,_bXi0)(N, c, beta, C, ldc);
      #endif
      else Mjoin(syr_put,_bX)(N, c, beta, C, ldc);
      free(vc);
   }
   else Mjoin(PATL,refsyrk)(Uplo_, AtlasTrans, N, K, alpha, A, lda,
                            beta, C, ldc);
}
Esempio n. 5
0
/*
 * TRMM, Left case:  C := alpha*op(A)*C, where A is a triangular M x M
 * matrix (uplo/trans/unit selected by the Uplo_/Trans_/Unit_ macros and
 * the ATLP name component).  For N past the crossover, A is copied to a
 * dense aligned workspace (with alpha folded in for the real case) and the
 * update is done as one dense matmul; otherwise the reference code is used.
 */
void Mjoin(Mjoin(PATL,trmmL),ATLP)
   (const int M, const int N, const void *valpha, const void *A, const int lda,
    void *C, const int ldc)
{
   #ifdef TREAL
      const SCALAR alpha=*( (const SCALAR *)valpha );
      const SCALAR one=1.0, zero=0.0;
   #else
      const TYPE zero[2]={0.0,0.0};
      #define alpha valpha
   #endif
   void *va;                /* raw workspace pointer */
   TYPE *a;                 /* aligned dense copy of triangular A */

   if (N > TRMM_Xover)      /* copy+GEMM only pays off for large N */
   {
      va = malloc(ATL_Cachelen + ATL_MulBySize(M)*M);
      ATL_assert(va);       /* hard failure on OOM (ATLAS convention) */
      a = ATL_AlignPtr(va);
      #ifdef TREAL
         if ( SCALAR_IS_ONE(alpha) ) Mjoin(ATL_trcopy,_a1)(M, alpha, A, lda, a);
         else Mjoin(ATL_trcopy,_aX)(M, alpha, A, lda, a);
/*
 *       NOTE(review): C appears as both input and output operand here;
 *       relies on CAgemmTN tolerating that aliasing -- confirm in its impl
 */
         CAgemmTN(M, N, M, one, a, M, C, ldc, zero, C, ldc);
      #else
         ATL_trcopy(M, A, lda, a);
         CAgemmTN(M, N, M, valpha, a, M, C, ldc, zero, C, ldc);
      #endif
      free(va);
   }
   else Mjoin(PATL,reftrmm)(AtlasLeft, Uplo_, Trans_, Unit_, M, N, alpha,
                            A, lda, C, ldc);
}
Esempio n. 6
0
/*
 * SYMM, Left-side case:  C := alpha*A*B + beta*C, where A is a symmetric
 * M x M matrix whose stored triangle is selected by the UploNM name
 * component.  When N is past the crossover, A is expanded into a dense
 * aligned workspace (folding alpha in for the real case) so a single dense
 * GEMM can do the work; otherwise the reference implementation is used.
 */
void Mjoin(Mjoin(PATL,symmL),UploNM)
(const int M, const int N, const void *valpha, const void *A, const int lda,
 const void *B, const int ldb, const void *vbeta, void *C, const int ldc)
{
#ifdef TREAL
    const SCALAR alpha=*( (const SCALAR *)valpha );
    const SCALAR beta =*( (const SCALAR *)vbeta  );
    const SCALAR one=1.0;
#else
#define alpha valpha
#define beta vbeta
#endif
    TYPE *a;               /* aligned dense copy of A */
    void *va;              /* raw workspace pointer */

    if (N > SYMM_Xover)    /* copy+GEMM only pays off for large N */
    {
        va = malloc(ATL_Cachelen + (ATL_MulBySize(M)*M));
        ATL_assert(va);    /* hard failure on OOM (ATLAS convention) */
        a = ATL_AlignPtr(va);
#ifdef TREAL
        /* fold alpha into the symmetric copy so the GEMM runs with alpha=1 */
        if ( SCALAR_IS_ONE(alpha) )
            Mjoin(Mjoin(Mjoin(PATL,sycopy),UploNM),_a1)(M, alpha, A, lda, a);
        else Mjoin(Mjoin(Mjoin(PATL,sycopy),UploNM),_aX)(M, alpha, A, lda, a);
        CgemmTN(M, N, M, one, a, M, B, ldb, beta, C, ldc);
#else
        Mjoin(Mjoin(PATL,sycopy),UploNM)(M, A, lda, a);
        CgemmTN(M, N, M, valpha, a, M, B, ldb, vbeta, C, ldc);
#endif
        free(va);
    }
    else Mjoin(PATL,refsymm)(AtlasLeft, Uplo_, M, N, alpha, A, lda, B, ldb,
                                 beta, C, ldc);
}
Esempio n. 7
0
/*
 * GEMV NoTranspose special case for very short vectors (M < 16).
 * Real case: dispatch to an M-specialized kernel via a lookup table, with
 * the usable table size capped by how many registers the target ISA has.
 * Complex case: small M goes to the reference GEMV, the rest to the
 * general small-N kernel.
 */
void Mjoin(PATL,mvnk_Mlt16)
   (ATL_CINT M, ATL_CINT N, const SCALAR alpha, const TYPE *A, ATL_CINT lda,
    const TYPE *X, ATL_CINT incX, const SCALAR beta, TYPE *Y, ATL_CINT incY)
/*
 * y = alpha*A*x + beta*y
 */
{
#ifdef TREAL
   /* kernel table: entry i handles exactly M == i+1 */
   const static ATL_MVFUNC mvfunc[15] = {ATL_mvn_Meq1,
                                         ATL_mvn_Meq2,
                                         ATL_mvn_Meq3,
                                         ATL_mvn_Meq4,
                                         ATL_mvn_Meq5,
                                         ATL_mvn_Meq6,
                                         ATL_mvn_Meq7,
                                         ATL_mvn_Meq8,
                                         ATL_mvn_Meq9,
                                         ATL_mvn_Meq10,
                                         ATL_mvn_Meq11,
                                         ATL_mvn_Meq12,
                                         ATL_mvn_Meq13,
                                         ATL_mvn_Meq14,
                                         ATL_mvn_Meq15
                                         };

   if ( M < 1 || N < 1 || (SCALAR_IS_ZERO(alpha) && SCALAR_IS_ONE(beta)) )
      return;                  /* quick return: y would be unchanged */
/*
 * Base max unrolling we use on how many regs we think we have
 */
   #ifdef ATL_GAS_x8664
   if (M > 14)
   #elif defined(ATL_GAS_x8632)
   if (M > 6)
   #else
   if (M > 15)
   #endif
   {
      Mjoin(PATL,mvnk_smallN)(M, N, alpha, A, lda, X, incX, beta, Y, incY);
      return;
   }
   mvfunc[M-1](M, N, alpha, A, lda, X, incX, beta, Y, incY);
#else
   #ifndef TUNING
   if (M <= 8)
      Mjoin(PATL,refgemv)(AtlasNoTrans, M, N, alpha, A, lda, X, incX,
                          beta, Y, incY);
   else
   #endif
      Mjoin(PATL,mvnk_smallN)(M, N, alpha, A, lda, X, incX, beta, Y, incY);
#endif
}
Esempio n. 8
0
/*
 * Banded matrix-vector multiply driver.  Resolves the degenerate cases
 * (empty operands, alpha == 0) locally, then hands everything else to the
 * reference GBMV kernel.
 */
void Mjoin( PATL, gbmv )
(
   const enum ATLAS_TRANS     TRANS,
   const int                  M,
   const int                  N,
   const int                  KL,
   const int                  KU,
   const SCALAR               ALPHA,
   const TYPE                 * A,
   const int                  LDA,
   const TYPE                 * X,
   const int                  INCX,
   const SCALAR               BETA,
   TYPE                       * Y,
   const int                  INCY
)
{
/*
 * Nothing to do for empty operands
 */
   if( M == 0 || N == 0 ) return;
/*
 * alpha == 0: the product contributes nothing, so at most scale y by beta
 */
   if( SCALAR_IS_ZERO( ALPHA ) )
   {
      if( SCALAR_IS_ONE( BETA ) ) return;   /* y already correct */
      Mjoin( PATL, scal )( M, BETA, Y, INCY );
      return;
   }
/*
 * General case: defer to the reference implementation
 */
   Mjoin( PATL, refgbmv )( TRANS, M, N, KL, KU, ALPHA, A, LDA, X, INCX, BETA,
                           Y, INCY );
/*
 * End of Mjoin( PATL, gbmv )
 */
}
Esempio n. 9
0
/*
 * Copy an M x N operand A (packed upper/lower when ldainc is +1/-1, general
 * otherwise) into NB-blocked storage V, scaling by alpha.  Full NB x NB
 * blocks go to V in column-panels of stride incVm; the ragged row remainder
 * (mr rows) is streamed into a separate region vv past the full panels.
 * When A is not packed (ldainc == 0), the whole job is delegated to the
 * general row2blkT2 copy routines.
 */
void Mjoin(PATL,prow2blkTF)(const int M, const int N, const SCALAR alpha,
                            const TYPE *A, int lda, const int ldainc, TYPE *V)
{
   const int mb = Mmin(NB,M), nMb = ATL_DivByNB(M);
   const int m = ATL_MulByNB(nMb), n = ATL_MulByNB(ATL_DivByNB(N));
   const int nr = N - n, mr = M - m;    /* ragged col/row remainders */
   const int incVm = ATL_MulByNB(N), incVV = ATL_MulByNB(mr);
   int i, j, ib, jb;
   const enum PACK_UPLO UA = (ldainc == 1) ? PackUpper :
      ( (ldainc == -1) ? PackLower : PackGen );
   TYPE *v, *vv = V+nMb*incVm;          /* vv: region for partial row blocks */
   void (*row2blk)(const int M, const int N, const TYPE alpha, const TYPE *A,
                   int lda, const int ldainc, TYPE *V);

   if (ldainc)                          /* packed A: use KB copy kernels */
   {
      if (alpha == ATL_rone) row2blk = ATL_prow2blk_KB_a1;
      else row2blk = ATL_prow2blk_KB_aX;

      for (j=0; j < n; j += NB)        /* full column panels */
      {
         for (v=V, i=0; i < m; i += NB, v += incVm)
            row2blk(NB, NB, alpha, A+MindexP(UA,i,j,lda), Mpld(UA,j,lda),
                    ldainc, v);
         if (mr)                       /* partial row block of this panel */
         {
            row2blk(mr, NB, alpha, A+MindexP(UA,m,j,lda), Mpld(UA,j,lda),
                    ldainc, vv);
            vv += incVV;
         }
         V += NBNB;
      }
      if (nr)                          /* ragged final column panel */
      {
         for (v=V, i=0; i < m; i += NB, v += incVm)
            row2blk(NB, nr, alpha, A+MindexP(UA,i,n,lda), Mpld(UA,n,lda),
                    ldainc, v);
         if (mr)
            row2blk(mr, nr, alpha, A+MindexP(UA,m,n,lda), Mpld(UA,n,lda),
                    ldainc, vv);
      }
   }
   else if (SCALAR_IS_ONE(alpha))
      Mjoin(PATL,row2blkT2_a1)(M, N, A, lda, V, alpha);
   else
      Mjoin(PATL,row2blkT2_aX)(M, N, A, lda, V, alpha);
}
Esempio n. 10
0
void Mjoin( PATL, symm )
(
   const enum ATLAS_SIDE      SIDE,
   const enum ATLAS_UPLO      UPLO,
   const int                  M,
   const int                  N,
   const SCALAR               ALPHA,
   const TYPE                 * A,
   const int                  LDA,
   const TYPE                 * B,
   const int                  LDB,
   const SCALAR               BETA,
   TYPE                       * C,
   const int                  LDC
)
{
/*
 * Purpose
 * =======
 *
 * Mjoin( PATL, symm )  performs one of the matrix-matrix operations
 *
 *    C := alpha * A * B + beta * C,
 *
 * or
 *
 *    C := alpha * B * A + beta * C,
 *
 * where alpha and beta are scalars,  A  is a symmetric matrix and B and
 * C are m by n matrices.
 *
 * This is a  recursive  version of the  algorithm.  For a more detailed
 * description of  the arguments of this function, see the reference im-
 * plementation in the  ATLAS/src/blas/reference directory.
 *
 * ---------------------------------------------------------------------
 */
/*
 * .. Local Variables ..
 */
#ifdef TREAL
   TYPE                       alpha0 = (TYPE)(ALPHA), beta0 = (TYPE)(BETA);
   const TYPE                 one = ATL_rone;
#else
   TYPE                       one[2] = { ATL_rone, ATL_rzero };
#endif
   TYPE                       * alpha, * beta;
   RC3_FUN_SYMM_T             ATL_rsymm;   /* recursive driver to invoke */
   RC3_SYMM_T                 type;        /* descriptor of leaf kernels */
/* ..
 * .. Executable Statements ..
 *
 */
/*
 * Quick return when C would be left unchanged
 */
   if( ( M == 0 ) || ( N == 0 ) ||
       ( ( SCALAR_IS_ZERO( ALPHA ) ) && ( SCALAR_IS_ONE( BETA ) ) ) ) return;

/*
 * alpha == 0: product contributes nothing, just scale C by beta
 */
   if( SCALAR_IS_ZERO( ALPHA ) )
   { Mjoin( PATL, gescal )( M, N, BETA, C, LDC ); return; }
/*
 * Fill in the type descriptor; scalars are passed by address so the
 * recursive code can stay type-agnostic (real vs complex)
 */
#ifdef TREAL
   type.size    = sizeof( TYPE );           type.one = (void *)(&one);
   type.TgemmNN = Mjoin( PATL, gemmNN_RB );
   alpha        = &alpha0;                  beta     = &beta0;
#else
   type.size    = sizeof( TYPE[2] );        type.one = (void *)(one);
   type.TgemmNN = Mjoin( PATL, gemmNN_RB );
   alpha = (TYPE *)(ALPHA);                 beta     = (TYPE *)(BETA);
#endif

/*
 * Pick the GEMM flavor, leaf SYMM kernel, and recursive driver matching
 * the (SIDE, UPLO) combination
 */
   if( SIDE == AtlasLeft )
   {
      type.Tgemm = Mjoin( PATL, gemmTN_RB );
      if( UPLO == AtlasUpper )
      { type.Tsymm = Mjoin( PATL, symmLU ); ATL_rsymm = ATL_rsymmLU; }
      else
      { type.Tsymm = Mjoin( PATL, symmLL ); ATL_rsymm = ATL_rsymmLL; }
   }
   else
   {
      type.Tgemm = Mjoin( PATL, gemmNT_RB );
      if( UPLO == AtlasUpper )
      { type.Tsymm = Mjoin( PATL, symmRU ); ATL_rsymm = ATL_rsymmRU; }
      else
      { type.Tsymm = Mjoin( PATL, symmRL ); ATL_rsymm = ATL_rsymmRL; }
   }

   ATL_rsymm( &type, M, N, ((void *)alpha), ((void *)A), LDA, ((void *)B),
              LDB, ((void *)beta), ((void *)C), LDC, SYMM_NB );
/*
 * End of Mjoin( PATL, symm )
 */
}
Esempio n. 11
0
void ATL_gemv
   (ATL_CINT M, ATL_CINT N, const SCALAR alpha0, const TYPE *A, ATL_CINT lda,
    const TYPE *X, ATL_CINT incX, const SCALAR beta0, TYPE *Y, ATL_CINT incY)
/*
 * y = alpha*A*x + beta*y, A is MxN, len(X) = N, len(Y) = M
 *
 * No-transpose GEMV driver: selects an optimized kernel, decides whether X
 * and/or Y must be copied (unit stride, alignment, or to fold in alpha),
 * then loops over M in cache-sized blocks of rows.
 */
{
   ATL_mvkern_t mvnk, mvnk_b1, mvnk_b0;  /* kernels for beta==1 / beta==0 */
   void *vp=NULL;                        /* workspace for vector copies */
   TYPE *x = (TYPE*)X, *y = (TYPE*)Y, *p;
   size_t t1, t2;
   ATL_INT m, Nm, nr, CacheElts, mb, imb, incy=1;
   int mu, nu, alignX, alignY, ALIGNY2A, ForceNU, COPYX, COPYY, APPLYALPHAX;
   int minM, minN, DOTBASED;
   #ifdef TREAL
      #define one ATL_rone
      #define Zero ATL_rzero
      TYPE alpha = alpha0, beta = beta0;
      const int ALPHA_IS_ONE = (alpha0 == ATL_rone);
   #else
      TYPE one[2] = {ATL_rone, ATL_rzero}, *alpha=(TYPE*)alpha0;
      TYPE Zero[2] = {ATL_rzero, ATL_rzero};
      TYPE *beta = (TYPE*) beta0;
      /* NOTE(review): mixes alpha0[0] with alpha[1], but alpha aliases
       * alpha0 so the test is still the correct (1.0, 0.0) check */
      const int ALPHA_IS_ONE = (alpha0[0] == ATL_rone && alpha[1] == ATL_rzero);
   #endif

   if (M < 1 || N < 1)          /* F77BLAS doesn't scale in either case */
      return;
   if (SCALAR_IS_ZERO(alpha))   /* No contrib from alpha*A*x */
   {
      if (!SCALAR_IS_ONE(beta))
      {
         if (SCALAR_IS_ZERO(beta))
            Mjoin(PATL,zero)(M, Y, incY);
         else
            Mjoin(PATL,scal)(M, beta, Y, incY);
      }
      return;
   }
/*
 * ATLAS's mvn kernels loop over M in inner loop, which is bad news if M is
 * very small.  Call code that requires no copy of X & Y for these degenerate
 * cases
 */
   if (M < 16)
   {
      Mjoin(PATL,mvnk_Mlt16)(M, N, alpha0, A, lda, X, incX, beta0, Y, incY);
      return;
   }
/*
 * Get mvnk kernel pointer along with any usage guidelines, and use the
 * optimized CacheElts to compute the correct blocking factor
 * For no transpose, X alignment args really apply to Y, and vice versa.
 */
   mvnk_b1 = ATL_GetMVNKern(M, N, A, lda, &mvnk_b0, &DOTBASED, &mu, &nu,
                            &minM, &minN, &alignY, &ALIGNY2A, &alignX,
                            &ForceNU, &CacheElts);
/*
 * Set up to handle case where kernel requires N to be a multiple if NU
 */
   if (ForceNU)
   {
      Nm = (N/nu)*nu;    /* portion of N the kernel can handle */
      nr = N - Nm;       /* remainder cleaned up separately */
   }
   else
   {
      Nm = N;
      nr = 0;
   }
/*
 * For very small N, we can't afford the data copy, so call special case code
 */
   if (N < 4 || Nm < 1)
   {
      Mjoin(PATL,mvnk_smallN)(M, N, alpha0, A, lda, X, incX, beta0, Y, incY);
      return;
   }
/*
 * Derive the M blocking factor from the kernel's tuned cache footprint;
 * round down to a multiple of the unrolling mu, and clamp to M
 */
   if (CacheElts)
   {
      mb = (CacheElts - 2*nu) / (2*(nu+1));
      mb = (mb > mu) ? (mb/mu)*mu : M;
      mb = (mb > M) ? M : mb;
   }
   else
      mb = M;
/*
 *****************************************************************************
 Figure out whether vecs need be copied, and which one will be scaled by alpha
 *****************************************************************************
 */
   COPYX = (incX != 1);
   if (!COPYX && alignX)        /* also copy if X misses required alignment */
   {
      t1 = (size_t) X;
      COPYX = ((t1/alignX)*alignX != t1);
   }
   COPYY = (incY != 1);
   if (!COPYY)  /* may still need to copy due to alignment issues */
   {
/*
 *    ATL_Cachelen is the highest alignment that can be requested, so
 *    make X's % with Cachelen match that of A if you want A & X to have
 *    the same alignment
 */
      if (ALIGNY2A)
      {
         t1 = (size_t) A;
         t2 = (size_t) Y;
         COPYY = (t1 - ATL_MulByCachelen(ATL_DivByCachelen(t1))) !=
                 (t2 - ATL_MulByCachelen(ATL_DivByCachelen(t2)));
      }
      else if (alignY)
      {
         t1 = (size_t) Y;
         COPYY = ((t1/alignY)*alignY != t1);
      }
   }
   if (COPYX != COPYY)         /* if only one of them is already being copied */
      APPLYALPHAX = COPYX;     /* apply alpha to that one */
   else if (!COPYY && !COPYX)  /* nobody currently being copied means */
   {                           /* we'll need to force a copy to apply alpha */
      APPLYALPHAX = (M < N);   /* apply alpha to vector requiring least */
      if (!ALPHA_IS_ONE)       /* workspace if alpha != 1.0 */
      {
         COPYX = APPLYALPHAX;
         COPYY = !APPLYALPHAX;
      }
   }
   else                        /* if both are being copied anyway */
      APPLYALPHAX = 0;         /* apply alpha during update of Y */

   if (COPYX | COPYY)         /* if I need to copy either vector */
   {                          /* allocate & align them */
      vp = malloc(ATL_MulBySize(COPYY*mb+COPYX*N) + 2*ATL_Cachelen);
/*
 *    If we cannot allocate enough space to copy the vectors, give up and
 *    call the simple loop-based implementation
 */
      if (!vp)
      {
         Mjoin(PATL,mvnk_smallN)(M, N, alpha0, A, lda, X, incX, beta0, Y, incY);
         return;
      }
      if (COPYX)
      {
         x = ATL_AlignPtr(vp);
         if (APPLYALPHAX && !ALPHA_IS_ONE)   /* fold alpha into the copy */
            Mjoin(PATL,cpsc)(N, alpha, X, incX, x, 1);
         else
            Mjoin(PATL,copy)(N, X, incX, x, 1);
         if (COPYY)
            y = x + (N SHIFT);
      }
      else /* if (COPYY)  known true by surrounding if */
         y = vp;
      if (COPYY)   /* y becomes a scratch accumulator: use beta=0, alpha=1 */
      {
         y = (ALIGNY2A) ? ATL_Align2Ptr(y, A) : ATL_AlignPtr(y);
         beta = Zero;
         alpha = one;
      }
   }
/*
 * Apply beta to Y if we aren't copying Y
 */
   if (!COPYY && !SCALAR_IS_ONE(beta0))
   {
      if (SCALAR_IS_ZERO(beta0))
         beta = Zero;
      else
      {
         Mjoin(PATL,scal)(M, beta0, Y, incY);
         beta = one;      /* beta already applied; kernel must not reapply */
      }
   }
   mvnk = (COPYY || SCALAR_IS_ZERO(beta)) ? mvnk_b0 : mvnk_b1;
   m = M;
   do   /* march down the rows of A in mb-sized blocks */
   {
      imb = Mmin(mb, m);
/*
 *    Call optimized kernel (can be restricted or general)
 */
      if (imb >= minM)
         mvnk(imb, Nm, A, lda, x, y);
      else
         Mjoin(PATL,mvnk_Mlt16)(imb, Nm, one, A, lda, x, 1, beta, y, 1);
/*
 *    Some kernels require N%NU=0; if so nr is remainder, do cleanup with axpy
 */
      if (nr)
         Mjoin(PATL,mvnk_smallN)(imb, nr, one, A+((size_t)lda)*(Nm SHIFT), lda,
                                 x+(Nm SHIFT), 1, one, y, 1);
/*
 *    If we are copying Y, we have formed A*x into y, so scale it by the
 *    original alpha, by using axpby: Y = beta0*Y + alpha0*y
 */
      if (COPYY)
         Mjoin(PATL,axpby)(imb, alpha0, y, 1, beta0, Y, incY);
      else
         y += imb SHIFT;
      A += imb SHIFT;
      Y += (imb*incY)SHIFT;
      m -= imb;
   }
   while(m);
   if (vp)
      free(vp);
}
Esempio n. 12
0
/*
 * Threaded GEMV driver: splits the N columns of A across P threads when the
 * problem is large enough to amortize thread startup, otherwise runs the
 * serial GEMV.  NOTE(review): the static ALb/ALe range (used to guess
 * whether A is still cache-resident from a prior call) is shared mutable
 * state and is not thread-safe if this routine races with itself.
 */
void Mjoin(PATL,tgemv)
   (const enum ATLAS_TRANS TA, ATL_CINT M, ATL_CINT N, const SCALAR alpha,
    const TYPE *A, ATL_CINT lda, const TYPE *X, ATL_CINT incX,
    const SCALAR beta, TYPE *Y, ATL_CINT incY)
{
   static size_t ALb=0, ALe=0;  /* [begin,end) of A from the previous call */
   size_t at = (size_t) A;
   ATL_INT n, P, ldaP;
   ATL_TGEMV_t pd;              /* problem descriptor handed to workers */
/*
 * quick return if possible.
 */
   if (M < 1 || N < 1)
      return;
   if (SCALAR_IS_ZERO(alpha))   /* No contrib from alpha*A*x */
   {
      ATL_CINT NY = (TA == AtlasTrans || TA == AtlasConjTrans) ? N : M;
      if (!SCALAR_IS_ONE(beta))
      {
         if (SCALAR_IS_ZERO(beta))
            Mjoin(PATL,zero)(NY, Y, incY);
         else
            Mjoin(PATL,scal)(NY, beta, Y, incY);
      }
      return;
   }
/*
 * flg bit 0: A overlaps the range touched last call (likely cache-hot);
 * bits 1/2: transpose / conjugate settings
 */
   pd.flg = (at >= ALb && at <= ALe) ? 1 : 0;
   ALb = (size_t)A;
   ALe = (size_t)(A+(M SHIFT));
   #ifdef TREAL
      pd.flg |= (TA == AtlasTrans || TA == AtlasConjTrans) ? 2 : 0;
   #else
      if (TA != AtlasNoTrans)
      {
         if (TA == AtlasConj)
            pd.flg |= 4;
         else if (TA == AtlasTrans)
            pd.flg |= 2;
         else /* if (TA == AtlasConjTrans) */
            pd.flg |= (2|4);
      }
   #endif
/*
 * Scale thread count by problem size relative to the cache, keep it even,
 * and cap it at the configured thread count
 */
   P = ATL_DivBySize(CacheEdge);
   P = ((size_t)M*N+P-1) / P;   /* add more procs only when cache is full */
   P = (P&1 && P > 1)?P+1 : P;  /* don't use odd P; it hurts alignment */
   P = Mmin(ATL_NTHREADS, P);
/*
 * NoTranspose/Conj currently forced serial (see the n > Mmax(P,8) special
 * casing below for when parallel gemvN would pay off)
 */
if (TA == AtlasNoTrans || TA == AtlasConj)
   P=1;
/*
 * Make sure we don't overflow 32-bit integer lda
 */
   ldaP = P * lda;
   while ((size_t)ldaP != ((size_t)lda)*P)
   {
      P--;
      ldaP = P * lda;
   }
   if (P > 1)
   {
      pd.M = M; pd.N = N; pd.incX = incX; pd.incY = incY; pd.lda = lda;
      pd.alpha = alpha; pd.beta = beta;
      pd.X = X; pd.Y = Y; pd.A = A;
      pd.P = P;
      n = N / P;        /* columns per worker; nr extras spread by workers */
      pd.n = n;
      pd.nr = N - n*P;
      if (pd.flg & 2)   /* Transpose case */
      {
         ATL_goparallel(P, Mjoin(PATL,DOMVTWORK_cols), &pd, NULL);
         return;
      }
/*
 *    For gemvN, everyone needs a private M-length y.  Don't do this unless
 *    we are sure the combine cost is likely dominated by the parallelism
 */
      else if (n > Mmax(P,8))
      {
         int vrank;
         const TYPE *a;
         TYPE *y, *y0;
         #ifdef TCPLX
            TYPE one[2] = {ATL_rone, ATL_rzero};
            TYPE zero[2] = {ATL_rzero, ATL_rzero};
         #endif

         y0 = y = malloc(P*(ATL_Cachelen+ATL_MulBySize(M)));
         ATL_assert(y);
         pd.Y = y;
         pd.incY = 1;
         #ifdef TREAL     /* workers accumulate raw A*x: alpha=1, beta=0 */
            pd.alpha = ATL_rone;
            pd.beta  = ATL_rzero;
         #else
            pd.alpha = one;
            pd.beta  = zero;
         #endif
         ATL_goparallel(P, Mjoin(PATL,DOMVNWORK_cols), &pd,
                        Mjoin(PATL,CombineMVN));
/*
 *       goparallel reduces all node's Ys to node 0's.  Extract his from the
 *       work array, and combine it with input array, applying both alpha
 *       and beta in the process
 */
         vrank = (!pd.nr || (pd.flg & 1)) ? 0 : pd.nr-1;
         a = A + (lda SHIFT)*vrank;
         y = ATL_Align2Ptr(y, a);
         Mjoin(PATL,axpby)(M, alpha, y, 1, beta, Y, incY);
         free(y0);
         return;
      }
   }
/*
 * If we haven't parallelized this thing, just do it serial
 */
   Mjoin(PATL,gemv)(TA, M, N, alpha, A, lda, X, incX, beta, Y, incY);
}
Esempio n. 13
0
void Mjoin( PATL, syr2k )
(
   const enum ATLAS_UPLO      UPLO,
   const enum ATLAS_TRANS     TRANS,
   const int                  N,
   const int                  K,
   const SCALAR               ALPHA,
   const TYPE                 * A,
   const int                  LDA,
   const TYPE                 * B,
   const int                  LDB,
   const SCALAR               BETA,
   TYPE                       * C,
   const int                  LDC
)
{
/*
 * Purpose
 * =======
 *
 * Mjoin( PATL, syr2k )  performs one of the @(syhe_comm) rank 2k operations
 *
 *    C := alpha * A * B' + alpha * B * A' + beta * C,
 *
 * or
 *
 *    C := alpha * A' * B + alpha * B' * A + beta * C,
 *
 * where alpha and beta are scalars, C is an n by n @(syhe_comm) matrix and
 * A and B are n by k matrices in the first case and k by n  matrices in
 * the second case.
 *
 * This is a  recursive  version of the  algorithm.  For a more detailed
 * description of  the arguments of this function, see the reference im-
 * plementation in the  ATLAS/src/blas/reference directory.
 *
 * ---------------------------------------------------------------------
 */
/*
 * .. Local Variables ..
 */
#ifdef TREAL
   TYPE                       alpha0 = (TYPE)(ALPHA), beta0 = (TYPE)(BETA);
   TYPE                       one = ATL_rone;
   TYPE                       * alpha, * beta;
#else
   TYPE                       one[2] = { ATL_rone, ATL_rzero };
   TYPE                       * alpha, * beta;
#endif
   RC3_FUN_SYR2K_T            ATL_rsyr2k;  /* recursive driver to invoke */
   RC3_SYR2K_T                type;        /* descriptor of leaf kernels */
/* ..
 * .. Executable Statements ..
 *
 */
/*
 * Quick return when C would be left unchanged
 */
   if( ( N == 0 ) ||
       ( ( SCALAR_IS_ZERO( ALPHA ) || ( K == 0 ) ) && SCALAR_IS_ONE( BETA ) ) )
      return;

/*
 * Rank-2k part contributes nothing: just scale C's triangle by beta
 */
   if( ( SCALAR_IS_ZERO( ALPHA ) ) || ( K == 0 ) )
   { Mjoin( PATL, trscal )( UPLO, N, N, BETA, C, LDC ); return; }
/*
 * Fill in the type descriptor; scalars are passed by address so the
 * recursive code can stay type-agnostic (real vs complex)
 */
#ifdef TREAL
   type.size = sizeof( TYPE );    type.one = (void *)(&one);
   alpha     = &alpha0;           beta     = &beta0;
#else
   type.size = sizeof( TYPE[2] ); type.one = (void *)one;
   alpha     = (TYPE *)(ALPHA);   beta     = (TYPE *)(BETA);
#endif

/*
 * Pick GEMM flavor, leaf SYR2K kernel, and recursive driver for the
 * (TRANS, UPLO) combination
 */
   if( TRANS == AtlasNoTrans )
   {
      type.Tgemm = Mjoin( PATL, gemmNT_RB );
      if( UPLO == AtlasUpper )
      { type.Tsyr2k = Mjoin( PATL, syr2kUN ); ATL_rsyr2k = ATL_rsyr2kUN; }
      else
      { type.Tsyr2k = Mjoin( PATL, syr2kLN ); ATL_rsyr2k = ATL_rsyr2kLN; }
   }
   else
   {
      type.Tgemm = Mjoin( PATL, gemmTN_RB );
      if( UPLO == AtlasUpper )
      { type.Tsyr2k = Mjoin( PATL, syr2kUT ); ATL_rsyr2k = ATL_rsyr2kUT; }
      else
      { type.Tsyr2k = Mjoin( PATL, syr2kLT ); ATL_rsyr2k = ATL_rsyr2kLT; }
   }

   ATL_rsyr2k( &type, N, K, (void *)(alpha), (void *)(A), LDA, (void *)(B),
               LDB, (void *)(beta), (void *)(C), LDC, SYR2K_NB );
/*
 * End of Mjoin( PATL, syr2k )
 */
}
Esempio n. 14
0
void Mjoin( PATL, hpmv )
(
   const enum ATLAS_UPLO      UPLO,
   const int                  N,
   const SCALAR               ALPHA,
   const TYPE                 * A,
   const TYPE                 * X,
   const int                  INCX,
   const SCALAR               BETA,
   TYPE                       * Y,
   const int                  INCY
)
{
/*
 * Purpose
 * =======
 *
 * Mjoin( PATL, hpmv ) performs the matrix-vector operation
 *
 *    y := alpha * A * x + beta * y,
 *
 * where alpha and beta are scalars, x and y are n-element vectors and A
 * is an n by n Hermitian matrix, supplied in packed form.
 *
 * This is a blocked version of the algorithm.  For a more detailed des-
 * cription of  the arguments of this function, see the reference imple-
 * mentation in the ATLAS/src/blas/reference directory.
 *
 * ---------------------------------------------------------------------
 */
/*
 * .. Local Variables ..
 */
/*
 * gpmv0: first touch of a y block (applies beta0); gpmv1: subsequent
 * touches (beta=1); gpmvN: the non-conjugated sweep of the same block
 */
   void                       (*gpmv0)( const int, const int, const SCALAR,
                              const TYPE *, const int, const TYPE *, const int,
                              const SCALAR, TYPE *, const int );
   void                       (*gpmv1)( const int, const int, const SCALAR,
                              const TYPE *, const int, const TYPE *, const int,
                              const SCALAR, TYPE *, const int );
   void                       (*gpmvN)( const int, const int, const SCALAR,
                              const TYPE *, const int, const TYPE *, const int,
                              const SCALAR, TYPE *, const int );
#ifdef TREAL
   TYPE                       alphaY, beta0;
#define one                   ATL_rone
#define zero                  ATL_rzero
#else
   const TYPE                 * alphaY, * beta0;
   const TYPE                 one [2] = { ATL_rone,  ATL_rzero },
                              zero[2] = { ATL_rzero, ATL_rzero };
#endif
   void                       * vx = NULL, * vy = NULL;  /* copy workspaces */
   TYPE                       * A0, * A1, * x, * x0, * x1, * y, * y00, * y0,
                              * y1;
   int                        incXY, incXY1, j, jb, lda, lda0, lda1, mb, mb1,
                              n, nb;
/* ..
 * .. Executable Statements ..
 *
 */
   if( N == 0 ) return;

/*
 * alpha == 0: the product contributes nothing; at most scale y by beta
 */
   if( SCALAR_IS_ZERO( ALPHA ) )
   {
      if( !( SCALAR_IS_ONE( BETA ) ) ) Mjoin( PATL, scal )( N, BETA, Y, INCY );
      return;
   }

/*
 * Copy x to unit-stride aligned storage when needed, folding alpha into
 * the copy so the inner kernels always run with alpha == 1
 */
   if( ( INCX != 1 ) || ( ( INCY == 1 ) && !( SCALAR_IS_ONE( ALPHA ) ) ) )
   {
      vx = (void *)malloc( ATL_Cachelen + ATL_MulBySize( N ) );
      ATL_assert( vx ); x = ATL_AlignPtr( vx );
      Mjoin( PATL, cpsc )( N, ALPHA, X, INCX, x, 1 );
      alphaY = one;
   }
   else { x = (TYPE *)(X); alphaY = ALPHA; }

/*
 * Accumulate into a unit-stride scratch y when needed; the final axpby at
 * the bottom then applies alphaY and BETA to the real Y
 */
   if( ( INCY != 1 ) || !( SCALAR_IS_ONE( alphaY ) ) )
   {
      vy = malloc( ATL_Cachelen + ATL_MulBySize( N ) );
      ATL_assert( vy ); y00 = y = ATL_AlignPtr( vy );
      beta0 = zero;
   }
   else { y00 = y = (TYPE *)(Y); beta0 = BETA; }

   ATL_GetPartSPMV( A, N, &mb, &nb );     /* tuned blockings for packed mv */

   mb1 = N - ( ( N - 1 ) / mb ) * mb; incXY1 = (nb SHIFT);

   if( UPLO == AtlasUpper )
   {
/*
 *    First touch of each y block must apply beta0; later touches use beta=1
 */
      if(      SCALAR_IS_ZERO( beta0 ) )
         gpmv0 = Mjoin( PATL, gpmvUC_a1_x1_b0_y1 );
      else if( SCALAR_IS_ONE ( beta0 ) )
         gpmv0 = Mjoin( PATL, gpmvUC_a1_x1_b1_y1 );
      else
         gpmv0 = Mjoin( PATL, gpmvUC_a1_x1_bX_y1 );
      gpmv1 = Mjoin( PATL, gpmvUC_a1_x1_b1_y1 );
      gpmvN = Mjoin( PATL, gpmvUN_a1_x1_b1_y1 );

      lda = 1; lda0 = lda; A0 = (TYPE *)(A); MUpnext( mb, A0, lda0 );
      incXY = (mb SHIFT); x0 = x + incXY; y0 = y + incXY;

/*
 *    Walk mb-row panels down the packed upper matrix: hpmvU does the
 *    diagonal block; the inner loop applies the off-diagonal block both
 *    ways (conjugate-transposed and straight)
 */
      for( n = N - mb; n > 0; n -= mb, x0 += incXY, x += incXY,
           y0 += incXY, y += incXY )
      {
         Mjoin( PATL, hpmvU )( mb, A, lda, x, beta0, y );

         for( j = 0, lda1 = lda0, A1 = A0 - (mb SHIFT), x1 = x0, y1 = y0; j < n;
              j += nb, x1 += incXY1, y1 += incXY1 )
         {
            jb = n - j; jb = Mmin( jb, nb );
            gpmv0( jb, mb, one, A1, lda1, x,  1, beta0, y1, 1 );
            gpmvN( mb, jb, one, A1, lda1, x1, 1, one,   y,  1 );
            MUpnext( jb, A1, lda1 ); A1 -= (jb SHIFT);
         }
         beta0 = one; gpmv0 = gpmv1; lda = lda0; A = A0; MUpnext( mb, A0, lda0 );
      }
      Mjoin( PATL, hpmvU )( mb1, A, lda, x, beta0, y );  /* last (ragged) block */
   }
   else
   {
      if(      SCALAR_IS_ZERO( beta0 ) )
         gpmv0 = Mjoin( PATL, gpmvLC_a1_x1_b0_y1 );
      else if( SCALAR_IS_ONE ( beta0 ) )
         gpmv0 = Mjoin( PATL, gpmvLC_a1_x1_b1_y1 );
      else
         gpmv0 = Mjoin( PATL, gpmvLC_a1_x1_bX_y1 );
      gpmv1 = Mjoin( PATL, gpmvLC_a1_x1_b1_y1 );
      gpmvN = Mjoin( PATL, gpmvLN_a1_x1_b1_y1 );

      lda = N; lda0 = lda; A0 = (TYPE *)(A); MLpnext( N, A, lda );
      incXY = (mb SHIFT); x0 = x; y0 = y;

/*
 *    Lower-packed case walks the panels bottom-up (x/y start at the tail)
 */
      for( n  = N - mb, x += ((N-mb) SHIFT), y += ((N-mb) SHIFT); n > 0;
           n -= mb, x -= incXY, y -= incXY )
      {
         MLpprev( mb, A, lda );
         Mjoin( PATL, hpmvL )( mb, A, lda, x, beta0, y );

         for( j = 0, lda1 = lda0, A1 = A0 + (n SHIFT), x1 = x0, y1 = y0; j < n;
              j += nb, x1 += incXY1, y1 += incXY1 )
         {
            jb = n - j; jb = Mmin( jb, nb );
            gpmv0( jb, mb, one, A1, lda1, x,  1, beta0, y1, 1 );
            gpmvN( mb, jb, one, A1, lda1, x1, 1, one,   y,  1 );
            MLpnext( jb, A1, lda1 ); A1 -= (jb SHIFT);
         }
         beta0 = one; gpmv0 = gpmv1;
      }
      Mjoin( PATL, hpmvL )( mb1, A0, lda0, x0, beta0, y0 );
   }

   if( vx ) free( vx );
/*
 * If y was accumulated in scratch, combine it into the real Y:
 * Y := alphaY*y + BETA*Y
 */
   if( vy )
   { Mjoin( PATL, axpby )( N, alphaY, y00, 1, BETA, Y, INCY ); free( vy ); }
/*
 * End of Mjoin( PATL, hpmv )
 */
}
Esempio n. 15
0
int Mjoin(PATL,mmIJK)(const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
                      const int M, const int N0, const int K,
                      const SCALAR alpha, const TYPE *A, const int lda0,
                      const TYPE *B, const int ldb0, const SCALAR beta,
                      TYPE *C, const int ldc0)
/*
 * Outer three loops of blocked matmul, IJK loop order.  B is copied into
 * block-major workspace (all of it when memory allows, otherwise in
 * column-panels of n columns); A is recopied per panel via A2blk.
 * Returns 0 on success, 1 to tell the caller to fall back to a no-copy
 * algorithm, and -1 on allocation failure.
 */
{
    size_t incA, incB, incC;
    const size_t lda=lda0, ldb=ldb0, ldc=ldc0;
    const size_t incK = ATL_MulByNB((size_t)K);  /* elts in one NB-wide K-panel */
    int N = N0;
    int nMb, nNb, nKb, ib, jb, kb, jb2, h, i, j, k, n;
    void *vA=NULL, *vC=NULL;
    TYPE *pA, *pB, *pC;
    MAT2BLK A2blk, B2blk;
    PUTBLK putblk;
    NBMM0 NBmm0;

    nMb = ATL_DivByNB(M);       /* # of full NB blocks in each dimension */
    nNb = ATL_DivByNB(N);
    nKb = ATL_DivByNB(K);
    ib = M - ATL_MulByNB(nMb);  /* partial-block remainders */
    jb = N - ATL_MulByNB(nNb);
    kb = K - ATL_MulByNB(nKb);

    /*
     * If K sufficiently large, write to temporary C as safety measure;  otherwise
     * write directly to C
     */
    if (nKb < 12)
    {
        putblk = NULL;          /* NULL putblk => kernels write C directly */
        pC = C;
        if ( SCALAR_IS_ONE(beta) ) NBmm0 = NBmm_b1;
        else if ( SCALAR_IS_ZERO(beta) ) NBmm0 = NBmm_b0;
        else NBmm0 = NBmm_bX;
    }
    else
    {
        NBmm0 = NBmm_b0;        /* accumulate in workspace, apply beta on put */
        vC = malloc(ATL_Cachelen + ATL_MulBySize(NBNB));
        if (!vC) return(-1);
        pC = ATL_AlignPtr(vC);
        if ( SCALAR_IS_ONE(beta) ) putblk = Mjoin(PATL,putblk_b1);
        else if ( SCALAR_IS_ZERO(beta) ) putblk = Mjoin(PATL,putblk_b0);
        else if ( SCALAR_IS_NONE(beta) ) putblk = Mjoin(PATL,putblk_bn1);
        else putblk = Mjoin(PATL,putblk_bX);
    }
    /*
     * Special case if we don't need to copy one or more input matrix
     */
    if (K == NB && TB == AtlasNoTrans && ldb == NB && ATL_DataIsMinAligned(B))
    {
        if (lda == NB && TA == AtlasTrans && SCALAR_IS_ONE(alpha) &&
                ATL_DataIsMinAligned(A))
        {
            /* A already in usable format: signal no-copy with NULL A/A2blk */
            pA = (TYPE *) A;
            A = NULL;
            A2blk = NULL;
            incA = 0;
        }
        else
        {
            vA = malloc(ATL_Cachelen + ATL_MulBySize(incK));
            if (!vA)
            {
                free(vC);   /* free(NULL) is a no-op when writing C directly */
                return(-1);
            }
            pA = ATL_AlignPtr(vA);
            if (TA == AtlasNoTrans)
            {
                incA = NB;
                if ( SCALAR_IS_ONE(alpha) ) A2blk = Mjoin(PATL,row2blkT_a1);
                else A2blk = Mjoin(PATL,row2blkT_aX);
            }
            else
            {
                incA = ATL_MulByNB(lda);
                if ( SCALAR_IS_ONE(alpha) ) A2blk = Mjoin(PATL,col2blk_a1);
                else A2blk = Mjoin(PATL,col2blk_aX);
            }
        }
        Mjoin(PATL,mmIJK2)(K, nMb, nNb, nKb, ib, jb, kb, alpha, A, lda, pA,
                           incA, A2blk, B, beta, C, ldc, pC, putblk, NBmm0);
        if (vA) free(vA);
        if (vC) free(vC);
        return(0);
    }
    /* Workspace: all of B (N*K) plus one K-panel of A (incK) */
    i = ATL_Cachelen + ATL_MulBySize(N*K + incK);
    if (i <= ATL_MaxMalloc) vA = malloc(i);
    if (!vA)
    {
        /* Cannot hold all of B: if neither input needs transposition for
         * access, let caller use a no-copy algorithm instead */
        if (TA == AtlasNoTrans && TB == AtlasNoTrans)
        {
            if (vC) free(vC);
            return(1);
        }
        if (jb) n = nNb + 1;
        else n = nNb;
        /* Try successively smaller column-panel sizes (1/2, 1/3, ... of N) */
        for (j=2; !vA; j++)
        {
            k = n / j;
            if (k < 1) break;
            if (k*j < n) k++;
            h = ATL_Cachelen + ATL_MulBySize((k+1)*incK);
            if (h <= ATL_MaxMalloc) vA = malloc(h);
        }
        if (!vA)
        {
            if (vC) free(vC);
            return(-1);
        }
        n = ATL_MulByNB(k);  /* panel width in columns; no partial block yet */
        jb2 = 0;
    }
    else
    {
        jb2 = jb;            /* one panel covers all of N */
        k = nNb;
        n = N;
    }
    pA = ATL_AlignPtr(vA);
    if (TB == AtlasNoTrans)
    {
        incB = ldb*n;
        if ( SCALAR_IS_ONE(alpha) ) B2blk = Mjoin(PATL,col2blk2_a1);
        else B2blk = Mjoin(PATL,col2blk2_aX);
    }
    else
    {
        incB = n;
        if ( SCALAR_IS_ONE(alpha) ) B2blk = Mjoin(PATL,row2blkT2_a1);
        else B2blk = Mjoin(PATL,row2blkT2_aX);
    }
    if (TA == AtlasNoTrans)
    {
        incA = NB;
        A2blk = Mjoin(PATL,row2blkT_a1);
    }
    else
    {
        incA = ATL_MulByNB(lda);
        A2blk = Mjoin(PATL,col2blk_a1);
    }
    incC = ldc*n;
    pB = pA + incK;          /* B's block-copy space follows A's panel */

    do                       /* loop over column panels of B/C */
    {
        if (TB == AtlasNoTrans) B2blk(K, n, B, ldb, pB, alpha);
        else B2blk(n, K, B, ldb, pB, alpha);
        Mjoin(PATL,mmIJK2)(K, nMb, k, nKb, ib, jb2, kb, alpha, A, lda, pA,
                           incA, A2blk, pB, beta, C, ldc, pC, putblk, NBmm0);
        N -= n;
        nNb -= k;
        if (N < n)           /* last (possibly partial) panel */
        {
            jb2 = jb;
            n = N;
            k = nNb;
        }
        C += incC;
        B += incB;
        if (!putblk) pC = C;
    }
    while (N);

    if (vC) free(vC);
    free(vA);
    return(0);
}
Esempio n. 16
0
void Mjoin(PATL,pputblk_diag)
   (const int M, const int N, const TYPE *V, const enum ATLAS_UPLO UC,
    TYPE *C, int ldc, int ldcinc, const SCALAR alpha, const SCALAR beta)
/*
 * Writes only the Upper (r <= c) or Lower (r >= c) triangle of the dense
 * M x N block V into C, applying C = beta*C + alpha*V elementwise.  The
 * column stride ldc grows by ldcinc per column to track packed storage.
 * Special-cases alpha,beta in {0, 1, -1} to avoid needless multiplies.
 */
{
   int r, c;

   if (UC == AtlasUpper)
   {
      /* Upper: column c holds rows 0..c; advance C by the pre-update ldc,
       * then bump ldc for the next (longer) column. */
      if (SCALAR_IS_ZERO(beta))
      {
         if (SCALAR_IS_ONE(alpha))
         {
            for (c=0; c < N; c++, C += ldc, V += M, ldc += ldcinc)
               for (r=0; r <= c; r++) C[r] = V[r];
         }
         else if (SCALAR_IS_NONE(alpha))
         {
            for (c=0; c < N; c++, C += ldc, V += M, ldc += ldcinc)
               for (r=0; r <= c; r++) C[r] = -V[r];
         }
         else
         {
            for (c=0; c < N; c++, C += ldc, V += M, ldc += ldcinc)
               for (r=0; r <= c; r++) C[r] = alpha * V[r];
         }
      }
      else if (SCALAR_IS_ONE(beta))
      {
         if (SCALAR_IS_ONE(alpha))
         {
            for (c=0; c < N; c++, C += ldc, V += M, ldc += ldcinc)
               for (r=0; r <= c; r++) C[r] += V[r];
         }
         else if (SCALAR_IS_NONE(alpha))
         {
            for (c=0; c < N; c++, C += ldc, V += M, ldc += ldcinc)
               for (r=0; r <= c; r++) C[r] -= V[r];
         }
         else
         {
            for (c=0; c < N; c++, C += ldc, V += M, ldc += ldcinc)
               for (r=0; r <= c; r++) C[r] += alpha * V[r];
         }
      }
      else
      {
         if (SCALAR_IS_ONE(alpha))
         {
            for (c=0; c < N; c++, C += ldc, V += M, ldc += ldcinc)
               for (r=0; r <= c; r++) C[r] = beta*C[r] + V[r];
         }
         else if (SCALAR_IS_NONE(alpha))
         {
            for (c=0; c < N; c++, C += ldc, V += M, ldc += ldcinc)
               for (r=0; r <= c; r++) C[r] = beta*C[r] - V[r];
         }
         else
         {
            for (c=0; c < N; c++, C += ldc, V += M, ldc += ldcinc)
               for (r=0; r <= c; r++) C[r] = beta*C[r] + alpha * V[r];
         }
      }
   }
   else
   {
      /* Lower: column c holds rows c..M-1; ldc is bumped BEFORE the copy,
       * so C advances by the already-updated stride. */
      if (SCALAR_IS_ZERO(beta))
      {
         if (SCALAR_IS_NONE(alpha))
         {
            for (c=0; c < N; c++, C += ldc, V += M)
            {
               ldc += ldcinc;
               for (r=c; r < M; r++) C[r] = -V[r];
            }
         }
         else if (SCALAR_IS_ONE(alpha))
         {
            for (c=0; c < N; c++, C += ldc, V += M)
            {
               ldc += ldcinc;
               for (r=c; r < M; r++) C[r] = V[r];
            }
         }
         else
         {
            for (c=0; c < N; c++, C += ldc, V += M)
            {
               ldc += ldcinc;
               for (r=c; r < M; r++) C[r] = alpha * V[r];
            }
         }
      }
      else if (SCALAR_IS_ONE(beta))
      {
         if (SCALAR_IS_NONE(alpha))
         {
            for (c=0; c < N; c++, C += ldc, V += M)
            {
               ldc += ldcinc;
               for (r=c; r < M; r++) C[r] -= V[r];
            }
         }
         else if (SCALAR_IS_ONE(alpha))
         {
            for (c=0; c < N; c++, C += ldc, V += M)
            {
               ldc += ldcinc;
               for (r=c; r < M; r++) C[r] += V[r];
            }
         }
         else
         {
            for (c=0; c < N; c++, C += ldc, V += M)
            {
               ldc += ldcinc;
               for (r=c; r < M; r++) C[r] += alpha * V[r];
            }
         }
      }
      else
      {
         if (SCALAR_IS_NONE(alpha))
         {
            for (c=0; c < N; c++, C += ldc, V += M)
            {
               ldc += ldcinc;
               for (r=c; r < M; r++) C[r] = beta*C[r] - V[r];
            }
         }
         else if (SCALAR_IS_ONE(alpha))
         {
            for (c=0; c < N; c++, C += ldc, V += M)
            {
               ldc += ldcinc;
               for (r=c; r < M; r++) C[r] = beta*C[r] + V[r];
            }
         }
         else
         {
            for (c=0; c < N; c++, C += ldc, V += M)
            {
               ldc += ldcinc;
               for (r=c; r < M; r++) C[r] = beta*C[r] + alpha * V[r];
            }
         }
      }
   }
}
Esempio n. 17
0
void Mjoin( PATL, hpmvU )
(
   const int                  N,
   const TYPE                 * A,
   const int                  LDA,
   const TYPE                 * X,
   const SCALAR               BETA,
   TYPE                       * Y
)
{
/*
 * Purpose
 * =======
 *
 * Mjoin( PATL, hpmvU ) performs the matrix-vector operation
 *
 *    y := alpha * A * x + beta * y,
 *
 * where alpha and beta are scalars, x and y are n-element vectors and A
 * is an n by n Hermitian matrix, supplied in packed form.
 *
 * This is a  recursive  version of the  algorithm.  For a more detailed
 * description of  the arguments of this function, see the reference im-
 * plementation in the  ATLAS/src/blas/reference directory.
 *
 * ---------------------------------------------------------------------
 */
/*
 * .. Local Variables ..
 */
   void                       (*gpmvT)( const int, const int, const SCALAR,
                              const TYPE *, const int, const TYPE *, const int,
                              const SCALAR, TYPE *, const int );
   void                       (*gpmvN)( const int, const int, const SCALAR,
                              const TYPE *, const int, const TYPE *, const int,
                              const SCALAR, TYPE *, const int );
#ifdef TREAL
#define    one                ATL_rone
   TYPE                       beta0;
#else
   const TYPE                 * beta0, one[2] = { ATL_rone, ATL_rzero };
#endif
   TYPE                       * A0, * x0, * y0;
   int                        j, jb, jbs, lda = LDA, m, mb, nb;
/* ..
 * .. Executable Statements ..
 *
 */
/*
 * Query the blocking factors (mb, nb) to use for this packed SpMV.
 */
   ATL_GetPartSPMV( A, N, &mb, &nb );

/*
 * Select general packed matrix-vector kernels specialized for BETA.  Once
 * a stripe of y has been written to, subsequent updates must use beta=1,
 * which is handled by resetting beta0/gpmvN/gpmvT inside the loop below.
 */
   beta0 = BETA;
   if(      SCALAR_IS_ZERO( beta0 ) )
   {
      gpmvN = Mjoin( PATL, gpmvUN_a1_x1_b0_y1 );
      gpmvT = Mjoin( PATL, gpmvUC_a1_x1_b0_y1 );
   }
   else if( SCALAR_IS_ONE ( beta0 ) )
   {
      gpmvN = Mjoin( PATL, gpmvUN_a1_x1_b1_y1 );
      gpmvT = Mjoin( PATL, gpmvUC_a1_x1_b1_y1 );
   }
   else
   {
      gpmvN = Mjoin( PATL, gpmvUN_a1_x1_bX_y1 );
      gpmvT = Mjoin( PATL, gpmvUC_a1_x1_bX_y1 );
   }

/*
 * Advance A past the whole packed upper matrix, and X/Y past their ends;
 * the loop then walks the diagonal blocks backwards (MUpprev backs A and
 * lda up by one jb-wide block each iteration).
 */
   MUpnext( N, A, lda );
   x0 = (TYPE *)(X); X += (N SHIFT); y0 = (TYPE *)(Y); Y += (N SHIFT);

   for( j = 0; j < N; j += nb )
   {
      jb = N - j; jb = Mmin( jb, nb ); jbs = (jb SHIFT);
      MUpprev( jb, A, lda ); X -= jbs; Y -= jbs;

/*
 *    m = number of rows strictly above the current diagonal block; if any,
 *    apply the off-diagonal block both ways (A^H * x0 into Y, A * X into y0).
 */
      if( ( m = N-j-jb ) != 0 )
      {
         A0 = (TYPE *)(A) - (m SHIFT);
         gpmvT( jb, m, one, A0, lda, x0, 1, beta0, Y,  1 );
         gpmvN( m, jb, one, A0, lda, X,  1, beta0, y0, 1 ); beta0 = one;
      }
/*
 *    Diagonal block handled by the reference Hermitian kernel; thereafter
 *    everything accumulates (beta = 1).
 */
      Mjoin( PATL, refhpmvU )( jb, one, A, lda, X, 1, beta0, Y, 1 );
      gpmvN = Mjoin( PATL, gpmvUN_a1_x1_b1_y1 ); beta0 = one;
      gpmvT = Mjoin( PATL, gpmvUC_a1_x1_b1_y1 );
   }
/*
 * End of Mjoin( PATL, hpmvU )
 */
}
Esempio n. 18
0
int Mjoin(PATL,mmJIK)(const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
                      const int M0, const int N, const int K,
                      const SCALAR alpha, const TYPE *A, const int lda0,
                      const TYPE *B, const int ldb0, const SCALAR beta,
                      TYPE *C, const int ldc0)
/*
 * Outer three loops for matmul with outer loop over columns of B
 *
 * Complex-arithmetic driver (alpha/beta are [real,imag] pairs).  A is
 * copied to block-major workspace one M-panel at a time; if the full
 * workspace cannot be allocated, M is split into smaller panels.
 * Returns 0 on success, 1 to tell the caller to use a no-copy algorithm,
 * and -1 on allocation failure.
 */
{
    int M = M0;
    int nMb, nNb, nKb, ib, jb, kb, ib2, h, i, j, k, m, n;
    const size_t lda=lda0, ldb=ldb0, ldc=ldc0;
    size_t incA, incB, incC;
    int AlphaIsOne;
    const size_t incK = ATL_MulByNB((size_t)K);  /* elts in one NB-wide K-panel */
    void *vB=NULL, *vC=NULL;
    TYPE *pA, *pB, *pC;
    const TYPE one[2] = {1.0,0.0}, zero[2] = {0.0,0.0};
    MAT2BLK A2blk, B2blk;
    MATSCAL gescal;
    NBMM0 NBmm0;

    nMb = ATL_DivByNB(M);       /* # of full NB blocks in each dimension */
    nNb = ATL_DivByNB(N);
    nKb = ATL_DivByNB(K);
    ib = M - ATL_MulByNB(nMb);  /* partial-block remainders */
    jb = N - ATL_MulByNB(nNb);
    kb = K - ATL_MulByNB(nKb);

    /*
     * If beta has zero imaginary part, a real-beta kernel can apply it
     * directly; otherwise pre-scale C by beta (gescal) and accumulate.
     */
    pC = C;
    if (beta[1] == ATL_rzero)
    {
        gescal = NULL;
        if (*beta == ATL_rone) NBmm0 = Mjoin(PATL,CNBmm_b1);
        else if (*beta == ATL_rzero) NBmm0 = Mjoin(PATL,CNBmm_b0);
        else NBmm0 = Mjoin(PATL,CNBmm_bX);
    }
    else
    {
        NBmm0 = Mjoin(PATL,CNBmm_b1);
        gescal = Mjoin(PATL,gescal_bX);
    }
    /*
     * Special case for when what we are really doing is
     *    C <- beta*C + alpha * A * A'   or   C <- beta*C + alpha * A' * A
     */
    if ( A == B && M == N && TA != TB && (SCALAR_IS_ONE(alpha) || M <= NB)
            && TA != AtlasConjTrans && TB != AtlasConjTrans && lda == ldb )
    {
        AlphaIsOne = SCALAR_IS_ONE(alpha);
        i = ATL_MulBySize(M * K);
        /* may need extra M*N workspace to hold alpha*(A*A') before beta-merge */
        if (!AlphaIsOne && pC == C && !SCALAR_IS_ZERO(beta))
            i += ATL_MulBySize(M*N);
        if (i <= ATL_MaxMalloc) vB = malloc(i + ATL_Cachelen);
        if (vB)
        {
            pA = ATL_AlignPtr(vB);
            if (TA == AtlasNoTrans)
                Mjoin(PATL,row2blkT2_a1)(M, K, A, lda, pA, alpha);
            else Mjoin(PATL,col2blk2_a1)(K, M, A, lda, pA, alpha);
            /*
             *       Can't write directly to C if alpha is not one
             */
            if (!AlphaIsOne)
            {
                if (SCALAR_IS_ZERO(beta)) h = ldc;  /* C itself is scratch */
                else if (pC == C)
                {
                    pC = pA + (((size_t)M) * K SHIFT);  /* extra workspace */
                    h = M;
                }
                else h = NB;
                /* compute A*A' with beta=0, then scale by alpha */
                Mjoin(PATL,mmJIK2)(K, nMb, nNb, nKb, ib, jb, kb, one, pA, NULL,
                                   ldb, pA, 0, NULL, zero, pC, h,
                                   Mjoin(PATL,gescal_b0), Mjoin(PATL,CNBmm_b0));

                if (alpha[1] == ATL_rzero)
                    Mjoin(PATL,gescal_bXi0)(M, N, alpha, pC, h);
                else Mjoin(PATL,gescal_bX)(M, N, alpha, pC, h);

                /* merge workspace back into C with the requested beta */
                if (C != pC)
                {
                    if (beta[1] == ATL_rzero)
                    {
                        if (*beta == ATL_rone)
                            Mjoin(PATL,putblk_b1)(M, N, pC, C, ldc, beta);
                        else if (*beta == ATL_rnone)
                            Mjoin(PATL,putblk_bn1)(M, N, pC, C, ldc, beta);
                        else if (*beta == ATL_rzero)
                            Mjoin(PATL,putblk_b0)(M, N, pC, C, ldc, beta);
                        else Mjoin(PATL,putblk_bXi0)(M, N, pC, C, ldc, beta);
                    }
                    else Mjoin(PATL,putblk_bX)(M, N, pC, C, ldc, beta);
                }
            }
            else Mjoin(PATL,mmJIK2)(K, nMb, nNb, nKb, ib, jb, kb, alpha, pA, NULL,
                                        ldb, pA, 0, NULL, beta, C, ldc, gescal, NBmm0);
            free(vB);
            if (vC) free(vC);
            return(0);
        }
    }
    /* Workspace: all of A (M*K) plus one NB-wide K-panel of B (incK) */
    i = ATL_Cachelen + ATL_MulBySize(M*K + incK);
    if (i <= ATL_MaxMalloc) vB = malloc(i);
    if (!vB)
    {
        /* Can't hold all of A: if both inputs are access-friendly without
         * copying, let caller fall back to a no-copy algorithm */
        if (TA != AtlasNoTrans && TB != AtlasNoTrans) return(1);
        if (ib) n = nMb + 1;
        else n = nMb;
        /* Try successively smaller M-panels (1/2, 1/3, ... of the blocks) */
        for (j=2; !vB; j++)
        {
            k = n / j;
            if (k < 1) break;
            if (k*j < n) k++;
            h = ATL_Cachelen + ATL_MulBySize((k+1)*incK);
            if (h <= ATL_MaxMalloc) vB = malloc(h);
        }
        if (!vB) return(-1);
        n = k;               /* blocks per panel */
        m = ATL_MulByNB(n);  /* rows per panel; no partial block yet */
        ib2 = 0;
    }
    else
    {
        n = nMb;             /* single panel covers all of M */
        m = M;
        ib2 = ib;
    }
    pB = ATL_AlignPtr(vB);
    /* Pick A copy routine by transpose setting and alpha (applied in copy) */
    if (TA == AtlasNoTrans)
    {
        incA = m SHIFT;
        if (alpha[1] == ATL_rzero)
        {
            if (*alpha == ATL_rone) A2blk = Mjoin(PATL,row2blkT2_a1);
            else A2blk = Mjoin(PATL,row2blkT2_aXi0);
        }
        else A2blk = Mjoin(PATL,row2blkT2_aX);
    }
    else if (TA == AtlasConjTrans)
    {
        incA = lda*m SHIFT;
        if (alpha[1] == ATL_rzero)
        {
            if (*alpha == ATL_rone) A2blk = Mjoin(PATL,col2blkConj2_a1);
            else A2blk = Mjoin(PATL,col2blkConj2_aXi0);
        }
        else A2blk = Mjoin(PATL,col2blkConj2_aX);
    }
    else
    {
        incA = lda*m SHIFT;
        if (alpha[1] == ATL_rzero)
        {
            if (*alpha == ATL_rone) A2blk = Mjoin(PATL,col2blk2_a1);
            else A2blk = Mjoin(PATL,col2blk2_aXi0);
        }
        else A2blk = Mjoin(PATL,col2blk2_aX);
    }
    if (TB == AtlasNoTrans)
    {
        incB = ATL_MulByNB(ldb) SHIFT;
        B2blk = Mjoin(PATL,col2blk_a1);
    }
    else if (TB == AtlasConjTrans)
    {
        incB = NB2;
        B2blk = Mjoin(PATL,row2blkC_a1);
    }
    else
    {
        incB = NB2;
        B2blk = Mjoin(PATL,row2blkT_a1);
    }
    incC = m SHIFT;

    pA = pB + (incK SHIFT);  /* A's panel copy follows B's block space */
    do                       /* loop over row panels of A/C */
    {
        if (TA == AtlasNoTrans) A2blk(m, K, A, lda, pA, alpha);
        else A2blk(K, m, A, lda, pA, alpha);
        Mjoin(PATL,mmJIK2)(K, n, nNb, nKb, ib2, jb, kb, alpha, pA, B, ldb, pB,
                           incB, B2blk, beta, C, ldc, gescal, NBmm0);
        M -= m;
        nMb -= n;
        if (M <= m)          /* last (possibly partial) panel */
        {
            ib2 = ib;
            m = M;
            n = nMb;
        }
        C += incC;
        A += incA;
    }
    while (M);
    free(vB);
    if (vC) free(vC);        /* vC is never allocated here; kept for safety */
    return(0);
}
Esempio n. 19
0
int Mjoin(PATL,mmJITcp)(const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
                        const int M0, const int N, const int K,
                        const SCALAR alpha, const TYPE *A, const int lda,
                        const TYPE *B, const int ldb, const SCALAR beta,
                        TYPE *C, const int ldc)
/*
 * Copy matmul algorithm, copies A and B on-the-fly
 * If M < 0, allocates only (MB+NB)*KB workspace
 *
 * Returns 0 on success, -1 if workspace cannot be allocated.  When a
 * panel fits in workspace it is copied once and reused across the other
 * loop (signalled to mmK by passing NULL for the original operand).
 */
{
   void *v=NULL;
   const TYPE *a=A;
   TYPE *pA, *pB, *pB0;
   MAT2BLK2 A2blk, B2blk;
   NBMM0 NBmm0, NBmm1, pNBmm0;
   const int M = (M0 >= 0) ? M0 : -M0;
   int nkblks, nmblks, nnblks, mr, nr, kr, KR, bigK, h, i, j, ZEROC;
   size_t incAk, incBk, incAm, incBn, incAW, incAWp, incBW, incBWp, incW;

/*
 * If both M and N <= NB, and one of them is not full, call BPP, which
 * can sometimes avoid doing cleanup forall cases
 */
   if (M <= MB && N <= NB && (M != MB || N != NB))
      return(Mjoin(PATL,mmBPP)(TA, TB, M, N, K, alpha, A, lda, B, ldb,
                               beta, C, ldc));
/*
 * If these workspace increments are 0, we do JIT NBxNB copies instead of
 * copying entire array/panel.  Don't copy mat if you can't reuse it.
 */
   if (M0 > 0)
   {
      incAW = (N > NB) ? KB*MB : 0;
      incBW = (M > NB) ? KB*NB : 0;
   }
   else /* allocate in minimal space */
      incAW = incBW = 0;
   nmblks = M/MB;                /* full blocks in each dimension */
   nnblks = N/NB;
   nkblks = K/KB;
   mr = M - nmblks*MB;           /* partial-block remainders */
   nr = N - nnblks*NB;
   kr = K - nkblks*KB;
/*
 * K-loop is special, in that we don't call user cleanup, must explicitly zero,
 * and K-cleanup is typically slower even for generated kernels.  Therefore,
 * allow extra leaway for doing extra flops.  Note error is unaffected by
 * any of these extra flops: K-loop has elts zeroed, and multiplying zeros
 * and adding in zeros doesn't add to error
 */
   KR = (kr && kr+4 >= KB) ? KB : kr;
   bigK = nkblks*KB+KR;          /* K rounded per the padding rule above */
/*
 * Workspace: full A row-panel (MB x bigK) and all column panels of B when
 * reusable; otherwise a single MB x KB and KB x NB block each.
 */
   if (incAW)
   {
      i = MB*bigK;
      incAWp = KB*mr;
   }
   else
   {
      i = MB*KB;
      incAWp = 0;
   }
   if (incBW)
   {
      incBWp = KB*nr;
      incW = bigK*NB;
      i += N*bigK;
   }
   else
   {
      incBWp = incW = 0;
      i += NB*KB;
   }
   i *= sizeof(TYPE);
   if (i <= ATL_MaxMalloc || !(incAW | incBW))
      v = malloc(ATL_Cachelen+i);
   if (!v) return(-1);
   pA = ATL_AlignPtr(v);
   pB0 = pA + (incAW ? bigK*MB : KB*MB);
/*
 * Source strides for stepping A by one K-block (incAk) or M-block (incAm),
 * and B by one K-block (incBk) or N-block (incBn), per transpose setting.
 */
   if (TA == AtlasNoTrans)
   {
      A2blk = Mjoin(PATL,gemoveT);
      incAk = lda*KB;
      incAm = MB;
   }
   else
   {
      A2blk = Mjoin(PATL,gemove);
      incAk = KB;
      incAm = MB*lda;
   }
   if (TB == AtlasNoTrans)
   {
      B2blk = Mjoin(PATL,gemove);
      incBk = KB;
      incBn = NB*ldb;
   }
   else
   {
      B2blk = Mjoin(PATL,gemoveT);
      incBk = ldb*KB;
      incBn = NB;
   }
/*
 * See what kernel we're calling
 */
   if ( SCALAR_IS_ONE(beta) )
   {
      NBmm0 = NBmm_b1;
      pNBmm0 = Mjoin(PATL,pNBmm_b1);
   }
   else if ( SCALAR_IS_ZERO(beta) )
   {
      NBmm0 = NBmm_b0;
      pNBmm0 = Mjoin(PATL,pNBmm_b0);
   }
   else
   {
      NBmm0 = NBmm_bX;
      pNBmm0 = Mjoin(PATL,pNBmm_bX);
   }
   KR = (KR == KB) ? KB : 0;     /* KR now flags "K-cleanup is a full block" */
   ZEROC = !KR && SCALAR_IS_ZERO(beta);

   for (i=0; i < nmblks; i++)    /* loop over full row panels of A/C */
   {
      a = A+i*incAm;
      pB = pB0;       /* foreach row-panel of A, start at B's copy space */
      for (j=nnblks; j; j--)     /* full column panels of B/C */
      {
         Mjoin(PATL,mmK)(MB, MB, NB, NB, nkblks, kr, KR, ATL_rone, alpha, beta,
                         a, lda, incAk, pA, incAW, B, ldb, incBk, pB, incBW,
                         C, ldc, A2blk, B2blk, NBmm0, NBmm_b1);
         B += incBn;             /* copy next col panel of B */
         pB += incW;             /* to next col panel of pB  */
         a = (incAW ? NULL : a); /* reuse row-panel of A if copied */
         C += ldc*NB;
      }
      if (nr)                    /* partial column panel */
      {
         if (ZEROC)              /* must zero C explicitly for K-cleanup */
            Mjoin(PATL,gezero)(MB, nr, C, ldc);
         Mjoin(PATL,mmK)(MB, MB, nr, nr, nkblks, kr, KR, ATL_rone, alpha, beta,
                         a, lda, incAk, pA, incAW, B, ldb, incBk, pB, incBWp,
                         C, ldc, A2blk, B2blk, pNBmm0, Mjoin(PATL,pNBmm_b1));
      }
      C += MB - nnblks*ldc*NB;   /* next row panel, back to first column */
      if (incBW)
      {
         B = NULL;              /* finished copying B */
         incBn = 0;
      }
      else
         B -= nnblks*incBn;
   }
   if (mr)                       /* partial row panel of A/C */
   {
      a = A + nmblks*incAm;
      pB = pB0;
      if ( SCALAR_IS_ONE(beta) ) NBmm0 = Mjoin(PATL,pMBmm_b1);
      else if ( SCALAR_IS_ZERO(beta) ) NBmm0 = Mjoin(PATL,pMBmm_b0);
      else NBmm0 = Mjoin(PATL,pMBmm_bX);
      for (j=nnblks; j; j--)
      {
         Mjoin(PATL,mmK)(mr, mr, NB, NB, nkblks, kr, KR, ATL_rone, alpha, beta,
                         a, lda, incAk, pA, incAWp, B, ldb, incBk, pB, incBW,
                         C, ldc, A2blk, B2blk, NBmm0, Mjoin(PATL,pMBmm_b1));
         B += incBn;              /* copy next col panel of B */
         pB += incW;              /* to next col panel of pB  */
         a = (incAW ? NULL : a);  /* reuse row-panel of A if copied */
         C += ldc*NB;
      }
      if (nr)                    /* bottom-right corner: both dims partial */
      {
         if ( SCALAR_IS_ZERO(beta) )
            Mjoin(PATL,gezero)(mr, nr, C, ldc);
         Mjoin(PATL,mmK)(mr, mr, nr, nr, nkblks, kr, (incAW | incBW) ? KR:0,
                         ATL_rone, alpha, beta, a, lda, incAk, pA, incAWp,
                         B, ldb, incBk, pB, incBWp, C, ldc, A2blk, B2blk,
                         Mjoin(PATL,pKBmm), Mjoin(PATL,pKBmm));
      }
   }
   free(v);
   return(0);
}
Esempio n. 20
0
void Mjoin(PATL,mm_axpy)
   (const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB, const int M,
    const int N, const int K, const SCALAR alpha, const TYPE *A, const int lda0,
    const TYPE *B, const int ldb0, const SCALAR beta, TYPE *C, const int ldc0)
/*
 * TA == AtlasNoTrans
 * GEMM implemented by calling axpy, with any M partitioning already done
 *
 * Complex arithmetic: each column of C is built as sum_k B(k,j)*A(:,k),
 * conjugating B when TB == AtlasConjTrans.  The first axpy of a column
 * uses axpby to apply beta (unless beta == 1); the rest accumulate.
 * (Removed unused locals i, r1, r2, i1, i2 from the original.)
 */
{
   int j, k, incBk, incBn;
   /* strides are doubled because complex elements occupy two reals */
   const size_t lda=((size_t)lda0)+lda0, ldc=((size_t)ldc0)+ldc0, incAn = lda*K;
   const int ALONE=SCALAR_IS_ONE(alpha), BEONE=SCALAR_IS_ONE(beta);
   TYPE alph[2], BC[2];       /* alph = alpha*B(k,j); BC = (conjugated) B elt */

   if (TB == AtlasNoTrans)
   {
      incBk = 2;              /* step down a column of B */
      incBn = ldb0 - K;       /* rewind to top, advance one column */
   }
   else
   {
      incBk = ldb0+ldb0;      /* step along a row of B' */
      incBn = 1 - ldb0*K;     /* rewind row, advance one row */
   }
   incBn += incBn;            /* double for complex */

   if (TB == AtlasConjTrans)
   {
      if (BEONE)
      {
         if (ALONE)           /* C += A * conj(B) */
         {
            for(j=0; j < N; j++)
            {
               for (k=0; k < K; k++, B += incBk, A += lda)
               {
                  BC[0] = B[0]; BC[1] = -B[1];
                  Mjoin(PATL,axpy)(M, BC, A, 1, C, 1);
               }
               C += ldc;
               B += incBn;
               A -= incAn;
            }
         }
         else                 /* C += alpha * A * conj(B) */
         {
            for(j=0; j < N; j++)
            {
               BC[0] = B[0]; BC[1] = -B[1];
               ATL_cplxmul(alph, alpha, BC);
               Mjoin(PATL,axpby)(M, alph, A, 1, beta, C, 1);
               B += incBk;
               A += lda;
               for (k=1; k < K; k++, B += incBk, A += lda)
               {
                  BC[0] = B[0]; BC[1] = -B[1];
                  ATL_cplxmul(alph, alpha, BC);
                  Mjoin(PATL,axpy)(M, alph, A, 1, C, 1);
               }
               C += ldc;
               B += incBn;
               A -= incAn;
            }
         }
      }
      else /* BETA != 1.0 */
      {
         if (ALONE)           /* C = beta*C + A * conj(B) */
         {
            for(j=0; j < N; j++)
            {
               BC[0] = B[0]; BC[1] = -B[1];
               Mjoin(PATL,axpby)(M, BC, A, 1, beta, C, 1);
               B += incBk;
               A += lda;
               for (k=1; k < K; k++, B += incBk, A += lda)
               {
                  BC[0] = B[0]; BC[1] = -B[1];
                  Mjoin(PATL,axpy)(M, BC, A, 1, C, 1);
               }
               C += ldc;
               B += incBn;
               A -= incAn;
            }
         }
         else                 /* C = beta*C + alpha * A * conj(B) */
         {
            for(j=0; j < N; j++)
            {
               BC[0] = B[0]; BC[1] = -B[1];
               ATL_cplxmul(alph, alpha, BC);
               Mjoin(PATL,axpby)(M, alph, A, 1, beta, C, 1);
               B += incBk;
               A += lda;
               for (k=1; k < K; k++, B += incBk, A += lda)
               {
                  BC[0] = B[0]; BC[1] = -B[1];
                  ATL_cplxmul(alph, alpha, BC);
                  Mjoin(PATL,axpy)(M, alph, A, 1, C, 1);
               }
               C += ldc;
               B += incBn;
               A -= incAn;
            }
         }
      }
   }
   else /* B is not conjugated */
   {
      if (BEONE)
      {
         if (ALONE)           /* C += A * B */
         {
            for(j=0; j < N; j++)
            {
               for (k=0; k < K; k++, B += incBk, A += lda)
                  Mjoin(PATL,axpy)(M, B, A, 1, C, 1);
               C += ldc;
               B += incBn;
               A -= incAn;
            }
         }
         else                 /* C += alpha * A * B */
         {
            for(j=0; j < N; j++)
            {
               ATL_cplxmul(alph, alpha, B);
               Mjoin(PATL,axpby)(M, alph, A, 1, beta, C, 1);
               B += incBk;
               A += lda;
               for (k=1; k < K; k++, B += incBk, A += lda)
               {
                  ATL_cplxmul(alph, alpha, B);
                  Mjoin(PATL,axpy)(M, alph, A, 1, C, 1);
               }
               C += ldc;
               B += incBn;
               A -= incAn;
            }
         }
      }
      else /* BETA != 1.0 */
      {
         if (ALONE)           /* C = beta*C + A * B */
         {
            for(j=0; j < N; j++)
            {
               Mjoin(PATL,axpby)(M, B, A, 1, beta, C, 1);
               B += incBk;
               A += lda;
               for (k=1; k < K; k++, B += incBk, A += lda)
                  Mjoin(PATL,axpy)(M, B, A, 1, C, 1);
               C += ldc;
               B += incBn;
               A -= incAn;
            }
         }
         else                 /* C = beta*C + alpha * A * B */
         {
            for(j=0; j < N; j++)
            {
               ATL_cplxmul(alph, alpha, B);
               Mjoin(PATL,axpby)(M, alph, A, 1, beta, C, 1);
               B += incBk;
               A += lda;
               for (k=1; k < K; k++, B += incBk, A += lda)
               {
                  ATL_cplxmul(alph, alpha, B);
                  Mjoin(PATL,axpy)(M, alph, A, 1, C, 1);
               }
               C += ldc;
               B += incBn;
               A -= incAn;
            }
         }
      }
   }
}
Esempio n. 21
0
int Mjoin(PATL,mmJKI)(const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
                      const int M, const int N, const int K,
                      const SCALAR alpha, const TYPE *A, const int lda,
                      const TYPE *B, const int ldb, const SCALAR beta,
                      TYPE *C, const int ldc)
/*
 * This gemm is for small K, so we build gemm out of AXPY (outer product)
 * rather than dot (inner product).
 *
 * Returns 0 on success, -1 on allocation failure.  Complex-only routine:
 * scalars are [real,imag] pairs and strides are doubled accordingly.
 *
 * FIX: alp was declared `const SCALAR alp=alpha;` but is reassigned below
 * (alp = CONE), which is assignment to a const-qualified object; declare
 * it as a mutable pointer-to-const instead.  Also removed unused CNONE.
 */
{
   int Mp, mp, m, k, ldaa=lda;
   void *vA=NULL;
   TYPE *pA;
   const TYPE CONE[2]={ATL_rone, ATL_rzero};
   const TYPE *alp = alpha;   /* alpha for mm_axpy; CONE if copy applied it */

/*
 * Compute M partition necessary to promote reuse in the L1 cache.  Check
 * NB^2 in addition to L1elts, to catch machines where L1 is not used by FPU.
 * If this gives a small Mp, use CacheEdge instead (reuse in L2 instead of L1).
 */
   Mp = NB*NB;
   m = ATL_L1elts >> 1;
   Mp = (m > Mp) ? m : Mp;
   Mp /= ((K+2)<<1);
   if (Mp < 128)
   {
      #if !defined(CacheEdge) || CacheEdge == 0
         Mp = M;
      #else
         Mp = (CacheEdge) / ((K+2)*ATL_sizeof);
         if (Mp < 128)
            Mp = M;
      #endif
   }
   if (Mp > M)
      Mp = M;
/*
 * Change Mp if remainder is very small
 */
   else
   {
      Mp -= 16;      /* small safety margin on filling cache */
      mp = M / Mp;
      m = M - mp*Mp;
      if (m && m < 32)
         Mp += (m+mp-1)/mp;
   }
/*
 * If A not in NoTrans format, need to copy so it can use axpy wt stride=1.
 * NOTE: this routine should not be called when you can't afford this copy
 */
   if (TA != AtlasNoTrans)
   {
      vA = malloc(ATL_Cachelen + Mp*ATL_MulBySize(K));
      if (!vA) return(-1);
      pA = ATL_AlignPtr(vA);
      alp = CONE;    /* alpha folded into the copy below */
      ldaa = Mp;
      pA += Mp+Mp;   /* pre-bias; loop starts by backing up one panel */
   }
   else
      pA = (TYPE *) A;
   for (m=0; m < M; m += Mp)
   {
      mp = M - m;
      if (mp > Mp)
         mp = Mp;
/*
 *    If the thing is in Trans format, copy to NoTrans
 */
      if (vA)
      {
         pA -= (Mp+Mp);
         if (TA == AtlasConjTrans)
         {
            for (k=0; k < K; k++)
            {
               /* copy column, negate imaginary parts, then apply alpha */
               Mjoin(PATL,copy)(mp, A+k+k, lda, pA+((k*ldaa)<<1), 1);
               Mjoin(PATLU,scal)(mp, ATL_rnone, pA+1+((k*ldaa)<<1), 2);
               if (!SCALAR_IS_ONE(alpha))
                  Mjoin(PATL,scal)(mp, alpha, pA+((k*ldaa)<<1), 1);
            }
         }
         else
         {
            for (k=0; k < K; k++)
               Mjoin(PATL,cpsc)(mp, alpha, A+k+k, lda, pA+((k*ldaa)<<1), 1);
         }
         A += mp*(lda+lda);
      }
      Mjoin(PATL,mm_axpy)(AtlasNoTrans, TB, mp, N, K, alp, pA, ldaa, B, ldb,
                          beta, C, ldc);
      pA += mp+mp;
      C += mp+mp;
   }
   if (vA) free(vA);
   return(0);
}
Esempio n. 22
0
/*
 * Threaded SYRK driver that parallelizes over the K dimension: K is split
 * into blocks of size kb, worker threads pull K-block indices off a global
 * atomic counter (KbCtr) and accumulate partial N x N products in private
 * workspace; partials are combined (CombSyrk_ammK) and the final triangle
 * is written back to C with alpha/beta applied.
 * RETURNS: 0 on success, 1 if workspace could not be allocated.
 */
int Mjoin(PATL,tsyrk_amm_K)
(
   const enum ATLAS_UPLO Uplo,
   const enum ATLAS_TRANS Trans,
   ATL_CINT N,
   ATL_CINT K,
   const SCALAR alpha,
   const TYPE *A,
   ATL_CINT lda,
   const SCALAR beta,
   TYPE *C,
   ATL_CINT ldc
)
{
   amminfo_t mminfo;
   ATL_tsyrk_ammK_t pd;     /* problem definition shared by all threads */
   ablk2cmat_t Mjoin(PATL,tGetSyammInfo_K)
      (amminfo_t *out, const int P, enum ATLAS_TRANS TA, ATL_CSZT N,ATL_CSZT K);
   int kb=ATL_AMM_MAXKB, nkb = K / ATL_AMM_MAXKB, P = ATL_NTHREADS;
   int ku, kr, mb, nb, mu, nu;
   size_t sz;
   void *vp=NULL;

/*
 * Shrink kb until there are at least P K-blocks to hand out; if K is still
 * too small, reduce the thread count P, and punt to serial syrk when even
 * two K-blocks cannot be formed.
 */
   if (nkb < P)
   {
      kb = ATL_AMM_98KB;
      nkb = K / ATL_AMM_98KB;
      if (nkb < P)
      {
         nkb = K / ATL_AMM_66KB;
         kb = ATL_AMM_66KB;
      }
   }
   if (nkb < P)
   {
      if (nkb < 2)
      {
          Mjoin(PATL,syrk)(Uplo, Trans, N, K, alpha, A, lda, beta, C, ldc);
          return(0);
      }
      P = nkb;
   }
   pd.blk2c_b0 = Mjoin(PATL,tGetSyammInfo_K)(&mminfo, P, Trans, N, kb);
   kb = mminfo.kb;          /* kernel selection may have adjusted kb */
   nkb = K / kb;

   mu = mminfo.mu;
   nu = mminfo.nu;
   pd.nmu = (N+mu-1) / mu;
   pd.nnu = (N+nu-1) / nu;
   pd.mb = mb = pd.nmu*mu;  /* N rounded up to kernel unrolling multiples */
   pd.nb = nb = pd.nnu*nu;
   pd.kb = mminfo.kb;
/*
 * Per-thread workspace elements: two mb*nb C accumulators plus an
 * (mb+nb)*kb region for the copied A/B panels
 */
   sz = ((((size_t)mb)*nb)<<1) + (mb+nb)*kb;
   pd.wsz = sz;
   sz = ATL_MulBySize(sz)*P;
   vp = malloc(sz+ATL_Cachelen);
   if (!vp)
      return(1);
   pd.w = ATL_AlignPtr(vp);
   kr = K - nkb*kb;         /* size of final partial K-block (may be 0) */
   pd.kb0 = pd.KB0 = kr;
   ku = mminfo.ku;
   if (!kr)
   {
      pd.kb0 = pd.KB0 = kb;
      pd.ammK_b0 = mminfo.amm_b0;
      pd.ammK_b1 = mminfo.amm_b1;
   }
   else
   {
/*
 *    Pick the kernel used on the remainder block: the normal kernel when it
 *    supports runtime K (and kr satisfies its constraints), else the K-cleanup
 *    (k1) kernel
 */
      #if ATL_AMM_MAXKMAJ > 1
         if (ATL_AMMFLG_KMAJOR(mminfo.flag))
         {
            pd.KB0 = ((kr+ku-1)/ku)*ku;  /* K-major storage: round kr up to ku */
            if (ATL_AMMFLG_KRUNTIME(mminfo.flag))
            {
               pd.ammK_b0 = mminfo.amm_b0;
               pd.ammK_b1 = mminfo.amm_b1;
            }
            else
            {
               pd.ammK_b0 = mminfo.amm_k1_b0;
               pd.ammK_b1 = mminfo.amm_k1_b1;
            }
         }
         else
      #endif
      {
         if (ATL_AMMFLG_KRUNTIME(mminfo.flag) && kr == (kr/ku)*ku &&
             kr > mminfo.kbmin)
         {
               pd.ammK_b0 = mminfo.amm_b0;
               pd.ammK_b1 = mminfo.amm_b1;
         }
         else
         {
               pd.ammK_b0 = mminfo.amm_k1_b0;
               pd.ammK_b1 = mminfo.amm_k1_b1;
         }
      }
   }
   pd.amm_b0 = mminfo.amm_b0;
   pd.amm_b1 = mminfo.amm_b1;
   pd.blk2c_b1 = mminfo.Cblk2cm;
   pd.a2blk = mminfo.a2blk;
   pd.b2blk = mminfo.b2blk;
   pd.A = A;
   pd.C = C;
   pd.alpha = &alpha;
   pd.beta = &beta;
   pd.nkblks = (kr) ? nkb+1 : nkb;   /* remainder gets its own block */
   pd.KbCtr = ATL_SetGlobalAtomicCount(ATL_EstNctr(pd.nkblks, P), pd.nkblks, 0);
   pd.Cmut = ATL_mutex_init();       /* serializes combines into shared C */
   pd.BETA_APPLIED = SCALAR_IS_ONE(beta);
   pd.LOWER = (Uplo == AtlasLower);
   pd.TA = (Trans == AtlasTrans);
   pd.N = N;
   pd.lda = lda;
   pd.ldc = ldc;

/* Uncomment DEBUG1 to run the worker serially in this thread for debugging */
//   #define DEBUG1 1
   #ifdef DEBUG1
   {
      ATL_LAUNCHSTRUCT_t ls;
      ATL_thread_t ts;
      ts.rank = 0;
      ts.P = 1;
      ls.opstruct = &pd;
      Mjoin(PATL,DoWork_syrk_amm_K)(&ls, &ts);
   }
   #else
      ATL_goparallel(P, Mjoin(PATL,DoWork_syrk_amm_K), &pd,
                     Mjoin(PATL,CombSyrk_ammK));
   #endif
/*
 * Answer is written back to rank0's workspace, extract it & write to C
 */
   {
      TYPE *wC = pd.w+kb*(mb+nb), *w = wC + mb*nb, *c = C;
/*
 *    Put it into block-major storage in w
 */
      pd.blk2c_b0(N, N, ATL_rone, wC, ATL_rzero, w, N);
/*
 *    Now copy out only upper or lower portion, applying alpha & beta
 */
      if (pd.LOWER)
      {
         int j;
         for (j=0; j < N; j++, c += ldc+1, w += N+1)
             Mjoin(PATL,axpby)(N-j, alpha, w, 1, beta, c, 1);
      }
      else
      {
         int j;
         for (j=0; j < N; j++, c += ldc, w += N)
             Mjoin(PATL,axpby)(j+1, alpha, w, 1, beta, c, 1);
      }
   }
   free(vp);
   ATL_mutex_free(pd.Cmut);
   ATL_FreeGlobalAtomicCount(pd.KbCtr);
   return(0);
}
Esempio n. 23
0
void Mjoin( PATL, ptgeadd )
(
   const int                  M,
   const int                  N,
   const SCALAR               ALPHA,
   const TYPE                 * A,
   const int                  LDA,
   const SCALAR               BETA,
   TYPE                       * C,
   const int                  LDC
)
{
/*
 * Purpose
 * =======
 *
 * Mjoin( PATL, ptgeadd ) performs the m-by-n matrix addition
 *
 *    C := alpha * A + beta * C.
 *
 * This is a multi-threaded version of the algorithm: the operation is
 * partitioned into a thread tree by Mjoin( PATL, ptgeadd_nt ) using up
 * to ATL_NTHREADS threads, which is then joined and freed here.
 *
 * Arguments
 * =========
 *
 * M       (input)                       const int
 *         On entry, M  specifies the number of rows of the matrices A
 *         and C. M must be at least zero.
 *
 * N       (input)                       const int
 *         On entry, N  specifies the number of columns of the matrices
 *         A and C. N must be at least zero.
 *
 * ALPHA   (input)                       const SCALAR
 *         On entry, ALPHA specifies the scalar applied to A.
 *
 * A       (input)                       const TYPE *
 *         On entry, A points to the m-by-n matrix A.
 *
 * LDA     (input)                       const int
 *         On entry, LDA specifies the leading dimension of A.
 *
 * BETA    (input)                       const SCALAR
 *         On entry, BETA specifies the scalar applied to C.
 *
 * C       (input/output)                TYPE *
 *         On entry, C points to the m-by-n matrix C; on exit C is over-
 *         written by alpha * A + beta * C.
 *
 * LDC     (input)                       const int
 *         On entry, LDC specifies the leading dimension of C.
 *
 * ---------------------------------------------------------------------
 */
/*
 * .. Local Variables ..
 */
   pthread_attr_t             attr;
   PT_TREE_T                  root = NULL;
#ifdef TREAL
   TYPE                       alpha0 = (TYPE)(ALPHA),
                              beta0  = (TYPE)(BETA);
#endif
   void                       * alpha, * beta;
/* ..
 * .. Executable Statements ..
 *
 */
/*
 * Quick return for empty matrices or the identity update (alpha == 0
 * with beta == 1)
 */
   if( ( M <= 0 ) || ( N <= 0 ) ||
       ( SCALAR_IS_ZERO( ALPHA ) && SCALAR_IS_ONE( BETA ) ) ) return;

#ifdef TREAL
   alpha = (void *)(&alpha0); beta = (void *)(&beta0);  /* real: scalars passed by address */
#else
   alpha = (void *)(ALPHA);   beta = (void *)(BETA);    /* complex: SCALAR already a pointer */
#endif
   ATL_thread_init( &attr );
   root = Mjoin( PATL, ptgeadd_nt )( ATL_NTHREADS, &attr, M, N, alpha,
                                     (void *)(A), LDA, beta, (void *)(C),
                                     LDC );
   ATL_join_tree  ( root );
   ATL_free_tree  ( root );
   ATL_thread_exit( &attr );
/*
 * End of Mjoin( PATL, ptgeadd )
 */
}
Esempio n. 24
0
/*
 * Threaded SYMM: C := alpha*A*B + beta*C (Side=Left, A symmetric on the
 * left) or C := alpha*B*A + beta*C (Side=Right).  Parallelism comes from
 * slicing the non-symmetric dimension (N for Left, M for Right) into
 * contiguous panels of B and C, filling one ATL_TSYMM_t work unit per
 * thread, and running them with ATL_goparallel.  Falls back to
 * ATL_tsymm_SYsplit (recursive decomposition of the symmetric operand,
 * parallelism from threaded GEMM) when the split dimension is too small
 * but the other dimension dominates, and to serial symm when there is
 * not enough work for 2 threads.
 *
 * Cleanup vs. prior revision: removed unused locals (ATL_thread_t array,
 * ATL_LAUNCHSTRUCT_t, loop index j) and the dead tblks computation that
 * was assigned in both branches but never read.
 */
void Mjoin(PATL,tsymm)
   (const enum ATLAS_SIDE Side, const enum ATLAS_UPLO Uplo,
    ATL_CINT M, ATL_CINT N, const SCALAR alpha, const TYPE *A, ATL_CINT lda,
    const TYPE *B, ATL_CINT ldb, const SCALAR beta, TYPE *C, ATL_CINT ldc)
{
   ATL_INT n, nblks, nr, minblks, extrablks, p, i;
   ATL_TSYMM_t symms[ATL_NTHREADS];
   const TYPE *b;
   TYPE *c;
   static int nb=0;   /* blocking factor, fetched once and cached */

   if (M < 1 || N < 1)
      return;
   if (SCALAR_IS_ZERO(alpha))   /* degenerates to C := beta*C */
   {
      if (!SCALAR_IS_ONE(beta))
         Mjoin(PATL,gescal)(M, N, beta, C, ldc);
      return;
   }
   if (!nb) nb = Mjoin(PATL,GetNB());
   if (Side == AtlasLeft)
   {
      nblks = N / nb;
      nr = N - nblks*nb;
      p = (nblks+ATL_TSYMM_ADDP-1)/ATL_TSYMM_ADDP;
      if (p < ATL_NTHREADS)  /* N not big enough to give blk to each proc */
      {
/*
 *       If I can't split N, and M is the dominant cost, use recursion to
 *       decompose symmetric matrix; parallelism will come from TGEMM calls
 */
         if (M > (N<<(ATL_NTHRPOW2+2)))
         {
            ATL_tsymm_SYsplit(Side, Uplo, M, N, alpha, A, lda, B, ldb,
                              beta, C, ldc, nb);
            return;
         }
      }
      else
         p = ATL_NTHREADS;
      if (p < 2)
         goto SERIAL;
/*
 *    Distribute N over the processors: first extrablks workers get an
 *    extra block, and worker extrablks absorbs the nr remainder columns
 */
      b = B;
      c = C;
      minblks = nblks / p;
      extrablks = nblks - minblks*p;
      for (i=0; i < p; i++)
      {
         if (i < extrablks)
            n = (minblks+1)*nb;
         else if (i == extrablks)
            n = minblks*nb + nr;
         else
            n = minblks*nb;
         symms[i].A = A;
         symms[i].B = b;
         symms[i].alpha = SADD alpha;
         symms[i].beta = SADD beta;
         symms[i].C = c;
         symms[i].M = M;
         symms[i].N = n;
         symms[i].lda = lda;
         symms[i].ldb = ldb;
         symms[i].ldc = ldc;
         symms[i].side = Side;
         symms[i].uplo = Uplo;
         b = MindxT(b, ATL_MulBySize((size_t)ldb)*n);  /* advance n columns */
         c = MindxT(c, ATL_MulBySize((size_t)ldc)*n);
      }
      for (; i < ATL_NTHREADS; i++)  /* flag rest of struct as uninitialized */
         symms[i].M = 0;
   }
   else  /* Side == AtlasRight */
   {
      nblks = M / nb;
      nr = M - nblks*nb;
      p = (nblks+ATL_TSYMM_ADDP-1)/ATL_TSYMM_ADDP;
      if (p < ATL_NTHREADS)  /* M not big enough to give blk to each proc */
      {
/*
 *       If I can't split M, and N is the dominant cost, use recursion to
 *       decompose symmetric matrix; parallelism will come from TGEMM calls
 */
         if (N > (M<<(ATL_NTHRPOW2+2)))
         {
            ATL_tsymm_SYsplit(Side, Uplo, M, N, alpha, A, lda, B, ldb,
                              beta, C, ldc, nb);
            return;
         }
      }
      else
         p = ATL_NTHREADS;
      if (p < 2)
         goto SERIAL;
/*
 *    Distribute M over the processors (row panels of B and C)
 */
      b = B;
      c = C;
      minblks = nblks / p;
      extrablks = nblks - minblks*p;
      for (i=0; i < p; i++)
      {
         if (i < extrablks)
            n = (minblks+1)*nb;
         else if (i == extrablks)
            n = minblks*nb + nr;
         else
            n = minblks*nb;
         symms[i].A = A;
         symms[i].B = b;
         symms[i].alpha = SADD alpha;
         symms[i].beta = SADD beta;
         symms[i].C = c;
         symms[i].M = n;
         symms[i].N = N;
         symms[i].lda = lda;
         symms[i].ldb = ldb;
         symms[i].ldc = ldc;
         symms[i].side = Side;
         symms[i].uplo = Uplo;
         b = MindxT(b, ATL_MulBySize((size_t)n));  /* advance n rows */
         c = MindxT(c, ATL_MulBySize((size_t)n));
      }
      for (; i < ATL_NTHREADS; i++)  /* flag rest of struct as uninitialized */
         symms[i].M = 0;
   }
/*
 * Both branches guarantee p >= 2 here; the serial path runs only via the
 * SERIAL label jumped to above
 */
   if (p < 2)
   {
SERIAL:
      Mjoin(PATL,symm)(Side, Uplo, M, N, alpha, A, lda, B, ldb, beta, C, ldc);
      return;
   }
   ATL_goparallel(p, Mjoin(PATL,DoWorkSYMM), symms, NULL);
}
Esempio n. 25
0
/*
 * This is special-case code that handles rank-2 update by calling GER2:
 * C := alpha*op(A)*op(B) + beta*C with inner dimension K == 2, i.e. the
 * sum of two outer products.  Real precisions only (see DREAL/SREAL below).
 * RETURNS: 0 on success, 1 if workspace could not be allocated.
 */
int Mjoin(PATL,ammm_rk2)
(
   enum ATLAS_TRANS TA,
   enum ATLAS_TRANS TB,
   ATL_CSZT M,
   ATL_CSZT N,
   const SCALAR alpha,
   const TYPE *A,
   ATL_CSZT lda,
   const TYPE *B,
   ATL_CSZT ldb,
   const SCALAR beta,
   TYPE *C,
   ATL_CSZT ldc
)
{
   #ifdef DREAL
      const int MB=512, NB=32;   /* tile sizes for the blocked beta case */
   #else /* SREAL */
      const int MB=512, NB=64;
   #endif
   void *vp;
   TYPE *x, *y, *w, *z;   /* x,w: the two columns of op(A); y,z: the two rows of op(B) */
   size_t j;
   ATL_CSZT incC = NB*ldc;   /* C advance per NB-wide column panel */

/*
 * If beta is one, can handle by one call to ger2
 */
   if (SCALAR_IS_ONE(beta))
   {
      if (TA == AtlasNoTrans)
      {
         if (TB == AtlasNoTrans)
            Mjoin(PATL,ger2)(M, N, alpha, A, 1, B, ldb, alpha, A+lda, 1,
                             B+1, ldb, C, ldc);
         else
            Mjoin(PATL,ger2)(M, N, alpha, A, 1, B, 1, alpha, A+lda, 1,
                             B+ldb, 1, C, ldc);
      }
      else if (TB == AtlasNoTrans)
         Mjoin(PATL,ger2)(M, N, alpha, A, lda, B, ldb, alpha, A+1, lda,
                          B+1, ldb, C, ldc);
      else
         Mjoin(PATL,ger2)(M, N, alpha, A, lda, B, 1, alpha, A+1, lda,
                          B+ldb, 1, C, ldc);
      return(0);
   }
/*
 * Later on, do smart thing like copy only MB/NB at a time, and don't copy
 * at all if vectors are contiguous, but right now, always do copy up-front
 * so loop does not have to worry about TA/TB; this is a O(N) cost in N^2 alg
 */
   vp = malloc(2*ATL_MulBySize(M+N)+4*ATL_Cachelen);
   if (!vp)
      return(1);
   x = ATL_AlignPtr(vp);         /* x,w of length M; y,z of length N */
   y = x + M;
   y = ATL_AlignPtr(y);
   w = y + N;
   w = ATL_AlignPtr(w);
   z = w + M;
   z = ATL_AlignPtr(z);
/*
 * Copy the two columns of op(A) to contiguous storage
 */
   if (TA == AtlasNoTrans)
   {
      Mjoin(PATL,copy)(M, A, 1, x, 1);
      Mjoin(PATL,copy)(M, A+lda, 1, w, 1);
   }
   else
   {
      Mjoin(PATL,copy)(M, A, lda, x, 1);
      Mjoin(PATL,copy)(M, A+1, lda, w, 1);
   }
/*
 * Copy the two rows of op(B), folding alpha into the copy when alpha != 1
 */
   if (SCALAR_IS_ONE(alpha))
   {
      if (TB == AtlasNoTrans)
      {
         Mjoin(PATL,copy)(N, B, ldb, y, 1);
         Mjoin(PATL,copy)(N, B+1, ldb, z, 1);
      }
      else
      {
         Mjoin(PATL,copy)(N, B, 1, y, 1);
         Mjoin(PATL,copy)(N, B+ldb, 1, z, 1);
      }
   }
   else
   {
      if (TB == AtlasNoTrans)
      {
         Mjoin(PATL,cpsc)(N, alpha, B, ldb, y, 1);
         Mjoin(PATL,cpsc)(N, alpha, B+1, ldb, z, 1);
      }
      else
      {
         Mjoin(PATL,cpsc)(N, alpha, B, 1, y, 1);
         Mjoin(PATL,cpsc)(N, alpha, B+ldb, 1, z, 1);
      }
   }
/*
 * Tile C into MB x NB panels; scale each tile by beta, then apply the
 * rank-2 update (alpha already folded into y/z, so pass one for scalars)
 */
   for (j=0; j < N; j += NB, C += incC)
   {
      size_t i, nb = N-j;
      nb = (nb >= NB) ? NB : nb;
      for (i=0; i < M; i += MB)
      {
         size_t mb = M-i;
         mb = (mb >= MB) ? MB : mb;
         Mjoin(PATL,gescal)(mb, nb, beta, C+i, ldc);
         Mjoin(PATL,ger2)(mb, nb, ATL_rone, x+i, 1, y+j, 1, ATL_rone,
                          w+i, 1, z+j, 1, C+i, ldc);
      }
   }
   free(vp);
   return(0);
}
Esempio n. 26
0
void Mjoin( PATL, sbmv )
(
   const enum ATLAS_UPLO      UPLO,
   const int                  N,
   const int                  K,
   const SCALAR               ALPHA,
   const TYPE                 * A,
   const int                  LDA,
   const TYPE                 * X,
   const int                  INCX,
   const SCALAR               BETA,
   TYPE                       * Y,
   const int                  INCY
)
{
/*
 * Purpose
 * =======
 *
 * Mjoin( PATL, sbmv ) performs the matrix-vector operation
 *
 *    y := alpha * A * x + beta * y,
 *
 * where alpha and beta are scalars, x and y are n-element vectors and A
 * is an n by n symmetric band matrix, with k super-diagonals.
 *
 * This routine handles the quick-return and alpha == 0 special cases and
 * defers the general case to the reference implementation.  For a more
 * detailed description of the arguments, see the reference implementation
 * in the ATLAS/src/blas/reference directory.
 *
 * Cleanup vs. prior revision: removed the unused gbmv function-pointer
 * declarations, the unused workspace locals, and the TREAL-path
 * lda2/one/zero macros, which were never used here and leaked past the
 * end of the function (no matching #undef).
 *
 * ---------------------------------------------------------------------
 */
/* ..
 * .. Executable Statements ..
 *
 */
   if( N == 0 ) return;

/*
 * alpha == 0 degenerates to y := beta*y (skipped entirely when beta == 1)
 */
   if( SCALAR_IS_ZERO( ALPHA ) )
   {
      if( !( SCALAR_IS_ONE( BETA ) ) ) Mjoin( PATL, scal )( N, BETA, Y, INCY );
      return;
   }
   Mjoin(PATL,refsbmv)(UPLO, N, K, ALPHA, A, LDA, X, INCX, BETA, Y, INCY);

/*
 * End of Mjoin( PATL, sbmv )
 */
}
Esempio n. 27
0
/*
 * Wrapper that dispatches C := alpha*op(A)*op(B) + beta*C to the
 * appropriate Goto assembly GEMM kernel.  beta is applied here with
 * gescal (the kernels accumulate into C), and an aligned work buffer is
 * allocated for the kernel's internal A copy.
 * RETURNS: 0 on success, -1 if the work buffer cannot be allocated.
 *
 * Fixes vs. prior revision:
 *  - OSF1 path: mmap reports failure with MAP_FAILED, not NULL, so the
 *    old `if (!buffer)` test never detected an mmap failure.
 *  - The beta scaling of C used to happen BEFORE the allocation, so an
 *    allocation failure returned -1 with C already scaled; a caller
 *    falling back to another GEMM would then apply beta a second time.
 *    The buffer is now acquired first.
 *  - The TREAL lda/ldb/ldc macros are now #undef'd so they no longer
 *    leak past the end of this function.
 */
int Mjoin(PATU,usergemm)(const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
                          const int M, const int N, const int K,
                          const SCALAR alpha, const TYPE *A, const int lda0,
                          const TYPE *B, const int ldb0, const SCALAR beta,
                          TYPE *C, const int ldc0)
{
   TYPE *buffer, *a_buf;
   #ifdef TREAL
      #define lda lda0
      #define ldb ldb0
      #define ldc ldc0
   #else
      const int lda = lda0<<1, ldb = ldb0<<1, ldc = ldc0<<1;
   #endif
/*
 * NOTE: this function pointer is critical.  If you call goto's assembly
 * matmul directly, you get seg fault for complex on the EV6, but it works
 * fine as long as you use a function pointer.  -- RCW
 */
   int (*gemm_xx)
      (const int, const int, const int, const SCALAR, const TYPE *, const int,
       const TYPE *, const int, TYPE *, const int, TYPE *);

/*
 * Acquire the workspace before touching C, so failure leaves C unchanged
 */
   #ifdef ATL_OS_OSF1
      buffer = (TYPE *) mmap(0, BUFFER_SIZE, PROT_READ | PROT_WRITE,
                             MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
      if (buffer == (TYPE *)MAP_FAILED) return(-1);
   #else
      buffer = (TYPE *) malloc(BUFFER_SIZE);
      if (!buffer) return(-1);
   #endif

   if ( !SCALAR_IS_ONE(beta) ) Mjoin(PATL,gescal)(M, N, beta, C, ldc0);

/* align A-copy buffer; assumes ALIGN_SIZE is a mask (2^k-1) -- TODO confirm */
   a_buf = (TYPE *)(((size_t)buffer + ALIGN_SIZE) & ~ALIGN_SIZE);

/*
 * Select kernel by transpose settings (conjugate variants only for complex)
 */
   if (TA == AtlasNoTrans)
   {
      if (TB == AtlasNoTrans) gemm_xx = Mjoin(PATU,gotogemm_nn);
      #ifdef TCPLX
         else if (TB == AtlasConjTrans) gemm_xx = Mjoin(PATU,gotogemm_nc);
      #endif
      else gemm_xx = Mjoin(PATU,gotogemm_nt);

   }
   #ifdef TCPLX
      else if (TA == AtlasConjTrans)
      {
         if (TB == AtlasNoTrans) gemm_xx = Mjoin(PATU,gotogemm_cn);
         else if (TB == AtlasConjTrans) gemm_xx = Mjoin(PATU,gotogemm_cc);
         else gemm_xx = Mjoin(PATU,gotogemm_ct);
      }
   #endif
   else
   {
      if (TB == AtlasNoTrans) gemm_xx = Mjoin(PATU,gotogemm_tn);
      #ifdef TCPLX
         else if (TB == AtlasConjTrans) gemm_xx = Mjoin(PATU,gotogemm_tc);
      #endif
      else gemm_xx = Mjoin(PATU,gotogemm_tt);
   }
   gemm_xx(M, N, K, alpha, A, lda, B, ldb, C, ldc, a_buf);
   #ifdef ATL_OS_OSF1
      munmap((void*)buffer, BUFFER_SIZE);
   #else
      free(buffer);
   #endif
   #ifdef TREAL
      #undef lda
      #undef ldb
      #undef ldc
   #endif
   return(0);
}
Esempio n. 28
0
void ATL_gemv
   (ATL_CINT M, ATL_CINT N, const SCALAR alpha0, const TYPE *A, ATL_CINT lda,
    const TYPE *X, ATL_CINT incX, const SCALAR beta0, TYPE *Y, ATL_CINT incY)
/*
 * Y = alpha*conj(A)*X + beta*Y
 * For Conjugate transpose, first form x = conj(X), y = A^T * conj(X),
 * then use axpbyConj to add this to original Y in the operation
 * Y = beta*Y + alpha*conj(y) = beta*Y + alpha*(A^H * X), which is
 * Y = beta*Y + alpha * A^H * X.
 *
 * Cleanup vs. prior revision: removed unused locals (t1, t2, incy, COPYX,
 * COPYY, APPLYALPHAX) and the dead ALPHA_IS_ONE computation; x/y no longer
 * carry dead initializers (both are reassigned before first use).
 */
{
   ATL_mvkern_t mvtk, mvtk_b1, mvtk_b0;
   void *vp=NULL;
   TYPE *x, *y;
   ATL_INT m, Nm, nr, CacheElts, mb, imb;
   int mu, nu, alignX, alignY, ALIGNX2A, ForceNU;
   int minM, minN;
   TYPE one[2] = {ATL_rone, ATL_rzero};
   TYPE Zero[2] = {ATL_rzero, ATL_rzero};
   TYPE *beta = (TYPE*) beta0;

   if (M < 1 || N < 1)          /* F77 BLAS doesn't scale in either case */
      return;
   if (SCALAR_IS_ZERO(alpha0))   /* No contrib from alpha*A*x */
   {
      if (!SCALAR_IS_ONE(beta0))
      {
         if (SCALAR_IS_ZERO(beta0))
            Mjoin(PATL,zero)(N, Y, incY);
         else
            Mjoin(PATL,scal)(N, beta, Y, incY);
      }
      return;
   }
/*
 * ATLAS's MVT kernels loop over M in inner loop, which is bad news if M is
 * very small.  Call code that requires no copy of X & Y for these degenerate
 * cases
 */
   if (M < 16)
   {
      Mjoin(PATL,refgemv)(AtlasConjTrans, N, M, alpha0, A, lda, X, incX,
                          beta0, Y, incY);
      return;
   }
/*
 * Get mvtk kernel pointer along with any usage guidelines, and use the
 * optimized CacheElts to compute the correct blocking factor
 */
   mvtk_b1 = ATL_GetMVTKern(M, N, A, lda, &mvtk_b0, &mu, &nu,
                            &minM, &minN, &alignX, &ALIGNX2A, &alignY,
                            &ForceNU, &CacheElts);
/*
 * Set up to handle case where kernel requires N to be a multiple of NU
 */
   if (ForceNU)
   {
      Nm = (N/nu)*nu;
      nr = N - Nm;
   }
   else
   {
      Nm = N;
      nr = 0;
   }
/*
 * For very small N, we can't afford the data copy, so call special case code
 */
   if (N < 4 || Nm < 1)
   {
      Mjoin(PATL,refgemv)(AtlasConjTrans, N, M, alpha0, A, lda, X, incX,
                          beta0, Y, incY);
      return;
   }
/*
 * Block over M so x-chunk + y stay cache-resident; mb is rounded down to a
 * multiple of mu when that leaves room, else the whole M is done at once
 */
   if (CacheElts)
   {
      mb = (CacheElts - 2*nu) / (2*(nu+1));
      mb = (mb > mu) ? (mb/mu)*mu : M;
      mb = (mb > M) ? M : mb;
   }
   else
      mb = M;
   vp = malloc(ATL_MulBySize(mb+N) + 2*ATL_Cachelen);
/*
 * If we cannot allocate enough space to copy the vectors, give up and
 * call the simple loop-based implementation
 */
   if (!vp)
   {
      Mjoin(PATL,refgemv)(AtlasConjTrans, N, M, alpha0, A, lda, X, incX,
                          beta0, Y, incY);
      return;
   }
   y = ATL_AlignPtr(vp);                 /* length-N accumulator */
   x = y + (N SHIFT);                    /* length-mb conj(X) chunk */
   x = (ALIGNX2A) ? ATL_Align2Ptr(x, A) : ATL_AlignPtr(x);
   beta = Zero;                          /* first pass overwrites y */
/*
 * In this step, we form y = A^T * conj(X)
 */
   mvtk = mvtk_b0;
   m = M;
   do
   {
      imb = Mmin(mb, m);
      Mjoin(PATL,copyConj)(imb, X, incX, x, 1);  /* x = conj(X) */
/*
 *    Call optimized kernel (can be restricted or general)
 */
      if (imb >= minM)
         mvtk(imb, Nm, A, lda, x, y);
      else
         Mjoin(PATL,mvtk_Mlt16)(imb, Nm, one, A, lda, x, 1, beta, y, 1);
/*
 *    Some kernels require N%NU=0; if so nr is remainder, do cleanup with axpy
 */
      if (nr)
         Mjoin(PATL,mvtk_smallN)(imb, nr, one, A+((size_t)lda)*(Nm SHIFT), lda,
                                 x, 1, beta, y+(Nm SHIFT), 1);
      beta = one;       /* subsequent passes accumulate into y */
      mvtk = mvtk_b1;
      A += imb SHIFT;
      X += (imb*incX)SHIFT;
      m -= imb;
      imb = Mmin(m,mb);
   }
   while(m);

/*
 * Given y = A^T * conj(X) from above, now do:
 *    Y = beta*Y + alpha*conj(y) = beta*Y + alpha*(A^H * x), which is
 *    Y = beta*Y + alpha * A^H * x.
 */
   Mjoin(PATL,axpbyConj)(N, alpha0, y, 1, beta0, Y, incY);
   free(vp);
}