Пример #1
0
int ATL_getrfR(const int M, const int N, TYPE *A, const int lda, int *ipiv)
/*
 * Row-major factorization of form
 *   A = L * U * P
 * where P is a column-permutation matrix, L is lower triangular (lower
 * trapezoidal if M > N), and U is upper triangular with unit diagonals (upper
 * trapezoidal if M < N).  This is the recursive Level 3 BLAS version.
 */
{
   const int MN = Mmin(M, N);
   int Nup, Ndown, i, ierr=0;
   #ifdef TCPLX
      const TYPE one[2] = {ATL_rone, ATL_rzero};
      const TYPE none[2] = {ATL_rnone, ATL_rzero};
      TYPE inv[2], tmp[2];
   #else
      #define one ATL_rone
      #define none ATL_rnone
      TYPE tmp;
   #endif
   TYPE *Ar, *Ac, *An;

   if (MN > 1)
   {
      Nup = MN >> 1;
      #ifdef NB
         if (Nup > NB) Nup = ATL_MulByNB(ATL_DivByNB(Nup));
      #endif
      Ndown = M - Nup;
      i = ATL_getrfR(Nup, N, A, lda, ipiv);
      if (i) if (!ierr) ierr = i;
      Ar = A + (Nup * lda SHIFT);
      Ac = A + (Nup SHIFT);
      An = Ar + (Nup SHIFT);

      ATL_laswp(Ndown, Ar, lda, 0, Nup, ipiv, 1);  /* apply pivots */
      cblas_trsm(CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans,
                 CblasUnit, Ndown, Nup, one, A, lda, Ar, lda);
      cblas_gemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, Ndown, N-Nup, Nup,
                 none, Ar, lda, Ac, lda, one, An, lda);

      i = ATL_getrfR(Ndown, N-Nup, An, lda, ipiv+Nup);
      if (i) if (!ierr) ierr = Nup + i;
      for (i=Nup; i != MN; i++) ipiv[i] += Nup;
      ATL_laswp(Nup, A, lda, Nup, MN, ipiv, 1);  /* apply pivots */
   }
Пример #2
0
int ATL_trtriRL(const enum ATLAS_DIAG Diag, const int N, TYPE *A, const int lda)
{
  int ierr = 0;

   TYPE *Age, *Atr;
   TYPE tmp;
   int Nleft, Nright;
   #ifdef TREAL
      #define one ATL_rone
      #define mone -ATL_rone
      #define none ATL_rnone
   #else
      static const TYPE one[2] = {ATL_rone, ATL_rzero};
      static const TYPE mone[2] = {-ATL_rone, ATL_rzero};
      static const TYPE none[2] = {ATL_rnone, ATL_rzero};
   #endif

#ifdef TREAL
   if (N > REAL_RECURSE_LIMIT)
#else
   if (N > 1)
#endif
   {
      Nleft = N >> 1;
      #ifdef NB
         if (Nleft > NB) Nleft = ATL_MulByNB(ATL_DivByNB(Nleft));
      #endif
      Nright = N - Nleft;

      Age = A + ((Nleft*lda) SHIFT);
      Atr = A + (Nleft * (lda+1) SHIFT);

      cblas_trsm(AtlasRowMajor, AtlasRight, AtlasLower, AtlasNoTrans, Diag,
                 Nright, Nleft, one, A, lda, Age, lda);

      cblas_trsm(AtlasRowMajor, AtlasLeft, AtlasLower, AtlasNoTrans, Diag,
                 Nright, Nleft, mone, Atr, lda, Age, lda);

      ierr = ATL_trtriRL(Diag, Nleft, A, lda);
      if (ierr!=0) return(ierr);
      ierr = ATL_trtriRL(Diag, Nright, Atr, lda);
      if (ierr!=0) return(ierr+Nleft);

   }
Пример #3
0
int ATL_potrfL(const int N, TYPE *A, const int lda)
{
   TYPE *An, *Ar;
   const size_t lda2=(lda SHIFT);
   int Nleft, Nright, ierr;
   #ifdef TREAL
      #define lda2 lda
      #define ONE ATL_rone
   #else
      static const TYPE ONE[2] = {ATL_rone, ATL_rzero};
   #endif

#ifdef TREAL
   if (N > 4)
#else
   if (N > 1)
#endif
   {
      Nleft = N >> 1;
      #ifdef NB
         if (Nleft > NB<<1) Nleft = ATL_MulByNB(ATL_DivByNB(Nleft));
      #endif
      Nright = N - Nleft;
      ierr = ATL_potrfL(Nleft, A, lda);
      if (!ierr)
      {
         Ar = A + (Nleft SHIFT);
         An = Ar + lda2 * Nleft;
         cblas_trsm(CblasColMajor, CblasRight, CblasLower, llt_trans,
                    CblasNonUnit, Nright, Nleft, ONE, A, lda, Ar, lda);
         llt_syrk(CblasColMajor, CblasLower, CblasNoTrans, Nright, Nleft,
                  ATL_rnone, Ar, lda, ATL_rone, An, lda);
         ierr = ATL_potrfL(Nright, An, lda);
         if (ierr) return(ierr+Nleft);
      }
      else return(ierr);
   }
Пример #4
0
void Mjoin(PATL,mmIJK2)(int K, int nMb, int nNb, int nKb, int ib, int jb,
                        int kb, const SCALAR alpha, const TYPE *A, int lda,
                        TYPE *pA0, int incA, MAT2BLK A2blk, const TYPE *pB0,
                        const SCALAR beta, TYPE *C, int ldc, TYPE *pC,
                        PUTBLK putblk, NBMM0 NBmm0)
/*
 * Outer three loops for matmul with outer loop over rows of A
 *
 * K      : inner dimension; handed to the copy routine and the cleanup kernels
 * nMb    : number of full row panels of A (outer loop trip count)
 * nNb    : number of full column panels of B (middle loop trip count)
 * nKb    : number of full blocks along K
 * ib,jb,kb : remainder sizes for the row, column and K dimensions
 * alpha  : only forwarded to A2blk here
 * A,lda  : source of A; if A is NULL no copy is done and pA0 is stepped by
 *          incK after each row panel instead
 * pA0    : workspace holding the current row panel of A in block format
 * incA   : added to A after each row panel is copied
 * A2blk  : copy routine invoked for each row panel when A is non-NULL
 * pB0    : B in block format; walked sequentially and reset per row panel
 * beta   : passed to NBmm0, the cleanup kernels and putblk
 * C,ldc  : output matrix and its leading dimension
 * pC     : block workspace used only when putblk is non-NULL
 * putblk : if non-NULL, products are formed in pC and then written to C
 *          through putblk; if NULL, kernels write straight into C
 * NBmm0  : kernel applied to the first full K block of every C block
 *
 * NBmm, KBmm, NBJBmm, IBNBmm and IBJBmm are kernels defined elsewhere.
 */
{
    int i, j, ldpc;
    /* C must be explicitly zeroed only when writing directly with beta == 0 */
    const int ZEROC = ((putblk == NULL) && SCALAR_IS_ZERO(beta));
    /* incK: size of one copied panel (NB*K); incC: step of NB*ldc in C */
    const int incK = ATL_MulByNB(K), incC = ATL_MulByNB(ldc);
    /* stA marks the end of the full K blocks inside the current A panel */
    TYPE *pA=pA0, *stA=pA0+ATL_MulByNBNB(nKb);
    const TYPE *pB=pB0;
    /* beta for the K-cleanup kernel: zero when accumulating into workspace */
    const TYPE cubeta = ( (putblk) ? ATL_rzero : beta );
    TYPE *c;

    if (putblk)
    {
        ldpc = NB;
        /* only a partial K block exists: clear the workspace once up front */
        if (!nKb && kb) Mjoin(PATL,gezero)(MB, NB, pC, MB);
    }
    else ldpc = ldc;
    for (i=nMb; i; i--)    /* loop over full row panels of A */
    {
        if (A)
        {
            A2blk(K, NB, A, lda, pA, alpha);  /* get 1 row panel of A */
            A += incA;
        }
        if (!putblk) pC = C;
        /* c tracks the C block of this row panel; C moves to the next panel */
        c = C;
        C += NB;
        for (j=nNb; j; j--)  /* full column panels of B */
        {
            if (nKb)
            {
                /* first K block uses the beta-specific kernel */
                NBmm0(MB, NB, KB, ATL_rone, pA, KB, pB, KB, beta, pC, ldpc);
                pA += NBNB;
                pB += NBNB;
                if (nKb != 1)
                {
                    /* remaining full K blocks accumulate with beta = 1 */
                    do
                    {
                        NBmm(MB, NB, KB, ATL_rone, pA, KB, pB, KB, ATL_rone,
                             pC, ldpc);
                        pA += NBNB;
                        pB += NBNB;
                    }
                    while (pA != stA);
                }
                if (kb)
                {
                    /* partial K block, accumulated onto the result so far */
                    KBmm(MB, NB, kb, ATL_rone, pA, kb, pB, kb, ATL_rone, pC, ldpc);
                    pB += kb*NB;
                }
            }
            else
            {
                /* no full K block: at most the partial K block contributes */
                if (ZEROC) Mjoin(PATL,gezero)(MB, NB, pC, ldpc);
                if (kb)
                {
                    KBmm(MB, NB, kb, ATL_rone, pA, kb, pB, kb, cubeta, pC, ldpc);
                    pB += kb*NB;
                }
            }
            /* rewind A panel; pB keeps going into the next column panel */
            pA = pA0;
            if (putblk) putblk(NB, NB, pC, c, ldc, beta);
            else pC += incC;
            c += incC;
        }
        if (jb)
        {
            /* partial column panel of B */
            NBJBmm(jb, K, pA, pB, cubeta, pC, ldpc);
            if (putblk) putblk(NB, jb, pC, c, ldc, beta);
        }
        pB = pB0;
        if (!A)
        {
            /* A was not copied: step to the next pre-existing panel */
            pA0 += incK;
            pA = pA0;
            stA += incK;
        }
    }
    if (ib)
    {
        /* partial row panel of A */
        c = C;
        if (A) A2blk(K, ib, A, lda, pA, alpha);  /* get last row panel of A */
        for (j=nNb; j; j--)  /* full column panels of B */
        {
            if (putblk)
            {
                /* workspace is used with leading dimension ib here */
                IBNBmm(ib, K, pA, pB, ATL_rzero, pC, ib);
                putblk(ib, NB, pC, c, ldc, beta);
            }
            else IBNBmm(ib, K, pA, pB, beta, c, ldc);
            pB += incK;
            c += incC;
        }
        if (jb)
        {
            /* corner block: partial in both rows and columns */
            if (putblk)
            {
                IBJBmm(ib, jb, K, pA, pB, ATL_rzero, pC, ib);
                putblk(ib, jb, pC, c, ldc, beta);
            }
            else IBJBmm(ib, jb, K, pA, pB, beta, c, ldc);
        }
    }
}
Пример #5
0
int Mjoin(PATL,mmIJK)(const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
                      const int M, const int N0, const int K,
                      const SCALAR alpha, const TYPE *A, const int lda0,
                      const TYPE *B, const int ldb0, const SCALAR beta,
                      TYPE *C, const int ldc0)
/*
 * Driver for the IJK loop ordering: allocates workspace, selects the copy
 * and kernel routines, then calls mmIJK2 to do the work.
 *
 * TA, TB    : transpose settings for A and B
 * M, N0, K  : problem dimensions
 * alpha     : applied while copying B (or while copying A in the no-copy-B
 *             special case)
 * A, lda0   : input A and its leading dimension
 * B, ldb0   : input B and its leading dimension
 * beta      : scalar for C
 * C, ldc0   : output matrix and its leading dimension
 *
 * RETURNS:
 *    0 : success
 *   -1 : a workspace allocation failed
 *    1 : the full-size B workspace could not be obtained and both TA and TB
 *        are AtlasNoTrans (no partitioned attempt is made in that case;
 *        presumably the caller falls back to another driver -- confirm)
 *
 * Workspace sizes are computed in size_t so that N*K cannot overflow int
 * before it is compared against ATL_MaxMalloc.
 */
{
    size_t incA, incB, incC;
    size_t sz;
    const size_t lda=lda0, ldb=ldb0, ldc=ldc0;
    const size_t incK = ATL_MulByNB((size_t)K);
    int N = N0;
    int nMb, nNb, nKb, ib, jb, kb, jb2, j, k, n;
    void *vA=NULL, *vC=NULL;
    TYPE *pA, *pB, *pC;
    MAT2BLK A2blk, B2blk;
    PUTBLK putblk;
    NBMM0 NBmm0;

    /* full-block counts and remainders in each dimension */
    nMb = ATL_DivByNB(M);
    nNb = ATL_DivByNB(N);
    nKb = ATL_DivByNB(K);
    ib = M - ATL_MulByNB(nMb);
    jb = N - ATL_MulByNB(nNb);
    kb = K - ATL_MulByNB(nKb);

    /*
     * If K sufficiently large, write to temporary C as safety measure;  otherwise
     * write directly to C
     */
    if (nKb < 12)
    {
        putblk = NULL;
        pC = C;
        if ( SCALAR_IS_ONE(beta) ) NBmm0 = NBmm_b1;
        else if ( SCALAR_IS_ZERO(beta) ) NBmm0 = NBmm_b0;
        else NBmm0 = NBmm_bX;
    }
    else
    {
        /* accumulate each block in an NBxNB workspace, then putblk into C */
        NBmm0 = NBmm_b0;
        vC = malloc(ATL_Cachelen + ATL_MulBySize(NBNB));
        if (!vC) return(-1);
        pC = ATL_AlignPtr(vC);
        if ( SCALAR_IS_ONE(beta) ) putblk = Mjoin(PATL,putblk_b1);
        else if ( SCALAR_IS_ZERO(beta) ) putblk = Mjoin(PATL,putblk_b0);
        else if ( SCALAR_IS_NONE(beta) ) putblk = Mjoin(PATL,putblk_bn1);
        else putblk = Mjoin(PATL,putblk_bX);
    }
    /*
     * Special case if we don't need to copy one or more input matrix
     */
    if (K == NB && TB == AtlasNoTrans && ldb == NB && ATL_DataIsMinAligned(B))
    {
        if (lda == NB && TA == AtlasTrans && SCALAR_IS_ONE(alpha) &&
                ATL_DataIsMinAligned(A))
        {
            /* A is usable in place as well: pass A == NULL to mmIJK2 */
            pA = (TYPE *) A;
            A = NULL;
            A2blk = NULL;
            incA = 0;
        }
        else
        {
            /* B is used in place, so alpha must be applied while copying A */
            vA = malloc(ATL_Cachelen + ATL_MulBySize(incK));
            if (!vA)
            {
                free(vC);
                return(-1);
            }
            pA = ATL_AlignPtr(vA);
            if (TA == AtlasNoTrans)
            {
                incA = NB;
                if ( SCALAR_IS_ONE(alpha) ) A2blk = Mjoin(PATL,row2blkT_a1);
                else A2blk = Mjoin(PATL,row2blkT_aX);
            }
            else
            {
                incA = ATL_MulByNB(lda);
                if ( SCALAR_IS_ONE(alpha) ) A2blk = Mjoin(PATL,col2blk_a1);
                else A2blk = Mjoin(PATL,col2blk_aX);
            }
        }
        Mjoin(PATL,mmIJK2)(K, nMb, nNb, nKb, ib, jb, kb, alpha, A, lda, pA,
                           incA, A2blk, B, beta, C, ldc, pC, putblk, NBmm0);
        free(vA);
        free(vC);
        return(0);
    }
    /*
     * General case: try for room to hold all of B plus one row panel of A
     */
    sz = ATL_Cachelen + ATL_MulBySize(((size_t)N)*K + incK);
    if (sz <= (size_t)ATL_MaxMalloc) vA = malloc(sz);
    if (!vA)
    {
        if (TA == AtlasNoTrans && TB == AtlasNoTrans)
        {
            free(vC);
            return(1);
        }
        /*
         * Split B's column panels into j groups of k panels, increasing j
         * until a buffer for k panels of B plus one panel of A is obtained
         */
        if (jb) n = nNb + 1;
        else n = nNb;
        for (j=2; !vA; j++)
        {
            k = n / j;
            if (k < 1) break;
            if (k*j < n) k++;
            sz = ATL_Cachelen + ATL_MulBySize((k+1)*incK);
            if (sz <= (size_t)ATL_MaxMalloc) vA = malloc(sz);
        }
        if (!vA)
        {
            free(vC);
            return(-1);
        }
        n = ATL_MulByNB(k);
        jb2 = 0;
    }
    else
    {
        /* whole of B fits: a single pass over all N columns */
        jb2 = jb;
        k = nNb;
        n = N;
    }
    pA = ATL_AlignPtr(vA);
    /* alpha is applied while copying B; A is always copied with alpha = 1 */
    if (TB == AtlasNoTrans)
    {
        incB = ldb*n;
        if ( SCALAR_IS_ONE(alpha) ) B2blk = Mjoin(PATL,col2blk2_a1);
        else B2blk = Mjoin(PATL,col2blk2_aX);
    }
    else
    {
        incB = n;
        if ( SCALAR_IS_ONE(alpha) ) B2blk = Mjoin(PATL,row2blkT2_a1);
        else B2blk = Mjoin(PATL,row2blkT2_aX);
    }
    if (TA == AtlasNoTrans)
    {
        incA = NB;
        A2blk = Mjoin(PATL,row2blkT_a1);
    }
    else
    {
        incA = ATL_MulByNB(lda);
        A2blk = Mjoin(PATL,col2blk_a1);
    }
    incC = ldc*n;
    /* workspace layout: one A panel (incK elements) followed by B's copy */
    pB = pA + incK;

    do
    {
        if (TB == AtlasNoTrans) B2blk(K, n, B, ldb, pB, alpha);
        else B2blk(n, K, B, ldb, pB, alpha);
        Mjoin(PATL,mmIJK2)(K, nMb, k, nKb, ib, jb2, kb, alpha, A, lda, pA,
                           incA, A2blk, pB, beta, C, ldc, pC, putblk, NBmm0);
        N -= n;
        nNb -= k;
        if (N < n)
        {
            /* final pass: take whatever is left, including the partial panel */
            jb2 = jb;
            n = N;
            k = nNb;
        }
        C += incC;
        B += incB;
        if (!putblk) pC = C;
    }
    while (N);

    free(vC);
    free(vA);
    return(0);
}
Пример #6
0
int ATL_pmmJIK(const enum PACK_UPLO UA, const enum ATLAS_TRANS TA,
               const enum PACK_UPLO UB, const enum ATLAS_TRANS TB,
               const int M, const int N, const int K, const SCALAR alpha,
               const TYPE *A, const int lda, const TYPE *B, const int ldb,
               const SCALAR beta, const enum PACK_UPLO UC,
               TYPE *C, const int ldc)
/*
 * Special packed matmul, calls dense gemm kernel using at most
 * K*NB + 2*NB*NB space.  $B$ is copied only once, but $A$ is copied
 * ceil(N/NB) times.  However, $A$ should start in-cache for kernel call.
 *
 * UA, UB, UC select the storage of A, B and C: AtlasUpper / AtlasLower give
 * a leading-dimension increment of +1 / -1, anything else gives 0.
 * alpha is applied while copying B; A is always copied with a scale of 1.0.
 * beta is applied by ATL_pputblk when each finished block is written to C.
 *
 * RETURNS: 0 on success, -1 if the workspace cannot be allocated.
 */

{
   const int nKb = ATL_DivByNB(K), kb = K - ATL_MulByNB(nKb);
   const int incK = ATL_MulByNB(K);
   /* per-column change in leading dimension for each operand */
   const int ldainc = (UA == AtlasUpper) ? 1 : ((UA == AtlasLower) ? -1 : 0);
   const int ldbinc = (UB == AtlasUpper) ? 1 : ((UB == AtlasLower) ? -1 : 0);
   const int ldcinc = (UC == AtlasUpper) ? 1 : ((UC == AtlasLower) ? -1 : 0);
   int ib, jb, i, j, k;
   void *vC;
   TYPE *pC, *pA, *pB;
   NBMM0 pNBmm, pNBmm0;

/*
 * One allocation, laid out as: C block (NBNB), A block (NBNB), B panel (incK)
 */
   vC = malloc(ATL_Cachelen + ATL_MulBySize(NBNB+NBNB+incK));
   if (!vC) return(-1);
   pC = ATL_AlignPtr(vC);
   pA = pC + NBNB;
   pB = pA + NBNB;

/*
 * Loop over column panels of $B$
 */
   for (j=0; j < N; j += NB)
   {
      jb = N - j;
      jb = Mmin(jb, NB);
/*
 *    Copy column-panel of B to block-major storage
 */
      if (alpha == 1.0)
      {
         if (TB == AtlasNoTrans)
            ATL_pcol2blk(K, jb, alpha, B+MindexP(UB,0,j,ldb), Mpld(UB,j,ldb),
                         ldbinc, pB);
         else /* TB == AtlasTrans */
            ATL_prow2blkT(jb, K, alpha, B+MindexP(UB,j,0,ldb), ldb, ldbinc, pB);
      }
      else if (TB == AtlasNoTrans)
         ATL_pcol2blk_aX(K, jb, alpha, B+MindexP(UB,0,j,ldb), Mpld(UB,j,ldb),
                         ldbinc, pB);
      else /* TB == AtlasTrans */
         ATL_prow2blkT_aX(jb, K, alpha, B+MindexP(UB,j,0,ldb), ldb, ldbinc, pB);
/*
 *    Loop over row-panels of A
 */
      for (i=0; i < M; i += MB)
      {
         ib = M - i;
         ib = Mmin(ib, MB);
/*
 *       Partial blocks go through the general kernel ATL_gNBmm; full
 *       MBxNB blocks use the fixed-size kernels
 */
         if (jb != NB || ib != MB)
         {
            pNBmm0 = pNBmm = ATL_gNBmm;
            if (ib != NB && jb != NB) Mjoin(PATL,gezero)(MB, NB, pC, MB);
         }
         else
         {
            pNBmm = NBmm;
            pNBmm0 = NBmm_b0;
         }
/*
 *       Handle full blocks of K
 */
         if (nKb)
         {
/*
 *          First K block: copy A's block, then multiply with beta = 0
 */
            if (TA == AtlasNoTrans)
               ATL_prow2blkT(ib, NB, 1.0, A+MindexP(UA,i,0,lda),
                             lda, ldainc, pA);
            else
               ATL_pcol2blk(NB, ib, 1.0, A+MindexP(UA,0,i,lda),
                            Mpld(UA,i,lda), ldainc, pA);
            pNBmm0(ib, jb, NB, ATL_rone, pA, NB, pB, NB, ATL_rzero, pC, ib);
/*
 *          Remaining full K blocks: recopy A's block and accumulate
 */
            for (k=1; k != nKb; k++)
            {
               if (TA == AtlasNoTrans)
                  ATL_prow2blkT(ib, NB, 1.0, A+MindexP(UA,i,ATL_MulByNB(k),lda),
                                Mpld(UA,ATL_MulByNB(k),lda), ldainc, pA);
               else
                  ATL_pcol2blk(NB, ib, 1.0, A+MindexP(UA,ATL_MulByNB(k),i,lda),
                               Mpld(UA,i,lda), ldainc, pA);
               pNBmm(ib, jb, NB, ATL_rone, pA, NB, pB+jb*NB*k, NB,
                     ATL_rone, pC, ib);
            }
/*
 *          Partial K block, accumulated with the general kernel
 */
            if (kb)
            {
               if (TA == AtlasNoTrans)
                  ATL_prow2blkT(ib, kb, 1.0,
                                A+MindexP(UA,i,ATL_MulByNB(nKb),lda),
                                Mpld(UA,ATL_MulByNB(nKb),lda), ldainc, pA);
               else
                  ATL_pcol2blk(kb, ib, 1.0,
                               A+MindexP(UA,ATL_MulByNB(nKb),i,lda),
                               Mpld(UA,i,lda), ldainc, pA);
               ATL_gNBmm(ib, jb, kb, ATL_rone, pA, kb, pB+jb*NB*nKb, kb,
                         ATL_rone, pC, ib);
            }
         }
/*
 *       K < NB: only a partial K block exists; zero pC and compute it alone
 */
         else if (kb)
         {
            Mjoin(PATL,gezero)(ib, jb, pC, ib);
            if (TA == AtlasNoTrans)
               ATL_prow2blkT(ib, kb, 1.0, A+MindexP(UA,i,0,lda),
                             lda, ldainc, pA);
            else
               ATL_pcol2blk(kb, ib, 1.0, A+MindexP(UA,0,i,lda),
                            Mpld(UA,i,lda), ldainc, pA);
            ATL_gNBmm(ib, jb, kb, ATL_rone, pA, kb, pB, kb, ATL_rzero, pC, ib);
         }
/*
 *       Write the finished ib x jb block into C, applying beta
 */
         ATL_pputblk(ib, jb, pC, C+MindexP(UC,i,j,ldc), Mpld(UC,j,ldc),
                     ldcinc, beta);
      }
   }
   free(vC);
   return(0);
}
Пример #7
0
int ATL_pmmJIKF(const enum PACK_UPLO UA, const enum ATLAS_TRANS TA,
                const enum PACK_UPLO UB, const enum ATLAS_TRANS TB,
                const int M, const int N, const int K, const SCALAR alpha,
                const TYPE *A, const int lda, const TYPE *B, const int ldb,
                const SCALAR beta, const enum PACK_UPLO UC,
                TYPE *C, const int ldc)
/*
 * Special packed matmul, calls dense gemm kernel using at most
 * M*K + K*NB + NB*NB space.  If this exceeds ATL_pkMaxMalloc or fails,
 * operates using at most 2*K*NB + NB*NB.  If this fails, returns non-zero.
 * If full space is malloced, both matrices are copied exactly once.  If
 * the smaller space is used, $A$ will be copied ceil(N/NB) times.
 *
 * UA, UB, UC select the storage of A, B and C: AtlasUpper / AtlasLower give
 * a leading-dimension increment of +1 / -1, anything else gives 0.
 * alpha is applied while copying B; A is copied with a scale of ATL_rone.
 * beta is applied by ATL_pputblk when each finished block is written to C.
 *
 * RETURNS: 0 on success, -1 if neither workspace size can be allocated.
 *
 * NOTE(review): the workspace size is computed in int (M*K included) before
 * the ATL_pkMaxMalloc comparison; looks like it could overflow for large
 * M*K -- confirm against the expected problem sizes.
 */
{
   const int nKb = ATL_DivByNB(K), kb = K - ATL_MulByNB(nKb);
   const int incK = ATL_MulByNB(K);
   /* per-column change in leading dimension for each operand */
   const int ldainc = (UA == AtlasUpper) ? 1 : ((UA == AtlasLower) ? -1 : 0);
   const int ldbinc = (UB == AtlasUpper) ? 1 : ((UB == AtlasLower) ? -1 : 0);
   const int ldcinc = (UC == AtlasUpper) ? 1 : ((UC == AtlasLower) ? -1 : 0);
   int ib, jb, i, j, k;
   void *vC=NULL;
   TYPE *pC, *pA, *pB, *pA0;
   NBMM0 pNBmm, pNBmm0;
   /*
    * Used below only as a flag: non-NULL means A is recopied panel by panel
    * inside the loops, NULL means all of A was copied up front
    */
   void (*A2blk)(const int M, const int N, const TYPE alpha, const TYPE *A,
                 int lda, const int ldainc, TYPE *V);

/*
 * Try the large workspace first (C block + B panel + all of A)
 */
   i = ATL_Cachelen + ATL_MulBySize(NBNB+ATL_MulByNB(K)+M*K);
   if (i <= ATL_pkMaxMalloc) vC = malloc(i);
   if (!vC)
   {
/*
 *    Fall back to the small workspace (C block + B panel + one A panel)
 */
      vC = malloc(ATL_Cachelen + ATL_MulBySize(NBNB+ATL_MulByNB(K+K)));
      if (TA == AtlasNoTrans) A2blk = ATL_prow2blkT;
      else A2blk = ATL_pcol2blk;
   }
   else A2blk = NULL;

   if (!vC) return(-1);
/*
 * Workspace layout: C block (NBNB), B panel (NB*K), then A
 */
   pC = ATL_AlignPtr(vC);
   pB = pC + NBNB;
   pA = pB + ATL_MulByNB(K);

/*
 * If we've got the space, copy all of A up front
 */
   if (!A2blk)
   {
      if (TA == AtlasNoTrans)
         ATL_prow2blkTF(M, K, ATL_rone, A, lda, ldainc, pA);
      else ATL_pcol2blkF(K, M, ATL_rone, A, lda, ldainc, pA);
/*
 *    Back up one panel so the "pA += NB*K" at the top of each row-panel
 *    iteration lands on the panel for that iteration
 */
      pA -= ATL_MulByNB(K);
   }
   pA0 = pA;
/*
 * Loop over column panels of $B$
 */
   for (j=0; j < N; j += NB)
   {
      jb = N - j;
      jb = Mmin(jb, NB);
/*
 *    Copy column-panel of B to block-major storage
 */
      if (alpha == 1.0)
      {
         if (TB == AtlasNoTrans)
            ATL_pcol2blk(K, jb, alpha, B+MindexP(UB,0,j,ldb), Mpld(UB,j,ldb),
                         ldbinc, pB);
         else /* TB == AtlasTrans */
            ATL_prow2blkT(jb, K, alpha, B+MindexP(UB,j,0,ldb), ldb, ldbinc, pB);
      }
      else if (TB == AtlasNoTrans)
         ATL_pcol2blk_aX(K, jb, alpha, B+MindexP(UB,0,j,ldb), Mpld(UB,j,ldb),
                         ldbinc, pB);
      else /* TB == AtlasTrans */
         ATL_prow2blkT_aX(jb, K, alpha, B+MindexP(UB,j,0,ldb), ldb, ldbinc, pB);
/*
 *    Loop over row-panels of A
 */
      for (i=0; i < M; i += MB)
      {
         ib = M - i;
         ib = Mmin(ib, MB);
/*
 *       Small workspace: recopy this row panel of A; otherwise just step
 *       to the next panel of the up-front copy
 */
         if (A2blk)
         {
            if (TA == AtlasNoTrans)
               ATL_prow2blkT(ib, K, ATL_rone, A+MindexP(UA,i,0,lda), lda,
                             ldainc, pA);
            else /* TA == AtlasTrans */
               ATL_pcol2blk(K, ib, ATL_rone, A+MindexP(UA,0,i,lda),
                            Mpld(UA,i,lda), ldainc, pA);
         }
         else pA += ATL_MulByNB(K);
/*
 *       Partial blocks go through the general kernel ATL_gNBmm; full
 *       MBxNB blocks use the fixed-size kernels
 */
         if (jb != NB || ib != MB)
         {
            pNBmm0 = pNBmm = ATL_gNBmm;
            if (ib != NB && jb != NB) Mjoin(PATL,gezero)(MB, NB, pC, MB);
         }
         else
         {
            pNBmm = NBmm;
            pNBmm0 = NBmm_b0;
         }
/*
 *       Handle full blocks of K
 */
         if (nKb)
         {
/*
 *          First K block with beta = 0, the rest accumulate with beta = 1
 */
            pNBmm0(ib, jb, NB, ATL_rone, pA, NB, pB, NB, ATL_rzero, pC, ib);
            for (k=1; k != nKb; k++)
            {
               pNBmm(ib, jb, NB, ATL_rone, pA+ib*NB*k, NB, pB+jb*NB*k, NB,
                     ATL_rone, pC, ib);
            }
            if (kb)
               ATL_gNBmm(ib, jb, kb, ATL_rone, pA+ib*NB*nKb, kb,
                         pB+jb*NB*nKb, kb, ATL_rone, pC, ib);
         }
/*
 *       K < NB: only a partial K block exists; zero pC and compute it alone
 */
         else if (kb)
         {
            Mjoin(PATL,gezero)(ib, jb, pC, ib);
            ATL_gNBmm(ib, jb, kb, ATL_rone, pA, kb, pB, kb, ATL_rzero, pC, ib);
         }
/*
 *       Write the finished ib x jb block into C, applying beta
 */
         ATL_pputblk(ib, jb, pC, C+MindexP(UC,i,j,ldc), Mpld(UC,j,ldc),
                     ldcinc, beta);
      }
/*
 *    Rewind to the start of A's workspace for the next column panel
 */
      pA = pA0;
   }
   free(vC);
   return(0);
}
Пример #8
0
/*
 * Inner loops for matmul with the outer loop over column panels of B.
 * Elements are addressed in units scaled by SHIFT / NB2 / NBNB2 / "<<1"
 * (presumably two reals per complex element -- confirm).
 *
 * K      : inner dimension; handed to the copy routine and cleanup kernels
 * nMb    : number of full row panels of A (inner panel loop trip count)
 * nNb    : number of full column panels of B (outer loop trip count)
 * nKb    : number of full blocks along K
 * ib,jb,kb : remainder sizes for the row, column and K dimensions
 * alpha  : only forwarded to B2blk here
 * pA0    : A in block format; walked sequentially and reset per column panel
 * B,ldb  : source of B; if B is NULL no copy is done and pB0 is stepped by
 *          incK after each column panel instead
 * pB0    : workspace holding the current column panel of B in block format
 * incB   : added to B after each column panel is copied
 * B2blk  : copy routine invoked for each column panel when B is non-NULL
 * beta   : scalar for C; if gescal is non-NULL it is applied by gescal and
 *          the kernels then run with a real beta of one
 * C,ldc  : output matrix and its leading dimension
 * gescal : optional scaling routine applied to each C block before the
 *          kernels run
 * NBmm0  : kernel applied to the first full K block of every C block
 */
void Mjoin(PATL,mmJIK2)
(int K, int nMb, int nNb, int nKb, int ib, int jb, int kb,
 const SCALAR alpha, const TYPE *pA0, const TYPE *B, int ldb,
 TYPE *pB0, int incB, MAT2BLK B2blk, const SCALAR beta,
 TYPE *C, int ldc, MATSCAL gescal, NBMM0 NBmm0)
{
    /*
     * incK: size of one copied panel; incC: NB*(ldc-nMb) scaled by SHIFT,
     * added to C after the row-panel loop to reach the next column panel
     */
    const int incK = ATL_MulByNB(K)SHIFT, incC = ATL_MulByNB(ldc-nMb) SHIFT;
    /* C must be explicitly zeroed only when no gescal is used and beta == 0 */
    const int ZEROC = ((gescal == NULL) && SCALAR_IS_ZERO(beta));
    int i, j = nNb;
    const TYPE *pA=pA0;
    /* real beta given to the kernels: one if gescal already applied beta */
    const TYPE rbeta = ( (gescal) ? ATL_rone : *beta );
    /* stB marks the end of the full K blocks inside the current B panel */
    TYPE *pB=pB0, *stB=pB0+(ATL_MulByNBNB(nKb)SHIFT);

    if (nNb)
    {
        do  /* Loop over full column panels of B */
        {
            if (B)
            {
                B2blk(K, NB, B, ldb, pB, alpha);
                B += incB;
            }
            if (nMb)
            {
                i = nMb;
                do /* loop over full row panels of A */
                {
                    if (gescal) gescal(NB, NB, beta, C, ldc);
                    if (nKb) /* loop over full blocks in panels */
                    {
                        /* first K block uses the beta-specific kernel */
                        NBmm0(MB, NB, KB, ATL_rone, pA, KB, pB, KB, rbeta, C, ldc);
                        pA += NBNB2;
                        pB += NBNB2;
                        if (nKb != 1)
                        {
                            /* remaining full K blocks accumulate into C */
                            do
                            {
                                NBmm_b1(MB, NB, KB, ATL_rone, pA, KB, pB, KB, ATL_rone,
                                        C, ldc);
                                pA += NBNB2;
                                pB += NBNB2;
                            }
                            while (pB != stB);
                        }
                        if (kb)
                        {
                            /* partial K block */
                            KBmm(MB, NB, kb, ATL_rone, pA, kb, pB, kb, ATL_rone,
                                 C, ldc);
                            pA += ATL_MulByNB(kb)<<1;
                        }
                    }
                    else if (kb)
                    {
                        /* no full K block: only the partial one contributes */
                        if (ZEROC) Mjoin(PATL,gezero)(MB, NB, C, ldc);
                        KBmm(MB, NB, kb, ATL_rone, pA, kb, pB, kb, rbeta, C, ldc);
                        pA += ATL_MulByNB(kb)<<1;
                    }
                    /* rewind B panel; pA keeps going into the next row panel */
                    pB = pB0;
                    C += NB2;
                }
                while (--i);
            }
            if (ib)
            {
                /* partial row panel of A against this full column panel */
                if (gescal) gescal(ib, NB, beta, C, ldc);
                IBNBmm(ib, K, pA, pB, rbeta, C, ldc);
            }
            if (!B)
            {
                /* B was not copied: step to the next pre-existing panel */
                pB0 += incK;
                pB = pB0;
                stB += incK;
            }
            C += incC;
            pA = pA0;
        }
        while (--j);
    }
    if (jb)
    {
        /* partial column panel of B */
        if (B) B2blk(K, jb, B, ldb, pB, alpha);
        for (i=nMb; i; i--)
        {
            if (gescal) gescal(NB, jb, beta, C, ldc);
            NBJBmm(jb, K, pA, pB, rbeta, C, ldc);
            pA += incK;
            C += NB2;
        }
        if (ib)
        {
            /* corner block: partial in both rows and columns */
            if (gescal) gescal(ib, jb, beta, C, ldc);
            IBJBmm(ib, jb, K, pA, pB, rbeta, C, ldc);
        }
    }
}
Пример #9
0
int Mjoin(PATL,mmJIK)(const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
                      const int M0, const int N, const int K,
                      const SCALAR alpha, const TYPE *A, const int lda0,
                      const TYPE *B, const int ldb0, const SCALAR beta,
                      TYPE *C, const int ldc0)
/*
 * Outer three loops for matmul with outer loop over columns of B
 *
 * Driver: allocates workspace, selects copy and kernel routines from the
 * transpose settings and the real/imaginary parts of alpha and beta, then
 * calls mmJIK2 to do the work.  alpha is applied while copying A; B is
 * always copied with the _a1 routines.
 *
 * RETURNS:
 *    0 : success
 *   -1 : no workspace could be allocated
 *    1 : the full-size A workspace could not be obtained and neither TA nor
 *        TB is AtlasNoTrans (no partitioned attempt is made in that case)
 *
 * NOTE(review): vC is initialised to NULL and never assigned in this
 * function, so pC == C always holds and the free(vC) calls do nothing.
 */
{
    int M = M0;
    int nMb, nNb, nKb, ib, jb, kb, ib2, h, i, j, k, m, n;
    const size_t lda=lda0, ldb=ldb0, ldc=ldc0;
    size_t incA, incB, incC;
    int AlphaIsOne;
    const size_t incK = ATL_MulByNB((size_t)K);
    void *vB=NULL, *vC=NULL;
    TYPE *pA, *pB, *pC;
    const TYPE one[2] = {1.0,0.0}, zero[2] = {0.0,0.0};
    MAT2BLK A2blk, B2blk;
    MATSCAL gescal;
    NBMM0 NBmm0;

    /* full-block counts and remainders in each dimension */
    nMb = ATL_DivByNB(M);
    nNb = ATL_DivByNB(N);
    nKb = ATL_DivByNB(K);
    ib = M - ATL_MulByNB(nMb);
    jb = N - ATL_MulByNB(nNb);
    kb = K - ATL_MulByNB(nKb);

    pC = C;
    /*
     * Real beta is handled inside the kernel; a beta with non-zero imaginary
     * part is applied by gescal and the kernel then uses beta = 1
     */
    if (beta[1] == ATL_rzero)
    {
        gescal = NULL;
        if (*beta == ATL_rone) NBmm0 = Mjoin(PATL,CNBmm_b1);
        else if (*beta == ATL_rzero) NBmm0 = Mjoin(PATL,CNBmm_b0);
        else NBmm0 = Mjoin(PATL,CNBmm_bX);
    }
    else
    {
        NBmm0 = Mjoin(PATL,CNBmm_b1);
        gescal = Mjoin(PATL,gescal_bX);
    }
    /*
     * Special case for when what we are really doing is
     *    C <- beta*C + alpha * A * A'   or   C <- beta*C + alpha * A' * A
     */
    if ( A == B && M == N && TA != TB && (SCALAR_IS_ONE(alpha) || M <= NB)
            && TA != AtlasConjTrans && TB != AtlasConjTrans && lda == ldb )
    {
        AlphaIsOne = SCALAR_IS_ONE(alpha);
        /* one copy of A, plus an MxN scratch C when alpha and beta need it */
        i = ATL_MulBySize(M * K);
        if (!AlphaIsOne && pC == C && !SCALAR_IS_ZERO(beta))
            i += ATL_MulBySize(M*N);
        if (i <= ATL_MaxMalloc) vB = malloc(i + ATL_Cachelen);
        if (vB)
        {
            pA = ATL_AlignPtr(vB);
            if (TA == AtlasNoTrans)
                Mjoin(PATL,row2blkT2_a1)(M, K, A, lda, pA, alpha);
            else Mjoin(PATL,col2blk2_a1)(K, M, A, lda, pA, alpha);
            /*
             *       Can't write directly to C if alpha is not one
             */
            if (!AlphaIsOne)
            {
                /*
                 * h is the leading dimension of the product's destination:
                 * C itself when beta is zero, else the scratch area placed
                 * after the copy of A
                 */
                if (SCALAR_IS_ZERO(beta)) h = ldc;
                else if (pC == C)
                {
                    pC = pA + (((size_t)M) * K SHIFT);
                    h = M;
                }
                else h = NB;
                /* unscaled product (alpha = one, beta = zero); the single */
                /* copy of A serves as both operands                      */
                Mjoin(PATL,mmJIK2)(K, nMb, nNb, nKb, ib, jb, kb, one, pA, NULL,
                                   ldb, pA, 0, NULL, zero, pC, h,
                                   Mjoin(PATL,gescal_b0), Mjoin(PATL,CNBmm_b0));

                /* scale the product by alpha */
                if (alpha[1] == ATL_rzero)
                    Mjoin(PATL,gescal_bXi0)(M, N, alpha, pC, h);
                else Mjoin(PATL,gescal_bX)(M, N, alpha, pC, h);

                /* combine the scratch result with beta*C */
                if (C != pC)
                {
                    if (beta[1] == ATL_rzero)
                    {
                        if (*beta == ATL_rone)
                            Mjoin(PATL,putblk_b1)(M, N, pC, C, ldc, beta);
                        else if (*beta == ATL_rnone)
                            Mjoin(PATL,putblk_bn1)(M, N, pC, C, ldc, beta);
                        else if (*beta == ATL_rzero)
                            Mjoin(PATL,putblk_b0)(M, N, pC, C, ldc, beta);
                        else Mjoin(PATL,putblk_bXi0)(M, N, pC, C, ldc, beta);
                    }
                    else Mjoin(PATL,putblk_bX)(M, N, pC, C, ldc, beta);
                }
            }
            else Mjoin(PATL,mmJIK2)(K, nMb, nNb, nKb, ib, jb, kb, alpha, pA, NULL,
                                        ldb, pA, 0, NULL, beta, C, ldc, gescal, NBmm0);
            free(vB);
            if (vC) free(vC);
            return(0);
        }
    }
    /*
     * General case: try for room to hold all of A plus one column panel of B
     */
    i = ATL_Cachelen + ATL_MulBySize(M*K + incK);
    if (i <= ATL_MaxMalloc) vB = malloc(i);
    if (!vB)
    {
        if (TA != AtlasNoTrans && TB != AtlasNoTrans) return(1);
        /*
         * Split A's row panels into j groups of k panels, increasing j
         * until a buffer for k panels of A plus one panel of B is obtained
         */
        if (ib) n = nMb + 1;
        else n = nMb;
        for (j=2; !vB; j++)
        {
            k = n / j;
            if (k < 1) break;
            if (k*j < n) k++;
            h = ATL_Cachelen + ATL_MulBySize((k+1)*incK);
            if (h <= ATL_MaxMalloc) vB = malloc(h);
        }
        if (!vB) return(-1);
        n = k;
        m = ATL_MulByNB(n);
        ib2 = 0;
    }
    else
    {
        /* whole of A fits: a single pass over all M rows */
        n = nMb;
        m = M;
        ib2 = ib;
    }
    pB = ATL_AlignPtr(vB);
    /* choose the A copy routine from TA and from alpha's value */
    if (TA == AtlasNoTrans)
    {
        incA = m SHIFT;
        if (alpha[1] == ATL_rzero)
        {
            if (*alpha == ATL_rone) A2blk = Mjoin(PATL,row2blkT2_a1);
            else A2blk = Mjoin(PATL,row2blkT2_aXi0);
        }
        else A2blk = Mjoin(PATL,row2blkT2_aX);
    }
    else if (TA == AtlasConjTrans)
    {
        incA = lda*m SHIFT;
        if (alpha[1] == ATL_rzero)
        {
            if (*alpha == ATL_rone) A2blk = Mjoin(PATL,col2blkConj2_a1);
            else A2blk = Mjoin(PATL,col2blkConj2_aXi0);
        }
        else A2blk = Mjoin(PATL,col2blkConj2_aX);
    }
    else
    {
        incA = lda*m SHIFT;
        if (alpha[1] == ATL_rzero)
        {
            if (*alpha == ATL_rone) A2blk = Mjoin(PATL,col2blk2_a1);
            else A2blk = Mjoin(PATL,col2blk2_aXi0);
        }
        else A2blk = Mjoin(PATL,col2blk2_aX);
    }
    /* choose the B copy routine from TB */
    if (TB == AtlasNoTrans)
    {
        incB = ATL_MulByNB(ldb) SHIFT;
        B2blk = Mjoin(PATL,col2blk_a1);
    }
    else if (TB == AtlasConjTrans)
    {
        incB = NB2;
        B2blk = Mjoin(PATL,row2blkC_a1);
    }
    else
    {
        incB = NB2;
        B2blk = Mjoin(PATL,row2blkT_a1);
    }
    incC = m SHIFT;

    /* workspace layout: one B panel (incK SHIFT) followed by A's copy */
    pA = pB + (incK SHIFT);
    do
    {
        if (TA == AtlasNoTrans) A2blk(m, K, A, lda, pA, alpha);
        else A2blk(K, m, A, lda, pA, alpha);
        Mjoin(PATL,mmJIK2)(K, n, nNb, nKb, ib2, jb, kb, alpha, pA, B, ldb, pB,
                           incB, B2blk, beta, C, ldc, gescal, NBmm0);
        M -= m;
        nMb -= n;
        if (M <= m)
        {
            /* final pass: take whatever is left, including the partial panel */
            ib2 = ib;
            m = M;
            n = nMb;
        }
        C += incC;
        A += incA;
    }
    while (M);
    free(vB);
    if (vC) free(vC);
    return(0);
}
Пример #10
0
/*
 * Inner loops for matmul with the outer loop over row panels of A.
 * Element offsets are doubled via "<<1" / NBNB2 (presumably two reals per
 * complex element -- confirm).
 *
 * K      : inner dimension; handed to the copy routine and cleanup kernels
 * nMb    : number of full row panels of A (outer loop trip count)
 * nNb    : number of full column panels of B (middle loop trip count)
 * nKb    : number of full blocks along K
 * ib,jb,kb : remainder sizes for the row, column and K dimensions
 * alpha  : only forwarded to A2blk here
 * A,lda  : source of A; if A is NULL no copy is done and pA0 is stepped by
 *          incK after each row panel instead
 * pA0    : workspace holding the current row panel of A in block format
 * incA   : added to A after each row panel is copied
 * A2blk  : copy routine invoked for each row panel when A is non-NULL
 * pB0    : B in block format; walked sequentially and reset per row panel
 * beta   : scalar for C; if gescal is non-NULL it is applied by gescal and
 *          the kernels then run with a real beta of one
 * C,ldc  : output matrix and its leading dimension
 * gescal : optional scaling routine applied to each C block before the
 *          kernels run
 * NBmm0  : kernel applied to the first full K block of every C block
 */
void Mjoin(PATL,mmIJK2)
   (int K, int nMb, int nNb, int nKb, int ib, int jb, int kb,
    const SCALAR alpha, const TYPE *A, const int lda, TYPE *pA0, const int incA,
    MAT2BLK A2blk, TYPE *pB0, const SCALAR beta, TYPE *C, int ldc,
    MATSCAL gescal, NBMM0 NBmm0)
{
   const int incK = ATL_MulByNB(K)<<1;
/*
 * incCn steps C by one column panel; incCm undoes the nNb column steps and
 * moves down by MB to the next row panel
 */
   const int incCn = ATL_MulByNB(ldc)<<1, incCm = (MB<<1) - nNb*incCn;
/*
 * C must be explicitly zeroed only when no gescal is used and beta == 0
 */
   const int ZEROC = ((gescal == NULL) && SCALAR_IS_ZERO(beta));
   int i, j, k;
   const TYPE *pB=pB0;
/*
 * Real beta given to the kernels: one if gescal already applied beta
 */
   const TYPE rbeta = ( (gescal) ? ATL_rone : *beta );
   TYPE *pA=pA0;

   for (i=nMb; i; i--)
   {
      if (A)
      {
         A2blk(K, NB, A, lda, pA, alpha);  /* get 1 row panel of A */
         A += incA;
      }
      for (j=nNb; j; j--)
      {
         if (gescal) gescal(MB, NB, beta, C, ldc);
         if (nKb)
         {
            /* first K block uses the beta-specific kernel */
            NBmm0(MB, NB, KB, ATL_rone, pA, KB, pB, KB, rbeta, C, ldc);
            pA += NBNB2;
            pB += NBNB2;
            if (nKb != 1)
            {
               /* remaining full K blocks accumulate into C */
               for (k=nKb-1; k; k--, pA += NBNB2, pB += NBNB2)
                  NBmm_b1(MB, NB, KB, ATL_rone, pA, KB, pB, KB,
                          ATL_rone, C, ldc);
            }
            if (kb)
            {
               /* partial K block */
               KBmm(MB, NB, kb, ATL_rone, pA, kb, pB, kb, ATL_rone, C, ldc);
               pB += ATL_MulByNB(kb)<<1;
            }
         }
         else
         {
            /* no full K block: at most the partial K block contributes */
            if (ZEROC) Mjoin(PATL,gezero)(MB, NB, C, ldc);
            if (kb)
            {
               KBmm(MB, NB, kb, ATL_rone, pA, kb, pB, kb, rbeta, C, ldc);
               pB += ATL_MulByNB(kb)<<1;
            }
         }
         /* rewind A panel; pB keeps going into the next column panel */
         pA = pA0;
         C += incCn;
      }
      if (jb)
      {
         /* partial column panel of B */
         if (gescal) gescal(MB, jb, beta, C, ldc);
         MBJBmm(jb, K, pA, pB, rbeta, C, ldc);
      }
      pB = pB0;
      if (!A)
      {
         /* A was not copied: step to the next pre-existing panel */
         pA0 += incK;
         pA = pA0;
      }
      C += incCm;
   }
   if (ib)
   {
      if (A) A2blk(K, ib, A, lda, pA, alpha);   /* get last row panel of A */
      for(j=nNb; j; j--) /* full column panels of B */
      {
         if (gescal) gescal(ib, NB, beta, C, ldc);
         IBNBmm(ib, K, pA, pB, rbeta, C, ldc);
         pB += incK;
         C += incCn;
      }
      if (jb)
      {
         /* corner block: partial in both rows and columns */
         if (gescal) gescal(ib, jb, beta, C, ldc);
         IBJBmm(ib, jb, K, pA, pB, rbeta, C, ldc);
      }
   }
}
Пример #11
0
/*
 * Driver for the IJK loop ordering: allocates workspace, selects copy and
 * kernel routines from the transpose settings and the real/imaginary parts
 * of alpha and beta, then calls mmIJK2 to do the work.  alpha is applied
 * while copying B; A is always copied with the _a1 routines.
 *
 * RETURNS:
 *    0 : success
 *   -1 : no workspace could be allocated
 *    1 : the full-size B workspace could not be obtained and both TA and TB
 *        are AtlasNoTrans (no partitioned attempt is made in that case)
 *
 * NOTE(review): sizes and increments (N*K, ldb*n, ldc*n, ...) are computed
 * in int here; looks like they could overflow for large problems -- confirm
 * against the expected problem sizes.
 */
int Mjoin(PATL,mmIJK)(const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
                      const int M, const int N0, const int K,
                      const SCALAR alpha, const TYPE *A, const int lda,
                      const TYPE *B, const int ldb, const SCALAR beta,
                      TYPE *C, const int ldc)
{
   int N = N0;
   int nMb, nNb, nKb, ib, jb, kb, jb2, h, i, j, k, n, incA, incB, incC;
   const int incK = ATL_MulByNB(K);
   void *vA=NULL;
   TYPE *pA, *pB;
   MAT2BLK A2blk, B2blk;
   MATSCAL gescal;
   NBMM0 NBmm0;

/*
 * Full-block counts and remainders in each dimension
 */
   nMb = ATL_DivByNB(M);
   nNb = ATL_DivByNB(N);
   nKb = ATL_DivByNB(K);
   ib = M - ATL_MulByNB(nMb);
   jb = N - ATL_MulByNB(nNb);
   kb = K - ATL_MulByNB(nKb);

/*
 * Real beta is handled inside the kernel; a beta with non-zero imaginary
 * part is applied by gescal and the kernel then uses beta = 1
 */
   if (beta[1] == ATL_rzero)
   {
      gescal = NULL;
      if (*beta == ATL_rone) NBmm0 = Mjoin(PATL,CNBmm_b1);
      else if (*beta == ATL_rzero) NBmm0 = Mjoin(PATL,CNBmm_b0);
      else NBmm0 = Mjoin(PATL,CNBmm_bX);
   }
   else
   {
      gescal = Mjoin(PATL,gescal_bX);
      NBmm0 = Mjoin(PATL,CNBmm_b1);
   }

/*
 * Try for room to hold all of B plus one row panel of A
 */
   i = ATL_Cachelen + ATL_MulBySize(N*K + incK);
   if (i <= ATL_MaxMalloc) vA = malloc(i);
   if (!vA)
   {
      if (TA == AtlasNoTrans && TB == AtlasNoTrans) return(1);
/*
 *    Split B's column panels into j groups of k panels, increasing j until
 *    a buffer for k panels of B plus one panel of A is obtained
 */
      if (jb) n = nNb + 1;
      else n = nNb;
      for (j=2; !vA; j++)
      {
         k = n / j;
         if (k < 1) break;
         if (k*j < n) k++;
         h = ATL_Cachelen + ATL_MulBySize((k+1)*incK);
         if (h <= ATL_MaxMalloc) vA = malloc(h);
      }
      if (!vA) return(-1);
      n = ATL_MulByNB(k);
      jb2 = 0;
   }
   else
   {
/*
 *    Whole of B fits: a single pass over all N columns
 */
      jb2 = jb;
      k = nNb;
      n = N;
   }
   pA = ATL_AlignPtr(vA);
/*
 * Choose the B copy routine from TB and from alpha's value
 */
   if (TB == AtlasNoTrans)
   {
      incB = ldb*n<<1;
      if (alpha[1] == ATL_rzero)
      {
         if (*alpha == ATL_rone) B2blk = Mjoin(PATL,col2blk2_a1);
         else B2blk = Mjoin(PATL,col2blk2_aXi0);
      }
      else B2blk = Mjoin(PATL,col2blk2_aX);
   }
   else if (TB == AtlasConjTrans)
   {
      incB = n<<1;
      if (alpha[1] == ATL_rzero)
      {
         if (*alpha == ATL_rone) B2blk = Mjoin(PATL,row2blkC2_a1);
         else B2blk = Mjoin(PATL,row2blkC2_aXi0);
      }
      else B2blk = Mjoin(PATL,row2blkC2_aX);
   }
   else
   {
      incB = n<<1;
      if (alpha[1] == ATL_rzero)
      {
         if (*alpha == ATL_rone) B2blk = Mjoin(PATL,row2blkT2_a1);
         else B2blk = Mjoin(PATL,row2blkT2_aXi0);
      }
      else B2blk = Mjoin(PATL,row2blkT2_aX);
   }
/*
 * Choose the A copy routine from TA
 */
   if (TA == AtlasNoTrans)
   {
      incA = NB<<1;
      A2blk = Mjoin(PATL,row2blkT_a1);
   }
   else if (TA == AtlasConjTrans)
   {
      incA = ATL_MulByNB(lda)<<1;
      A2blk = Mjoin(PATL,col2blkConj_a1);
   }
   else
   {
      incA = ATL_MulByNB(lda)<<1;
      A2blk = Mjoin(PATL,col2blk_a1);
   }
   incC = ldc*n<<1;
/*
 * Workspace layout: one A panel (incK<<1) followed by B's copy
 */
   pB = pA + (incK<<1);

   do
   {
      if (TB == AtlasNoTrans) B2blk(K, n, B, ldb, pB, alpha);
      else B2blk(n, K, B, ldb, pB, alpha);
      Mjoin(PATL,mmIJK2)(K, nMb, k, nKb, ib, jb2, kb, alpha, A, lda, pA,
                         incA, A2blk, pB, beta, C, ldc, gescal, NBmm0);
      N -= n;
      nNb -= k;
      if (N < n)
      {
/*
 *       Final pass: take whatever is left, including the partial panel
 */
         jb2 = jb;
         n = N;
         k = nNb;
      }
      C += incC;
      B += incB;
   }
   while (N);

   free(vA);
   return(0);
}