Example No. 1
void Mjoin(PATL,gpmm)
   (const enum PACK_UPLO UA, const enum PACK_TRANS TA,
    const enum PACK_UPLO UB, const enum PACK_TRANS TB, const enum PACK_UPLO UC,
    const int M, const int N, const int K, const SCALAR alpha,
    const TYPE *A, const int IA, const int JA, const int lda,
    const TYPE *B, const int IB, const int JB, const int ldb,
    const SCALAR beta, TYPE *C, const int IC, const int JC, const int ldc)
{
   int j;
   #ifdef CacheEdge
      static const int CE_K = ((ATL_DivBySize(CacheEdge)-(NBNB SHIFT)) /
                                           (NB*(NB+NB)))*NB;
   #else
      #define CE_K K
   #endif
   if (!M || !N) return;
   if (!K || SCALAR_IS_ZERO(alpha))
   {
      for (j=0; j != N; j++)
         Mjoin(PATL,scal)(M, beta, C+MindexP(UC,IC,JC+j,ldc), 1);
      return;
   }
/*
 * Packed gpmm is not yet implemented for complex types,
 * so die unless this is really a dense gemm
 */
   #ifdef TCPLX
      ATL_assert (UA == PackGen && UB == PackGen && UC == PackGen);
      Mjoin(PATL,gemm)(TA, TB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
   #else
   Mjoin(PATL,prankK)(UA, TA, UB, TB, M, N, K, CE_K, alpha,
          A+MindexP(UA,IA,JA,lda), Mpld(UA,JA,lda), B+MindexP(UB,IB,JB,ldb),
          Mpld(UB,JB,ldb), beta, UC, C+MindexP(UC,IC,JC,ldc), Mpld(UC,JC,ldc));
   #endif
}
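The only work left in the degenerate branch (K == 0 or alpha == 0) is C := beta*C, applied one column of C at a time through PATL,scal. A minimal stand-alone sketch of that scaling on an ordinary column-major matrix, with no packed storage and a hypothetical helper name fixed to double precision:

/* Sketch only: C := beta*C for an M x N column-major matrix with leading
 * dimension ldc -- what the K == 0 / alpha == 0 branch above reduces to. */
static void scale_cols(int M, int N, double beta, double *C, int ldc)
{
   int i, j;
   for (j = 0; j < N; j++)       /* one column at a time, as with PATL,scal */
      for (i = 0; i < M; i++)
         C[j*ldc + i] *= beta;
}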
Example No. 2
static void ATL_rk_recUT
   (const enum PACK_UPLO UA, const enum PACK_TRANS TA,
    const enum ATLAS_UPLO UC, const int CP,
    const int N, const int K, const SCALAR alpha,
    const TYPE *A, int lda, const SCALAR beta, TYPE *C, const int ldc)
/*
 * For an upper, transposed matrix, use recursion to reduce N until enough memory
 * can be allocated
 */
{
   int Nright, Nleft;
   const enum PACK_UPLO UC2 = (CP ? UC : PackGen);
   if (Mjoin(PATL,prk_kmm)(UC, UA, TA, N, K, alpha, A, lda, beta, CP, C, ldc))
   {
      Nleft = N >> 1;
      #ifdef NB
         if (Nleft > NB) Nleft = ATL_MulByNB(ATL_DivByNB(Nleft));
      #endif
      Nright = N - Nleft;
      ATL_rk_recUT(UA, TA, UC, CP, Nleft, K, alpha, A, lda, beta, C, ldc);
      Mjoin(PATL,gpmm)(PackGen, TA, PackGen, AtlasNoTrans, CP?PackUpper:PackGen,
                       Nleft, Nright, K, alpha, A, 0, 0, lda,
                       A+Nleft*(lda SHIFT), 0, 0, lda, beta,
                       C+MindexP(UC2,0,Nleft,ldc), 0, 0, Mpld(UC2,Nleft,ldc));
      ATL_rk_recUT(UA, TA, UC, CP, Nright, K, alpha, A+(Nleft SHIFT)*lda, lda,
                  beta, C+MindexP(UC2,Nleft,Nleft,ldc), Mpld(UC2,Nleft,ldc));
   }
}
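The recursion above halves N and, when a compile-time NB is available, rounds the left half down to a multiple of the blocking factor so every internal panel boundary stays block-aligned; the off-diagonal Nleft x Nright piece is then handled by one gpmm call. A small sketch of just the split rule, with an assumed NB of 60 for illustration:

#include <stdio.h>

#define NB 60                    /* assumed blocking factor, for illustration */

int main(void)
{
   int N = 1000;
   int Nleft = N >> 1;                        /* start from roughly N/2        */
   if (Nleft > NB) Nleft = (Nleft/NB)*NB;     /* round down to a multiple of NB */
   int Nright = N - Nleft;                    /* remainder goes to the right   */
   printf("N=%d -> Nleft=%d, Nright=%d\n", N, Nleft, Nright);
   return 0;
}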
Example No. 3
void Mjoin(PATL,pcol2blkF)
   (const int M, const int N, const SCALAR alpha, const TYPE *A, int lda,
    const int ldainc, TYPE *V)
/*
 * Copies the entire MxN matrix to block-major format
 */
{
   int j, jb;
   const int incV = ATL_MulByNB(M);
   const enum PACK_UPLO UA = (ldainc == 1) ? PackUpper :
      ( (ldainc == -1) ? PackLower : PackGen );
   void (*col2blk)(const int M, const int N, const SCALAR alpha, const TYPE *A,
                   int lda, const int ldainc, TYPE *V);

   if (ldainc)
   {
      if (alpha == ATL_rone) col2blk = Mjoin(PATL,pcol2blk_a1);
      else col2blk = Mjoin(PATL,pcol2blk_aX);

      for (j=0; j < N; j += NB)
      {
         jb = N-j;
         jb = Mmin(jb, NB);
         col2blk(M, jb, alpha, A+MindexP(UA,0,j,lda), Mpld(UA,j,lda), ldainc,V);
         V += incV;
      }
   }
   else if (alpha == ATL_rone)
      Mjoin(PATL,col2blk2_a1)(M, N, A, lda, V, alpha);
   else
      Mjoin(PATL,col2blk2_aX)(M, N, A, lda, V, alpha);
}
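pcol2blkF walks A in panels of NB columns, hands each panel to the selected copy kernel, and advances V by incV = NB*M elements per panel so the copies land contiguously. A rough stand-alone sketch of that panel loop for the unpacked (PackGen) real case, using a hypothetical helper that stores each panel in plain column-major order rather than the kernels' internal NB x NB block ordering:

#define NB 4    /* assumed block size, for illustration only */

/* Sketch only: copy a column-major M x N matrix A (leading dimension lda)
 * into V as consecutive M x NB column panels scaled by alpha; V advances
 * by NB*M per panel, mirroring incV above. */
static void col2panel(int M, int N, double alpha, const double *A, int lda,
                      double *V)
{
   int i, j, jj, jb;
   for (j = 0; j < N; j += NB)
   {
      jb = N - j;
      if (jb > NB) jb = NB;                  /* partial last panel */
      for (jj = 0; jj < jb; jj++)
         for (i = 0; i < M; i++)
            V[jj*M + i] = alpha * A[(j+jj)*lda + i];
      V += NB*M;                             /* full stride even for the last panel */
   }
}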
Example No. 4
void Mjoin(PATL,prow2blkTF)(const int M, const int N, const SCALAR alpha,
                            const TYPE *A, int lda, const int ldainc, TYPE *V)
{
   const int mb = Mmin(NB,M), nMb = ATL_DivByNB(M);
   const int m = ATL_MulByNB(nMb), n = ATL_MulByNB(ATL_DivByNB(N));
   const int nr = N - n, mr = M - m;
   const int incVm = ATL_MulByNB(N), incVV = ATL_MulByNB(mr);
   int i, j, ib, jb;
   const enum PACK_UPLO UA = (ldainc == 1) ? PackUpper :
      ( (ldainc == -1) ? PackLower : PackGen );
   TYPE *v, *vv = V+nMb*incVm;
   void (*row2blk)(const int M, const int N, const TYPE alpha, const TYPE *A,
                   int lda, const int ldainc, TYPE *V);

   if (ldainc)
   {
      if (alpha == ATL_rone) row2blk = ATL_prow2blk_KB_a1;
      else row2blk = ATL_prow2blk_KB_aX;

      for (j=0; j < n; j += NB)
      {
         for (v=V, i=0; i < m; i += NB, v += incVm)
            row2blk(NB, NB, alpha, A+MindexP(UA,i,j,lda), Mpld(UA,j,lda),
                    ldainc, v);
         if (mr)
         {
            row2blk(mr, NB, alpha, A+MindexP(UA,m,j,lda), Mpld(UA,j,lda),
                    ldainc, vv);
            vv += incVV;
         }
         V += NBNB;
      }
      if (nr)
      {
         for (v=V, i=0; i < m; i += NB, v += incVm)
            row2blk(NB, nr, alpha, A+MindexP(UA,i,n,lda), Mpld(UA,n,lda),
                    ldainc, v);
         if (mr)
            row2blk(mr, nr, alpha, A+MindexP(UA,m,n,lda), Mpld(UA,n,lda),
                    ldainc, vv);
      }
   }
   else if (SCALAR_IS_ONE(alpha))
      Mjoin(PATL,row2blkT2_a1)(M, N, A, lda, V, alpha);
   else
      Mjoin(PATL,row2blkT2_aX)(M, N, A, lda, V, alpha);
}
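Each row2blk call above copies one tile of A transposed into the workspace: the full NB-row block rows are interleaved into V (each owning an incVm = NB*N stretch), and the mr leftover rows are gathered separately starting at vv. A minimal sketch of the per-tile transposed, scaled copy on plain column-major input; the helper name is hypothetical and it ignores the packed-storage indexing done by MindexP/Mpld:

/* Sketch only: copy the ib x jb tile at A (column-major, leading dimension
 * lda) into V as a jb x ib column-major tile, i.e. transposed, scaled by
 * alpha -- the elementary operation behind the row2blk kernels. */
static void tile_transpose_copy(int ib, int jb, double alpha,
                                const double *A, int lda, double *V)
{
   int i, j;
   for (j = 0; j < jb; j++)
      for (i = 0; i < ib; i++)
         V[i*jb + j] = alpha * A[j*lda + i];
}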
Example No. 5
void Mjoin(Mjoin(PATL,pcol2blkF),_blk)
   (const int blk, const int M, const int N, const SCALAR alpha, const TYPE *A,
    int lda, const int ldainc, TYPE *V)
/*
 * Copies the entire MxN matrix to block-major format
 */
{
   int j, jb;
   const int incV = blk*(M+M);
   const enum PACK_UPLO UA = (ldainc == 1) ? PackUpper :
      ( (ldainc == -1) ? PackLower : PackGen );
   void (*col2blk)(const int blk, const int M, const int N, const SCALAR alpha,
                   const TYPE *A, int lda, const int ldainc, TYPE *V);

#ifdef Conj_
   if (alpha[1] == ATL_rzero)
   {
      if (*alpha == ATL_rone) col2blk = Mjoin(Mjoin(PATL,pcol2blkConj_a1),_blk);
      else col2blk = Mjoin(Mjoin(PATL,pcol2blkConj_aXi0),_blk);
   }
   else col2blk = Mjoin(Mjoin(PATL,pcol2blkConj_aX),_blk);
#else
   if (alpha[1] == ATL_rzero)
   {
      if (*alpha == ATL_rone) col2blk = Mjoin(Mjoin(PATL,pcol2blk_a1),_blk);
      else col2blk = Mjoin(Mjoin(PATL,pcol2blk_aXi0),_blk);
   }
   else col2blk = Mjoin(Mjoin(PATL,pcol2blk_aX),_blk);
#endif

   for (j=0; j < N; j += blk)
   {
      jb = N-j;
      jb = Mmin(jb, blk);
      col2blk(blk, M, jb, alpha, A+MindexP(UA,0,j,lda), Mpld(UA,j,lda),
              ldainc, V);
      V += incV;
   }
}
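The complex copy above is dispatched on the value of alpha: a general complex scalar, a purely real scalar (imaginary part zero), or exactly one, with a parallel Conj_ family when the source must be conjugated. A tiny sketch of that classification for a scalar stored as a two-element array, using a hypothetical enum in place of the three kernel pointers:

typedef enum { ALPHA_ONE, ALPHA_REAL, ALPHA_COMPLEX } alpha_kind;

/* Sketch only: classify alpha = {re, im} the way the selector above does,
 * so the cheapest copy kernel can be chosen. */
static alpha_kind classify_alpha(const double *alpha)
{
   if (alpha[1] != 0.0) return ALPHA_COMPLEX;   /* general complex scaling */
   if (alpha[0] == 1.0) return ALPHA_ONE;       /* straight copy           */
   return ALPHA_REAL;                           /* scale by real part only */
}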
Example No. 6
int ATL_pmmJIK(const enum PACK_UPLO UA, const enum ATLAS_TRANS TA,
               const enum PACK_UPLO UB, const enum ATLAS_TRANS TB,
               const int M, const int N, const int K, const SCALAR alpha,
               const TYPE *A, const int lda, const TYPE *B, const int ldb,
               const SCALAR beta, const enum PACK_UPLO UC,
               TYPE *C, const int ldc)
/*
 * Special packed matmul, calls dense gemm kernel using at most
 * K*NB + 2*NB*NB space.  $B$ is copied only once, but $A$ is copied
 * ceil(N/NB) times.  However, $A$ should already be in cache for the kernel call.
 */

{
   const int nKb = ATL_DivByNB(K), kb = K - ATL_MulByNB(nKb);
   const int incK = ATL_MulByNB(K);
   const int ldainc = (UA == AtlasUpper) ? 1 : ((UA == AtlasLower) ? -1 : 0);
   const int ldbinc = (UB == AtlasUpper) ? 1 : ((UB == AtlasLower) ? -1 : 0);
   const int ldcinc = (UC == AtlasUpper) ? 1 : ((UC == AtlasLower) ? -1 : 0);
   int ib, jb, i, j, k;
   void *vC;
   TYPE *pC, *pA, *pB;
   NBMM0 pNBmm, pNBmm0;

   vC = malloc(ATL_Cachelen + ATL_MulBySize(NBNB+NBNB+incK));
   if (!vC) return(-1);
   pC = ATL_AlignPtr(vC);
   pA = pC + NBNB;
   pB = pA + NBNB;

/*
 * Loop over column panels of $B$
 */
   for (j=0; j < N; j += NB)
   {
      jb = N - j;
      jb = Mmin(jb, NB);
/*
 *    Copy column-panel of B to block-major storage
 */
      if (alpha == 1.0)
      {
         if (TB == AtlasNoTrans)
            ATL_pcol2blk(K, jb, alpha, B+MindexP(UB,0,j,ldb), Mpld(UB,j,ldb),
                         ldbinc, pB);
         else /* TB == AtlasTrans */
            ATL_prow2blkT(jb, K, alpha, B+MindexP(UB,j,0,ldb), ldb, ldbinc, pB);
      }
      else if (TB == AtlasNoTrans)
         ATL_pcol2blk_aX(K, jb, alpha, B+MindexP(UB,0,j,ldb), Mpld(UB,j,ldb),
                         ldbinc, pB);
      else /* TB == AtlasTrans */
         ATL_prow2blkT_aX(jb, K, alpha, B+MindexP(UB,j,0,ldb), ldb, ldbinc, pB);
/*
 *    Loop over row-panels of A
 */
      for (i=0; i < M; i += MB)
      {
         ib = M - i;
         ib = Mmin(ib, MB);
         if (jb != NB || ib != MB)
         {
            pNBmm0 = pNBmm = ATL_gNBmm;
            if (ib != NB && jb != NB) Mjoin(PATL,gezero)(MB, NB, pC, MB);
         }
         else
         {
            pNBmm = NBmm;
            pNBmm0 = NBmm_b0;
         }
/*
 *       Handle full blocks of K
 */
         if (nKb)
         {
            if (TA == AtlasNoTrans)
               ATL_prow2blkT(ib, NB, 1.0, A+MindexP(UA,i,0,lda),
                             lda, ldainc, pA);
            else
               ATL_pcol2blk(NB, ib, 1.0, A+MindexP(UA,0,i,lda),
                            Mpld(UA,i,lda), ldainc, pA);
            pNBmm0(ib, jb, NB, ATL_rone, pA, NB, pB, NB, ATL_rzero, pC, ib);
            for (k=1; k != nKb; k++)
            {
               if (TA == AtlasNoTrans)
                  ATL_prow2blkT(ib, NB, 1.0, A+MindexP(UA,i,ATL_MulByNB(k),lda),
                                Mpld(UA,ATL_MulByNB(k),lda), ldainc, pA);
               else
                  ATL_pcol2blk(NB, ib, 1.0, A+MindexP(UA,ATL_MulByNB(k),i,lda),
                               Mpld(UA,i,lda), ldainc, pA);
               pNBmm(ib, jb, NB, ATL_rone, pA, NB, pB+jb*NB*k, NB,
                     ATL_rone, pC, ib);
            }
            if (kb)
            {
               if (TA == AtlasNoTrans)
                  ATL_prow2blkT(ib, kb, 1.0,
                                A+MindexP(UA,i,ATL_MulByNB(nKb),lda),
                                Mpld(UA,ATL_MulByNB(nKb),lda), ldainc, pA);
               else
                  ATL_pcol2blk(kb, ib, 1.0,
                               A+MindexP(UA,ATL_MulByNB(nKb),i,lda),
                               Mpld(UA,i,lda), ldainc, pA);
               ATL_gNBmm(ib, jb, kb, ATL_rone, pA, kb, pB+jb*NB*nKb, kb,
                         ATL_rone, pC, ib);
            }
         }
         else if (kb)
         {
            Mjoin(PATL,gezero)(ib, jb, pC, ib);
            if (TA == AtlasNoTrans)
               ATL_prow2blkT(ib, kb, 1.0, A+MindexP(UA,i,0,lda),
                             lda, ldainc, pA);
            else
               ATL_pcol2blk(kb, ib, 1.0, A+MindexP(UA,0,i,lda),
                            Mpld(UA,i,lda), ldainc, pA);
            ATL_gNBmm(ib, jb, kb, ATL_rone, pA, kb, pB, kb, ATL_rzero, pC, ib);
         }
         ATL_pputblk(ib, jb, pC, C+MindexP(UC,i,j,ldc), Mpld(UC,j,ldc),
                     ldcinc, beta);
      }
   }
   free(vC);
   return(0);
}
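The loop structure is the point here: for each NB-wide column panel of B (copied to workspace once per j), every row panel of A is re-read, each K-block of A is copied just before its kernel call, and the accumulated block of C is written back once with beta applied, which is why only about K*NB + 2*NB*NB workspace is needed. A compact sketch of the same JIK blocking on ordinary dense column-major matrices; the helper name and fixed NB are assumptions, A is read in place instead of being copied block by block, and none of the packed-storage or fringe-kernel handling is reproduced:

#include <stdlib.h>
#include <string.h>

#define NB 4    /* assumed block size, for illustration only */

/* Sketch only: C := alpha*A*B + beta*C for column-major A (MxK), B (KxN),
 * C (MxN).  B's column panel is copied once per j; A is reread for every
 * (i,j) pair, mirroring the JIK ordering of ATL_pmmJIK. */
static int gemm_jik(int M, int N, int K, double alpha, const double *A,
                    int lda, const double *B, int ldb, double beta,
                    double *C, int ldc)
{
   double *pB = malloc((size_t)K*NB*sizeof(double));   /* one B panel */
   double *pC = malloc((size_t)NB*NB*sizeof(double));  /* one C block */
   int i, j, k, ii, jj, ib, jb;
   if (!pB || !pC) { free(pB); free(pC); return -1; }
   for (j = 0; j < N; j += NB)
   {
      jb = N - j; if (jb > NB) jb = NB;
      for (jj = 0; jj < jb; jj++)                      /* copy B panel once */
         memcpy(pB + (size_t)jj*K, B + (size_t)(j+jj)*ldb, K*sizeof(double));
      for (i = 0; i < M; i += NB)
      {
         ib = M - i; if (ib > NB) ib = NB;
         for (jj = 0; jj < jb; jj++)                   /* pC := alpha*A*pB  */
            for (ii = 0; ii < ib; ii++)
            {
               double s = 0.0;
               for (k = 0; k < K; k++)
                  s += A[(size_t)k*lda + i+ii] * pB[(size_t)jj*K + k];
               pC[jj*ib + ii] = alpha*s;
            }
         for (jj = 0; jj < jb; jj++)                   /* C := pC + beta*C  */
            for (ii = 0; ii < ib; ii++)
               C[(size_t)(j+jj)*ldc + i+ii] = pC[jj*ib + ii]
                                            + beta*C[(size_t)(j+jj)*ldc + i+ii];
      }
   }
   free(pB); free(pC);
   return 0;
}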
Example No. 7
int ATL_pmmJIKF(const enum PACK_UPLO UA, const enum ATLAS_TRANS TA,
                const enum PACK_UPLO UB, const enum ATLAS_TRANS TB,
                const int M, const int N, const int K, const SCALAR alpha,
                const TYPE *A, const int lda, const TYPE *B, const int ldb,
                const SCALAR beta, const enum PACK_UPLO UC,
                TYPE *C, const int ldc)
/*
 * Special packed matmul, calls dense gemm kernel using at most
 * M*K + K*NB + NB*NB space.  If this exceeds ATL_pkMaxMalloc or fails,
 * operates using at most 2*K*NB + NB*NB.  If this fails, returns non-zero.
 * If full space is malloced, both matrices are copied exactly once.  If
 * the smaller space is used, $A$ will be copied ceil(N/NB) times.
 */
{
   const int nKb = ATL_DivByNB(K), kb = K - ATL_MulByNB(nKb);
   const int incK = ATL_MulByNB(K);
   const int ldainc = (UA == AtlasUpper) ? 1 : ((UA == AtlasLower) ? -1 : 0);
   const int ldbinc = (UB == AtlasUpper) ? 1 : ((UB == AtlasLower) ? -1 : 0);
   const int ldcinc = (UC == AtlasUpper) ? 1 : ((UC == AtlasLower) ? -1 : 0);
   int ib, jb, i, j, k;
   void *vC=NULL;
   TYPE *pC, *pA, *pB, *pA0;
   NBMM0 pNBmm, pNBmm0;
   void (*A2blk)(const int M, const int N, const TYPE alpha, const TYPE *A,
                 int lda, const int ldainc, TYPE *V);

   i = ATL_Cachelen + ATL_MulBySize(NBNB+ATL_MulByNB(K)+M*K);
   if (i <= ATL_pkMaxMalloc) vC = malloc(i);
   if (!vC)
   {
      vC = malloc(ATL_Cachelen + ATL_MulBySize(NBNB+ATL_MulByNB(K+K)));
      if (TA == AtlasNoTrans) A2blk = ATL_prow2blkT;
      else A2blk = ATL_pcol2blk;
   }
   else A2blk = NULL;

   if (!vC) return(-1);
   pC = ATL_AlignPtr(vC);
   pB = pC + NBNB;
   pA = pB + ATL_MulByNB(K);

/*
 * If we've got the space, copy all of A up front
 */
   if (!A2blk)
   {
      if (TA == AtlasNoTrans)
         ATL_prow2blkTF(M, K, ATL_rone, A, lda, ldainc, pA);
      else ATL_pcol2blkF(K, M, ATL_rone, A, lda, ldainc, pA);
      pA -= ATL_MulByNB(K);
   }
   pA0 = pA;
/*
 * Loop over column panels of $B$
 */
   for (j=0; j < N; j += NB)
   {
      jb = N - j;
      jb = Mmin(jb, NB);
/*
 *    Copy column-panel of B to block-major storage
 */
      if (alpha == 1.0)
      {
         if (TB == AtlasNoTrans)
            ATL_pcol2blk(K, jb, alpha, B+MindexP(UB,0,j,ldb), Mpld(UB,j,ldb),
                         ldbinc, pB);
         else /* TB == AtlasTrans */
            ATL_prow2blkT(jb, K, alpha, B+MindexP(UB,j,0,ldb), ldb, ldbinc, pB);
      }
      else if (TB == AtlasNoTrans)
         ATL_pcol2blk_aX(K, jb, alpha, B+MindexP(UB,0,j,ldb), Mpld(UB,j,ldb),
                         ldbinc, pB);
      else /* TB == AtlasTrans */
         ATL_prow2blkT_aX(jb, K, alpha, B+MindexP(UB,j,0,ldb), ldb, ldbinc, pB);
/*
 *    Loop over row-panels of A
 */
      for (i=0; i < M; i += MB)
      {
         ib = M - i;
         ib = Mmin(ib, MB);
         if (A2blk)
         {
            if (TA == AtlasNoTrans)
               ATL_prow2blkT(ib, K, ATL_rone, A+MindexP(UA,i,0,lda), lda,
                             ldainc, pA);
            else /* TA == AtlasTrans */
               ATL_pcol2blk(K, ib, ATL_rone, A+MindexP(UA,0,i,lda),
                            Mpld(UA,i,lda), ldainc, pA);
         }
         else pA += ATL_MulByNB(K);
         if (jb != NB || ib != MB)
         {
            pNBmm0 = pNBmm = ATL_gNBmm;
            if (ib != NB && jb != NB) Mjoin(PATL,gezero)(MB, NB, pC, MB);
         }
         else
         {
            pNBmm = NBmm;
            pNBmm0 = NBmm_b0;
         }
/*
 *       Handle full blocks of K
 */
         if (nKb)
         {
            pNBmm0(ib, jb, NB, ATL_rone, pA, NB, pB, NB, ATL_rzero, pC, ib);
            for (k=1; k != nKb; k++)
            {
               pNBmm(ib, jb, NB, ATL_rone, pA+ib*NB*k, NB, pB+jb*NB*k, NB,
                     ATL_rone, pC, ib);
            }
            if (kb)
               ATL_gNBmm(ib, jb, kb, ATL_rone, pA+ib*NB*nKb, kb,
                         pB+jb*NB*nKb, kb, ATL_rone, pC, ib);
         }
         else if (kb)
         {
            Mjoin(PATL,gezero)(ib, jb, pC, ib);
            ATL_gNBmm(ib, jb, kb, ATL_rone, pA, kb, pB, kb, ATL_rzero, pC, ib);
         }
         ATL_pputblk(ib, jb, pC, C+MindexP(UC,i,j,ldc), Mpld(UC,j,ldc),
                     ldcinc, beta);
      }
      pA = pA0;
   }
   free(vC);
   return(0);
}
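What distinguishes this variant from ATL_pmmJIK is only the up-front workspace decision: it first tries to allocate room for all of A plus one B panel and one C block (capped by ATL_pkMaxMalloc), and only if that is refused does it fall back to the smaller per-panel workspace and re-copy A inside the j loop. A tiny sketch of that two-tier choice; the byte cap, element type, and helper name are assumptions standing in for the ATLAS macros:

#include <stdlib.h>

#define PK_MAX_MALLOC (1<<22)   /* assumed byte cap, standing in for ATL_pkMaxMalloc */

/* Sketch only: prefer the large workspace that holds all of A plus one
 * nb-wide panel of B and one nb x nb block of C; otherwise fall back to
 * the panel-only workspace, in which case A must be re-copied per panel. */
static void *alloc_workspace(int M, int K, int nb, int *copy_A_once)
{
   size_t big   = sizeof(double)*((size_t)M*K + (size_t)K*nb + (size_t)nb*nb);
   size_t small = sizeof(double)*(2*(size_t)K*nb + (size_t)nb*nb);
   void *w = (big <= PK_MAX_MALLOC) ? malloc(big) : NULL;
   *copy_A_once = (w != NULL);      /* got the big buffer: copy A up front */
   if (!w) w = malloc(small);       /* otherwise A is re-copied per panel  */
   return w;                        /* NULL means even the fallback failed */
}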
Example No. 8
void Mjoin(PATL,sprk_rK)
   (const enum PACK_UPLO UA, const enum PACK_TRANS TA,
    const enum ATLAS_UPLO UC, const int CP,
    const int N, const int K, int R, const SCALAR alpha,
    const TYPE *A, int lda, const SCALAR beta0, TYPE *C, const int ldc)
/*
 * This routine does the packed symmetric rank-K update by doing ceil(K/R)
 * rank-R updates of C.  This is primarily done for CacheEdge, but it is also
 * useful for auto-reducing R until enough workspace can be allocated.
 */
{
   const enum PACK_UPLO UC2 = ((CP) ? UC : PackGen);
   int k=0, kb, ierr;
   const int ldainc = (UA == AtlasUpper) ? 1 : ((UA == AtlasLower) ? -1 : 0);
   const int ldcinc = (UC2 == AtlasUpper) ? 1 : ((UC2 == AtlasLower) ? -1 : 0);
   #ifdef TREAL
      TYPE beta = beta0;
   #else
      TYPE beta[2];
      *beta = *beta0;
      beta[1] = beta0[1];
   #endif

   if (R < NB) R = NB<<4;
   if ((K - R) < 2*NB) R = K;
   do
   {
      kb = K - k;
      if (kb - R < 2*NB) R = kb;
      kb = Mmin(R, kb);
/*
 *    If we can't do the rank-R update, reduce R until we can, or R = 1
 */
      ierr = Mjoin(PATL,prk_kmm)(UC, UA, TA, N, kb, alpha,
                                 A, lda, beta, CP, C, ldc);
      if (ierr && R <= NB*8)
      {
         if (UC == AtlasUpper)
         {
            if (TA == AtlasNoTrans)
               ATL_rk_recUN(UA, TA, UC, CP, N, kb, alpha, A, lda, beta, C, ldc);
            else
               ATL_rk_recUT(UA, TA, UC, CP, N, kb, alpha, A, lda, beta, C, ldc);
         }
         else
         {
            if (TA == AtlasNoTrans)
               ATL_rk_recLN(UA, TA, UC, CP, N, kb, alpha, A, lda, beta, C, ldc);
            else
               ATL_rk_recLT(UA, TA, UC, CP, N, kb, alpha, A, lda, beta, C, ldc);
         }
         ierr = 0;
      }
      if (ierr)
      {
         R = Mmin(NB*8, R>>1);
         ATL_assert(R);
      }
/*
 *    Subsequent updates use beta = 1
 */
      else
      {
         #ifdef TREAL
            beta = ATL_rone;
         #else
            *beta = ATL_rone;
            beta[1] = ATL_rzero;
         #endif
         if (TA == AtlasNoTrans)
         {
            A += MindexP(UA, 0, R, lda);
            lda = Mpld(UA, R, lda);
         }
         else A += MindexP(UA, R, 0, lda);
         k += R;
      }
   }