Esempio n. 1
0
/*
 * Walks the M x N input A tile by tile and copies every tile into the
 * workspace V.  Full NB x NB tiles go through row2blkT_NB, partial tiles
 * (mr leftover along M, nr leftover along N) go through row2blkT_KB.
 * Each kernel call receives two destination pointers; the first one is
 * offset from the second by the number of elements in the tile.
 * NOTE(review): the <<1 scaling suggests two TYPE values per element
 * (complex data) -- confirm against the build configuration.
 */
void row2blkT2(const int M, const int N, const TYPE *A, const int lda,
               TYPE *V, const SCALAR alpha)
{
   const int nMb = ATL_DivByNB(M), nNb = ATL_DivByNB(N);
   const int mr = M - ATL_MulByNB(nMb);
   const int nr = N - ATL_MulByNB(nNb);
   const int panelIncA = (ATL_MulByNB(lda) - M + mr) << 1;
   const int tileIncV = ATL_MulByNB(N) << 1;
   const int edgeHalf = ATL_MulByNB(mr);
   const int edgeIncV = edgeHalf << 1;
   TYPE *tile;
   TYPE *edge = V + nMb * tileIncV;   /* partial-M tiles live after full ones */
   int ib, jb;

/*
 * Panels that are a full NB wide along N
 */
   jb = nNb;
   while (jb)
   {
      tile = V;
      ib = nMb;
      while (ib)
      {
         row2blkT_NB(NB, NB, A, lda, tile+NBNB, tile, alpha);
         A += NB2;
         tile += tileIncV;
         ib--;
      }
      if (mr)
      {
         row2blkT_KB(mr, NB, A, lda, edge+edgeHalf, edge, alpha);
         edge += edgeIncV;
      }
      A += panelIncA;
      V += NBNB2;
      jb--;
   }
/*
 * Leftover panel of width nr along N
 */
   if (nr)
   {
      const int halfOff = ATL_MulByNB(nr);

      tile = V;
      ib = nMb;
      while (ib)
      {
         row2blkT_KB(NB, nr, A, lda, tile+halfOff, tile, alpha);
         A += NB2;
         tile += tileIncV;
         ib--;
      }
      if (mr)
         row2blkT_KB(mr, nr, A, lda, edge+mr*nr, edge, alpha);
   }
}
Esempio n. 2
0
/*
 * Recursive blocked factorization of the leading N x N matrix, using
 * row-major BLAS calls on the lower triangle (trsm with CblasConjTrans,
 * followed by herk).  Storage is addressed with two TYPE values per
 * element: lda2 = 2*lda and ONE is a two-entry {ATL_rone, ATL_rzero} array.
 * A nonzero status from the left recursion is returned unchanged; a nonzero
 * status from the right recursion is returned offset by Nleft.
 * NOTE(review): presumably the complex Cholesky (potrf) routine -- confirm.
 */
int ATL_potrfRL(const int N, TYPE *A, const int lda)
{
   TYPE *An, *Ar;
   int Nleft, Nright, ierr;
   static const TYPE ONE[2] = {ATL_rone, ATL_rzero};
   const int lda2=lda+lda;

   if (N > 1)
   {
      Nleft = N >> 1;   /* split the problem roughly in half */
      #ifdef NB
         /* for Nleft > 2*NB, re-derive it through ATL_MulByNB(ATL_DivByNB()) */
         if (Nleft > NB<<1) Nleft = ATL_MulByNB(ATL_DivByNB(Nleft));
      #endif
      Nright = N - Nleft;
      ierr = ATL_potrfRL(Nleft, A, lda);   /* factor the leading block first */
      if (!ierr)
      {
         Ar = A + Nleft * lda2;   /* block starting at row Nleft, column 0 */
         An = Ar + Nleft+Nleft;   /* block starting at row Nleft, column Nleft */
         cblas_trsm(CblasRowMajor, CblasRight, CblasLower, CblasConjTrans,
                    CblasNonUnit, Nright, Nleft, ONE, A, lda, Ar, lda);
         cblas_herk(CblasRowMajor, CblasLower, CblasNoTrans, Nright, Nleft,
                    ATL_rnone, Ar, lda, ATL_rone, An, lda);
         ierr = ATL_potrfRL(Nright, An, lda);
         /* status from the trailing block is relative to it: add Nleft */
         if (ierr) return(ierr+Nleft);
      }
      else return(ierr);
   }
Esempio n. 3
0
static void ATL_rk_recLN
   (const enum PACK_UPLO UA, const enum PACK_TRANS TA,
    const enum ATLAS_UPLO UC, const int CP,
    const int N, const int K, const SCALAR alpha,
    const TYPE *A, int lda, const SCALAR beta, TYPE *C, const int ldc)
/*
 * Lower / no-transpose case.  The whole update is first attempted with
 * prk_kmm; only when that call reports failure (nonzero) is N split in two
 * and the pieces handled separately: the leading Nleft part recursively,
 * the off-diagonal Nright x Nleft part by gpmm, and the trailing Nright
 * part recursively.
 */
{
   const enum PACK_UPLO UC2 = (CP ? UC : PackGen);
   const TYPE *Alow;
   int nL, nR;

   if (!Mjoin(PATL,prk_kmm)(UC, UA, TA, N, K, alpha, A, lda, beta, CP, C, ldc))
      return;   /* direct call succeeded, nothing left to do */

   nL = N >> 1;
   #ifdef NB
      if (nL > NB) nL = ATL_MulByNB(ATL_DivByNB(nL));
   #endif
   nR = N - nL;
   Alow = A + (nL SHIFT);   /* A advanced past the first nL entries */

   ATL_rk_recLN(UA, TA, UC, CP, nL, K, alpha, A, lda, beta, C, ldc);
   Mjoin(PATL,gpmm)(PackGen, TA, PackGen, AtlasTrans, CP ? PackLower:PackGen,
                    nR, nL, K, alpha, Alow, 0, 0, lda,
                    A, 0, 0, lda, beta,
                    C+MindexP(UC2,nL,0,ldc), 0, 0, ldc);
   ATL_rk_recLN(UA, TA, UC, CP, nR, K, alpha, Alow, lda,
                beta, C+MindexP(UC2,nL,nL,ldc), Mpld(UC2,nL,ldc));
}
Esempio n. 4
0
/*
 * Recursive in-place update on the lower triangle of the N x N matrix A.
 * A is split into a leading Nleft x Nleft block (U0), an off-diagonal
 * Nright x Nleft block (G) and a trailing Nright x Nright block (U1); the
 * routine recurses on U0, updates U0 with my_syrk using G, multiplies G by
 * U1 with cblas_trmm, and finally recurses on U1.
 * NOTE(review): presumably the LAPACK lauum operation (product of a
 * triangular factor with its transpose) -- confirm.
 */
void ATL_lauumL(const int N, TYPE *A, const int lda)
{
   int Nleft, Nright;
   #ifdef TREAL
      const TYPE one=ATL_rone;
   #else
      const TYPE one[2]={ATL_rone, ATL_rzero};
   #endif
   TYPE *G, *U0=A, *U1;

   if (N > 1)
   {
      Nleft = N >> 1;   /* split roughly in half */
      #ifdef NB
         /* for Nleft > NB, re-derive it through ATL_MulByNB(ATL_DivByNB()) */
         if (Nleft > NB) Nleft = ATL_MulByNB(ATL_DivByNB(Nleft));
      #endif
      Nright = N - Nleft;
      /*
       * In both storage orders G is reached by skipping Nleft rows and U1 by
       * additionally skipping Nleft columns; only the stride used differs.
       */
      #ifdef RowMajor_
         G = A + Nleft*(lda SHIFT);
         U1 = G + (Nleft SHIFT);
      #else
         G = A + (Nleft SHIFT);
         U1 = G + Nleft*(lda SHIFT);
      #endif
      ATL_lauumL(Nleft, U0, lda);
      my_syrk(MyOrder, CblasLower, my_trans, Nleft, Nright, ATL_rone,
              G, lda, ATL_rone, U0, lda);
      cblas_trmm(MyOrder, CblasLeft, CblasLower, my_trans, CblasNonUnit,
                 Nright, Nleft, one, U1, lda, G, lda);
      ATL_lauumL(Nright, U1, lda);
   }
Esempio n. 5
0
/*
 * Column-major routine that first calls ATL_trtri on the upper triangle of
 * the N x N matrix A and then sweeps over column blocks from right to left.
 * For each block the strictly lower part is moved into wrk by trcpzeroL,
 * and the block is updated with cblas_gemm / cblas_trsm against that copy.
 * Finally the columns are swapped according to ipiv, from N-2 down to 0.
 *
 * wrk/lwrk : workspace and its length in elements; the block width is the
 *            largest convenient value not exceeding lwrk / N.
 * Returns  : the ATL_trtri status when it is nonzero, -6 when lwrk / N
 *            leaves no room for even one column, 0 otherwise.
 * NOTE(review): presumably the inverse-from-LU (getri) computation -- confirm.
 */
int ATL_getriC(const int N, TYPE *A, const int lda, const int *ipiv,
               TYPE *wrk, const int lwrk)
{
   const int lda2 = lda SHIFT;
   int J, jb, nb, nright, iret;
   #ifdef TREAL
      const TYPE one=ATL_rone, none=ATL_rnone;
   #else
      const TYPE one[2]={ATL_rone,ATL_rzero}, none[2]={ATL_rnone, ATL_rzero};
   #endif

   iret = ATL_trtri(CblasColMajor, CblasUpper, CblasNonUnit, N, A, lda);
   if (iret || N <= 1) return(iret);
/*
 * Find largest NB we can use with our provided workspace
 */
   jb = lwrk / N;
   if (jb >= NB) nb = ATL_MulByNB(ATL_DivByNB(jb));
   else if (jb >= ATL_mmNU) nb = (jb/ATL_mmNU)*ATL_mmNU;
   else nb = jb;
   if (!nb) return(-6);  /* need at least 1 col of workspace */
/*
 * Only first iteration will have partial block, unroll it
 */
   jb = N - (N/nb)*nb;
   if (!jb) jb = nb;
   J = N - jb;
   A += lda2*J;
   trcpzeroL(jb, jb, A+(J SHIFT), lda, wrk, jb);
   cblas_trsm(CblasColMajor, CblasRight, CblasLower, CblasNoTrans, CblasUnit,
              N, jb, one, wrk, jb, A, lda);
/*
 * Remaining blocks are all nb wide; J is a multiple of nb here, so the loop
 * ends with J == 0 and A back at the first column
 */
   while (J)
   {
      J -= nb;
      A -= nb*lda2;
      nright = N-J;
      trcpzeroL(nright, nb, A+(J SHIFT), lda, wrk, nright);
      cblas_gemm(CblasColMajor, CblasNoTrans, CblasNoTrans, N, nb,
                 nright-nb, none, A+nb*lda2, lda, wrk+(nb SHIFT), nright,
                 one, A, lda);
      cblas_trsm(CblasColMajor, CblasRight, CblasLower, CblasNoTrans,
                 CblasUnit, N, nb, one, wrk, nright, A, lda);
   }
/*
 * Apply column interchanges
 */
   for (J=N-2; J >= 0; J--)
   {
      jb = ipiv[J];
      if (jb != J) cblas_swap(N, A+J*lda2, 1, A+jb*lda2, 1);
   }
   return(iret);
}
Esempio n. 6
0
void Mjoin(PATL,sqtrans)(ATL_CINT N, TYPE *C, ATL_CINT ldc)
/*
 * In-place transpose of the N x N matrix C.  Matrices smaller than 2*NB are
 * passed directly to sqtrans0; larger ones are processed in NB-sized tiles:
 * off-diagonal tile pairs are exchanged by geswapT and diagonal tiles are
 * transposed by sqtrans0.
 */
{
   const size_t ld = ldc;
   ATL_CINT nFull = ATL_MulByNB(ATL_DivByNB(N));
   ATL_CINT nRem = N - nFull;
   ATL_INT i, j;

   if (N >= NB+NB)
   {
/*
 *    Trailing partial tiles first, then full tiles from last to first, so
 *    the first part of the matrix is the most recently touched
 */
      if (nRem)
      {
         i = 0;
         while (i < nFull)
         {
            Mjoin(PATL,geswapT)(NB, nRem, C+((nFull*ld+i)SHIFT), ldc,
                                C+((nFull+i*ld)SHIFT), ldc);
            i += NB;
         }
         Mjoin(PATL,sqtrans0)(nRem, C+((nFull*(ld+1))SHIFT), ldc);
      }
      j = nFull - NB;
      while (j >= 0)
      {
         i = 0;
         while (i < j)
         {
            Mjoin(PATL,geswapT)(NB, NB, C+((j*ld+i)SHIFT), ldc,
                                C+((j+i*ld)SHIFT), ldc);
            i += NB;
         }
         Mjoin(PATL,sqtrans0)(NB, C+((j*(ld+1))SHIFT), ldc);
         j -= NB;
      }
   }
   else
      Mjoin(PATL,sqtrans0)(N, C, ldc);
}
Esempio n. 7
0
/*
 * Copies the M x N matrix A (leading dimension lda, read two TYPE values per
 * element) into the workspace V, one column at a time, applying the scalcp
 * macro to every element.  Rows belonging to full NB-row blocks are written
 * through rp/ip; the ib leftover rows are written through prp/pip, which
 * start at V + (M-ib)*2*N.
 * NOTE(review): rp/prp and ip/pip presumably receive the real and imaginary
 * parts respectively -- confirm against the scalcp definition.
 */
void col2blk(const int M, const int N, const TYPE *A, const int lda, TYPE *V,
             const SCALAR alpha)
{
   const int nMb = ATL_DivByNB(M), ib = M - ATL_MulByNB(nMb);
   const int incA = (lda - M)<<1, incv = ATL_MulByNB(N);
   const int incV = (incv<<1) - NB;
   int i, ii, j;
   TYPE *rp = V+ATL_MulByNB(N), *ip = V, *prp, *pip;
   /*
    * The locals below are not referenced directly in this function; they are
    * presumably consumed by the scalcp macro for the selected alpha case.
    */
   #ifdef ALPHAXI0
      #ifdef Conj_
         const register TYPE ralpha = *alpha, calpha = -ralpha;
      #else
         const register TYPE ralpha = *alpha;
      #endif
   #elif defined(ALPHAX)
      const register TYPE ralpha = *alpha, ialpha = alpha[1];
      register TYPE ra, ia;
   #endif

   /* destination of the partial (ib-row) block: two halves of ib*N each */
   pip = V + (M-ib)*(N<<1);
   prp = pip + ib*N;

   /* one pass per column; V moves NB per column, A skips the lda padding */
   for (j=N; j; j--, V += NB, A += incA)
   {
      ip = V;
      rp = V + incv;
      /* full row blocks: after NB elements jump to the next block in V */
      for (ii=nMb; ii; ii--, rp += incV, ip += incV)
      {
         for (i=NB; i; i--, A += 2, rp++, ip++) scalcp(A, rp, ip);
      }
      /* leftover rows of this column */
      for (i=ib; i; i--, A += 2, prp++, pip++) scalcp(A, prp, pip);
   }
}
Esempio n. 8
0
/*
 * Copies the M x N matrix A into the workspace V tile by tile.
 *
 * ldainc selects how A is addressed: 1 -> PackUpper, -1 -> PackLower, and
 * 0 means A is handed to the row2blkT2_a1 / row2blkT2_aX routines unchanged.
 * For nonzero ldainc every NB x NB tile (and each partial tile of mr rows
 * and/or nr columns) is located with MindexP/Mpld and copied by
 * ATL_prow2blk_KB_a1 when alpha equals ATL_rone, ATL_prow2blk_KB_aX
 * otherwise.  Tiles with fewer than NB rows are stored after all full-row
 * tiles, starting at V + nMb*ATL_MulByNB(N).
 */
void Mjoin(PATL,prow2blkTF)(const int M, const int N, const SCALAR alpha,
                            const TYPE *A, int lda, const int ldainc, TYPE *V)
{
   const int nMb = ATL_DivByNB(M);
   const int m = ATL_MulByNB(nMb), n = ATL_MulByNB(ATL_DivByNB(N));
   const int nr = N - n, mr = M - m;
   const int incVm = ATL_MulByNB(N), incVV = ATL_MulByNB(mr);
   int i, j;
   const enum PACK_UPLO UA = (ldainc == 1) ? PackUpper :
      ( (ldainc == -1) ? PackLower : PackGen );
   TYPE *v, *vv = V+nMb*incVm;
   void (*row2blk)(const int M, const int N, const TYPE alpha, const TYPE *A,
                   int lda, const int ldainc, TYPE *V);

/*
 * Fixed leading dimension: the dense copy routines do all the work
 */
   if (!ldainc)
   {
      if (SCALAR_IS_ONE(alpha))
         Mjoin(PATL,row2blkT2_a1)(M, N, A, lda, V, alpha);
      else
         Mjoin(PATL,row2blkT2_aX)(M, N, A, lda, V, alpha);
      return;
   }

   if (alpha == ATL_rone) row2blk = ATL_prow2blk_KB_a1;
   else row2blk = ATL_prow2blk_KB_aX;

/*
 * Column panels that are a full NB wide
 */
   for (j=0; j < n; j += NB)
   {
      for (v=V, i=0; i < m; i += NB, v += incVm)
         row2blk(NB, NB, alpha, A+MindexP(UA,i,j,lda), Mpld(UA,j,lda),
                 ldainc, v);
      if (mr)
      {
         row2blk(mr, NB, alpha, A+MindexP(UA,m,j,lda), Mpld(UA,j,lda),
                 ldainc, vv);
         vv += incVV;
      }
      V += NBNB;
   }
/*
 * Leftover column panel of width nr
 */
   if (nr)
   {
      for (v=V, i=0; i < m; i += NB, v += incVm)
         row2blk(NB, nr, alpha, A+MindexP(UA,i,n,lda), Mpld(UA,n,lda),
                 ldainc, v);
      if (mr)
         row2blk(mr, nr, alpha, A+MindexP(UA,m,n,lda), Mpld(UA,n,lda),
                 ldainc, vv);
   }
}
Esempio n. 9
0
int ATL_getrfC(const int M, const int N, TYPE *A, const int lda, int *ipiv)
/*
 * Column-major factorization of form
 *   A = P * L * U
 * where P is a row-permutation matrix, L is lower triangular with unit diagonal
 * elements (lower trapezoidal if M > N), and U is upper triangular (upper
 * trapezoidal if M < N).  This is the recursive Level 3 BLAS version.
 * The first nonzero status reported by a sub-factorization is kept in ierr;
 * a status coming from the trailing part is offset by Nleft.
 */
{
   const int MN = Mmin(M, N);
   int Nleft, Nright, k, i, ierr=0;
   #ifdef TCPLX
      const TYPE one[2] = {ATL_rone, ATL_rzero};
      const TYPE none[2] = {ATL_rnone, ATL_rzero};
      TYPE inv[2], tmp[2];
   #else
      #define one ATL_rone
      #define none ATL_rnone
      TYPE tmp;
   #endif
   TYPE *Ac, *An;

   /* problems with at most ATL_L1elts elements are handed to getf2 */
   if (((size_t)M)*N <= ATL_L1elts)
      return(Mjoin(PATL,getf2)(M, N, A, lda, ipiv));
   #if defined(ATL_USEPTHREADS) && defined(ATL_USEPCA)
      if (N <= (NB<<2) && N >= 16 && M-N >= ATL_PCAMin &&
          ((size_t)ATL_MulBySize(M)*N) <= CacheEdge*ATL_NTHREADS)
      {
         /*
          * N >= 16 is already required by the enclosing condition, so the
          * else branch below cannot be taken.
          */
         if (N >= 16)
            ierr = Mjoin(PATL,tgetf2)(M, N, A, lda, ipiv);
         else
            ierr = Mjoin(PATL,tgetf2_nocp)(M, N, A, lda, ipiv);
         return(ierr);
      }
   #endif
   if (MN > ATL_luMmin)
   {
      Nleft = MN >> 1;   /* split min(M,N) roughly in half */
      #ifdef NB
         /* for Nleft > NB, re-derive it through ATL_MulByNB(ATL_DivByNB()) */
         if (Nleft > NB) Nleft = ATL_MulByNB(ATL_DivByNB(Nleft));
      #endif
      Nright = N - Nleft;
      i = ATL_getrfC(M, Nleft, A, lda, ipiv);  /* factor left to L & U */
      if (i) if (!ierr) ierr = i;
/*
 *    Update trailing submatrix
 */
      Ac = A + (Nleft * lda SHIFT);   /* first of the Nright right columns */
      An = Ac + (Nleft SHIFT);        /* same columns, starting at row Nleft */
      ATL_laswp(Nright, Ac, lda, 0, Nleft, ipiv, 1);
      cblas_trsm(CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasUnit,
                 Nleft, Nright, one, A, lda, Ac, lda);
      cblas_gemm(CblasColMajor, CblasNoTrans, CblasNoTrans, M-Nleft, Nright,
                 Nleft, none, A+(Nleft SHIFT), lda, Ac, lda, one, An, lda);
      i = ATL_getrfC(M-Nleft, Nright, An, lda, ipiv+Nleft);
      if (i) if (!ierr) ierr = i + Nleft;
      /* shift the pivot indices written by the second recursion by Nleft */
      for (i=Nleft; i != MN; i++) ipiv[i] += Nleft;
      ATL_laswp(Nleft, A, lda, Nleft, MN, ipiv, 1);
   }
Esempio n. 10
0
/*
 * Copies the M x N matrix A into the workspace V one column panel at a
 * time: each group of NB columns (and a final group of jb = N mod NB
 * columns, if any) is handed to col2blk.  Between panels A advances by
 * 2*lda*NB and V by 2*M*NB TYPE values.
 *
 * The two strides are computed in size_t so that the products cannot
 * overflow int for large lda or M.
 */
void col2blk2(const int M, const int N, const TYPE *A, const int lda, TYPE *V,
              const TYPE *alpha)
{
   int j;
   const int nNb = ATL_DivByNB(N), jb = N - ATL_MulByNB(nNb);
   const size_t incA = (((size_t)lda) * NB) << 1;
   const size_t incV = (((size_t)M) * NB) << 1;

   for (j=nNb; j; j--)
   {
      col2blk(M, NB, A, lda, V, alpha);
      A += incA;
      V += incV;
   }
   if (jb) col2blk(M, jb, A, lda, V, alpha);
}
Esempio n. 11
0
/*
 * Copies an N x nb section of A into the workspace v, NB rows of A at a
 * time.  Groups of NB rows use row2blkT_NB when nb equals NB and
 * row2blkT_KB otherwise; the rows left over after the last full group are
 * always copied by row2blkT_KB.  Each kernel call gets two destination
 * pointers, the first offset from the second by the group's element count.
 */
void row2blkT(const int N, const int nb, const TYPE *A, const int lda,
              TYPE *v, const SCALAR alpha)
{
   const int nFull = ATL_DivByNB(N);
   const int stepA = ATL_MulByNB(lda)<<1;
   const int half = ATL_MulByNB(nb);
   const int stepV = half<<1;
   int k;

   k = nFull;
   if (nb == NB)
   {
      while (k)
      {
         row2blkT_NB(NB, NB, A, lda, v+NBNB, v, alpha);
         A += stepA;
         v += NBNB2;
         k--;
      }
   }
   else
   {
      while (k)
      {
         row2blkT_KB(nb, NB, A, lda, v+half, v, alpha);
         A += stepA;
         v += stepV;
         k--;
      }
   }
/*
 * Rows remaining after the last full group
 */
   k = N - ATL_MulByNB(nFull);
   if (k)
      row2blkT_KB(nb, k, A, lda, v+k*nb, v, alpha);
}
Esempio n. 12
0
int ATL_getrfR(const int M, const int N, TYPE *A, const int lda, int *ipiv)
/*
 * Row-major factorization of form
 *   A = L * U * P
 * where P is a column-permutation matrix, L is lower triangular (lower
 * trapezoidal if M > N), and U is upper triangular with unit diagonals (upper
 * trapezoidal if M < N).  This is the recursive Level 3 BLAS version.
 * The first nonzero status reported by a sub-factorization is kept in ierr;
 * a status coming from the lower part is offset by Nup.
 */
{
   const int MN = Mmin(M, N);
   int Nup, Ndown, i, ierr=0;
   #ifdef TCPLX
      const TYPE one[2] = {ATL_rone, ATL_rzero};
      const TYPE none[2] = {ATL_rnone, ATL_rzero};
      TYPE inv[2], tmp[2];
   #else
      #define one ATL_rone
      #define none ATL_rnone
      TYPE tmp;
   #endif
   TYPE *Ar, *Ac, *An;

   if (MN > 1)
   {
      Nup = MN >> 1;   /* split min(M,N) roughly in half */
      #ifdef NB
         /* for Nup > NB, re-derive it through ATL_MulByNB(ATL_DivByNB()) */
         if (Nup > NB) Nup = ATL_MulByNB(ATL_DivByNB(Nup));
      #endif
      Ndown = M - Nup;
      i = ATL_getrfR(Nup, N, A, lda, ipiv);   /* factor the top Nup rows */
      if (i) if (!ierr) ierr = i;
      Ar = A + (Nup * lda SHIFT);   /* row Nup, column 0 */
      Ac = A + (Nup SHIFT);         /* row 0, column Nup */
      An = Ar + (Nup SHIFT);        /* row Nup, column Nup */

      ATL_laswp(Ndown, Ar, lda, 0, Nup, ipiv, 1);  /* apply pivots */
      cblas_trsm(CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans,
                 CblasUnit, Ndown, Nup, one, A, lda, Ar, lda);
      cblas_gemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, Ndown, N-Nup, Nup,
                 none, Ar, lda, Ac, lda, one, An, lda);

      i = ATL_getrfR(Ndown, N-Nup, An, lda, ipiv+Nup);
      if (i) if (!ierr) ierr = Nup + i;
      /* shift the pivot indices written by the second recursion by Nup */
      for (i=Nup; i != MN; i++) ipiv[i] += Nup;
      ATL_laswp(Nup, A, lda, Nup, MN, ipiv, 1);  /* apply pivots */
   }
Esempio n. 13
0
/*
 * Recursive routine working on the lower triangle of the N x N matrix A
 * through row-major BLAS calls.  A is split into a leading Nleft x Nleft
 * triangle (A), an off-diagonal Nright x Nleft block (Age) and a trailing
 * Nright x Nright triangle (Atr).  Age is updated by two cblas_trsm calls
 * (the second with the scalar mone) before either triangle is processed
 * recursively.
 * A nonzero status from the leading triangle is returned unchanged; one from
 * the trailing triangle is returned offset by Nleft.
 * NOTE(review): presumably in-place triangular inversion (trtri) -- confirm.
 */
int ATL_trtriRL(const enum ATLAS_DIAG Diag, const int N, TYPE *A, const int lda)
{
  int ierr = 0;

   TYPE *Age, *Atr;
   TYPE tmp;
   int Nleft, Nright;
   #ifdef TREAL
      #define one ATL_rone
      #define mone -ATL_rone
      #define none ATL_rnone
   #else
      static const TYPE one[2] = {ATL_rone, ATL_rzero};
      static const TYPE mone[2] = {-ATL_rone, ATL_rzero};
      static const TYPE none[2] = {ATL_rnone, ATL_rzero};
   #endif

/* the real build stops recursing at REAL_RECURSE_LIMIT, the other at 1 */
#ifdef TREAL
   if (N > REAL_RECURSE_LIMIT)
#else
   if (N > 1)
#endif
   {
      Nleft = N >> 1;   /* split roughly in half */
      #ifdef NB
         /* for Nleft > NB, re-derive it through ATL_MulByNB(ATL_DivByNB()) */
         if (Nleft > NB) Nleft = ATL_MulByNB(ATL_DivByNB(Nleft));
      #endif
      Nright = N - Nleft;

      Age = A + ((Nleft*lda) SHIFT);       /* row Nleft, column 0 */
      Atr = A + (Nleft * (lda+1) SHIFT);   /* row Nleft, column Nleft */

      cblas_trsm(AtlasRowMajor, AtlasRight, AtlasLower, AtlasNoTrans, Diag,
                 Nright, Nleft, one, A, lda, Age, lda);

      cblas_trsm(AtlasRowMajor, AtlasLeft, AtlasLower, AtlasNoTrans, Diag,
                 Nright, Nleft, mone, Atr, lda, Age, lda);

      ierr = ATL_trtriRL(Diag, Nleft, A, lda);
      if (ierr!=0) return(ierr);
      ierr = ATL_trtriRL(Diag, Nright, Atr, lda);
      if (ierr!=0) return(ierr+Nleft);

   }
Esempio n. 14
0
/*
 * Accumulates into the MB x N block C the product of the packed operands A
 * and B over the whole K dimension, KB entries at a time.  beta is applied
 * only by the first kernel call; every later call accumulates with
 * ATL_rone.  When K is smaller than one full block, C is zeroed first if
 * beta equals ATL_rzero and the single partial product is formed by pKBmm.
 */
void Mjoin(PATL,MBJBmm)(const int N, const int K, const TYPE *A, const TYPE *B,
                        const TYPE beta, TYPE *C, const int ldc)
{
   const int nKb = ATL_DivByNB(K);
   #ifdef TREAL
      const int incB = ATL_MulByNB(N);
      #define incA NBNB;
      #define zero ATL_rzero
   #else
      const int incB = ATL_MulByNB(N)<<1;
      #define incA NBNB2;
      const TYPE zero[2] = {ATL_rzero, ATL_rzero};
   #endif
   register int k;

/*
 * K shorter than one full block: at most one partial product
 */
   if (!nKb)
   {
      k = K - ATL_MulByNB(nKb);
      if (k)
      {
         if (beta == ATL_rzero) Mjoin(PATL,gezero)(MB, N, C, ldc);
         Mjoin(PATL,pKBmm)(MB, N, k, ATL_rone, A, k, B, k, beta, C, ldc);
      }
      return;
   }
/*
 * First full block applies the caller's beta through the matching kernel
 */
   if (beta == ATL_rone)
      Mjoin(PATL,pNBmm_b1)(MB, N, KB, ATL_rone, A, KB, B, KB, beta, C, ldc);
   else if (beta == ATL_rzero)
      Mjoin(PATL,pNBmm_b0)(MB, N, KB, ATL_rone, A, KB, B, KB, beta, C, ldc);
   else
      Mjoin(PATL,pNBmm_bX)(MB, N, KB, ATL_rone, A, KB, B, KB, beta, C, ldc);
   A += incA;
   B += incB;
/*
 * Remaining full blocks accumulate into C
 */
   k = nKb - 1;
   while (k)
   {
      Mjoin(PATL,pNBmm_b1)(MB, N, KB, ATL_rone, A, KB, B, KB,
                           ATL_rone, C, ldc);
      A += incA;
      B += incB;
      k--;
   }
/*
 * Trailing partial block, if any
 */
   k = K - ATL_MulByNB(nKb);
   if (k)
      Mjoin(PATL,pKBmm)(MB, N, k, ATL_rone, A, k, B, k, ATL_rone, C, ldc);
}
Esempio n. 15
0
/*
 * Recursive blocked factorization of the leading N x N matrix, using
 * column-major BLAS calls on the lower triangle (trsm with llt_trans,
 * followed by llt_syrk).  The real build (TREAL) stops recursing at N <= 4,
 * the other build at N <= 1.
 * A nonzero status from the left recursion is returned unchanged; a nonzero
 * status from the right recursion is returned offset by Nleft.
 * NOTE(review): presumably the Cholesky (potrf) routine -- confirm.
 */
int ATL_potrfL(const int N, TYPE *A, const int lda)
{
   TYPE *An, *Ar;
   const size_t lda2=(lda SHIFT);
   int Nleft, Nright, ierr;
   /*
    * For TREAL the macro below replaces every later use of lda2 with lda;
    * the variable declared above is then no longer referenced.
    */
   #ifdef TREAL
      #define lda2 lda
      #define ONE ATL_rone
   #else
      static const TYPE ONE[2] = {ATL_rone, ATL_rzero};
   #endif

#ifdef TREAL
   if (N > 4)
#else
   if (N > 1)
#endif
   {
      Nleft = N >> 1;   /* split roughly in half */
      #ifdef NB
         /* for Nleft > 2*NB, re-derive it through ATL_MulByNB(ATL_DivByNB()) */
         if (Nleft > NB<<1) Nleft = ATL_MulByNB(ATL_DivByNB(Nleft));
      #endif
      Nright = N - Nleft;
      ierr = ATL_potrfL(Nleft, A, lda);   /* factor the leading block first */
      if (!ierr)
      {
         Ar = A + (Nleft SHIFT);   /* row Nleft, column 0 */
         An = Ar + lda2 * Nleft;   /* row Nleft, column Nleft */
         cblas_trsm(CblasColMajor, CblasRight, CblasLower, llt_trans,
                    CblasNonUnit, Nright, Nleft, ONE, A, lda, Ar, lda);
         llt_syrk(CblasColMajor, CblasLower, CblasNoTrans, Nright, Nleft,
                  ATL_rnone, Ar, lda, ATL_rone, An, lda);
         ierr = ATL_potrfL(Nright, An, lda);
         /* status from the trailing block is relative to it: add Nleft */
         if (ierr) return(ierr+Nleft);
      }
      else return(ierr);
   }
Esempio n. 16
0
/*
 * Matrix-multiply driver that copies B into workspace one column panel at a
 * time (through the selected B2blk routine) and passes each panel, together
 * with A and C, to mmIJK2.  alpha and beta are read as two-entry arrays
 * ([0] and [1]).
 *
 * Workspace: the routine first tries to allocate room for all N columns of
 * B plus one extra panel of incK elements.  If that request exceeds
 * ATL_MaxMalloc or malloc fails, it retries with progressively smaller
 * column panels.
 *
 * Returns 0 on success, 1 when the full-size workspace is unavailable and
 * both TA and TB are AtlasNoTrans, and -1 when no workspace at all could be
 * allocated.
 */
int Mjoin(PATL,mmIJK)(const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
                      const int M, const int N0, const int K,
                      const SCALAR alpha, const TYPE *A, const int lda,
                      const TYPE *B, const int ldb, const SCALAR beta,
                      TYPE *C, const int ldc)
{
   int N = N0;
   int nMb, nNb, nKb, ib, jb, kb, jb2, h, i, j, k, n, incA, incB, incC;
   const int incK = ATL_MulByNB(K);
   void *vA=NULL;
   TYPE *pA, *pB;
   MAT2BLK A2blk, B2blk;
   MATSCAL gescal;
   NBMM0 NBmm0;

   /* number of full blocks and leftover size along each dimension */
   nMb = ATL_DivByNB(M);
   nNb = ATL_DivByNB(N);
   nKb = ATL_DivByNB(K);
   ib = M - ATL_MulByNB(nMb);
   jb = N - ATL_MulByNB(nNb);
   kb = K - ATL_MulByNB(nKb);

   /*
    * beta with zero second component: pick the kernel specialised for its
    * first component and skip gescal.  Otherwise use gescal_bX together
    * with the beta == 1 kernel.
    */
   if (beta[1] == ATL_rzero)
   {
      gescal = NULL;
      if (*beta == ATL_rone) NBmm0 = Mjoin(PATL,CNBmm_b1);
      else if (*beta == ATL_rzero) NBmm0 = Mjoin(PATL,CNBmm_b0);
      else NBmm0 = Mjoin(PATL,CNBmm_bX);
   }
   else
   {
      gescal = Mjoin(PATL,gescal_bX);
      NBmm0 = Mjoin(PATL,CNBmm_b1);
   }

   /* first choice: workspace for all of B plus one incK panel */
   i = ATL_Cachelen + ATL_MulBySize(N*K + incK);
   if (i <= ATL_MaxMalloc) vA = malloc(i);
   if (!vA)
   {
      if (TA == AtlasNoTrans && TB == AtlasNoTrans) return(1);
      /* n = number of column blocks of B, counting a partial one */
      if (jb) n = nNb + 1;
      else n = nNb;
      /* try splitting those blocks into j groups of k = ceil(n/j) blocks */
      for (j=2; !vA; j++)
      {
         k = n / j;
         if (k < 1) break;
         if (k*j < n) k++;
         h = ATL_Cachelen + ATL_MulBySize((k+1)*incK);
         if (h <= ATL_MaxMalloc) vA = malloc(h);
      }
      if (!vA) return(-1);
      n = ATL_MulByNB(k);   /* columns handled per pass */
      jb2 = 0;              /* passes of full blocks have no partial block */
   }
   else
   {
      /* everything fits: a single pass over all N columns */
      jb2 = jb;
      k = nNb;
      n = N;
   }
   pA = ATL_AlignPtr(vA);
   /* choose the B copy routine from TB and alpha; incB steps B per pass */
   if (TB == AtlasNoTrans)
   {
      incB = ldb*n<<1;
      if (alpha[1] == ATL_rzero)
      {
         if (*alpha == ATL_rone) B2blk = Mjoin(PATL,col2blk2_a1);
         else B2blk = Mjoin(PATL,col2blk2_aXi0);
      }
      else B2blk = Mjoin(PATL,col2blk2_aX);
   }
   else if (TB == AtlasConjTrans)
   {
      incB = n<<1;
      if (alpha[1] == ATL_rzero)
      {
         if (*alpha == ATL_rone) B2blk = Mjoin(PATL,row2blkC2_a1);
         else B2blk = Mjoin(PATL,row2blkC2_aXi0);
      }
      else B2blk = Mjoin(PATL,row2blkC2_aX);
   }
   else
   {
      incB = n<<1;
      if (alpha[1] == ATL_rzero)
      {
         if (*alpha == ATL_rone) B2blk = Mjoin(PATL,row2blkT2_a1);
         else B2blk = Mjoin(PATL,row2blkT2_aXi0);
      }
      else B2blk = Mjoin(PATL,row2blkT2_aX);
   }
   /* choose the A copy routine (always an _a1 variant) from TA */
   if (TA == AtlasNoTrans)
   {
      incA = NB<<1;
      A2blk = Mjoin(PATL,row2blkT_a1);
   }
   else if (TA == AtlasConjTrans)
   {
      incA = ATL_MulByNB(lda)<<1;
      A2blk = Mjoin(PATL,col2blkConj_a1);
   }
   else
   {
      incA = ATL_MulByNB(lda)<<1;
      A2blk = Mjoin(PATL,col2blk_a1);
   }
   incC = ldc*n<<1;
   pB = pA + (incK<<1);   /* B workspace follows the 2*incK entries at pA */

   /* one pass per column panel of B / C */
   do
   {
      if (TB == AtlasNoTrans) B2blk(K, n, B, ldb, pB, alpha);
      else B2blk(n, K, B, ldb, pB, alpha);
      Mjoin(PATL,mmIJK2)(K, nMb, k, nKb, ib, jb2, kb, alpha, A, lda, pA,
                         incA, A2blk, pB, beta, C, ldc, gescal, NBmm0);
      N -= n;
      nNb -= k;
      /* fewer than n columns left: last pass takes them all, incl. jb */
      if (N < n)
      {
         jb2 = jb;
         n = N;
         k = nNb;
      }
      C += incC;
      B += incB;
   }
   while (N);

   free(vA);
   return(0);
}
Esempio n. 17
0
int Mjoin(PATL,mmJIK)(const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
                      const int M0, const int N, const int K,
                      const SCALAR alpha, const TYPE *A, const int lda0,
                      const TYPE *B, const int ldb0, const SCALAR beta,
                      TYPE *C, const int ldc0)
/*
 * Outer three loops for matmul with outer loop over columns of B
 *
 * A is copied into workspace one row panel at a time (through the selected
 * A2blk routine) and each panel is passed, together with B and C, to mmJIK2.
 * alpha and beta are read as two-entry arrays ([0] and [1]).
 *
 * Returns 0 on success, 1 when the full-size workspace is unavailable and
 * neither TA nor TB is AtlasNoTrans, and -1 when no workspace at all could
 * be allocated.
 */
{
    int M = M0;
    int nMb, nNb, nKb, ib, jb, kb, ib2, h, i, j, k, m, n;
    const size_t lda=lda0, ldb=ldb0, ldc=ldc0;
    size_t incA, incB, incC;
    int AlphaIsOne;
    const size_t incK = ATL_MulByNB((size_t)K);
    /* NOTE(review): vC is never assigned in this function, so it stays NULL */
    void *vB=NULL, *vC=NULL;
    TYPE *pA, *pB, *pC;
    const TYPE one[2] = {1.0,0.0}, zero[2] = {0.0,0.0};
    MAT2BLK A2blk, B2blk;
    MATSCAL gescal;
    NBMM0 NBmm0;

    /* number of full blocks and leftover size along each dimension */
    nMb = ATL_DivByNB(M);
    nNb = ATL_DivByNB(N);
    nKb = ATL_DivByNB(K);
    ib = M - ATL_MulByNB(nMb);
    jb = N - ATL_MulByNB(nNb);
    kb = K - ATL_MulByNB(nKb);

    pC = C;
    /*
     * beta with zero second component: pick the kernel specialised for its
     * first component and skip gescal.  Otherwise use gescal_bX together
     * with the beta == 1 kernel.
     */
    if (beta[1] == ATL_rzero)
    {
        gescal = NULL;
        if (*beta == ATL_rone) NBmm0 = Mjoin(PATL,CNBmm_b1);
        else if (*beta == ATL_rzero) NBmm0 = Mjoin(PATL,CNBmm_b0);
        else NBmm0 = Mjoin(PATL,CNBmm_bX);
    }
    else
    {
        NBmm0 = Mjoin(PATL,CNBmm_b1);
        gescal = Mjoin(PATL,gescal_bX);
    }
    /*
     * Special case for when what we are really doing is
     *    C <- beta*C + alpha * A * A'   or   C <- beta*C + alpha * A' * A
     */
    if ( A == B && M == N && TA != TB && (SCALAR_IS_ONE(alpha) || M <= NB)
            && TA != AtlasConjTrans && TB != AtlasConjTrans && lda == ldb )
    {
        AlphaIsOne = SCALAR_IS_ONE(alpha);
        /* one copy of the operand, plus an M x N scratch C when needed */
        i = ATL_MulBySize(M * K);
        /* NOTE(review): pC == C always holds here (pC was just set to C) */
        if (!AlphaIsOne && pC == C && !SCALAR_IS_ZERO(beta))
            i += ATL_MulBySize(M*N);
        if (i <= ATL_MaxMalloc) vB = malloc(i + ATL_Cachelen);
        if (vB)
        {
            pA = ATL_AlignPtr(vB);
            if (TA == AtlasNoTrans)
                Mjoin(PATL,row2blkT2_a1)(M, K, A, lda, pA, alpha);
            else Mjoin(PATL,col2blk2_a1)(K, M, A, lda, pA, alpha);
            /*
             *       Can't write directly to C if alpha is not one
             */
            if (!AlphaIsOne)
            {
                /*
                 * beta == 0: form the product directly in C.  Otherwise form
                 * it in the scratch area behind the copied operand.
                 * NOTE(review): the final else looks unreachable, since pC
                 * still equals C when it is tested.
                 */
                if (SCALAR_IS_ZERO(beta)) h = ldc;
                else if (pC == C)
                {
                    pC = pA + (((size_t)M) * K SHIFT);
                    h = M;
                }
                else h = NB;
                /* unscaled product (alpha = one, beta = zero) into pC */
                Mjoin(PATL,mmJIK2)(K, nMb, nNb, nKb, ib, jb, kb, one, pA, NULL,
                                   ldb, pA, 0, NULL, zero, pC, h,
                                   Mjoin(PATL,gescal_b0), Mjoin(PATL,CNBmm_b0));

                /* then scale the product by alpha */
                if (alpha[1] == ATL_rzero)
                    Mjoin(PATL,gescal_bXi0)(M, N, alpha, pC, h);
                else Mjoin(PATL,gescal_bX)(M, N, alpha, pC, h);

                /* scratch was used: combine it with C using beta */
                if (C != pC)
                {
                    if (beta[1] == ATL_rzero)
                    {
                        if (*beta == ATL_rone)
                            Mjoin(PATL,putblk_b1)(M, N, pC, C, ldc, beta);
                        else if (*beta == ATL_rnone)
                            Mjoin(PATL,putblk_bn1)(M, N, pC, C, ldc, beta);
                        else if (*beta == ATL_rzero)
                            Mjoin(PATL,putblk_b0)(M, N, pC, C, ldc, beta);
                        else Mjoin(PATL,putblk_bXi0)(M, N, pC, C, ldc, beta);
                    }
                    else Mjoin(PATL,putblk_bX)(M, N, pC, C, ldc, beta);
                }
            }
            else Mjoin(PATL,mmJIK2)(K, nMb, nNb, nKb, ib, jb, kb, alpha, pA, NULL,
                                        ldb, pA, 0, NULL, beta, C, ldc, gescal, NBmm0);
            free(vB);
            if (vC) free(vC);
            return(0);
        }
    }
    /* general case, first choice: workspace for all of A plus one panel */
    i = ATL_Cachelen + ATL_MulBySize(M*K + incK);
    if (i <= ATL_MaxMalloc) vB = malloc(i);
    if (!vB)
    {
        if (TA != AtlasNoTrans && TB != AtlasNoTrans) return(1);
        /* n = number of row blocks of A, counting a partial one */
        if (ib) n = nMb + 1;
        else n = nMb;
        /* try splitting those blocks into j groups of k = ceil(n/j) blocks */
        for (j=2; !vB; j++)
        {
            k = n / j;
            if (k < 1) break;
            if (k*j < n) k++;
            h = ATL_Cachelen + ATL_MulBySize((k+1)*incK);
            if (h <= ATL_MaxMalloc) vB = malloc(h);
        }
        if (!vB) return(-1);
        n = k;                  /* blocks handled per pass */
        m = ATL_MulByNB(n);     /* rows handled per pass */
        ib2 = 0;                /* passes of full blocks have no partial block */
    }
    else
    {
        /* everything fits: a single pass over all M rows */
        n = nMb;
        m = M;
        ib2 = ib;
    }
    pB = ATL_AlignPtr(vB);
    /* choose the A copy routine from TA and alpha; incA steps A per pass */
    if (TA == AtlasNoTrans)
    {
        incA = m SHIFT;
        if (alpha[1] == ATL_rzero)
        {
            if (*alpha == ATL_rone) A2blk = Mjoin(PATL,row2blkT2_a1);
            else A2blk = Mjoin(PATL,row2blkT2_aXi0);
        }
        else A2blk = Mjoin(PATL,row2blkT2_aX);
    }
    else if (TA == AtlasConjTrans)
    {
        incA = lda*m SHIFT;
        if (alpha[1] == ATL_rzero)
        {
            if (*alpha == ATL_rone) A2blk = Mjoin(PATL,col2blkConj2_a1);
            else A2blk = Mjoin(PATL,col2blkConj2_aXi0);
        }
        else A2blk = Mjoin(PATL,col2blkConj2_aX);
    }
    else
    {
        incA = lda*m SHIFT;
        if (alpha[1] == ATL_rzero)
        {
            if (*alpha == ATL_rone) A2blk = Mjoin(PATL,col2blk2_a1);
            else A2blk = Mjoin(PATL,col2blk2_aXi0);
        }
        else A2blk = Mjoin(PATL,col2blk2_aX);
    }
    /* choose the B copy routine (always an _a1 variant) from TB */
    if (TB == AtlasNoTrans)
    {
        incB = ATL_MulByNB(ldb) SHIFT;
        B2blk = Mjoin(PATL,col2blk_a1);
    }
    else if (TB == AtlasConjTrans)
    {
        incB = NB2;
        B2blk = Mjoin(PATL,row2blkC_a1);
    }
    else
    {
        incB = NB2;
        B2blk = Mjoin(PATL,row2blkT_a1);
    }
    incC = m SHIFT;

    pA = pB + (incK SHIFT);   /* A workspace follows the B panel at pB */
    /* one pass per row panel of A / C */
    do
    {
        if (TA == AtlasNoTrans) A2blk(m, K, A, lda, pA, alpha);
        else A2blk(K, m, A, lda, pA, alpha);
        Mjoin(PATL,mmJIK2)(K, n, nNb, nKb, ib2, jb, kb, alpha, pA, B, ldb, pB,
                           incB, B2blk, beta, C, ldc, gescal, NBmm0);
        M -= m;
        nMb -= n;
        /* at most m rows left: last pass takes them all, incl. ib */
        if (M <= m)
        {
            ib2 = ib;
            m = M;
            n = nMb;
        }
        C += incC;
        A += incA;
    }
    while (M);
    free(vB);
    if (vC) free(vC);
    return(0);
}
Esempio n. 18
0
int ATL_pmmJIK(const enum PACK_UPLO UA, const enum ATLAS_TRANS TA,
               const enum PACK_UPLO UB, const enum ATLAS_TRANS TB,
               const int M, const int N, const int K, const SCALAR alpha,
               const TYPE *A, const int lda, const TYPE *B, const int ldb,
               const SCALAR beta, const enum PACK_UPLO UC,
               TYPE *C, const int ldc)
/*
 * Special packed matmul, calls dense gemm kernel using at most
 * K*NB + 2*NB*NB space.  $B$ is copied only once, but $A$ is copied
 * ceil(N/NB) times.  However, $A$ should start in-cache for kernel call.
 *
 * alpha is applied while copying B (A is always copied with 1.0) and beta
 * is applied when the finished block is written back by ATL_pputblk.
 * Returns 0 on success and -1 when the workspace cannot be allocated.
 */

{
   const int nKb = ATL_DivByNB(K), kb = K - ATL_MulByNB(nKb);
   const int incK = ATL_MulByNB(K);
   /* leading-dimension increment per storage scheme: +1, -1 or 0 */
   const int ldainc = (UA == AtlasUpper) ? 1 : ((UA == AtlasLower) ? -1 : 0);
   const int ldbinc = (UB == AtlasUpper) ? 1 : ((UB == AtlasLower) ? -1 : 0);
   const int ldcinc = (UC == AtlasUpper) ? 1 : ((UC == AtlasLower) ? -1 : 0);
   int ib, jb, i, j, k;
   void *vC;
   TYPE *pC, *pA, *pB;
   NBMM0 pNBmm, pNBmm0;

   /* workspace layout: C block (NBNB), A block (NBNB), B panel (incK) */
   vC = malloc(ATL_Cachelen + ATL_MulBySize(NBNB+NBNB+incK));
   if (!vC) return(-1);
   pC = ATL_AlignPtr(vC);
   pA = pC + NBNB;
   pB = pA + NBNB;

/*
 * Loop over column panels of $B$
 */
   for (j=0; j < N; j += NB)
   {
      jb = N - j;
      jb = Mmin(jb, NB);
/*
 *    Copy column-panel of B to block-major storage
 */
      if (alpha == 1.0)
      {
         if (TB == AtlasNoTrans)
            ATL_pcol2blk(K, jb, alpha, B+MindexP(UB,0,j,ldb), Mpld(UB,j,ldb),
                         ldbinc, pB);
         else /* TB == AtlasTrans */
            ATL_prow2blkT(jb, K, alpha, B+MindexP(UB,j,0,ldb), ldb, ldbinc, pB);
      }
      else if (TB == AtlasNoTrans)
         ATL_pcol2blk_aX(K, jb, alpha, B+MindexP(UB,0,j,ldb), Mpld(UB,j,ldb),
                         ldbinc, pB);
      else /* TB == AtlasTrans */
         ATL_prow2blkT_aX(jb, K, alpha, B+MindexP(UB,j,0,ldb), ldb, ldbinc, pB);
/*
 *    Loop over row-panels of A
 */
      for (i=0; i < M; i += MB)
      {
         ib = M - i;
         ib = Mmin(ib, MB);
         /* partial block in either dimension: use the general kernel */
         if (jb != NB || ib != MB)
         {
            pNBmm0 = pNBmm = ATL_gNBmm;
            if (ib != NB && jb != NB) Mjoin(PATL,gezero)(MB, NB, pC, MB);
         }
         else
         {
            pNBmm = NBmm;
            pNBmm0 = NBmm_b0;
         }
/*
 *       Handle full blocks of K
 */
         if (nKb)
         {
            /* first K block overwrites pC (beta = ATL_rzero) */
            if (TA == AtlasNoTrans)
               ATL_prow2blkT(ib, NB, 1.0, A+MindexP(UA,i,0,lda),
                             lda, ldainc, pA);
            else
               ATL_pcol2blk(NB, ib, 1.0, A+MindexP(UA,0,i,lda),
                            Mpld(UA,i,lda), ldainc, pA);
            pNBmm0(ib, jb, NB, ATL_rone, pA, NB, pB, NB, ATL_rzero, pC, ib);
            /* later K blocks accumulate into pC (beta = ATL_rone) */
            for (k=1; k != nKb; k++)
            {
               if (TA == AtlasNoTrans)
                  ATL_prow2blkT(ib, NB, 1.0, A+MindexP(UA,i,ATL_MulByNB(k),lda),
                                Mpld(UA,ATL_MulByNB(k),lda), ldainc, pA);
               else
                  ATL_pcol2blk(NB, ib, 1.0, A+MindexP(UA,ATL_MulByNB(k),i,lda),
                               Mpld(UA,i,lda), ldainc, pA);
               pNBmm(ib, jb, NB, ATL_rone, pA, NB, pB+jb*NB*k, NB,
                     ATL_rone, pC, ib);
            }
            /* leftover part of K after the full blocks */
            if (kb)
            {
               if (TA == AtlasNoTrans)
                  ATL_prow2blkT(ib, kb, 1.0,
                                A+MindexP(UA,i,ATL_MulByNB(nKb),lda),
                                Mpld(UA,ATL_MulByNB(nKb),lda), ldainc, pA);
               else
                  ATL_pcol2blk(kb, ib, 1.0,
                               A+MindexP(UA,ATL_MulByNB(nKb),i,lda),
                               Mpld(UA,i,lda), ldainc, pA);
               ATL_gNBmm(ib, jb, kb, ATL_rone, pA, kb, pB+jb*NB*nKb, kb,
                         ATL_rone, pC, ib);
            }
         }
         /* K shorter than one block: a single partial product */
         else if (kb)
         {
            Mjoin(PATL,gezero)(ib, jb, pC, ib);
            if (TA == AtlasNoTrans)
               ATL_prow2blkT(ib, kb, 1.0, A+MindexP(UA,i,0,lda),
                             lda, ldainc, pA);
            else
               ATL_pcol2blk(kb, ib, 1.0, A+MindexP(UA,0,i,lda),
                            Mpld(UA,i,lda), ldainc, pA);
            ATL_gNBmm(ib, jb, kb, ATL_rone, pA, kb, pB, kb, ATL_rzero, pC, ib);
         }
         /* write the finished ib x jb block back into C, applying beta */
         ATL_pputblk(ib, jb, pC, C+MindexP(UC,i,j,ldc), Mpld(UC,j,ldc),
                     ldcinc, beta);
      }
   }
   free(vC);
   return(0);
}
Esempio n. 19
0
int ATL_pmmJIKF(const enum PACK_UPLO UA, const enum ATLAS_TRANS TA,
                const enum PACK_UPLO UB, const enum ATLAS_TRANS TB,
                const int M, const int N, const int K, const SCALAR alpha,
                const TYPE *A, const int lda, const TYPE *B, const int ldb,
                const SCALAR beta, const enum PACK_UPLO UC,
                TYPE *C, const int ldc)
/*
 * Special packed matmul, calls dense gemm kernel using at most
 * M*K + K*NB + NB*NB space.  If this exceeds ATL_pkMaxMalloc or fails,
 * operates using at most 2*K*NB + NB*NB.  If this fails, returns non-zero.
 * If full space is malloced, both matrices are copied exactly once.  If
 * the smaller space is used, $A$ will be copied ceil(N/NB) times.
 *
 * alpha is applied while copying panels of B (A is always copied with a
 * scale of one); beta is handed to ATL_pputblk when each block is written
 * back to C.
 *
 * RETURNS: 0 on success, -1 if neither workspace could be allocated.
 *
 * NOTE: workspace sizes are computed in size_t.  Computing them in int
 *       (as M*K*sizeof) can overflow for large operands, letting a wrapped
 *       value slip past the ATL_pkMaxMalloc check and under-allocate.
 */
{
   const int nKb = ATL_DivByNB(K), kb = K - ATL_MulByNB(nKb);
   const size_t incK = ATL_MulByNB((size_t)K); /* elts in one NB-wide K-panel */
   const int ldainc = (UA == AtlasUpper) ? 1 : ((UA == AtlasLower) ? -1 : 0);
   const int ldbinc = (UB == AtlasUpper) ? 1 : ((UB == AtlasLower) ? -1 : 0);
   const int ldcinc = (UC == AtlasUpper) ? 1 : ((UC == AtlasLower) ? -1 : 0);
   int ib, jb, i, j, k;
   size_t sz;
   void *vC=NULL;
   TYPE *pC, *pA, *pB, *pA0;
   NBMM0 pNBmm, pNBmm0;
   /* Only tested against NULL below: non-NULL means A is recopied per panel */
   void (*A2blk)(const int M, const int N, const TYPE alpha, const TYPE *A,
                 int lda, const int ldainc, TYPE *V);

/*
 * First try the large workspace: C block + one B panel + all of A
 */
   sz = ATL_Cachelen + ATL_MulBySize(NBNB + incK + (size_t)M*K);
   if (sz <= (size_t)(ATL_pkMaxMalloc)) vC = malloc(sz);
   if (!vC)
   {
/*
 *    Fall back to: C block + one B panel + one A panel
 */
      sz = ATL_Cachelen + ATL_MulBySize(NBNB + incK + incK);
      vC = malloc(sz);
      if (TA == AtlasNoTrans) A2blk = ATL_prow2blkT;
      else A2blk = ATL_pcol2blk;
   }
   else A2blk = NULL;

   if (!vC) return(-1);
   pC = ATL_AlignPtr(vC);
   pB = pC + NBNB;
   pA = pB + incK;

/*
 * If we've got the space, copy all of A up front
 */
   if (!A2blk)
   {
      if (TA == AtlasNoTrans)
         ATL_prow2blkTF(M, K, ATL_rone, A, lda, ldainc, pA);
      else ATL_pcol2blkF(K, M, ATL_rone, A, lda, ldainc, pA);
/*
 *    Back up one panel: the row-panel loop advances pA before each use
 */
      pA -= incK;
   }
   pA0 = pA;
/*
 * Loop over column panels of $B$
 */
   for (j=0; j < N; j += NB)
   {
      jb = N - j;
      jb = Mmin(jb, NB);
/*
 *    Copy column-panel of B to block-major storage
 */
      if (alpha == 1.0)
      {
         if (TB == AtlasNoTrans)
            ATL_pcol2blk(K, jb, alpha, B+MindexP(UB,0,j,ldb), Mpld(UB,j,ldb),
                         ldbinc, pB);
         else /* TB == AtlasTrans */
            ATL_prow2blkT(jb, K, alpha, B+MindexP(UB,j,0,ldb), ldb, ldbinc, pB);
      }
      else if (TB == AtlasNoTrans)
         ATL_pcol2blk_aX(K, jb, alpha, B+MindexP(UB,0,j,ldb), Mpld(UB,j,ldb),
                         ldbinc, pB);
      else /* TB == AtlasTrans */
         ATL_prow2blkT_aX(jb, K, alpha, B+MindexP(UB,j,0,ldb), ldb, ldbinc, pB);
/*
 *    Loop over row-panels of A
 */
      for (i=0; i < M; i += MB)
      {
         ib = M - i;
         ib = Mmin(ib, MB);
         if (A2blk)
         {
/*
 *          Small workspace: recopy this row-panel of A into the single buffer
 */
            if (TA == AtlasNoTrans)
               ATL_prow2blkT(ib, K, ATL_rone, A+MindexP(UA,i,0,lda), lda,
                             ldainc, pA);
            else /* TA == AtlasTrans */
               ATL_pcol2blk(K, ib, ATL_rone, A+MindexP(UA,0,i,lda),
                            Mpld(UA,i,lda), ldainc, pA);
         }
         else pA += incK;  /* A precopied: step to the next stored panel */
/*
 *       Partial blocks use the general kernel; full blocks the NB kernels
 */
         if (jb != NB || ib != MB)
         {
            pNBmm0 = pNBmm = ATL_gNBmm;
            if (ib != NB && jb != NB) Mjoin(PATL,gezero)(MB, NB, pC, MB);
         }
         else
         {
            pNBmm = NBmm;
            pNBmm0 = NBmm_b0;
         }
/*
 *       Handle full blocks of K
 */
         if (nKb)
         {
            pNBmm0(ib, jb, NB, ATL_rone, pA, NB, pB, NB, ATL_rzero, pC, ib);
            for (k=1; k != nKb; k++)
            {
               pNBmm(ib, jb, NB, ATL_rone, pA+ib*NB*k, NB, pB+jb*NB*k, NB,
                     ATL_rone, pC, ib);
            }
/*
 *          Remainder of K, accumulated onto the full-block result
 */
            if (kb)
               ATL_gNBmm(ib, jb, kb, ATL_rone, pA+ib*NB*nKb, kb,
                         pB+jb*NB*nKb, kb, ATL_rone, pC, ib);
         }
         else if (kb)
         {
/*
 *          K < NB: only a partial K block exists
 */
            Mjoin(PATL,gezero)(ib, jb, pC, ib);
            ATL_gNBmm(ib, jb, kb, ATL_rone, pA, kb, pB, kb, ATL_rzero, pC, ib);
         }
         ATL_pputblk(ib, jb, pC, C+MindexP(UC,i,j,ldc), Mpld(UC,j,ldc),
                     ldcinc, beta);
      }
      pA = pA0;  /* rewind A workspace for the next column panel of B */
   }
   free(vC);
   return(0);
}
Esempio n. 20
0
/*
 * IJK-ordered matrix multiply driver: copies B (all at once if workspace
 * allows, otherwise in column chunks) into block storage and hands each
 * chunk to mmIJK2 together with a one-panel workspace for A.
 *
 * RETURNS:  0 on success,
 *          -1 if a required workspace could not be allocated,
 *           1 if the full workspace was unavailable and neither A nor B is
 *             transposed (presumably a hint for the caller to use another
 *             algorithm -- confirm against the caller).
 *
 * NOTE: allocation sizes are computed in size_t.  Storing them in int, or
 *       forming N*K in int, can overflow/truncate for large operands and
 *       let a wrapped size pass the ATL_MaxMalloc check.
 */
int Mjoin(PATL,mmIJK)(const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
                      const int M, const int N0, const int K,
                      const SCALAR alpha, const TYPE *A, const int lda0,
                      const TYPE *B, const int ldb0, const SCALAR beta,
                      TYPE *C, const int ldc0)
{
    size_t incA, incB, incC;
    size_t sz;                      /* workspace size in bytes */
    const size_t lda=lda0, ldb=ldb0, ldc=ldc0;
    const size_t incK = ATL_MulByNB((size_t)K);
    int N = N0;
    int nMb, nNb, nKb, ib, jb, kb, jb2, j, k, n;
    void *vA=NULL, *vC=NULL;
    TYPE *pA, *pB, *pC;
    MAT2BLK A2blk, B2blk;
    PUTBLK putblk;
    NBMM0 NBmm0;

    /* Full-block counts and remainders along each dimension */
    nMb = ATL_DivByNB(M);
    nNb = ATL_DivByNB(N);
    nKb = ATL_DivByNB(K);
    ib = M - ATL_MulByNB(nMb);
    jb = N - ATL_MulByNB(nNb);
    kb = K - ATL_MulByNB(nKb);

    /*
     * If K sufficiently large, write to temporary C as safety measure;  otherwise
     * write directly to C
     */
    if (nKb < 12)
    {
        putblk = NULL;
        pC = C;
        if ( SCALAR_IS_ONE(beta) ) NBmm0 = NBmm_b1;
        else if ( SCALAR_IS_ZERO(beta) ) NBmm0 = NBmm_b0;
        else NBmm0 = NBmm_bX;
    }
    else
    {
        NBmm0 = NBmm_b0;
        vC = malloc(ATL_Cachelen + ATL_MulBySize(NBNB));
        if (!vC) return(-1);
        pC = ATL_AlignPtr(vC);
        if ( SCALAR_IS_ONE(beta) ) putblk = Mjoin(PATL,putblk_b1);
        else if ( SCALAR_IS_ZERO(beta) ) putblk = Mjoin(PATL,putblk_b0);
        else if ( SCALAR_IS_NONE(beta) ) putblk = Mjoin(PATL,putblk_bn1);
        else putblk = Mjoin(PATL,putblk_bX);
    }
    /*
     * Special case if we don't need to copy one or more input matrix
     */
    if (K == NB && TB == AtlasNoTrans && ldb == NB && ATL_DataIsMinAligned(B))
    {
        if (lda == NB && TA == AtlasTrans && SCALAR_IS_ONE(alpha) &&
                ATL_DataIsMinAligned(A))
        {
            /* A is used in place: no workspace and no copy routine */
            pA = (TYPE *) A;
            A = NULL;
            A2blk = NULL;
            incA = 0;
        }
        else
        {
            vA = malloc(ATL_Cachelen + ATL_MulBySize(incK));
            if (!vA)
            {
                free(vC);
                return(-1);
            }
            pA = ATL_AlignPtr(vA);
            /* B is not copied on this path, so alpha is applied to A */
            if (TA == AtlasNoTrans)
            {
                incA = NB;
                if ( SCALAR_IS_ONE(alpha) ) A2blk = Mjoin(PATL,row2blkT_a1);
                else A2blk = Mjoin(PATL,row2blkT_aX);
            }
            else
            {
                incA = ATL_MulByNB(lda);
                if ( SCALAR_IS_ONE(alpha) ) A2blk = Mjoin(PATL,col2blk_a1);
                else A2blk = Mjoin(PATL,col2blk_aX);
            }
        }
        Mjoin(PATL,mmIJK2)(K, nMb, nNb, nKb, ib, jb, kb, alpha, A, lda, pA,
                           incA, A2blk, B, beta, C, ldc, pC, putblk, NBmm0);
        free(vA);
        free(vC);
        return(0);
    }
    /*
     * General case: try for workspace holding all of B plus one panel of A
     */
    sz = ATL_Cachelen + ATL_MulBySize((size_t)N*K + incK);
    if (sz <= (size_t)(ATL_MaxMalloc)) vA = malloc(sz);
    if (!vA)
    {
        if (TA == AtlasNoTrans && TB == AtlasNoTrans)
        {
            free(vC);
            return(1);
        }
        /* n = number of column blocks of B, counting a partial one */
        if (jb) n = nNb + 1;
        else n = nNb;
        /*
         * Split B into j chunks of k column blocks each, increasing j until
         * a workspace of k B-panels plus one A-panel can be allocated
         */
        for (j=2; !vA; j++)
        {
            k = n / j;
            if (k < 1) break;
            if (k*j < n) k++;
            sz = ATL_Cachelen + ATL_MulBySize((size_t)(k+1)*incK);
            if (sz <= (size_t)(ATL_MaxMalloc)) vA = malloc(sz);
        }
        if (!vA)
        {
            free(vC);
            return(-1);
        }
        n = ATL_MulByNB(k);
        jb2 = 0;   /* chunks are whole blocks until the final one */
    }
    else
    {
        jb2 = jb;
        k = nNb;
        n = N;
    }
    pA = ATL_AlignPtr(vA);
    /* On this path alpha is applied while copying B ... */
    if (TB == AtlasNoTrans)
    {
        incB = ldb*n;
        if ( SCALAR_IS_ONE(alpha) ) B2blk = Mjoin(PATL,col2blk2_a1);
        else B2blk = Mjoin(PATL,col2blk2_aX);
    }
    else
    {
        incB = n;
        if ( SCALAR_IS_ONE(alpha) ) B2blk = Mjoin(PATL,row2blkT2_a1);
        else B2blk = Mjoin(PATL,row2blkT2_aX);
    }
    /* ... so A always uses the unscaled copy routines */
    if (TA == AtlasNoTrans)
    {
        incA = NB;
        A2blk = Mjoin(PATL,row2blkT_a1);
    }
    else
    {
        incA = ATL_MulByNB(lda);
        A2blk = Mjoin(PATL,col2blk_a1);
    }
    incC = ldc*n;
    pB = pA + incK;

    /*
     * Process B (and the matching columns of C) one chunk of n columns at a
     * time; the last chunk picks up whatever remains, including the partial
     * block of width jb
     */
    do
    {
        if (TB == AtlasNoTrans) B2blk(K, n, B, ldb, pB, alpha);
        else B2blk(n, K, B, ldb, pB, alpha);
        Mjoin(PATL,mmIJK2)(K, nMb, k, nKb, ib, jb2, kb, alpha, A, lda, pA,
                           incA, A2blk, pB, beta, C, ldc, pC, putblk, NBmm0);
        N -= n;
        nNb -= k;
        if (N < n)
        {
            jb2 = jb;
            n = N;
            k = nNb;
        }
        C += incC;
        B += incB;
        if (!putblk) pC = C;  /* writing directly to C: follow the chunk */
    }
    while (N);

    free(vC);
    free(vA);
    return(0);
}