Example #1
0
int Mjoin(PATL,mmIJK)(const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
                      const int M, const int N0, const int K,
                      const SCALAR alpha, const TYPE *A, const int lda0,
                      const TYPE *B, const int ldb0, const SCALAR beta,
                      TYPE *C, const int ldc0)
{
    size_t incA, incB, incC;
    const size_t lda=lda0, ldb=ldb0, ldc=ldc0;
    const size_t incK = ATL_MulByNB((size_t)K);
    int N = N0;
    int nMb, nNb, nKb, ib, jb, kb, jb2, h, i, j, k, n;
    void *vA=NULL, *vC=NULL;
    TYPE *pA, *pB, *pC;
    MAT2BLK A2blk, B2blk;
    PUTBLK putblk;
    NBMM0 NBmm0;

    nMb = ATL_DivByNB(M);
    nNb = ATL_DivByNB(N);
    nKb = ATL_DivByNB(K);
    ib = M - ATL_MulByNB(nMb);
    jb = N - ATL_MulByNB(nNb);
    kb = K - ATL_MulByNB(nKb);

    /*
     * If K sufficiently large, write to temporary C as safety measure;  otherwise
     * write directly to C
     */
    if (nKb < 12)
    {
        putblk = NULL;
        pC = C;
        if ( SCALAR_IS_ONE(beta) ) NBmm0 = NBmm_b1;
        else if ( SCALAR_IS_ZERO(beta) ) NBmm0 = NBmm_b0;
        else NBmm0 = NBmm_bX;
    }
    else
    {
        NBmm0 = NBmm_b0;
        vC = malloc(ATL_Cachelen + ATL_MulBySize(NBNB));
        if (!vC) return(-1);
        pC = ATL_AlignPtr(vC);
        if ( SCALAR_IS_ONE(beta) ) putblk = Mjoin(PATL,putblk_b1);
        else if ( SCALAR_IS_ZERO(beta) ) putblk = Mjoin(PATL,putblk_b0);
        else if ( SCALAR_IS_NONE(beta) ) putblk = Mjoin(PATL,putblk_bn1);
        else putblk = Mjoin(PATL,putblk_bX);
    }
    /*
     * Special case if we don't need to copy one or more input matrix
     */
    if (K == NB && TB == AtlasNoTrans && ldb == NB && ATL_DataIsMinAligned(B))
    {
        if (lda == NB && TA == AtlasTrans && SCALAR_IS_ONE(alpha) &&
                ATL_DataIsMinAligned(A))
        {
            i = NBNB;
            pA = (TYPE *) A;
            A = NULL;
            A2blk = NULL;
            incA = 0;
        }
        else
        {
            vA = malloc(ATL_Cachelen + ATL_MulBySize(incK));
            if (!vA)
            {
                free(vC);
                return(-1);
            }
            pA = ATL_AlignPtr(vA);
            if (TA == AtlasNoTrans)
            {
                incA = NB;
                if ( SCALAR_IS_ONE(alpha) ) A2blk = Mjoin(PATL,row2blkT_a1);
                else A2blk = Mjoin(PATL,row2blkT_aX);
            }
            else
            {
                incA = ATL_MulByNB(lda);
                if ( SCALAR_IS_ONE(alpha) ) A2blk = Mjoin(PATL,col2blk_a1);
                else A2blk = Mjoin(PATL,col2blk_aX);
            }
        }
        Mjoin(PATL,mmIJK2)(K, nMb, nNb, nKb, ib, jb, kb, alpha, A, lda, pA,
                           incA, A2blk, B, beta, C, ldc, pC, putblk, NBmm0);
        if (vA) free(vA);
        if (vC) free(vC);
        return(0);
    }
    i = ATL_Cachelen + ATL_MulBySize(N*K + incK);
    if (i <= ATL_MaxMalloc) vA = malloc(i);
    if (!vA)
    {
        if (TA == AtlasNoTrans && TB == AtlasNoTrans)
        {
            if (vC) free(vC);
            return(1);
        }
        if (jb) n = nNb + 1;
        else n = nNb;
        for (j=2; !vA; j++)
        {
            k = n / j;
            if (k < 1) break;
            if (k*j < n) k++;
            h = ATL_Cachelen + ATL_MulBySize((k+1)*incK);
            if (h <= ATL_MaxMalloc) vA = malloc(h);
        }
        if (!vA)
        {
            if (vC) free(vC);
            return(-1);
        }
        n = ATL_MulByNB(k);
        jb2 = 0;
    }
    else
    {
        jb2 = jb;
        k = nNb;
        n = N;
    }
    pA = ATL_AlignPtr(vA);
    if (TB == AtlasNoTrans)
    {
        incB = ldb*n;
        if ( SCALAR_IS_ONE(alpha) ) B2blk = Mjoin(PATL,col2blk2_a1);
        else B2blk = Mjoin(PATL,col2blk2_aX);
    }
    else
    {
        incB = n;
        if ( SCALAR_IS_ONE(alpha) ) B2blk = Mjoin(PATL,row2blkT2_a1);
        else B2blk = Mjoin(PATL,row2blkT2_aX);
    }
    if (TA == AtlasNoTrans)
    {
        incA = NB;
        A2blk = Mjoin(PATL,row2blkT_a1);
    }
    else
    {
        incA = ATL_MulByNB(lda);
        A2blk = Mjoin(PATL,col2blk_a1);
    }
    incC = ldc*n;
    pB = pA + incK;

    do
    {
        if (TB == AtlasNoTrans) B2blk(K, n, B, ldb, pB, alpha);
        else B2blk(n, K, B, ldb, pB, alpha);
        Mjoin(PATL,mmIJK2)(K, nMb, k, nKb, ib, jb2, kb, alpha, A, lda, pA,
                           incA, A2blk, pB, beta, C, ldc, pC, putblk, NBmm0);
        N -= n;
        nNb -= k;
        if (N < n)
        {
            jb2 = jb;
            n = N;
            k = nNb;
        }
        C += incC;
        B += incB;
        if (!putblk) pC = C;
    }
    while (N);

    if (vC) free(vC);
    free(vA);
    return(0);
}
Example #2
0
int Mjoin(PATL,mmIJK)(const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
                      const int M, const int N0, const int K,
                      const SCALAR alpha, const TYPE *A, const int lda,
                      const TYPE *B, const int ldb, const SCALAR beta,
                      TYPE *C, const int ldc)
{
   int N = N0;
   int nMb, nNb, nKb, ib, jb, kb, jb2, h, i, j, k, n, incA, incB, incC;
   const int incK = ATL_MulByNB(K);
   void *vA=NULL;
   TYPE *pA, *pB;
   MAT2BLK A2blk, B2blk;
   MATSCAL gescal;
   NBMM0 NBmm0;

   nMb = ATL_DivByNB(M);
   nNb = ATL_DivByNB(N);
   nKb = ATL_DivByNB(K);
   ib = M - ATL_MulByNB(nMb);
   jb = N - ATL_MulByNB(nNb);
   kb = K - ATL_MulByNB(nKb);

   if (beta[1] == ATL_rzero)
   {
      gescal = NULL;
      if (*beta == ATL_rone) NBmm0 = Mjoin(PATL,CNBmm_b1);
      else if (*beta == ATL_rzero) NBmm0 = Mjoin(PATL,CNBmm_b0);
      else NBmm0 = Mjoin(PATL,CNBmm_bX);
   }
   else
   {
      gescal = Mjoin(PATL,gescal_bX);
      NBmm0 = Mjoin(PATL,CNBmm_b1);
   }

   i = ATL_Cachelen + ATL_MulBySize(N*K + incK);
   if (i <= ATL_MaxMalloc) vA = malloc(i);
   if (!vA)
   {
      if (TA == AtlasNoTrans && TB == AtlasNoTrans) return(1);
      if (jb) n = nNb + 1;
      else n = nNb;
      for (j=2; !vA; j++)
      {
         k = n / j;
         if (k < 1) break;
         if (k*j < n) k++;
         h = ATL_Cachelen + ATL_MulBySize((k+1)*incK);
         if (h <= ATL_MaxMalloc) vA = malloc(h);
      }
      if (!vA) return(-1);
      n = ATL_MulByNB(k);
      jb2 = 0;
   }
   else
   {
      jb2 = jb;
      k = nNb;
      n = N;
   }
   pA = ATL_AlignPtr(vA);
   if (TB == AtlasNoTrans)
   {
      incB = ldb*n<<1;
      if (alpha[1] == ATL_rzero)
      {
         if (*alpha == ATL_rone) B2blk = Mjoin(PATL,col2blk2_a1);
         else B2blk = Mjoin(PATL,col2blk2_aXi0);
      }
      else B2blk = Mjoin(PATL,col2blk2_aX);
   }
   else if (TB == AtlasConjTrans)
   {
      incB = n<<1;
      if (alpha[1] == ATL_rzero)
      {
         if (*alpha == ATL_rone) B2blk = Mjoin(PATL,row2blkC2_a1);
         else B2blk = Mjoin(PATL,row2blkC2_aXi0);
      }
      else B2blk = Mjoin(PATL,row2blkC2_aX);
   }
   else
   {
      incB = n<<1;
      if (alpha[1] == ATL_rzero)
      {
         if (*alpha == ATL_rone) B2blk = Mjoin(PATL,row2blkT2_a1);
         else B2blk = Mjoin(PATL,row2blkT2_aXi0);
      }
      else B2blk = Mjoin(PATL,row2blkT2_aX);
   }
   if (TA == AtlasNoTrans)
   {
      incA = NB<<1;
      A2blk = Mjoin(PATL,row2blkT_a1);
   }
   else if (TA == AtlasConjTrans)
   {
      incA = ATL_MulByNB(lda)<<1;
      A2blk = Mjoin(PATL,col2blkConj_a1);
   }
   else
   {
      incA = ATL_MulByNB(lda)<<1;
      A2blk = Mjoin(PATL,col2blk_a1);
   }
   incC = ldc*n<<1;
   pB = pA + (incK<<1);

   do
   {
      if (TB == AtlasNoTrans) B2blk(K, n, B, ldb, pB, alpha);
      else B2blk(n, K, B, ldb, pB, alpha);
      Mjoin(PATL,mmIJK2)(K, nMb, k, nKb, ib, jb2, kb, alpha, A, lda, pA,
                         incA, A2blk, pB, beta, C, ldc, gescal, NBmm0);
      N -= n;
      nNb -= k;
      if (N < n)
      {
         jb2 = jb;
         n = N;
         k = nNb;
      }
      C += incC;
      B += incB;
   }
   while (N);

   free(vA);
   return(0);
}