Esempio n. 1
0
int cblas_errprn(int ierr, int info, char *form, ...)
{
   va_list argptr;

   va_start(argptr, form);
#ifdef GCCWIN
   vprintf(form, argptr);
#else
   vfprintf(stderr, form, argptr);
#endif
   va_end(argptr);
   return(Mmin(ierr,info));
}
Esempio n. 2
0
static TYPE *ATL_LmulUC(const int M, const int N, const TYPE *LU, const int ldl)
{
   const int lda = ldl SHIFT, MN = Mmin(M,N);
   int i, j, m;
   TYPE *C, *c;
   #ifdef TREAL
      const TYPE ONE=ATL_rone;
   #else
      const TYPE ONE[2] = {ATL_rone, ATL_rzero};
   #endif

   C = c = malloc(M*ATL_MulBySize(N));
   ATL_assert(c);
   if (M >= N)
   {
      for (j=0; j < MN; j++)
      {
         m = j SHIFT;
         for (i=0; i < m; i++) c[i] = ATL_rzero;
         #ifdef TCPLX
            c[i++] = ATL_rone;
            c[i++] = ATL_rzero;
         #else
            c[i++] = ATL_rone;
         #endif
         for (m=M SHIFT; i < m; i++) c[i] = LU[i];
         c += m;
         LU += lda;
      }
      LU -= MN * lda;
      for (m=M SHIFT; j < N; j++, c += m) Mjoin(PATL,zero)(M, c, 1);
      cblas_trmm(CblasColMajor, CblasRight, CblasUpper, CblasNoTrans,
                 CblasNonUnit, M, N, ONE, LU, ldl, C, M);
   }
   else /* M < N */
   {
      for (j=0; j < M; j++)
      {
         m = (j+1) SHIFT;
         for (i=0; i < m; i++) c[i] = LU[i];
         for (m=M SHIFT; i < m; i++) c[i] = ATL_rzero;
         c += m;
         LU += lda;
      }
      Mjoin(PATL,gecopy)(M, N-M, LU, ldl, c, M);
      LU -= M * lda;
      cblas_trmm(CblasColMajor, CblasLeft, CblasLower, CblasNoTrans,
                 CblasUnit, M, N, ONE, LU, ldl, C, M);
   }
   return(C);
}
Esempio n. 3
0
void Mjoin(prow2blkT,_blk)(const int blk, const int M, const int N,
                           const SCALAR alpha, const TYPE *A, int lda,
                           const int ldainc, TYPE *V)
/*
 * Given a packed Upper matrix A, copies & transposes M rows starting at A into
 * block-major row panel
 *    ldainc =  0 : General rectangular
 *    ldainc =  1 : Upper
 *    ldainc = -1 : Lower
 */
{
   const int kb = Mmin(blk,N);
   const int ncb = N / kb, nr = N - ncb*kb;
   const int incV = kb*M - kb;
   const int VN = kb*M, vn = nr*M;
   int jb, i, j;
   TYPE *v;
   #ifdef ALPHAXI0
      #ifdef Conj_
         const register TYPE ralpha = *alpha, calpha = -ralpha;
      #else
         const register TYPE ralpha = *alpha;
      #endif
   #elif defined(ALPHAX)
      register const TYPE ralpha=(*alpha), ialpha = alpha[1];
      register TYPE ra, ia;
   #endif

   if (ldainc == -1) lda--;
   lda -= M;
   lda += lda;
   for (jb=ncb; jb; jb--)
   {
      for (j=kb; j; j--)
      {
         v = V++;
         for (i=0; i != M; i++, v += kb, A += 2) scalcp(A, v+VN, v);
         A += lda;
         lda += ldainc;
      }
      V += incV;
   }
   for (j=nr; j; j--)
   {
      v = V++;
      for (i=0; i != M; i++, v += nr, A += 2) scalcp(A, v+vn, v);
      A += lda;
      lda += ldainc;
   }
}
Esempio n. 4
0
void Mjoin(pcol2blk,_blk)(const int blk, const int M, const int N,
                          const SCALAR alpha, const TYPE *A, int lda,
                          const int ldainc, TYPE *V)
/*
 * Given a packed matrix A, copies N columns starting at A into
 * block-major column panel
 *    ldainc =  0 : General
 *    ldainc =  1 : Upper
 *    ldainc = -1 : Lower
 * NOTE: specialize to alpha cases after it works!
 */
{
   const int kb = Mmin(M,blk);
   const int nrb = M / kb, mr = M - nrb*kb;
   const int nv = kb*N, nvv = mr*N;
   const int NN = nv+nv - kb;
   const int ldainc2 = ldainc+ldainc, M2 = M+M;
   int i, ib, j, J;
   TYPE *v = V + nrb*(NN+kb);
   #ifdef ALPHAXI0
      #ifdef Conj_
         const register TYPE ralpha = *alpha, calpha = -ralpha;
      #else
         const register TYPE ralpha = *alpha;
      #endif
   #elif defined(ALPHAX)
      const register TYPE ralpha=(*alpha), ialpha = alpha[1];
      register TYPE ra, ia;
   #endif

   if (ldainc == -1) lda--;
   lda += lda;
   ATL_assert(N <= blk);
   for (j=0; j != N; j++)
   {
      for (ib=nrb; ib; ib--)
      {
         for (i=0; i < kb; i++, A += 2, V++) scalcp(A, V+nv, V);
         V += NN;
      }
      if (mr)
      {
         for (i=0; i < mr; i++, A += 2, v++) scalcp(A, v+nvv, v);
      }
      V += kb - nrb*(NN+kb);
      A += lda - M2;
      lda += ldainc2;
   }
}
int solve(int S[500][500], int R, int C){
	int sol[R][C];
	sol[R-1][C-1] = 0;
	for(int i=R-2; i>=0; i--)
		sol[i][C-1] = sol[i+1][C-1]-S[i][C-1];
	for(int j=C-2; j>=0; j--)
		sol[R-1][j] = sol[R-1][j+1]-S[R-1][j];
	for(int i=R-2; i>=0; i--){
		for(int j=C-2; j>=0; j--){
			sol[i][j] = Mmin(sol[i+1][j],sol[i][j+1]) - S[i][j];
			if(sol[i][j]<1)		sol[i][j] = 0;
		}
	}
	return sol[0][0]+1;
}
Esempio n. 6
0
void Mjoin(PATL,prow2blkTF)(const int M, const int N, const SCALAR alpha,
                            const TYPE *A, int lda, const int ldainc, TYPE *V)
{
   const int mb = Mmin(NB,M), nMb = ATL_DivByNB(M);
   const int m = ATL_MulByNB(nMb), n = ATL_MulByNB(ATL_DivByNB(N));
   const int nr = N - n, mr = M - m;
   const int incVm = ATL_MulByNB(N), incVV = ATL_MulByNB(mr);
   int i, j, ib, jb;
   const enum PACK_UPLO UA = (ldainc == 1) ? PackUpper :
      ( (ldainc == -1) ? PackLower : PackGen );
   TYPE *v, *vv = V+nMb*incVm;
   void (*row2blk)(const int M, const int N, const TYPE alpha, const TYPE *A,
                   int lda, const int ldainc, TYPE *V);

   if (ldainc)
   {
      if (alpha == ATL_rone) row2blk = ATL_prow2blk_KB_a1;
      else row2blk = ATL_prow2blk_KB_aX;

      for (j=0; j < n; j += NB)
      {
         for (v=V, i=0; i < m; i += NB, v += incVm)
            row2blk(NB, NB, alpha, A+MindexP(UA,i,j,lda), Mpld(UA,j,lda),
                    ldainc, v);
         if (mr)
         {
            row2blk(mr, NB, alpha, A+MindexP(UA,m,j,lda), Mpld(UA,j,lda),
                    ldainc, vv);
            vv += incVV;
         }
         V += NBNB;
      }
      if (nr)
      {
         for (v=V, i=0; i < m; i += NB, v += incVm)
            row2blk(NB, nr, alpha, A+MindexP(UA,i,n,lda), Mpld(UA,n,lda),
                    ldainc, v);
         if (mr)
            row2blk(mr, nr, alpha, A+MindexP(UA,m,n,lda), Mpld(UA,n,lda),
                    ldainc, vv);
      }
   }
   else if (SCALAR_IS_ONE(alpha))
      Mjoin(PATL,row2blkT2_a1)(M, N, A, lda, V, alpha);
   else
      Mjoin(PATL,row2blkT2_aX)(M, N, A, lda, V, alpha);
}
Esempio n. 7
0
int ATL_getrfR(const int M, const int N, TYPE *A, const int lda, int *ipiv)
/*
 * Row-major factorization of form
 *   A = L * U * P
 * where P is a column-permutation matrix, L is lower triangular (lower
 * trapazoidal if M > N), and U is upper triangular with unit diagonals (upper
 * trapazoidal if M < N).  This is the recursive Level 3 BLAS version.
 */
{
   const int MN = Mmin(M, N);
   int Nup, Ndown, i, ierr=0;
   #ifdef TCPLX
      const TYPE one[2] = {ATL_rone, ATL_rzero};
      const TYPE none[2] = {ATL_rnone, ATL_rzero};
      TYPE inv[2], tmp[2];
   #else
      #define one ATL_rone
      #define none ATL_rnone
      TYPE tmp;
   #endif
   TYPE *Ar, *Ac, *An;

   if (MN > 1)
   {
      Nup = MN >> 1;
      #ifdef NB
         if (Nup > NB) Nup = ATL_MulByNB(ATL_DivByNB(Nup));
      #endif
      Ndown = M - Nup;
      i = ATL_getrfR(Nup, N, A, lda, ipiv);
      if (i) if (!ierr) ierr = i;
      Ar = A + (Nup * lda SHIFT);
      Ac = A + (Nup SHIFT);
      An = Ar + (Nup SHIFT);

      ATL_laswp(Ndown, Ar, lda, 0, Nup, ipiv, 1);  /* apply pivots */
      cblas_trsm(CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans,
                 CblasUnit, Ndown, Nup, one, A, lda, Ar, lda);
      cblas_gemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, Ndown, N-Nup, Nup,
                 none, Ar, lda, Ac, lda, one, An, lda);

      i = ATL_getrfR(Ndown, N-Nup, An, lda, ipiv+Nup);
      if (i) if (!ierr) ierr = Nup + i;
      for (i=Nup; i != MN; i++) ipiv[i] += Nup;
      ATL_laswp(Nup, A, lda, Nup, MN, ipiv, 1);  /* apply pivots */
   }
Esempio n. 8
0
void Mjoin(Mjoin(PATL,t),MY_GER)
(ATL_CINT M, ATL_CINT N, const SCALAR alpha, const TYPE *X,
 ATL_CINT incX, const TYPE *Y, ATL_CINT incY, TYPE *A, ATL_CINT lda)
{
    ATL_INT mb, nb, mu, nu, nblks, nrblks, ncblks, ldaP;
    ATL_TGER_t pd;
    int P;
    static TYPE *A0=NULL, *A0e=NULL;

    if (M < 1 || N < 1 || SCALAR_IS_ZERO(alpha))  /* quick return if no-op */
        return;

    pd.M = M;
    pd.N = N;
    pd.incX = incX;
    pd.incY = incY;
    pd.lda = lda;
    pd.alpha = alpha;
    pd.X = X;
    pd.Y = Y;
    pd.A = A;
    pd.flg = (A0 == A || A0e == A+(M SHIFT)) ? 1 : 2;
    A0 = A;
    A0e = A+(M SHIFT);

    P = ATL_DivBySize(CacheEdge);
    P = ((size_t)M*N+P-1) / P;   /* add more procs only when cache is full */
    P = (P&1 && P > 1)?P+1 : P;  /* don't use odd P, since it hurts alignment */
//   printf("TGER, P=%d\n", P);
    P = Mmin(ATL_NTHREADS, P);
    /*
     * Make sure we don't overflow 32-bit integer lda
     */
    ldaP = P * lda;
    while ((size_t)ldaP != ((size_t)lda)*P)
    {
        P--;
        ldaP = P * lda;
    }
    if (P > 1)
        ATL_goparallel(P, MY_DOWORK_cols, &pd, NULL);
    else
        MY_GER1(M, N, alpha, X, incX, Y, incY, A, lda);
}
Esempio n. 9
0
static TYPE *ATL_LmulUR(const int M, const int N, const TYPE *LU, const int ldl)
{
   const int lda = ldl SHIFT, ldc = N SHIFT, MN = Mmin(M,N);
   int i, j, m;
   TYPE *C, *c;
   #ifdef TREAL
      const TYPE ONE=ATL_rone;
   #else
      const TYPE ONE[2] = {ATL_rone, ATL_rzero};
   #endif

   C = c = malloc(M*ATL_MulBySize(N));
   ATL_assert(c);
   if (M >= N)
   {
      for (i=0; i != N; i++, LU += lda, C += ldc)
      {
         Mjoin(PATL,copy)(i+1, LU, 1, C, 1);
         Mjoin(PATL,zero)(N-i-1, C+((i+1)SHIFT), 1);
      }
      for(; i != M; i++, LU += lda, C += ldc) Mjoin(PATL,copy)(N, LU, 1, C, 1);
      LU -= lda * M;
      C -= ldc * M;
      cblas_trmm(CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasUnit,
                 M, N, ONE, LU, ldl, C, N);
   }
   else /* N > M */
   {
      for (i=0; i != M; i++, C += ldc, LU += lda)
      {
         Mjoin(PATL,zero)(i, C, 1);
         C[i SHIFT] = ATL_rone;
         #ifdef TCPLX
            C[(i SHIFT)+1] = ATL_rzero;
         #endif
         Mjoin(PATL,copy)(N-i-1, LU+((i+1)SHIFT), 1, C+((i+1)SHIFT), 1);
      }
      LU -= lda * M;
      C -= ldc * M;
      cblas_trmm(CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans,
                 CblasNonUnit, M, N, ONE, LU, ldl, C, N);
   }
   return(C);
}
Esempio n. 10
0
void Mjoin(Mjoin(PATL,pcol2blk),NM)
   (const int M, const int N, const TYPE alpha, const TYPE *A, int lda,
    const int ldainc, TYPE *V)
/*
 * Given a packed matrix A, copies N columns starting at A into
 * block-major column panel
 *    ldainc =  0 : General
 *    ldainc =  1 : Upper
 *    ldainc = -1 : Lower
 * NOTE: specialize to alpha cases after it works!
 */
{
   const int kb = Mmin(M,NB);
   const int nrb = M / kb, mr = M - nrb*kb;
   int i, ib, j, J;
   const int NN = N*kb;
   TYPE *v = V + nrb*NN;

   if (ldainc)
   {
      if (ldainc == -1) lda--;
      ATL_assert(N <= NB);
      for (j=0; j != N; j++)
      {
         for (ib=nrb; ib; ib--)
         {
            for (i=0; i < kb; i++) V[i] = ATL_MulByALPHA(A[i]);
            V += NN;
            A += kb;
         }
         if (mr)
         {
            for (i=0; i < mr; i++) v[i] = ATL_MulByALPHA(A[i]);
            v += mr;
         }
         V += kb - nrb*NN;
         A += lda - nrb*kb;
         lda += ldainc;
      }
   }
   else Mjoin(Mjoin(PATL,col2blk),NM)(M, N, A, lda, V, alpha);
}
Esempio n. 11
0
void Mjoin(Mjoin(PATL,prow2blkT),NM)
   (const int M, const int N, const TYPE alpha, const TYPE *A, int lda,
    const int ldainc, TYPE *V)
/*
 * Given a packed Upper matrix A, copies & transposes M rows starting at A into
 * block-major row panel
 *    ldainc =  0 : General rectangular
 *    ldainc =  1 : Upper
 *    ldainc = -1 : Lower
 * NOTE: specialize to alpha cases after it works!
 */
{
   const int kb = Mmin(NB,N);
   const int ncb = N / kb, nr = N - ncb*kb;
   const int incV = kb*M - kb;
   int jb, i, j;
   TYPE *v;

   if (ldainc)
   {
      if (ldainc == -1) lda--;
      for (jb=ncb; jb; jb--)
      {
         for (j=kb; j; j--)
         {
            v = V++;
            for (i=0; i != M; i++, v += kb) *v = ATL_MulByALPHA(A[i]);
            A += lda;
            lda += ldainc;
         }
         V += incV;
      }
      for (j=nr; j; j--)
      {
         v = V++;
         for (i=0; i != M; i++, v += nr) *v = ATL_MulByALPHA(A[i]);
         A += lda;
         lda += ldainc;
      }
   }
   else Mjoin(Mjoin(PATL,row2blkT),NM)(N, M, A, lda, V, alpha);
}
Esempio n. 12
0
void Mjoin(PATL,geset)
   (ATL_CINT M, ATL_CINT N, const SCALAR alpha, const SCALAR beta,
    TYPE *A, ATL_CINT lda)
/*
 * Sets main diagonal to beta, rest of matrix to alpha
 */
{

   ATL_INT j;
   ATL_CINT MN = Mmin(M,N);
   #ifdef TCPLX
      ATL_CINT lda2 = lda+lda;
   #else
      #define lda2 lda
   #endif

#ifdef TCPLX
   if (*alpha == *beta && alpha[1] == beta[1])
#else
   if (alpha == beta)
#endif
   {
      for (j=0; j < N; j++, A += lda2)
         Mjoin(PATL,set)(M, alpha, A, 1);
      return;
   }
   for (j=0; j < MN; j++, A += lda2)
   {
      if (j)
         Mjoin(PATL,set)(j, alpha, A, 1);
      #ifdef TCPLX
         A[j+j] = *beta;
         A[j+j+1] = beta[1];
      #else
         A[j] = beta;
      #endif
      if (M-j-1)
         Mjoin(PATL,set)(M-j-1, alpha, A+((j+1) SHIFT), 1);
   }
   for (; j < N; j++, A += lda2)
      Mjoin(PATL,set)(M, alpha, A, 1);
}
Esempio n. 13
0
void GoToTown(int *nreps, int flsizeKB, int mflopF, int ldagap,  int rout,
              int *Ns, int *Ms, int *UPLOs, int *SDs)
{
   FILE *fpout=stdout;
   double time, mflop, mfB;
   int *nbs, *flgs, *ms, *ns;
   int itst=0, lda, n, m, u, s, b, r, M, kk, nb0, nrep;

   fprintf(fpout, "***     TUNING FOR %10s ***\n",
           Bitmap2Char(FSRout, (rout<<8)+UPLOs[1]+SDs[1]));
   fprintf(fpout, "*********************************\n");
   fprintf(fpout, "TST     REP  UP  SD       M       N     LDA           TIME          MFLOP\n");
   fprintf(fpout, "======  ===  ==  ==  ======  ======  ======  =============  =============\n");

   for (n=1; n <= Ns[0]; n++)
   {
      for (m=1; m <= Ms[0]; m++)
      {
         M = (Ms[m]) ? Ms[m]:Ns[n];
         for (u=1; u <= UPLOs[0]; u++)
         {
            for (s=1; s <= SDs[0]; s++)
            {
               lda = ldagap + M;
               nrep = GetMyReps(Mmin(M,Ns[n]), nreps);
               for (r=1; r <= nrep; r++)
               {
                  time = GetTime(rout, mflopF, lda, M, Ns[n], CAN_NB,
                                 UPLOs[u], SDs[s], flsizeKB);
                  mflop = Time2Flops(rout, UPLOs[u], M, Ns[n], time);
                  fprintf(fpout,
                          "%6d %4d   %c   %c %7d %7d %7d   %13e %14.2f\n",
                           itst++, r, Uplo2Char(rout, UPLOs[u]+SDs[s]),
                           Side2Char(rout, SDs[s]+UPLOs[u]),
                           M, Ns[n], lda, time, mflop);
               }                                /* end of reps loop */
            }                                   /* end of Side loop */
         }                                      /* end of Uplo loop */
      }                                         /* end of M loop */
   }                                            /* end of N loop */
}
Esempio n. 14
0
void F77WRAP_GETRF(const F77_INTEGER *M, const F77_INTEGER *N,
                   TYPE *A, const F77_INTEGER *lda, F77_INTEGER *ipiv0,
                   F77_INTEGER *info)
{
   const int MN = Mmin(*M,*N);
   int i;
   #ifdef ATL_FunkyInts
      int *ipiv;
      ipiv = malloc(MN*sizeof(int));
      ATL_assert(ipiv);
   #else
      #define ipiv ipiv0
   #endif
   *info = ATL_getrf(AtlasColMajor, *M, *N, A, *lda, ipiv);
   #ifdef ATL_FunkyInts
      for (i=0; i != MN; i++) ipiv0[i] = ipiv[i] + 1;
      free(ipiv);
   #else
      for (i=0; i != MN; i++) ipiv[i]++;
   #endif
}
Esempio n. 15
0
void Mjoin(pcol2blkF,_blk)
   (const int blk, const int M, const int N, const SCALAR alpha, const TYPE *A,
    int lda, const int ldainc, TYPE *V)
/*
 * Copies entire MxN matrix to block major format
 */
{
   int j, jb;
   const int incV = blk*(M+M);
   const enum PACK_UPLO UA = (ldainc == 1) ? PackUpper :
      ( (lda == -1) ? PackLower : PackGen );
   void (*col2blk)(const int blk, const int M, const int N, const SCALAR alpha,
                   const TYPE *A, int lda, const int ldainc, TYPE *V);

#ifdef Conj_
   if (alpha[1] == ATL_rzero)
   {
      if (*alpha == ATL_rone) col2blk = Mjoin(Mjoin(PATL,pcol2blkConj_a1),_blk);
      else col2blk = Mjoin(Mjoin(PATL,pcol2blkConj_aXi0),_blk);
   }
   else col2blk = Mjoin(Mjoin(PATL,pcol2blkConj_aX),_blk);
#else
   if (alpha[1] == ATL_rzero)
   {
      if (*alpha == ATL_rone) col2blk = Mjoin(Mjoin(PATL,pcol2blk_a1),_blk);
      else col2blk = Mjoin(Mjoin(PATL,pcol2blk_aXi0),_blk);
   }
   else col2blk = Mjoin(Mjoin(PATL,pcol2blk_aX),_blk);
#endif

   for (j=0; j < N; j += blk)
   {
      jb = N-j;
      jb = Mmin(jb, blk);
      col2blk(blk, M, jb, alpha, A+MindexP(UA,0,j,lda), Mpld(UA,j,lda),
              ldainc, V);
      V += incV;
   }
}
Esempio n. 16
0
int ATL_gerqr(ATL_CINT M, ATL_CINT N, TYPE *A, ATL_CINT LDA, TYPE  *TAU,
               TYPE *ws_RQ2, TYPE *ws_T, ATL_CINT LDT,
               TYPE *WORKM, const int buildT)
{
   int top, bottom, buildT_temp;
   int topMN;
   int I, INFO, IINFO, lbuilt, rbuilt, method;
   int LDA2 = LDA SHIFT;                    /* for complex LDA *2             */
   int LDT2 = LDT SHIFT;                    /* for complex LDT *2             */
   ATL_CINT minMN = Mmin(M, N);

   #ifdef TCPLX
      TYPE ONE[2] = {ATL_rone, ATL_rzero};
   #else
      #define ONE ATL_rone
   #endif

   if (M < 1 || N < 1) return(0);           /* Nothing to do.                 */
   METHOD(method, N, M, LDA);                   /* Find the method.           */
   #if !defined(ATL_USEPTHREADS)
   if (method == 2 || method == 3) method=1;    /* Don't PCA if no affinity.  */
   #endif

   switch(method)                               /* Based on method;           */
   {
      case 0:  /* RECURSION. */

      /*
       * Choose a smart recursive column partitioning based on M:
       */
         if (minMN >= NB+NB)            /* big prob, put remainder on right   */
         {
            topMN = ATL_MulByNB(ATL_DivByNB(minMN>>1));
            bottom = minMN - topMN;
            top  = M - bottom;
         }
         else /* small prob, keep M mult of MU (MU more critical than NU)     */
         {
Esempio n. 17
0
void Mjoin(PATL,symv)
   (const enum ATLAS_UPLO Uplo, const int N, const SCALAR alpha,
    const TYPE *A, const int lda, const TYPE *X, const int incX,
    const SCALAR beta, TYPE *Y, const int incY)
{
   int mb, nb, jb, mb1, incA1, incA, incXY, incXY1, n, j;
   const int lda2=(lda SHIFT);
   const TYPE *x0=X, *x1, *A0=A, *A1;
   TYPE *y1, *y0=Y;

   assert(incX==1 && incY==1 && Uplo == AtlasLower);
   #ifdef TREAL
      assert(alpha == ATL_rone && beta == ATL_rone);
   #else
      assert(*alpha == ATL_rone && *beta == ATL_rone);
      assert(alpha[1] == ATL_rzero && beta[1] == ATL_rzero);
   #endif

   ATL_GetPartSYMV(A, lda, &mb, &nb);
   mb1 = N - ( (N-1) / mb ) * mb;
   incA1 = nb * lda2;  incXY1 = (nb SHIFT);
   incA = incXY = mb SHIFT;
   n = (N-mb)SHIFT;
   A += n; X += n; Y += n;
   for (n=N-mb; n > 0; n -= mb, A -= incA, X -= incXY, Y -= incXY)
   {
      RsymvL(mb, A+n*lda2, lda, X, beta, Y);
      for (j=0, A1=A, x1=x0, y1=y0; j < n; j += nb, A1 += incA1, x1 += incXY1,
           y1 += incXY1)
      {
         jb = n - j;
         jb = Mmin(jb, nb);
         gemvT(jb, mb, alpha, A1, lda, X, 1, beta, y1, 1);
         gemvN(mb, jb, alpha, A1, lda, x1, 1, beta, Y, 1);
      }
   }
   RsymvL(mb1, A0, lda, x0, beta, y0);
}
Esempio n. 18
0
int ATL_ormqr
   (const enum CBLAS_SIDE SIDE, const enum CBLAS_TRANSPOSE TRANS,
    ATL_CINT M, ATL_CINT N, ATL_CINT K, TYPE *A, ATL_CINT lda,
    const TYPE *TAU, TYPE *C, ATL_CINT ldc, TYPE *WORK, ATL_CINT LWORK)
/*
 * This is the C translation of the standard LAPACK Fortran routine:
 *      SUBROUTINE ATL_ormqr( SIDE, TRANS, M, N, K, A, LDA, TAU, C, LDC,
 *                        WORK, LWORK, INFO )
 *
 * ATL_ormqr.c :
 * int ATL_ormqr(const enum CBLAS_SIDE SIDE SIDE,
 *        const enum CBLAS_TRANSPOSE TRANS, ATL_CINT M, ATL_CINT N,
 *        ATL_CINT K, TYPE * A, ATL_CINT lda,TYPE * TAU, TYPE * C, ATL_CINT ldc,
 *                       TYPE * WORK, ATL_CINT LWORK)
 *
 *      NOTE :   ATL_ormqr.c will get compiled to four precisions
 *               single precision real,      double precision real
 *               single precision complex,   double precision complex
 *
 *
 *
 *  Purpose
 *  =======
 *
 *  ATL_ormqr overwrites the general real M-by-N matrix C with
 *
 *                  SIDE = 'L'     SIDE = 'R'
 *  TRANS = 'N':      Q * C          C * Q
 *  TRANS = 'T':      Q**T * C       C * Q**T
 *
 *  where Q is,
 *        a real orthogonal matrix defined as the product of k
 *        elementary reflectors
 *
 *        Q = H(1) H(2) . . . H(k)
 *
 *   OR
 *        a complex unitary matrix defined as a product of k
 *        elementary reflectors
 *
 *        Q = H(1) H(2) . . . H(k)
 *
 *  as returned by ATLL_geqrf.c. Q is of order M if SIDE = 'L' and of order N
 *  if SIDE = 'R'.
 *
 *  Arguments
 *  =========
 *
 *  SIDE    (input) CHARACTER*1
 *          = 'L': apply Q or Q**T from the Left;
 *          = 'R': apply Q or Q**T from the Right.
 *
 *  TRANS   (input) CHARACTER*1
 *          = 'N':  No transpose, apply Q;
 *          = 'T':  Transpose, apply Q**T.
 *
 *  M       (input) INTEGER
 *          The number of rows of the matrix C. M >= 0.
 *
 *  N       (input) INTEGER
 *          The number of columns of the matrix C. N >= 0.
 *
 *  K       (input) INTEGER
 *          The number of elementary reflectors whose product defines
 *          the matrix Q.
 *          If SIDE = 'L', M >= K >= 0;
 *          if SIDE = 'R', N >= K >= 0.
 *
 *  A       (input) array, dimension (LDA,K)
 *          The i-th column must contain the vector which defines the
 *          elementary reflector H(i), for i = 1,2,...,k, as returned by
 *          DGEQRF in the first k columns of its array argument A.
 *          A is modified by the routine but restored on exit.
 *
 *  lda     (input) INTEGER
 *          The leading dimension of the array A.
 *          If SIDE = 'L', LDA >= max(1,M);
 *          if SIDE = 'R', LDA >= max(1,N).
 *
 *  TAU     (input)  array, dimension (K)
 *          TAU(i) must contain the scalar factor of the elementary
 *          reflector H(i), as returned by ATL_geqrf.c.
 *
 *  C       (input/output)  array, dimension (LDC,N)
 *          On entry, the M-by-N matrix C.
 *          On exit, C is overwritten by Q*C or Q**T*C or C*Q**T or C*Q.
 *
 *  ldc     (input) INTEGER
 *          The leading dimension of the array C. LDC >= max(1,M).
 *
 *  WORK    (workspace/output) array, dimension (MAX(1,LWORK))
 *          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *
 *  LWORK   (input) INTEGER
 *          The dimension of the array WORK.
 *          If SIDE = 'L', LWORK >= max(1,N);
 *          if SIDE = 'R', LWORK >= max(1,M).
 *          For optimum performance LWORK >= N*NB if SIDE = 'L', and
 *          LWORK >= M*NB if SIDE = 'R', where NB is the optimal
 *          blocksize.
 *
 *          If LWORK = -1, then a workspace query is assumed; the routine
 *          only calculates the optimal size of the WORK array, returns
 *          this value as the first entry of the WORK array, and no error
 *          message related to LWORK is issued by XERBLA.
 *
 *  INFO    (output) INTEGER
 *          = 0:  successful exit
 *          < 0:  if INFO = -i, the i-th argument had an illegal value
 */
{
   ATL_CINT minMN = Mmin(M, N), maxMN = Mmax(M, N);
   ATL_INT n, nb, j, ib, mi, ni, ic, jc ;
   TYPE  *ws_QR2,  *ws_T, *ws_larfb;        /* Workspace for QR2,T, larfb     */
   void *vp=NULL;

   nb = clapack_ilaenv(LAIS_OPT_NB, LAormqr, MYOPT+LARight+LAUpper, M, N, K,-1);

/*
 * If it is a workspace query, return the size of work required.
 *    wrksz = wrksz of ATL_larfb + ATL_larft + ATL_geqr2
 */
   if (LWORK < 0)
   {
      if(SIDE == CblasLeft)
      {
         *WORK = ( N*nb + nb*nb + maxMN )  ;
      }
      else
      {
         *WORK = ( M*nb + nb*nb + maxMN )  ;
      }
      return(0);
   }
   else if (M < 1 || N < 1)                 /* quick return if no work to do  */
      return(0);
/*
 * If the user gives us too little space, see if we can allocate it ourselves
 */
   else
   {
      if(SIDE == CblasLeft)
      {
         if (LWORK < (N*nb + nb*nb + maxMN))
         {
            vp = malloc(ATL_MulBySize(N*nb + nb*nb + maxMN) + ATL_Cachelen);
            if (!vp)
               return(-7);
            WORK = ATL_AlignPtr(vp);
         }
      }
      else
      {
         if (LWORK < (M*nb + nb*nb + maxMN))
         {
            vp = malloc(ATL_MulBySize(M*nb + nb*nb + maxMN) + ATL_Cachelen);
            if (!vp)
               return(-7);
            WORK = ATL_AlignPtr(vp);
         }
      } /* if CblasRight */
   }

/*
 * Assign workspace areas for ATL_larft, ATL_geqr2, ATL_larfb
 */

   ws_T = WORK;                             /* T at begining of work          */
   ws_QR2 = WORK +(nb SHIFT)*nb;            /* After T Work space             */
   ws_larfb = ws_QR2 + (maxMN SHIFT);       /* After workspace for T and QR2  */


   if (SIDE == CblasLeft)
   {
      if ( TRANS == CblasNoTrans )
      {
         j = (K/nb)*nb;
         if (j == K)
         {
            j=K -nb;
         }
 	 for (j; j >= 0; j = j - nb)
         {
            ib = nb;
            if ((j+nb) > K)
            {
               ib = K - j;
            }
/*
 *          Form the triangular factor of the block reflector
 *          H = H(i) H(i+1) . . . H(i+ib-1)
 */
            ATL_larft(LAForward, LAColumnStore, M-j, ib, A+(j SHIFT)*(lda+1),
                      lda, TAU+(j SHIFT), ws_T, ib);
/*
 *          H or H' is applied to C(i:m,1:n)
 */
            ATL_larfb(SIDE, TRANS, LAForward, LAColumnStore,
                      (M-j), N, ib, A+(j SHIFT)*(lda+1), lda, ws_T, ib,
                      C+(j SHIFT), ldc, ws_larfb, N);
          }                                 /* for                            */
      }                                     /* CblasNoTrans                   */
      else
      {
         for (j = 0 ; j < K; j = j + nb)
         {
            ib = Mmin(K-j, nb);
/*
 *          Form the triangular factor of the block reflector
 *          H = H(i) H(i+1) . . . H(i+ib-1)
 */
            ATL_larft(LAForward, LAColumnStore, M-j, ib, A+(j SHIFT)*(lda+1),
                      lda, TAU+(j SHIFT), ws_T, ib);
/*
 *          H or H' is applied to C(i:m,1:n)
 */
            ATL_larfb(SIDE, TRANS, LAForward, LAColumnStore,
                      (M-j), N, ib, A+(j SHIFT)*(lda+1), lda, ws_T, ib,
                      C+(j SHIFT), ldc, ws_larfb, N);
         }                                  /* for                            */
      }                                     /* CblasNoTran                    */
   }                                        /* cblasLeft                      */
   else
   {
      if ( TRANS == CblasNoTrans )
      {
 	 for (j = 0 ; j < K; j = j + nb)
         {
            ib = Mmin(K-j, nb);
/*
 *          Form the triangular factor of the block reflector
 *          H = H(i) H(i+1) . . . H(i+ib-1)
 */
            ATL_larft(LAForward, LAColumnStore, N-j, ib, A+(j SHIFT)*(lda+1),
                      lda, TAU+(j SHIFT), ws_T, ib);
/*
 *              H or H' is applied to C(1:m,i:n)
 */
            ATL_larfb(SIDE, TRANS, LAForward, LAColumnStore,
                      M, N-j, ib, A+(j SHIFT)*(lda+1), lda, ws_T, ib,
                      C+((j SHIFT)*ldc), ldc, ws_larfb, M);
          }                                 /* for                            */
      }
      else
      {
         j = (K/nb)*nb;
         if (j == K)
         {
            j=K -nb;
         }
 	 for (j; j >= 0; j = j - nb)
         {

            ib = nb;
            if ((j+nb) > K)
            {
               ib = K - j;
            }
/*
 *          Form the triangular factor of the block reflector
 *          H = H(i) H(i+1) . . . H(i+ib-1)
 */
            ATL_larft(LAForward, LAColumnStore, N-j, ib, A+(j SHIFT)*(lda+1),
                      lda, TAU+(j SHIFT), ws_T, ib);
/*
 *              H or H' is applied to C(1:m,i:n)
 */

            ATL_larfb(SIDE, TRANS, LAForward, LAColumnStore,
                      M, N-j , ib, A+(j SHIFT)*(lda+1), lda, ws_T, ib,
                      C+((j SHIFT)*ldc) , ldc, ws_larfb, M);
         }                                  /* for                            */

      }                                     /* Cblas Tran on Right            */
   }

   if (vp)
      free(vp);
   return(0);
}                                           /* END ATL_ormqr                  */
Esempio n. 19
0
void Mjoin( PATL, tbsvUC )
(
   const enum ATLAS_DIAG      DIAG,
   const int                  N,         /* N > 0 assumed */
   const int                  K,
   const TYPE                 * A,
   const int                  LDA,
   TYPE                       * X
)
{
/*
 * Purpose
 * =======
 *
 * Mjoin( PATL, tbsvUC ) solves the following triangular system of equations
 *
 *    conjg( A  ) * x = b,
 *
 * where b and x are n-element vectors and  A  is an n by n unit or non-
 * unit, upper triangular band matrix.
 *
 * No test for  singularity  or  near-singularity  is included  in  this
 * routine. Such tests must be performed before calling this routine.
 *
 * This is a blocked version of the algorithm.  For a more detailed des-
 * cription of  the arguments of this function, see the reference imple-
 * mentation in the  ATLAS/src/blas/reference directory.
 *
 * ---------------------------------------------------------------------
 */
/*
 * .. Local Variables ..
 */
   void                       (*tbsv0)( const int, const int, const TYPE *,
                              const int, TYPE * );
#ifdef TREAL
#define    lda2               LDA
#define    one                ATL_rone
#define    none               ATL_rnone
#else
   TYPE                       none[2] = { ATL_rnone, ATL_rzero },
                              one [2] = { ATL_rone,  ATL_rzero };
   const int                  lda2   = ( LDA SHIFT );
#endif
#ifdef ATL_AXPYMV
   int                        ia, ian, j, kl, ku, m1, ma, mb, mb1, n, na, nb;
#else
   int                        ia, ian, kl, ku, ma, mb, na, nb, nb1;
#endif
/* ..
 * .. Executable Statements ..
 *
 */
   ATL_GetPartMVN( A, LDA, &mb, &nb );

   if( DIAG == AtlasNonUnit ) tbsv0 = Mjoin( PATL, tbsvUCN );
   else                       tbsv0 = Mjoin( PATL, tbsvUCU );

#ifdef ATL_AXPYMV
   mb1  = N - ( m1 = ( ( N - 1 ) / mb ) * mb );

   tbsv0( mb1, K, A+m1*lda2, LDA, X+(m1 SHIFT) );

   for( n = mb1, j = m1 - mb; n < N; n += mb, j -= mb )
   {
      ian = j + mb; ia = mb - K; ia = j + Mmax( ia, 0 ); ma = ian - ia;
      na = N - ian; na = Mmin( na, K ); kl = ma - 1; kl = Mmax( kl, 0 );
      ku = K - 1 - kl; ku = Mmax( ku, 0 );
      Mjoin( PATL, gbmv )( AtlasConj,    ma, na, kl, ku, none, A+ian*lda2,
                           LDA, X+(ian SHIFT), 1, one, X+(ia SHIFT), 1 );
      tbsv0( mb, K, A+j*lda2, LDA, X+(j SHIFT) );
   }
#else
   nb1 = N - ( ( N - 1 ) / nb ) * nb;

   for( ian = N - nb; ian > 0; ian -= nb )
   {
      ia = ian - K; ia = Mmax( ia, 0 ); ma = ian - ia;  na = Mmin( nb, K );
      kl = ma - 1; kl = Mmax( kl, 0 ); ku = K - 1 - kl; ku = Mmax( ku, 0 );
      tbsv0( nb, K, A+ian*lda2, LDA, X+(ian SHIFT) );
      Mjoin( PATL, gbmv )( AtlasConj,    ma, na, kl, ku, none, A+ian*lda2,
                           LDA, X+(ian SHIFT), 1, one, X+(ia SHIFT), 1 );
   }
   tbsv0( nb1, K, A, LDA, X );
#endif
/*
 * End of Mjoin( PATL, tbsvUC )
 */
}
Esempio n. 20
0
void Mjoin(PATL,ttrsm)(const enum ATLAS_SIDE side, const enum ATLAS_UPLO uplo,
                       const enum ATLAS_TRANS TA, const enum ATLAS_DIAG diag,
                       ATL_CINT M, ATL_CINT N, const SCALAR alpha,
                       const TYPE *A, ATL_CINT lda, TYPE *B, ATL_CINT ldb)
{
   ATL_TTRSM_t trsms[ATL_NTHREADS];
   TYPE *b;
   ATL_INT n, nblks, minblks;
   double tblks;
   int nr, p, i, j, extrablks;
   static int nb=0;

   if (M < 1 || N < 1)
      return;
   if (SCALAR_IS_ZERO(alpha))
   {
      Mjoin(PATL,gezero)(M, N, B, ldb);
      return;
   }
   #if defined(ATL_ARCH_XeonPHI) && defined(TREAL)
   {
      int Mjoin(PATL,ttrsm_amm)
         (const enum ATLAS_SIDE side, const enum ATLAS_UPLO uplo,
          const enum ATLAS_TRANS TA, const enum ATLAS_DIAG diag,
          ATL_CINT M, ATL_CINT N, const SCALAR alpha,
          const TYPE *A, ATL_CINT lda, TYPE *B, ATL_CINT ldb);
     if (!Mjoin(PATL,ttrsm_amm)(side, uplo, TA, diag, M, N, alpha,
                                A, lda, B, ldb))
        return;
   }
   #endif
/*
 * Distribute RHS over the processors
 */
   if (!nb) nb = Mjoin(PATL,GetNB)();
   if (side == AtlasLeft)
   {
      nblks = N/nb;
      nr = N - nblks*nb;
      tblks = ((double)(M*N)) / ( (double)nb * nb );
      p = (tblks+ATL_TTRSM_XOVER-1)/ATL_TTRSM_XOVER;
      p = Mmin(p, ATL_NTHREADS);
      p = p ? p : 1;

      b = B;
      minblks = nblks / p;
      extrablks = nblks - minblks*p;
      for (i=0; i < p; i++)
      {
         if (i < extrablks)
            n = (minblks+1)*nb;
         else if (i == extrablks)
            n = minblks*nb + nr;
         else
            n = minblks*nb;
         trsms[i].A = A;
         trsms[i].M = M;
         trsms[i].N = n;
         trsms[i].lda = lda;
         trsms[i].ldb = ldb;
         trsms[i].B = b;
         trsms[i].alpha = SADD alpha;
         trsms[i].side = side;
         trsms[i].uplo = uplo;
         trsms[i].TA   = TA;
         trsms[i].diag = diag;
         n *= (ldb << Mjoin(PATL,shift));
         b = MindxT(b, n);
      }
   }
   else /* Side == AtlasRight */
   {
      nblks = M/nb;
      nr = M - nblks*nb;
      tblks = (N/nb)*nblks;
      p = (tblks+ATL_TTRSM_XOVER-1)/ATL_TTRSM_XOVER;
      p = Mmin(p, ATL_NTHREADS);
      p = p ? p : 1;

      b = B;
      minblks = nblks / p;
      extrablks = nblks - minblks*p;
      for (i=0; i < p; i++)
      {
         if (i < extrablks)
            n = (minblks+1)*nb;
         else if (i == extrablks)
            n = minblks*nb + nr;
         else
            n = minblks*nb;
         trsms[i].A = A;
         trsms[i].M = n;
         trsms[i].N = N;
         trsms[i].lda = lda;
         trsms[i].ldb = ldb;
         trsms[i].B = b;
         trsms[i].alpha = SADD alpha;
         trsms[i].side = side;
         trsms[i].uplo = uplo;
         trsms[i].TA   = TA;
         trsms[i].diag = diag;
         n <<= Mjoin(PATL,shift);
         b = MindxT(b, n);
      }
   }
   if (p < 2)
   {
      Mjoin(PATL,trsm)(side, uplo, TA, diag, M, N, alpha, A, lda, B, ldb);
      return;
   }
   for (; i < ATL_NTHREADS; i++)  /* flag rest of struct as uninitialized */
      trsms[i].B = NULL;
   ATL_goparallel(p, Mjoin(PATL,DoWorkTRSM), trsms, NULL);
}
Esempio n. 21
0
int
HPCC_Stream(HPCC_Params *params, int doIO, double *copyGBs, double *scaleGBs, double *addGBs,
  double *triadGBs, int *failure) {
    int   quantum;
    int   BytesPerWord;
    register int j, k;
    double  scalar, t, times[4][NTIMES];
    FILE *outFile;
    double GiBs = 1073741824.0, curGBs;

    if (doIO) {
      // outFile = fopen( params->outFname, "w+" );
	  outFile = stdout;
      if (! outFile) {
        outFile = stderr;
        fprintf( outFile, "Cannot open output file.\n" );
        return 1;
      }
    }

    // VectorSize = HPCC_LocalVectorSize( params, 3, sizeof(double), 0 ); /* Need 3 vectors */
	// HARDCODED VectorSize
    // params->StreamVectorSize = VectorSize;

    a = HPCC_XMALLOC( double, VectorSize );
    b = HPCC_XMALLOC( double, VectorSize );
    c = HPCC_XMALLOC( double, VectorSize );

    if (!a || !b || !c) {
      if (c) HPCC_free(c);
      if (b) HPCC_free(b);
      if (a) HPCC_free(a);
      if (doIO) {
        fprintf( outFile, "Failed to allocate memory (%lu).\n", VectorSize );
        fflush( outFile );
        fclose( outFile );
      }
      return 1;
    }

    /* --- SETUP --- determine precision and check timing --- */

    if (doIO) {
    fprintf (outFile, "Generated on %s\n", params->nowASCII);
    fprintf( outFile, HLINE);
    BytesPerWord = sizeof(double);
    fprintf( outFile, "This system uses %d bytes per DOUBLE PRECISION word.\n",
             BytesPerWord);

    fprintf( outFile, HLINE);
    fprintf( outFile, "Array size = %lu, Offset = %d\n" , VectorSize, OFFSET);
    fprintf( outFile, "Total memory required = %.4f GiB.\n",
             (3.0 * BytesPerWord) * ( (double) VectorSize / GiBs));
    fprintf( outFile, "Each test is run %d times, but only\n", NTIMES);
    fprintf( outFile, "the *best* time for each is used.\n");
	fflush ( outFile);
    }

#ifdef _OPENMP
    if (doIO) fprintf( outFile, HLINE);
#pragma omp parallel private(k)
    {
#pragma omp single nowait
      {
        k = omp_get_num_threads();
        if (doIO) fprintf( outFile, "Number of Threads requested = %i\n",k);
        params->StreamThreads = k;
      }
    }
#endif

    /* Get initial value for system clock. */
#ifdef _OPENMP
#pragma omp parallel for
#endif
    for (j=0; j<VectorSize; j++) {
      a[j] = 1.0;
      b[j] = 2.0;
      c[j] = 0.0;
    }

    if (doIO) fprintf( outFile, HLINE);

    if  ( (quantum = checktick()) >= 1) {
      if (doIO) fprintf( outFile, "Your clock granularity/precision appears to be "
                         "%d microseconds.\n", quantum);
    } else {
      if (doIO) fprintf( outFile, "Your clock granularity appears to be "
                         "less than one microsecond.\n");
    }

    t = mysecond();
#ifdef _OPENMP
#pragma omp parallel for
#endif
    for (j = 0; j < VectorSize; j++)
      a[j] = 2.0E0 * a[j];
    t = 1.0E6 * (mysecond() - t);

    if (doIO) {
    fprintf( outFile, "Each test below will take on the order"
             " of %d microseconds.\n", (int) t  );
    fprintf( outFile, "   (= %d clock ticks)\n", (int) (t/quantum) );
    fprintf( outFile, "Increase the size of the arrays if this shows that\n");
    fprintf( outFile, "you are not getting at least 20 clock ticks per test.\n");

    fprintf( outFile, HLINE);

    fprintf( outFile, "WARNING -- The above is only a rough guideline.\n");
    fprintf( outFile, "For best results, please be sure you know the\n");
    fprintf( outFile, "precision of your system timer.\n");
    fprintf( outFile, HLINE);
    }

    /* --- MAIN LOOP --- repeat test cases NTIMES times --- */

    scalar = 3.0;
    for (k=0; k<NTIMES; k++)
    {
      times[0][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Copy();
#else
#ifdef _OPENMP
#pragma omp parallel for
#endif
        for (j=0; j<VectorSize; j++)
          c[j] = a[j];
#endif
        times[0][k] = mysecond() - times[0][k];

        times[1][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Scale(scalar);
#else
#ifdef _OPENMP
#pragma omp parallel for
#endif
        for (j=0; j<VectorSize; j++)
          b[j] = scalar*c[j];
#endif
        times[1][k] = mysecond() - times[1][k];

        times[2][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Add();
#else
#ifdef _OPENMP
#pragma omp parallel for
#endif
        for (j=0; j<VectorSize; j++)
          c[j] = a[j]+b[j];
#endif
        times[2][k] = mysecond() - times[2][k];

        times[3][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Triad(scalar);
#else
#ifdef _OPENMP
#pragma omp parallel for
#endif
        for (j=0; j<VectorSize; j++)
          a[j] = b[j]+scalar*c[j];
#endif
        times[3][k] = mysecond() - times[3][k];
    }

    /* --- SUMMARY --- */

    for (k=1; k<NTIMES; k++) /* note -- skip first iteration */
    {
      for (j=0; j<4; j++)
      {
        avgtime[j] = avgtime[j] + times[j][k];
        mintime[j] = Mmin(mintime[j], times[j][k]);
        maxtime[j] = Mmax(maxtime[j], times[j][k]);
      }
    }

    if (doIO)
    fprintf( outFile, "Function      Rate (GB/s)   Avg time     Min time     Max time\n");
    for (j=0; j<4; j++) {
      avgtime[j] /= (double)(NTIMES - 1); /* note -- skip first iteration */

      /* make sure no division by zero */
      curGBs = (mintime[j] > 0.0 ? 1.0 / mintime[j] : -1.0);
      curGBs *= 1e-9 * bytes[j] * VectorSize;
        if (doIO)
          fprintf( outFile, "%s%11.4f  %11.4f  %11.4f  %11.4f\n", label[j],
                   curGBs,
                   avgtime[j],
                   mintime[j],
                   maxtime[j]);
        switch (j) {
          case 0: *copyGBs = curGBs; break;
          case 1: *scaleGBs = curGBs; break;
          case 2: *addGBs = curGBs; break;
          case 3: *triadGBs = curGBs; break;
        }
    }
    if (doIO) fprintf( outFile, HLINE);

    /* --- Check Results --- */
    checkSTREAMresults( outFile, doIO, failure );
    if (doIO) fprintf( outFile, HLINE);

    HPCC_free(c);
    HPCC_free(b);
    HPCC_free(a);

    if (doIO) {
      fflush( outFile );
      fclose( outFile );
    }

    return 0;
}
Esempio n. 22
0
void Mjoin(PATL,tgemv)
   (const enum ATLAS_TRANS TA, ATL_CINT M, ATL_CINT N, const SCALAR alpha,
    const TYPE *A, ATL_CINT lda, const TYPE *X, ATL_CINT incX,
    const SCALAR beta, TYPE *Y, ATL_CINT incY)
{
   static size_t ALb=0, ALe=0;
   size_t at = (size_t) A;
   ATL_INT n, P, ldaP;
   ATL_TGEMV_t pd;
/*
 * quick return if possible.
 */
   if (M < 1 || N < 1)
      return;
   if (SCALAR_IS_ZERO(alpha))   /* No contrib from alpha*A*x */
   {
      ATL_CINT NY = (TA == AtlasTrans || TA == AtlasConjTrans) ? N : M;
      if (!SCALAR_IS_ONE(beta))
      {
         if (SCALAR_IS_ZERO(beta))
            Mjoin(PATL,zero)(NY, Y, incY);
         else
            Mjoin(PATL,scal)(NY, beta, Y, incY);
      }
      return;
   }
   pd.flg = (at >= ALb && at <= ALe) ? 1 : 0;
   ALb = (size_t)A;
   ALe = (size_t)(A+(M SHIFT));
   #ifdef TREAL
      pd.flg |= (TA == AtlasTrans || TA == AtlasConjTrans) ? 2 : 0;
   #else
      if (TA != AtlasNoTrans)
      {
         if (TA == AtlasConj)
            pd.flg |= 4;
         else if (TA == AtlasTrans)
            pd.flg |= 2;
         else /* if (TA == AtlasConjTrans) */
            pd.flg |= (2|4);
      }
   #endif
   P = ATL_DivBySize(CacheEdge);
   P = ((size_t)M*N+P-1) / P;   /* add more procs only when cache is full */
   P = (P&1 && P > 1)?P+1 : P;  /* don't use odd P; it hurts alignment */
   P = Mmin(ATL_NTHREADS, P);
if (TA == AtlasNoTrans || TA == AtlasConj)
   P=1;
//fprintf(stderr, "P=%d, TA=%d, M=%d, N=%d\n", P, (TA==AtlasTrans), M, N);
/*
 * Make sure we don't overflow 32-bit integer lda
 */
   ldaP = P * lda;
   while ((size_t)ldaP != ((size_t)lda)*P)
   {
      P--;
      ldaP = P * lda;
   }
   if (P > 1)
   {
      pd.M = M; pd.N = N; pd.incX = incX; pd.incY = incY; pd.lda = lda;
      pd.alpha = alpha; pd.beta = beta;
      pd.X = X; pd.Y = Y; pd.A = A;
      pd.P = P;
      n = N / P;
      pd.n = n;
      pd.nr = N - n*P;
      if (pd.flg & 2)   /* Transpose case */
      {
         ATL_goparallel(P, Mjoin(PATL,DOMVTWORK_cols), &pd, NULL);
         return;
      }
/*
 *    For gemvN, everyone needs a private M-length y.  Don't do this unless
 *    we are sure the combine cost is likely dominated by the parallelism
 */
      else if (n > Mmax(P,8))
      {
         int vrank;
         const TYPE *a;
         TYPE *y, *y0;
         #ifdef TCPLX
            TYPE one[2] = {ATL_rone, ATL_rzero};
            TYPE zero[2] = {ATL_rzero, ATL_rzero};
         #endif

         y0 = y = malloc(P*(ATL_Cachelen+ATL_MulBySize(M)));
         ATL_assert(y);
         pd.Y = y;
         pd.incY = 1;
         #ifdef TREAL
            pd.alpha = ATL_rone;
            pd.beta  = ATL_rzero;
         #else
            pd.alpha = one;
            pd.beta  = zero;
         #endif
         ATL_goparallel(P, Mjoin(PATL,DOMVNWORK_cols), &pd,
                        Mjoin(PATL,CombineMVN));
/*
 *       goparallel reduces all node's Ys to node 0's.  Extract his from the
 *       work array, and combine it with input array, applying both alpha
 *       and beta in the process
 */
         vrank = (!pd.nr || (pd.flg & 1)) ? 0 : pd.nr-1;
         a = A + (lda SHIFT)*vrank;
         y = ATL_Align2Ptr(y, a);
         Mjoin(PATL,axpby)(M, alpha, y, 1, beta, Y, incY);
         free(y0);
         return;
      }
   }
/*
 * If we haven't parallelized this thing, just do it serial
 */
   Mjoin(PATL,gemv)(TA, M, N, alpha, A, lda, X, incX, beta, Y, incY);
}
Esempio n. 23
0
static int
dtr2mx_(double *a, int *lda, double *beta, double *t, int *ldt, int *nrow, int *ncol, int *
	mb, int *nb, int *ilt, int *jlt) {
    /* System generated locals */
    long a_dim1, a_offset, t_dim1, t_offset;
    int i__1, i__2, i__3, i__4;

    /* Local variables */
    static int k, ia, ja, jj, ki, kj, it, jt, mr, irm, jrm;


/*  -- PUMMA Package routine (version 2.1) -- */
/*     Jaeyoung Choi, Oak Ridge National Laboratory. */
/*     Jack Dongarra, Univ. of Tennessee, Oak Ridge National Laboratory. */
/*     David Walker,  Oak Ridge National Laboratory. */
/*     October 31, 1994. */

/*  Purpose */

/*  T <== A' + beta*T (assume beta = 0.0, or 1.0) */
/*  T is a scattered 2-D array from a scattered 2-D array A */

/*     T = A' */

    /* Parameter adjustments */
    a_dim1 = *lda;
    a_offset = 1 + a_dim1;
    a -= a_offset;
    t_dim1 = *ldt;
    t_offset = 1 + t_dim1;
    t -= t_offset;

    /* Function Body */
    ia = 0;
    jt = 0;

    if (*beta == 0.) {
	i__1 = *nrow - 2;
	for (ki = 0; ki <= i__1; ++ki) {
	    ja = 0;
	    it = 0;
	    i__2 = *ncol - 2;
	    for (kj = 0; kj <= i__2; ++kj) {
		i__3 = *nb;
		for (jj = 1; jj <= i__3; ++jj) {
		    i__4 = *mb;
		    for (k = 1; k <= i__4; ++k) {
			t[it + jj + (jt + k) * t_dim1] = a[ia + k + (ja + jj) * a_dim1];
/* L10: */
		    }
		}
		ja += commtrb_1.jaz;
		it += commtrb_1.itz;
/* L20: */
	    }

	    jrm = *jlt - ja;
	    if (jrm > 0) {
		i__2 = Mmin(*nb,jrm);
		for (jj = 1; jj <= i__2; ++jj) {
		    i__4 = *mb;
		    for (k = 1; k <= i__4; ++k) {
			t[it + jj + (jt + k) * t_dim1] = a[ia + k + (ja + jj) * a_dim1];
/* L30: */
		    }
		}
	    }
	    ia += commtrb_1.iaz;
	    jt += commtrb_1.jtz;
/* L40: */
	}

	irm = *ilt - ia;
	if (irm > 0) {
	    ja = 0;
	    it = 0;
	    mr = Mmin(irm,*mb);
	    i__1 = *ncol - 2;
	    for (kj = 0; kj <= i__1; ++kj) {
		i__4 = *nb;
		for (jj = 1; jj <= i__4; ++jj) {
		    i__2 = mr;
		    for (k = 1; k <= i__2; ++k) {
			t[it + jj + (jt + k) * t_dim1] = a[ia + k + (ja + jj) * a_dim1];
/* L50: */
		    }
		}
		ja += commtrb_1.jaz;
		it += commtrb_1.itz;
/* L60: */
	    }

	    jrm = *jlt - ja;
	    if (jrm > 0) {
		i__1 = Mmin(*nb,jrm);
		for (jj = 1; jj <= i__1; ++jj) {
		    i__2 = mr;
		    for (k = 1; k <= i__2; ++k) {
			t[it + jj + (jt + k) * t_dim1] = a[ia + k + (ja + jj) * a_dim1];
/* L70: */
		    }
		}
	    }
	}

    } else {

/*         T = A' + T */

	i__2 = *nrow - 2;
	for (ki = 0; ki <= i__2; ++ki) {
	    ja = 0;
	    it = 0;
	    i__1 = *ncol - 2;
	    for (kj = 0; kj <= i__1; ++kj) {
		i__4 = *nb;
		for (jj = 1; jj <= i__4; ++jj) {
		    i__3 = *mb;
		    for (k = 1; k <= i__3; ++k) {
			t[it + jj + (jt + k) * t_dim1] += a[ia + k + (ja + jj) * a_dim1];
/* L80: */
		    }
		}
		ja += commtrb_1.jaz;
		it += commtrb_1.itz;
/* L90: */
	    }

	    jrm = *jlt - ja;
	    if (jrm > 0) {
		i__1 = Mmin(*nb,jrm);
		for (jj = 1; jj <= i__1; ++jj) {
		    i__3 = *mb;
		    for (k = 1; k <= i__3; ++k) {
			t[it + jj + (jt + k) * t_dim1] += a[ia + k + (ja + jj) * a_dim1];
/* L100: */
		    }
		}
	    }
	    ia += commtrb_1.iaz;
	    jt += commtrb_1.jtz;
/* L110: */
	}

	irm = *ilt - ia;
	if (irm > 0) {
	    ja = 0;
	    it = 0;
	    mr = Mmin(irm,*mb);
	    i__2 = *ncol - 2;
	    for (kj = 0; kj <= i__2; ++kj) {
		i__3 = *nb;
		for (jj = 1; jj <= i__3; ++jj) {
		    i__1 = mr;
		    for (k = 1; k <= i__1; ++k) {
			t[it + jj + (jt + k) * t_dim1] += a[ia + k + (ja + jj) * a_dim1];
/* L120: */
		    }
		}
		ja += commtrb_1.jaz;
		it += commtrb_1.itz;
/* L130: */
	    }

	    jrm = *jlt - ja;
	    if (jrm > 0) {
		i__2 = Mmin(*nb,jrm);
		for (jj = 1; jj <= i__2; ++jj) {
		    i__1 = mr;
		    for (k = 1; k <= i__1; ++k) {
			t[it + jj + (jt + k) * t_dim1] += a[ia + k + (ja + jj) * a_dim1];
/* L140: */
		    }
		}
	    }
	}
    }

    return 0;
} /* dtr2mx_ */
Esempio n. 24
0
int Mjoin(PATL,gels)
   (const enum ATLAS_TRANS TA, ATL_CINT M, ATL_CINT N, ATL_CINT NRHS,
    TYPE *A, ATL_CINT lda, TYPE *B, ATL_CINT ldb, TYPE *work, ATL_CINT lwork)
/*
 *  GELS solves overdetermined or underdetermined linear systems
 *  involving an M-by-N matrix A, or its conjugate-transpose, using a QR
 *  or LQ factorization of A.  It is assumed that A has full rank.
 *
 *  This is a straight translation from LAPACK 3.2.1; the only performance
 *  improvements come from using ATLAS's improved QR (and slighly ORMQR)
 *  implementations.
 *
 *  The following options are provided:
 *
 *  1. If TRANS = 'N' and m >= n:  find the least squares solution of
 *     an overdetermined system, i.e., solve the least squares problem
 *                  minimize || B - A*X ||.
 *
 *  2. If TRANS = 'N' and m < n:  find the minimum norm solution of
 *     an underdetermined system A * X = B.
 *
 *  3. If TRANS = 'C/T' and m >= n:  find the minimum norm solution of
 *     an undetermined system A**H * X = B.
 *
 *  4. If TRANS = 'C/T' and m < n:  find the least squares solution of
 *     an overdetermined system, i.e., solve the least squares problem
 *                  minimize || B - A**H * X ||.
 *
 *  Several right hand side vectors b and solution vectors x can be
 *  handled in a single call; they are stored as the contiguously in the
 *  M-by-NRHS right hand side matrix B and the N-by-NRHS solution
 *  matrix X.
 *
 *  TRANS   (input) CHARACTER*1
 *          = 'N': the linear system involves A;
 *          = 'C': the linear system involves A**H (complex only).
 *          = 'T': the linear system involves A**T (real only).
 *
 *  M       (input) INTEGER
 *          The number of rows of the matrix A.  M >= 0.
 *
 *  N       (input) INTEGER
 *          The number of columns of the matrix A.  N >= 0.
 *
 *  NRHS    (input) INTEGER
 *          The number of right hand sides, i.e., the number of
 *          columns of the matrices B and X. NRHS >= 0.
 *
 *  A       (input/output) COMPLEX*16 array, dimension (LDA,N)
 *          On entry, the M-by-N matrix A.
 *            if M >= N, A is overwritten by details of its QR
 *                       factorization as returned by GEQRF;
 *            if M <  N, A is overwritten by details of its LQ
 *                       factorization as returned by GELQF.
 *  LDA     (input) INTEGER
 *          The leading dimension of the array A.  LDA >= max(1,M).
 *
 *  B       (input/output) COMPLEX*16 array, dimension (LDB,NRHS)
 *          On entry, the matrix B of right hand side vectors, stored
 *          columnwise; B is M-by-NRHS if TRANS = 'N', or N-by-NRHS
 *          if TRANS = 'C/T'.
 *          On exit, if INFO = 0, B is overwritten by the solution
 *          vectors, stored columnwise:
 *          if TRANS = 'N' and m >= n, rows 1 to n of B contain the least
 *          squares solution vectors; the residual sum of squares for the
 *          solution in each column is given by the sum of squares of the
 *          modulus of elements N+1 to M in that column;
 *          if TRANS = 'N' and m < n, rows 1 to N of B contain the
 *          minimum norm solution vectors;
 *          if TRANS = 'C' and m >= n, rows 1 to M of B contain the
 *          minimum norm solution vectors;
 *          if TRANS = 'C' and m < n, rows 1 to M of B contain the
 *          least squares solution vectors; the residual sum of squares
 *          for the solution in each column is given by the sum of
 *          squares of the modulus of elements M+1 to N in that column.
 *
 *  LDB     (input) INTEGER
 *          The leading dimension of the array B. LDB >= MAX(1,M,N).
 *
 * RETURNS:
 *          = 0:  successful exit
 *          < 0:  if INFO = -i, the i-th argument had an illegal value
 *          > 0:  if INFO =  i, the i-th diagonal element of the
 *                triangular factor of A is zero, so that A does not have
 *                full rank; the least squares solution could not be
 *                computed.
 */
{
   TYPE *TAU;
   #ifdef TCPLX
      const enum ATLAS_TRANS RTRAN = (TA == AtlasNoTrans) ?
                                     AtlasConjTrans : AtlasNoTrans;
      const TYPE one[3] = {ATL_rone, ATL_rzero, ATL_rzero}, *zero=one+1;
      TYPE wsq[4];
   #else
      const enum ATLAS_TRANS RTRAN = (TA == AtlasNoTrans) ?
                                     AtlasTrans : AtlasNoTrans;
      #define one ATL_rone
      #define zero ATL_rzero
      TYPE wsq[2];
   #endif
   TYPE *free0=NULL;
   TYPE anrm, bnrm;
   ATL_INT scalN, wlen;
   ATL_CINT MN = Mmin(M,N);
   int iascal=0, ibscal=0, ierr;
/*
 * Quick return for degenerate cases
 */
   if (!NRHS)
      return(0);
   else if (!M || !N)
   {
      Mjoin(PATL,geset)(Mmax(M,N), NRHS, zero, zero, B, ldb);
      return(0);
   }
/*
 * If no workspace given, routines will simply allocate their own, we need TAU
 */
   if (lwork == 0 || lwork < -1)
   {
      free0 = TAU = malloc(MN*ATL_sizeof);
      ATL_assert(TAU);
      work = NULL;
      wlen = 0;
   }
/*
 * If the user is providing workspace, or doing a workspace query, we must
 * compute the required workspace
 */
   else
   {
      wlen = MN;    /* space needed for TAU */
      if (M >= N)
      {
         ATL_assert(!Mjoin(PATL,geqrf)(M, N, A, lda, NULL, wsq, -1));
         ATL_assert(!Mjoin(PATL,ormqr)(AtlasLeft, RTRAN, M, NRHS, N, A, lda,
                                   NULL, B, ldb, wsq+(1 SHIFT), -1));
      }
      else
      {
         ATL_assert(!Mjoin(PATL,gelqf)(M, N, A, lda, NULL, wsq, -1));
         ATL_assert(!Mjoin(PATL,ormqr)(AtlasLeft, RTRAN, N, NRHS, M, A, lda,
                                   NULL, B, ldb, wsq+(1 SHIFT), -1));
      }
      if (wsq[1 SHIFT] > wsq[0])
         wsq[0] = wsq[1 SHIFT];
      wlen += wsq[0];
/*
 *    If this was a workspace query, return optimal workspace in *work
 */
      if (lwork == -1)
      {
         *work = wlen;
         return(0);
      }
/*
 *    Otherwise, take action if user's workspace is inadequate
 */
     if (lwork < wlen)
     {
        if (lwork >= wlen-MN) /* users space is work, we alloc TAU */
        {
           free0 = TAU = malloc(MN*ATL_sizeof);
           wlen -= MN;
           ATL_assert(TAU);
           work = work;
        }
        else if (lwork < MN) /* can't even use workspace for TAU */
        {
           free0 = TAU = malloc(MN*ATL_sizeof);
           ATL_assert(TAU);
           work = NULL;
           wlen = 0;
        }
        else  /* user's workspace becomes TAU; let worker routs alloc work */
        {
           TAU = work;
           work = NULL;
           wlen = 0;
        }
     }
     else  /* user provided adequate workspace for everything */
     {
        wlen = lwork - MN;
        TAU = work;
        work += MN SHIFT;
     }
   }
// TPSD is (TA != AtlasNoTrans)
/*
 * ===============================================================
 * Scale if max elt in A is outside safe range, return if nrm is 0
 * ===============================================================
 */
   anrm = Mjoin(PATL,gemaxnrm)(M, N, A, lda);
/*
 * If it is below it, scale matrix norm up to smallest safe number
 */
   if (anrm > ATL_rzero && anrm < ATL_labadUNDERTHRESH)
   {
      Mjoin(PATL,lascl)(LAMATG, 0, 0, anrm, ATL_labadUNDERTHRESH, M, N, A, lda);
      iascal = 1;
   }
/*
 * If matrix norm huge, scale it down by largest safe number
 */
   else if (anrm > ATL_labadOVERTHRESH)
   {
      Mjoin(PATL,lascl)(LAMATG, 0, 0, anrm, ATL_labadOVERTHRESH, M, N, A, lda);
      iascal = 2;
   }
/*
 * If norm is 0, entire matrix is 0, return zero solution
 */
   else if (anrm == ATL_rzero)
   {
      Mjoin(PATL,geset)(Mmax(M,N), NRHS, zero, zero, B, ldb);
      if (free0)
         free(free0);
      return(0);
   }
/*
 * ===============================================================
 * Scale if max elt in B is outside safe range, return if nrm is 0
 * ===============================================================
 */
   scalN = (TA != AtlasNoTrans) ? N : M;
   bnrm = Mjoin(PATL,gemaxnrm)(scalN, NRHS, B, ldb);
/*
 * If it is below it, scale matrix norm up to smallest safe number
 */
   if (bnrm > ATL_rzero && bnrm < ATL_labadUNDERTHRESH)
   {
      Mjoin(PATL,lascl)(LAMATG, 0, 0, bnrm, ATL_labadUNDERTHRESH,
                        scalN, NRHS, B, ldb);
      ibscal = 1;
   }
/*
 * If matrix norm huge, scale it down by largest safe number
 */
   else if (bnrm > ATL_labadOVERTHRESH)
   {
      Mjoin(PATL,lascl)(LAMATG, 0, 0, bnrm, ATL_labadOVERTHRESH,
                        scalN, NRHS, B, ldb);
      ibscal = 2;
   }
   if (M >= N)  /* overdetermined system */
   {
/*
 *    Compute QR factorization of A
 */
      ATL_assert(!Mjoin(PATL,geqrf)(M, N, A, lda, TAU, work, wlen));
/*
 *    Least-squares problem min || A * X - B ||
 */
      if (TA == AtlasNoTrans)
      {
         ATL_assert(!Mjoin(PATL,ormqr)(AtlasLeft, RTRAN, M, NRHS, N, A, lda,
                                       TAU, B, ldb, work, wlen));
         ierr = Mjoin(PATL,trtrs)(AtlasUpper, AtlasNoTrans, AtlasNonUnit,
                                  N, NRHS, A, lda, B, ldb);
         if (ierr)
         {
            if (free0)
               free(free0);
           return(ierr);
         }
         scalN = N;
      }
/*
 *    Overdetermined system of equations A' * X = B
 */
      else  /* transposed case */
      {
         ierr = Mjoin(PATL,trtrs)(AtlasUpper, TA, AtlasNonUnit,
                                  N, NRHS, A, lda, B, ldb);
         if (ierr)
         {
            if (free0)
               free(free0);
            return(ierr);
         }
         Mjoin(PATL,gezero)(M-N, NRHS, B+(N SHIFT), ldb);
          ATL_assert(!Mjoin(PATL,ormqr)(AtlasLeft, AtlasNoTrans, M, NRHS, N,
                                        A, lda, TAU, B, ldb, work, wlen));
         scalN = M;
      }
   }
/*
 * Compute LQ factorization of A
 */
   else    /* M < N */
   {
      ATL_assert(!Mjoin(PATL,gelqf)(M, N, A, lda, TAU, work, wlen));
/*
 *    Underdetermined system of equations A * X = B
 */
      if (TA == AtlasNoTrans)
      {
/*
 *       B(1:M,1:NRHS) = inv(L) * B(1:M,1:NRHS)
 */
         ierr = Mjoin(PATL,trtrs)(AtlasLower, AtlasNoTrans, AtlasNonUnit,
                                  M, NRHS, A, lda, B, ldb);
         if (ierr)
         {
            if (free0)
               free(free0);
            return(ierr);
         }
         Mjoin(PATL,gezero)(N-M, NRHS, B+(M SHIFT), ldb);
         ATL_assert(!Mjoin(PATL,ormlq)(AtlasLeft, RTRAN, N, NRHS, M, A, lda,
                                       TAU, B, ldb, work, wlen));
         scalN = N;
      }
/*
 *    Overdetermined system min || A' * X - B ||
 */
      else
      {
         ATL_assert(!Mjoin(PATL,ormlq)(AtlasLeft, AtlasNoTrans, N, NRHS, M,
                                       A, lda, TAU, B, ldb, work, wlen));
/*
 *       B(1:M,1:NRHS) := inv(L') * B(1:M,1:NRHS)
 */
         ierr = Mjoin(PATL,trtrs)(AtlasLower, mytrans, AtlasNonUnit,
                                  M, NRHS, A, lda, B, ldb);
         if (ierr)
         {
            if (free0)
               free(free0);
            return(ierr);
         }
         scalN = M;
      }
   }
/*
 * Undo scaling
 */
   if (iascal == 1)
      Mjoin(PATL,lascl)(LAMATG, 0, 0, anrm, ATL_labadUNDERTHRESH,
                        scalN, NRHS, B, ldb);
   else if (iascal == 2)
      Mjoin(PATL,lascl)(LAMATG, 0, 0, anrm, ATL_labadOVERTHRESH,
                        scalN, NRHS, B, ldb);
   if (ibscal == 1)
      Mjoin(PATL,lascl)(LAMATG, 0, 0, ATL_labadUNDERTHRESH, bnrm,
                        scalN, NRHS, B, ldb);
   else if (ibscal == 2)
      Mjoin(PATL,lascl)(LAMATG, 0, 0, ATL_labadOVERTHRESH, bnrm,
                        scalN, NRHS, B, ldb);
   if (free0)
      free(free0);
   return(0);
}
Esempio n. 25
0
static int
dmv2mx_(double *t, int *ldt, double *beta, double *a, int *lda, int *nrow, int *ncol,
  int *mb, int *nb, int *ilt, int *jlt) {
    /* System generated locals */
    long t_dim1, t_offset, a_dim1, a_offset;
    int i__1, i__2, i__3, i__4;

    /* Local variables */
    static int k, ia, ja, jj, ki, kj, it, jt, mr, irm, jrm;


/*  -- PUMMA Package routine (version 2.1) -- */
/*     Jaeyoung Choi, Oak Ridge National Laboratory. */
/*     Jack Dongarra, Univ. of Tennessee, Oak Ridge National Laboratory. */
/*     David Walker,  Oak Ridge National Laboratory. */
/*     October 31, 1994. */

/*  Purpose */

/*  A <== T + beta*A (assume beta = 0.0, or 1.0) */
/*  A is a scattered 2-D array from a condensed 2-D buffer T */

    /* Parameter adjustments */
    t_dim1 = *ldt;
    t_offset = 1 + t_dim1;
    t -= t_offset;
    a_dim1 = *lda;
    a_offset = 1 + a_dim1;
    a -= a_offset;

    /* Function Body */
    it = 0;
    ia = 0;

/*     A <== T */

    if (*beta == 0.) {

/*         If NPROW = 1, use DCOPY */

	if (*nrow == 1) {
	    jt = 0;
	    ja = 0;
	    i__1 = *ncol - 2;
	    for (kj = 0; kj <= i__1; ++kj) {
		i__2 = *nb;
		for (jj = 1; jj <= i__2; ++jj) {
		    i__3 = Mmin(*mb,*ilt);
		    HPL_dcopy(i__3, &t[(jt + jj) * t_dim1 + 1], 1, &a[(ja + jj) * a_dim1 + 1], 1);
/* L10: */
		}
		jt += *nb;
		ja += commtrb_1.jtz;
/* L20: */
	    }

	    jrm = *jlt - ja;
	    if (jrm > 0) {
		i__1 = Mmin(*nb,jrm);
		for (jj = 1; jj <= i__1; ++jj) {
		    i__2 = Mmin(*mb,*ilt);
		    HPL_dcopy(i__2, &t[(jt + jj) * t_dim1 + 1], 1, &a[(ja + jj) * a_dim1 + 1], 1);
/* L30: */
		}
	    }

	} else {

	    i__1 = *nrow - 2;
	    for (ki = 0; ki <= i__1; ++ki) {
		jt = 0;
		ja = 0;
		i__2 = *ncol - 2;
		for (kj = 0; kj <= i__2; ++kj) {
		    i__3 = *nb;
		    for (jj = 1; jj <= i__3; ++jj) {
			i__4 = *mb;
			for (k = 1; k <= i__4; ++k) {
			    a[ia + k + (ja + jj) * a_dim1] = t[it + k + (jt + jj) * t_dim1];
/* L40: */
			}
		    }
		    jt += *nb;
		    ja += commtrb_1.jtz;
/* L50: */
		}

		jrm = *jlt - ja;
		if (jrm > 0) {
		    i__2 = Mmin(*nb,jrm);
		    for (jj = 1; jj <= i__2; ++jj) {
			i__4 = *mb;
			for (k = 1; k <= i__4; ++k) {
			    a[ia + k + (ja + jj) * a_dim1] = t[it + k + (jt + jj) * t_dim1];
/* L60: */
			}
		    }
		}
		it += *mb;
		ia += commtrb_1.itz;
/* L70: */
	    }

	    irm = *ilt - ia;
	    if (irm > 0) {
		jt = 0;
		ja = 0;
		mr = Mmin(*mb,irm);
		i__1 = *ncol - 2;
		for (kj = 0; kj <= i__1; ++kj) {
		    i__4 = *nb;
		    for (jj = 1; jj <= i__4; ++jj) {
			i__2 = mr;
			for (k = 1; k <= i__2; ++k) {
			    a[ia + k + (ja + jj) * a_dim1] = t[it + k + (jt + jj) * t_dim1];
/* L80: */
			}
		    }
		    jt += *nb;
		    ja += commtrb_1.jtz;
/* L90: */
		}

		jrm = *jlt - ja;
		if (jrm > 0) {
		    i__1 = Mmin(*nb,jrm);
		    for (jj = 1; jj <= i__1; ++jj) {
			i__2 = mr;
			for (k = 1; k <= i__2; ++k) {
			    a[ia + k + (ja + jj) * a_dim1] = t[it + k + (jt + jj) * t_dim1];
/* L100: */
			}
		    }
		}
	    }
	}

/*         A <== T + A */

    } else {

/*         If NPROW = 1, use DAXPY */

	if (*nrow == 1) {
	    jt = 0;
	    ja = 0;
	    i__2 = *ncol - 2;
	    for (kj = 0; kj <= i__2; ++kj) {
		i__1 = *nb;
		for (jj = 1; jj <= i__1; ++jj) {
		    i__4 = Mmin(*mb,*ilt);
		    HPL_daxpy(i__4, 1.0, &t[(jt + jj) * t_dim1 + 1], 1, &a[(ja + jj) * a_dim1 + 1], 1);
/* L110: */
		}
		jt += *nb;
		ja += commtrb_1.jtz;
/* L120: */
	    }

	    jrm = *jlt - ja;
	    if (jrm > 0) {
		i__2 = Mmin(*nb,jrm);
		for (jj = 1; jj <= i__2; ++jj) {
		    i__1 = Mmin(*mb,*ilt);
		    HPL_daxpy(i__1, 1.0, &t[(jt + jj) * t_dim1 + 1], 1, & a[(ja + jj) * a_dim1 + 1], 1);
/* L130: */
		}
	    }

	} else {
	    i__2 = *nrow - 2;
	    for (ki = 0; ki <= i__2; ++ki) {
		jt = 0;
		ja = 0;
		i__1 = *ncol - 2;
		for (kj = 0; kj <= i__1; ++kj) {
		    i__4 = *nb;
		    for (jj = 1; jj <= i__4; ++jj) {
			i__3 = *mb;
			for (k = 1; k <= i__3; ++k) {
			    a[ia + k + (ja + jj) * a_dim1] += t[it + k + (jt + jj) * t_dim1];
/* L140: */
			}
		    }
		    jt += *nb;
		    ja += commtrb_1.jtz;
/* L150: */
		}

		jrm = *jlt - ja;
		if (jrm > 0) {
		    i__1 = Mmin(*nb,jrm);
		    for (jj = 1; jj <= i__1; ++jj) {
			i__3 = *mb;
			for (k = 1; k <= i__3; ++k) {
			    a[ia + k + (ja + jj) * a_dim1] += t[it + k + (jt + jj) * t_dim1];
/* L160: */
			}
		    }
		}
		it += *mb;
		ia += commtrb_1.itz;
/* L170: */
	    }

	    irm = *ilt - ia;
	    if (irm > 0) {
		jt = 0;
		ja = 0;
		mr = Mmin(*mb,irm);
		i__2 = *ncol - 2;
		for (kj = 0; kj <= i__2; ++kj) {
		    i__3 = *nb;
		    for (jj = 1; jj <= i__3; ++jj) {
			i__1 = mr;
			for (k = 1; k <= i__1; ++k) {
			    a[ia + k + (ja + jj) * a_dim1] += t[it + k + (jt + jj) * t_dim1];
/* L180: */
			}
		    }
		    jt += *nb;
		    ja += commtrb_1.jtz;
/* L190: */
		}

		jrm = *jlt - ja;
		if (jrm > 0) {
		    i__2 = Mmin(*nb,jrm);
		    for (jj = 1; jj <= i__2; ++jj) {
			i__1 = mr;
			for (k = 1; k <= i__1; ++k) {
			    a[ia + k + (ja + jj) * a_dim1] += t[it + k + (jt + jj) * t_dim1];
/* L200: */
			}
		    }
		}
	    }
	}
    }

    return 0;
} /* dmv2mx_ */
Esempio n. 26
0
PT_TREE_T Mjoin( PATL, pthescal_nt )
(
   const unsigned int         THREADS,
   pthread_attr_t             * ATTR,
   const enum ATLAS_UPLO      UPLO,
   const int                  M,
   const int                  N,
   const void                 * ALPHA,
   void                       * A,
   const int                  LDA
)
{
/*
 * Purpose
 * =======
 *
 * Mjoin( PATL, pthescal_nt )  scales a trapezoidal Hermitian m-by-n matrix
 * A by the real scalar alpha.  The imaginary parts of the diagonal ele-
 * ments of A need not be set on input, they are assumed to be zero, and
 * on exit they are set to zero.
 *
 * This is a multi-threaded version of the algorithm.
 *
 * ---------------------------------------------------------------------
 */
/*
 * .. Local Variables ..
 */
   PT_TREE_T                  root = NULL;
   PT_MISC_TYPE_T             type;
   double                     tblks, tmnblks;
   unsigned int               nthreads;
   int                        mn, nb, nbm1;
/* ..
 * .. Executable Statements ..
 *
 */
/*
 * Make sure we don't thread this for the time being
 */
   if( THREADS >= 1 )
   {
      Mjoin( PATL, hescal )( UPLO, M, N, ((TYPE *)(ALPHA))[0],
                             (TYPE *)(A), LDA );
      return( root );
   }

   nbm1    = ( nb = Mjoin( PATL, GetNB )() ) - 1;
   mn      = Mmin( M, N );
   tmnblks = (double)( (mn+nbm1) / nb );
   tblks   = tmnblks * tmnblks;

   if( UPLO == AtlasLower )
   { tblks += (double)( (N+nbm1) / nb ) * (double)( (M-mn+nbm1) / nb ); }
   else
   { tblks += (double)( (M+nbm1) / nb ) * (double)( (N-mn+nbm1) / nb ); }

   if( ( tblks <= (double)(ATL_XOVER_MI_DEFAULT) ) || ( THREADS <= 1 ) )
   {
      Mjoin( PATL, hescal )( UPLO, M, N, ((TYPE *)(ALPHA))[0],
                             (TYPE *)(A), LDA );
      return( root );
   }

   if( tblks >= (double)(THREADS) ) { nthreads = THREADS; }
   else    { nthreads = (unsigned int)floor( tblks + 0.5 ); }

   type.size = sizeof( TYPE[2] ); type.fun = Mjoin( PATL, pthescal0 );

   if( UPLO == AtlasLower )
   {
      root = ATL_Stzscal( &type, 0, nthreads, ATTR, nb, AtlasLower,
                          M - mn, 0, mn, ALPHA, A, LDA );
   }
   else
   {
      root = ATL_Stzscal( &type, 0, nthreads, ATTR, nb, AtlasUpper,
                          0, N - mn, mn, ALPHA, A, LDA );
   }
   ATL_thread_tree( root, ATTR );

   return( root );
/*
 * End of Mjoin( PATL, pthescal_nt )
 */
}
Esempio n. 27
0
void Mjoin( PATL, hpmv )
(
   const enum ATLAS_UPLO      UPLO,
   const int                  N,
   const SCALAR               ALPHA,
   const TYPE                 * A,
   const TYPE                 * X,
   const int                  INCX,
   const SCALAR               BETA,
   TYPE                       * Y,
   const int                  INCY
)
{
/*
 * Purpose
 * =======
 *
 * Mjoin( PATL, hpmv ) performs the matrix-vector operation
 *
 *    y := alpha * A * x + beta * y,
 *
 * where alpha and beta are scalars, x and y are n-element vectors and A
 * is an n by n Hermitian matrix, supplied in packed form.
 *
 * This is a blocked version of the algorithm.  For a more detailed des-
 * cription of  the arguments of this function, see the reference imple-
 * mentation in the ATLAS/src/blas/reference directory.
 *
 * ---------------------------------------------------------------------
 */
/*
 * .. Local Variables ..
 */
   void                       (*gpmv0)( const int, const int, const SCALAR,
                              const TYPE *, const int, const TYPE *, const int,
                              const SCALAR, TYPE *, const int );
   void                       (*gpmv1)( const int, const int, const SCALAR,
                              const TYPE *, const int, const TYPE *, const int,
                              const SCALAR, TYPE *, const int );
   void                       (*gpmvN)( const int, const int, const SCALAR,
                              const TYPE *, const int, const TYPE *, const int,
                              const SCALAR, TYPE *, const int );
#ifdef TREAL
   TYPE                       alphaY, beta0;
#define one                   ATL_rone
#define zero                  ATL_rzero
#else
   const TYPE                 * alphaY, * beta0;
   const TYPE                 one [2] = { ATL_rone,  ATL_rzero },
                              zero[2] = { ATL_rzero, ATL_rzero };
#endif
   void                       * vx = NULL, * vy = NULL;
   TYPE                       * A0, * A1, * x, * x0, * x1, * y, * y00, * y0,
                              * y1;
   int                        incXY, incXY1, j, jb, lda, lda0, lda1, mb, mb1,
                              n, nb;
/* ..
 * .. Executable Statements ..
 *
 */
   if( N == 0 ) return;

   if( SCALAR_IS_ZERO( ALPHA ) )
   {
      if( !( SCALAR_IS_ONE( BETA ) ) ) Mjoin( PATL, scal )( N, BETA, Y, INCY );
      return;
   }

   if( ( INCX != 1 ) || ( ( INCY == 1 ) && !( SCALAR_IS_ONE( ALPHA ) ) ) )
   {
      vx = (void *)malloc( ATL_Cachelen + ATL_MulBySize( N ) );
      ATL_assert( vx ); x = ATL_AlignPtr( vx );
      Mjoin( PATL, cpsc )( N, ALPHA, X, INCX, x, 1 );
      alphaY = one;
   }
   else { x = (TYPE *)(X); alphaY = ALPHA; }

   if( ( INCY != 1 ) || !( SCALAR_IS_ONE( alphaY ) ) )
   {
      vy = malloc( ATL_Cachelen + ATL_MulBySize( N ) );
      ATL_assert( vy ); y00 = y = ATL_AlignPtr( vy );
      beta0 = zero;
   }
   else { y00 = y = (TYPE *)(Y); beta0 = BETA; }

   ATL_GetPartSPMV( A, N, &mb, &nb );

   mb1 = N - ( ( N - 1 ) / mb ) * mb; incXY1 = (nb SHIFT);

   if( UPLO == AtlasUpper )
   {
      if(      SCALAR_IS_ZERO( beta0 ) )
         gpmv0 = Mjoin( PATL, gpmvUC_a1_x1_b0_y1 );
      else if( SCALAR_IS_ONE ( beta0 ) )
         gpmv0 = Mjoin( PATL, gpmvUC_a1_x1_b1_y1 );
      else
         gpmv0 = Mjoin( PATL, gpmvUC_a1_x1_bX_y1 );
      gpmv1 = Mjoin( PATL, gpmvUC_a1_x1_b1_y1 );
      gpmvN = Mjoin( PATL, gpmvUN_a1_x1_b1_y1 );

      lda = 1; lda0 = lda; A0 = (TYPE *)(A); MUpnext( mb, A0, lda0 );
      incXY = (mb SHIFT); x0 = x + incXY; y0 = y + incXY;

      for( n = N - mb; n > 0; n -= mb, x0 += incXY, x += incXY,
           y0 += incXY, y += incXY )
      {
         Mjoin( PATL, hpmvU )( mb, A, lda, x, beta0, y );

         for( j = 0, lda1 = lda0, A1 = A0 - (mb SHIFT), x1 = x0, y1 = y0; j < n;
              j += nb, x1 += incXY1, y1 += incXY1 )
         {
            jb = n - j; jb = Mmin( jb, nb );
            gpmv0( jb, mb, one, A1, lda1, x,  1, beta0, y1, 1 );
            gpmvN( mb, jb, one, A1, lda1, x1, 1, one,   y,  1 );
            MUpnext( jb, A1, lda1 ); A1 -= (jb SHIFT);
         }
         beta0 = one; gpmv0 = gpmv1; lda = lda0; A = A0; MUpnext( mb, A0, lda0 );
      }
      Mjoin( PATL, hpmvU )( mb1, A, lda, x, beta0, y );
   }
   else
   {
      if(      SCALAR_IS_ZERO( beta0 ) )
         gpmv0 = Mjoin( PATL, gpmvLC_a1_x1_b0_y1 );
      else if( SCALAR_IS_ONE ( beta0 ) )
         gpmv0 = Mjoin( PATL, gpmvLC_a1_x1_b1_y1 );
      else
         gpmv0 = Mjoin( PATL, gpmvLC_a1_x1_bX_y1 );
      gpmv1 = Mjoin( PATL, gpmvLC_a1_x1_b1_y1 );
      gpmvN = Mjoin( PATL, gpmvLN_a1_x1_b1_y1 );

      lda = N; lda0 = lda; A0 = (TYPE *)(A); MLpnext( N, A, lda );
      incXY = (mb SHIFT); x0 = x; y0 = y;

      for( n  = N - mb, x += ((N-mb) SHIFT), y += ((N-mb) SHIFT); n > 0;
           n -= mb, x -= incXY, y -= incXY )
      {
         MLpprev( mb, A, lda );
         Mjoin( PATL, hpmvL )( mb, A, lda, x, beta0, y );

         for( j = 0, lda1 = lda0, A1 = A0 + (n SHIFT), x1 = x0, y1 = y0; j < n;
              j += nb, x1 += incXY1, y1 += incXY1 )
         {
            jb = n - j; jb = Mmin( jb, nb );
            gpmv0( jb, mb, one, A1, lda1, x,  1, beta0, y1, 1 );
            gpmvN( mb, jb, one, A1, lda1, x1, 1, one,   y,  1 );
            MLpnext( jb, A1, lda1 ); A1 -= (jb SHIFT);
         }
         beta0 = one; gpmv0 = gpmv1;
      }
      Mjoin( PATL, hpmvL )( mb1, A0, lda0, x0, beta0, y0 );
   }

   if( vx ) free( vx );
   if( vy )
   { Mjoin( PATL, axpby )( N, alphaY, y00, 1, BETA, Y, INCY ); free( vy ); }
/*
 * End of Mjoin( PATL, hpmv )
 */
}
Esempio n. 28
0
static int ATL_trmvUT
(
   const enum ATLAS_DIAG  Diag,
   const int nb,
   ATL_CINT N,
   const TYPE *A,
   ATL_CINT lda,
   TYPE *X,
   ATL_CINT incX
)
/*
 * RETURNS: 0 if TRMV was performed, non-zero if nothing done
 */
{
   static void (*trmvK)(ATL_CINT, const TYPE*, ATL_CINT, const TYPE*, TYPE*);
   void (*gemv)(ATL_CINT, ATL_CINT, const SCALAR, const TYPE*, ATL_CINT,
                const TYPE*, ATL_CINT, const SCALAR, TYPE*, ATL_CINT);
   void *vp;
   TYPE *x, *y;
   const size_t opsize = (N*N+N+N)*sizeof(TYPE)SHIFT;
   size_t t0;
   #ifdef TCPLX
      size_t N2=N+N, lda2 = lda+lda;
      TYPE one[2] = {ATL_rone, ATL_rzero};
   #else
      #define N2 N
      #define lda2 lda
      #define one ATL_rone
   #endif
   const size_t incA = ((size_t)lda2)*nb;
   ATL_INT j;

   if (N < nb+nb)
      return(1);
   if (opsize > MY_CE)
      gemv = Mjoin(PATL,gemvT);
   else
      gemv = (opsize <= ATL_MulBySize(ATL_L1elts)) ? Mjoin(PATL,gemvT_L1) :
             Mjoin(PATL,gemvT_L2);
   trmvK = (Diag == AtlasNonUnit) ? ATL_trmvUTNk : ATL_trmvUTUk;
/*
 * If X is aligned to Cachelen wt inc=1, use it as y
 */
   t0 = (size_t) X;
   if (incX == 1 && (ATL_MulByCachelen(ATL_DivByCachelen(t0)) == t0))
   {
      ATL_INT i;
      vp = malloc(ATL_Cachelen+ATL_MulBySize(N));
      if (!vp)
         return(2);
      x = ATL_AlignPtr(vp);
      y = X;
      for (i=0; i < N2; i++)
      {
         x[i] = X[i];
         X[i] = ATL_rzero;
      }

   }
   else  /* allocate both X and Y */
   {
      vp = malloc((ATL_Cachelen+ATL_MulBySize(N))<<1);
      if (!vp)
         return(3);
      x = ATL_AlignPtr(vp);
      y = x + N2;
      y = ATL_AlignPtr(y);
      Mjoin(PATL,copy)(N, X, incX, x, 1);
      Mjoin(PATL,zero)(N, y, 1);
   }
   trmvK(nb, A, lda, x, y);
   A += incA;
   for (j=nb; j < N; j += nb, A += incA)
   {
      int kb = N-j;
      #ifdef TCPLX
         const register size_t j2 = j + j;
      #else
         #define j2 j
      #endif
      kb = Mmin(nb, kb);
      gemv(j, kb, one, A, lda, x, 1, one, y+j2, 1);
      trmvK(kb, A+j2, lda, x+j2, y+j2);
      #ifndef TCPLX
         #undef j2
      #endif
   }
   if (y != X)
      Mjoin(PATL,copy)(N, y, 1, X, incX);
   free(vp);
   return(0);
}
Esempio n. 29
0
main(int nargs, char *args[])
/*
 *  tst <tst> <# TA> <TA's> <# TB's> <TB's> <M0> <MN> <incM> <N0> <NN> <incN>
 *      <K0> <KN> <incK> <# alphas> <alphas> <# betas> <betas>
 *
 */
{
   int M0, MN, incM, N0, NN, incN, K0, KN, incK, lda, ldb, ldc, MFLOP;
   int i, k, m, n, im, in, ik, ita, itb, ia, ib, nTA, nTB, nalph, nbeta;
   int itst=0, ipass=0, TEST, LDA_IS_M, MSAME=0, KSAME=0;
   int ndiag, nuplo, nside;
   TYPE *alph, *beta, *A, *B, *C, *D=NULL;
   #ifdef TREAL
      TYPE bet1 = 1.0, alp1 = -1.0;
   #else
      TYPE bet1[2] = {1.0, 0.0}, alp1[2] = {-1.0, 0.0};
   #endif
   char TA, TB;
   enum ATLAS_SIDE *Side;
   enum ATLAS_UPLO *Uplo;
   enum ATLAS_TRANS *TransA, *TransB, TAc, TBc;
   enum ATLAS_DIAG *Diag;
   int CACHESIZE;

   GetFlags(nargs, args, &TEST, &nside, &Side, &nuplo, &Uplo,
            &nTA, &TransA, &nTB, &TransB, &ndiag, &Diag,
            &M0, &MN, &incM, &N0, &NN, &incN, &K0, &KN, &incK,
            &nalph, &alph, &nbeta, &beta, &LDA_IS_M, &MFLOP,&CACHESIZE);

   if (M0 == -1)
   {
      MSAME = 1;
      M0 = MN = incM = NN;
   }
   if (K0 == -1)
   {
      KSAME = 1;
      K0 = KN = incK = NN;
   }

   if (!MFLOP)
   {
      A = malloc(MN*KN*ATL_sizeof);
      B = malloc(NN*KN*ATL_sizeof);
      C = malloc(MN*NN*ATL_sizeof);
      if (TEST) D = malloc(MN*NN*ATL_sizeof);
      else D = NULL;
      if (!A || !B || !C || (TEST && !D))
      {
         fprintf(stderr, "Not enough memory to run tests!!\n");
         exit(-1);
      }
   }
/*
 * Page the code in from disk, so first timing doesn't blow
 */
   if (MFLOP)
   {
      mmcase0(10, 1, 'n', 'n', 100, 100, 100, alp1, 100, 100, bet1, 100);
      mmcase0(10, 1, 'n', 't', 100, 100, 100, alp1, 100, 100, bet1, 100);
      mmcase0(10, 1, 't', 'n', 100, 100, 100, alp1, 100, 100, bet1, 100);
      mmcase0(10, 1, 't', 't', 100, 100, 100, alp1, 100, 100, bet1, 100);
   }
   else
   {
      m = Mmin(100, MN);
      k = Mmin(100, KN);
      n = Mmin(100, NN);
      matgen(m, k, A, m, m*k);
      matgen(k, n, B, k, n*k);
      matgen(m, n, C, m, m*n);
      TA = TB = 'N';
      TAc = TBc = AtlasNoTrans;
      trusted_gemm(TAc, TBc, m, n, k, alp1, A, m, B, k, bet1, C, m);
      test_gemm(TAc, TBc, m, n, k, alp1, A, m, B, k, bet1, C, m);
   }

#ifdef TREAL
   printf("\nTEST  TA  TB    M    N    K  alpha   beta    Time  Mflop  SpUp  PASS\n");
   printf("====  ==  ==  ===  ===  ===  =====  =====  ======  =====  ====  ====\n\n");
#else
   printf("\nTEST  TA  TB    M    N    K        alpha         beta    Time  Mflop  SpUp  PASS\n");
   printf("====  ==  ==  ===  ===  ===  ===== =====  ===== =====  ======  =====  ====  ====\n\n");
#endif
   for (im=M0; im <= MN; im += incM)
   {
      for (n=N0; n <= NN; n += incN)
      {
         if (MSAME) m = n;
         else m = im;
         for (ik=K0; ik <= KN; ik += incK)
         {
            if (KSAME) k = n;
            else k = ik;
            for (ita=0; ita != nTA; ita++)
            {
               if (TransA[ita] == AtlasNoTrans) TA = 'N';
               else if (TransA[ita] == AtlasTrans) TA = 'T';
               else if (TransA[ita] == AtlasConjTrans) TA = 'C';

               for (itb=0; itb != nTB; itb++)
               {
                  if (TransB[itb] == AtlasNoTrans) TB = 'N';
                  else if (TransB[itb] == AtlasTrans) TB = 'T';
                  else if (TransB[itb] == AtlasConjTrans) TB = 'C';
                  for (ia=0; ia != nalph; ia++)
                  {
                     for (ib=0; ib != nbeta; ib++)
                     {
                        itst++;
                        if (LDA_IS_M)
                        {
                           if (TA == 'n' || TA == 'N') lda = m;
                           else lda = k;
                           if (TB == 'n' || TB == 'N') ldb = k;
                           else ldb = n;
                           ldc = m;
                        }
                        else
                        {
                           if (TA == 'n' || TA == 'N') lda = MN;
                           else lda = KN;
                           if (TB == 'n' || TB == 'N') ldb = KN;
                           else ldb = NN;
                           ldc = MN;
                        }

                        if (MFLOP)
                        {
                           ipass++;
#ifdef TREAL
                              mmcase0(MFLOP, CACHESIZE, TA, TB, m, n, k,
				      alph[ia], lda, ldb, beta[ib], ldc);
#else
                              mmcase0(MFLOP, CACHESIZE, TA, TB, m, n, k,
				      alph+(ia SHIFT), lda, ldb,
				      beta+(ib SHIFT), ldc);
#endif
                        }
                        else
                        {
#ifdef TREAL
                              ipass += mmcase(TEST, CACHESIZE, TA, TB, m,
					      n, k, alph[ia], A, lda, B, ldb,
                                              beta[ib], C, ldc, D,ldc);
#else
                              ipass += mmcase(TEST, CACHESIZE, TA, TB, m,
					      n, k, alph+(ia SHIFT), A,
					      lda, B, ldb, beta+(ib SHIFT),
					      C, ldc, D,ldc);
#endif
                        }
                     }
                  }
               }
            }
         }
      }
   }
   if (TEST && !MFLOP)
      printf("\nNTEST=%d, NUMBER PASSED=%d, NUMBER FAILURES=%d\n",
             itst, ipass, itst-ipass);
   else printf("\nDone with %d timing runs\n",itst);
   free(Side);
   free(Uplo);
   free(TransA);
   free(TransB);
   free(Diag);
   free(alph);
   free(beta);
   if (!MFLOP)
   {
      free(A);
      free(B);
      free(C);
      if (D) free(D);
   }
   exit(0);
}
Esempio n. 30
0
int RunCase(int CacheSize, TYPE thresh, int MFLOP, enum ATLAS_ORDER Order,
            int M, int N, int lda)
{
   char *cord = (Order == AtlasColMajor ? "Col" : "Row");
   const double maxMN = Mmax(M,N), minMN = Mmin(M,N);
   unsigned long nreps=0;
   int npiv=(-1), *ipiv;
   const int incA = (Order == AtlasColMajor ? N*lda : M*lda);
   double mflops, mflop, resid, tim=(-1.0), t0;
   TYPE *A, *a;
   int i;

   #ifdef TREAL
      mflops = maxMN * minMN * minMN - ((minMN*minMN*minMN) / 3.0) -
               (minMN*minMN) / 2.0;
   #else
      mflops = (maxMN * minMN * minMN - ((minMN*minMN*minMN) / 3.0) +
                (maxMN*minMN) / 2.0)*4.0 - 3.0 * minMN*minMN;
   #endif
   mflops /= 1000000.0;

   if (thresh > ATL_rzero)
   {
      if (Order == AtlasColMajor)
         resid = lutestC(CacheSize, M, N, lda, &npiv, &tim);
      else resid = lutestR(CacheSize, M, N, lda, &npiv, &tim);
   }
   else resid = -1.0;
   if (MFLOP > mflops || thresh <= ATL_rzero) /* need to time repetitively */
   {
      nreps = (mflops*1000000);
      nreps = (MFLOP*1000000 + nreps-1) / nreps;
      if (nreps < 1) nreps = 1;
      i = ATL_DivBySize(2*CacheSize) ATL_PTCACHEMUL;
      i = (i + M*N) / (M*N);
      if (i < nreps) i = nreps;  /* don't reuse mem or no pivoting */
      a = A = malloc(i * ATL_MulBySize(incA));
      if (A != NULL)
      {
         ipiv = malloc(Mmin(M,N)*sizeof(int));  /* what the hell - reuse ipiv */
         if (ipiv)
         {
            Mjoin(PATL,gegen)(i*incA, 1, A, i*incA, incA+M+3012);
            t0 = time00();
            for (i=nreps; i; i--, a += incA)
               test_getrf(Order, M, N, a, lda, ipiv);
            tim = time00() - t0;
            tim /= nreps;
            if (npiv == 0) npiv = findnpvt(Mmin(M,N), ipiv);
            free(ipiv);
         }
         else fprintf(stderr, "   WARNING: not enough mem to run timings!\n");
         free(A);
      }
      else fprintf(stderr, "   WARNING: not enough mem to run timings!\n");
   }
   if (tim > 0.0) mflop = mflops / tim;
   else mflop = 0.0;
   fprintf(stdout, "%5d  %3s   %6d %6d %6d %6d %9.3f %9.3f %9.3e\n",
           nreps, cord, M, N, lda, npiv, tim, mflop, resid);
   return(resid <= thresh);
}