void ATL_signal_tree(PT_TREE_T ROOT)
/*
 * Purpose
 * =======
 *
 * Notifies the peer waiting on ROOT's condition variable that this
 * node's function has completed.
 *
 * Arguments
 * =========
 *
 * ROOT    (input) PT_TREE_T
 *         On entry, ROOT specifies the node emitting the signal.
 */
{
   pthread_mutex_t *mp = &(ROOT->mutex);

   ATL_assert(!pthread_mutex_lock(mp));
   ROOT->count++;                 /* record completion while holding lock */
   ATL_assert(!pthread_cond_signal(&(ROOT->cond)));
   ATL_assert(!pthread_mutex_unlock(mp));
}
static TYPE *DupMat(enum ATLAS_ORDER Order, int M, int N, TYPE *A, int lda,
                    int ldc)
/*
 * Returns a newly malloc'd copy of A with new leading dimension ldc.
 * Row-major input is handled by swapping M and N so the copy proceeds
 * in column-major terms.  Caller frees the returned matrix.
 */
{
   const int ldc2 = (ldc SHIFT), lda2 = (lda SHIFT);
   TYPE *C, *c;
   int r, j, nrows;

   if (Order == CblasRowMajor)   /* rows become columns */
   {
      j = M;
      M = N;
      N = j;
   }
   nrows = M SHIFT;              /* # of TYPE entries per column */
   ATL_assert(ldc >= M);
   C = malloc(ATL_MulBySize(ldc)*N);
   ATL_assert(C);
   for (c=C, j=0; j != N; j++, c += ldc2, A += lda2)
   {
      for (r=0; r != nrows; r++)
         c[r] = A[r];
   }
   return(C);
}
int f77getrf(const enum ATLAS_ORDER Order, const int M, const int N,
             TYPE *A, const int lda, int *ipiv)
/*
 * Column-major LU factorization through the Fortran77 GETRF interface.
 * Translates the 1-based pivots LAPACK returns into the 0-based pivots
 * ATLAS expects.  Returns LAPACK's info value.
 */
{
   const int MN = Mmin(M,N);
   int i;
#ifdef ATL_FunkyInts
   const F77_INTEGER F77M=M, F77N=N, F77lda=lda;
   F77_INTEGER info, *F77ipiv;

   F77ipiv = malloc(MN * sizeof(F77_INTEGER));
   ATL_assert(F77ipiv);
#else
   int info;
   #define F77M M
   #define F77N N
   #define F77lda lda
   #define F77ipiv ipiv
#endif
   ATL_assert(Order == AtlasColMajor);   /* row-major unsupported here */
   F77GETRF(&F77M, &F77N, A, &F77lda, F77ipiv, &info);
#ifdef ATL_FunkyInts
   for (i=0; i < MN; i++)   /* copy back, converting pivots to 0-based */
      ipiv[i] = F77ipiv[i] - 1;
   free(F77ipiv);
#else
   for (i=0; i < MN; i++)   /* convert pivots to 0-based in place */
      ipiv[i]--;
#endif
   return(info);
}
int *RoutNames2IntList(int nargs, char **args, int i)
/*
 * Reads args[i+1] as a count n, then parses the following n routine names
 * into a malloc'd integer list: iarr[0] = n, iarr[1..n] = routine IDs.
 * Calls PrintUsage (which does not return) on any parse failure.
 * NOTE(review): geqlf/gerqf/gelqf all map to LAgeqrf, exactly as the
 * original dispatch did -- presumably intentional; confirm if separate
 * IDs exist for those factorizations.
 */
{
   static const char *low[] = {"getrf", "potrf", "geqrf", "geqlf",
                               "gerqf", "gelqf"};
   static const char *upp[] = {"GETRF", "POTRF", "GEQRF", "GEQLF",
                               "GERQF", "GELQF"};
   static const int ids[]   = {LAgetrf, LApotrf, LAgeqrf, LAgeqrf,
                               LAgeqrf, LAgeqrf};
   const int nnam = 6;
   int *iarr;
   int n, k, m;

   if (++i >= nargs)
      PrintUsage(args[0], i, NULL);
   n = atoi(args[i]);
   ATL_assert(n > 0);
   iarr = malloc(sizeof(int)*(n+1));
   ATL_assert(iarr);
   iarr[0] = n;
   for (k=1; k <= n; k++)
   {
      if (++i >= nargs)
         PrintUsage(args[0], i, NULL);
      for (m=0; m < nnam; m++)
      {
         if (!strcmp(args[i], low[m]) || !strcmp(args[i], upp[m]))
         {
            iarr[k] = ids[m];
            break;
         }
      }
      if (m == nnam)               /* unrecognized routine name */
         PrintUsage(args[0], i, args[i]);
   }
   return(iarr);
}
double GetTimeWithReps_LU (int mflopF, int lda, int M, int N, int nb, int Uplo, int Side, int flsizeKB) { double mflop, t0, t1, drep; char *wrksets; /* working sets for kernel calls */ #ifdef TCPLX const int lda2 = lda+lda; #else const int lda2 = lda; #endif size_t setsz, setszT; /* work set size in memory, and amnt of it touched */ size_t nrep; /* # of reps required to force mflopF flops */ size_t nset; /* # of working sets allocated */ int i; /* * Keep setsz a multiple of TYPE size for alignment reasons. LU only accesses * M*N of matrix and all of IPIV. */ setsz = lda*N*ATL_sizeof + ((M*sizeof(int)+ATL_sizeof-1)/ATL_sizeof)*ATL_sizeof; setszT = M*N*ATL_sizeof + M*sizeof(int); mflop = GetFlopCount(LAgetrf, 0, M, N, 0, 0, CAN_NB); /* * Cannot reuse matrices (bogus to factor an already factored matrix), so we * must take as our total memspace MAX(nrep,nset)*setsz */ ATL_assert(mflop > 0.0); drep = (mflopF*1.0e6) / mflop; nrep = (int)(drep+0.999999); /* * If cacheline flush doesn't work, then we must use this method */ #if ATL_LINEFLUSH if (nrep < 2) return(-1.0); /* do wt normal timer */ #else nrep = (nrep >= 1) ? nrep : 1; #endif nset = (flsizeKB*1024+setszT-1)/setszT; if (nset < nrep) nset = nrep; wrksets = malloc(nset * setsz); ATL_assert(wrksets); for (i=0; i < nset; i++) Mjoin(PATL,gegen)(M, N, (TYPE*)(wrksets+i*setsz), lda, M*N+lda); t0 = time00(); for (i=0; i < nrep; i++) { test_getrf(CblasColMajor, M, N, (TYPE*)(wrksets+i*setsz), lda, (int*)(wrksets+i*setsz+lda*N*ATL_sizeof)); } t1 = time00(); free(wrksets); return((t1-t0)/((double)nrep)); }
/*
 * computes (i,j) non-diagonal block of C
 */
static void DoCblk(const int rank, ATL_tsyrk_ammN_t *pd, TYPE *wC, int i, int j)
{
   const ammkern_t amm = pd->amm_b1;
   const unsigned int nkblks=pd->nkblks, bs=pd->blkszA, kb=pd->kb, NB=pd->nb;
   unsigned int nmu, nnu, mb, nb;
   const TYPE *wA, *wB, *wAn, *wBn;
   int k;

/*
 * For Upper storage, compute the reflected (j,i) block instead
 */
   if (!(pd->LOWER))
   {
      k = i;
      i = j;
      j = k;
   }
/*
 * Last block row/col may be partial: select full or final unroll counts
 */
   if (j != pd->ndiag-1)
   {
      nnu = pd->nnu;
      nb = pd->nb;
   }
   else
   {
      nnu = pd->nnuf;
      nb = pd->nbf;
   }
   if (i != pd->ndiag-1)
   {
      nmu = pd->nmu;
      mb = pd->nb;
   }
   else
   {
      nmu = pd->nmuf;
      mb = pd->nbf;
   }
/*
 * Bug fix: original assigned wA/wB twice with identical expressions;
 * dead duplicate assignments (and unused local c) removed.
 */
   wA = pd->wA + i*pd->panszA;
   wB = pd->wAt + j*pd->panszA;
   wAn = wA + bs;
   wBn = wB + bs;
#ifdef DEBUG2
   if (!ATL_IsBitSetBV(pd->cpydonBV, i) || !ATL_IsBitSetBV(pd->cpydonBV, j))
      fprintf(stderr, "%d: ndiag=%d, i=%d, j=%d\n", rank, pd->ndiag, i, j);
   ATL_assert(ATL_IsBitSetBV(pd->cpydonBV, i));
   ATL_assert(ATL_IsBitSetBV(pd->cpydonBV, j));
#endif
/*
 * First K block handles any K remainder (KB0); then loop full K blocks
 */
   pd->ammK(nmu, nnu, pd->KB0, wA, wB, wC, wAn, wBn, wC);
   for (k=1; k < nkblks; k++)
   {
      wA = wAn;
      wB = wBn;
      wAn += bs;
      wBn += bs;
      amm(nmu, nnu, kb, wA, wB, wC, wAn, wBn, wC);
   }
/*
 * Write the finished block back to C, applying alpha/beta scaling
 */
   pd->blk2c(mb, nb, *(pd->alpha), wC, *(pd->beta),
             pd->C + NB*(j*(size_t)(pd->ldc) + i), pd->ldc);
}
double GetTimeWithReps_LLT (int mflopF, int lda, int M, int N, int nb, int Uplo, int Side, int flsizeKB) { double mflop, t0, t1, drep; char *wrksets; /* working sets for kernel calls */ #ifdef TCPLX const int lda2 = lda+lda; #else const int lda2 = lda; #endif size_t setsz, setszT; /* work set size in memory, and amnt of it touched */ size_t nrep; /* # of reps required to force mflopF flops */ size_t nset; /* # of working sets allocated */ int i; setsz=lda*N*ATL_sizeof; /* matrix is entire working set of LLt */ setszT=N*N*ATL_sizeof; /* only touch N*N portion */ mflop = GetFlopCount(LApotrf, Uplo, M, N, 0, 0, CAN_NB); /* * Cannot reuse matrices (bogus to factor an already factored matrix), so we * must take as our total memspace MAX(nrep,nset)*setsz */ ATL_assert(mflop > 0.0); drep = (mflopF*1.0e6) / mflop; nrep = (int)(drep+0.999999); /* * If cacheline flush doesn't work, then we must use this method */ #if ATL_LINEFLUSH if (nrep < 2) return(-1.0); /* do wt normal timer */ #else nrep = (nrep >= 1) ? nrep : 1; #endif nset = (flsizeKB*1024+setszT-1)/setszT; if (nset < nrep) nset = nrep; wrksets = malloc(nset * setsz); ATL_assert(wrksets); for (i=0; i < nset; i++) PosDefGen(CblasColMajor, Uplo_LA2ATL(Uplo), N, (TYPE*)(wrksets+i*setsz), lda); t0 = time00(); for (i=0; i < nrep; i++) { test_potrf(Uplo, N, (TYPE*)(wrksets+i*setsz), lda); } t1 = time00(); free(wrksets); return((t1-t0)/((double)nrep)); }
/*
 * Blocks until the thread described by thr finishes; always returns 0
 * (all failures abort via ATL_assert).
 */
int ATL_thread_join(ATL_thread_t *thr)  /* waits on completion of thread */
{
#ifdef ATL_WINTHREADS
   /* Windows: wait on the thread handle, then release it */
   ATL_assert(WaitForSingleObject(thr->thrH, INFINITE) != WAIT_FAILED);
   ATL_assert(CloseHandle(thr->thrH));
#elif defined(ATL_OMP_THREADS)
   /* OpenMP manages its own thread team; explicit join is a usage error */
   fprintf(stderr, "Cannot call thread_join using OpenMP!!\n");
   ATL_assert(0);  /* should never enter this rout when using OMP */
#else
   /* POSIX threads: join, discarding the thread's return value */
   ATL_assert(!pthread_join(thr->thrH, NULL));
#endif
   return(0);
}
/*
 * SYRK, Transpose case: updates the Uplo_ triangle of C with
 * alpha*A^T*A + beta*C.  For K above the crossover, computes the full
 * N x N product with GEMM into aligned scratch space, then copies the
 * triangle back into C with a beta-specialized "put" routine; otherwise
 * falls through to the reference implementation.
 */
void Mjoin(Mjoin(Mjoin(PATL,syrk),UploNM),T)
   (const int N, const int K, const void *valpha, const void *A,
    const int lda, const void *vbeta, void *C, const int ldc)
{
   void *vc;
   TYPE *c;
#ifdef TREAL
   const SCALAR alpha=*( (const SCALAR *)valpha );
   const SCALAR beta =*( (const SCALAR *)vbeta );
   const SCALAR one=1.0, zero=0.0;
#else
   #define alpha valpha
   const TYPE *beta=vbeta;
   const TYPE one[2]={1.0,0.0}, zero[2]={0.0,0.0};
#endif
   if (K > SYRK_Xover)
   {
      /* scratch for the full N x N GEMM result, cacheline-aligned */
      vc = malloc(ATL_Cachelen+ATL_MulBySize(N)*N);
      ATL_assert(vc);
      c = ATL_AlignPtr(vc);
      CgemmTN(N, N, K, alpha, A, lda, A, lda, zero, c, N);
      /* copy triangle back, dispatching on special beta values */
      if ( SCALAR_IS_ONE(beta) )
         Mjoin(syr_put,_b1)(N, c, beta, C, ldc);
      else if ( SCALAR_IS_ZERO(beta) )
         Mjoin(syr_put,_b0)(N, c, beta, C, ldc);
#ifdef TCPLX
      else if ( SCALAR_IS_NONE(beta) )
         Mjoin(syr_put,_bn1)(N, c, beta, C, ldc);
      else if (beta[1] == *zero)   /* beta has zero imaginary part */
         Mjoin(syr_put,_bXi0)(N, c, beta, C, ldc);
#endif
      else
         Mjoin(syr_put,_bX)(N, c, beta, C, ldc);
      free(vc);
   }
   else
      Mjoin(PATL,refsyrk)(Uplo_, AtlasTrans, N, K, alpha, A, lda,
                          beta, C, ldc);
}
double *TimeOnCores(struct kmm_struct *kb) { struct kmm_struct *kp; pthread_t *threads; pthread_attr_t *attr; cpu_set_t cpuset; double *mflops; int i, p; p = kb->p; kp = malloc(sizeof(struct kmm_struct)*p); threads = malloc(sizeof(pthread_t)*p); attr = malloc(sizeof(pthread_attr_t)*p); mflops = malloc(sizeof(double)*p); ATL_assert(kp && threads && attr && mflops); for (i=0; i < p; i++) { memcpy(kp+i, kb, sizeof(struct kmm_struct)); kp[i].iam = i; CPU_ZERO(&cpuset); CPU_SET(kp->pids[i], &cpuset); assert(!pthread_attr_setaffinity_np(attr+i, sizeof(cpuset), &cpuset)); pthread_create(threads+i, attr+i, TimeOnCore, kp+i); } for (i=0; i < p; i++) { pthread_join(threads[i], NULL); mflops[i] = kp[i].mf; } free(kp->pids); free(kp); free(threads); free(attr); return(mflops); }
/*
 * Left-side TRMM: C := alpha * op(A) * C, where A is triangular.  For N
 * above the crossover, copies the triangle of A (with alpha applied for
 * the real case) into dense aligned scratch and uses GEMM; otherwise
 * falls through to the reference implementation.
 */
void Mjoin(Mjoin(PATL,trmmL),ATLP)
   (const int M, const int N, const void *valpha, const void *A,
    const int lda, void *C, const int ldc)
{
#ifdef TREAL
   const SCALAR alpha=*( (const SCALAR *)valpha );
   const SCALAR one=1.0, zero=0.0;
#else
   const TYPE zero[2]={0.0,0.0};
   #define alpha valpha
#endif
   void *va;
   TYPE *a;
   if (N > TRMM_Xover)
   {
      /* dense M x M aligned scratch copy of triangular A */
      va = malloc(ATL_Cachelen + ATL_MulBySize(M)*M);
      ATL_assert(va);
      a = ATL_AlignPtr(va);
#ifdef TREAL
      /* alpha is folded into the copy, so GEMM runs with alpha = one */
      if ( SCALAR_IS_ONE(alpha) )
         Mjoin(ATL_trcopy,_a1)(M, alpha, A, lda, a);
      else
         Mjoin(ATL_trcopy,_aX)(M, alpha, A, lda, a);
      CAgemmTN(M, N, M, one, a, M, C, ldc, zero, C, ldc);
#else
      /* complex: copy unscaled, pass alpha through to GEMM */
      ATL_trcopy(M, A, lda, a);
      CAgemmTN(M, N, M, valpha, a, M, C, ldc, zero, C, ldc);
#endif
      free(va);
   }
   else
      Mjoin(PATL,reftrmm)(AtlasLeft, Uplo_, Trans_, Unit_, M, N, alpha,
                          A, lda, C, ldc);
}
/*
 * Right-side SYMM: C := alpha*B*A + beta*C, where A is symmetric with
 * only the Uplo_ triangle stored.  For M above the crossover, expands
 * the triangle of A into a dense N x N aligned scratch matrix (folding
 * alpha in for the real case) and uses a dense matmul; otherwise falls
 * through to the reference implementation.
 */
void Mjoin(Mjoin(PATL,symmR),UploNM)
   (const int M, const int N, const void *valpha, const void *A,
    const int lda, const void *B, const int ldb, const void *vbeta,
    void *C, const int ldc)
{
#ifdef TREAL
   const SCALAR alpha=*( (const SCALAR *)valpha );
   const SCALAR beta =*( (const SCALAR *)vbeta );
   const SCALAR one=1.0;
#else
   #define alpha valpha
   #define beta vbeta
#endif
   void *va;
   TYPE *a;
   if (M > SYMM_Xover)
   {
      /* dense N x N aligned scratch for expanded symmetric A */
      va = malloc(ATL_Cachelen + ATL_MulBySize(N)*N);
      ATL_assert(va);
      a = ATL_AlignPtr(va);
#ifdef TREAL
      /* alpha is folded into the copy, so matmul runs with alpha = one */
      if ( SCALAR_IS_ONE(alpha) )
         Mjoin(Mjoin(Mjoin(PATL,sycopy),UploNM),_a1)(N, alpha, A, lda, a);
      else
         Mjoin(Mjoin(Mjoin(PATL,sycopy),UploNM),_aX)(N, alpha, A, lda, a);
      ATL_ammm(AtlasNoTrans, AtlasNoTrans, M, N, N, one, B, ldb, a, N,
               beta, C, ldc);
#else
      /* complex: copy unscaled, pass alpha/beta through to matmul */
      Mjoin(Mjoin(PATL,sycopy),UploNM)(N, A, lda, a);
      ATL_ammm(AtlasNoTrans, AtlasNoTrans, M, N, N, valpha, B, ldb, a, N,
               vbeta, C, ldc);
#endif
      free(va);
   }
   else
      Mjoin(PATL,refsymm)(AtlasRight, Uplo_, M, N, alpha, A, lda,
                          B, ldb, beta, C, ldc);
}
void Mjoin(Mjoin(Mjoin(PATL,herk),UploNM),N)
   (const int N, const int K, const void *valpha, const void *A,
    const int lda, const void *vbeta, void *C, const int ldc)
/*
 * HERK, NoTranspose case: updates the Uplo_ triangle of C with
 * alpha*A*A^H + beta*C, where alpha and beta are real scalars.
 */
{
   const TYPE beta = *( (const TYPE *)vbeta );
   const TYPE zero[2] = {0.0, 0.0};
   TYPE alpha[2];
   TYPE *cw;
   void *vp;

   alpha[0] = *( (const TYPE *)valpha );
   if (K <= HERK_Xover)   /* small case: reference implementation */
   {
      Mjoin(PATL,refherk)(Uplo_, AtlasNoTrans, N, K, *alpha, A, lda,
                          beta, C, ldc);
      return;
   }
/*
 * Large case: compute full N x N A*A^H via GEMM into aligned scratch,
 * then copy the Uplo_ triangle back into C applying the real beta
 */
   alpha[1] = 0.0;        /* real alpha as a complex scalar for GEMM */
   vp = malloc(ATL_Cachelen+ATL_MulBySize(N)*N);
   ATL_assert(vp);
   cw = ATL_AlignPtr(vp);
   CgemmNC(N, N, K, alpha, A, lda, A, lda, zero, cw, N);
   if ( beta == 1.0 )
      Mjoin(her_put,_b1)(N, cw, vbeta, C, ldc);
   else if ( beta == 0.0 )
      Mjoin(her_put,_b0)(N, cw, vbeta, C, ldc);
   else
      Mjoin(her_put,_bXi0)(N, cw, vbeta, C, ldc);
   free(vp);
}
void Mjoin(PATL,gpmm)
   (const enum PACK_UPLO UA, const enum PACK_TRANS TA,
    const enum PACK_UPLO UB, const enum PACK_TRANS TB,
    const enum PACK_UPLO UC, const int M, const int N, const int K,
    const SCALAR alpha, const TYPE *A, const int IA, const int JA,
    const int lda, const TYPE *B, const int IB, const int JB,
    const int ldb, const SCALAR beta, TYPE *C, const int IC,
    const int JC, const int ldc)
/*
 * General packed matrix multiply: C := alpha*op(A)*op(B) + beta*C, where
 * each matrix may be stored packed (per its PACK_UPLO) and is addressed
 * at submatrix offset (I?,J?).  Degenerate cases: returns immediately if
 * M or N is 0; scales C by beta if K is 0 or alpha is zero.
 */
{
   int j;
#ifdef CacheEdge
   static const int CE_K = ((ATL_DivBySize(CacheEdge)-(NBNB SHIFT)) /
                            (NB*(NB+NB)))*NB;
#else
   #define CE_K K
#endif
   if (!M || !N)
      return;
   if (!K || SCALAR_IS_ZERO(alpha))   /* C := beta * C, column by column */
   {
      for (j=0; j != N; j++)
         Mjoin(PATL,scal)(M, beta, C+MindexP(UC,IC,JC+j,ldc), 1);
      return;
   }
/*
 * Packed gpmm not yet implemented for complex,
 * so die if not really a dense gemm
 */
#ifdef TCPLX
   ATL_assert(UA == PackGen && UB == PackGen && UC == PackGen);
   Mjoin(PATL,gemm)(TA, TB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
#else
/*
 * Bug fix: C's offset previously used B's indices (IB,JB); it must use
 * C's own (IC,JC), matching the Mpld(UC,JC,ldc) beside it
 */
   Mjoin(PATL,prankK)(UA, TA, UB, TB, M, N, K, CE_K, alpha,
                      A+MindexP(UA,IA,JA,lda), Mpld(UA,JA,lda),
                      B+MindexP(UB,IB,JB,ldb), Mpld(UB,JB,ldb), beta,
                      UC, C+MindexP(UC,IC,JC,ldc), Mpld(UC,JC,ldc));
#endif
}
int f77gesv(const int N, const int NRHS, TYPE *A, const int lda, int *ipiv,
            TYPE *B, const int ldb)
/*
 * Solves A*X = B through the Fortran77 GESV interface; B is overwritten
 * with the solution, and pivots are returned 0-based.  Returns LAPACK's
 * info value.
 */
{
   int i;
#ifdef ATL_FunkyInts
   const F77_INTEGER F77N=N, F77lda=lda, F77ldb=ldb, F77NRHS=NRHS;
   F77_INTEGER info;
   F77_INTEGER *F77ipiv;

   F77ipiv = malloc(N*sizeof(F77_INTEGER));
   ATL_assert(F77ipiv);
#else
   int info;
   #define F77N N
   #define F77NRHS NRHS
   #define F77lda lda
   #define F77ldb ldb
   #define F77ipiv ipiv
#endif
   F77GESV(&F77N, &F77NRHS, A, &F77lda, F77ipiv, B, &F77ldb, &info);
#ifdef ATL_FunkyInts
   for (i=0; i < N; i++)   /* copy back, converting pivots to 0-based */
      ipiv[i] = F77ipiv[i] - 1;
   free(F77ipiv);
#else
   for (i=0; i < N; i++)   /* convert pivots to 0-based in place */
      ipiv[i]--;
#endif
   return(info);
}
static TYPE *ATL_LmulLt(const int N, const TYPE *L, const int ldl) /* * A = L * L^H */ { const int incA = 1 SHIFT, incL = (ldl+1) SHIFT; TYPE *A; int i, j; #ifdef TCPLX int i1, i2; TYPE tmp; #endif A = malloc(N*ATL_MulBySize(N)); ATL_assert(A); for (j=0; j < N; j++) { for (i=j; i < N; i++) { #ifdef TREAL A[i+j*N] = L[i+j*ldl] * L[j+j*ldl] + Mjoin(PATL,dot)(j, L+i, ldl, L+j, ldl); #else tmp = L[(j+j*ldl)<<1]; i1 = (i + j * N)<<1; i2 = (i + j * ldl)<<1; Mjoin(PATL,dotc_sub)(j, L+(j<<1), ldl, L+(i<<1), ldl, A+i1); A[i1] += L[i2] * tmp; if (i != j) A[i1+1] += tmp * L[i2+1]; #endif } } return(A); }
/*
 * Copies one NB x NB block of a row-major complex matrix into separate
 * real (vr) and imaginary (vi) block-major workspaces, applying alpha
 * via the scalcp macro.  Processes two rows (pA0/pA1) per outer
 * iteration, so NB must be even.  The alpha flavor (ALPHAXI0 /
 * Conj_ / ALPHAX) is selected at compile time.
 */
static void row2blkT_NB(const int M, const int N, const TYPE *A,
                        const int lda, TYPE *vr, TYPE *vi,
                        const SCALAR alpha)
{
   /* incA: advance two complex rows; incv: rewind block cols, +2 rows */
   const int incA = lda<<2, incv = 2 - NBNB;
   const TYPE *pA0 = A, *pA1 = A + (lda<<1);
   int i, j;
#ifdef ALPHAXI0
   #ifdef Conj_
      const register TYPE ralpha = *alpha, calpha = -ralpha;
   #else
      const register TYPE ralpha = *alpha;
   #endif
#elif defined(ALPHAX)
   const register TYPE ralpha = *alpha, ialpha = alpha[1];
   register TYPE ra, ia;
#endif
#if ((NB/2)*2 != NB)  /* ATLAS should ensure NB divisible by 2 */
   ATL_assert((NB/2)*2 == NB);
#endif
   /* two rows per pass; scalcp splits/scales one complex element */
   for (j=(NB>>1); j; j --, pA0 += incA, pA1 += incA, vr += incv, vi += incv)
   {
      for (i=0; i != NB2; i += 2, vr += NB, vi += NB)
      {
         scalcp(pA0+i, vr, vi);
         scalcp(pA1+i, vr+1, vi+1);
      }
   }
}
static TYPE geresid(enum CBLAS_ORDER Order, int N, TYPE *A, int lda,
                    TYPE *AI, int ldi)
/*
 * returns ||A*inv(A) - I|| / (N * eps * ||A|| * ||inv(A)||);
 * for row-major, we are not using 1-norm, since we are adding rows
 * instead of cols, but it should be an equally good norm, so don't
 * worry about it.
 */
{
   const int ldcp1 = (N+1)SHIFT;   /* stride between diagonal entries */
#ifdef TREAL
   TYPE one = ATL_rone, zero = ATL_rzero;
#else
   TYPE one[2] = {ATL_rone, ATL_rzero}, zero[2] = {ATL_rzero, ATL_rzero};
#endif
   TYPE numer, denom, eps;
   TYPE *C;
   int k;

   eps = Mjoin(PATL,epsilon)();
   C = malloc(N*ATL_MulBySize(N));
   ATL_assert(C);
/*
 * C = A * inv(A), then subtract the identity from its diagonal
 */
   cblas_gemm(Order, CblasNoTrans, CblasNoTrans, N, N, N, one, A, lda,
              AI, ldi, zero, C, N);
   for (k=0; k != N; k++)
      C[k*ldcp1] -= ATL_rone;
   numer = Mjoin(PATL,genrm1)(N, N, C, N);
   denom = Mjoin(PATL,genrm1)(N, N, A, lda) *
           Mjoin(PATL,genrm1)(N, N, AI, ldi) * N * eps;
   free(C);
   return(numer/denom);
}
/*
 * Left-side SYMM: C := alpha*A*B + beta*C, where A is symmetric with
 * only the Uplo_ triangle stored.  For N above the crossover, expands
 * the triangle of A into a dense M x M aligned scratch matrix (folding
 * alpha in for the real case) and uses GEMM; otherwise falls through to
 * the reference implementation.
 */
void Mjoin(Mjoin(PATL,symmL),UploNM)
   (const int M, const int N, const void *valpha, const void *A,
    const int lda, const void *B, const int ldb, const void *vbeta,
    void *C, const int ldc)
{
#ifdef TREAL
   const SCALAR alpha=*( (const SCALAR *)valpha );
   const SCALAR beta =*( (const SCALAR *)vbeta );
   const SCALAR one=1.0;
#else
   #define alpha valpha
   #define beta vbeta
#endif
   TYPE *a;
   void *va;
   if (N > SYMM_Xover)
   {
      /* dense M x M aligned scratch for expanded symmetric A */
      va = malloc(ATL_Cachelen + (ATL_MulBySize(M)*M));
      ATL_assert(va);
      a = ATL_AlignPtr(va);
#ifdef TREAL
      /* alpha is folded into the copy, so GEMM runs with alpha = one */
      if ( SCALAR_IS_ONE(alpha) )
         Mjoin(Mjoin(Mjoin(PATL,sycopy),UploNM),_a1)(M, alpha, A, lda, a);
      else
         Mjoin(Mjoin(Mjoin(PATL,sycopy),UploNM),_aX)(M, alpha, A, lda, a);
      CgemmTN(M, N, M, one, a, M, B, ldb, beta, C, ldc);
#else
      /* complex: copy unscaled, pass alpha/beta through to GEMM */
      Mjoin(Mjoin(PATL,sycopy),UploNM)(M, A, lda, a);
      CgemmTN(M, N, M, valpha, a, M, B, ldb, vbeta, C, ldc);
#endif
      free(va);
   }
   else
      Mjoin(PATL,refsymm)(AtlasLeft, Uplo_, M, N, alpha, A, lda,
                          B, ldb, beta, C, ldc);
}
static void geinv(const enum CBLAS_ORDER Order, const int N, TYPE *A,
                  const int lda)
/*
 * Overwrites A with inv(A) via LU factorization (GETRF) followed by
 * triangular inversion (GETRI).  The backend (F77 LAPACK, C LAPACK, or
 * ATLAS) is selected at compile time.  Aborts via ATL_assert on any
 * allocation or factorization failure.
 */
{
   int *ipiv;
   TYPE *wrk;
   int lwrk;
   ipiv = malloc(sizeof(int)*N);
   ATL_assert(ipiv);
#ifdef TimeF77
   lwrk = N * Mjoin(PATL,GetNB)();
   wrk = malloc(ATL_MulBySize(lwrk));
   ATL_assert(wrk);   /* bug fix: malloc result was unchecked */
   if (Order == AtlasRowMajor)   /* F77 backend is column-major only */
      Mjoin(PATL,tstsqtran)(N, A, lda);
   ATL_assert(Mjoin(PATL,f77getrf)(AtlasColMajor, N, N, A, lda, ipiv) == 0);
   ATL_assert(Mjoin(PATL,f77getri)(AtlasColMajor, N, A, lda, ipiv, wrk,
                                   &lwrk) == 0);
   if (Order == AtlasRowMajor)   /* transpose back to row-major */
      Mjoin(PATL,tstsqtran)(N, A, lda);
   free(wrk);
#elif defined(TimeC)
   ATL_assert(Mjoin(CLP,getrf)(Order, N, N, A, lda, ipiv) == 0);
   ATL_assert(Mjoin(CLP,getri)(Order, N, A, lda, ipiv) == 0);
#else
   lwrk = N * Mjoin(PATL,GetNB)();
   wrk = malloc(ATL_MulBySize(lwrk));
   ATL_assert(wrk);   /* bug fix: malloc result was unchecked */
   ATL_assert(Mjoin(PATL,getrf)(Order, N, N, A, lda, ipiv) == 0);
   ATL_assert(Mjoin(PATL,getri)(Order, N, A, lda, ipiv, wrk, &lwrk) == 0);
   free(wrk);
#endif
   free(ipiv);
}
/*
 * CBLAS rank-1 conjugated update: A := alpha*x*conj(y)' + A.  Validates
 * arguments (unless NoCblasErrorChecks), reporting the first error via
 * cblas_xerbla.  Row-major input is handled by rewriting the update as a
 * transposed geru with alpha folded into a conjugated copy of y.
 */
void cblas_zgerc(const enum CBLAS_ORDER Order, const int M, const int N,
                 const void *alpha, const void *X, const int incX,
                 const void *Y, const int incY, void *A, const int lda)
{
   int info = 2000;   /* 2000 = "no error found yet" sentinel */
   const double *x = X, *y = Y;
   void *vy;
   double *y0;
   double one[2] = {ATL_rone, ATL_rzero};
#ifndef NoCblasErrorChecks
   if (M < 0)
      info = cblas_errprn(2, info,
                          "M cannot be less than zero; is set to %d.", M);
   if (N < 0)
      info = cblas_errprn(3, info,
                          "N cannot be less than zero; is set to %d.", N);
   if (!incX)
      info = cblas_errprn(6, info,
                          "incX cannot be zero; is set to %d.", incX);
   if (!incY)
      info = cblas_errprn(8, info,
                          "incY cannot be zero; is set to %d.", incY);
   if (Order == CblasColMajor)
   {
      if (lda < M || lda < 1)
         info = cblas_errprn(10, info,
                             "lda must be >= MAX(M,1): lda=%d M=%d", lda, M);
   }
   else if (Order == CblasRowMajor)
   {
      if (lda < N || lda < 1)
         info = cblas_errprn(10, info,
                             "lda must be >= MAX(N,1): lda=%d M=%d", lda, N);
   }
   else
      info = cblas_errprn(1, info,
                          "Order must be %d or %d, but is set to %d",
                          CblasRowMajor, CblasColMajor, Order);
   if (info != 2000)
   {
      cblas_xerbla(info, "cblas_zgerc", "");
      return;
   }
#endif
   /* negative increments start from the vector's far end (<<1: complex) */
   if (incX < 0)
      x += (1-M)*incX<<1;
   if (incY < 0)
      y += (1-N)*incY<<1;
   if (Order == CblasColMajor)
      ATL_zgerc(M, N, alpha, x, incX, y, incY, A, lda);
   else
   {
      /* row-major: y0 = alpha*conj(y), then A^T += y0 * x^T via geru */
      vy = malloc(ATL_Cachelen + ATL_MulBySize(N));
      ATL_assert(vy);
      y0 = ATL_AlignPtr(vy);
      ATL_zmoveConj(N, alpha, y, incY, y0, 1);
      ATL_zgeru(N, M, one, y0, 1, x, incX, A, lda);
      free(vy);
   }
}
/*
 * CBLAS Hermitian rank-2 update:
 * A := alpha*x*conj(y)' + conj(alpha)*y*conj(x)' + A, with only the Uplo
 * triangle of A referenced.  Validates arguments (unless
 * NoCblasErrorChecks), reporting the first error via cblas_xerbla.
 * Row-major input is handled by flipping Uplo and swapping/conjugating
 * the vectors (with alpha folded into the copy when alpha != 0).
 */
void cblas_cher2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
                 const int N, const void *alpha, const void *X,
                 const int incX, const void *Y, const int incY, void *A,
                 const int lda)
{
   int info = 2000;   /* 2000 = "no error found yet" sentinel */
   void *vx, *vy;
   float *x0, *y0;
   const float *x=X, *y=Y, *alp=alpha;
   const float one[2]={ATL_rone, ATL_rzero};
#ifndef NoCblasErrorChecks
   if (Order != CblasColMajor && Order != CblasRowMajor)
      info = cblas_errprn(1, info,
                          "Order must be %d or %d, but is set to %d",
                          CblasRowMajor, CblasColMajor, Order);
   if (Uplo != CblasUpper && Uplo != CblasLower)
      info = cblas_errprn(2, info,
                          "UPLO must be %d or %d, but is set to %d",
                          CblasUpper, CblasLower, Uplo);
   if (N < 0)
      info = cblas_errprn(3, info,
                          "N cannot be less than zero; is set to %d.", N);
   if (!incX)
      info = cblas_errprn(6, info,
                          "incX cannot be zero; is set to %d.", incX);
   if (!incY)
      info = cblas_errprn(8, info,
                          "incY cannot be zero; is set to %d.", incY);
   if (lda < N || lda < 1)
      info = cblas_errprn(10, info,
                          "lda must be >= MAX(N,1): lda=%d N=%d", lda, N);
   if (info != 2000)
   {
      cblas_xerbla(info, "cblas_cher2", "");
      return;
   }
#endif
   /* negative increments start from the vector's far end (<<1: complex) */
   if (incX < 0)
      x += (1-N)*incX<<1;
   if (incY < 0)
      y += (1-N)*incY<<1;
   if (Order == CblasColMajor)
      ATL_cher2(Uplo, N, alpha, x, incX, y, incY, A, lda);
   else if (alp[0] != ATL_rzero || alp[1] != ATL_rzero)
   {
      /* row-major, nonzero alpha: conjugate copies, flip Uplo */
      vx = malloc(ATL_Cachelen + ATL_MulBySize(N));
      vy = malloc(ATL_Cachelen + ATL_MulBySize(N));
      ATL_assert(vx != NULL && vy != NULL);
      x0 = ATL_AlignPtr(vx);
      y0 = ATL_AlignPtr(vy);
      ATL_cmoveConj(N, alpha, y, incY, y0, 1);   /* y0 = alpha*conj(y) */
      ATL_ccopyConj(N, x, incX, x0, 1);          /* x0 = conj(x) */
      ATL_cher2(( (Uplo == CblasUpper) ? CblasLower : CblasUpper ), N,
                one, y0, 1, x0, 1, A, lda);
      free(vx);
      free(vy);
   }
   else   /* alpha == 0: no scaling needed, just swap roles & flip Uplo */
      ATL_cher2(( (Uplo == CblasUpper) ? CblasLower : CblasUpper ), N,
                alpha, y, incY, x, incX, A, lda);
}
int Mjoin(PC2F,gels)(const enum CBLAS_TRANSPOSE TA, ATL_CINT M, ATL_CINT N,
                     ATL_CINT NRHS, TYPE *A, ATL_CINT lda, TYPE *B,
                     ATL_CINT ldb)
/*
 * Least-squares solve driver: queries gels_wrk for the optimal workspace
 * size, allocates it, runs the worker, and returns its status.
 */
{
   TYPE qry[2];
   TYPE *wp;
   ATL_INT lwrk;
   int iret;

/*
 * lwrk = -1 is the LAPACK workspace-query convention: the worker writes
 * the optimal size into qry[0] without doing the solve
 */
   ATL_assert(!Mjoin(PC2F,gels_wrk)(TA, M, N, NRHS, A, lda, B, ldb, qry, -1));
   lwrk = qry[0];
   wp = malloc(ATL_MulBySize(lwrk));
   ATL_assert(wp);
   iret = Mjoin(PC2F,gels_wrk)(TA, M, N, NRHS, A, lda, B, ldb, wp, lwrk);
   free(wp);
   return(iret);
}
FLSTRUCT *ATL_GetFlushStruct(void *p, int length, FLSTRUCT *next)
/*
 * Allocates a new flush-list node describing the region (p, length) and
 * prepends it to next; returns the new list head (caller owns it).
 */
{
   FLSTRUCT *node;

   node = malloc(sizeof(FLSTRUCT));
   ATL_assert(node);
   node->p = p;
   node->length = length;
   node->next = next;
   return(node);
}
void ATL_UGER2K(ATL_CINT M, ATL_CINT N, const TYPE *X0, const TYPE *Y0,
                const TYPE *X1, const TYPE *Y1, TYPE *A, ATL_CINT lda)
/*
 * Rank-2 update driver: applies the ATL_rk2 kernel to A three columns at
 * a time; N must be a multiple of 3.  Bug fix: removed unused locals
 * x0, x1 and i from the original.
 * NOTE(review): incA = 6*lda and Y advancing by 6 per 3 columns suggest
 * complex data (2 TYPEs per element, column stride 2*lda) -- confirm
 * against the ATL_rk2 kernel's contract.
 */
{
   register ATL_INT j;
   ATL_CINT incA = lda+lda + (lda<<2);   /* advance 3 columns: 6*lda */

   ATL_assert((N/3)*3 == N);   /* unrolled by 3; N must divide evenly */
   for (j=0; j < N; j += 3, A += incA, Y0 += 6, Y1 += 6)
      ATL_rk2(M, X0, X1, Y0, Y1, A, lda);
}
int *GetIntList1(int ival)
/*
 * Returns a malloc'd integer list holding a single value:
 * list[0] = 1 (the count), list[1] = ival.  Caller frees.
 */
{
   int *list;

   list = malloc(2*sizeof(int));
   ATL_assert(list);
   list[0] = 1;      /* length of list */
   list[1] = ival;   /* its only entry */
   return(list);
}
double ATL_ptflushcache(long long size)
/*
 * flush cache by reading enough mem; note that if the compiler gets
 * really smart, may be necessary to make vp a global variable so it
 * can't figure out it's not being modified other than during setup;
 * the fact that ATL_dzero is external will confuse most compilers
 *
 * Dispatches on size: size > 0 allocates and zeroes a flush buffer of
 * size bytes per thread; size < 0 flushes by having all ATL_NTHREADS
 * threads read their slice in parallel; size == 0 frees the buffer.
 * Always returns 0.0.  Not thread-safe (static state).
 * NOTE(review): calling the init case (size > 0) twice without an
 * intervening size == 0 call leaks the previous buffer -- confirm
 * callers always free between initializations.
 */
{
   static void *vp=NULL;           /* raw allocation (for free) */
   static double *cache=NULL;      /* aligned flush area inside vp */
   double dret=0.0;
   static long long i, N = 0;      /* N = doubles per thread's slice */
   ATL_FC fct[ATL_NTHREADS];
   if (size < 0) /* flush cache */
   {
      ATL_assert(cache);           /* must have been initialized */
      for (i=0; i < ATL_NTHREADS; i++)
      {
         fct[i].N = N;
         fct[i].dp = cache+i*N;    /* each thread reads its own slice */
      }
      ATL_goparallel(ATL_NTHREADS, ATL_DoWorkFC, fct, NULL);
   }
   else if (size > 0) /* initialize */
   {
      vp = malloc(ATL_Cachelen + (size * ATL_NTHREADS));
      ATL_assert(vp);
      cache = ATL_AlignPtr(vp);
      N = size / sizeof(double);
      ATL_dzero(N*ATL_NTHREADS, cache, 1);
   }
   else if (size == 0) /* free cache */
   {
      if (vp)
         free(vp);
      vp = cache = NULL;
      N = 0;
   }
   return(dret);
}
main(int nargs, char **args) { char pre, fnam[128], cta; enum ATLAS_TRANS TA; int MFLOP, M, N, lda, i, l2size; double mf, mfs[3]; #ifdef TREAL TYPE alpha, beta; #else TYPE alpha[2], beta[2]; #endif FILE *fp; GetFlags(nargs, args, &pre, &l2size, &MFLOP, &cta, &M, &N, SADD alpha, &lda, SADD beta, fnam); if (cta == 'N' || cta == 'n') TA = AtlasNoTrans; else TA = AtlasTrans; if (!FileExists(fnam)) { fp = fopen(fnam, "w"); ATL_assert(fp); for (i=0; i < 3; i++) { mf = gemvcase(MFLOP, TA, l2size, M, N, alpha, lda, beta); fprintf(stdout, " %s : %f MFLOPS\n", fnam, mf); fprintf(fp, "%lf\n", mf); mfs[i] = mf; } } else { fp = fopen(fnam, "r"); for (i=0; i < 3; i++) ATL_assert(fscanf(fp, " %lf", &mfs[i]) == 1); } fclose(fp); mf = (mfs[0] + mfs[1] + mfs[2]) / 3.0; fprintf(stdout, " %s : %.2f MFLOPS\n", fnam, mf); exit(0); }
int *IntRange2IntList(int N0, int NN, int incN)
/*
 * Returns a malloc'd integer list covering [N0, NN] in steps of incN:
 * list[0] = count, list[1..count] = the values.  Caller frees.
 */
{
   int *list;
   int cnt, v;

   for (cnt=0, v=N0; v <= NN; v += incN)   /* first pass: count entries */
      cnt++;
   list = malloc(sizeof(int)*(cnt+1));
   ATL_assert(list);
   list[0] = cnt;
   for (cnt=1, v=N0; v <= NN; v += incN, cnt++)   /* second pass: fill */
      list[cnt] = v;
   return(list);
}
int *GetIntList2(int ival1, int ival2)
/*
 * Returns a malloc'd integer list holding one pair of values:
 * list[0] = 1 (the count), list[1] = ival1, list[2] = ival2.
 * Caller frees.
 */
{
   int *list;

   list = malloc(3*sizeof(int));
   ATL_assert(list);
   list[0] = 1;       /* one pair */
   list[1] = ival1;
   list[2] = ival2;
   return(list);
}