/*
 * MLGP_GEMV - precision-dispatching wrapper around the Fortran-style GEMV
 * routine.
 *
 * When DOUBLE is defined the call is forwarded to dgemv_, otherwise to
 * sgemv_.  Every scalar argument is passed by address (the callee takes
 * pointers), so the by-value parameters of this function are used as
 * temporaries whose addresses are handed to the routine.
 *
 * Presumably computes Y := a * op(A) * X + b * Y as in reference BLAS --
 * confirm against the linked BLAS implementation.
 *
 *   transA      operation selector character forwarded unchanged
 *   M, N        matrix dimensions forwarded unchanged
 *   a, b        scalar multipliers
 *   A, LDA      matrix storage and its leading dimension
 *   X, incX     input vector and its stride
 *   Y, incY     in/out vector and its stride
 *
 * NOTE: the GEMV macro is intentionally left defined after this function,
 * exactly as before, in case later code in this file relies on it.
 */
void MLGP_GEMV(char transA, unsigned M, unsigned N, FLOAT a, FLOAT* A, unsigned LDA, FLOAT* X, unsigned incX, FLOAT b, FLOAT* Y, unsigned incY)
{
#ifdef DOUBLE
#define GEMV(...) dgemv_(__VA_ARGS__)
#else
#define GEMV(...) sgemv_(__VA_ARGS__)
#endif
    /* A void function must not `return` an expression in C; just call. */
    GEMV(&transA, &M, &N, &a, A, &LDA, X, &incX, &b, Y, &incY);
}
/*
 * Orthogonalize - subtract from p its projections onto a set of basis
 * vectors and rescale the remainder to unit Euclidean length, in place.
 *
 *   c                 scratch context; its Pv, Bases and Dp members are
 *                     overwritten.  c->Pv->Count gives the vector length.
 *   p                 in/out vector of c->Pv->Count doubles
 *   numBases          number of basis vectors supplied
 *   orthonormalBases  numBases vectors of c->Pv->Count doubles each, stored
 *                     back to back
 *
 * NOTE(review): the result is divided by its norm without a zero check, so
 * an input lying entirely in the span of the bases yields non-finite
 * values -- confirm callers never hit that case.
 */
void Orthogonalize(OrthoContext* c, double* p, int numBases, double* orthonormalBases)
{
    /* Stage both inputs in the context's work buffers. */
    memcpy(c->Pv->Data, p, c->Pv->Count * sizeof(double));
    memcpy(c->Bases->Data, orthonormalBases, numBases * c->Pv->Count * sizeof(double));

    /* Shape the work objects for this call before multiplying. */
    c->Bases->RowCount = numBases;
    c->Dp->Count = numBases;

    int basisLen = c->Pv->Count;

    /* Presumably Dp := Bases * Pv, i.e. one coefficient per basis vector. */
    GEMV(1, c->Bases, c->Pv, 0, c->Dp);

    /* Remove each basis component from the working copy of p. */
    int i = 0;
    while (i < numBases) {
        double coeff = -1 * c->Dp->Data[i];
        AXPY2(coeff, c->Bases->Data + i * basisLen, basisLen, c->Pv->Data);
        i++;
    }

    /* Normalise the remainder and hand it back to the caller. */
    double mag = cblas_dnrm2(basisLen, c->Pv->Data, 1);
    double invMag = 1.0 / mag;
    cblas_dscal(basisLen, invMag, c->Pv->Data, 1);

    memcpy(p, c->Pv->Data, basisLen * sizeof(double));
}
//========================================================================= int EpetraExt_BlockDiagMatrix::ApplyInverse(const Epetra_MultiVector& X, Epetra_MultiVector& Y) const{ int info; // Sanity Checks int NumVectors=X.NumVectors(); if(NumVectors!=Y.NumVectors()) EPETRA_CHK_ERR(-1); if(!HasComputed_ && (ApplyMode_==AM_INVERT || ApplyMode_==AM_FACTOR)) EPETRA_CHK_ERR(-2); //NTS: MultiVector's MyLength and [] Operators are "points" level operators //not a "block/element" level operators. const int *vlist=DataMap_->FirstPointInElementList(); const int *xlist=Map().FirstPointInElementList(); const int *blocksize=Map().ElementSizeList(); if(ApplyMode_==AM_MULTIPLY || ApplyMode_==AM_INVERT){ // Multiply & Invert mode have the same apply int NumBlocks=NumMyBlocks(); for(int i=0;i<NumBlocks;i++){ int Nb=blocksize[i]; int vidx0=vlist[i]; int xidx0=xlist[i]; for(int j=0;j<NumVectors;j++){ if(Nb==1) { // Optimize for size = 1 Y[j][xidx0]=Values_[vidx0]*X[j][xidx0]; } else if(Nb==2){ // Optimize for size = 2 Y[j][xidx0 ]=Values_[vidx0 ]*X[j][xidx0] + Values_[vidx0+2]*X[j][xidx0+1]; Y[j][xidx0+1]=Values_[vidx0+1]*X[j][xidx0] + Values_[vidx0+3]*X[j][xidx0+1]; } else{ // "Large" Block - Use BLAS //void GEMV (const char TRANS, const int M, const int N, const double ALPHA, const double *A, const int LDA, const double *X, const double BETA, double *Y, const int INCX=1, const int INCY=1) const GEMV('N',Nb,Nb,1.0,&Values_[vidx0],Nb,&X[j][xidx0],0.0,&Y[j][xidx0]); } } } } else{ // Factorization mode has a different apply int NumBlocks=NumMyBlocks(); for(int i=0;i<NumBlocks;i++){ int Nb=blocksize[i]; int vidx0=vlist[i]; int xidx0=xlist[i]; for(int j=0;j<NumVectors;j++){ if(Nb==1) { // Optimize for size = 1 - use the inverse Y[j][xidx0]=Values_[vidx0]*X[j][xidx0]; } else if(Nb==2){ // Optimize for size = 2 - use the inverse Y[j][xidx0 ]=Values_[vidx0 ]*X[j][xidx0] + Values_[vidx0+2]*X[j][xidx0+1]; Y[j][xidx0+1]=Values_[vidx0+1]*X[j][xidx0] + Values_[vidx0+3]*X[j][xidx0+1]; } else{ // 
"Large" Block - use LAPACK // void GETRS (const char TRANS, const int N, const int NRHS, const double *A, const int LDA, const int *IPIV, double *X, const int LDX, int *INFO) const for(int k=0;k<Nb;k++) Y[j][xidx0+k]=X[j][xidx0+k]; LAPACK.GETRS('N',Nb,1,&Values_[vidx0],Nb,&Pivots_[xidx0],&Y[j][xidx0],Nb,&info); if(info) EPETRA_CHK_ERR(info); } } } } return 0; }
int main(int argc, char *argv[]){ FLOAT *a, *x, *y; FLOAT alpha[] = {1.0, 1.0}; FLOAT beta [] = {1.0, 1.0}; char trans='N'; blasint m, i, j; blasint inc_x=1,inc_y=1; blasint n=0; int has_param_n = 0; int has_param_m = 0; int loops = 1; int l; char *p; int from = 1; int to = 200; int step = 1; struct timeval start, stop; double time1,timeg; argc--;argv++; if (argc > 0) { from = atol(*argv); argc--; argv++;} if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} if (argc > 0) { step = atol(*argv); argc--; argv++;} int tomax = to; if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); if ((p = getenv("OPENBLAS_TRANS"))) trans=*p; if ((p = getenv("OPENBLAS_PARAM_N"))) { n = atoi(p); if ((n>0)) has_param_n = 1; if ( n > tomax ) tomax = n; } if ( has_param_n == 0 ) if ((p = getenv("OPENBLAS_PARAM_M"))) { m = atoi(p); if ((m>0)) has_param_m = 1; if ( m > tomax ) tomax = m; } fprintf(stderr, "From : %3d To : %3d Step = %3d Trans = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,trans,inc_x,inc_y,loops); if (( a = (FLOAT *)malloc(sizeof(FLOAT) * tomax * tomax * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } if (( x = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_x) * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } if (( y = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_y) * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } #ifdef linux srandom(getpid()); #endif fprintf(stderr, " SIZE Flops\n"); if (has_param_m == 0) { for(m = from; m <= to; m += step) { timeg=0; if ( has_param_n == 0 ) n = m; fprintf(stderr, " %6dx%d : ", (int)m,(int)n); for(j = 0; j < m; j++){ for(i = 0; i < n * COMPSIZE; i++){ a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } for (l=0; l<loops; l++) { for(i = 0; i < n * COMPSIZE * abs(inc_x); i++){ x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } for(i 
= 0; i < n * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } gettimeofday( &start, (struct timezone *)0); GEMV (&trans, &m, &n, alpha, a, &m, x, &inc_x, beta, y, &inc_y ); gettimeofday( &stop, (struct timezone *)0); time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; timeg += time1; } timeg /= loops; fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6); } } else { for(n = from; n <= to; n += step) { timeg=0; fprintf(stderr, " %6dx%d : ", (int)m,(int)n); for(j = 0; j < m; j++){ for(i = 0; i < n * COMPSIZE; i++){ a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } for (l=0; l<loops; l++) { for(i = 0; i < n * COMPSIZE * abs(inc_x); i++){ x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } for(i = 0; i < n * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } gettimeofday( &start, (struct timezone *)0); GEMV (&trans, &m, &n, alpha, a, &m, x, &inc_x, beta, y, &inc_y ); gettimeofday( &stop, (struct timezone *)0); time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; timeg += time1; } timeg /= loops; fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6); } } return 0; }