// Tensors must be sorted from most to least significant dimension. // nc is the number of contracted indices, and // ind is an array of (nx by 2) integers (pairs of contracted indices). Tensor *tensdot(Tensor *a, Tensor *b, int nc, int *ind, int nref, Slice *m) { int i = 0; int ord_a, ord_b; int *anum, *bnum; int an, bn; Tensor *t; #ifdef PARANOID if(nc > a->n || nc > b->n) { fprintf(stderr, "tensdor: more contracted than input dims.\n"); exit(1); } #endif anum = number_indices(a->n, nc, ind); bnum = number_indices(b->n, nc, ind+1); t = tensor_ctor(a->n + b->n - 2*nc); an = copy_shape(a, anum, t->shape); bn = copy_shape(b, bnum, t->shape + a->n - nc); t->len = an*bn; t->b = reserve_block(m, nref, t->len*sizeof(float)); if(nc == 0) { // trivial case A <- a X Y^T + A, A m x n GER(bn, an, 1.0, b->b->x, 1, a->b->x, 1, t->b->x, an); //GER(bn, an, a->scale*b->scale, b->b->x, 1, a->b->x, 1, // t->b->x, an); free(anum); free(bnum); return t; } if( (ord_a = ck_ordered(a->n, anum))) { if( (ord_b = ck_ordered(b->n, bnum)) && ck_same(nc, ind)) { // Straightforward dgemm. GEMM(ord_b == 1, ord_a == -1, bn, an, a->len / an, 1.0, a->b->x, an, b->b->x, bn, 0.0, t->b->x, bn); //GEMM(ord_b == 1, ord_a == -1, bn, an, a->len / an, // a->scale*b->scale, a->b->x, an, b->b->x, bn, // 0.0, t->b->x, bn); // TODO: decide if transposing A would give better perf. } else { // Need to transpose B. printf("Need to transpose B.\n"); } } else if( (ord_b = ck_ordered(b->n, bnum))) { // Need to transpose A printf("Need to transpose A.\n"); } else { // Need to transpose both A and B. printf("Need to transpose A & B.\n"); } free(anum); free(bnum); return t; }
void MLGP_GEMM(char transA, char transB, unsigned M, unsigned N, unsigned K, FLOAT a, FLOAT* A, unsigned LDA, FLOAT* B, unsigned LDB, FLOAT b, FLOAT* C, unsigned LDC) { #ifdef DOUBLE #define GEMM(...) dgemm_(__VA_ARGS__) #else #define GEMM(...) sgemm_(__VA_ARGS__) #endif return GEMM(&transA, &transB, &M, &N, &K, &a, A, &LDA, B, &LDB, &b, C, &LDC); }
/* blocked multiplication of the transpose of the nxm matrix a with itself (i.e. a^T a) * using a block size of bsize. The product is returned in b. * Since a^T a is symmetric, its computation can be speeded up by computing only its * upper triangular part and copying it to the lower part. * * More details on blocking can be found at * http://www-2.cs.cmu.edu/afs/cs/academic/class/15213-f02/www/R07/section_a/Recitation07-SectionA.pdf */ void TRANS_MAT_MAT_MULT(LM_REAL *a, LM_REAL *b, int n, int m) { #ifdef HAVE_LAPACK /* use BLAS matrix multiply */ LM_REAL alpha=CNST(1.0), beta=CNST(0.0); /* Fool BLAS to compute a^T*a avoiding transposing a: a is equivalent to a^T in column major, * therefore BLAS computes a*a^T with a and a*a^T in column major, which is equivalent to * computing a^T*a in row major! */ GEMM("N", "T", &m, &m, &n, &alpha, a, &m, a, &m, &beta, b, &m); #else /* no LAPACK, use blocking-based multiply */ register int i, j, k, jj, kk; register LM_REAL sum, *bim, *akm; const int bsize=__BLOCKSZ__; #define __MIN__(x, y) (((x)<=(y))? (x) : (y)) #define __MAX__(x, y) (((x)>=(y))? (x) : (y)) /* compute upper triangular part using blocking */ for(jj=0; jj<m; jj+=bsize){ for(i=0; i<m; ++i){ bim=b+i*m; for(j=__MAX__(jj, i); j<__MIN__(jj+bsize, m); ++j) bim[j]=0.0; //b[i*m+j]=0.0; } for(kk=0; kk<n; kk+=bsize){ for(i=0; i<m; ++i){ bim=b+i*m; for(j=__MAX__(jj, i); j<__MIN__(jj+bsize, m); ++j){ sum=0.0; for(k=kk; k<__MIN__(kk+bsize, n); ++k){ akm=a+k*m; sum+=akm[i]*akm[j]; //a[k*m+i]*a[k*m+j]; } bim[j]+=sum; //b[i*m+j]+=sum; } } } } /* copy upper triangular part to the lower one */ for(i=0; i<m; ++i) for(j=0; j<i; ++j) b[i*m+j]=b[j*m+i]; #undef __MIN__ #undef __MAX__ #endif /* HAVE_LAPACK */ }
//============================================================================= int Epetra_SerialDenseSVD::Invert( double rthresh, double athresh ) { if (!Factored()) Factor(); // Need matrix factored. //apply threshold double thresh = S_[0]*rthresh + athresh; int num_replaced = 0; for( int i = 0; i < M_; ++i ) if( S_[i] < thresh ) { //cout << num_replaced << thresh << " " << S_[0] << " " << S_[i] << std::endl; // S_[i] = thresh; S_[i] = 0.0; ++num_replaced; } //scale cols of U_ with reciprocal singular values double *p = U_; for( int i = 0; i < N_; ++i ) { double scale = 0.0; if( S_[i] ) scale = 1./S_[i]; for( int j = 0; j < M_; ++j ) *p++ *= scale; } //create new Inverse_ if necessary if( Inverse_ == 0 ) { Inverse_ = new Epetra_SerialDenseMatrix(); Inverse_->Shape( N_, M_ ); AI_ = Inverse_->A(); LDAI_ = Inverse_->LDA(); } /* else //zero it out { for( int i = 0; i < Inverse_->M(); ++i ) for( int j = 0; j < Inverse_->N(); ++j ) (*Inverse_)(i,j) = 0.0; } */ GEMM( 'T', 'T', M_, M_, M_, 1.0, Vt_, M_, U_, M_, 0.0, AI_, M_ ); double DN = N_; UpdateFlops((DN*DN*DN)); Inverted_ = true; Factored_ = false; EPETRA_CHK_ERR(INFO_); return(num_replaced); }
//--------------------------------------------------------- void umAxB(const DMat& A, const DMat& B, DMat& C) //--------------------------------------------------------- { //------------------------- // C = A * B //------------------------- // A = op(A) is (M,K) // B = op(B) is (K,N) // C is (M,N) //------------------------- int M=A.num_rows(), K=A.num_cols(), N=B.num_cols(); int LDA=M, LDB=K, LDC=M; double one=1.0, zero=0.0; if (B.num_rows() != K) { umERROR("umAxB(A,B,C)", "wrong dimensions"); } C.resize(M,N); GEMM ('N','N',M,N,K, one,A.data(),LDA, B.data(),LDB, zero,C.data(),LDC); }
int main(int argc, char *argv[]) { FLOAT *a, *b, *c; FLOAT alpha[] = {1.0, 1.0}; FLOAT beta [] = {1.0, 1.0}; char trans='N'; blasint m, n, i, j; int loops = 1; int has_param_n=0; int l; char *p; int from = 1; int to = 200; int step = 1; struct timeval start, stop; double time1,timeg; argc--; argv++; if (argc > 0) { from = atol(*argv); argc--; argv++; } if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++; } if (argc > 0) { step = atol(*argv); argc--; argv++; } if ((p = getenv("OPENBLAS_TRANS"))) trans=*p; fprintf(stderr, "From : %3d To : %3d Step=%d : Trans=%c\n", from, to, step, trans); if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) { fprintf(stderr,"Out of Memory!!\n"); exit(1); } if (( b = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) { fprintf(stderr,"Out of Memory!!\n"); exit(1); } if (( c = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) { fprintf(stderr,"Out of Memory!!\n"); exit(1); } p = getenv("OPENBLAS_LOOPS"); if ( p != NULL ) loops = atoi(p); if ((p = getenv("OPENBLAS_PARAM_N"))) { n = atoi(p); has_param_n=1; } #ifdef linux srandom(getpid()); #endif fprintf(stderr, " SIZE Flops\n"); for(m = from; m <= to; m += step) { timeg=0; if ( has_param_n == 1 && n <= m ) n=n; else n=m; fprintf(stderr, " %6dx%d : ", (int)m, (int)n); for (l=0; l<loops; l++) { for(j = 0; j < m; j++) { for(i = 0; i < m * COMPSIZE; i++) { a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; b[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; c[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } gettimeofday( &start, (struct timezone *)0); GEMM (&trans, &trans, &m, &n, &m, alpha, a, &m, b, &m, beta, c, &m ); gettimeofday( &stop, (struct timezone *)0); time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; timeg += time1; } timeg /= loops; fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)m * (double)n / timeg * 1.e-6); } return 0; }
//************************************************************************ // ilEuclidDist // Computes Euclidean distance between sets of points in src1 and src2. // src1 is m1*n array; src2 is m2*n array where m1 and m2 are // the number of points in each set and n is the dimensionality // of the Euclidean space. The result dst is an m1*m2 matrix // with distances between all pairs of points. // If m1=m2=1 Euclidean distance between two vectors is computed. //************************************************************************ void ilEuclidDist(pMat const& src1, pMat const& src2, pMat dst) { int m1 = src1->rows; int m2 = src2->rows; int n = src1->cols; int type=src1->type; #if 0 //*** matrix version pMat ones_nm2 = CreateMat(n, m2, type); pMat ones_m1n = CreateMat(m1, n, type); pMat src1sq = CreateMat(m1, n, type); pMat src2sq = CreateMat(m2, n, type); pMat res1 = CreateMat(m1, m2, type); pMat res2 = CreateMat(m1, m2, type); Multiply(src1, src1, src1sq); Multiply(src2, src2, src2sq); SetValue(ones_nm2, cvScalar(1.0)); SetValue(ones_m1n, cvScalar(1.0)); //*** Euclidean distance with matrix multiplications MatrixMultiply(src1sq, ones_nm2, res1); GEMM(ones_m1n, src2sq, 1.0, res1, 1, res2, CV_GEMM_B_T); GEMM(src1, src2, -2.0, res2, 1, dst, CV_GEMM_B_T); PowerMatrix(dst, dst, .5); #endif #if 1 //*** element-wise version pMat src1sq = CreateMat(m1, n, type); pMat src2sq = CreateMat(m2, n, type); Multiply(src1, src1, src1sq); Multiply(src2, src2, src2sq); float *ptrsrc1sq, *ptrsrc2sq, *ptrdst; ptrsrc1sq = Ptr2D<float>(src1sq); ptrsrc2sq = Ptr2D<float>(src2sq); ptrdst = Ptr2D<float>(dst); int indsrc1sq, indsrc2sq, inddst=0; float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, vdst; for(int i = 0; i<m1; ++i) { indsrc1sq = i * n; v0 = ptrsrc1sq[indsrc1sq + 0]; v1 = ptrsrc1sq[indsrc1sq + 1]; v2 = ptrsrc1sq[indsrc1sq + 2]; v3 = ptrsrc1sq[indsrc1sq + 3]; v4 = ptrsrc1sq[indsrc1sq + 4]; v5 = ptrsrc1sq[indsrc1sq + 5]; v6 = ptrsrc1sq[indsrc1sq + 6]; v7 = ptrsrc1sq[indsrc1sq + 7]; v8 = ptrsrc1sq[indsrc1sq + 8]; v9 = ptrsrc1sq[indsrc1sq + 9]; for (int j=0; j<m2; ++j) { inddst = i * m2 + j; indsrc2sq = j * n; vdst = (v0-ptrsrc2sq[indsrc2sq]) * (v0-ptrsrc2sq[indsrc2sq]); vdst += (v1-ptrsrc2sq[indsrc2sq+1]) * (v1-ptrsrc2sq[indsrc2sq+1]); vdst += (v2-ptrsrc2sq[indsrc2sq+2]) * (v2-ptrsrc2sq[indsrc2sq+2]); vdst += (v3-ptrsrc2sq[indsrc2sq+3]) * (v3-ptrsrc2sq[indsrc2sq+3]); vdst += (v4-ptrsrc2sq[indsrc2sq+4]) * (v4-ptrsrc2sq[indsrc2sq+4]); vdst += (v5-ptrsrc2sq[indsrc2sq+5]) * (v5-ptrsrc2sq[indsrc2sq+5]); vdst += (v6-ptrsrc2sq[indsrc2sq+6]) * (v6-ptrsrc2sq[indsrc2sq+6]); vdst += (v7-ptrsrc2sq[indsrc2sq+7]) * (v7-ptrsrc2sq[indsrc2sq+7]); vdst += (v8-ptrsrc2sq[indsrc2sq+8]) * (v8-ptrsrc2sq[indsrc2sq+8]); vdst += (v9-ptrsrc2sq[indsrc2sq+9]) * (v9-ptrsrc2sq[indsrc2sq+9]); ptrdst[inddst] = vdst; } } #endif }
void SpinAdapted::MatrixMultiply (const Matrix& a, char conjA, const Matrix& b, char conjB, Matrix& c, Real scale, double cfactor) { //dmrginp.justmultiply.start(); //dmrginp.justmultiply -> start(); //ROA Matrix& a_ref = const_cast<Matrix&>(a); // for BLAS calls Matrix& b_ref = const_cast<Matrix&>(b); try { int aRows = a_ref.Nrows (); int aCols = a_ref.Ncols (); int bRows = b_ref.Nrows (); int bCols = b_ref.Ncols (); int cRows = c.Nrows (); int cCols = c.Ncols (); if (conjA == 'n' && conjB == 'n') { assert ((aCols == bRows) && (cRows == aRows) && (cCols == bCols)); #ifdef BLAS GEMM ('n', 'n', bCols, aRows, bRows, scale, b.Store (), bCols, a.Store (), aCols, cfactor, c.Store (), bCols); #else c += (scale * a) * b; #endif } else if (conjA == 'n' && conjB == 't') { assert ((aCols == bCols) && (cRows == aRows) && (cCols == bRows)); #ifdef BLAS GEMM ('t', 'n', bRows, aRows, bCols, scale, b.Store (), bCols, a.Store (), aCols, cfactor, c.Store (), bRows); #else c += (scale * a) * b.t (); #endif } else if (conjA == 't' && conjB == 'n') { assert ((aRows == bRows) && (cRows == aCols) && (cCols == bCols)); #ifdef BLAS GEMM ('n', 't', bCols, aCols, bRows, scale, b.Store (), bCols, a.Store (), aCols, cfactor, c.Store (), bCols); #else c += (scale * a.t ()) * b; #endif } else if (conjA == 't' && conjB == 't') { assert ((aRows == bCols) && (cRows == aCols) && (cCols == bRows)); #ifdef BLAS GEMM ('t', 't', bRows, aCols, bCols, scale, b.Store (), bCols, a.Store (), aCols, cfactor, c.Store (), bRows); #else c += (scale * a.t ()) * b.t (); #endif } else abort (); } catch (Exception) { pout << Exception::what () << endl; abort (); } //dmrginp.justmultiply.stop(); //dmrginp.justmultiply -> stop(); //ROA }
void l2ls_learn_basis_dual(DOUBLE *Dopt, DOUBLE *Dorig, DOUBLE *X, DOUBLE *S, DOUBLE l2norm, INT length, INT N, INT K, INT numSamples) { DOUBLE *SSt = (DOUBLE *) MALLOC(K * K * sizeof(DOUBLE)); CHAR uplo = 'U'; CHAR trans = 'N'; INT SYRKN = K; INT SYRKK = numSamples; DOUBLE alpha = 1; INT SYRKLDA = K; DOUBLE beta = 0; INT SYRKLDC = K; SYRK(&uplo, &trans, &SYRKN, &SYRKK, &alpha, S, &SYRKLDA, &beta, SSt, &SYRKLDC); DOUBLE *XSt = (DOUBLE *) MALLOC(N * K * sizeof(DOUBLE)); CHAR transa = 'N'; CHAR transb = 'T'; INT GEMMM = N; INT GEMMN = K; INT GEMMK = numSamples; alpha = 1; INT GEMMLDA = N; INT GEMMLDB = K; beta = 0; INT GEMMLDC = N; GEMM(&transa, &transb, &GEMMM, &GEMMN, &GEMMK, &alpha, X, &GEMMLDA, S, &GEMMLDB, &beta, XSt, &GEMMLDC); DOUBLE *SXt = (DOUBLE *) MALLOC(N * K * sizeof(DOUBLE)); transpose(XSt, SXt, N, K); INT iterK; DOUBLE *dualLambdaOrig = (DOUBLE *) MALLOC(K * sizeof(DOUBLE)); if (Dorig == NULL) { srand(time(NULL)); for (iterK = 0; iterK < K; ++iterK) { dualLambdaOrig[iterK] = 10 * (DOUBLE) rand() / (DOUBLE) RAND_MAX; } } else { INT maxNK = IMAX(N, K); DOUBLE *B = (DOUBLE *) MALLOC(maxNK * maxNK * sizeof(DOUBLE)); for (iterK = 0; iterK < K; ++iterK) { datacpy(&B[iterK * maxNK], &XSt[iterK * N], K); } INT GELSYM = N; INT GELSYN = K; INT GELSYNRHS = K; INT GELSYLDA = N; INT GELSYLDB = maxNK; INT *jpvt = (INT *) MALLOC(K * sizeof(INT)); DOUBLE rcond; INT rank; INT lwork = -1; DOUBLE work_temp; DOUBLE *work; INT INFO; GELSY(&GELSYM, &GELSYN, &GELSYNRHS, Dorig, &GELSYLDA, B, &GELSYLDB, jpvt, &rcond, &rank, &work_temp, &lwork, &INFO); lwork = (INT) work_temp; work = (DOUBLE*) MALLOC(lwork * sizeof(DOUBLE)); GELSY(&GELSYM, &GELSYN, &GELSYNRHS, Dorig, &GELSYLDA, XSt, &GELSYLDB, jpvt, &rcond, &rank, work, &lwork, &INFO); for (iterK = 0; iterK < K; ++iterK) { dualLambdaOrig[K] = B[iterK * K + iterK] - SSt[iterK * K + iterK]; } FREE(work); FREE(B); FREE(jpvt); } DOUBLE *SXtXSt = (DOUBLE *) MALLOC(K * K * sizeof(DOUBLE)); uplo = 'U'; trans = 'N'; SYRKN = K; SYRKK = N; alpha = 1; SYRKLDA = K; beta = 0; SYRKLDC = K; SYRK(&uplo, &trans, &SYRKN, &SYRKK, &alpha, SXt, &SYRKLDA, &beta, SXtXSt, &SYRKLDC); DOUBLE c = SQR(l2norm); CHAR norm = 'F'; INT LANGEM = N; INT LANGEN = numSamples; INT LANGELDA = N; DOUBLE trXXt = LANGE(&norm, &LANGEM, &LANGEN, X, &LANGELDA, NULL); trXXt = SQR(trXXt); /* DOUBLE *dualLambdaOpt = (DOUBLE *) MALLOC(K * sizeof(DOUBLE)); */ DOUBLE *dualLambdaOpt = XSt; minimize_dual(dualLambdaOpt, dualLambdaOrig, length, SSt, SXt, SXtXSt, trXXt, c, N, K); for (iterK = 0; iterK < K; ++iterK) { SSt[iterK * K + iterK] += dualLambdaOpt[iterK]; } uplo = 'U'; INT POTRSN = K; INT POTRSLDA = K; INT INFO; POTRF(&uplo, &POTRSN, SSt, &POTRSLDA, &INFO); INT POTRSNRHS = N; INT POTRSLDB = K; POTRS(&uplo, &POTRSN, &POTRSNRHS, SSt, &POTRSLDA, SXt, &POTRSLDB, &INFO); transpose(SXt, Dopt, K, N); FREE(SSt); FREE(XSt); FREE(SXt); FREE(dualLambdaOrig); FREE(SXtXSt); }
//============================================================================= int Epetra_SerialDenseSVD::Solve(void) { //FOR NOW, ONLY ALLOW SOLVES IF INVERTED!!!! //NO REFINEMENT!!! //NO EQUILIBRATION!!! // We will call one of four routines depending on what services the user wants and // whether or not the matrix has been inverted or factored already. // // If the matrix has been inverted, use DGEMM to compute solution. // Otherwise, if the user want the matrix to be equilibrated or wants a refined solution, we will // call the X interface. // Otherwise, if the matrix is already factored we will call the TRS interface. // Otherwise, if the matrix is unfactored we will call the SV interface. /* if (Equilibrate_) { ierr = EquilibrateRHS(); B_Equilibrated_ = true; } EPETRA_CHK_ERR(ierr); if (A_Equilibrated_ && !B_Equilibrated_) EPETRA_CHK_ERR(-1); // Matrix and vectors must be similarly scaled if (!A_Equilibrated_ && B_Equilibrated_) EPETRA_CHK_ERR(-2); if (B_==0) EPETRA_CHK_ERR(-3); // No B if (X_==0) EPETRA_CHK_ERR(-4); // No B if (ShouldEquilibrate() && !A_Equilibrated_) ierr = 1; // Warn that the system should be equilibrated. */ double DN = N_; double DNRHS = NRHS_; if (Inverted()) { if (B_==X_) EPETRA_CHK_ERR(-100); // B and X must be different for this case GEMM(TRANS_, 'N', N_, NRHS_, N_, 1.0, AI_, LDAI_, B_, LDB_, 0.0, X_, LDX_); if (INFO_!=0) EPETRA_CHK_ERR(INFO_); UpdateFlops(2.0*DN*DN*DNRHS); Solved_ = true; } else EPETRA_CHK_ERR(-101); //Currently, for solve must have inverse /* else { if (!Factored()) Factor(); // Matrix must be factored if (B_!=X_) *LHS_ = *RHS_; // Copy B to X if needed GETRS(TRANS_, N_, NRHS_, AF_, LDAF_, IPIV_, X_, LDX_, &INFO_); if (INFO_!=0) EPETRA_CHK_ERR(INFO_); UpdateFlops(2.0*DN*DN*DNRHS); Solved_ = true; } int ierr1=0; if (RefineSolution_ && !Inverted()) ierr1 = ApplyRefinement(); if (ierr1!=0) EPETRA_CHK_ERR(ierr1) else EPETRA_CHK_ERR(ierr); if (Equilibrate_) ierr1 = UnequilibrateLHS(); EPETRA_CHK_ERR(ierr1); */ return(0); }
//============================================================================= int Epetra_SerialSpdDenseSolver::Solve(void) { int ierr = 0; // We will call one of four routines depending on what services the user wants and // whether or not the matrix has been inverted or factored already. // // If the matrix has been inverted, use DGEMM to compute solution. // Otherwise, if the user want the matrix to be equilibrated or wants a refined solution, we will // call the X interface. // Otherwise, if the matrix is already factored we will call the TRS interface. // Otherwise, if the matrix is unfactored we will call the SV interface. if (Equilibrate_) { ierr = Epetra_SerialDenseSolver::EquilibrateRHS(); B_Equilibrated_ = true; } EPETRA_CHK_ERR(ierr); if (A_Equilibrated_ && !B_Equilibrated_) EPETRA_CHK_ERR(-1); // Matrix and vectors must be similarly scaled if (!A_Equilibrated_ && B_Equilibrated_) EPETRA_CHK_ERR(-2); if (B_==0) EPETRA_CHK_ERR(-3); // No B if (X_==0) EPETRA_CHK_ERR(-4); // No B if (ShouldEquilibrate() && !A_Equilibrated_) ierr = 1; // Warn that the system should be equilibrated. double DN = N_; double DNRHS = NRHS_; if (Inverted()) { if (B_==X_) EPETRA_CHK_ERR(-100); // B and X must be different for this case GEMM('N', 'N', N_, NRHS_, N_, 1.0, AF_, LDAF_, B_, LDB_, 0.0, X_, LDX_); if (INFO_!=0) EPETRA_CHK_ERR(INFO_); UpdateFlops(2.0*DN*DN*DNRHS); Solved_ = true; } else { if (!Factored()) Factor(); // Matrix must be factored if (B_!=X_) { *LHS_ = *RHS_; // Copy B to X if needed X_ = LHS_->A(); LDX_ = LHS_->LDA(); } POTRS(SymMatrix_->UPLO(), N_, NRHS_, AF_, LDAF_, X_, LDX_, &INFO_); if (INFO_!=0) EPETRA_CHK_ERR(INFO_); UpdateFlops(2.0*DN*DN*DNRHS); Solved_ = true; } int ierr1=0; if (RefineSolution_) ierr1 = ApplyRefinement(); if (ierr1!=0) { EPETRA_CHK_ERR(ierr1); } else { EPETRA_CHK_ERR(ierr); } if (Equilibrate_) ierr1 = Epetra_SerialDenseSolver::UnequilibrateLHS(); EPETRA_CHK_ERR(ierr1); return(0); }
int gpu_gemm(const real *h_A, const real *h_B, real *h_C, const real alpha, const real beta, const int N) { real *d_A = 0; real *d_B = 0; real *d_C = 0; int n2 = N * N; cublasStatus_t status; cublasHandle_t handle; status = cublasCreate(&handle); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! CUBLAS initialization error\n"); return EXIT_FAILURE; } /* Allocate device memory for the matrices */ if (cudaMalloc((void **)&d_A, n2 * sizeof(d_A[0])) != cudaSuccess) { fprintf(stderr, "!!!! device memory allocation error (allocate A)\n"); return EXIT_FAILURE; } if (cudaMalloc((void **)&d_B, n2 * sizeof(d_B[0])) != cudaSuccess) { fprintf(stderr, "!!!! device memory allocation error (allocate B)\n"); return EXIT_FAILURE; } if (cudaMalloc((void **)&d_C, n2 * sizeof(d_C[0])) != cudaSuccess) { fprintf(stderr, "!!!! device memory allocation error (allocate C)\n"); return EXIT_FAILURE; } /* Initialize the device matrices with the host matrices */ status = cublasSetVector(n2, sizeof(h_A[0]), h_A, 1, d_A, 1); status = cublasSetVector(n2, sizeof(h_B[0]), h_B, 1, d_B, 1); status = cublasSetVector(n2, sizeof(h_C[0]), h_C, 1, d_C, 1); /* Performs operation using cublas */ status = GEMM(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A, N, d_B, N, &beta, d_C, N); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! kernel execution error.\n"); return EXIT_FAILURE; } /* Read the result back */ status = cublasGetVector(n2, sizeof(h_C[0]), d_C, 1, h_C, 1); if (cudaFree(d_A) != cudaSuccess) { fprintf(stderr, "!!!! memory free error (A)\n"); return EXIT_FAILURE; } if (cudaFree(d_B) != cudaSuccess) { fprintf(stderr, "!!!! memory free error (B)\n"); return EXIT_FAILURE; } if (cudaFree(d_C) != cudaSuccess) { fprintf(stderr, "!!!! memory free error (C)\n"); return EXIT_FAILURE; } /* Shutdown */ status = cublasDestroy(handle); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! shutdown error (A)\n"); return EXIT_FAILURE; } return 0; }
void nuclear_hard_thresholding(DOUBLE *X, DOUBLE *norm, INT rank, INT M, INT N, DOUBLE *sv, \ DOUBLE *svecsmall, DOUBLE *sveclarge, DOUBLE *work, INT lwork) { INT MINMN = IMIN(M, N); INT MAXMN = IMAX(M, N); INT svFlag = 0; if (sv == NULL) { sv = (DOUBLE *) malloc(MINMN * 1 * sizeof(DOUBLE)); svFlag = 1; } INT svecsmallFlag = 0; if (svecsmall == NULL) { svecsmall = (DOUBLE *) malloc(MINMN * MINMN * sizeof(DOUBLE)); svecsmallFlag = 1; } INT sveclargeFlag = 0; if (sveclarge == NULL) { sveclarge = (DOUBLE *) malloc(MAXMN * MINMN * sizeof(DOUBLE)); sveclargeFlag = 1; } CHAR jobu = 'S'; CHAR jobvt = 'S'; DOUBLE *u; DOUBLE *vt; if (MAXMN == M) { u = sveclarge; vt = svecsmall; } else { u = svecsmall; vt = sveclarge; } INT GESVDM = M; INT GESVDN = N; INT GESVDLDA = M; INT GESVDLDU = M; INT GESVDLDVT = MINMN; INT info; if (lwork == -1) { GESVD(&jobu, &jobvt, &GESVDM, &GESVDN, X, &GESVDLDA, sv, u, &GESVDLDU, vt, &GESVDLDVT, work, &lwork, &info); if (svFlag == 1) { free(sv); } if (svecsmallFlag == 1) { free(svecsmall); } if (sveclargeFlag == 1) { free(sveclarge); } return; } INT workFlag = 0; if (lwork == 0) { DOUBLE workTemp; lwork = -1; GESVD(&jobu, &jobvt, &GESVDM, &GESVDN, X, &GESVDLDA, sv, u, &GESVDLDU, vt, &GESVDLDVT, &workTemp, &lwork, &info); if (info != 0) { PRINTF("Error, INFO = %d. ", info); ERROR("LAPACK error."); } lwork = (INT) workTemp; work = (DOUBLE *) malloc(lwork * 1 * sizeof(DOUBLE)); workFlag = 1; } GESVD(&jobu, &jobvt, &GESVDM, &GESVDN, X, &GESVDLDA, sv, u, &GESVDLDU, vt, &GESVDLDVT, work, &lwork, &info); if (info != 0) { PRINTF("Error, INFO = %d. ", info); ERROR("LAPACK error."); } INT iterMN; DOUBLE normtemp = 0; for (iterMN = 0; iterMN < rank; ++iterMN) { normtemp += sv[iterMN]; } if (norm != NULL) { *norm = normtemp; } for (iterMN = rank; iterMN < MINMN; ++iterMN) { sv[iterMN] = 0; } /* * TODO: Only multiply for singular vectors corresponding to non-zero singular values. */ if (MAXMN == M) { INT SCALN = M; INT incx = 1; for (iterMN = 0; iterMN < MINMN; ++iterMN) { SCAL(&SCALN, &sv[iterMN], &u[iterMN * M], &incx); } CHAR transa = 'N'; CHAR transb = 'N'; INT GEMMM = M; INT GEMMN = N; INT GEMMK = MINMN; DOUBLE alpha = 1; INT GEMMLDA = M; INT GEMMLDB = MINMN; DOUBLE beta = 0; INT GEMMLDC = M; GEMM(&transa, &transb, &GEMMM, &GEMMN, &GEMMK, &alpha, u, &GEMMLDA, vt, &GEMMLDB, &beta, X, &GEMMLDC); } else { INT SCALN = M; INT incx = 1; for (iterMN = 0; iterMN < MINMN; ++iterMN) { SCAL(&SCALN, &sv[iterMN], &u[iterMN * M], &incx); } CHAR transa = 'N'; CHAR transb = 'N'; INT GEMMM = M; INT GEMMN = N; INT GEMMK = MINMN; DOUBLE alpha = 1; INT GEMMLDA = M; INT GEMMLDB = MINMN; DOUBLE beta = 0; INT GEMMLDC = M; GEMM(&transa, &transb, &GEMMM, &GEMMN, &GEMMK, &alpha, u, &GEMMLDA, vt, &GEMMLDB, &beta, X, &GEMMLDC); } if (svFlag == 1) { free(sv); } if (svecsmallFlag == 1) { free(svecsmall); } if (sveclargeFlag == 1) { free(sveclarge); } if (workFlag == 1) { free(work); } }
// TODO: Change so that adaptive depending on M and N // TODO: Check what standard I use for copy of original data, and get rid of it // if not necessary. void nuclear_approx_obj_grad(DOUBLE *obj, DOUBLE *deriv, DOUBLE *X, DOUBLE *rp, \ INT M, INT N, INT derivFlag, DOUBLE *svdVec, DOUBLE *vtMat, \ DOUBLE *dataBuffer, DOUBLE *derivVec, DOUBLE *work, INT lwork) { INT MN = IMIN(M, N); if (lwork == -1) { CHAR jobu; CHAR jobvt; if (derivFlag == 1) { jobu = 'O'; jobvt = 'S'; } else { jobu = 'N'; jobvt = 'N'; } INT GESVDM = M; INT GESVDN = N; INT GESVDLDA = M; INT GESVDLDU = M; INT GESVDLDVT = MN; INT INFO; GESVD(&jobu, &jobvt, &GESVDM, &GESVDN, dataBuffer, &GESVDLDA, svdVec, NULL, &GESVDLDU, vtMat, &GESVDLDVT, work, &lwork, &INFO); if (INFO != 0) { PRINTF("Error, INFO = %d. ", INFO); ERROR("LAPACK error."); } return; } INT svdVecFlag = 0; if (svdVec == NULL) { svdVec = (DOUBLE *) MALLOC(1 * MN * sizeof(DOUBLE)); svdVecFlag = 1; } INT derivVecFlag = 0; if ((derivVec == NULL) && (derivFlag == 1)) { derivVec = (DOUBLE *) MALLOC(1 * MN * sizeof(DOUBLE)); derivVecFlag = 1; } INT vtMatFlag = 0; if ((vtMat == NULL) && (derivFlag == 1)) { vtMat = (DOUBLE *) MALLOC(MN * N * sizeof(DOUBLE)); vtMatFlag = 1; } INT dataBufferFlag = 0; if (dataBuffer == NULL) { dataBuffer = (DOUBLE *) MALLOC(M * N * sizeof(DOUBLE)); dataBufferFlag = 1; } INT workFlag = 0; if (work == NULL) { workFlag = 1; } CHAR jobu; CHAR jobvt; if (derivFlag == 1) { jobu = 'O'; jobvt = 'S'; } else { jobu = 'N'; jobvt = 'N'; } datacpy(dataBuffer, X, M * N); INT GESVDM = M; INT GESVDN = N; INT GESVDLDA = M; INT GESVDLDU = M; INT GESVDLDVT = MN; INT INFO; if (workFlag == 1) { lwork = -1; DOUBLE work_temp; GESVD(&jobu, &jobvt, &GESVDM, &GESVDN, dataBuffer, &GESVDLDA, svdVec, NULL, &GESVDLDU, vtMat, &GESVDLDVT, &work_temp, &lwork, &INFO); if (INFO != 0) { PRINTF("Error, INFO = %d. ", INFO); ERROR("LAPACK error."); } lwork = (INT) work_temp; work = (DOUBLE*) MALLOC(lwork * sizeof(DOUBLE)); } GESVD(&jobu, &jobvt, &GESVDM, &GESVDN, dataBuffer, &GESVDLDA, svdVec, NULL, &GESVDLDU, vtMat, &GESVDLDVT, work, &lwork, &INFO); if (INFO != 0) { PRINTF("Error, INFO = %d. ", INFO); ERROR("LAPACK error."); } abs_smooth_obj_grad(svdVec, derivVec, svdVec, rp, MN, derivFlag); INT ASUMN = MN; INT incx = 1; *obj = ASUM(&ASUMN, svdVec, &incx); if (derivFlag == 1) { INT iterMN; INT SCALN = M; INT incx = 1; DOUBLE alpha; for (iterMN = 0; iterMN < MN; ++iterMN) { alpha = derivVec[iterMN]; SCAL(&SCALN, &alpha, &dataBuffer[iterMN * M], &incx); } CHAR transa = 'N'; CHAR transb = 'N'; INT GEMMM = M; INT GEMMN = N; INT GEMMK = MN; alpha = 1.0; INT GEMMLDA = M; INT GEMMLDB = MN; DOUBLE beta = 0.0; INT GEMMLDC = M; GEMM(&transa, &transb, &GEMMM, &GEMMN, &GEMMK, &alpha, dataBuffer, &GEMMLDA, vtMat, &GEMMLDB, &beta, deriv, &GEMMLDC); } if (svdVecFlag == 1) { FREE(svdVec); } if (derivVecFlag == 1) { FREE(derivVec); } if (vtMatFlag == 1) { FREE(vtMat); } if (dataBufferFlag == 1) { FREE(dataBuffer); } if (workFlag == 1) { FREE(work); } }