C++ (Cpp) GEMMの例 - HotExamples

コード例 #1

0

ファイルを表示

ファイル: tdot_ord.c プロジェクト: frobnitzem/slack

// Tensors must be sorted from most to least significant dimension.
// nc is the number of contracted indices, and
// ind is an array of (nx by 2) integers (pairs of contracted indices).
Tensor *tensdot(Tensor *a, Tensor *b, int nc, int *ind, int nref, Slice *m) {
    int i = 0;
    int ord_a, ord_b;
    int *anum, *bnum;
    int an, bn;
    Tensor *t;

#ifdef PARANOID
    if(nc > a->n || nc > b->n) {
        fprintf(stderr, "tensdor: more contracted than input dims.\n");
        exit(1);
    }
#endif
    anum = number_indices(a->n, nc, ind);
    bnum = number_indices(b->n, nc, ind+1);

    t = tensor_ctor(a->n + b->n - 2*nc);
    an = copy_shape(a, anum, t->shape);
    bn = copy_shape(b, bnum, t->shape + a->n - nc);
    t->len = an*bn;
    t->b = reserve_block(m, nref, t->len*sizeof(float));

    if(nc == 0) { // trivial case A <- a X Y^T + A, A m x n
        GER(bn, an, 1.0, b->b->x, 1, a->b->x, 1,
                t->b->x, an);
        //GER(bn, an, a->scale*b->scale, b->b->x, 1, a->b->x, 1,
        //        t->b->x, an);
        free(anum); free(bnum);
        return t;
    }

    if( (ord_a = ck_ordered(a->n, anum))) {
        if( (ord_b = ck_ordered(b->n, bnum))
                && ck_same(nc, ind)) { // Straightforward dgemm.
            GEMM(ord_b == 1, ord_a == -1, bn, an, a->len / an,
                    1.0, a->b->x, an, b->b->x, bn,
                    0.0, t->b->x, bn);
            //GEMM(ord_b == 1, ord_a == -1, bn, an, a->len / an,
            //      a->scale*b->scale, a->b->x, an, b->b->x, bn,
            //      0.0, t->b->x, bn);
        // TODO: decide if transposing A would give better perf.
        } else { // Need to transpose B.
            printf("Need to transpose B.\n");
        }
    } else if( (ord_b = ck_ordered(b->n, bnum))) { // Need to transpose A
            printf("Need to transpose A.\n");
    } else { // Need to transpose both A and B.
            printf("Need to transpose A & B.\n");
    }

    free(anum);
    free(bnum);
    return t;
}

コード例 #2

0

ファイルを表示

ファイル: blas.c プロジェクト: ngjw/mlgp

void MLGP_GEMM(char transA, char transB, unsigned M, unsigned N, unsigned K,
               FLOAT a, FLOAT* A, unsigned LDA, FLOAT* B, unsigned LDB,
               FLOAT b, FLOAT* C, unsigned LDC)
{
  #ifdef DOUBLE
  #define GEMM(...) dgemm_(__VA_ARGS__)
  #else
  #define GEMM(...) sgemm_(__VA_ARGS__)
  #endif
  return GEMM(&transA, &transB, &M, &N, &K, &a, A, &LDA, B, &LDB, &b, C, &LDC);
  
}

コード例 #3

0

ファイルを表示

ファイル: misc_core.c プロジェクト: diehard2/stochfit

/* blocked multiplication of the transpose of the nxm matrix a with itself (i.e. a^T a)
 * using a block size of bsize. The product is returned in b.
 * Since a^T a is symmetric, its computation can be speeded up by computing only its
 * upper triangular part and copying it to the lower part.
 *
 * More details on blocking can be found at 
 * http://www-2.cs.cmu.edu/afs/cs/academic/class/15213-f02/www/R07/section_a/Recitation07-SectionA.pdf
 */
void TRANS_MAT_MAT_MULT(LM_REAL *a, LM_REAL *b, int n, int m)
{
#ifdef HAVE_LAPACK /* use BLAS matrix multiply */

LM_REAL alpha=CNST(1.0), beta=CNST(0.0);
  /* Fool BLAS to compute a^T*a avoiding transposing a: a is equivalent to a^T in column major,
   * therefore BLAS computes a*a^T with a and a*a^T in column major, which is equivalent to
   * computing a^T*a in row major!
   */
  GEMM("N", "T", &m, &m, &n, &alpha, a, &m, a, &m, &beta, b, &m);

#else /* no LAPACK, use blocking-based multiply */

register int i, j, k, jj, kk;
register LM_REAL sum, *bim, *akm;
const int bsize=__BLOCKSZ__;

#define __MIN__(x, y) (((x)<=(y))? (x) : (y))
#define __MAX__(x, y) (((x)>=(y))? (x) : (y))

  /* compute upper triangular part using blocking */
  for(jj=0; jj<m; jj+=bsize){
    for(i=0; i<m; ++i){
      bim=b+i*m;
      for(j=__MAX__(jj, i); j<__MIN__(jj+bsize, m); ++j)
        bim[j]=0.0; //b[i*m+j]=0.0;
    }

    for(kk=0; kk<n; kk+=bsize){
      for(i=0; i<m; ++i){
        bim=b+i*m;
        for(j=__MAX__(jj, i); j<__MIN__(jj+bsize, m); ++j){
          sum=0.0;
          for(k=kk; k<__MIN__(kk+bsize, n); ++k){
            akm=a+k*m;
            sum+=akm[i]*akm[j]; //a[k*m+i]*a[k*m+j];
          }
          bim[j]+=sum; //b[i*m+j]+=sum;
        }
      }
    }
  }

  /* copy upper triangular part to the lower one */
  for(i=0; i<m; ++i)
    for(j=0; j<i; ++j)
      b[i*m+j]=b[j*m+i];

#undef __MIN__
#undef __MAX__

#endif /* HAVE_LAPACK */
}

コード例 #4

0

ファイルを表示

ファイル: Epetra_SerialDenseSVD.cpp プロジェクト: 00liujj/trilinos

//=============================================================================
int Epetra_SerialDenseSVD::Invert( double rthresh, double athresh )
{
  if (!Factored()) Factor(); // Need matrix factored.

  //apply threshold
  double thresh = S_[0]*rthresh + athresh;
  int num_replaced = 0;
  for( int i = 0; i < M_; ++i )
    if( S_[i] < thresh )
    {
//cout <<  num_replaced << thresh << " " << S_[0] << " " << S_[i] << std::endl;
//      S_[i] = thresh;
      S_[i] = 0.0;
      ++num_replaced;
    }

  //scale cols of U_ with reciprocal singular values
  double *p = U_;
  for( int i = 0; i < N_; ++i )
  {
    double scale = 0.0;
    if( S_[i] ) scale = 1./S_[i];
    for( int j = 0; j < M_; ++j ) *p++ *= scale;
  }

  //create new Inverse_ if necessary
  if( Inverse_ == 0 )
  {
    Inverse_ = new Epetra_SerialDenseMatrix();
    Inverse_->Shape( N_, M_ );
    AI_ = Inverse_->A();
    LDAI_ = Inverse_->LDA();
  }
/*
  else //zero it out
  {
    for( int i = 0; i < Inverse_->M(); ++i )
      for( int j = 0; j < Inverse_->N(); ++j )
        (*Inverse_)(i,j) = 0.0;
  }
*/

  GEMM( 'T', 'T', M_, M_, M_, 1.0, Vt_, M_, U_, M_, 0.0, AI_, M_ );

  double DN = N_;
  UpdateFlops((DN*DN*DN));
  Inverted_ = true;
  Factored_ = false;

  EPETRA_CHK_ERR(INFO_);
  return(num_replaced);
}

コード例 #5

0

ファイルを表示

ファイル: Mat_COL.cpp プロジェクト: Chang-Liu-0520/nodal-dg

//---------------------------------------------------------
void umAxB(const DMat& A, const DMat& B, DMat& C)
//---------------------------------------------------------
{
  //-------------------------
  // C = A * B
  //-------------------------
  // A = op(A) is (M,K)
  // B = op(B) is (K,N)
  //        C  is (M,N)
  //-------------------------
  int M=A.num_rows(), K=A.num_cols(), N=B.num_cols();
  int LDA=M, LDB=K, LDC=M;
  double one=1.0, zero=0.0;
  if (B.num_rows() != K) { umERROR("umAxB(A,B,C)", "wrong dimensions"); }
  C.resize(M,N);

  GEMM ('N','N',M,N,K, one,A.data(),LDA, 
                           B.data(),LDB, 
                      zero,C.data(),LDC);
}

コード例 #6

0

ファイルを表示

ファイル: gemm.c プロジェクト: balagopalraj/clearlinux

int main(int argc, char *argv[]) {

    FLOAT *a, *b, *c;
    FLOAT alpha[] = {1.0, 1.0};
    FLOAT beta [] = {1.0, 1.0};
    char trans='N';
    blasint m, n, i, j;
    int loops = 1;
    int has_param_n=0;
    int l;
    char *p;

    int from =   1;
    int to   = 200;
    int step =   1;

    struct timeval start, stop;
    double time1,timeg;

    argc--;
    argv++;

    if (argc > 0) {
        from     = atol(*argv);
        argc--;
        argv++;
    }
    if (argc > 0) {
        to       = MAX(atol(*argv), from);
        argc--;
        argv++;
    }
    if (argc > 0) {
        step     = atol(*argv);
        argc--;
        argv++;
    }

    if ((p = getenv("OPENBLAS_TRANS")))  trans=*p;

    fprintf(stderr, "From : %3d  To : %3d Step=%d : Trans=%c\n", from, to, step, trans);

    if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) {
        fprintf(stderr,"Out of Memory!!\n");
        exit(1);
    }

    if (( b = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) {
        fprintf(stderr,"Out of Memory!!\n");
        exit(1);
    }

    if (( c = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) {
        fprintf(stderr,"Out of Memory!!\n");
        exit(1);
    }

    p = getenv("OPENBLAS_LOOPS");
    if ( p != NULL )
        loops = atoi(p);

    if ((p = getenv("OPENBLAS_PARAM_N"))) {
        n = atoi(p);
        has_param_n=1;
    }


#ifdef linux
    srandom(getpid());
#endif

    fprintf(stderr, "   SIZE       Flops\n");

    for(m = from; m <= to; m += step)
    {

        timeg=0;

        if ( has_param_n == 1 && n <= m )
            n=n;
        else
            n=m;



        fprintf(stderr, " %6dx%d : ", (int)m, (int)n);

        for (l=0; l<loops; l++)
        {

            for(j = 0; j < m; j++) {
                for(i = 0; i < m * COMPSIZE; i++) {
                    a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
                    b[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
                    c[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
                }
            }

            gettimeofday( &start, (struct timezone *)0);

            GEMM (&trans, &trans, &m, &n, &m, alpha, a, &m, b, &m, beta, c, &m );

            gettimeofday( &stop, (struct timezone *)0);

            time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;

            timeg += time1;

        }

        timeg /= loops;
        fprintf(stderr,
                " %10.2f MFlops\n",
                COMPSIZE * COMPSIZE * 2. * (double)m * (double)m * (double)n / timeg * 1.e-6);

    }

    return 0;
}

コード例 #7

0

ファイルを表示

ファイル: ilmatrixoperations.cpp プロジェクト: atn19808/objectdet

//************************************************************************
// ilEuclidDist
//   Computes Euclidean distance between sets of points in src1 and src2.
//   src1 is m1*n array; src2 is m2*n array where m1 and m2 are
//   the number of points in each set and n is the dimensionality
//   of the Euclidean space. The result dst is an m1*m2 matrix
//   with distances between all pairs of points.
//   If m1=m2=1 Euclidean distance between two vectors is computed.
//************************************************************************
void ilEuclidDist(pMat const& src1, pMat const& src2, pMat dst)
{
    int m1 = src1->rows;
    int m2 = src2->rows;
    int n = src1->cols;
    int type=src1->type;

#if 0 
    //*** matrix version
    pMat ones_nm2 = CreateMat(n, m2, type);
    pMat ones_m1n = CreateMat(m1, n, type);
    pMat src1sq = CreateMat(m1, n, type);
    pMat src2sq = CreateMat(m2, n, type);
    pMat res1 = CreateMat(m1, m2, type);
    pMat res2 = CreateMat(m1, m2, type);

    Multiply(src1, src1, src1sq);
    Multiply(src2, src2, src2sq);

    SetValue(ones_nm2, cvScalar(1.0));
    SetValue(ones_m1n, cvScalar(1.0));

    //*** Euclidean distance with matrix multiplications
    MatrixMultiply(src1sq, ones_nm2, res1);
    GEMM(ones_m1n, src2sq, 1.0, res1, 1, res2, CV_GEMM_B_T);
    GEMM(src1, src2, -2.0, res2, 1, dst, CV_GEMM_B_T);
    PowerMatrix(dst, dst, .5);
#endif

#if 1
    //*** element-wise version
    pMat src1sq = CreateMat(m1, n, type);
    pMat src2sq = CreateMat(m2, n, type);
    Multiply(src1, src1, src1sq);
    Multiply(src2, src2, src2sq);

    float *ptrsrc1sq, *ptrsrc2sq, *ptrdst;
    ptrsrc1sq = Ptr2D<float>(src1sq);
    ptrsrc2sq = Ptr2D<float>(src2sq);
    ptrdst = Ptr2D<float>(dst);
    int indsrc1sq, indsrc2sq, inddst=0;
    float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, vdst;
    for(int i = 0; i<m1; ++i)
    {
        indsrc1sq = i * n;
        v0 = ptrsrc1sq[indsrc1sq + 0];
        v1 = ptrsrc1sq[indsrc1sq + 1];
        v2 = ptrsrc1sq[indsrc1sq + 2];
        v3 = ptrsrc1sq[indsrc1sq + 3];
        v4 = ptrsrc1sq[indsrc1sq + 4];
        v5 = ptrsrc1sq[indsrc1sq + 5];
        v6 = ptrsrc1sq[indsrc1sq + 6];
        v7 = ptrsrc1sq[indsrc1sq + 7];
        v8 = ptrsrc1sq[indsrc1sq + 8];
        v9 = ptrsrc1sq[indsrc1sq + 9];
        for (int j=0; j<m2; ++j)
        {
    	    inddst = i * m2 + j;
    	    indsrc2sq = j * n;
    	    vdst  = (v0-ptrsrc2sq[indsrc2sq])   * (v0-ptrsrc2sq[indsrc2sq]);
    	    vdst += (v1-ptrsrc2sq[indsrc2sq+1]) * (v1-ptrsrc2sq[indsrc2sq+1]);
    	    vdst += (v2-ptrsrc2sq[indsrc2sq+2]) * (v2-ptrsrc2sq[indsrc2sq+2]);
    	    vdst += (v3-ptrsrc2sq[indsrc2sq+3]) * (v3-ptrsrc2sq[indsrc2sq+3]);
    	    vdst += (v4-ptrsrc2sq[indsrc2sq+4]) * (v4-ptrsrc2sq[indsrc2sq+4]);
    	    vdst += (v5-ptrsrc2sq[indsrc2sq+5]) * (v5-ptrsrc2sq[indsrc2sq+5]);
    	    vdst += (v6-ptrsrc2sq[indsrc2sq+6]) * (v6-ptrsrc2sq[indsrc2sq+6]);
    	    vdst += (v7-ptrsrc2sq[indsrc2sq+7]) * (v7-ptrsrc2sq[indsrc2sq+7]);
    	    vdst += (v8-ptrsrc2sq[indsrc2sq+8]) * (v8-ptrsrc2sq[indsrc2sq+8]);
    	    vdst += (v9-ptrsrc2sq[indsrc2sq+9]) * (v9-ptrsrc2sq[indsrc2sq+9]);
    	    ptrdst[inddst] = vdst;
        }
    }
#endif
}

コード例 #8

0

ファイルを表示

ファイル: MatrixBLAS.C プロジェクト: i-maruyama/Block

void SpinAdapted::MatrixMultiply (const Matrix& a, char conjA, const Matrix& b, char conjB, Matrix& c, Real scale, double cfactor)
{
  //dmrginp.justmultiply.start();
  //dmrginp.justmultiply -> start(); //ROA
  Matrix& a_ref = const_cast<Matrix&>(a); // for BLAS calls
  Matrix& b_ref = const_cast<Matrix&>(b);
  try
    {
      int aRows = a_ref.Nrows ();
      int aCols = a_ref.Ncols ();
      int bRows = b_ref.Nrows ();
      int bCols = b_ref.Ncols ();
      int cRows = c.Nrows ();
      int cCols = c.Ncols ();
      if (conjA == 'n' && conjB == 'n')
	{	  
	  assert ((aCols == bRows) && (cRows == aRows) && (cCols == bCols));
#ifdef BLAS
	  GEMM ('n', 'n', bCols, aRows, bRows, scale, b.Store (), bCols, a.Store (), aCols, cfactor, c.Store (), bCols);
#else
	  c += (scale * a) * b;
#endif
	}
      else if (conjA == 'n' && conjB == 't')
	{
	  assert ((aCols == bCols) && (cRows == aRows) && (cCols == bRows));
#ifdef BLAS
	  GEMM ('t', 'n', bRows, aRows, bCols, scale, b.Store (), bCols, a.Store (), aCols, cfactor, c.Store (), bRows);
#else
	  c += (scale * a) * b.t ();
#endif
	} 
      else if (conjA == 't' && conjB == 'n')
	{
	  assert ((aRows == bRows) && (cRows == aCols) && (cCols == bCols));
#ifdef BLAS
	  GEMM ('n', 't', bCols, aCols, bRows, scale, b.Store (), bCols, a.Store (), aCols, cfactor, c.Store (), bCols);
#else
	c += (scale * a.t ()) * b;
#endif
	}
      else if (conjA == 't' && conjB == 't')
	{
	  assert ((aRows == bCols) && (cRows == aCols) && (cCols == bRows));
#ifdef BLAS
	  GEMM ('t', 't', bRows, aCols, bCols, scale, b.Store (), bCols, a.Store (), aCols, cfactor, c.Store (), bRows);
#else
	  c += (scale * a.t ()) * b.t ();
#endif
	}
      else
	abort ();
    }
  catch (Exception)
    {
      pout << Exception::what () << endl;
      abort ();
   }
  //dmrginp.justmultiply.stop();
  //dmrginp.justmultiply -> stop(); //ROA
}

コード例 #9

0

ファイルを表示

ファイル: l2ls_learn_basis.cpp プロジェクト: pfShawn/sparse_linear_model

void l2ls_learn_basis_dual(DOUBLE *Dopt, DOUBLE *Dorig, DOUBLE *X, DOUBLE *S, DOUBLE l2norm, INT length, INT N, INT K, INT numSamples) {

	DOUBLE *SSt = (DOUBLE *) MALLOC(K * K * sizeof(DOUBLE));
	
	CHAR uplo = 'U';
	CHAR trans = 'N';
	INT SYRKN = K;
	INT SYRKK = numSamples;
	DOUBLE alpha = 1;
	INT SYRKLDA = K;
	DOUBLE beta = 0;
	INT SYRKLDC = K;
	
	SYRK(&uplo, &trans, &SYRKN, &SYRKK, &alpha, S, &SYRKLDA, &beta, SSt, &SYRKLDC);
	
	DOUBLE *XSt = (DOUBLE *) MALLOC(N * K * sizeof(DOUBLE));

	CHAR transa = 'N';
	CHAR transb = 'T';
	INT GEMMM = N;
	INT GEMMN = K;
	INT GEMMK = numSamples;
	alpha = 1;
	INT GEMMLDA = N;
	INT GEMMLDB = K;
	beta = 0;
	INT GEMMLDC = N;
	
	GEMM(&transa, &transb, &GEMMM, &GEMMN, &GEMMK, &alpha, X, &GEMMLDA, S, &GEMMLDB, &beta, XSt, &GEMMLDC);
	DOUBLE *SXt = (DOUBLE *) MALLOC(N * K * sizeof(DOUBLE));
	transpose(XSt, SXt, N, K);
	
	INT iterK;	
	DOUBLE *dualLambdaOrig = (DOUBLE *) MALLOC(K * sizeof(DOUBLE));
	if (Dorig == NULL) {
		srand(time(NULL));
		for (iterK = 0; iterK < K; ++iterK) {
			dualLambdaOrig[iterK] = 10 * (DOUBLE) rand() / (DOUBLE) RAND_MAX;
		}
	} else {
		
		INT maxNK = IMAX(N, K);
		DOUBLE *B = (DOUBLE *) MALLOC(maxNK * maxNK * sizeof(DOUBLE));
		for (iterK = 0; iterK < K; ++iterK) {
			datacpy(&B[iterK * maxNK], &XSt[iterK * N], K);
		}
		
		INT GELSYM = N;
		INT GELSYN = K;
		INT GELSYNRHS = K;
		INT GELSYLDA = N;
		INT GELSYLDB = maxNK;
		INT *jpvt = (INT *) MALLOC(K * sizeof(INT));
		DOUBLE rcond;
		INT rank;
		INT lwork = -1;
		DOUBLE work_temp;
		DOUBLE *work;
		INT INFO;

		GELSY(&GELSYM, &GELSYN, &GELSYNRHS, Dorig, &GELSYLDA, B, &GELSYLDB, jpvt, &rcond, &rank, &work_temp, &lwork, &INFO);
		
		lwork = (INT) work_temp;
		work = (DOUBLE*) MALLOC(lwork * sizeof(DOUBLE));

		
		GELSY(&GELSYM, &GELSYN, &GELSYNRHS, Dorig, &GELSYLDA, XSt, &GELSYLDB, jpvt, &rcond, &rank, work, &lwork, &INFO);

		for (iterK = 0; iterK < K; ++iterK) {
			dualLambdaOrig[K] = B[iterK * K + iterK] - SSt[iterK * K + iterK];
		}
		
		FREE(work);
		FREE(B);
		FREE(jpvt);
	}

	DOUBLE *SXtXSt = (DOUBLE *) MALLOC(K * K * sizeof(DOUBLE));
	
	uplo = 'U';
	trans = 'N';
	SYRKN = K;
	SYRKK = N;
	alpha = 1;
	SYRKLDA = K;
	beta = 0;
	SYRKLDC = K;
	
	SYRK(&uplo, &trans, &SYRKN, &SYRKK, &alpha, SXt, &SYRKLDA, &beta, SXtXSt, &SYRKLDC);

	DOUBLE c = SQR(l2norm);

	CHAR norm = 'F';
	INT LANGEM = N;
	INT LANGEN = numSamples;
	INT LANGELDA = N;
	
	DOUBLE trXXt = LANGE(&norm, &LANGEM, &LANGEN, X, &LANGELDA, NULL);
	trXXt = SQR(trXXt);
	
/*
	DOUBLE *dualLambdaOpt = (DOUBLE *) MALLOC(K * sizeof(DOUBLE));
*/
	DOUBLE *dualLambdaOpt = XSt;
	minimize_dual(dualLambdaOpt, dualLambdaOrig, length, SSt, SXt, SXtXSt, trXXt, c, N, K);

	for (iterK = 0; iterK < K; ++iterK) {
		SSt[iterK * K + iterK] += dualLambdaOpt[iterK];
	}

	uplo = 'U';
	INT POTRSN = K;
	INT POTRSLDA = K;
	INT INFO;
	POTRF(&uplo, &POTRSN, SSt, &POTRSLDA, &INFO);
	
	INT POTRSNRHS = N;
	INT POTRSLDB = K;
	
	POTRS(&uplo, &POTRSN, &POTRSNRHS, SSt, &POTRSLDA, SXt, &POTRSLDB, &INFO);

	transpose(SXt, Dopt, K, N);
	
	FREE(SSt);
	FREE(XSt);
	FREE(SXt);
	FREE(dualLambdaOrig);
	FREE(SXtXSt);
}

コード例 #10

0

ファイルを表示

ファイル: Epetra_SerialDenseSVD.cpp プロジェクト: 00liujj/trilinos

//=============================================================================
int Epetra_SerialDenseSVD::Solve(void) {

  //FOR NOW, ONLY ALLOW SOLVES IF INVERTED!!!!
  //NO REFINEMENT!!!
  //NO EQUILIBRATION!!!

  // We will call one of four routines depending on what services the user wants and
  // whether or not the matrix has been inverted or factored already.
  //
  // If the matrix has been inverted, use DGEMM to compute solution.
  // Otherwise, if the user want the matrix to be equilibrated or wants a refined solution, we will
  // call the X interface.
  // Otherwise, if the matrix is already factored we will call the TRS interface.
  // Otherwise, if the matrix is unfactored we will call the SV interface.


/*
  if (Equilibrate_) {
    ierr = EquilibrateRHS();
    B_Equilibrated_ = true;
  }
  EPETRA_CHK_ERR(ierr);
  if (A_Equilibrated_ && !B_Equilibrated_) EPETRA_CHK_ERR(-1); // Matrix and vectors must be similarly scaled
  if (!A_Equilibrated_ && B_Equilibrated_) EPETRA_CHK_ERR(-2);
  if (B_==0) EPETRA_CHK_ERR(-3); // No B
  if (X_==0) EPETRA_CHK_ERR(-4); // No B

  if (ShouldEquilibrate() && !A_Equilibrated_) ierr = 1; // Warn that the system should be equilibrated.
*/

  double DN = N_;
  double DNRHS = NRHS_;
  if (Inverted()) {

    if (B_==X_) EPETRA_CHK_ERR(-100); // B and X must be different for this case

    GEMM(TRANS_, 'N', N_, NRHS_, N_, 1.0, AI_, LDAI_, B_, LDB_, 0.0, X_, LDX_);
    if (INFO_!=0) EPETRA_CHK_ERR(INFO_);
    UpdateFlops(2.0*DN*DN*DNRHS);
    Solved_ = true;
  }
  else EPETRA_CHK_ERR(-101); //Currently, for solve must have inverse
/*
  else {

    if (!Factored()) Factor(); // Matrix must be factored

    if (B_!=X_) *LHS_ = *RHS_; // Copy B to X if needed
    GETRS(TRANS_, N_, NRHS_, AF_, LDAF_, IPIV_, X_, LDX_, &INFO_);
    if (INFO_!=0) EPETRA_CHK_ERR(INFO_);
    UpdateFlops(2.0*DN*DN*DNRHS);
    Solved_ = true;

  }

  int ierr1=0;
  if (RefineSolution_ && !Inverted()) ierr1 = ApplyRefinement();
  if (ierr1!=0) EPETRA_CHK_ERR(ierr1)
  else
    EPETRA_CHK_ERR(ierr);

  if (Equilibrate_) ierr1 = UnequilibrateLHS();
  EPETRA_CHK_ERR(ierr1);
*/
  return(0);
}

コード例 #11

0

ファイルを表示

ファイル: Epetra_SerialSpdDenseSolver.cpp プロジェクト: haripandey/trilinos

//=============================================================================
int Epetra_SerialSpdDenseSolver::Solve(void) {
  int ierr = 0;

  // We will call one of four routines depending on what services the user wants and 
  // whether or not the matrix has been inverted or factored already.
  //
  // If the matrix has been inverted, use DGEMM to compute solution.
  // Otherwise, if the user want the matrix to be equilibrated or wants a refined solution, we will
  // call the X interface.
  // Otherwise, if the matrix is already factored we will call the TRS interface.
  // Otherwise, if the matrix is unfactored we will call the SV interface.

  if (Equilibrate_) {
    ierr = Epetra_SerialDenseSolver::EquilibrateRHS();
    B_Equilibrated_ = true;
  }
  EPETRA_CHK_ERR(ierr);
  if (A_Equilibrated_ && !B_Equilibrated_) EPETRA_CHK_ERR(-1); // Matrix and vectors must be similarly scaled
  if (!A_Equilibrated_ && B_Equilibrated_) EPETRA_CHK_ERR(-2);
  if (B_==0) EPETRA_CHK_ERR(-3); // No B
  if (X_==0) EPETRA_CHK_ERR(-4); // No B

  if (ShouldEquilibrate() && !A_Equilibrated_) ierr = 1; // Warn that the system should be equilibrated.

  double DN = N_;
  double DNRHS = NRHS_;
  if (Inverted()) {

    if (B_==X_) EPETRA_CHK_ERR(-100); // B and X must be different for this case
    GEMM('N', 'N', N_, NRHS_, N_, 1.0, AF_, LDAF_,
		B_, LDB_, 0.0, X_, LDX_);
    if (INFO_!=0) EPETRA_CHK_ERR(INFO_);
    UpdateFlops(2.0*DN*DN*DNRHS);
    Solved_ = true;
  }
  else {

    if (!Factored()) Factor(); // Matrix must be factored
    if (B_!=X_) {
       *LHS_ = *RHS_; // Copy B to X if needed 
       X_ = LHS_->A(); LDX_ = LHS_->LDA();
    }
    
    POTRS(SymMatrix_->UPLO(), N_, NRHS_, AF_, LDAF_, X_, LDX_, &INFO_);
    if (INFO_!=0) EPETRA_CHK_ERR(INFO_);
    UpdateFlops(2.0*DN*DN*DNRHS);
    Solved_ = true;

  }
  int ierr1=0;
  if (RefineSolution_) ierr1 = ApplyRefinement();
  if (ierr1!=0) {
    EPETRA_CHK_ERR(ierr1);
  }
  else {
    EPETRA_CHK_ERR(ierr);
  }
  if (Equilibrate_) ierr1 = Epetra_SerialDenseSolver::UnequilibrateLHS();
  EPETRA_CHK_ERR(ierr1);
  return(0);
}

コード例 #12

0

ファイルを表示

ファイル: simpleCUBLAS.cpp プロジェクト: sinanshi/gpu_benchmark

int gpu_gemm(const real *h_A, const real *h_B, real *h_C, const real alpha,
    const real beta, const int N)
{
  real *d_A = 0;
  real *d_B = 0;
  real *d_C = 0;
  int n2 = N * N;

  cublasStatus_t status;
  cublasHandle_t handle;

  status = cublasCreate(&handle);
  if (status != CUBLAS_STATUS_SUCCESS)
  {
    fprintf(stderr, "!!!! CUBLAS initialization error\n");
    return EXIT_FAILURE;
  }
  /* Allocate device memory for the matrices */
  if (cudaMalloc((void **)&d_A, n2 * sizeof(d_A[0])) != cudaSuccess)
  {
    fprintf(stderr, "!!!! device memory allocation error (allocate A)\n");
    return EXIT_FAILURE;
  }
  if (cudaMalloc((void **)&d_B, n2 * sizeof(d_B[0])) != cudaSuccess)
  {
    fprintf(stderr, "!!!! device memory allocation error (allocate B)\n");
    return EXIT_FAILURE;
  }

  if (cudaMalloc((void **)&d_C, n2 * sizeof(d_C[0])) != cudaSuccess)
  {
    fprintf(stderr, "!!!! device memory allocation error (allocate C)\n");
    return EXIT_FAILURE;
  }

  /* Initialize the device matrices with the host matrices */
  status = cublasSetVector(n2, sizeof(h_A[0]), h_A, 1, d_A, 1);
  status = cublasSetVector(n2, sizeof(h_B[0]), h_B, 1, d_B, 1);
  status = cublasSetVector(n2, sizeof(h_C[0]), h_C, 1, d_C, 1);

  /* Performs operation using cublas */
  status = GEMM(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A, N, d_B, N, &beta, d_C, N);
  if (status != CUBLAS_STATUS_SUCCESS)
  {
    fprintf(stderr, "!!!! kernel execution error.\n");
    return EXIT_FAILURE;
  }

  /* Read the result back */
  status = cublasGetVector(n2, sizeof(h_C[0]), d_C, 1, h_C, 1);

  if (cudaFree(d_A) != cudaSuccess)
  {
    fprintf(stderr, "!!!! memory free error (A)\n");
    return EXIT_FAILURE;
  }

  if (cudaFree(d_B) != cudaSuccess)
  {
    fprintf(stderr, "!!!! memory free error (B)\n");
    return EXIT_FAILURE;
  }

  if (cudaFree(d_C) != cudaSuccess)
  {
    fprintf(stderr, "!!!! memory free error (C)\n");
    return EXIT_FAILURE;
  }

  /* Shutdown */
  status = cublasDestroy(handle);

  if (status != CUBLAS_STATUS_SUCCESS)
  {
    fprintf(stderr, "!!!! shutdown error (A)\n");
    return EXIT_FAILURE;
  }

  return 0;
}

コード例 #13

0

ファイルを表示

ファイル: matrix_optimization.c プロジェクト: pfShawn/sparse_linear_model

void nuclear_hard_thresholding(DOUBLE *X, DOUBLE *norm, INT rank, INT M, INT N, DOUBLE *sv, \
		DOUBLE *svecsmall, DOUBLE *sveclarge, DOUBLE *work, INT lwork) {

	INT MINMN = IMIN(M, N);
	INT MAXMN = IMAX(M, N);

	INT svFlag = 0;
	if (sv == NULL) {
		sv = (DOUBLE *) malloc(MINMN * 1 * sizeof(DOUBLE));
		svFlag = 1;
	}

	INT svecsmallFlag = 0;
	if (svecsmall == NULL) {
		svecsmall = (DOUBLE *) malloc(MINMN * MINMN * sizeof(DOUBLE));
		svecsmallFlag = 1;
	}

	INT sveclargeFlag = 0;
	if (sveclarge == NULL) {
		sveclarge = (DOUBLE *) malloc(MAXMN * MINMN * sizeof(DOUBLE));
		sveclargeFlag = 1;
	}

	CHAR jobu = 'S';
	CHAR jobvt = 'S';
	DOUBLE *u;
	DOUBLE *vt;
	if (MAXMN == M) {
		u = sveclarge;
		vt = svecsmall;
	} else {
		u = svecsmall;
		vt = sveclarge;
	}
	INT GESVDM = M;
	INT GESVDN = N;
	INT GESVDLDA = M;
	INT GESVDLDU = M;
	INT GESVDLDVT = MINMN;
	INT info;

	if (lwork == -1) {
		GESVD(&jobu, &jobvt, &GESVDM, &GESVDN, X, &GESVDLDA, sv, u, &GESVDLDU, vt, &GESVDLDVT, work, &lwork, &info);

		if (svFlag == 1) {
			free(sv);
		}

		if (svecsmallFlag == 1) {
			free(svecsmall);
		}

		if (sveclargeFlag == 1) {
			free(sveclarge);
		}
		return;
	}

	INT workFlag = 0;
	if (lwork == 0) {
		DOUBLE workTemp;
		lwork = -1;
		GESVD(&jobu, &jobvt, &GESVDM, &GESVDN, X, &GESVDLDA, sv, u, &GESVDLDU, vt, &GESVDLDVT, &workTemp, &lwork, &info);
		if (info != 0) {
			PRINTF("Error, INFO = %d. ", info);
			ERROR("LAPACK error.");
		}

		lwork = (INT) workTemp;
		work = (DOUBLE *) malloc(lwork * 1 * sizeof(DOUBLE));
		workFlag = 1;
	}

	GESVD(&jobu, &jobvt, &GESVDM, &GESVDN, X, &GESVDLDA, sv, u, &GESVDLDU, vt, &GESVDLDVT, work, &lwork, &info);
	if (info != 0) {
		PRINTF("Error, INFO = %d. ", info);
		ERROR("LAPACK error.");
	}

	INT iterMN;
	DOUBLE normtemp = 0;
	for (iterMN = 0; iterMN < rank; ++iterMN) {
		normtemp += sv[iterMN];
	}

	if (norm != NULL) {
		*norm = normtemp;
	}

	for (iterMN = rank; iterMN < MINMN; ++iterMN) {
		sv[iterMN] = 0;
	}

	/*
	 * TODO: Only multiply for singular vectors corresponding to non-zero singular values.
	 */
	if (MAXMN == M) {
		INT SCALN = M;
		INT incx = 1;
		for (iterMN = 0; iterMN < MINMN; ++iterMN) {
			SCAL(&SCALN, &sv[iterMN], &u[iterMN * M], &incx);
		}

		CHAR transa = 'N';
		CHAR transb = 'N';
		INT GEMMM = M;
		INT GEMMN = N;
		INT GEMMK = MINMN;
		DOUBLE alpha = 1;
		INT GEMMLDA = M;
		INT GEMMLDB = MINMN;
		DOUBLE beta = 0;
		INT GEMMLDC = M;

		GEMM(&transa, &transb, &GEMMM, &GEMMN, &GEMMK, &alpha, u, &GEMMLDA, vt, &GEMMLDB, &beta, X, &GEMMLDC);
	} else {
		INT SCALN = M;
		INT incx = 1;
		for (iterMN = 0; iterMN < MINMN; ++iterMN) {
			SCAL(&SCALN, &sv[iterMN], &u[iterMN * M], &incx);
		}

		CHAR transa = 'N';
		CHAR transb = 'N';
		INT GEMMM = M;
		INT GEMMN = N;
		INT GEMMK = MINMN;
		DOUBLE alpha = 1;
		INT GEMMLDA = M;
		INT GEMMLDB = MINMN;
		DOUBLE beta = 0;
		INT GEMMLDC = M;

		GEMM(&transa, &transb, &GEMMM, &GEMMN, &GEMMK, &alpha, u, &GEMMLDA, vt, &GEMMLDB, &beta, X, &GEMMLDC);
	}

	if (svFlag == 1) {
		free(sv);
	}

	if (svecsmallFlag == 1) {
		free(svecsmall);
	}

	if (sveclargeFlag == 1) {
		free(sveclarge);
	}

	if (workFlag == 1) {
		free(work);
	}
}

コード例 #14

0

ファイルを表示

ファイル: matrix_optimization.c プロジェクト: pfShawn/sparse_linear_model

// TODO: Change so that adaptive depending on M and N
// TODO: Check what standard I use for copy of original data, and get rid of it
// if not necessary.
void nuclear_approx_obj_grad(DOUBLE *obj, DOUBLE *deriv, DOUBLE *X, DOUBLE *rp, \
					INT M, INT N, INT derivFlag, DOUBLE *svdVec, DOUBLE *vtMat, \
					DOUBLE *dataBuffer, DOUBLE *derivVec, DOUBLE *work, INT lwork) {

	INT MN = IMIN(M, N);

	if (lwork == -1) {
		CHAR jobu;
		CHAR jobvt;
		if (derivFlag == 1) {
			jobu = 'O';
			jobvt = 'S';
		} else {
			jobu = 'N';
			jobvt = 'N';
		}
		INT GESVDM = M;
		INT GESVDN = N;
		INT GESVDLDA = M;
		INT GESVDLDU = M;
		INT GESVDLDVT = MN;
		INT INFO;

		GESVD(&jobu, &jobvt, &GESVDM, &GESVDN, dataBuffer, &GESVDLDA, svdVec, NULL, &GESVDLDU, vtMat, &GESVDLDVT, work, &lwork, &INFO);
		if (INFO != 0) {
			PRINTF("Error, INFO = %d. ", INFO);
			ERROR("LAPACK error.");
		}
		return;
	}

	INT svdVecFlag = 0;
	if (svdVec == NULL) {
		svdVec = (DOUBLE *) MALLOC(1 * MN * sizeof(DOUBLE));
		svdVecFlag = 1;
	}

	INT derivVecFlag = 0;
	if ((derivVec == NULL) && (derivFlag == 1)) {
		derivVec = (DOUBLE *) MALLOC(1 * MN * sizeof(DOUBLE));
		derivVecFlag = 1;
	}

	INT vtMatFlag = 0;
	if ((vtMat == NULL) && (derivFlag == 1)) {
		vtMat = (DOUBLE *) MALLOC(MN * N * sizeof(DOUBLE));
		vtMatFlag = 1;
	}

	INT dataBufferFlag = 0;
	if (dataBuffer == NULL) {
		dataBuffer = (DOUBLE *) MALLOC(M * N * sizeof(DOUBLE));
		dataBufferFlag = 1;
	}

	INT workFlag = 0;
	if (work == NULL) {
		workFlag = 1;
	}

	CHAR jobu;
	CHAR jobvt;
	if (derivFlag == 1) {
		jobu = 'O';
		jobvt = 'S';
	} else {
		jobu = 'N';
		jobvt = 'N';
	}
	datacpy(dataBuffer, X, M * N);

	INT GESVDM = M;
	INT GESVDN = N;
	INT GESVDLDA = M;
	INT GESVDLDU = M;
	INT GESVDLDVT = MN;
	INT INFO;

	if (workFlag == 1) {
		lwork = -1;
		DOUBLE work_temp;
		GESVD(&jobu, &jobvt, &GESVDM, &GESVDN, dataBuffer, &GESVDLDA, svdVec, NULL, &GESVDLDU, vtMat, &GESVDLDVT, &work_temp, &lwork, &INFO);
		if (INFO != 0) {
			PRINTF("Error, INFO = %d. ", INFO);
			ERROR("LAPACK error.");
		}

		lwork = (INT) work_temp;
		work = (DOUBLE*) MALLOC(lwork * sizeof(DOUBLE));
	}

	GESVD(&jobu, &jobvt, &GESVDM, &GESVDN, dataBuffer, &GESVDLDA, svdVec, NULL, &GESVDLDU, vtMat, &GESVDLDVT, work, &lwork, &INFO);
	if (INFO != 0) {
		PRINTF("Error, INFO = %d. ", INFO);
		ERROR("LAPACK error.");
	}

	abs_smooth_obj_grad(svdVec, derivVec, svdVec, rp, MN, derivFlag);
	INT ASUMN = MN;
	INT incx = 1;
	*obj = ASUM(&ASUMN, svdVec, &incx);

	if (derivFlag == 1) {

		INT iterMN;
		INT SCALN = M;
		INT incx = 1;
		DOUBLE alpha;
		for (iterMN = 0; iterMN < MN; ++iterMN) {
			alpha = derivVec[iterMN];
			SCAL(&SCALN, &alpha, &dataBuffer[iterMN * M], &incx);
		}

		CHAR transa = 'N';
		CHAR transb = 'N';
		INT GEMMM = M;
		INT GEMMN = N;
		INT GEMMK = MN;
		alpha = 1.0;
		INT GEMMLDA = M;
		INT GEMMLDB = MN;
		DOUBLE beta = 0.0;
		INT GEMMLDC = M;
		GEMM(&transa, &transb, &GEMMM, &GEMMN, &GEMMK, &alpha, dataBuffer, &GEMMLDA, vtMat, &GEMMLDB, &beta, deriv, &GEMMLDC);
	}

	if (svdVecFlag == 1) {
		FREE(svdVec);
	}

	if (derivVecFlag == 1) {
		FREE(derivVec);
	}

	if (vtMatFlag == 1) {
		FREE(vtMat);
	}

	if (dataBufferFlag == 1) {
		FREE(dataBuffer);
	}

	if (workFlag == 1) {
		FREE(work);
	}
}