示例#1
0
文件: ardblas.c 项目: rforge/gcb
SEXP d_setVector(SEXP v)
{
	int n = length(v);
	double * d_v;

	cublasAlloc(n, sizeof(double), (void **)&d_v);
	cublasSetVector(n, sizeof(double), REAL(v), 1, d_v, 1);
	checkCublasError("d_setVector");
	
	return packVector(n, d_v);
}
示例#2
0
文件: ardblas.c 项目: rforge/gcb
SEXP d_setMatrix(SEXP m)
{
	int
		rows = nrows(m), cols = ncols(m);
	double * d_m;
	
	cublasAlloc(rows * cols, sizeof(double), (void **)&d_m);
	cublasSetMatrix(rows, cols, sizeof(double), REAL(m), rows, d_m, rows);
	checkCublasError("d_setMatrix");
	
	return packMatrix(rows, cols, d_m);
}
示例#3
0
double magma_get_norm_sy(SEXP obj, const char *typstr)
{
#ifdef HIPLAR_WITH_MAGMA
	char typnm[] = {'\0', '\0'};
	int *dims = INTEGER(GET_SLOT(obj, Matrix_DimSym));
	double *work = (double *) NULL;
	int N = dims[0];
	int lda = N;
	double *A = REAL(GET_SLOT(obj, Matrix_xSym));
	typnm[0] = La_norm_type(typstr);

	const char *c = uplo_P(obj);

	//Magmablas dlansy only does I & M norms
	if(GPUFlag == 1 && (*typnm == 'I' || *typnm == 'M')) {
#ifdef HIPLAR_DBG
		R_ShowMessage("DBG: Performing norm using magmablas_dlansy"); 
#endif
		double *dwork, *d_A, maxnorm;
		cublasAlloc(N, sizeof(double), (void**)&dwork);
		cublasAlloc(lda * N, sizeof(double), (void**)&d_A);
		cublasSetVector(N * lda, sizeof(double), A, 1, d_A, 1);
		maxnorm = magmablas_dlansy(typnm[0], *c ,N, d_A, lda, dwork);
		cublasFree(d_A);
		cublasFree(dwork);
		return maxnorm;
	}
	else {

		if (*typnm == 'I' || *typnm == 'O') {
			work = (double *) R_alloc(dims[0], sizeof(double));
		}

		return F77_CALL(dlansy)(typnm, uplo_P(obj),
				dims, A,
				dims, work);
	}
#endif
	return 0.0;
}
示例#4
0
static
double magma_get_norm(SEXP obj, const char *typstr)
{
#ifdef HIPLAR_WITH_MAGMA
	if(any_NA_in_x(obj))
		return NA_REAL;
	else {
		char typnm[] = {'\0', '\0'};
		int *dims = INTEGER(GET_SLOT(obj, Matrix_DimSym));
		double *work = (double *) NULL;

		typnm[0] = La_norm_type(typstr);
		if (*typnm == 'I') {
			work = (double *) R_alloc(dims[0], sizeof(double));
			if(GPUFlag == 1 && (dims[0] % 64 == 0) && (dims[1] % 64 == 0)) {
#ifdef HIPLAR_DBG
	R_ShowMessage("DBG: Getting norm using magmablas_dlange");
#endif
				double *d_work, *d_A, *A, val;
				A = REAL(GET_SLOT(obj, Matrix_xSym));
				cublasAlloc(dims[0] * dims[1], sizeof(double), (void**)&d_A);
				cublasAlloc(dims[0], sizeof(double), (void**)&d_work);
				cublasSetVector(dims[0] * dims[1], sizeof(double), A, 1, d_A, 1);
				val = magmablas_dlange(*typstr, dims[0], dims[1], d_A, dims[0], d_work);
				cudaFree(d_A);
				cudaFree(d_work);
				return val;
			}

		}
		return F77_CALL(dlange)(typstr, dims, dims+1,
				REAL(GET_SLOT(obj, Matrix_xSym)),
				dims, work);
	}
#endif

	return 0.0;
}
示例#5
0
int magma_solve ( int *dA_dim, int *lWork, double2 *A, int *ipiv, int *N ){

	// Check inputs
	//
	fprintf (stderr, "Using MAGMA solve\n" );
	fprintf (stderr, "	dA_dim: %i\n", *dA_dim );
	fprintf (stderr, "	N: %i\n", *N );
	fprintf (stderr, "	lWork: %i\n", *lWork );

	cuInit(0);
	cublasInit();
	printout_devices();

	cublasStatus status;

	double2 *d_A, *work;
	status = cublasAlloc ( *dA_dim, sizeof(double2), (void**)&d_A );

	if ( status != CUBLAS_STATUS_SUCCESS ){
			fprintf (stderr, "ERROR: device memory allocation error (d_A)\n" );
			fprintf (stderr, "ERROR: dA_dim: %i\n", dA_dim );
	}

	cudaError_t err;
	err = cudaMallocHost ( (void**)&work, *lWork * sizeof(double2) );

	if(err != cudaSuccess){
		fprintf (stderr, "ERROR: cudaMallocHost error (work)\n" );
	}

	int info[1];
	TimeStruct start, end;

	start = get_current_time ();
	magma_zgetrf ( N, N, A, N, ipiv, work, d_A, info );
	end = get_current_time ();

	double gpu_perf;
	gpu_perf = 4.*2.*(*N)*(*N)*(*N)/(3.*1000000*GetTimerValue(start,end));

	if ( info[0] != 0 ){
			fprintf (stderr, "ERROR: magma_zgetrf failed\n" );
	}

	printf("	GPU performance: %6.2f GFlop/s\n", gpu_perf);

	int stat = 0;
	return stat;

}
示例#6
0
long benchmark(int size) {
    long requestStart, requestEnd;
    int incx = 1, incy = 1, n = size;
    double *cuA, *cuB;
    cublasStatus status;


    double* a = random_array(size);
    double* b = random_array(size);

    status = cublasAlloc(n, sizeof(double),(void**)&cuA);
    checkStatus("A", status);
    status = cublasAlloc(n, sizeof(double),(void**)&cuB);
    checkStatus("B", status);

    status = cublasSetVector(n, sizeof(double), a, incx, cuA, incx);
    checkStatus("setA", status);

    status = cublasSetVector(n, sizeof(double), b, incy, cuB, incy);
    checkStatus("setB", status);

    requestStart = currentTimeNanos();

    cublasDdot(n, cuA, incx, cuB, incy);

    requestEnd = currentTimeNanos();

    status = cublasFree(cuA);
    checkStatus("freeA", status);
    status = cublasFree(cuB);
    checkStatus("freeB", status);

    free(a);
    free(b);

    return (requestEnd - requestStart);
}
示例#7
0
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {

  int status;

  if (nrhs != 3)
    mexErrMsgTxt("Wrong number of arguments");

  int N = (int) mxGetScalar(prhs[0]);
  int SIZE = (int) mxGetScalar(prhs[1]);
  void *d_A = (void *) (UINTPTR mxGetScalar(prhs[2]));

  status = cublasAlloc(N, SIZE, &d_A);

  plhs[0] = mxCreateDoubleScalar(status);
  if (nlhs>1)
    plhs[1] = mxCreateDoubleScalar(UINTPTR d_A);

}
示例#8
0
SEXP magma_dgeMatrix_matrix_mm(SEXP a, SEXP bP, SEXP right)
{
#ifdef HIPLAR_WITH_MAGMA
	SEXP b = PROTECT(mMatrix_as_dgeMatrix(bP)),
	     val = PROTECT(NEW_OBJECT(MAKE_CLASS("dgeMatrix")));
	int *adims = INTEGER(GET_SLOT(a, Matrix_DimSym)),
	    *bdims = INTEGER(GET_SLOT(b, Matrix_DimSym)),
	    *cdims = INTEGER(ALLOC_SLOT(val, Matrix_DimSym, INTSXP, 2));
	double one = 1.0, zero = 0.0;

	if (asLogical(right)) {
		int m = bdims[0], n = adims[1], k = bdims[1];
		if (adims[0] != k)
			error(_("Matrices are not conformable for multiplication"));
		cdims[0] = m; cdims[1] = n;
		if (m < 1 || n < 1 || k < 1) {
			// 		This was commented out
				    error(_("Matrices with zero extents cannot be multiplied")); 
			ALLOC_SLOT(val, Matrix_xSym, REALSXP, m * n);
		} else {
			double *B = REAL(GET_SLOT(b, Matrix_xSym));
			double *A = REAL(GET_SLOT(a, Matrix_xSym));
			double *C = REAL(ALLOC_SLOT(val, Matrix_xSym, REALSXP, m * n));
			//TODO add magma here too
			if(GPUFlag == 1) {
				double *d_A, *d_B, *d_C;
				cublasStatus retStatus;


#ifdef HIPLAR_DBG
				R_ShowMessage("DBG: Performing matrix multiplication with Right = true using magmablas_dgemm");
#endif
				cublasAlloc(n * k, sizeof(double), (void**)&d_A);
			
				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Memory Allocation"));
				/********************************************/

				cublasAlloc(m * k, sizeof(double), (void**)&d_B);	

				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Memory Allocation"));
				/********************************************/

				cublasAlloc(m * n, sizeof(double), (void**)&d_C);	

				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Memory Allocation"));
				/********************************************/

				cublasSetVector( n  * k , sizeof(double), A, 1, d_A, 1);

				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Data Transfer to Device"));
				/********************************************/

				cublasSetVector( m * k, sizeof(double), B, 1, d_B, 1 );

				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Data Transfer to Device"));
				/********************************************/

				// ******** magmablas_dgemm call Here **
				//magmablas_dgemm('N', 'N', m, n, k, one, d_B, m, d_A, k, zero, d_C,  m);
				//CHANGED 30/07
				cublasDgemm('N', 'N', m, n, k, one, d_B, m, d_A, k, zero, d_C, m);
				
				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) {
					error(_("CUBLAS: Error in cublasDgemm routine"));
				}
				/********************************************/

				cublasGetVector( m * n , sizeof(double), d_C, 1, C, 1);

				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Data Transfer from Device"));
				/********************************************/

				cublasFree(d_A);
				cublasFree(d_B);
				cublasFree(d_C);
			}
			else {
	
#ifdef HIPLAR_DBG
				R_ShowMessage("DBG: Performing matrix multiplication using dgemm with right = TRUE");
#endif
				F77_CALL(dgemm) ("N", "N", &m, &n, &k, &one,
						B, &m, A , &k, &zero, C , &m);
			}
		}
	} else {
		int m = adims[0], n = bdims[1], k = adims[1];
		double *A = REAL(GET_SLOT(a, Matrix_xSym));
		double *B = REAL(GET_SLOT(b, Matrix_xSym));


		if (bdims[0] != k)
			error(_("Matrices are not conformable for multiplication"));
		cdims[0] = m; cdims[1] = n;
		double *C = REAL(ALLOC_SLOT(val, Matrix_xSym, REALSXP, m * n));
		
		if (m < 1 || n < 1 || k < 1) {
			//		This was commented out
			error(_("Matrices with zero extents cannot be multiplied")); 
			ALLOC_SLOT(val, Matrix_xSym, REALSXP, m * n);
		} else {
			if(GPUFlag == 1) {

				double *d_A, *d_B, *d_C;
				cublasStatus retStatus;

	
#ifdef HIPLAR_DBG
				R_ShowMessage("DBG: Performing matrix multiplication using magmablas_dgemm");
#endif			
				cublasAlloc(m * k, sizeof(double), (void**)&d_A);
			
				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Memory Allocation"));
				/********************************************/

				cublasAlloc(n * k, sizeof(double), (void**)&d_B);	

				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Memory Allocation"));
				/********************************************/

				cublasAlloc(m * n, sizeof(double), (void**)&d_C);	

				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Memory Allocation"));
				/********************************************/

				cublasSetVector( m  * k , sizeof(double), A, 1, d_A, 1);

				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Data Transfer to Device"));
				/********************************************/

				cublasSetVector( n * k, sizeof(double), B, 1, d_B, 1 );

				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Data Transfer to Device"));
				/********************************************/

				// ******** magmablas_dgemm call Here **
				//magmablas_dgemm('N', 'N', m, n, k, one, d_A, m, d_B, k, zero, d_C,  m);
				//CHANGE
				cublasDgemm('N', 'N', m, n, k, one, d_A, m, d_B, k, zero, d_C, m);
				
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) {
					error(_("CUBLAS: Error in Data Transfer from Device"));
				/********************************************/
				}
				
				cublasGetVector( m * n , sizeof(double), d_C, 1, C, 1);

				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Data Transfer from Device"));
				/********************************************/

				cublasFree(d_A);
				cublasFree(d_B);
				cublasFree(d_C);
				
			}
			else {
	
#ifdef HIPLAR_DBG
				R_ShowMessage("DBG: Performing matrix multiplication using dgemm");
#endif
				F77_CALL(dgemm) ("N", "N", &m, &n, &k, &one,
						A, &m,
						B, &k, &zero,
						C,
						&m);	

			}
		}
	}
	ALLOC_SLOT(val, Matrix_DimNamesSym, VECSXP, 2);
	UNPROTECT(2);
	return val;
#endif
	return R_NilValue;
}
void psgetrf_gpu(int *m_in, int *n_in, 
   float *A, int *ia_in, int *ja_in, int *descA, 
   int *ipiv_, int *info)
{

int m = *m_in;
int n = *n_in;
int ia = *ia_in;
int ja = *ja_in;

const int use_setup_desc = TRUE;
const int idebug = 0;
int use_replicated_storage = FALSE;
const int use_broadcast_triangular_matrix = TRUE;

int ia_proc, ja_proc;
int lrindx, lcindx, rsrc,csrc, irsrc,icsrc;
int ictxt, nprow,npcol, myprow,mypcol;
int is_root;

int minmn;
int k1,k2,incx,ip;
int mm, nn, kk, ii, jj, mtmp;
int mm_lu,nn_lu,ia_lu,ja_lu;

int elemSize = sizeof( float );
size_t nbytes;

int nnb, jstart,jend,jsize, isize, jb;
int icontxt, isizeAtmp;


int i,j, iia,jja, ldA, ldhA;
int iinfo = 0;
int iAtmp, jAtmp, iha,jha, iib,jjb,iic,jjc;
int ldAtmp, ldBtmp, lmm,lnn;
int lrA1,lcA1, lrA2,lcA2;

int desc_hA_[DLEN_];
int *desc_hA = &(desc_hA_[0]);

int *ipiv_hA_ = 0;
float *hA = 0;
float *Atmp = 0;
float *dAtmp = 0;

int *gipiv_ = 0;
int desc_Atmp_[DLEN_];
int *desc_Atmp = &(desc_Atmp_[0]);
cublasStatus cu_status;

int isok;
int use_delayed_left_interchange = 1;

int is_mine;
int i1,j1,inc1,  i2,j2,inc2;
int desc_ipiv_hA_[DLEN_];
int *desc_ipiv_hA = &(desc_ipiv_hA_[0]);

int desc_ipiv_[DLEN_];
int *desc_ipiv = &(desc_ipiv_[0]);

int desc_gipiv_[DLEN_];
int *desc_gipiv = &(desc_gipiv_[0]);
int mb,nb, Locp, Locq, lld;


char direc = 'F';
char rowcol = 'R';

char left[] = "Left";
char lower[] = "Lower";
char notrans[] = "NoTrans";
char unit[] = "Unit";

char *side = left;
char *uplo = lower;
char *trans = notrans;
char *diag = unit;

float zero_[REAL_PART+IMAG_PART+1];
float *zero = &(zero_[0]);

float one_[REAL_PART+IMAG_PART+1];
float *one = &(one_[0]);

float neg_one_[REAL_PART+IMAG_PART+1];
float *neg_one = &(neg_one_[0]);

float beta_[REAL_PART+IMAG_PART+1];
float *beta = &(beta_[0]);

float alpha_[REAL_PART+IMAG_PART+1];
float *alpha = &(alpha_[0]);
/*
 * A is a pointer to GPU device memory but conceptually associated
 * with a scalapack distributed matrix 

 * A is array of complex numbers
 */



*info = 0;

zero[REAL_PART] = 0.0;
zero[IMAG_PART] = 0.0;
one[REAL_PART] = 1.0;
one[IMAG_PART] = 0.0;
neg_one[REAL_PART] = -1.0;
neg_one[IMAG_PART] = 0.0;


/*
 * setup copy of distributed matrix on CPU host
 */

hA = 0;
Atmp = 0;

ictxt = descA[CTXT_];
icontxt = ictxt;

Cblacs_gridinfo( ictxt, &nprow, &npcol,  &myprow, &mypcol );
is_root = (myprow == 0) && (mypcol == 0);
if ((idebug >= 1) && (is_root)) {
  printf("pcgetrf_gpu: m %d n %d ia %d ja %d \n",
      m,n,   ia,ja );
};


ia_proc = Cindxg2p( ia, descA[MB_], myprow, descA[RSRC_], nprow);
ja_proc = Cindxg2p( ja, descA[NB_], mypcol, descA[CSRC_], npcol);


/*
 * setup global pivot vector
 */
lld = MIN(m,n) + descA[MB_];
nbytes = lld;
nbytes *= sizeof(int);
if (gipiv_ != 0) {
  free(gipiv_); gipiv_ = 0;
};
gipiv_ = (int *) malloc( nbytes );
assert( gipiv_ != 0 );


desc_gipiv[DTYPE_] = descA[DTYPE_];
desc_gipiv[CTXT_] = descA[CTXT_];
desc_gipiv[M_] = MIN(m,n);
desc_gipiv[N_] = 1;
desc_gipiv[MB_] = desc_gipiv[M_];
desc_gipiv[NB_] = desc_gipiv[N_];
desc_gipiv[LLD_] = lld;

desc_gipiv[RSRC_] = -1;
desc_gipiv[CSRC_] = -1;

  /*
   * setup distribute array hA on host
   */

/*
 * Note, optimal block size on GPU might not be
 * optimal block size on CPU, but assume to be
 * the same for simplicity for now
 */

/*
 * should nnb = descA[NB_] * npcol  ?
 */
nnb = descA[NB_];

minmn = MIN(m,n);
for( jstart=1; jstart <= minmn; jstart = jend + 1) {
  jend = MIN( minmn, jstart + nnb - 1);
  jsize = jend - jstart + 1;

  /*
   * setup matrix on host
   */

  /*
  was iia = (ia-1) + 1;
  */
  j = jstart;
  jb = jsize;

  iia = (ia-1) + jstart;
  jja = (ja-1) + jstart;
  mm = m - jstart + 1;
  nn = jsize;

  if (use_setup_desc) {
    setup_desc( mm,nn, iia,jja,descA, &isize, desc_hA );
    }
  else {
    irsrc = Cindxg2p( iia, descA[MB_], myprow, descA[RSRC_], nprow );
    icsrc = Cindxg2p( jja, descA[NB_], mypcol, descA[CSRC_], npcol );
  
    mb = descA[MB_];
    nb = descA[NB_];
    Locp = Cnumroc( mm, mb, 0,0,nprow );
    Locq = Cnumroc( nn, nb, 0,0,npcol );
    lld = MAX(1,Locp);
    isize = MAX(1,Locp) * MAX(1, Locq );
  
    ictxt = descA[CTXT_];
    iinfo = 0;
    Cdescinit( desc_hA, mm,nn,  mb,nb,  irsrc,icsrc, ictxt, lld, &iinfo);
    assert( iinfo == 0);
    };


  nbytes = isize;
  nbytes *= elemSize;
  if (hA != 0) { 
    free(hA); hA = 0;
  };
  hA = (float *) malloc( nbytes );
  assert( hA != 0 );

  /*
   * distribution of pivot vector is tied to distribution of matrix
   */
  Locp = Cnumroc( desc_hA[M_], desc_hA[MB_], myprow, desc_hA[RSRC_], nprow);
  lld = Locp + desc_hA[MB_];
  nbytes = lld;
  nbytes *= sizeof(int);
  if (ipiv_hA_ != 0) {
    free( ipiv_hA_ ); ipiv_hA_ = 0;
  };
  ipiv_hA_ = (int *) malloc( nbytes );
  assert( ipiv_hA_ != 0);

  Cdescset( desc_ipiv_hA, desc_hA[M_],  1,
              desc_hA[MB_], 1,
              desc_hA[RSRC_], icsrc,
              desc_hA[CTXT_], 
              lld );





  /*
   copy column panel back to CPU host
   to be factored using scalapack
   */ 

  jb = jsize;
  j = jstart;
  mm = m  - j + 1;
  nn = jb;



  /*
    hA(1:mm,1:nn) <-  dA(j:(j+mm-1), j:(j+nn-1) )
   */


  iia = (ia-1) + j;
  jja = (ja-1) + j;
  ii = 1;
  jj = 1;

  PROFSTART("gpu:hA <- dA");
  Cpsgecopy_d2h( mm,nn, A,iia,jja,descA,  hA, ii,jj, desc_hA );
  PROFEND("gpu:hA <- dA");



  /*
   * factor on host CPU using ScaLAPACK
   * Note the pivot vector is tied to the distribution of the matrix
   * Therefore, we need a different "ipiv_hA" pivot vector
   * that is tied the the distributed matrix hA
   */

  ii = 1;
  jj = 1;
  iinfo = 0;
  mm_lu = mm;
  nn_lu = nn;
  ia_lu = ii;
  ja_lu = jj;

  PROFSTART("gpu:psgetrf");
  scalapack_psgetrf( &mm_lu, &nn_lu, 
        hA, &ia_lu, &ja_lu,  desc_hA, &(ipiv_hA(1)), &iinfo );
  PROFEND("gpu:psgetrf");

  /*
   * broadcast pivot vector to global vector
   */



  i1 = 1;
  j1 = 1;
  inc1 = 1;

  i2 = jstart;
  j2 = 1;
  inc2 = 1;
  mtmp = MIN(mm,nn);
  desc_ipiv_hA[CSRC_] = icsrc;

  use_replicated_storage = FALSE;
  if (use_replicated_storage) {
    int ja_lu_proc;

    ja_lu_proc =   Cindxg2p(ja_lu,desc_hA[NB_],
        mypcol,desc_hA[CSRC_],npcol);

    desc_ipiv_hA[CSRC_] =  ja_lu_proc;

    desc_gipiv[RSRC_] = -1;
    desc_gipiv[CSRC_] = -1;
    scalapack_picopy( &mtmp, &(ipiv_hA(1)), &i1,&j1, desc_ipiv_hA, &inc1,
                        &(gipiv(1)), &i2,&j2, desc_gipiv, &inc2 );
    }
  else {
    /*
     * copy to 1 processors (rsrc,csrc), then
     * broadcast to all processors
     */
        int icontxt = desc_ipiv_hA[CTXT_];
        char scope = 'A'; 
        char top = ' ';
        int ntmp = 1;
        int lld; 

        int ia_lu_proc,ja_lu_proc;
        int rsrc, csrc;

        ia_lu_proc = Cindxg2p( ia_lu, desc_hA[MB_],
               myprow,desc_hA[RSRC_],nprow);
        ja_lu_proc = Cindxg2p( ja_lu, desc_hA[NB_],
               mypcol,desc_hA[CSRC_],npcol);

        rsrc = ia_lu_proc;
        csrc = ja_lu_proc;

        desc_gipiv[RSRC_] = rsrc;
        desc_gipiv[CSRC_] = csrc;
        desc_ipiv_hA[CSRC_] = csrc;

        mtmp = MIN( mm_lu, nn_lu);
        scalapack_picopy( &mtmp, &(ipiv_hA(1)), &i1,&j1,desc_ipiv_hA,&inc1,
                  &(gipiv(1)), &i2,&j2, desc_gipiv, &inc2 );

    if ((myprow == rsrc) && (mypcol == csrc)) {

        lld = mtmp;
        ntmp = 1;
        scalapack_igebs2d( &icontxt, &scope, &top,
            &mtmp, &ntmp, &(gipiv(i2)), &lld );
        }
    else {
      lld = mtmp;
      ntmp = 1;
      scalapack_igebr2d( &icontxt, &scope, &top,
            &mtmp, &ntmp, &(gipiv(i2)), &lld, 
            &rsrc,&csrc );
    };
  };

  if (idebug >= 1) {
    int desctmp[DLEN_];
    char name_ipiv_hA[] = "ipiv_hA";
    char name_gipiv[] = "gipiv";

    if (is_root) {
    printf("jstart %d jend %d \n", jstart,jend);
    printf("mm_lu %d nn_lu %d ia_lu %d ja_lu %d\n",
            mm_lu,   nn_lu,   ia_lu,   ja_lu );
    };

    Cdescset(desctmp, desc_hA[M_], npcol,
        desc_hA[MB_],1,
        desc_hA[RSRC_], desc_hA[CSRC_],
        desc_hA[CTXT_], desc_hA[LLD_] );

    Cpilaprnt( MIN(mm_lu,nn_lu), npcol, &(ipiv_hA(1)), 1,1,desctmp, name_ipiv_hA);

    Cdescset(desctmp, minmn*nprow, npcol,
        minmn, 1,    0,0,
        descA[CTXT_], minmn );
    Cpilaprnt( nprow*minmn, npcol, &(gipiv(1)),1,1,desctmp, name_gipiv);
  };


  /*
   * adjust pivot sequence from 1:min(mm,nn) in ipiv to 
   * jstart:(jstart+min(mm,nn)-1)
   */
    for(int i=1; i <= MIN(mm,nn); i++) {
      i2 = (jstart-1) + i;
      gipiv(i2) = gipiv(i2) + (jstart-1);
    };


  if (iinfo < 0) {
     *info = iinfo;
     return;
     };

  if ((*info == 0) && (iinfo > 0)) {
      *info = iinfo + (j-1);
      return;
      };


  /*
   * transfer factored panel back to GPU device
   */

  iia = (ia-1) + j;
  jja = (ja-1) + j;
  ii = 1;
  jj = 1;
  PROFSTART("gpu:A <- hA");
  Cpsgecopy_h2d(mm,nn, hA, ii,jj, desc_hA,
                       A, iia,jja, descA );
  PROFEND("gpu:A <- hA");





  if (use_delayed_left_interchange) {
    /*
     * do nothing for now
     */
    }
  else {
    /* 
     * apply interchanges to columns 1:(j-1)
     */

    nn = j-1;
    k1 = j;
    k2 = j + jb-1;
    incx = 1;


    PROFSTART("gpu:left swap");
    if (nn >= 1) {
         iia = (ia-1) + 1;
         jja = (ja-1) + 1;
         for(kk=k1; kk <= k2; kk++) {
           ip = gipiv(  kk);
           assert(ip >= kk );
           assert( ip <= m );

           if (kk != ip) {
               inc1 = descA[M_];
               inc2 = descA[M_];
               i1 = (iia-1) + kk;
               i2 = (iia-1) + ip;
               j1 = jja;
               j2 = jja;
               Cpsswap_gpu(nn, A,i1,j1,descA,inc1,
                               A,i2,j2,descA,inc2 );
                };
         };
      };
    PROFEND("gpu:left swap");
    };




  /*
   * apply interchanges to columns (j+jb):n
   */

   nn = n - (jend + 1) + 1;
   k1 = j;
   k2 = j + jb - 1;
   incx = 1;



   PROFSTART("gpu:right swap");
   if (nn >= 1) {
      iia = (ia-1) + 1;
      jja = (ja-1) + (jend+1);
      for(kk=k1; kk <= k2; kk++) {
        ip = gipiv(  kk );
        assert( ip >= kk );
        assert( ip <= m );

        if (ip != kk) {
           i1 = (iia-1) + kk;
           i2 = (iia-1) + ip;
           j1 = jja;
           j2 = jja;
           inc1 = descA[M_];
           inc2 = descA[M_];
           Cpsswap_gpu( nn, A, i1,j1, descA, inc1,
                            A, i2,j2, descA, inc2 );
        };
      };
   };
   PROFEND("gpu:right swap");


   PROFSTART("gpu:pTRSM");


   mm = jb;
   nn = n - (jend+1) + 1;
   if ( (1 <= mm) && (1 <= nn)) {
               /*
               cublasCtrsm('L','L','N','U', mm,nn,
                  alpha, dA(j,j), lddA, dA(j,j+jb), lddA );
               */

     if (use_broadcast_triangular_matrix) {
       /*
        * broadcast triangular part, then solve locally
        */
         char lscope = 'A';
         char ltop = ' ';
         int  msize, nsize, lr1,lc1, lr2,lc2;
         int ia_lu_proc, ja_lu_proc;

       /*
        * copy on local processor
        */

         ia_lu_proc = Cindxg2p(ia_lu, desc_hA[MB_], myprow,
                         desc_hA[RSRC_], nprow );
         ja_lu_proc = Cindxg2p(ja_lu, desc_hA[NB_], mypcol,
                         desc_hA[CSRC_], npcol );

       /*
        * complete mm by mm block on Atmp
        */
       ldAtmp = MAX(1,mm);
       Cdescset(desc_Atmp, mm,mm, mm,mm, 
           ia_lu_proc,ja_lu_proc, icontxt, ldAtmp);
       isizeAtmp = ldAtmp * MAX(1,mm);
       nbytes = isizeAtmp;
       nbytes *= elemSize;

       if (Atmp != 0) { free(Atmp); Atmp = 0; };
       Atmp = (float *) malloc( nbytes );
       assert( Atmp != 0);

#ifdef USE_CUBLASV2
       {
         cudaError_t ierr;
         size_t isize = isizeAtmp;
         isize *= elemSize;

         ierr = cudaMalloc( (void **) &dAtmp, isize );
         assert(ierr == cudaSuccess );
       }
#else
       cu_status = cublasAlloc(isizeAtmp, elemSize, (void **) &dAtmp );
       CHKERR(cu_status);
       assert( dAtmp != 0);
#endif

       ii = 1;
       jj = 1;
       scalapack_psgeadd( notrans, &mm, &mm, 
           one,   hA, &ia_lu, &ja_lu, desc_hA,
           zero,  Atmp, &ii, &jj, desc_Atmp );
                 
       rsrc = desc_Atmp[RSRC_];
       csrc = desc_Atmp[CSRC_];
       if ((myprow == rsrc) && (mypcol == csrc)) {
          scalapack_cgebs2d( &icontxt, &lscope, &ltop,   
              &mm, &mm,  Atmp, &ldAtmp );
          }
       else {
         scalapack_cgebr2d( &icontxt, &lscope, &ltop,
              &mm, &mm, Atmp, &ldAtmp,   &rsrc, &csrc );
       };

       inc1 = 1;
       inc2 = 1;
       cu_status = cublasSetVector(isizeAtmp, elemSize, Atmp, inc1, dAtmp, inc2 );
       CHKERR(cu_status);

       /*
        * perform local solve on GPU
        */
       iia = (ia-1) + j;
       jja = (ja-1) + (j+jb);
       local_extent( mm,nn, iia,jja,descA,  
                    &msize,&nsize, &lr1,&lc1, &lr2,&lc2 );
       if (msize >= 1) {
         assert( msize == mm );
       };

       if ((msize >= 1) && (nsize >= 1)) {
         char lside = 'L';
         char luplo = 'L';
         char ltrans = 'N';
         char ldiag = 'U';

         float zalpha;


         zalpha = (float)1.0;//make_float(1.0,0.0);

         CUBLAS_STRSM( 
             ((lside == 'l')||(lside == 'L')) ?
                CUBLAS_SIDE_LEFT : CUBLAS_SIDE_RIGHT, 
             ((luplo == 'l')||(luplo == 'L')) ? 
                CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER,
             ((ltrans == 'c')||(ltrans == 'C')) ?
               CUBLAS_OP_C :
                 ((ltrans == 't')||(ltrans == 'T')) ?
                    CUBLAS_OP_T : CUBLAS_OP_N, 
             ((ldiag == 'u')||(ldiag == 'U')) ?
                CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT,
              mm, nsize, zalpha,
              (float *) dAtmp, ldAtmp,
              dA(lr1,lc1), descA[LLD_] );

       };



       if (Atmp != 0) {
         free(Atmp); Atmp = 0;
       };

#ifdef USE_CUBLASV2
       {
         cudaError_t ierr;
         ierr = cudaFree( (void *) dAtmp );
         assert(ierr == cudaSuccess );
         dAtmp  = 0;
       }
#else
       cu_status = cublasFree( dAtmp );
       CHKERR(cu_status );
#endif


     }
     else {
         /*
          * perform triangular solve using scalapack
          */
         iia = (ia-1) + j;
         jja = (ja-1) + (j+jb);
        setup_desc(mm,nn,iia,jja,descA,  &isize, desc_Atmp );

        nbytes = elemSize;
        nbytes *= isize;
        if (Atmp != 0) {
          free(Atmp); Atmp = 0;
        };
        Atmp = (float *) malloc( nbytes );
        assert( Atmp != 0 );



         /*
          * copy to Atmp(1:mm,1:nn) <- dA(j:(j+mm-1),(j+jb):((j+jb)+nn-1))
          */


         ii = 1; jj = 1;
         PROFSTART("gpu:Atmp <- dA");
         Cpsgecopy_d2h( mm,nn,A,iia,jja,descA,
                           Atmp, ii,jj, desc_Atmp );
         PROFEND("gpu:Atmp <- dA");



         /*
          * perform triangular solve using scalapack
          */

          side = left;
          uplo = lower;
          trans = notrans;
          diag = unit;

          alpha = one;

          iha = 1; 
          jha = 1;
          ii = 1; 
          jj = 1;

          PROFSTART("gpu:pstrsm")
          scalapack_pstrsm( side, uplo, trans, diag, 
              &mm,&nn, alpha,    
              hA, &iha,&jha, desc_hA,
              Atmp,&ii,&jj,  desc_Atmp );
          PROFEND("gpu:pstrsm")
          

          /*
           * copy back to GPU
           */

          iia = (ia-1) + j;
          jja = (ja-1) + (j+jb);
          ii = 1; 
          jj = 1;

          PROFSTART("gpu:A <- Atmp");
          Cpsgecopy_h2d( mm,nn, Atmp,ii,jj,desc_Atmp,
                             A, iia,jja, descA );
          PROFEND("gpu:A <- Atmp");
     };
                           



     };
   PROFEND("gpu:pTRSM");


    /*
     * update trailing submatrix
     */


	alpha = neg_one;
	beta = one;
	mm = m-(jend+1) + 1;
	nn = n-(jend+1) + 1;
	kk = jb;

 
      if ((1 <= mm) && (1 <= nn) && (1 <= kk)) {
        
        /*
	 cublasSgemm('N','N',mm,nn,kk,
            alpha, dA(j+jb,j),lddA, dA(j,j+jb),lddA,
            beta, dA(j+jb,j+jb), lddA );
         */

        if (use_broadcast_triangular_matrix) {
          /*
           * Copy from GPU to Atmp
           */
          iia = (ia-1) + j;
          jja = (ja-1) + (j+jb);

          setup_desc( kk,nn, iia,jja, descA, &isizeAtmp, desc_Atmp);
          nbytes = isizeAtmp;
          nbytes *= elemSize;
          if (Atmp != 0) { free(Atmp); Atmp = 0; };
          Atmp = (float *) malloc( nbytes );
          assert( Atmp != 0);

          PROFSTART("gpu:Atmp <- A");
          Cpsgecopy_d2h( kk,nn, A,iia,jja,descA, 
                                Atmp,1,1,desc_Atmp );
          PROFEND("gpu:Atmp <- A");
        };


        iic = (ia-1) + (jend+1);
        jjc = (ja-1) + (jend+1);


       iha = jsize+1;
       jha = 1;
       iAtmp = 1; 
       jAtmp = 1;
     

          {
          char transA = 'N';
          char transB = 'N';

          PROFSTART("zgetrf_gpu:psgemm");
          Cpsgemm_hhd( transA, transB, mm,nn,kk, 
           alpha, hA, iha,jha, desc_hA, 
                  Atmp, iAtmp,jAtmp, desc_Atmp, 
           beta,  A, iic,jjc, descA );

          PROFEND("zgetrf_gpu:psgemm");
           };
       };



    if (Atmp != 0) {
       free(Atmp); Atmp = 0;
       };

    if (ipiv_hA_ != 0) {
       free( ipiv_hA_ ); ipiv_hA_ = 0;
       };
    if (hA != 0) {
      free(hA); hA = 0;
      };

   }; /* for (jstart) */


   if (use_delayed_left_interchange) {

     PROFSTART("gpu:dleft swap");
    for(j=1; j <= minmn; j = jend + 1) {
        jend = MIN( minmn, j+nnb-1);
        jsize = jend - j + 1;
        jb = jsize;
        /*
         * apply interchanges to columns 1:(j-1)
         */
   
        nn = j-1;
        k1 = j;
        k2 = j+jb-1;
        incx = 1;
   
   
        if (nn >= 1) {
         iia = (ia-1) + 1; 
         jja = (ja-1) + 1;
         for(kk=k1; kk <= k2; kk++) {
             ip = gipiv(kk);
             assert( ip >= kk );

             if (ip != kk) {
               inc1 = descA[M_];
               inc2 = descA[M_];
               i1 = (iia-1) + kk;
               i2 = (iia-1) + ip;
               j1 = jja;
               j2 = jja;
               Cpsswap_gpu(nn, A, i1,j1,descA, inc1, 
                               A, i2,j2,descA, inc2 );
             };
         };
        };
     }; /* end for j */
     PROFEND("gpu:dleft swap");
   }; /* end if use delayed left interchange */


   /*
    * adjust global pivot from 1:MIN(m,n) to ia:(ia + MIN(m,n)-1)
    * copy global vector back to distributed pivot vector
    */

   for(int j=1; j <= minmn; j++) {
     gipiv(j) = (ia-1) + gipiv(j);
   };


   lld = descA[MB_] + 
         Cnumroc( descA[M_], descA[MB_], myprow, descA[RSRC_], nprow);

   Cdescset( desc_ipiv, 
              descA[M_],1, 
              descA[MB_], 1, 
              descA[RSRC_], -1, descA[CTXT_], lld );

   i1 = 1; j1 = 1; inc1 = 1;
   i2 = ia; j2 = 1; inc2 = 1;
   mtmp = MIN(m,n);

   PROFSTART("gpu:ipiv");
   use_replicated_storage = FALSE;
   if (use_replicated_storage) {
     int msize,nsize,lr1,lc1,lr2,lc2, lrindx,iia;

     local_extent(MIN(m,n),n,ia,ja,descA, &msize,&nsize, &lr1,&lc1, &lr2,&lc2);
     if (msize >= 1) {
       for(lrindx=lr1; lrindx <= lr2; lrindx++) {
         iia = Cindxl2g( lrindx, descA[MB_], myprow, descA[RSRC_], nprow);
         ipiv(lrindx) =  gipiv( (iia-ia) + 1 );
         };
       };
     }
   else  {
     /*
      * copy to a column, then broadcast
      */
     char scope = 'R';
     char top = ' ';
     int Locp, Locq;
     int lld;
     int icontxt = desc_ipiv[CTXT_];

     desc_ipiv[CSRC_] = ja_proc;
     desc_gipiv[RSRC_] = ia_proc;
     desc_gipiv[CSRC_] = ja_proc;

     mtmp = MIN(m,n);
     scalapack_picopy( &mtmp, &(gipiv(1)), &i1,&j1, desc_gipiv, &inc1,
             &(ipiv(1)), &i2, &j2, desc_ipiv, &inc2 );

     if (idebug >= 1) {
       char cmatnm[] = "ipiv after picopy";
       if (is_root) {
         printf("ia_proc %d ja_proc %d i2 %d j2 %d \n",ia_proc,ja_proc,i2,j2);
       };
       Cpilaprnt( mtmp,1, &(ipiv(1)), i2,j2,desc_ipiv, cmatnm);
     };


     Locp = Cnumroc( ia + MIN(m,n)-1, desc_ipiv[MB_], 
                     myprow, desc_ipiv[RSRC_], nprow);
     lld = MAX(1,Locp);
     Locq = 1;
     if (npcol > 1) {
      if (mypcol == ja_proc) {

       scalapack_igebs2d( &icontxt, &scope, &top, 
           &Locp, &Locq,  &(ipiv(1)), &lld );
      }
      else {
       rsrc = myprow;
       scalapack_igebr2d( &icontxt, &scope, &top,
           &Locp, &Locq, &(ipiv(1)), &lld, &rsrc, &ja_proc );
      };
     };

   };
   PROFEND("gpu:ipiv");

     if (idebug >= 1) {
       int desctmp[DLEN_];
       char cmatnm[] = "final ipiv";
       Cdescset( desctmp, 
           descA[M_],npcol,
           descA[MB_],1,
           descA[RSRC_], descA[CSRC_],
           descA[CTXT_], descA[LLD_]);
       Cpilaprnt( MIN(m,n),npcol, &(ipiv(1)), ia,1,desctmp, cmatnm);
     };





  /*
   * clean up
   */
  if (Atmp != 0) {
       free(Atmp); Atmp = 0;
       };
  if (hA != 0) {
       free(hA); hA = 0;
       };
  if (ipiv_hA_ != 0) {
      free( ipiv_hA_ ); ipiv_hA_ = 0;
      };

  if (gipiv_ != 0) {
     free(gipiv_); gipiv_ = 0;
     };


  return;
}
示例#10
0
SEXP magma_dgeMatrix_matrix_solve(SEXP a, SEXP b)
{
#ifdef HIPLAR_WITH_MAGMA
	SEXP val = PROTECT(dup_mMatrix_as_dgeMatrix(b)),
			 lu = PROTECT(magma_dgeMatrix_LU_(a, TRUE));
	int *adims = INTEGER(GET_SLOT(lu, Matrix_DimSym)),
			*bdims = INTEGER(GET_SLOT(val, Matrix_DimSym));
	int info, n = bdims[0], nrhs = bdims[1];



	if (*adims != *bdims || bdims[1] < 1 || *adims < 1 || *adims != adims[1])
		error(_("Dimensions of system to be solved are inconsistent"));

	double *A = REAL(GET_SLOT(lu, Matrix_xSym));
	double *B  = REAL(GET_SLOT(val, Matrix_xSym));
	int *ipiv = INTEGER(GET_SLOT(lu, Matrix_permSym));

	if(GPUFlag == 0) {
		F77_CALL(dgetrs)("N", &n, &nrhs, A, &n, ipiv, B, &n, &info);	
	
#ifdef HIPLAR_DBG
		R_ShowMessage("DBG: Solve using LU using dgetrs;");
#endif
	}else if(GPUFlag == 1) {
		
	
#ifdef HIPLAR_DBG
		R_ShowMessage("DBG: Solve using LU using magma_dgetrs;");
#endif		
		double *d_A, *d_B;
		cublasStatus retStatus;

		cublasAlloc(adims[0] * adims[1], sizeof(double), (void**)&d_A);

		/* Error Checking */
		retStatus = cublasGetError ();
		if (retStatus != CUBLAS_STATUS_SUCCESS) 
			error(_("CUBLAS: Error in Memory Allocation of A on Device"));
		/********************************************/


		cublasAlloc(n * nrhs, sizeof(double), (void**)&d_B);

		/* Error Checking */
		retStatus = cublasGetError ();
		if (retStatus != CUBLAS_STATUS_SUCCESS) 
			error(_("CUBLAS: Error in Memory Allocation of b on Device"));
		/********************************************/



		cublasSetVector(adims[0] * adims[1], sizeof(double), A, 1, d_A, 1);

		/* Error Checking */
		retStatus = cublasGetError ();
		if (retStatus != CUBLAS_STATUS_SUCCESS) 
			error(_("CUBLAS: Error in Transferring data to advice"));
		/********************************************/

		cublasSetVector(n * nrhs, sizeof(double), B, 1, d_B, 1);

		magma_dgetrs_gpu( 'N', n, nrhs, d_A, n, ipiv, d_B, n, &info );

		cublasGetVector(n * nrhs, sizeof(double), d_B, 1, B, 1);

		/* Error Checking */
		retStatus = cublasGetError ();
		if (retStatus != CUBLAS_STATUS_SUCCESS) 
			error(_("CUBLAS: Error in Transferring from to advice"));
		/********************************************/

		cublasFree(d_A);
		cublasFree(d_B);

		/* Error Checking */
		retStatus = cublasGetError ();
		if (retStatus != CUBLAS_STATUS_SUCCESS) 
			error(_("CUBLAS: Error in freeing data"));
		/********************************************/

		
	}
	if (info)
		error(_("Lapack routine dgetrs: system is exactly singular"));
	UNPROTECT(2);
	return val;
#endif
	    return R_NilValue;
}
示例#11
0
SEXP magma_dgeMatrix_solve(SEXP a)
{
#ifdef HIPLAR_WITH_MAGMA
    /*  compute the 1-norm of the matrix, which is needed
	later for the computation of the reciprocal condition number. */
    double aNorm = magma_get_norm(a, "1");

    /* the LU decomposition : */
		/* Given that we may be performing this operation
		 * on the GPU we may put in an optimisation here
		 * where if we call the LU solver we, we do not require
		 * the decomposition to be transferred back to CPU. This is TODO
		 */
    SEXP val = PROTECT(NEW_OBJECT(MAKE_CLASS("dgeMatrix"))),
	lu = magma_dgeMatrix_LU_(a, TRUE);
    int *dims = INTEGER(GET_SLOT(lu, Matrix_DimSym)),
	*pivot = INTEGER(GET_SLOT(lu, Matrix_permSym));

    /* prepare variables for the dgetri calls */
    double *x, tmp;
    int	info, lwork = -1;

    if (dims[0] != dims[1]) error(_("Solve requires a square matrix"));
    slot_dup(val, lu, Matrix_xSym);
    x = REAL(GET_SLOT(val, Matrix_xSym));
    slot_dup(val, lu, Matrix_DimSym);
		int N2 = dims[0] * dims[0];

    if(dims[0]) /* the dimension is not zero */
    {
			/* is the matrix is *computationally* singular ? */
			double rcond;
			F77_CALL(dgecon)("1", dims, x, dims, &aNorm, &rcond,
					(double *) R_alloc(4*dims[0], sizeof(double)),
					(int *) R_alloc(dims[0], sizeof(int)), &info);
			if (info)
				error(_("error [%d] from Lapack 'dgecon()'"), info);
			if(rcond < DOUBLE_EPS)
				error(_("Lapack dgecon(): system computationally singular, reciprocal condition number = %g"),
						rcond);

			/* only now try the inversion and check if the matrix is *exactly* singular: */
			// This is also a work space query. This is not an option in magma

			F77_CALL(dgetri)(dims, x, dims, pivot, &tmp, &lwork, &info);
			lwork = (int) tmp;
			
			if( GPUFlag == 0){
				

				F77_CALL(dgetri)(dims, x, dims, pivot,
						(double *) R_alloc((size_t) lwork, sizeof(double)),
						&lwork, &info);

#ifdef HIPLAR_DBG
				R_ShowMessage("DBG: Solve using LU using dgetri;");
#endif
			}
			else if(GPUFlag == 1) {
				
				double *d_x, *dwork; 
				cublasStatus retStatus;			
	
#ifdef HIPLAR_DBG
				R_ShowMessage("Solve using LU using magma_dgetri;");
#endif
				cublasAlloc(N2, sizeof(double), (void**)&d_x);

				//cublasAlloc(N2 , sizeof(double), (void**)&dtmp);
				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Memory Allocation on Device"));
				/********************************************/

				cublasSetVector( N2, sizeof(double), x, 1, d_x, 1);

				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Data Transfer to Device"));
				/********************************************/
				lwork = dims[0] * magma_get_dgetri_nb( dims[0] );
				
					cublasAlloc(lwork, sizeof(double), (void**)&dwork);

				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Memory Allocation on Device"));
				/********************************************/

				magma_dgetri_gpu(dims[0], d_x, dims[0], pivot, dwork , lwork, &info);

				cublasGetVector(N2, sizeof(double), d_x, 1, x, 1);

				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Data From to Device"));
				/********************************************/

				cublasFree(dwork);
				cublasFree(d_x);

				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error freeing memory"));
				/********************************************/
					
			}
			else
				error(_("GPUFlag not set correctly"));

			if (info)
				error(_("Lapack routine dgetri: system is exactly singular"));
    }
    UNPROTECT(1);
    return val;
#endif
	    return R_NilValue;
}
示例#12
0
SEXP magma_dpoMatrix_chol(SEXP x)
{
#ifdef HIPLAR_WITH_MAGMA
	SEXP val = get_factors(x, "Cholesky"),
			 dimP = GET_SLOT(x, Matrix_DimSym),
			 uploP = GET_SLOT(x, Matrix_uploSym);

	const char *uplo = CHAR(STRING_ELT(uploP, 0));
	int *dims = INTEGER(dimP), info;
	int n = dims[0];
	double *vx;
	cublasStatus retStatus;
	if (val != R_NilValue) return val;
	dims = INTEGER(dimP);
	val = PROTECT(NEW_OBJECT(MAKE_CLASS("Cholesky")));
	SET_SLOT(val, Matrix_uploSym, duplicate(uploP));
	SET_SLOT(val, Matrix_diagSym, mkString("N"));
	SET_SLOT(val, Matrix_DimSym, duplicate(dimP));
	vx = REAL(ALLOC_SLOT(val, Matrix_xSym, REALSXP, n * n));
	AZERO(vx, n * n);
	
	//we could put in magmablas_dlacpy but it only
	//copies all of the matrix 
	F77_CALL(dlacpy)(uplo, &n, &n, REAL(GET_SLOT(x, Matrix_xSym)), &n, vx, &n);
	if (n > 0) {

		if(GPUFlag == 0){
#ifdef HIPLAR_DBG	
		R_ShowMessage("DBG: Cholesky decomposition using dpotrf;");
#endif
			F77_CALL(dpotrf)(uplo, &n, vx, &n, &info);
		}
		else if(GPUFlag == 1 && Interface == 0){
		
#ifdef HIPLAR_DBG	
			R_ShowMessage("DBG: Cholesky decomposition using magma_dpotrf;");
#endif			
			int nrows, ncols;
			nrows = ncols = n;

			magma_int_t lda;
			lda = nrows;

			magma_dpotrf(uplo[0], ncols, vx, lda, &info);

			/* Error Checking */
			retStatus = cudaGetLastError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in magma_dpotrf"));
			/********************************************/
			

		}
		else if(GPUFlag == 1 && Interface == 1) {
	
#ifdef HIPLAR_DBG	
			R_ShowMessage("DBG: Cholesky decomposition using magma_dpotrf_gpu;");	
#endif
			double *d_c;
			int nrows, ncols;
			nrows = ncols = n;
			int N2 = nrows * ncols;


			magma_int_t lda;
			lda = nrows;

			cublasAlloc(lda * ncols, sizeof(double), (void**)&d_c);
			
			/* Error Checking */
			retStatus = cublasGetError ();
			if (retStatus != CUBLAS_STATUS_SUCCESS) 
				error(_("CUBLAS: Error in Memory Allocation"));
			/********************************************/

			cublasSetVector(N2, sizeof(double), vx, 1, d_c, 1);
			
			/* Error Checking */
			retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Date Transfer to Device"));
			/********************************************/


			magma_dpotrf_gpu(uplo[0], ncols, d_c, lda, &info);
			
			/* Error Checking */
			retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in magma_dpotrf_gpu"));
			/********************************************/
			

			cublasGetVector(nrows * ncols, sizeof(double), d_c, 1, vx, 1);		
			
			/* Error Checking */
			retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Date Transfer from Device"));
			/********************************************/
			
			cublasFree(d_c);
		}
		else
			error(_("MAGMA/LAPACK/Interface Flag not defined correctly"));
		}
		
	if (info) {
			if(info > 0)
				error(_("the leading minor of order %d is not positive definite"),
						info);
			else /* should never happen! */
				error(_("Lapack routine %s returned error code %d"), "dpotrf", info);
		}

	UNPROTECT(1);
	return set_factors(x, val, "Cholesky");
#endif
	return R_NilValue;
}
示例#13
0
SEXP magma_dpoMatrix_matrix_solve(SEXP a, SEXP b)
{
#ifdef HIPLAR_WITH_MAGMA
    SEXP Chol = magma_dpoMatrix_chol(a),
	val = PROTECT(duplicate(b));
    int *adims = INTEGER(GET_SLOT(a, Matrix_DimSym)),
	*bdims = INTEGER(getAttrib(b, R_DimSymbol)),
	info;

    if (!(isReal(b) && isMatrix(b)))
	error(_("Argument b must be a numeric matrix"));
    if (*adims != *bdims || bdims[1] < 1 || *adims < 1)
	error(_("Dimensions of system to be solved are inconsistent"));
    
		double *A = REAL(GET_SLOT(Chol, Matrix_xSym));
		double *B = REAL(val);
		//const char *uplo = uplo_P(Chol);
		//int N = bdims[1];
		//There is only a GPU interface for this call
		//so it will be the default setting if the GPU is on
		if(GPUFlag == 1) {
	
#ifdef HIPLAR_DBG	
			R_ShowMessage("DBG: Solving system of Ax = b, A = dpo, b = dge, using dpotrs_gpu;");
#endif
			double *d_A, *d_B;
			const char *uplo = uplo_P(Chol);
			magma_int_t NRHS = bdims[1];
			magma_int_t lda	 = adims[1];
			magma_int_t ldb  = bdims[0];
			magma_int_t N 	 = adims[0];
			cublasStatus retStatus;
			cublasAlloc(N * lda, sizeof(double), (void**)&d_A);

			/* Error Checking */
			retStatus = cublasGetError ();
			if (retStatus != CUBLAS_STATUS_SUCCESS) 
				error(_("CUBLAS: Error in Memory Allocation"));
			/********************************************/

			cublasAlloc(N * NRHS, sizeof(double), (void**)&d_B);	

			/* Error Checking */
			retStatus = cublasGetError ();
			if (retStatus != CUBLAS_STATUS_SUCCESS) 
				error(_("CUBLAS: Error in Memory Allocation"));
			/********************************************/

			cublasSetVector( N  * lda , sizeof(double), A, 1, d_A, 1);

			/* Error Checking */
			retStatus = cublasGetError ();
			if (retStatus != CUBLAS_STATUS_SUCCESS) 
				error(_("CUBLAS: Error in Data Transfer to Device"));
			/********************************************/

			cublasSetVector( ldb * NRHS, sizeof(double), B, 1, d_B, 1 );

			/* Error Checking */
			retStatus = cublasGetError ();
			if (retStatus != CUBLAS_STATUS_SUCCESS) 
				error(_("CUBLAS: Error in Data Transfer to Device"));
			/********************************************/

			magma_dpotrs_gpu(uplo[0], N, NRHS , d_A, lda, d_B, ldb, &info);

			cublasGetVector( ldb * NRHS, sizeof(double), d_B, 1, B, 1);

			/* Error Checking */
			retStatus = cublasGetError ();
			if (retStatus != CUBLAS_STATUS_SUCCESS) 
				error(_("CUBLAS: Error in Data Transfer from Device"));
			/********************************************/

			cublasFree(d_A);
			cublasFree(d_B);
		}
		else {
		F77_CALL(dpotrs)(uplo_P(Chol), adims, bdims + 1,
		     REAL(GET_SLOT(Chol, Matrix_xSym)), adims,
		     REAL(val), bdims, &info);
		}
		// Error checking of MAGMA/LAPACK calls
		if (info) {
			if(info > 0)
				error(_("the leading minor of order %d is not positive definite"),
						info);
			else /* should never happen! */
				error(_("Lapack routine %s returned error code %d"), "dpotrf", info);
		}

		UNPROTECT(1);
    return val;
#endif
	return R_NilValue;
}
示例#14
0
SEXP magma_dpoMatrix_dgeMatrix_solve(SEXP a, SEXP b)
{
#ifdef HIPLAR_WITH_MAGMA
	SEXP Chol = magma_dpoMatrix_chol(a),
			 val = PROTECT(NEW_OBJECT(MAKE_CLASS("dgeMatrix")));
	int *adims = INTEGER(GET_SLOT(a, Matrix_DimSym)),
			*bdims = INTEGER(GET_SLOT(b, Matrix_DimSym)),
			info;

	/* Checking Matrix Dimensions */
	if (adims[1] != bdims[0])
		error(_("Dimensions of system to be solved are inconsistent"));
	if (adims[0] < 1 || bdims[1] < 1)
		error(_("Cannot solve() for matrices with zero extents"));
	/* ****************************************** */
	
	SET_SLOT(val, Matrix_factorSym, allocVector(VECSXP, 0));
	slot_dup(val, b, Matrix_DimSym);
	slot_dup(val, b, Matrix_xSym);

	double *A = REAL(GET_SLOT(Chol, Matrix_xSym));
	double *B = REAL(GET_SLOT(val, Matrix_xSym));

	if(GPUFlag == 1) {
	
#ifdef HIPLAR_DBG	
		R_ShowMessage("DBG: Solving system of Ax = b, A = dpo, b = dge, using dpotrs_gpu;");
#endif
		double *d_A, *d_B;
		const char *uplo = uplo_P(Chol);
		magma_int_t NRHS = bdims[1];
		magma_int_t lda	 = adims[1];
		magma_int_t ldb  = bdims[0];
		magma_int_t N 	 = adims[0];
		cublasStatus retStatus;

		/*if(uplo == "U")
			uplo = MagmaUpperStr;
		else if(uplo == "L")
			uplo = MagmaLowerStr;
		else		
			uplo = MagmaUpperStr;
		*/

		cublasAlloc(N * lda, sizeof(double), (void**)&d_A);
		
		/* Error Checking */
		retStatus = cublasGetError ();
		if (retStatus != CUBLAS_STATUS_SUCCESS) 
			error(_("CUBLAS: Error in Memory Allocation"));
		/********************************************/

		cublasAlloc(N * NRHS, sizeof(double), (void**)&d_B);	

		/* Error Checking */
		retStatus = cublasGetError ();
		if (retStatus != CUBLAS_STATUS_SUCCESS) 
			error(_("CUBLAS: Error in Memory Allocation"));
		/********************************************/

		cublasSetVector( N  * lda , sizeof(double), A, 1, d_A, 1);
		
		/* Error Checking */
		retStatus = cublasGetError ();
		if (retStatus != CUBLAS_STATUS_SUCCESS) 
			error(_("CUBLAS: Error in Data Transfer to Device"));
		/********************************************/

		cublasSetVector( ldb * NRHS, sizeof(double), B, 1, d_B, 1 );
		
		/* Error Checking */
		retStatus = cublasGetError ();
		if (retStatus != CUBLAS_STATUS_SUCCESS) 
			error(_("CUBLAS: Error in Data Transfer to Device"));
		/********************************************/

		magma_dpotrs_gpu(uplo[0], N, NRHS , d_A, lda, d_B, ldb, &info);

		cublasGetVector( ldb * NRHS, sizeof(double), d_B, 1, B, 1);
		
		/* Error Checking */
		retStatus = cublasGetError ();
		if (retStatus != CUBLAS_STATUS_SUCCESS) 
			error(_("CUBLAS: Error in Data Transfer from Device"));
		/********************************************/
		
		cublasFree(d_A);
		cublasFree(d_B);
	}
	else {
	
#ifdef HIPLAR_DBG	
		R_ShowMessage("DBG: Solving system of Ax = b, A = dpo, b = dge, using dpotrs;");
#endif
		F77_CALL(dpotrs)(uplo_P(Chol), adims, bdims + 1, A , adims, B , bdims, &info);
	}
	if (info) {
		if(info > 0)
			error(_("the leading minor of order %d is not positive definite"),
					info);
		else /* should never happen! */
			error(_("Lapack routine %s returned error code %d"), "dpotrf", info);
	}
	UNPROTECT(1);
	return val;
#endif
	return R_NilValue;
}
示例#15
0
文件: xgemm.c 项目: deccs/PLearn
/* Main */
int main(int argc, char** argv)
{    
  if (argc!=5){ 
    fprintf (stderr, "Usage: %s <sizeM> <sizeN> <sizeK> <Nb iter>\n",argv[0]); 
    exit(0); 
  } 
  const int M=strtol(argv[1],0,0);
  const int N=strtol(argv[2],0,0);
  const int K=strtol(argv[3],0,0);
  const int NBITER=strtol(argv[4],0,0);
  const int NA= M * K;
  const int NB= K * N;
  const int NC= M * N;
  real* h_A;
  real* h_B;
  real* h_C;
  const real alpha = 1.0f;
  const real beta = 0.0f;
#ifdef NVIDIA
  cublasStatus status;
  real* d_A = 0;
  real* d_B = 0;
  real* d_C = 0;
#endif

#ifdef COMPARE
  real* h_C_ref;
  real error_norm;
  real ref_norm;
  real diff;
#endif

    /* Allocate host memory for the matrices */
    h_A = (real*)malloc(NA * sizeof(h_A[0]));
    if (h_A == 0) {
        fprintf (stderr, "!!!! host memory allocation error (A)\n");
        return EXIT_FAILURE;
    }
    h_B = (real*)malloc(NB * sizeof(h_B[0]));
    if (h_B == 0) {
        fprintf (stderr, "!!!! host memory allocation error (B)\n");
        return EXIT_FAILURE;
    }
    h_C = (real*)malloc(NC * sizeof(h_C[0]));
    if (h_C == 0) {
        fprintf (stderr, "!!!! host memory allocation error (C)\n");
        return EXIT_FAILURE;
    }

    for (int i = 0; i < NA; ++i) h_A[i] = M_PI+(real)i;
    for (int i = 0; i < NB; ++i) h_B[i] = M_PI+(real)i;

#ifdef NVIDIA
    /* Initialize CUBLAS */
    status = cublasInit();
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! CUBLAS initialization error\n");
        return EXIT_FAILURE;
    }
    /* Allocate device memory for the matrices */
    status = cublasAlloc(NA, sizeof(d_A[0]), (void**)&d_A);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device memory allocation error (A)\n");
        return EXIT_FAILURE;
    }
    status = cublasAlloc(NB, sizeof(d_B[0]), (void**)&d_B);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device memory allocation error (B)\n");
        return EXIT_FAILURE;
    }
    status = cublasAlloc(NC, sizeof(d_C[0]), (void**)&d_C);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device memory allocation error (C)\n");
        return EXIT_FAILURE;
    }

    /* Initialize the device matrices with the host matrices */
    status = cublasSetVector(NA, sizeof(h_A[0]), h_A, 1, d_A, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device access error (write A)\n");
        return EXIT_FAILURE;
    }
    status = cublasSetVector(NB, sizeof(h_B[0]), h_B, 1, d_B, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device access error (write B)\n");
        return EXIT_FAILURE;
    }
    status = cublasSetVector(NC, sizeof(h_C[0]), h_C, 1, d_C, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device access error (write C)\n");
        return EXIT_FAILURE;
    }

    /* Clear last error */
    cublasGetError();
#endif
#ifdef COMPARE
    /* Performs operation using plain C code */
    for (int i=0;i<NBITER;i++)
      c_xgemm(M,N,K, alpha, h_A, h_B, beta, h_C);
    h_C_ref = h_C;
    /* Allocate host memory for reading back the result from device memory */
    h_C = (real*)malloc(NC * sizeof(h_C[0]));
    if (h_C == 0) {
        fprintf (stderr, "!!!! host memory allocation error (C)\n");
        return EXIT_FAILURE;
    }
#endif
#ifdef NVIDIA
    /* Performs operation using cublas */
    for (int i=0;i<NBITER;i++)
      //We must Change the order of the parameter as cublas take
      //matrix as colomn major and C matrix is row major
      cublasSgemm('n', 'n', N, M, K, alpha, d_B, N, d_A, K, beta, d_C, N);

    status = cublasGetError();
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! kernel execution error.\n");
        return EXIT_FAILURE;
    }
    /* Read the result back */
    status = cublasGetVector(NC, sizeof(h_C[0]), d_C, 1, h_C, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device access error (read C)\n");
        return EXIT_FAILURE;
    }
#elif defined( CXGEMM )
    for (int i=0;i<NBITER;i++)
      c_xgemm(M,N,K, alpha, h_A, h_B, beta, h_C);
#else
    char transa='N', transb='N';
    for (int i=0;i<NBITER;i++)
      sgemm_(&transb, &transa, &N, &M, &K, &alpha, h_B, &N, h_A, &K, &beta, h_C, &N);

#endif
#ifdef COMPARE
    /* Check result against reference */
    error_norm = 0;
    ref_norm = 0;
    for (int i = 0; i < NC; ++i) {
        diff = h_C_ref[i] - h_C[i];
        error_norm += diff * diff;
        ref_norm += h_C_ref[i] * h_C_ref[i];
    }
    error_norm = (float)sqrt((double)error_norm);
    ref_norm = (float)sqrt((double)ref_norm);
    if (fabs(ref_norm) < 1e-7) {
        fprintf (stderr, "!!!! reference norm is 0\n");
        return EXIT_FAILURE;
    }
    printf( "Test %s\n", (error_norm / ref_norm < 1e-6f) ? "PASSED" : "FAILED");
#endif

    /* Memory clean up */
    free(h_A);
    free(h_B);
    free(h_C);

#ifdef NVIDIA
    status = cublasFree(d_A);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! memory free error (A)\n");
        return EXIT_FAILURE;
    }
    status = cublasFree(d_B);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! memory free error (B)\n");
        return EXIT_FAILURE;
    }
    status = cublasFree(d_C);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! memory free error (C)\n");
        return EXIT_FAILURE;
    }

    /* Shutdown */
    status = cublasShutdown();
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! shutdown error (A)\n");
        return EXIT_FAILURE;
    }
#endif
    //    if (argc <= 1 || strcmp(argv[1], "-noprompt")) {
    //        printf("\nPress ENTER to exit...\n");
    //        getchar();
    //    }
    return EXIT_SUCCESS;
}
示例#16
0
SEXP magma_dgeMatrix_LU_(SEXP x, Rboolean warn_sing)
{
#ifdef HIPLAR_WITH_MAGMA
	SEXP val = get_factors(x, "LU");
	int *dims, npiv, info;

	if (val != R_NilValue) {
//		R_ShowMessage("already in slot");	/* nothing to do if it's there in 'factors' slot */
		return val;
	}

	dims = INTEGER(GET_SLOT(x, Matrix_DimSym));
	if (dims[0] < 1 || dims[1] < 1)
		error(_("Cannot factor a matrix with zero extents"));
	npiv = (dims[0] < dims[1]) ? dims[0] : dims[1];
	val = PROTECT(NEW_OBJECT(MAKE_CLASS("denseLU")));
	slot_dup(val, x, Matrix_xSym);
	slot_dup(val, x, Matrix_DimSym);
	double *h_R = REAL(GET_SLOT(val, Matrix_xSym));
	int *ipiv = INTEGER(ALLOC_SLOT(val, Matrix_permSym, INTSXP, npiv));
	
	if(GPUFlag == 0){
#ifdef HIPLAR_DBG
		R_ShowMessage("DBG: LU decomposition using dgetrf;");
#endif
		F77_CALL(dgetrf)(dims, dims + 1, h_R,
				dims,
				ipiv,
				&info);
	}
	else if(GPUFlag == 1 && Interface == 0){
#ifdef HIPLAR_DBG
		R_ShowMessage("DBG: LU decomposition using magma_dgetrf;");
#endif
			magma_dgetrf(dims[0], dims[1], h_R, dims[0], ipiv, &info);
		}
		else if(GPUFlag == 1 && Interface == 1) {
			
#ifdef HIPLAR_DBG
			R_ShowMessage("DBG: LU decomposition using magma_dgetrf_gpu;");	
#endif
			double *d_A;
			int N2 = dims[0] * dims[1];
			cublasStatus retStatus;

			cublasAlloc( N2 , sizeof(double), (void**)&d_A);
			
			/* Error Checking */
			retStatus = cublasGetError ();
			if (retStatus != CUBLAS_STATUS_SUCCESS) 
				error(_("CUBLAS: Error in Memory Allocation"));
			/********************************************/

			cublasSetVector(N2, sizeof(double), h_R, 1, d_A, 1);
			
			/* Error Checking */
			retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Date Transfer to Device"));
			/********************************************/

			magma_dgetrf_gpu(dims[0],dims[1], d_A, dims[0], ipiv,  &info);
			
			cublasGetVector( N2, sizeof(double), d_A, 1, h_R, 1);		
			
			/* Error Checking */
			retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Date Transfer from Device"));
			/********************************************/
			
				cublasFree(d_A);
		
			/* Error Checking */
			retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error freeing data"));
			/********************************************/
		}
		else
			error(_("MAGMA/LAPACK/Interface Flag not defined correctly"));
		
	if (info < 0)
		error(_("Lapack routine %s returned error code %d"), "dgetrf", info);
	else if (info > 0 && warn_sing)
		warning(_("Exact singularity detected during LU decomposition: %s, i=%d."),
				"U[i,i]=0", info);
	UNPROTECT(1);

	return set_factors(x, val, "LU");
#endif

	    return R_NilValue;
}
示例#17
0
SEXP magma_dgeMatrix_crossprod(SEXP x, SEXP trans)
{
#ifdef HIPLAR_WITH_MAGMA
	int tr = asLogical(trans);/* trans=TRUE: tcrossprod(x) */
	SEXP val = PROTECT(NEW_OBJECT(MAKE_CLASS("dpoMatrix"))),
	     nms = VECTOR_ELT(GET_SLOT(x, Matrix_DimNamesSym), tr ? 0 : 1),
	     vDnms = ALLOC_SLOT(val, Matrix_DimNamesSym, VECSXP, 2);
	int *Dims = INTEGER(GET_SLOT(x, Matrix_DimSym)),
	    *vDims = INTEGER(ALLOC_SLOT(val, Matrix_DimSym, INTSXP, 2));
	int k = tr ? Dims[1] : Dims[0], n = tr ? Dims[0] : Dims[1];
	double *vx = REAL(ALLOC_SLOT(val, Matrix_xSym, REALSXP, n * n)),
		 one = 1.0, zero = 0.0;
	double *A =  REAL(GET_SLOT(x, Matrix_xSym));
	AZERO(vx, n * n);
	SET_SLOT(val, Matrix_uploSym, mkString("U"));
	ALLOC_SLOT(val, Matrix_factorSym, VECSXP, 0);
	vDims[0] = vDims[1] = n;
	SET_VECTOR_ELT(vDnms, 0, duplicate(nms));
	SET_VECTOR_ELT(vDnms, 1, duplicate(nms));
	if(n && GPUFlag == 1) {

#ifdef HIPLAR_DBG
		R_ShowMessage("DBG: Performing crossproduct using cublasDsyrk");
#endif
		cublasStatus retStatus;
		double *d_A, *d_C;

		/*retStatus = cublasCreate(&handle);
		  if ( retStatus != CUBLAS_STATUS_SUCCESS )		
		  error(_("CUBLAS initialisation failed"));
		  */

		cublasAlloc(n * k, sizeof(double), (void**)&d_A);
		/* Error Checking */
		retStatus = cublasGetError ();
		if (retStatus != CUBLAS_STATUS_SUCCESS) 
			error(_("CUBLAS: Error in Memory Allocation"));
		/********************************************/

		cublasAlloc(n * n, sizeof(double), (void**)&d_C);

		/* Error Checking */
		retStatus = cublasGetError ();
		if (retStatus != CUBLAS_STATUS_SUCCESS) 
			error(_("CUBLAS: Error in Memory Allocation"));
		/********************************************/

		cublasSetVector( n  * k , sizeof(double), A, 1, d_A, 1);

		/* Error Checking */
		retStatus = cublasGetError ();
		if (retStatus != CUBLAS_STATUS_SUCCESS) 
			error(_("CUBLAS: Error in Data Transfer to Device"));
		/********************************************/
		
		//cublasSetVector( n  * n , sizeof(double), vx, 1, d_C, 1);
		
		/* Error Checking */
		//retStatus = cublasGetError ();
		//if (retStatus != CUBLAS_STATUS_SUCCESS) 
		//	error(_("CUBLAS: Error in Data Transfer to Device"));
		/********************************************/


		cublasDsyrk('U' , tr ? 'N' : 'T', n, k, one, d_A, Dims[0], zero, d_C, n);

		cublasGetVector( n * n , sizeof(double), d_C, 1, vx, 1);

		/* Error Checking */
		retStatus = cublasGetError ();
		if (retStatus != CUBLAS_STATUS_SUCCESS) 
			error(_("CUBLAS: Error in Data Transfer from Device"));
		/********************************************/

		cublasFree(d_A);
		cublasFree(d_C);

	} else if(n){
	
#ifdef HIPLAR_DBG
		R_ShowMessage("DBG: Performing cross prod with dsyrk");
#endif
		F77_CALL(dsyrk)("U", tr ? "N" : "T", &n, &k, &one, A, Dims,
				&zero, vx, &n);
	}

	SET_SLOT(val, Matrix_factorSym, allocVector(VECSXP, 0));
	UNPROTECT(1);
	return val;
#endif
	return R_NilValue;
}
示例#18
0
extern "C" magma_int_t
magma_zgeqrf2(magma_context *cntxt, magma_int_t m, magma_int_t n, 
          cuDoubleComplex *a,    magma_int_t lda, cuDoubleComplex *tau, 
          cuDoubleComplex *work, magma_int_t lwork,
          magma_int_t *info)
{
/*  -- MAGMA (version 1.5.0-beta3) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       @date July 2014

    Purpose
    =======
    ZGEQRF computes a QR factorization of a COMPLEX_16 M-by-N matrix A:
    A = Q * R. This version does not require work space on the GPU
    passed as input. GPU memory is allocated in the routine.

    Arguments
    =========
    CNTXT   (input) MAGMA_CONTEXT
            CNTXT specifies the MAGMA hardware context for this routine.

    M       (input) INTEGER
            The number of rows of the matrix A.  M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix A.  N >= 0.

    A       (input/output) COMPLEX_16 array, dimension (LDA,N)
            On entry, the M-by-N matrix A.
            On exit, the elements on and above the diagonal of the array
            contain the min(M,N)-by-N upper trapezoidal matrix R (R is
            upper triangular if m >= n); the elements below the diagonal,
            with the array TAU, represent the orthogonal matrix Q as a
            product of min(m,n) elementary reflectors (see Further
            Details).

            Higher performance is achieved if A is in pinned memory, e.g.
            allocated using cudaMallocHost.

    LDA     (input) INTEGER
            The leading dimension of the array A.  LDA >= max(1,M).

    TAU     (output) COMPLEX_16 array, dimension (min(M,N))
            The scalar factors of the elementary reflectors (see Further
            Details).

    WORK    (workspace/output) COMPLEX_16 array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK(1) returns the optimal LWORK.

        Higher performance is achieved if WORK is in pinned memory, e.g.
            allocated using cudaMallocHost.

    LWORK   (input) INTEGER
            The dimension of the array WORK.  LWORK >= N*NB,
            where NB can be obtained through magma_get_zgeqrf_nb(M).

            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued.

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
                  if INFO = -8, the GPU memory allocation failed

    Further Details
    ===============
    The matrix Q is represented as a product of elementary reflectors

       Q = H(1) H(2) . . . H(k), where k = min(m,n).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a complex scalar, and v is a complex vector with
    v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
    and tau in TAU(i).
    =====================================================================    */

    #define  a_ref(a_1,a_2) ( a+(a_2)*(lda) + (a_1))
    #define da_ref(a_1,a_2) (da+(a_2)*ldda  + (a_1))

    int cnt=-1;
    cuDoubleComplex c_one = MAGMA_Z_ONE;

    int i, k, lddwork, old_i, old_ib;
    int nbmin, nx, ib, ldda;

    *info = 0;

    magma_qr_params *qr_params = (magma_qr_params *)cntxt->params;
    int nb = qr_params->nb;

    int lwkopt = n * nb;
    work[0] = MAGMA_Z_MAKE( (double)lwkopt, 0 );
    long int lquery = (lwork == -1);
    if (m < 0) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (lda < max(1,m)) {
        *info = -4;
    } else if (lwork < max(1,n) && ! lquery) {
        *info = -7;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return MAGMA_ERR_ILLEGAL_VALUE;
    }
    else if (lquery)
      return MAGMA_SUCCESS;

    k = min(m,n);
    if (k == 0) {
        work[0] = c_one;
        return MAGMA_SUCCESS;
    }

    cublasStatus status;
    static cudaStream_t stream[2];
    cudaStreamCreate(&stream[0]);
    cudaStreamCreate(&stream[1]);

    nbmin = 2;
    nx = nb;

    lddwork = ((n+31)/32)*32;
    ldda    = ((m+31)/32)*32;

    cuDoubleComplex *da;
    status = cublasAlloc((n)*ldda + nb*lddwork, sizeof(cuDoubleComplex), (void**)&da);
    if (status != CUBLAS_STATUS_SUCCESS) {
        *info = -8;
        return 0;
    }
    cuDoubleComplex *dwork = da + ldda*(n);

    if (nb >= nbmin && nb < k && nx < k) {
        /* Use blocked code initially */
        cudaMemcpy2DAsync(da_ref(0,nb), ldda*sizeof(cuDoubleComplex),
                           a_ref(0,nb), lda *sizeof(cuDoubleComplex),
                          sizeof(cuDoubleComplex)*(m), (n-nb),
                          cudaMemcpyHostToDevice,stream[0]);

        old_i = 0; old_ib = nb;
        for (i = 0; i < k-nx; i += nb) {
            ib = min(k-i, nb);
            if (i>0){
                cudaMemcpy2DAsync( a_ref(i,i),  lda *sizeof(cuDoubleComplex),
                                   da_ref(i,i), ldda*sizeof(cuDoubleComplex),
                                   sizeof(cuDoubleComplex)*(m-i), ib,
                                   cudaMemcpyDeviceToHost,stream[1]);

                cudaMemcpy2DAsync( a_ref(0,i),  lda *sizeof(cuDoubleComplex),
                                   da_ref(0,i), ldda*sizeof(cuDoubleComplex),
                                   sizeof(cuDoubleComplex)*i, ib,
                                   cudaMemcpyDeviceToHost,stream[0]);

                /* Apply H' to A(i:m,i+2*ib:n) from the left */
                magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, 
                  m-old_i, n-old_i-2*old_ib, old_ib,
                  da_ref(old_i, old_i),          ldda, dwork,        lddwork,
                  da_ref(old_i, old_i+2*old_ib), ldda, dwork+old_ib, lddwork);
            }

            cudaStreamSynchronize(stream[1]);
            int rows = m-i;

        cnt++;
        cntxt->nb = qr_params->ib;
        magma_zgeqrf_mc(cntxt, &rows, &ib, a_ref(i,i), &lda, 
                tau+i, work, &lwork, info);
        cntxt->nb = nb;

            /* Form the triangular factor of the block reflector
               H = H(i) H(i+1) . . . H(i+ib-1) */
            lapackf77_zlarft( MagmaForwardStr, MagmaColumnwiseStr, 
                              &rows, &ib, a_ref(i,i), &lda, tau+i, qr_params->t+cnt*nb*nb, &ib);
        if (cnt < qr_params->np_gpu) {
          qr_params->p[cnt]=a;
        }
        zpanel_to_q(MagmaUpper, ib, a_ref(i,i), lda, qr_params->w+cnt*qr_params->nb*qr_params->nb);
            cublasSetMatrix(rows, ib, sizeof(cuDoubleComplex),
                            a_ref(i,i), lda, da_ref(i,i), ldda);
        if (qr_params->flag == 1)
          zq_to_panel(MagmaUpper, ib, a_ref(i,i), lda, qr_params->w+cnt*qr_params->nb*qr_params->nb);
        
            if (i + ib < n) { 
          cublasSetMatrix(ib, ib, sizeof(cuDoubleComplex), qr_params->t+cnt*nb*nb, ib, dwork, lddwork);

          if (i+ib < k-nx)
        /* Apply H' to A(i:m,i+ib:i+2*ib) from the left */
        magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, 
                  rows, ib, ib, 
                  da_ref(i, i   ), ldda, dwork,    lddwork, 
                  da_ref(i, i+ib), ldda, dwork+ib, lddwork);
          else
        magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, 
                  rows, n-i-ib, ib, 
                  da_ref(i, i   ), ldda, dwork,    lddwork, 
                  da_ref(i, i+ib), ldda, dwork+ib, lddwork);

          old_i  = i;
          old_ib = ib;
            }
        }
    } else {
      i = 0;
    }
    
    /* Use unblocked code to factor the last or only block. */
    if (i < k) 
      {
    ib = n-i;
    if (i!=0)
      cublasGetMatrix(m, ib, sizeof(cuDoubleComplex),
              da_ref(0,i), ldda, a_ref(0,i), lda);
        int rows = m-i;
    
        cnt++;
        lapackf77_zgeqrf(&rows, &ib, a_ref(i,i), &lda, tau+i, work, &lwork, info);
    
    if (cnt < qr_params->np_gpu) 
      {
        int ib2=min(ib,nb);
        
        lapackf77_zlarft( MagmaForwardStr, MagmaColumnwiseStr, 
                              &rows, &ib2, a_ref(i,i), &lda, tau+i, qr_params->t+cnt*nb*nb, &ib2);
        
        qr_params->p[cnt]=a;
      }
      }
    
    cudaStreamDestroy( stream[0] );
    cudaStreamDestroy( stream[1] );
    cublasFree(da);
    return MAGMA_SUCCESS;
} /* magma_zgeqrf */
示例#19
0
/* ========================================================================== */
int sci_gpuLU(char *fname)
{
    CheckRhs(1,2);
    CheckLhs(2,2);
    #ifdef WITH_CUDA
        cublasStatus status;
    #endif
    SciErr sciErr;
    int*    piAddr_A    = NULL;
    double* h_A         = NULL;
    double* hi_A        = NULL;
    int     rows_A;
    int     cols_A;

    int*    piAddr_Opt  = NULL;
    double* option      = NULL;
    int     rows_Opt;
    int     cols_Opt;

    void*   d_A         = NULL;
    int     na;
    void*   pvPtr       = NULL;

    int     size_A      = sizeof(double);
    bool    bComplex_A  = FALSE;
    int     inputType_A;
    int     inputType_Opt;
    double  res;
    int     posOutput   = 1;

    try
    {
        sciErr = getVarAddressFromPosition(pvApiCtx, 1, &piAddr_A);
        if(sciErr.iErr) throw sciErr;
        if(Rhs == 2)
        {
            sciErr = getVarAddressFromPosition(pvApiCtx, 2, &piAddr_Opt);
            if(sciErr.iErr) throw sciErr;
            sciErr = getVarType(pvApiCtx, piAddr_Opt, &inputType_Opt);
            if(sciErr.iErr) throw sciErr;
            if(inputType_Opt == sci_matrix)
            {
                sciErr = getMatrixOfDouble(pvApiCtx, piAddr_Opt, &rows_Opt, &cols_Opt, &option);
                if(sciErr.iErr) throw sciErr;
            }
            else
                throw "Option syntax is [number,number].";
        }
        else
        {
            rows_Opt=1;
            cols_Opt=2;
            option = (double*)malloc(2*sizeof(double));
            option[0]=0;
            option[1]=0;
        }

        if(rows_Opt != 1 || cols_Opt != 2)
            throw "Option syntax is [number,number].";

        if((int)option[1] == 1 && !isGpuInit())
            throw "gpu is not initialised. Please launch gpuInit() before use this function.";

        sciErr = getVarType(pvApiCtx, piAddr_A, &inputType_A);
        if(sciErr.iErr) throw sciErr;

        #ifdef WITH_CUDA
        if (useCuda())
        {
            if(inputType_A == sci_pointer)
            {
                sciErr = getPointer(pvApiCtx, piAddr_A, (void**)&pvPtr);
                if(sciErr.iErr) throw sciErr;

                gpuMat_CUDA* gmat;
                gmat = static_cast<gpuMat_CUDA*>(pvPtr);
				if(!gmat->useCuda)
					throw "Please switch to OpenCL mode before use this data.";
                rows_A=gmat->rows;
                cols_A=gmat->columns;
                if(gmat->complex)
                {
                    bComplex_A = TRUE;
                    size_A = sizeof(cuDoubleComplex);
                    d_A=(cuDoubleComplex*)gmat->ptr->get_ptr();
                }
                else
                    d_A=(double*)gmat->ptr->get_ptr();

                // Initialize CUBLAS
                status = cublasInit();
                if (status != CUBLAS_STATUS_SUCCESS) throw status;

                na = rows_A * cols_A;
            }
            else if(inputType_A == 1)
            {
                // Get size and data
                if(isVarComplex(pvApiCtx, piAddr_A))
                {
                    sciErr = getComplexMatrixOfDouble(pvApiCtx, piAddr_A, &rows_A, &cols_A, &h_A, &hi_A);
                    if(sciErr.iErr) throw sciErr;
                    size_A = sizeof(cuDoubleComplex);
                    bComplex_A = TRUE;
                }
                else
                {
                    sciErr = getMatrixOfDouble(pvApiCtx, piAddr_A, &rows_A, &cols_A, &h_A);
                    if(sciErr.iErr) throw sciErr;
                }

                na = rows_A * cols_A;

                // Initialize CUBLAS
                status = cublasInit();
                if (status != CUBLAS_STATUS_SUCCESS) throw status;

                // Allocate device memory
                status = cublasAlloc(na, size_A, (void**)&d_A);
                if (status != CUBLAS_STATUS_SUCCESS) throw status;

                // Initialize the device matrices with the host matrices
                if(!bComplex_A)
                {
                    status = cublasSetMatrix(rows_A,cols_A, sizeof(double), h_A, rows_A, (double*)d_A, rows_A);
                    if (status != CUBLAS_STATUS_SUCCESS) throw status;
                }
                else
                    writecucomplex(h_A, hi_A, rows_A, cols_A, (cuDoubleComplex *)d_A);

            }
            else
                throw "Bad argument type.";

            cuDoubleComplex resComplex;
            // Performs operation
            if(!bComplex_A)
                status = decomposeBlockedLU(rows_A, cols_A, rows_A, (double*)d_A, 1);
       //     else
       //         resComplex = cublasZtrsm(na,(cuDoubleComplex*)d_A);

            if (status != CUBLAS_STATUS_SUCCESS) throw status;

            // Put the result in scilab
            switch((int)option[0])
            {
                case 2 :
                case 1 :    sciprint("The first option must be 0 for this function. Considered as 0.\n");

                case 0 :    // Keep the result on the Host.
                {           // Put the result in scilab
                    if(!bComplex_A)
                    {
                        double* h_res = NULL;
                        sciErr=allocMatrixOfDouble(pvApiCtx, Rhs + posOutput, rows_A, cols_A, &h_res);
                        if(sciErr.iErr) throw sciErr;
                        status = cublasGetMatrix(rows_A,cols_A, sizeof(double), (double*)d_A, rows_A, h_res, rows_A);
                        if (status != CUBLAS_STATUS_SUCCESS) throw status;
                    }
                    else
                    {
                        sciErr = createComplexMatrixOfDouble(pvApiCtx, Rhs + posOutput, 1, 1, &resComplex.x,&resComplex.y);
                        if(sciErr.iErr) throw sciErr;
                    }

                    LhsVar(posOutput)=Rhs+posOutput;
                    posOutput++;
                    break;
                }

                default : throw "First option argument must be 0 or 1 or 2.";
            }

            switch((int)option[1])
            {
                case 0 :    // Don't keep the data input on Device.
                {
                    if(inputType_A == sci_matrix)
                    {
                        status = cublasFree(d_A);
                        if (status != CUBLAS_STATUS_SUCCESS) throw status;
                        d_A = NULL;
                    }
                    break;
                }
                case 1 :    // Keep data of the fisrt argument on Device and return the Device pointer.
                {
                    if(inputType_A == sci_matrix)
                    {
                        gpuMat_CUDA* dptr;
                        gpuMat_CUDA tmp={getCudaContext()->genMatrix<double>(getCudaQueue(),rows_A*cols_A),rows_A,cols_A};
                        dptr=new gpuMat_CUDA(tmp);
						dptr->useCuda = true;
                        dptr->ptr->set_ptr((double*)d_A);
                        if(bComplex_A)
                            dptr->complex=TRUE;
                        else
                            dptr->complex=FALSE;

                        sciErr = createPointer(pvApiCtx,Rhs+posOutput, (void*)dptr);
                        if(sciErr.iErr) throw sciErr;
                        LhsVar(posOutput)=Rhs+posOutput;
                    }
                    else
                        throw "The first input argument is already a GPU variable.";

                    posOutput++;
                    break;
                }

                default : throw "Second option argument must be 0 or 1.";
            }
            // Shutdown
            status = cublasShutdown();
            if (status != CUBLAS_STATUS_SUCCESS) throw status;
        }
        #endif

        #ifdef WITH_OPENCL
        if (!useCuda())
        {
            throw "not implemented with OpenCL.";
        }
        #endif
        if(Rhs == 1)
        {
            free(option);
            option = NULL;
        }

        if(posOutput < Lhs+1)
            throw "Too many output arguments.";

        if(posOutput > Lhs+1)
            throw "Too few output arguments.";

        PutLhsVar();
        return 0;
    }
    catch(const char* str)
    {
        Scierror(999,"%s\n",str);
    }
    catch(SciErr E)
    {
        printError(&E, 0);
    }
    #ifdef WITH_CUDA
    catch(cudaError_t cudaE)
    {
        GpuError::treat_error<CUDAmode>((CUDAmode::Status)cudaE);
    }
    catch(cublasStatus CublasE)
    {
        GpuError::treat_error<CUDAmode>((CUDAmode::Status)CublasE,1);
    }
    if (useCuda())
    {
        if(inputType_A == 1 && d_A != NULL) cudaFree(d_A);
    }
    #endif
    #ifdef WITH_OPENCL
    if (!useCuda())
    {
        Scierror(999,"not implemented with OpenCL.\n");
    }
    #endif
    if(Rhs == 1 && option != NULL) free(option);
    return EXIT_FAILURE;
}
示例#20
0
SEXP magma_dgeMatrix_matrix_crossprod(SEXP x, SEXP y, SEXP trans)
{
#ifdef HIPLAR_WITH_MAGMA
	int tr = asLogical(trans);/* trans=TRUE: tcrossprod(x,y) */
	SEXP val = PROTECT(NEW_OBJECT(MAKE_CLASS("dgeMatrix")));
	int *xDims = INTEGER(GET_SLOT(x, Matrix_DimSym)),
	    *yDims = INTEGER(getAttrib(y, R_DimSymbol)),
	    *vDims, nprot = 1;
	int m  = xDims[!tr],  n = yDims[!tr];/* -> result dim */
	int xd = xDims[ tr], yd = yDims[ tr];/* the conformable dims */
	double one = 1.0, zero = 0.0;

	if (isInteger(y)) {
		y = PROTECT(coerceVector(y, REALSXP));
		nprot++;
	}
	if (!(isMatrix(y) && isReal(y)))
		error(_("Argument y must be a numeric matrix"));
	SET_SLOT(val, Matrix_factorSym, allocVector(VECSXP, 0));
	SET_SLOT(val, Matrix_DimSym, allocVector(INTSXP, 2));
	vDims = INTEGER(GET_SLOT(val, Matrix_DimSym));
	if (xd > 0 && yd > 0 && n > 0 && m > 0) {
		if (xd != yd)
			error(_("Dimensions of x and y are not compatible for %s"),
					tr ? "tcrossprod" : "crossprod");
		vDims[0] = m; vDims[1] = n;
		SET_SLOT(val, Matrix_xSym, allocVector(REALSXP, m * n));

		double *A = REAL(GET_SLOT(x, Matrix_xSym));
		double *B = REAL(y);
		double *C = REAL(GET_SLOT(val, Matrix_xSym));

		if(GPUFlag == 1) {
			
			double *d_A, *d_B, *d_C;
			cublasStatus retStatus;

#ifdef HIPLAR_DBG
			R_ShowMessage("DBG: Performing dge/matrix crossprod using magmablas_dgemm");
#endif
			cublasAlloc(m * xd, sizeof(double), (void**)&d_A);

			/* Error Checking */
			retStatus = cublasGetError ();
			if (retStatus != CUBLAS_STATUS_SUCCESS) 
				error(_("CUBLAS: Error in Memory Allocation"));
			/********************************************/

			cublasAlloc(n * xd, sizeof(double), (void**)&d_B);	

			/* Error Checking */
			retStatus = cublasGetError ();
			if (retStatus != CUBLAS_STATUS_SUCCESS) 
				error(_("CUBLAS: Error in Memory Allocation"));
			/********************************************/

			cublasAlloc(m * n, sizeof(double), (void**)&d_C);	

			/* Error Checking */
			retStatus = cublasGetError ();
			if (retStatus != CUBLAS_STATUS_SUCCESS) 
				error(_("CUBLAS: Error in Memory Allocation"));
			/********************************************/

			cublasSetVector( m  * xd , sizeof(double), A, 1, d_A, 1);

			/* Error Checking */
			retStatus = cublasGetError ();
			if (retStatus != CUBLAS_STATUS_SUCCESS) 
				error(_("CUBLAS: Error in Data Transfer to Device"));
			/********************************************/

			cublasSetVector( xd * n, sizeof(double), B, 1, d_B, 1 );

			/* Error Checking */
			retStatus = cublasGetError ();
			if (retStatus != CUBLAS_STATUS_SUCCESS) 
				error(_("CUBLAS: Error in Data Transfer to Device"));
			/********************************************/

			cublasSetVector( m * n, sizeof(double), C, 1, d_C, 1 );

			/* Error Checking */
			retStatus = cublasGetError ();
			if (retStatus != CUBLAS_STATUS_SUCCESS) 
				error(_("CUBLAS: Error in Data Transfer to Device"));
			/********************************************/


			// ******** magmablas_dgemm call Here **
			//magmablas_dgemm( tr ? 'N' : 'T', tr ? 'T' : 'N', m, n, xd, one, d_A, xDims[0], d_B, yDims[0], zero, d_C,  m);
			//CHANGE
			cublasDgemm( tr ? 'N' : 'T', tr ? 'T' : 'N', m, n, xd, one, d_A, xDims[0], d_B, yDims[0], zero, d_C,  m);
			cublasGetVector( m * n , sizeof(double), d_C, 1, C, 1);

			/* Error Checking */
			retStatus = cublasGetError ();
			if (retStatus != CUBLAS_STATUS_SUCCESS) 
				error(_("CUBLAS: Error in Data Transfer from Device"));
			/********************************************/

			cublasFree(d_A);
			cublasFree(d_B);
			cublasFree(d_C);
			
		}
		else {

#ifdef HIPLAR_DBG
			R_ShowMessage("DBG: Performing dge/matrix cross prod with dgemm");
#endif
			F77_CALL(dgemm)(tr ? "N" : "T", tr ? "T" : "N", &m, &n, &xd, &one,
					A , xDims,
					B , yDims,
					&zero, C, &m);
		}
	}
	UNPROTECT(nprot);
	return val;
#endif
	return R_NilValue;
}
示例#21
0
SEXP magma_dpoMatrix_solve(SEXP x)
{
#ifdef HIPLAR_WITH_MAGMA
    SEXP Chol = magma_dpoMatrix_chol(x);
    SEXP val = PROTECT(NEW_OBJECT(MAKE_CLASS("dpoMatrix")));
    int *dims = INTEGER(GET_SLOT(x, Matrix_DimSym)), info;

    SET_SLOT(val, Matrix_factorSym, allocVector(VECSXP, 0));
    slot_dup(val, Chol, Matrix_uploSym);
    slot_dup(val, Chol, Matrix_xSym);
    slot_dup(val, Chol, Matrix_DimSym);
    SET_SLOT(val, Matrix_DimNamesSym,
	     duplicate(GET_SLOT(x, Matrix_DimNamesSym)));
    double *A = REAL(GET_SLOT(val, Matrix_xSym));
		int N = *dims;	
		int lda = N;
		const char *uplo = uplo_P(val);
		
		if(GPUFlag == 0) {
			
			F77_CALL(dpotri)(uplo_P(val), dims, A, dims, &info);
		
		}
		else if(GPUFlag == 1 && Interface == 0) {
#ifdef HIPLAR_DBG
			R_ShowMessage("DBG: Solving using magma_dpotri");
#endif
			magma_dpotri(uplo[0], N, A, lda, &info);
		}
		else if(GPUFlag == 1 && Interface == 1){
			double *d_A;
			cublasStatus retStatus;
			cublasAlloc( N * lda , sizeof(double), (void**)&d_A);
#ifdef HIPLAR_DBG
			R_ShowMessage("DBG: Solving using magma_dpotri_gpu");
#endif		
			/* Error Checking */
			retStatus = cublasGetError ();
			if (retStatus != CUBLAS_STATUS_SUCCESS) 
				error(_("CUBLAS: Error in Memory Allocation"));
			/********************************************/

			cublasSetVector( N * lda, sizeof(double), A, 1, d_A, 1);

			/* Error Checking */
			retStatus = cublasGetError ();
			if (retStatus != CUBLAS_STATUS_SUCCESS) 
				error(_("CUBLAS: Error in Data Transfer to Device"));
			/********************************************/

			magma_dpotri_gpu(uplo[0], N, d_A, lda, &info);

			cublasGetVector(N * lda, sizeof(double), d_A, 1, val, 1);
			
			/* Error Checking */
			retStatus = cublasGetError ();
			if (retStatus != CUBLAS_STATUS_SUCCESS) 
				error(_("CUBLAS: Error in Data Transfer from Device"));
			/********************************************/
			
			cublasFree(d_A);
		}
		else
			error(_("MAGMA/LAPACK/Interface Flag not defined correctly"));
	
		if (info) {
			if(info > 0)
				error(_("the leading minor of order %d is not positive definite"),
						info);
			else /* should never happen! */
				error(_("Lapack routine %s returned error code %d"), "dpotrf", info);
		}
		
		UNPROTECT(1);
    return val;
#endif
	return R_NilValue;
}
示例#22
0
double Cholesky(Quark *quark, double *A, int N, int NB, int LDA, size_t memsize) 
{
    #define A(ib,jb) A[(size_t)(jb)*NB*LDA+(ib)*NB]

    #ifndef USE_MIC
    cublasStatus cu_status;
    #endif
         
    int bb = (N + NB - 1) / NB;
    int YM, YN;
    int Ym, Yn;
    int JB;
    int jb, jjb;
    int memBlock = memsize/sizeof(double)/NB/NB;
    double *X, *Y;

    #ifdef USE_MIC
        Y = (double*) offload_Alloc((size_t)memBlock*NB*NB*sizeof(double), 0);
        assert(Y != NULL);
    #else
        #ifdef USE_CUBLASV2
        {
            cudaError_t ierr;
            ierr = cudaMalloc((void **) &Y, (size_t) memBlock*NB*NB*sizeof(double));
            assert(ierr == cudaSuccess);
        }
        #else
            cu_status = cublasAlloc((size_t) memBlock*NB*NB, sizeof(double), (void **) &Y);
            CHKERR(cu_status);
        #endif
    #endif
    
    double t1;
    double llttime = MPI_Wtime();

    
    /*--------------------------------------*/   

    /*     The main Ypanel loop     */

//  QUARK_Barrier(quark);
    for (JB = 0, jb = 0; JB < N; JB+=YN, jb+=Yn)
    {
        //determine size of Ypanel
        Ym = bb - jb;
        Yn = find_Yn(bb, memBlock, jb);
        YM = N - JB;
        YN = MIN((jb+Yn)*NB, N) - jb*NB;
        X = Y + (size_t)(memBlock-Ym)*NB*NB;
        printf("bb %d jb %d YM %d YN %d Ym %d Yn %d Y %p X %p\n", bb, jb, YM, YN, Ym, Yn, Y, X);

        /* Copy in data */
        A2Y(quark, &A(jb,jb), Y, LDA, NB, YM, YN);

        /* Left-looking */
        for(jjb = 0; jjb < jb; jjb++){
            /* copy from A to X */
            A2X(quark, &A(jb,jjb), LDA, X, NB, YM);
            ooc_syrk(quark, X, Y, YM, YN, NB);
        }

        /* incore factorization */
        ooc_incore(quark, &A(jb,jb), Y, LDA, NB, YM, YN);
    
        /* Copy out data */
//      Y2A(quark, Y, &A(jb,jb), LDA, NB, YM, YN);
//      QUARK_Barrier(quark); // reduce parallelism
//      goto oasdfh; // early stop

    }
oasdfh:
    QUARK_Barrier(quark);
    llttime = MPI_Wtime() - llttime;
    printf("llt time %lf %lf\n", llttime, MPI_Wtime());
    printf("%lf %lf\n", A[(N-1)*LDA+N-1], MPI_Wtime());
    /*--------------------------------------*/   

    #ifdef USE_MIC
        offload_Free(Y,0);
    #else
        #ifdef USE_CUBLASV2
        {
            cudaError_t ierr;
            ierr = cudaFree((void *) Y);
            assert(ierr == cudaSuccess);
            Y = 0;
        }
        #else
            cu_status = cublasFree(Y);
            CHKERR(cu_status);
        #endif
    #endif
    return llttime;
    #undef A
} 
示例#23
0
int main(void)

{
    
    cublasStatus status;
    
    float* h_image;
    
    float* h_covariance;
    
    float* d_image;
    
    float* d_covariance;
    
    float alpha = 1.0f;
    
    float beta = 0.0f;
    
    int imgsize = N * L;
    
    //int i;
    
    FILE *fp1, *fp2;
    
    /* Initialize CUBLAS */
    
    status = cublasInit();
    
    if (status != CUBLAS_STATUS_SUCCESS) {
        
        fprintf (stderr, "!!!! CUBLAS initialization error\n");
        
        return EXIT_FAILURE;
        
    }
    
    /* Allocate host memory for the image */
    
    h_image = (float*)malloc(imgsize * sizeof(float));
    
    if (h_image == 0) {
        
        fprintf (stderr, "!!!! host memory allocation error (image)\n");
        
        return EXIT_FAILURE;
        
    }
    
    h_covariance = (float*)calloc(L * L, sizeof(float));
    
    if (h_covariance == 0) {
        
        fprintf (stderr, "!!!! host memory allocation error (covariance)\n");
        
        return EXIT_FAILURE;
        
    }
    
    /* Fill the image with test data
     
     for (i = 0; i < imgsize; i++) {
     
     h_image[i] = rand() / (float)RAND_MAX;
     
     }*/
    
    fp1 = fopen("image.dat","rb");
    
    fread(h_image, sizeof(float), imgsize, fp1);
    
    printf("Valor de image[0]: %f\n", h_image[8]);
    
    /* Allocate device memory */
    
    status = cublasAlloc(imgsize, sizeof(float), (void**)&d_image);
    
    if (status != CUBLAS_STATUS_SUCCESS) {
        
        fprintf (stderr, "!!!! device memory allocation error (image)\n");
        
        return EXIT_FAILURE;
        
    }
    
    status = cublasAlloc(L * L, sizeof(float), (void**)&d_covariance);
    
    if (status != CUBLAS_STATUS_SUCCESS) {
        
        fprintf (stderr, "!!!! device memory allocation error (covariance)\n");
        
        return EXIT_FAILURE;
        
    }
    
    /* Copy image to device memory */
    
    status = cublasSetVector(imgsize, sizeof(float), h_image, 1, d_image, 1);
    
    if (status != CUBLAS_STATUS_SUCCESS) {
        
        fprintf (stderr, "!!!! device access error (write A)\n");
        
        return EXIT_FAILURE;
        
    }
    
    status = cublasSetVector(L * L, sizeof(float), h_covariance, 1, d_covariance,
                             
                             1);
    
    if (status != CUBLAS_STATUS_SUCCESS) {
        
        fprintf (stderr, "!!!! device access error (write covariance)\n");
        
        return EXIT_FAILURE;
        
    }
    
    /* Clear last error */
    
    cublasGetError();
    
    /* Calculate covariance matrix using cublas */
    
    cublasSgemm('n', 't', L, L, N, alpha, d_image, L, d_image, L, beta,
                
                d_covariance, L);
    
    status = cublasGetError();
    
    if (status != CUBLAS_STATUS_SUCCESS) {
        
        fprintf (stderr, "!!!! kernel execution error.\n");
        
        return EXIT_FAILURE;
        
    }
    
    /* Read the result back */
    
    status = cublasGetVector(L * L, sizeof(float), d_covariance, 1, h_covariance,
                             
                             1);
    
    if (status != CUBLAS_STATUS_SUCCESS) {
        
        fprintf (stderr, "!!!! device access error (read covariance)\n");
        
        return EXIT_FAILURE;
        
    }
    
    fp2 = fopen("covariance.dat","wb");
    
    fwrite(h_covariance, sizeof(float), L*L, fp2);
    
    printf("Valor de covariance[8]: %f\n", h_covariance[8]);
    
    /* Memory clean up */
    
    free(h_image);
    
    free(h_covariance);
    
    status = cublasFree(d_image);
    
    if (status != CUBLAS_STATUS_SUCCESS) {
        
        fprintf (stderr, "!!!! memory free error (image)\n");
        
        return EXIT_FAILURE;
        
    }
    
    status = cublasFree(d_covariance);
    
    if (status != CUBLAS_STATUS_SUCCESS) {
        
        fprintf (stderr, "!!!! memory free error (covariance)\n");
        
        return EXIT_FAILURE;
        
    }
    
    /* Shutdown */
    
    status = cublasShutdown();
    
    if (status != CUBLAS_STATUS_SUCCESS) {
        
        fprintf (stderr, "!!!! shutdown error (A)\n");
        
        return EXIT_FAILURE;
        
    }
    
    fclose(fp1);
    
    fclose(fp2);
    
    printf("\nPress ENTER to exit...\n");
    
    getchar();
    
    return EXIT_SUCCESS;
    
}