void CudaUtil::cublasCheckGetVector(int n, int elemSize, void* devicePtr, int incx, void* hostPtr, int incy, int line, const char* file)
{
	cublasStatus_t status = cublasGetVector(n, elemSize, devicePtr, incx, hostPtr, incy);
	if (status != CUBLAS_STATUS_SUCCESS) {
		std::ostringstream os;
		os << "CUBALS device read error, line " << line << ", in file " << file;
		throw CudaException(os.str());
	}
}
Esempio n. 2
0
void magma_getvector(
    magma_int_t n, size_t elemSize,
    void const* dx_src, magma_int_t incx,
    void*       hy_dst, magma_int_t incy )
{
    cublasStatus_t status;
    status = cublasGetVector(
        n, elemSize,
        dx_src, incx,
        hy_dst, incy );
    check_error( status );
}
Esempio n. 3
0
void magma_sgetvector_internal(
    magma_int_t n,
    float const* dx_src, magma_int_t incx,
    float*       hy_dst, magma_int_t incy,
    const char* func, const char* file, int line )
{
    cublasStatus_t status;
    status = cublasGetVector(
        n, sizeof(float),
        dx_src, incx,
        hy_dst, incy );
    check_xerror( status, func, file, line );
}
Esempio n. 4
0
File: ardblas.c Progetto: rforge/gcb
SEXP d_getVector(SEXP vList, SEXP inc)
{
	int n, increment = asInteger(inc);
	double * dPtr;
	unpackVector(vList, &n, &dPtr);

	SEXP out;
	PROTECT(out = allocVector(REALSXP, n));
	cublasGetVector(n, sizeof(double), dPtr, increment, REAL(out), 1);
	checkCublasError("d_getVector");
	UNPROTECT(1);
	return out;
}
void solve_system_on_gpu(gpu_symm_band_matrix gpu_matrix, double * b, cublasHandle_t handle)
{
	double * d_b;
	checkCudaErrors(cudaMalloc(&d_b, gpu_matrix.order*sizeof(double)));
	checkCublasErrors(cublasSetVector(gpu_matrix.order, sizeof(double), b, 1, d_b, 1));
	
	solve_lower_system_on_gpu(gpu_matrix, d_b, handle);
	solve_upper_system_on_gpu(gpu_matrix, d_b, handle);

	checkCublasErrors(cublasGetVector(gpu_matrix.order, sizeof(double), d_b, 1, b, 1));

	checkCudaErrors(cudaFree(d_b));
}
Esempio n. 6
0
void magma_getvector_internal(
    magma_int_t n, magma_int_t elemSize,
    void const* dx_src, magma_int_t incx,
    void*       hy_dst, magma_int_t incy,
    const char* func, const char* file, int line )
{
    cublasStatus_t status;
    status = cublasGetVector(
        n, elemSize,
        dx_src, incx,
        hy_dst, incy );
    check_xerror( status, func, file, line );
}
Esempio n. 7
0
void cuda_scal(const cublasHandle_t blasHandle, const int n, const T alpha, T x[], int incX, SCAL scal)
{
	T *d_X = NULL;
	cudaMalloc((void**)&d_X, n*sizeof(T));

	cublasSetVector(n, sizeof(T), x, incX, d_X, incX);

	scal(blasHandle, n, &alpha, d_X, incX);

	cublasGetVector(n, sizeof(T), d_X, incX, x, incX);

	cudaFree(d_X);
}
Esempio n. 8
0
// --------------------
extern "C" void
magma_zgetvector_internal(
    magma_int_t n,
    magmaDoubleComplex_const_ptr dx_src, magma_int_t incx,
    magmaDoubleComplex*       hy_dst, magma_int_t incy,
    const char* func, const char* file, int line )
{
    cublasStatus_t status;
    status = cublasGetVector(
        n, sizeof(magmaDoubleComplex),
        dx_src, incx,
        hy_dst, incy );
    check_xerror( status, func, file, line );
}
Esempio n. 9
0
int double_copyGPU2Host_Transpose(PGM_Matriz_Double *host, double *device, int device_col, PGM_Matriz_Double *work){
	int i,j;
	double *ptr;

	for( i = 0; i < host->n_linhas; i++ ){

		ptr = host->valor+(i*host->n_colunas);

		if(cublasGetVector(host->n_colunas,sizeof(double), device+i, device_col ,work->valor, 1) != CUBLAS_STATUS_SUCCESS){
			printf("Error: nao foi possivel executar a copia!\n");
			return -1;
		}
		for( j = 0; j < host->n_colunas; j++)
			ptr[j] =  work->valor[j];
	}
	return CUBLAS_STATUS_SUCCESS;
}
Esempio n. 10
0
void cuda_axpy(const cublasHandle_t blasHandle, const int n, const T alpha, const T x[], int incX, T y[], int incY, AXPY axpy)
{
	T *d_X = NULL;
	T *d_Y = NULL;
	cudaMalloc((void**)&d_X, n*sizeof(T));
	cudaMalloc((void**)&d_Y, n*sizeof(T));

	cublasSetVector(n, sizeof(T), x, incX, d_X, incX);
	cublasSetVector(n, sizeof(T), y, incY, d_Y, incY);

	axpy(blasHandle, n, &alpha, d_X, incX, d_Y, incX);

	cublasGetVector(n, sizeof(T), d_Y, incY, y, incY);

	cudaFree(d_X);
	cudaFree(d_Y);
}
Esempio n. 11
0
SEXP magma_dgeMatrix_matrix_crossprod(SEXP x, SEXP y, SEXP trans)
{
#ifdef HIPLAR_WITH_MAGMA
	int tr = asLogical(trans);/* trans=TRUE: tcrossprod(x,y) */
	SEXP val = PROTECT(NEW_OBJECT(MAKE_CLASS("dgeMatrix")));
	int *xDims = INTEGER(GET_SLOT(x, Matrix_DimSym)),
	    *yDims = INTEGER(getAttrib(y, R_DimSymbol)),
	    *vDims, nprot = 1;
	int m  = xDims[!tr],  n = yDims[!tr];/* -> result dim */
	int xd = xDims[ tr], yd = yDims[ tr];/* the conformable dims */
	double one = 1.0, zero = 0.0;

	if (isInteger(y)) {
		y = PROTECT(coerceVector(y, REALSXP));
		nprot++;
	}
	if (!(isMatrix(y) && isReal(y)))
		error(_("Argument y must be a numeric matrix"));
	SET_SLOT(val, Matrix_factorSym, allocVector(VECSXP, 0));
	SET_SLOT(val, Matrix_DimSym, allocVector(INTSXP, 2));
	vDims = INTEGER(GET_SLOT(val, Matrix_DimSym));
	if (xd > 0 && yd > 0 && n > 0 && m > 0) {
		if (xd != yd)
			error(_("Dimensions of x and y are not compatible for %s"),
					tr ? "tcrossprod" : "crossprod");
		vDims[0] = m; vDims[1] = n;
		SET_SLOT(val, Matrix_xSym, allocVector(REALSXP, m * n));

		double *A = REAL(GET_SLOT(x, Matrix_xSym));
		double *B = REAL(y);
		double *C = REAL(GET_SLOT(val, Matrix_xSym));

		if(GPUFlag == 1) {
			
			double *d_A, *d_B, *d_C;
			cublasStatus retStatus;

#ifdef HIPLAR_DBG
			R_ShowMessage("DBG: Performing dge/matrix crossprod using magmablas_dgemm");
#endif
			cublasAlloc(m * xd, sizeof(double), (void**)&d_A);

			/* Error Checking */
			retStatus = cublasGetError ();
			if (retStatus != CUBLAS_STATUS_SUCCESS) 
				error(_("CUBLAS: Error in Memory Allocation"));
			/********************************************/

			cublasAlloc(n * xd, sizeof(double), (void**)&d_B);	

			/* Error Checking */
			retStatus = cublasGetError ();
			if (retStatus != CUBLAS_STATUS_SUCCESS) 
				error(_("CUBLAS: Error in Memory Allocation"));
			/********************************************/

			cublasAlloc(m * n, sizeof(double), (void**)&d_C);	

			/* Error Checking */
			retStatus = cublasGetError ();
			if (retStatus != CUBLAS_STATUS_SUCCESS) 
				error(_("CUBLAS: Error in Memory Allocation"));
			/********************************************/

			cublasSetVector( m  * xd , sizeof(double), A, 1, d_A, 1);

			/* Error Checking */
			retStatus = cublasGetError ();
			if (retStatus != CUBLAS_STATUS_SUCCESS) 
				error(_("CUBLAS: Error in Data Transfer to Device"));
			/********************************************/

			cublasSetVector( xd * n, sizeof(double), B, 1, d_B, 1 );

			/* Error Checking */
			retStatus = cublasGetError ();
			if (retStatus != CUBLAS_STATUS_SUCCESS) 
				error(_("CUBLAS: Error in Data Transfer to Device"));
			/********************************************/

			cublasSetVector( m * n, sizeof(double), C, 1, d_C, 1 );

			/* Error Checking */
			retStatus = cublasGetError ();
			if (retStatus != CUBLAS_STATUS_SUCCESS) 
				error(_("CUBLAS: Error in Data Transfer to Device"));
			/********************************************/


			// ******** magmablas_dgemm call Here **
			//magmablas_dgemm( tr ? 'N' : 'T', tr ? 'T' : 'N', m, n, xd, one, d_A, xDims[0], d_B, yDims[0], zero, d_C,  m);
			//CHANGE
			cublasDgemm( tr ? 'N' : 'T', tr ? 'T' : 'N', m, n, xd, one, d_A, xDims[0], d_B, yDims[0], zero, d_C,  m);
			cublasGetVector( m * n , sizeof(double), d_C, 1, C, 1);

			/* Error Checking */
			retStatus = cublasGetError ();
			if (retStatus != CUBLAS_STATUS_SUCCESS) 
				error(_("CUBLAS: Error in Data Transfer from Device"));
			/********************************************/

			cublasFree(d_A);
			cublasFree(d_B);
			cublasFree(d_C);
			
		}
		else {

#ifdef HIPLAR_DBG
			R_ShowMessage("DBG: Performing dge/matrix cross prod with dgemm");
#endif
			F77_CALL(dgemm)(tr ? "N" : "T", tr ? "T" : "N", &m, &n, &xd, &one,
					A , xDims,
					B , yDims,
					&zero, C, &m);
		}
	}
	UNPROTECT(nprot);
	return val;
#endif
	return R_NilValue;
}
Esempio n. 12
0
SEXP magma_dgeMatrix_crossprod(SEXP x, SEXP trans)
{
#ifdef HIPLAR_WITH_MAGMA
	int tr = asLogical(trans);/* trans=TRUE: tcrossprod(x) */
	SEXP val = PROTECT(NEW_OBJECT(MAKE_CLASS("dpoMatrix"))),
	     nms = VECTOR_ELT(GET_SLOT(x, Matrix_DimNamesSym), tr ? 0 : 1),
	     vDnms = ALLOC_SLOT(val, Matrix_DimNamesSym, VECSXP, 2);
	int *Dims = INTEGER(GET_SLOT(x, Matrix_DimSym)),
	    *vDims = INTEGER(ALLOC_SLOT(val, Matrix_DimSym, INTSXP, 2));
	int k = tr ? Dims[1] : Dims[0], n = tr ? Dims[0] : Dims[1];
	double *vx = REAL(ALLOC_SLOT(val, Matrix_xSym, REALSXP, n * n)),
		 one = 1.0, zero = 0.0;
	double *A =  REAL(GET_SLOT(x, Matrix_xSym));
	AZERO(vx, n * n);
	SET_SLOT(val, Matrix_uploSym, mkString("U"));
	ALLOC_SLOT(val, Matrix_factorSym, VECSXP, 0);
	vDims[0] = vDims[1] = n;
	SET_VECTOR_ELT(vDnms, 0, duplicate(nms));
	SET_VECTOR_ELT(vDnms, 1, duplicate(nms));
	if(n && GPUFlag == 1) {

#ifdef HIPLAR_DBG
		R_ShowMessage("DBG: Performing crossproduct using cublasDsyrk");
#endif
		cublasStatus retStatus;
		double *d_A, *d_C;

		/*retStatus = cublasCreate(&handle);
		  if ( retStatus != CUBLAS_STATUS_SUCCESS )		
		  error(_("CUBLAS initialisation failed"));
		  */

		cublasAlloc(n * k, sizeof(double), (void**)&d_A);
		/* Error Checking */
		retStatus = cublasGetError ();
		if (retStatus != CUBLAS_STATUS_SUCCESS) 
			error(_("CUBLAS: Error in Memory Allocation"));
		/********************************************/

		cublasAlloc(n * n, sizeof(double), (void**)&d_C);

		/* Error Checking */
		retStatus = cublasGetError ();
		if (retStatus != CUBLAS_STATUS_SUCCESS) 
			error(_("CUBLAS: Error in Memory Allocation"));
		/********************************************/

		cublasSetVector( n  * k , sizeof(double), A, 1, d_A, 1);

		/* Error Checking */
		retStatus = cublasGetError ();
		if (retStatus != CUBLAS_STATUS_SUCCESS) 
			error(_("CUBLAS: Error in Data Transfer to Device"));
		/********************************************/
		
		//cublasSetVector( n  * n , sizeof(double), vx, 1, d_C, 1);
		
		/* Error Checking */
		//retStatus = cublasGetError ();
		//if (retStatus != CUBLAS_STATUS_SUCCESS) 
		//	error(_("CUBLAS: Error in Data Transfer to Device"));
		/********************************************/


		cublasDsyrk('U' , tr ? 'N' : 'T', n, k, one, d_A, Dims[0], zero, d_C, n);

		cublasGetVector( n * n , sizeof(double), d_C, 1, vx, 1);

		/* Error Checking */
		retStatus = cublasGetError ();
		if (retStatus != CUBLAS_STATUS_SUCCESS) 
			error(_("CUBLAS: Error in Data Transfer from Device"));
		/********************************************/

		cublasFree(d_A);
		cublasFree(d_C);

	} else if(n){
	
#ifdef HIPLAR_DBG
		R_ShowMessage("DBG: Performing cross prod with dsyrk");
#endif
		F77_CALL(dsyrk)("U", tr ? "N" : "T", &n, &k, &one, A, Dims,
				&zero, vx, &n);
	}

	SET_SLOT(val, Matrix_factorSym, allocVector(VECSXP, 0));
	UNPROTECT(1);
	return val;
#endif
	return R_NilValue;
}
Esempio n. 13
0
SEXP magma_dgeMatrix_matrix_mm(SEXP a, SEXP bP, SEXP right)
{
#ifdef HIPLAR_WITH_MAGMA
	SEXP b = PROTECT(mMatrix_as_dgeMatrix(bP)),
	     val = PROTECT(NEW_OBJECT(MAKE_CLASS("dgeMatrix")));
	int *adims = INTEGER(GET_SLOT(a, Matrix_DimSym)),
	    *bdims = INTEGER(GET_SLOT(b, Matrix_DimSym)),
	    *cdims = INTEGER(ALLOC_SLOT(val, Matrix_DimSym, INTSXP, 2));
	double one = 1.0, zero = 0.0;

	if (asLogical(right)) {
		int m = bdims[0], n = adims[1], k = bdims[1];
		if (adims[0] != k)
			error(_("Matrices are not conformable for multiplication"));
		cdims[0] = m; cdims[1] = n;
		if (m < 1 || n < 1 || k < 1) {
			// 		This was commented out
				    error(_("Matrices with zero extents cannot be multiplied")); 
			ALLOC_SLOT(val, Matrix_xSym, REALSXP, m * n);
		} else {
			double *B = REAL(GET_SLOT(b, Matrix_xSym));
			double *A = REAL(GET_SLOT(a, Matrix_xSym));
			double *C = REAL(ALLOC_SLOT(val, Matrix_xSym, REALSXP, m * n));
			//TODO add magma here too
			if(GPUFlag == 1) {
				double *d_A, *d_B, *d_C;
				cublasStatus retStatus;


#ifdef HIPLAR_DBG
				R_ShowMessage("DBG: Performing matrix multiplication with Right = true using magmablas_dgemm");
#endif
				cublasAlloc(n * k, sizeof(double), (void**)&d_A);
			
				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Memory Allocation"));
				/********************************************/

				cublasAlloc(m * k, sizeof(double), (void**)&d_B);	

				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Memory Allocation"));
				/********************************************/

				cublasAlloc(m * n, sizeof(double), (void**)&d_C);	

				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Memory Allocation"));
				/********************************************/

				cublasSetVector( n  * k , sizeof(double), A, 1, d_A, 1);

				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Data Transfer to Device"));
				/********************************************/

				cublasSetVector( m * k, sizeof(double), B, 1, d_B, 1 );

				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Data Transfer to Device"));
				/********************************************/

				// ******** magmablas_dgemm call Here **
				//magmablas_dgemm('N', 'N', m, n, k, one, d_B, m, d_A, k, zero, d_C,  m);
				//CHANGED 30/07
				cublasDgemm('N', 'N', m, n, k, one, d_B, m, d_A, k, zero, d_C, m);
				
				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) {
					error(_("CUBLAS: Error in cublasDgemm routine"));
				}
				/********************************************/

				cublasGetVector( m * n , sizeof(double), d_C, 1, C, 1);

				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Data Transfer from Device"));
				/********************************************/

				cublasFree(d_A);
				cublasFree(d_B);
				cublasFree(d_C);
			}
			else {
	
#ifdef HIPLAR_DBG
				R_ShowMessage("DBG: Performing matrix multiplication using dgemm with right = TRUE");
#endif
				F77_CALL(dgemm) ("N", "N", &m, &n, &k, &one,
						B, &m, A , &k, &zero, C , &m);
			}
		}
	} else {
		int m = adims[0], n = bdims[1], k = adims[1];
		double *A = REAL(GET_SLOT(a, Matrix_xSym));
		double *B = REAL(GET_SLOT(b, Matrix_xSym));


		if (bdims[0] != k)
			error(_("Matrices are not conformable for multiplication"));
		cdims[0] = m; cdims[1] = n;
		double *C = REAL(ALLOC_SLOT(val, Matrix_xSym, REALSXP, m * n));
		
		if (m < 1 || n < 1 || k < 1) {
			//		This was commented out
			error(_("Matrices with zero extents cannot be multiplied")); 
			ALLOC_SLOT(val, Matrix_xSym, REALSXP, m * n);
		} else {
			if(GPUFlag == 1) {

				double *d_A, *d_B, *d_C;
				cublasStatus retStatus;

	
#ifdef HIPLAR_DBG
				R_ShowMessage("DBG: Performing matrix multiplication using magmablas_dgemm");
#endif			
				cublasAlloc(m * k, sizeof(double), (void**)&d_A);
			
				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Memory Allocation"));
				/********************************************/

				cublasAlloc(n * k, sizeof(double), (void**)&d_B);	

				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Memory Allocation"));
				/********************************************/

				cublasAlloc(m * n, sizeof(double), (void**)&d_C);	

				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Memory Allocation"));
				/********************************************/

				cublasSetVector( m  * k , sizeof(double), A, 1, d_A, 1);

				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Data Transfer to Device"));
				/********************************************/

				cublasSetVector( n * k, sizeof(double), B, 1, d_B, 1 );

				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Data Transfer to Device"));
				/********************************************/

				// ******** magmablas_dgemm call Here **
				//magmablas_dgemm('N', 'N', m, n, k, one, d_A, m, d_B, k, zero, d_C,  m);
				//CHANGE
				cublasDgemm('N', 'N', m, n, k, one, d_A, m, d_B, k, zero, d_C, m);
				
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) {
					error(_("CUBLAS: Error in Data Transfer from Device"));
				/********************************************/
				}
				
				cublasGetVector( m * n , sizeof(double), d_C, 1, C, 1);

				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Data Transfer from Device"));
				/********************************************/

				cublasFree(d_A);
				cublasFree(d_B);
				cublasFree(d_C);
				
			}
			else {
	
#ifdef HIPLAR_DBG
				R_ShowMessage("DBG: Performing matrix multiplication using dgemm");
#endif
				F77_CALL(dgemm) ("N", "N", &m, &n, &k, &one,
						A, &m,
						B, &k, &zero,
						C,
						&m);	

			}
		}
	}
	ALLOC_SLOT(val, Matrix_DimNamesSym, VECSXP, 2);
	UNPROTECT(2);
	return val;
#endif
	return R_NilValue;
}
Esempio n. 14
0
SEXP magma_dgeMatrix_matrix_solve(SEXP a, SEXP b)
{
#ifdef HIPLAR_WITH_MAGMA
	SEXP val = PROTECT(dup_mMatrix_as_dgeMatrix(b)),
			 lu = PROTECT(magma_dgeMatrix_LU_(a, TRUE));
	int *adims = INTEGER(GET_SLOT(lu, Matrix_DimSym)),
			*bdims = INTEGER(GET_SLOT(val, Matrix_DimSym));
	int info, n = bdims[0], nrhs = bdims[1];



	if (*adims != *bdims || bdims[1] < 1 || *adims < 1 || *adims != adims[1])
		error(_("Dimensions of system to be solved are inconsistent"));

	double *A = REAL(GET_SLOT(lu, Matrix_xSym));
	double *B  = REAL(GET_SLOT(val, Matrix_xSym));
	int *ipiv = INTEGER(GET_SLOT(lu, Matrix_permSym));

	if(GPUFlag == 0) {
		F77_CALL(dgetrs)("N", &n, &nrhs, A, &n, ipiv, B, &n, &info);	
	
#ifdef HIPLAR_DBG
		R_ShowMessage("DBG: Solve using LU using dgetrs;");
#endif
	}else if(GPUFlag == 1) {
		
	
#ifdef HIPLAR_DBG
		R_ShowMessage("DBG: Solve using LU using magma_dgetrs;");
#endif		
		double *d_A, *d_B;
		cublasStatus retStatus;

		cublasAlloc(adims[0] * adims[1], sizeof(double), (void**)&d_A);

		/* Error Checking */
		retStatus = cublasGetError ();
		if (retStatus != CUBLAS_STATUS_SUCCESS) 
			error(_("CUBLAS: Error in Memory Allocation of A on Device"));
		/********************************************/


		cublasAlloc(n * nrhs, sizeof(double), (void**)&d_B);

		/* Error Checking */
		retStatus = cublasGetError ();
		if (retStatus != CUBLAS_STATUS_SUCCESS) 
			error(_("CUBLAS: Error in Memory Allocation of b on Device"));
		/********************************************/



		cublasSetVector(adims[0] * adims[1], sizeof(double), A, 1, d_A, 1);

		/* Error Checking */
		retStatus = cublasGetError ();
		if (retStatus != CUBLAS_STATUS_SUCCESS) 
			error(_("CUBLAS: Error in Transferring data to advice"));
		/********************************************/

		cublasSetVector(n * nrhs, sizeof(double), B, 1, d_B, 1);

		magma_dgetrs_gpu( 'N', n, nrhs, d_A, n, ipiv, d_B, n, &info );

		cublasGetVector(n * nrhs, sizeof(double), d_B, 1, B, 1);

		/* Error Checking */
		retStatus = cublasGetError ();
		if (retStatus != CUBLAS_STATUS_SUCCESS) 
			error(_("CUBLAS: Error in Transferring from to advice"));
		/********************************************/

		cublasFree(d_A);
		cublasFree(d_B);

		/* Error Checking */
		retStatus = cublasGetError ();
		if (retStatus != CUBLAS_STATUS_SUCCESS) 
			error(_("CUBLAS: Error in freeing data"));
		/********************************************/

		
	}
	if (info)
		error(_("Lapack routine dgetrs: system is exactly singular"));
	UNPROTECT(2);
	return val;
#endif
	    return R_NilValue;
}
Esempio n. 15
0
SEXP magma_dgeMatrix_solve(SEXP a)
{
#ifdef HIPLAR_WITH_MAGMA
    /*  compute the 1-norm of the matrix, which is needed
	later for the computation of the reciprocal condition number. */
    double aNorm = magma_get_norm(a, "1");

    /* the LU decomposition : */
		/* Given that we may be performing this operation
		 * on the GPU we may put in an optimisation here
		 * where if we call the LU solver we, we do not require
		 * the decomposition to be transferred back to CPU. This is TODO
		 */
    SEXP val = PROTECT(NEW_OBJECT(MAKE_CLASS("dgeMatrix"))),
	lu = magma_dgeMatrix_LU_(a, TRUE);
    int *dims = INTEGER(GET_SLOT(lu, Matrix_DimSym)),
	*pivot = INTEGER(GET_SLOT(lu, Matrix_permSym));

    /* prepare variables for the dgetri calls */
    double *x, tmp;
    int	info, lwork = -1;

    if (dims[0] != dims[1]) error(_("Solve requires a square matrix"));
    slot_dup(val, lu, Matrix_xSym);
    x = REAL(GET_SLOT(val, Matrix_xSym));
    slot_dup(val, lu, Matrix_DimSym);
		int N2 = dims[0] * dims[0];

    if(dims[0]) /* the dimension is not zero */
    {
			/* is the matrix is *computationally* singular ? */
			double rcond;
			F77_CALL(dgecon)("1", dims, x, dims, &aNorm, &rcond,
					(double *) R_alloc(4*dims[0], sizeof(double)),
					(int *) R_alloc(dims[0], sizeof(int)), &info);
			if (info)
				error(_("error [%d] from Lapack 'dgecon()'"), info);
			if(rcond < DOUBLE_EPS)
				error(_("Lapack dgecon(): system computationally singular, reciprocal condition number = %g"),
						rcond);

			/* only now try the inversion and check if the matrix is *exactly* singular: */
			// This is also a work space query. This is not an option in magma

			F77_CALL(dgetri)(dims, x, dims, pivot, &tmp, &lwork, &info);
			lwork = (int) tmp;
			
			if( GPUFlag == 0){
				

				F77_CALL(dgetri)(dims, x, dims, pivot,
						(double *) R_alloc((size_t) lwork, sizeof(double)),
						&lwork, &info);

#ifdef HIPLAR_DBG
				R_ShowMessage("DBG: Solve using LU using dgetri;");
#endif
			}
			else if(GPUFlag == 1) {
				
				double *d_x, *dwork; 
				cublasStatus retStatus;			
	
#ifdef HIPLAR_DBG
				R_ShowMessage("Solve using LU using magma_dgetri;");
#endif
				cublasAlloc(N2, sizeof(double), (void**)&d_x);

				//cublasAlloc(N2 , sizeof(double), (void**)&dtmp);
				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Memory Allocation on Device"));
				/********************************************/

				cublasSetVector( N2, sizeof(double), x, 1, d_x, 1);

				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Data Transfer to Device"));
				/********************************************/
				lwork = dims[0] * magma_get_dgetri_nb( dims[0] );
				
					cublasAlloc(lwork, sizeof(double), (void**)&dwork);

				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Memory Allocation on Device"));
				/********************************************/

				magma_dgetri_gpu(dims[0], d_x, dims[0], pivot, dwork , lwork, &info);

				cublasGetVector(N2, sizeof(double), d_x, 1, x, 1);

				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Data From to Device"));
				/********************************************/

				cublasFree(dwork);
				cublasFree(d_x);

				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error freeing memory"));
				/********************************************/
					
			}
			else
				error(_("GPUFlag not set correctly"));

			if (info)
				error(_("Lapack routine dgetri: system is exactly singular"));
    }
    UNPROTECT(1);
    return val;
#endif
	    return R_NilValue;
}
Esempio n. 16
0
SEXP magma_dpoMatrix_chol(SEXP x)
{
#ifdef HIPLAR_WITH_MAGMA
	SEXP val = get_factors(x, "Cholesky"),
			 dimP = GET_SLOT(x, Matrix_DimSym),
			 uploP = GET_SLOT(x, Matrix_uploSym);

	const char *uplo = CHAR(STRING_ELT(uploP, 0));
	int *dims = INTEGER(dimP), info;
	int n = dims[0];
	double *vx;
	cublasStatus retStatus;
	if (val != R_NilValue) return val;
	dims = INTEGER(dimP);
	val = PROTECT(NEW_OBJECT(MAKE_CLASS("Cholesky")));
	SET_SLOT(val, Matrix_uploSym, duplicate(uploP));
	SET_SLOT(val, Matrix_diagSym, mkString("N"));
	SET_SLOT(val, Matrix_DimSym, duplicate(dimP));
	vx = REAL(ALLOC_SLOT(val, Matrix_xSym, REALSXP, n * n));
	AZERO(vx, n * n);
	
	//we could put in magmablas_dlacpy but it only
	//copies all of the matrix 
	F77_CALL(dlacpy)(uplo, &n, &n, REAL(GET_SLOT(x, Matrix_xSym)), &n, vx, &n);
	if (n > 0) {

		if(GPUFlag == 0){
#ifdef HIPLAR_DBG	
		R_ShowMessage("DBG: Cholesky decomposition using dpotrf;");
#endif
			F77_CALL(dpotrf)(uplo, &n, vx, &n, &info);
		}
		else if(GPUFlag == 1 && Interface == 0){
		
#ifdef HIPLAR_DBG	
			R_ShowMessage("DBG: Cholesky decomposition using magma_dpotrf;");
#endif			
			int nrows, ncols;
			nrows = ncols = n;

			magma_int_t lda;
			lda = nrows;

			magma_dpotrf(uplo[0], ncols, vx, lda, &info);

			/* Error Checking */
			retStatus = cudaGetLastError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in magma_dpotrf"));
			/********************************************/
			

		}
		else if(GPUFlag == 1 && Interface == 1) {
	
#ifdef HIPLAR_DBG	
			R_ShowMessage("DBG: Cholesky decomposition using magma_dpotrf_gpu;");	
#endif
			double *d_c;
			int nrows, ncols;
			nrows = ncols = n;
			int N2 = nrows * ncols;


			magma_int_t lda;
			lda = nrows;

			cublasAlloc(lda * ncols, sizeof(double), (void**)&d_c);
			
			/* Error Checking */
			retStatus = cublasGetError ();
			if (retStatus != CUBLAS_STATUS_SUCCESS) 
				error(_("CUBLAS: Error in Memory Allocation"));
			/********************************************/

			cublasSetVector(N2, sizeof(double), vx, 1, d_c, 1);
			
			/* Error Checking */
			retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Date Transfer to Device"));
			/********************************************/


			magma_dpotrf_gpu(uplo[0], ncols, d_c, lda, &info);
			
			/* Error Checking */
			retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in magma_dpotrf_gpu"));
			/********************************************/
			

			cublasGetVector(nrows * ncols, sizeof(double), d_c, 1, vx, 1);		
			
			/* Error Checking */
			retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Date Transfer from Device"));
			/********************************************/
			
			cublasFree(d_c);
		}
		else
			error(_("MAGMA/LAPACK/Interface Flag not defined correctly"));
		}
		
	if (info) {
			if(info > 0)
				error(_("the leading minor of order %d is not positive definite"),
						info);
			else /* should never happen! */
				error(_("Lapack routine %s returned error code %d"), "dpotrf", info);
		}

	UNPROTECT(1);
	return set_factors(x, val, "Cholesky");
#endif
	return R_NilValue;
}
/* Main */
void mexFunction( int nlhs, mxArray *plhs[],
                  int nrhs, const mxArray *prhs[]) {

    if (nrhs != 7) {
        mexErrMsgTxt("sgemm requires 7 input arguments");
    } else if (nlhs != 1) {
        mexErrMsgTxt("sgemm requires 1 output argument");
    }

    if ( !mxIsSingle(prhs[4]) ||
            !mxIsSingle(prhs[5]) ||
            !mxIsSingle(prhs[6]))   {
        mexErrMsgTxt("Input arrays must be single precision.");
    }

    int ta = (int) mxGetScalar(prhs[0]);
    int tb = (int) mxGetScalar(prhs[1]);
    float alpha = (float) mxGetScalar(prhs[2]);
    float beta = (float) mxGetScalar(prhs[3]);
    float *h_A = (float*) mxGetData(prhs[4]);
    float *h_B = (float*) mxGetData(prhs[5]);
    float *h_C = (float*) mxGetData(prhs[6]);

    int M = mxGetM(prhs[4]);   /* gets number of rows of A */
    int K = mxGetN(prhs[4]);   /* gets number of columns of A */
    int L = mxGetM(prhs[5]);   /* gets number of rows of B */
    int N = mxGetN(prhs[5]);   /* gets number of columns of B */

    cublasOperation_t transa, transb;
    int MM, KK, NN;
    if (ta == 0) {
        transa = CUBLAS_OP_N;
        MM=M;
        KK=K;
    } else {
        transa = CUBLAS_OP_T;
        MM=K;
        KK=M;
    }

    if (tb == 0) {
        transb = CUBLAS_OP_N;
        NN=N;
    } else {
        transb = CUBLAS_OP_T;
        NN=L;
    }

    /*	printf("transa=%c\n",transa);
    	printf("transb=%c\n",transb);
    	printf("alpha=%f\n",alpha);
    	printf("beta=%f\n",beta);	*/

    /* Left hand side matrix set up */
    mwSize dims0[2];
    dims0[0]=MM;
    dims0[1]=NN;
    plhs[0] = mxCreateNumericArray(2,dims0,mxSINGLE_CLASS,mxREAL);
    float *h_C_out = (float*) mxGetData(plhs[0]);

    cublasStatus_t status;
    cublasHandle_t handle;
    status = cublasCreate(&handle);
    if (status != CUBLAS_STATUS_SUCCESS) {
        mexErrMsgTxt("!!!! CUBLAS initialization error\n");
    }

    float* d_A = 0;
    float* d_B = 0;
    float* d_C = 0;

    /* Allocate device memory for the matrices */
    if (cudaMalloc((void**)&d_A, M * K * sizeof(d_A[0])) != cudaSuccess) {
        mexErrMsgTxt("!!!! device memory allocation error (allocate A)\n");
    }
    if (cudaMalloc((void**)&d_B, L * N * sizeof(d_B[0])) != cudaSuccess) {
        mexErrMsgTxt("!!!! device memory allocation error (allocate B)\n");

    }
    if (cudaMalloc((void**)&d_C, MM * NN * sizeof(d_C[0])) != cudaSuccess) {
        mexErrMsgTxt("!!!! device memory allocation error (allocate C)\n");
    }

    /* Initialize the device matrices with the host matrices */
    status = cublasSetVector(M * K, sizeof(h_A[0]), h_A, 1, d_A, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        mexErrMsgTxt("!!!! device access error (write A)\n");

    }
    status = cublasSetVector(L * N, sizeof(h_B[0]), h_B, 1, d_B, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        mexErrMsgTxt("!!!! device access error (write B)\n");
    }
    status = cublasSetVector(MM * NN, sizeof(h_C[0]), h_C, 1, d_C, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        mexErrMsgTxt("!!!! device access error (write C)\n");
    }

    /* Performs operation using cublas */
    status = cublasSgemm(handle, transa, transb, MM, NN, KK, &alpha, d_A, M, d_B, L, &beta, d_C, MM);
    if (status != CUBLAS_STATUS_SUCCESS) {
        mexErrMsgTxt("!!!! kernel execution error.\n");
    }

    /* Read the result back */
    status = cublasGetVector(MM * NN, sizeof(h_C[0]), d_C, 1, h_C_out, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        mexErrMsgTxt("!!!! device access error (read C)\n");

    }

    if (cudaFree(d_A) != cudaSuccess) {
        mexErrMsgTxt("!!!! memory free error (A)\n");
    }
    if (cudaFree(d_B) != cudaSuccess) {
        mexErrMsgTxt("!!!! memory free error (B)\n");
    }
    if (cudaFree(d_C) != cudaSuccess) {
        mexErrMsgTxt("!!!! memory free error (C)\n");
    }

    /* Shutdown */
    status = cublasDestroy(handle);
    if (status != CUBLAS_STATUS_SUCCESS) {
        mexErrMsgTxt("!!!! shutdown error (A)\n");
    }
}
Esempio n. 18
0
/* Main */
int test_cublas(void)
{    
    cublasStatus status;
    cudaError_t e;
    float* h_A;
    float* h_B;
    float* h_C;
    float* h_C_ref;
    float* d_A = 0;
    void *vp;
    float* d_B = 0;
    float* d_C = 0;
    float alpha = 1.0f;
    float beta = 0.0f;
    int n2 = N * N;
    int i;
    float error_norm;
    float ref_norm;
    float diff;

    /* Initialize CUBLAS */
    printf("simpleCUBLAS test running..\n");

    status = cublasInit();
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! CUBLAS initialization error\n");
        return EXIT_FAILURE;
    }

    /* Allocate host memory for the matrices */
    h_A = (float*)malloc(n2 * sizeof(h_A[0]));
    if (h_A == 0) {
        fprintf (stderr, "!!!! host memory allocation error (A)\n");
        return EXIT_FAILURE;
    }
    h_B = (float*)malloc(n2 * sizeof(h_B[0]));
    if (h_B == 0) {
        fprintf (stderr, "!!!! host memory allocation error (B)\n");
        return EXIT_FAILURE;
    }
    h_C = (float*)malloc(n2 * sizeof(h_C[0]));
    if (h_C == 0) {
        fprintf (stderr, "!!!! host memory allocation error (C)\n");
        return EXIT_FAILURE;
    }

    /* Fill the matrices with test data */
    for (i = 0; i < n2; i++) {
        h_A[i] = rand() / (float)RAND_MAX;
        h_B[i] = rand() / (float)RAND_MAX;
        h_C[i] = rand() / (float)RAND_MAX;
    }

    /* Allocate device memory for the matrices */
    if (cudaMalloc(&vp, n2 * sizeof(d_A[0])) != cudaSuccess) {
        fprintf (stderr, "!!!! device memory allocation error (A)\n");
        return EXIT_FAILURE;
    }
    d_A = (float *) vp;

    if (cudaMalloc(&vp, n2 * sizeof(d_B[0])) != cudaSuccess) {
        fprintf (stderr, "!!!! device memory allocation error (B)\n");
        return EXIT_FAILURE;
    }
    d_B = (float *) vp;

    if (cudaMalloc(&vp, n2 * sizeof(d_C[0])) != cudaSuccess) {
        fprintf (stderr, "!!!! device memory allocation error (C)\n");
        return EXIT_FAILURE;
    }
    d_C = (float *) vp;

    /* Initialize the device matrices with the host matrices */
    status = cublasSetVector(n2, sizeof(h_A[0]), h_A, 1, d_A, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device access error (write A)\n");
        return EXIT_FAILURE;
    }
    status = cublasSetVector(n2, sizeof(h_B[0]), h_B, 1, d_B, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device access error (write B)\n");
        return EXIT_FAILURE;
    }
    status = cublasSetVector(n2, sizeof(h_C[0]), h_C, 1, d_C, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device access error (write C)\n");
        return EXIT_FAILURE;
    }
    
    /* Performs operation using plain C code */
    simple_sgemm(N, alpha, h_A, h_B, beta, h_C);
    h_C_ref = h_C;

    /* Clear last error */
    cublasGetError();

    /* Performs operation using cublas */
    cublasSgemm('n', 'n', N, N, N, alpha, d_A, N, d_B, N, beta, d_C, N);
    status = cublasGetError();
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! kernel execution error.\n");
        return EXIT_FAILURE;
    }
    
    /* Allocate host memory for reading back the result from device memory */
    h_C = (float*)malloc(n2 * sizeof(h_C[0]));
    if (h_C == 0) {
        fprintf (stderr, "!!!! host memory allocation error (C)\n");
        return EXIT_FAILURE;
    }

    /* Read the result back */
    status = cublasGetVector(n2, sizeof(h_C[0]), d_C, 1, h_C, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device access error (read C)\n");
        return EXIT_FAILURE;
    }

    /* Check result against reference */
    error_norm = 0;
    ref_norm = 0;
    for (i = 0; i < n2; ++i) {
        diff = h_C_ref[i] - h_C[i];
        error_norm += diff * diff;
        ref_norm += h_C_ref[i] * h_C_ref[i];
    }
    error_norm = (float)sqrt((double)error_norm);
    ref_norm = (float)sqrt((double)ref_norm);
    if (fabs(ref_norm) < 1e-7) {
        fprintf (stderr, "!!!! reference norm is 0\n");
        return EXIT_FAILURE;
    }
    printf( "Test %s\n", (error_norm / ref_norm < 1e-6f) ? "PASSED" : "FAILED");

    /* Memory clean up */
    free(h_A);
    free(h_B);
    free(h_C);
    free(h_C_ref);
    e = cudaFree(d_A);
    if (e != cudaSuccess) {
        fprintf (stderr, "!!!! memory free error (A)\n");
        return EXIT_FAILURE;
    }
    e = cudaFree(d_B);
    if (e != cudaSuccess) {
        fprintf (stderr, "!!!! memory free error (B)\n");
        return EXIT_FAILURE;
    }
    e = cudaFree(d_C);
    if (e != cudaSuccess) {
        fprintf (stderr, "!!!! memory free error (C)\n");
        return EXIT_FAILURE;
    }

    /* Shutdown */
    status = cublasShutdown();
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! shutdown error (A)\n");
        return EXIT_FAILURE;
    }

    return EXIT_SUCCESS;
}
Esempio n. 19
0
float getv(float *da){
  float res[1];
  cublasGetVector(1, sizeof(float), da, 1, res, 1);
  return res[0];
}
Esempio n. 20
0
/* Main */
int main(int argc, char **argv)
{
    cublasStatus_t status;
    float *h_A;
    float *h_B;
    float *h_C;
    float *h_C_rnd;
    float *h_C_ref;
    float *d_A = 0;
    float *d_B = 0;
    float *d_C = 0;
    float alpha = 1.0f;
    float beta = 0.0f;
    int n2 = N * N;
    int i;
    cublasHandle_t handle;

    int dev_id;
    cudaDeviceProp device_prop;

    bool do_device_api_test = false;

    float host_api_test_ratio, device_api_test_ratio;

    /* Initialize CUBLAS */
    printf("simpleCUBLAS test running...\n");

    dev_id = findCudaDevice(argc, (const char **) argv);
    checkCudaErrors(cudaGetDeviceProperties(&device_prop, dev_id));

    if ((device_prop.major << 4) + device_prop.minor >= 0x35)
    {
        printf("Host and device APIs will be tested.\n");
        do_device_api_test = true;
    }
    /*    else if ((device_prop.major << 4) + device_prop.minor >= 0x20)
        {
            printf("Host API will be tested.\n");
            do_device_api_test = false;
        }
    */
    else
    {
        fprintf(stderr, "simpleDevLibCUBLAS examples requires Compute Capability of SM 3.5 or higher\n");

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        return EXIT_SUCCESS;
    }

    status = cublasCreate(&handle);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! CUBLAS initialization error\n");

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        return EXIT_FAILURE;
    }

    /* Allocate host memory for the matrices */
    h_A = (float *)malloc(n2 * sizeof(h_A[0]));

    if (h_A == 0)
    {
        fprintf(stderr, "!!!! host memory allocation error (A)\n");

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        return EXIT_FAILURE;
    }

    h_B = (float *)malloc(n2 * sizeof(h_B[0]));

    if (h_B == 0)
    {
        fprintf(stderr, "!!!! host memory allocation error (B)\n");

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        return EXIT_FAILURE;
    }

    h_C_rnd = (float *)malloc(n2 * sizeof(h_C_rnd[0]));

    if (h_C_rnd == 0)
    {
        fprintf(stderr, "!!!! host memory allocation error (C_rnd)\n");

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        return EXIT_FAILURE;
    }

    h_C = (float *)malloc(n2 * sizeof(h_C_ref[0]));

    if (h_C == 0)
    {
        fprintf(stderr, "!!!! host memory allocation error (C)\n");

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        return EXIT_FAILURE;
    }

    /* Fill the matrices with test data */
    for (i = 0; i < n2; i++)
    {
        h_A[i] = rand() / (float)RAND_MAX;
        h_B[i] = rand() / (float)RAND_MAX;
        h_C_rnd[i] = rand() / (float)RAND_MAX;
        h_C[i] = h_C_rnd[i];
    }

    /* Allocate device memory for the matrices */
    if (cudaMalloc((void **)&d_A, n2 * sizeof(d_A[0])) != cudaSuccess)
    {
        fprintf(stderr, "!!!! device memory allocation error (allocate A)\n");

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        return EXIT_FAILURE;
    }

    if (cudaMalloc((void **)&d_B, n2 * sizeof(d_B[0])) != cudaSuccess)
    {
        fprintf(stderr, "!!!! device memory allocation error (allocate B)\n");

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        return EXIT_FAILURE;
    }

    if (cudaMalloc((void **)&d_C, n2 * sizeof(d_C[0])) != cudaSuccess)
    {
        fprintf(stderr, "!!!! device memory allocation error (allocate C)\n");

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        return EXIT_FAILURE;
    }

    /* Initialize the device matrices with the host matrices */
    status = cublasSetVector(n2, sizeof(h_A[0]), h_A, 1, d_A, 1);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! device access error (write A)\n");

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        return EXIT_FAILURE;
    }

    status = cublasSetVector(n2, sizeof(h_B[0]), h_B, 1, d_B, 1);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! device access error (write B)\n");

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        return EXIT_FAILURE;
    }

    status = cublasSetVector(n2, sizeof(h_C_rnd[0]), h_C_rnd, 1, d_C, 1);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! device access error (write C)\n");

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        return EXIT_FAILURE;
    }

    /*
     * Performs operation using plain C code
     */
    simple_sgemm(N, alpha, h_A, h_B, beta, h_C);
    h_C_ref = h_C;

    /*
     * Performs operation using cublas
     */
    status = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A, N, d_B, N, &beta, d_C, N);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! kernel execution error\n");

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        return EXIT_FAILURE;
    }

    /* Allocate host memory for reading back the result from device memory */
    h_C = (float *)malloc(n2 * sizeof(h_C[0]));

    if (h_C == 0)
    {
        fprintf(stderr, "!!!! host memory allocation error (C)\n");

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        return EXIT_FAILURE;
    }

    /* Read the result back */
    status = cublasGetVector(n2, sizeof(h_C[0]), d_C, 1, h_C, 1);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! device access error (read C)\n");

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        return EXIT_FAILURE;
    }

    /* Check result against reference */
    host_api_test_ratio = check_result(h_C, h_C_ref, n2);

    if (do_device_api_test)
    {
        /* Reset device resident C matrix */
        status = cublasSetVector(n2, sizeof(h_C_rnd[0]), h_C_rnd, 1, d_C, 1);

        if (status != CUBLAS_STATUS_SUCCESS)
        {
            fprintf(stderr, "!!!! device access error (write C)\n");

            // cudaDeviceReset causes the driver to clean up all state. While
            // not mandatory in normal operation, it is good practice.  It is also
            // needed to ensure correct operation when the application is being
            // profiled. Calling cudaDeviceReset causes all profile data to be
            // flushed before the application exits
            cudaDeviceReset();
            return EXIT_FAILURE;
        }

        /*
         * Performs operation using the device API of CUBLAS library
         */
        device_cublas_sgemm(N, alpha, d_A, d_B, beta, d_C);

        /* Read the result back */
        status = cublasGetVector(n2, sizeof(h_C[0]), d_C, 1, h_C, 1);

        if (status != CUBLAS_STATUS_SUCCESS)
        {
            fprintf(stderr, "!!!! device access error (read C)\n");

            // cudaDeviceReset causes the driver to clean up all state. While
            // not mandatory in normal operation, it is good practice.  It is also
            // needed to ensure correct operation when the application is being
            // profiled. Calling cudaDeviceReset causes all profile data to be
            // flushed before the application exits
            cudaDeviceReset();
            return EXIT_FAILURE;
        }

        /* Check result against reference */
        device_api_test_ratio = check_result(h_C, h_C_ref, n2);
    }

    /* Memory clean up */
    free(h_A);
    free(h_B);
    free(h_C);
    free(h_C_rnd);
    free(h_C_ref);

    if (cudaFree(d_A) != cudaSuccess)
    {
        fprintf(stderr, "!!!! memory free error (A)\n");

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        return EXIT_FAILURE;
    }

    if (cudaFree(d_B) != cudaSuccess)
    {
        fprintf(stderr, "!!!! memory free error (B)\n");

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        return EXIT_FAILURE;
    }

    if (cudaFree(d_C) != cudaSuccess)
    {
        fprintf(stderr, "!!!! memory free error (C)\n");

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        return EXIT_FAILURE;
    }

    /* Shutdown */
    status = cublasDestroy(handle);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! shutdown error (A)\n");

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        return EXIT_FAILURE;
    }

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();

    bool test_result = do_device_api_test ?
                       host_api_test_ratio < 1e-6 &&
                       device_api_test_ratio < 1e-6 :
                       host_api_test_ratio < 1e-6;

    printf("simpleCUBLAS completed, returned %s\n",
           test_result ? "OK" : "ERROR!");

    exit(test_result ? EXIT_SUCCESS : EXIT_FAILURE);
}
Esempio n. 21
0
SEXP magma_dgeMatrix_LU_(SEXP x, Rboolean warn_sing)
{
#ifdef HIPLAR_WITH_MAGMA
	SEXP val = get_factors(x, "LU");
	int *dims, npiv, info;

	if (val != R_NilValue) {
//		R_ShowMessage("already in slot");	/* nothing to do if it's there in 'factors' slot */
		return val;
	}

	dims = INTEGER(GET_SLOT(x, Matrix_DimSym));
	if (dims[0] < 1 || dims[1] < 1)
		error(_("Cannot factor a matrix with zero extents"));
	npiv = (dims[0] < dims[1]) ? dims[0] : dims[1];
	val = PROTECT(NEW_OBJECT(MAKE_CLASS("denseLU")));
	slot_dup(val, x, Matrix_xSym);
	slot_dup(val, x, Matrix_DimSym);
	double *h_R = REAL(GET_SLOT(val, Matrix_xSym));
	int *ipiv = INTEGER(ALLOC_SLOT(val, Matrix_permSym, INTSXP, npiv));
	
	if(GPUFlag == 0){
#ifdef HIPLAR_DBG
		R_ShowMessage("DBG: LU decomposition using dgetrf;");
#endif
		F77_CALL(dgetrf)(dims, dims + 1, h_R,
				dims,
				ipiv,
				&info);
	}
	else if(GPUFlag == 1 && Interface == 0){
#ifdef HIPLAR_DBG
		R_ShowMessage("DBG: LU decomposition using magma_dgetrf;");
#endif
			magma_dgetrf(dims[0], dims[1], h_R, dims[0], ipiv, &info);
		}
		else if(GPUFlag == 1 && Interface == 1) {
			
#ifdef HIPLAR_DBG
			R_ShowMessage("DBG: LU decomposition using magma_dgetrf_gpu;");	
#endif
			double *d_A;
			int N2 = dims[0] * dims[1];
			cublasStatus retStatus;

			cublasAlloc( N2 , sizeof(double), (void**)&d_A);
			
			/* Error Checking */
			retStatus = cublasGetError ();
			if (retStatus != CUBLAS_STATUS_SUCCESS) 
				error(_("CUBLAS: Error in Memory Allocation"));
			/********************************************/

			cublasSetVector(N2, sizeof(double), h_R, 1, d_A, 1);
			
			/* Error Checking */
			retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Date Transfer to Device"));
			/********************************************/

			magma_dgetrf_gpu(dims[0],dims[1], d_A, dims[0], ipiv,  &info);
			
			cublasGetVector( N2, sizeof(double), d_A, 1, h_R, 1);		
			
			/* Error Checking */
			retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Date Transfer from Device"));
			/********************************************/
			
				cublasFree(d_A);
		
			/* Error Checking */
			retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error freeing data"));
			/********************************************/
		}
		else
			error(_("MAGMA/LAPACK/Interface Flag not defined correctly"));
		
	if (info < 0)
		error(_("Lapack routine %s returned error code %d"), "dgetrf", info);
	else if (info > 0 && warn_sing)
		warning(_("Exact singularity detected during LU decomposition: %s, i=%d."),
				"U[i,i]=0", info);
	UNPROTECT(1);

	return set_factors(x, val, "LU");
#endif

	    return R_NilValue;
}
// Single precision matrix multiplication using CUBLAS
//	C = A * B
//
// CUBLAS function called
//
//  cublasSgemm
//
//  CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemm_v2 (
//		cublasHandle_t handle, 
//      cublasOperation_t transa,
//      cublasOperation_t transb, 
//      int m,		// number of rows in matrices A and C
//      int n,		// number of columns in matrices B and C
//      int k,		// number of columns in A and number of rows in B
//      const float *alpha, /* host or device pointer */  
//      const float *A, 
//      int lda,
//      const float *B,
//      int ldb, 
//      const float *beta, /* host or device pointer */  
//      float *C,
//      int ldc);
//
int cublasSGEMM (float* C, float* A, float* B, int HA, int WA, int WB)
{
	cublasStatus_t status;
    float *d_A = 0;
    float *d_B = 0;
    float *d_C = 0;
    float alpha = 1.0f;
    float beta = 0.0f;
    float error_norm;
    float ref_norm;
    float diff;
    cublasHandle_t handle;
	cudaError_t cudaStatus;

	// Make sure CUDA device 0 is available
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) 
	{
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        return 1;
    }

    /* Initialize CUBLAS */
    status = cublasCreate(&handle);
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! CUBLAS initialization error\n");
        return EXIT_FAILURE;
    }
 
    /* Allocate device memory for the matrices (d_A, d_B, and d_C) */
    if (cudaMalloc((void **)&d_A, HA * WA * sizeof(d_A[0])) != cudaSuccess)
    {
        fprintf(stderr, "!!!! device memory allocation error (allocate A)\n");
        return EXIT_FAILURE;
    }
    if (cudaMalloc((void **)&d_B, WA * WB * sizeof(d_B[0])) != cudaSuccess)
    {
        fprintf(stderr, "!!!! device memory allocation error (allocate B)\n");
        return EXIT_FAILURE;
    }
    if (cudaMalloc((void **)&d_C, HA * WB * sizeof(d_C[0])) != cudaSuccess)
    {
        printf("!!!! device memory allocation error (allocate C)\n");
        return EXIT_FAILURE;
    }

    /* Initialize the device matrices with the host matrices (A, B, and C) */
    status = cublasSetVector(HA * WA, sizeof(A[0]), A, 1, d_A, 1);
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! device access error (write A)\n");
        return EXIT_FAILURE;
    }
    status = cublasSetVector(WA * WB, sizeof(B[0]), B, 1, d_B, 1);
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! device access error (write B)\n");
        return EXIT_FAILURE;
    }
    status = cublasSetVector(HA * WB, sizeof(C[0]), C, 1, d_C, 1);
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! device access error (write C)\n");
        return EXIT_FAILURE;
    }

    /* Performs operation using cublas */
    status = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, HA, WB, WA, &alpha, d_A, HA, d_B, WA, &beta, d_C, HA);
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! kernel execution error.\n");
        return EXIT_FAILURE;
    }

    /* Read the result back (to C)*/
    status = cublasGetVector(HA * WB, sizeof(C[0]), d_C, 1, C, 1);
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! device access error (read C)\n");
        return EXIT_FAILURE;
    }

    /* Memory clean up */
    if (cudaFree(d_A) != cudaSuccess)
    {
        fprintf(stderr, "!!!! memory free error (A)\n");
        return EXIT_FAILURE;
    }
    if (cudaFree(d_B) != cudaSuccess)
    {
        fprintf(stderr, "!!!! memory free error (B)\n");
        return EXIT_FAILURE;
    }
    if (cudaFree(d_C) != cudaSuccess)
    {
        fprintf(stderr, "!!!! memory free error (C)\n");
        return EXIT_FAILURE;
    }

    /* Shutdown */
    status = cublasDestroy(handle);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! shutdown error (A)\n");
        return EXIT_FAILURE;
    }

	return 0;
}
Esempio n. 23
0
SEXP magma_dpoMatrix_solve(SEXP x)
{
#ifdef HIPLAR_WITH_MAGMA
    SEXP Chol = magma_dpoMatrix_chol(x);
    SEXP val = PROTECT(NEW_OBJECT(MAKE_CLASS("dpoMatrix")));
    int *dims = INTEGER(GET_SLOT(x, Matrix_DimSym)), info;

    SET_SLOT(val, Matrix_factorSym, allocVector(VECSXP, 0));
    slot_dup(val, Chol, Matrix_uploSym);
    slot_dup(val, Chol, Matrix_xSym);
    slot_dup(val, Chol, Matrix_DimSym);
    SET_SLOT(val, Matrix_DimNamesSym,
	     duplicate(GET_SLOT(x, Matrix_DimNamesSym)));
    double *A = REAL(GET_SLOT(val, Matrix_xSym));
		int N = *dims;	
		int lda = N;
		const char *uplo = uplo_P(val);
		
		if(GPUFlag == 0) {
			
			F77_CALL(dpotri)(uplo_P(val), dims, A, dims, &info);
		
		}
		else if(GPUFlag == 1 && Interface == 0) {
#ifdef HIPLAR_DBG
			R_ShowMessage("DBG: Solving using magma_dpotri");
#endif
			magma_dpotri(uplo[0], N, A, lda, &info);
		}
		else if(GPUFlag == 1 && Interface == 1){
			double *d_A;
			cublasStatus retStatus;
			cublasAlloc( N * lda , sizeof(double), (void**)&d_A);
#ifdef HIPLAR_DBG
			R_ShowMessage("DBG: Solving using magma_dpotri_gpu");
#endif		
			/* Error Checking */
			retStatus = cublasGetError ();
			if (retStatus != CUBLAS_STATUS_SUCCESS) 
				error(_("CUBLAS: Error in Memory Allocation"));
			/********************************************/

			cublasSetVector( N * lda, sizeof(double), A, 1, d_A, 1);

			/* Error Checking */
			retStatus = cublasGetError ();
			if (retStatus != CUBLAS_STATUS_SUCCESS) 
				error(_("CUBLAS: Error in Data Transfer to Device"));
			/********************************************/

			magma_dpotri_gpu(uplo[0], N, d_A, lda, &info);

			cublasGetVector(N * lda, sizeof(double), d_A, 1, val, 1);
			
			/* Error Checking */
			retStatus = cublasGetError ();
			if (retStatus != CUBLAS_STATUS_SUCCESS) 
				error(_("CUBLAS: Error in Data Transfer from Device"));
			/********************************************/
			
			cublasFree(d_A);
		}
		else
			error(_("MAGMA/LAPACK/Interface Flag not defined correctly"));
	
		if (info) {
			if(info > 0)
				error(_("the leading minor of order %d is not positive definite"),
						info);
			else /* should never happen! */
				error(_("Lapack routine %s returned error code %d"), "dpotrf", info);
		}
		
		UNPROTECT(1);
    return val;
#endif
	return R_NilValue;
}
Esempio n. 24
0
void ckm( struct svm_problem *prob, struct svm_problem *pecm, float *gamma  )
{
	cublasStatus_t status;

	double g_val = *gamma;

	long int nfa;
	
	int len_tv;
	int ntv;
	int i_v;
	int i_el;
	int i_r, i_c;
	int trvei;

	double *tv_sq;
	double *v_f_g;

	float *tr_ar;
	float *tva, *vtm, *DP;
	float *g_tva = 0, *g_vtm = 0, *g_DotProd = 0;

	cudaError_t cudaStat;   
	cublasHandle_t handle;
	
	status = cublasCreate(&handle);

	len_tv = prob-> x[0].dim;
	ntv   = prob-> l;

	nfa = len_tv * ntv; 

	tva = (float*) malloc ( len_tv * ntv* sizeof(float) );
	vtm = (float*) malloc ( len_tv * sizeof(float) );
	DP  = (float*) malloc ( ntv * sizeof(float) );

	tr_ar = (float*) malloc ( len_tv * ntv* sizeof(float) );

	tv_sq = (double*) malloc ( ntv * sizeof(double) );

	v_f_g  = (double*) malloc ( ntv * sizeof(double) );

	for ( i_r = 0; i_r < ntv ; i_r++ )
	{				 
		for ( i_c = 0; i_c < len_tv; i_c++ ) 
			tva[i_r * len_tv + i_c] = (float)prob-> x[i_r].values[i_c];
	}

	cudaStat = cudaMalloc((void**)&g_tva, len_tv * ntv * sizeof(float));
	
	if (cudaStat != cudaSuccess) {
		free( tva );
		free( vtm );
		free( DP  );

		free( v_f_g );
		free( tv_sq );

		cudaFree( g_tva );
		cublasDestroy( handle );	
	
		fprintf (stderr, "!!!! Device memory allocation error (A)\n");
		getchar();
		return;
    }

	cudaStat = cudaMalloc((void**)&g_vtm, len_tv * sizeof(float));

	cudaStat = cudaMalloc((void**)&g_DotProd, ntv * sizeof(float));

	for( i_r = 0; i_r < ntv; i_r++ )
		for( i_c = 0; i_c < len_tv; i_c++ )
			tr_ar[i_c * ntv + i_r] = tva[i_r * len_tv + i_c];

	// Copy cpu vector to gpu vector
	status = cublasSetVector( len_tv * ntv, sizeof(float), tr_ar, 1, g_tva, 1 );
    
	free( tr_ar );

	for( i_v = 0; i_v < ntv; i_v++ )
	{
		tv_sq[ i_v ] = 0;
		for( i_el = 0; i_el < len_tv; i_el++ )
			tv_sq[i_v] += pow( tva[i_v*len_tv + i_el], (float)2.0 );
	}



	for ( trvei = 0; trvei < ntv; trvei++ )
	{
		status = cublasSetVector( len_tv, sizeof(float), &tva[trvei * len_tv], 1, g_vtm, 1 );
		
		status = cublasSgemv( handle, CUBLAS_OP_N, ntv, len_tv, &alpha, g_tva, ntv , g_vtm, 1, &beta, g_DotProd, 1 );

		status = cublasGetVector( ntv, sizeof(float), g_DotProd, 1, DP, 1 );

		for ( i_c = 0; i_c < ntv; i_c++ )
			v_f_g[i_c] = exp( -g_val * (tv_sq[trvei] + tv_sq[i_c]-((double)2.0)* (double)DP[i_c] ));
		

		pecm-> x[trvei].values[0] = trvei + 1;
		
		for ( i_c = 0; i_c < ntv; i_c++ )
			pecm-> x[trvei].values[i_c + 1] = v_f_g[i_c];				
		

	}

	free( tva );
	free( vtm );
	free( DP  );
	free( v_f_g );
	free( tv_sq );

	cudaFree( g_tva );
	cudaFree( g_vtm );
	cudaFree( g_DotProd );

	cublasDestroy( handle );
}
Esempio n. 25
0
/* Main */
int main(int argc, char **argv)
{
    cublasStatus_t status;
    float *h_A;
    float *h_B;
    float *h_C;
    float *h_C_ref;
    float *d_A = 0;
    float *d_B = 0;
    float *d_C = 0;
    float alpha = 1.0f;
    float beta = 0.0f;
    int n2 = N * N;
    int i;
    float error_norm;
    float ref_norm;
    float diff;
    cublasHandle_t handle;

    int dev = findCudaDevice(argc, (const char **) argv);

    if (dev == -1)
    {
        return EXIT_FAILURE;
    }

    /* Initialize CUBLAS */
    printf("simpleCUBLAS test running..\n");

    status = cublasCreate(&handle);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! CUBLAS initialization error\n");
        return EXIT_FAILURE;
    }

    /* Allocate host memory for the matrices */
    h_A = (float *)malloc(n2 * sizeof(h_A[0]));

    if (h_A == 0)
    {
        fprintf(stderr, "!!!! host memory allocation error (A)\n");
        return EXIT_FAILURE;
    }

    h_B = (float *)malloc(n2 * sizeof(h_B[0]));

    if (h_B == 0)
    {
        fprintf(stderr, "!!!! host memory allocation error (B)\n");
        return EXIT_FAILURE;
    }

    h_C = (float *)malloc(n2 * sizeof(h_C[0]));

    if (h_C == 0)
    {
        fprintf(stderr, "!!!! host memory allocation error (C)\n");
        return EXIT_FAILURE;
    }

    /* Fill the matrices with test data */
    for (i = 0; i < n2; i++)
    {
        h_A[i] = rand() / (float)RAND_MAX;
        h_B[i] = rand() / (float)RAND_MAX;
        h_C[i] = rand() / (float)RAND_MAX;
    }

    /* Allocate device memory for the matrices */
    if (cudaMalloc((void **)&d_A, n2 * sizeof(d_A[0])) != cudaSuccess)
    {
        fprintf(stderr, "!!!! device memory allocation error (allocate A)\n");
        return EXIT_FAILURE;
    }

    if (cudaMalloc((void **)&d_B, n2 * sizeof(d_B[0])) != cudaSuccess)
    {
        fprintf(stderr, "!!!! device memory allocation error (allocate B)\n");
        return EXIT_FAILURE;
    }

    if (cudaMalloc((void **)&d_C, n2 * sizeof(d_C[0])) != cudaSuccess)
    {
        fprintf(stderr, "!!!! device memory allocation error (allocate C)\n");
        return EXIT_FAILURE;
    }

    /* Initialize the device matrices with the host matrices */
    status = cublasSetVector(n2, sizeof(h_A[0]), h_A, 1, d_A, 1);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! device access error (write A)\n");
        return EXIT_FAILURE;
    }

    status = cublasSetVector(n2, sizeof(h_B[0]), h_B, 1, d_B, 1);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! device access error (write B)\n");
        return EXIT_FAILURE;
    }

    status = cublasSetVector(n2, sizeof(h_C[0]), h_C, 1, d_C, 1);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! device access error (write C)\n");
        return EXIT_FAILURE;
    }

    /* Performs operation using plain C code */
    simple_sgemm(N, alpha, h_A, h_B, beta, h_C);
    h_C_ref = h_C;

    /* Performs operation using cublas */
    status = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A, N, d_B, N, &beta, d_C, N);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! kernel execution error.\n");
        return EXIT_FAILURE;
    }

    /* Allocate host memory for reading back the result from device memory */
    h_C = (float *)malloc(n2 * sizeof(h_C[0]));

    if (h_C == 0)
    {
        fprintf(stderr, "!!!! host memory allocation error (C)\n");
        return EXIT_FAILURE;
    }

    /* Read the result back */
    status = cublasGetVector(n2, sizeof(h_C[0]), d_C, 1, h_C, 1);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! device access error (read C)\n");
        return EXIT_FAILURE;
    }

    /* Check result against reference */
    error_norm = 0;
    ref_norm = 0;

    for (i = 0; i < n2; ++i)
    {
        diff = h_C_ref[i] - h_C[i];
        error_norm += diff * diff;
        ref_norm += h_C_ref[i] * h_C_ref[i];
    }

    error_norm = (float)sqrt((double)error_norm);
    ref_norm = (float)sqrt((double)ref_norm);

    if (fabs(ref_norm) < 1e-7)
    {
        fprintf(stderr, "!!!! reference norm is 0\n");
        return EXIT_FAILURE;
    }

    /* Memory clean up */
    free(h_A);
    free(h_B);
    free(h_C);
    free(h_C_ref);

    if (cudaFree(d_A) != cudaSuccess)
    {
        fprintf(stderr, "!!!! memory free error (A)\n");
        return EXIT_FAILURE;
    }

    if (cudaFree(d_B) != cudaSuccess)
    {
        fprintf(stderr, "!!!! memory free error (B)\n");
        return EXIT_FAILURE;
    }

    if (cudaFree(d_C) != cudaSuccess)
    {
        fprintf(stderr, "!!!! memory free error (C)\n");
        return EXIT_FAILURE;
    }

    /* Shutdown */
    status = cublasDestroy(handle);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! shutdown error (A)\n");
        return EXIT_FAILURE;
    }

    printf("CUBLAS program finished\n");

    exit(error_norm / ref_norm < 1e-6f ? EXIT_SUCCESS : EXIT_FAILURE);
}
Esempio n. 26
0
int gpu_gemm(const real *h_A, const real *h_B, real *h_C, const real alpha,
    const real beta, const int N)
{
  real *d_A = 0;
  real *d_B = 0;
  real *d_C = 0;
  int n2 = N * N;

  cublasStatus_t status;
  cublasHandle_t handle;

  status = cublasCreate(&handle);
  if (status != CUBLAS_STATUS_SUCCESS)
  {
    fprintf(stderr, "!!!! CUBLAS initialization error\n");
    return EXIT_FAILURE;
  }
  /* Allocate device memory for the matrices */
  if (cudaMalloc((void **)&d_A, n2 * sizeof(d_A[0])) != cudaSuccess)
  {
    fprintf(stderr, "!!!! device memory allocation error (allocate A)\n");
    return EXIT_FAILURE;
  }
  if (cudaMalloc((void **)&d_B, n2 * sizeof(d_B[0])) != cudaSuccess)
  {
    fprintf(stderr, "!!!! device memory allocation error (allocate B)\n");
    return EXIT_FAILURE;
  }

  if (cudaMalloc((void **)&d_C, n2 * sizeof(d_C[0])) != cudaSuccess)
  {
    fprintf(stderr, "!!!! device memory allocation error (allocate C)\n");
    return EXIT_FAILURE;
  }

  /* Initialize the device matrices with the host matrices */
  status = cublasSetVector(n2, sizeof(h_A[0]), h_A, 1, d_A, 1);
  status = cublasSetVector(n2, sizeof(h_B[0]), h_B, 1, d_B, 1);
  status = cublasSetVector(n2, sizeof(h_C[0]), h_C, 1, d_C, 1);

  /* Performs operation using cublas */
  status = GEMM(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A, N, d_B, N, &beta, d_C, N);
  if (status != CUBLAS_STATUS_SUCCESS)
  {
    fprintf(stderr, "!!!! kernel execution error.\n");
    return EXIT_FAILURE;
  }

  /* Read the result back */
  status = cublasGetVector(n2, sizeof(h_C[0]), d_C, 1, h_C, 1);

  if (cudaFree(d_A) != cudaSuccess)
  {
    fprintf(stderr, "!!!! memory free error (A)\n");
    return EXIT_FAILURE;
  }

  if (cudaFree(d_B) != cudaSuccess)
  {
    fprintf(stderr, "!!!! memory free error (B)\n");
    return EXIT_FAILURE;
  }

  if (cudaFree(d_C) != cudaSuccess)
  {
    fprintf(stderr, "!!!! memory free error (C)\n");
    return EXIT_FAILURE;
  }

  /* Shutdown */
  status = cublasDestroy(handle);

  if (status != CUBLAS_STATUS_SUCCESS)
  {
    fprintf(stderr, "!!!! shutdown error (A)\n");
    return EXIT_FAILURE;
  }

  return 0;
}
Esempio n. 27
0
SEXP magma_dpoMatrix_dgeMatrix_solve(SEXP a, SEXP b)
{
#ifdef HIPLAR_WITH_MAGMA
	SEXP Chol = magma_dpoMatrix_chol(a),
			 val = PROTECT(NEW_OBJECT(MAKE_CLASS("dgeMatrix")));
	int *adims = INTEGER(GET_SLOT(a, Matrix_DimSym)),
			*bdims = INTEGER(GET_SLOT(b, Matrix_DimSym)),
			info;

	/* Checking Matrix Dimensions */
	if (adims[1] != bdims[0])
		error(_("Dimensions of system to be solved are inconsistent"));
	if (adims[0] < 1 || bdims[1] < 1)
		error(_("Cannot solve() for matrices with zero extents"));
	/* ****************************************** */
	
	SET_SLOT(val, Matrix_factorSym, allocVector(VECSXP, 0));
	slot_dup(val, b, Matrix_DimSym);
	slot_dup(val, b, Matrix_xSym);

	double *A = REAL(GET_SLOT(Chol, Matrix_xSym));
	double *B = REAL(GET_SLOT(val, Matrix_xSym));

	if(GPUFlag == 1) {
	
#ifdef HIPLAR_DBG	
		R_ShowMessage("DBG: Solving system of Ax = b, A = dpo, b = dge, using dpotrs_gpu;");
#endif
		double *d_A, *d_B;
		const char *uplo = uplo_P(Chol);
		magma_int_t NRHS = bdims[1];
		magma_int_t lda	 = adims[1];
		magma_int_t ldb  = bdims[0];
		magma_int_t N 	 = adims[0];
		cublasStatus retStatus;

		/*if(uplo == "U")
			uplo = MagmaUpperStr;
		else if(uplo == "L")
			uplo = MagmaLowerStr;
		else		
			uplo = MagmaUpperStr;
		*/

		cublasAlloc(N * lda, sizeof(double), (void**)&d_A);
		
		/* Error Checking */
		retStatus = cublasGetError ();
		if (retStatus != CUBLAS_STATUS_SUCCESS) 
			error(_("CUBLAS: Error in Memory Allocation"));
		/********************************************/

		cublasAlloc(N * NRHS, sizeof(double), (void**)&d_B);	

		/* Error Checking */
		retStatus = cublasGetError ();
		if (retStatus != CUBLAS_STATUS_SUCCESS) 
			error(_("CUBLAS: Error in Memory Allocation"));
		/********************************************/

		cublasSetVector( N  * lda , sizeof(double), A, 1, d_A, 1);
		
		/* Error Checking */
		retStatus = cublasGetError ();
		if (retStatus != CUBLAS_STATUS_SUCCESS) 
			error(_("CUBLAS: Error in Data Transfer to Device"));
		/********************************************/

		cublasSetVector( ldb * NRHS, sizeof(double), B, 1, d_B, 1 );
		
		/* Error Checking */
		retStatus = cublasGetError ();
		if (retStatus != CUBLAS_STATUS_SUCCESS) 
			error(_("CUBLAS: Error in Data Transfer to Device"));
		/********************************************/

		magma_dpotrs_gpu(uplo[0], N, NRHS , d_A, lda, d_B, ldb, &info);

		cublasGetVector( ldb * NRHS, sizeof(double), d_B, 1, B, 1);
		
		/* Error Checking */
		retStatus = cublasGetError ();
		if (retStatus != CUBLAS_STATUS_SUCCESS) 
			error(_("CUBLAS: Error in Data Transfer from Device"));
		/********************************************/
		
		cublasFree(d_A);
		cublasFree(d_B);
	}
	else {
	
#ifdef HIPLAR_DBG	
		R_ShowMessage("DBG: Solving system of Ax = b, A = dpo, b = dge, using dpotrs;");
#endif
		F77_CALL(dpotrs)(uplo_P(Chol), adims, bdims + 1, A , adims, B , bdims, &info);
	}
	if (info) {
		if(info > 0)
			error(_("the leading minor of order %d is not positive definite"),
					info);
		else /* should never happen! */
			error(_("Lapack routine %s returned error code %d"), "dpotrf", info);
	}
	UNPROTECT(1);
	return val;
#endif
	return R_NilValue;
}
Esempio n. 28
0
File: xgemm.c Progetto: deccs/PLearn
/* Main */
int main(int argc, char** argv)
{    
  if (argc!=5){ 
    fprintf (stderr, "Usage: %s <sizeM> <sizeN> <sizeK> <Nb iter>\n",argv[0]); 
    exit(0); 
  } 
  const int M=strtol(argv[1],0,0);
  const int N=strtol(argv[2],0,0);
  const int K=strtol(argv[3],0,0);
  const int NBITER=strtol(argv[4],0,0);
  const int NA= M * K;
  const int NB= K * N;
  const int NC= M * N;
  real* h_A;
  real* h_B;
  real* h_C;
  const real alpha = 1.0f;
  const real beta = 0.0f;
#ifdef NVIDIA
  cublasStatus status;
  real* d_A = 0;
  real* d_B = 0;
  real* d_C = 0;
#endif

#ifdef COMPARE
  real* h_C_ref;
  real error_norm;
  real ref_norm;
  real diff;
#endif

    /* Allocate host memory for the matrices */
    h_A = (real*)malloc(NA * sizeof(h_A[0]));
    if (h_A == 0) {
        fprintf (stderr, "!!!! host memory allocation error (A)\n");
        return EXIT_FAILURE;
    }
    h_B = (real*)malloc(NB * sizeof(h_B[0]));
    if (h_B == 0) {
        fprintf (stderr, "!!!! host memory allocation error (B)\n");
        return EXIT_FAILURE;
    }
    h_C = (real*)malloc(NC * sizeof(h_C[0]));
    if (h_C == 0) {
        fprintf (stderr, "!!!! host memory allocation error (C)\n");
        return EXIT_FAILURE;
    }

    for (int i = 0; i < NA; ++i) h_A[i] = M_PI+(real)i;
    for (int i = 0; i < NB; ++i) h_B[i] = M_PI+(real)i;

#ifdef NVIDIA
    /* Initialize CUBLAS */
    status = cublasInit();
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! CUBLAS initialization error\n");
        return EXIT_FAILURE;
    }
    /* Allocate device memory for the matrices */
    status = cublasAlloc(NA, sizeof(d_A[0]), (void**)&d_A);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device memory allocation error (A)\n");
        return EXIT_FAILURE;
    }
    status = cublasAlloc(NB, sizeof(d_B[0]), (void**)&d_B);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device memory allocation error (B)\n");
        return EXIT_FAILURE;
    }
    status = cublasAlloc(NC, sizeof(d_C[0]), (void**)&d_C);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device memory allocation error (C)\n");
        return EXIT_FAILURE;
    }

    /* Initialize the device matrices with the host matrices */
    status = cublasSetVector(NA, sizeof(h_A[0]), h_A, 1, d_A, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device access error (write A)\n");
        return EXIT_FAILURE;
    }
    status = cublasSetVector(NB, sizeof(h_B[0]), h_B, 1, d_B, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device access error (write B)\n");
        return EXIT_FAILURE;
    }
    status = cublasSetVector(NC, sizeof(h_C[0]), h_C, 1, d_C, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device access error (write C)\n");
        return EXIT_FAILURE;
    }

    /* Clear last error */
    cublasGetError();
#endif
#ifdef COMPARE
    /* Performs operation using plain C code */
    for (int i=0;i<NBITER;i++)
      c_xgemm(M,N,K, alpha, h_A, h_B, beta, h_C);
    h_C_ref = h_C;
    /* Allocate host memory for reading back the result from device memory */
    h_C = (real*)malloc(NC * sizeof(h_C[0]));
    if (h_C == 0) {
        fprintf (stderr, "!!!! host memory allocation error (C)\n");
        return EXIT_FAILURE;
    }
#endif
#ifdef NVIDIA
    /* Performs operation using cublas */
    for (int i=0;i<NBITER;i++)
      //We must Change the order of the parameter as cublas take
      //matrix as colomn major and C matrix is row major
      cublasSgemm('n', 'n', N, M, K, alpha, d_B, N, d_A, K, beta, d_C, N);

    status = cublasGetError();
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! kernel execution error.\n");
        return EXIT_FAILURE;
    }
    /* Read the result back */
    status = cublasGetVector(NC, sizeof(h_C[0]), d_C, 1, h_C, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device access error (read C)\n");
        return EXIT_FAILURE;
    }
#elif defined( CXGEMM )
    for (int i=0;i<NBITER;i++)
      c_xgemm(M,N,K, alpha, h_A, h_B, beta, h_C);
#else
    char transa='N', transb='N';
    for (int i=0;i<NBITER;i++)
      sgemm_(&transb, &transa, &N, &M, &K, &alpha, h_B, &N, h_A, &K, &beta, h_C, &N);

#endif
#ifdef COMPARE
    /* Check result against reference */
    error_norm = 0;
    ref_norm = 0;
    for (int i = 0; i < NC; ++i) {
        diff = h_C_ref[i] - h_C[i];
        error_norm += diff * diff;
        ref_norm += h_C_ref[i] * h_C_ref[i];
    }
    error_norm = (float)sqrt((double)error_norm);
    ref_norm = (float)sqrt((double)ref_norm);
    if (fabs(ref_norm) < 1e-7) {
        fprintf (stderr, "!!!! reference norm is 0\n");
        return EXIT_FAILURE;
    }
    printf( "Test %s\n", (error_norm / ref_norm < 1e-6f) ? "PASSED" : "FAILED");
#endif

    /* Memory clean up */
    free(h_A);
    free(h_B);
    free(h_C);

#ifdef NVIDIA
    status = cublasFree(d_A);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! memory free error (A)\n");
        return EXIT_FAILURE;
    }
    status = cublasFree(d_B);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! memory free error (B)\n");
        return EXIT_FAILURE;
    }
    status = cublasFree(d_C);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! memory free error (C)\n");
        return EXIT_FAILURE;
    }

    /* Shutdown */
    status = cublasShutdown();
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! shutdown error (A)\n");
        return EXIT_FAILURE;
    }
#endif
    //    if (argc <= 1 || strcmp(argv[1], "-noprompt")) {
    //        printf("\nPress ENTER to exit...\n");
    //        getchar();
    //    }
    return EXIT_SUCCESS;
}
Esempio n. 29
0
SEXP magma_dpoMatrix_matrix_solve(SEXP a, SEXP b)
{
#ifdef HIPLAR_WITH_MAGMA
    SEXP Chol = magma_dpoMatrix_chol(a),
	val = PROTECT(duplicate(b));
    int *adims = INTEGER(GET_SLOT(a, Matrix_DimSym)),
	*bdims = INTEGER(getAttrib(b, R_DimSymbol)),
	info;

    if (!(isReal(b) && isMatrix(b)))
	error(_("Argument b must be a numeric matrix"));
    if (*adims != *bdims || bdims[1] < 1 || *adims < 1)
	error(_("Dimensions of system to be solved are inconsistent"));
    
		double *A = REAL(GET_SLOT(Chol, Matrix_xSym));
		double *B = REAL(val);
		//const char *uplo = uplo_P(Chol);
		//int N = bdims[1];
		//There is only a GPU interface for this call
		//so it will be the default setting if the GPU is on
		if(GPUFlag == 1) {
	
#ifdef HIPLAR_DBG	
			R_ShowMessage("DBG: Solving system of Ax = b, A = dpo, b = dge, using dpotrs_gpu;");
#endif
			double *d_A, *d_B;
			const char *uplo = uplo_P(Chol);
			magma_int_t NRHS = bdims[1];
			magma_int_t lda	 = adims[1];
			magma_int_t ldb  = bdims[0];
			magma_int_t N 	 = adims[0];
			cublasStatus retStatus;
			cublasAlloc(N * lda, sizeof(double), (void**)&d_A);

			/* Error Checking */
			retStatus = cublasGetError ();
			if (retStatus != CUBLAS_STATUS_SUCCESS) 
				error(_("CUBLAS: Error in Memory Allocation"));
			/********************************************/

			cublasAlloc(N * NRHS, sizeof(double), (void**)&d_B);	

			/* Error Checking */
			retStatus = cublasGetError ();
			if (retStatus != CUBLAS_STATUS_SUCCESS) 
				error(_("CUBLAS: Error in Memory Allocation"));
			/********************************************/

			cublasSetVector( N  * lda , sizeof(double), A, 1, d_A, 1);

			/* Error Checking */
			retStatus = cublasGetError ();
			if (retStatus != CUBLAS_STATUS_SUCCESS) 
				error(_("CUBLAS: Error in Data Transfer to Device"));
			/********************************************/

			cublasSetVector( ldb * NRHS, sizeof(double), B, 1, d_B, 1 );

			/* Error Checking */
			retStatus = cublasGetError ();
			if (retStatus != CUBLAS_STATUS_SUCCESS) 
				error(_("CUBLAS: Error in Data Transfer to Device"));
			/********************************************/

			magma_dpotrs_gpu(uplo[0], N, NRHS , d_A, lda, d_B, ldb, &info);

			cublasGetVector( ldb * NRHS, sizeof(double), d_B, 1, B, 1);

			/* Error Checking */
			retStatus = cublasGetError ();
			if (retStatus != CUBLAS_STATUS_SUCCESS) 
				error(_("CUBLAS: Error in Data Transfer from Device"));
			/********************************************/

			cublasFree(d_A);
			cublasFree(d_B);
		}
		else {
		F77_CALL(dpotrs)(uplo_P(Chol), adims, bdims + 1,
		     REAL(GET_SLOT(Chol, Matrix_xSym)), adims,
		     REAL(val), bdims, &info);
		}
		// Error checking of MAGMA/LAPACK calls
		if (info) {
			if(info > 0)
				error(_("the leading minor of order %d is not positive definite"),
						info);
			else /* should never happen! */
				error(_("Lapack routine %s returned error code %d"), "dpotrf", info);
		}

		UNPROTECT(1);
    return val;
#endif
	return R_NilValue;
}
Esempio n. 30
0
int main(void)

{
    
    cublasStatus status;
    
    float* h_image;
    
    float* h_covariance;
    
    float* d_image;
    
    float* d_covariance;
    
    float alpha = 1.0f;
    
    float beta = 0.0f;
    
    int imgsize = N * L;
    
    //int i;
    
    FILE *fp1, *fp2;
    
    /* Initialize CUBLAS */
    
    status = cublasInit();
    
    if (status != CUBLAS_STATUS_SUCCESS) {
        
        fprintf (stderr, "!!!! CUBLAS initialization error\n");
        
        return EXIT_FAILURE;
        
    }
    
    /* Allocate host memory for the image */
    
    h_image = (float*)malloc(imgsize * sizeof(float));
    
    if (h_image == 0) {
        
        fprintf (stderr, "!!!! host memory allocation error (image)\n");
        
        return EXIT_FAILURE;
        
    }
    
    h_covariance = (float*)calloc(L * L, sizeof(float));
    
    if (h_covariance == 0) {
        
        fprintf (stderr, "!!!! host memory allocation error (covariance)\n");
        
        return EXIT_FAILURE;
        
    }
    
    /* Fill the image with test data
     
     for (i = 0; i < imgsize; i++) {
     
     h_image[i] = rand() / (float)RAND_MAX;
     
     }*/
    
    fp1 = fopen("image.dat","rb");
    
    fread(h_image, sizeof(float), imgsize, fp1);
    
    printf("Valor de image[0]: %f\n", h_image[8]);
    
    /* Allocate device memory */
    
    status = cublasAlloc(imgsize, sizeof(float), (void**)&d_image);
    
    if (status != CUBLAS_STATUS_SUCCESS) {
        
        fprintf (stderr, "!!!! device memory allocation error (image)\n");
        
        return EXIT_FAILURE;
        
    }
    
    status = cublasAlloc(L * L, sizeof(float), (void**)&d_covariance);
    
    if (status != CUBLAS_STATUS_SUCCESS) {
        
        fprintf (stderr, "!!!! device memory allocation error (covariance)\n");
        
        return EXIT_FAILURE;
        
    }
    
    /* Copy image to device memory */
    
    status = cublasSetVector(imgsize, sizeof(float), h_image, 1, d_image, 1);
    
    if (status != CUBLAS_STATUS_SUCCESS) {
        
        fprintf (stderr, "!!!! device access error (write A)\n");
        
        return EXIT_FAILURE;
        
    }
    
    status = cublasSetVector(L * L, sizeof(float), h_covariance, 1, d_covariance,
                             
                             1);
    
    if (status != CUBLAS_STATUS_SUCCESS) {
        
        fprintf (stderr, "!!!! device access error (write covariance)\n");
        
        return EXIT_FAILURE;
        
    }
    
    /* Clear last error */
    
    cublasGetError();
    
    /* Calculate covariance matrix using cublas */
    
    cublasSgemm('n', 't', L, L, N, alpha, d_image, L, d_image, L, beta,
                
                d_covariance, L);
    
    status = cublasGetError();
    
    if (status != CUBLAS_STATUS_SUCCESS) {
        
        fprintf (stderr, "!!!! kernel execution error.\n");
        
        return EXIT_FAILURE;
        
    }
    
    /* Read the result back */
    
    status = cublasGetVector(L * L, sizeof(float), d_covariance, 1, h_covariance,
                             
                             1);
    
    if (status != CUBLAS_STATUS_SUCCESS) {
        
        fprintf (stderr, "!!!! device access error (read covariance)\n");
        
        return EXIT_FAILURE;
        
    }
    
    fp2 = fopen("covariance.dat","wb");
    
    fwrite(h_covariance, sizeof(float), L*L, fp2);
    
    printf("Valor de covariance[8]: %f\n", h_covariance[8]);
    
    /* Memory clean up */
    
    free(h_image);
    
    free(h_covariance);
    
    status = cublasFree(d_image);
    
    if (status != CUBLAS_STATUS_SUCCESS) {
        
        fprintf (stderr, "!!!! memory free error (image)\n");
        
        return EXIT_FAILURE;
        
    }
    
    status = cublasFree(d_covariance);
    
    if (status != CUBLAS_STATUS_SUCCESS) {
        
        fprintf (stderr, "!!!! memory free error (covariance)\n");
        
        return EXIT_FAILURE;
        
    }
    
    /* Shutdown */
    
    status = cublasShutdown();
    
    if (status != CUBLAS_STATUS_SUCCESS) {
        
        fprintf (stderr, "!!!! shutdown error (A)\n");
        
        return EXIT_FAILURE;
        
    }
    
    fclose(fp1);
    
    fclose(fp2);
    
    printf("\nPress ENTER to exit...\n");
    
    getchar();
    
    return EXIT_SUCCESS;
    
}