void CudaUtil::cublasCheckGetVector(int n, int elemSize, void* devicePtr, int incx, void* hostPtr, int incy, int line, const char* file) { cublasStatus_t status = cublasGetVector(n, elemSize, devicePtr, incx, hostPtr, incy); if (status != CUBLAS_STATUS_SUCCESS) { std::ostringstream os; os << "CUBALS device read error, line " << line << ", in file " << file; throw CudaException(os.str()); } }
void magma_getvector( magma_int_t n, size_t elemSize, void const* dx_src, magma_int_t incx, void* hy_dst, magma_int_t incy ) { cublasStatus_t status; status = cublasGetVector( n, elemSize, dx_src, incx, hy_dst, incy ); check_error( status ); }
void magma_sgetvector_internal( magma_int_t n, float const* dx_src, magma_int_t incx, float* hy_dst, magma_int_t incy, const char* func, const char* file, int line ) { cublasStatus_t status; status = cublasGetVector( n, sizeof(float), dx_src, incx, hy_dst, incy ); check_xerror( status, func, file, line ); }
SEXP d_getVector(SEXP vList, SEXP inc) { int n, increment = asInteger(inc); double * dPtr; unpackVector(vList, &n, &dPtr); SEXP out; PROTECT(out = allocVector(REALSXP, n)); cublasGetVector(n, sizeof(double), dPtr, increment, REAL(out), 1); checkCublasError("d_getVector"); UNPROTECT(1); return out; }
void solve_system_on_gpu(gpu_symm_band_matrix gpu_matrix, double * b, cublasHandle_t handle) { double * d_b; checkCudaErrors(cudaMalloc(&d_b, gpu_matrix.order*sizeof(double))); checkCublasErrors(cublasSetVector(gpu_matrix.order, sizeof(double), b, 1, d_b, 1)); solve_lower_system_on_gpu(gpu_matrix, d_b, handle); solve_upper_system_on_gpu(gpu_matrix, d_b, handle); checkCublasErrors(cublasGetVector(gpu_matrix.order, sizeof(double), d_b, 1, b, 1)); checkCudaErrors(cudaFree(d_b)); }
void magma_getvector_internal( magma_int_t n, magma_int_t elemSize, void const* dx_src, magma_int_t incx, void* hy_dst, magma_int_t incy, const char* func, const char* file, int line ) { cublasStatus_t status; status = cublasGetVector( n, elemSize, dx_src, incx, hy_dst, incy ); check_xerror( status, func, file, line ); }
void cuda_scal(const cublasHandle_t blasHandle, const int n, const T alpha, T x[], int incX, SCAL scal) { T *d_X = NULL; cudaMalloc((void**)&d_X, n*sizeof(T)); cublasSetVector(n, sizeof(T), x, incX, d_X, incX); scal(blasHandle, n, &alpha, d_X, incX); cublasGetVector(n, sizeof(T), d_X, incX, x, incX); cudaFree(d_X); }
// -------------------- extern "C" void magma_zgetvector_internal( magma_int_t n, magmaDoubleComplex_const_ptr dx_src, magma_int_t incx, magmaDoubleComplex* hy_dst, magma_int_t incy, const char* func, const char* file, int line ) { cublasStatus_t status; status = cublasGetVector( n, sizeof(magmaDoubleComplex), dx_src, incx, hy_dst, incy ); check_xerror( status, func, file, line ); }
int double_copyGPU2Host_Transpose(PGM_Matriz_Double *host, double *device, int device_col, PGM_Matriz_Double *work){ int i,j; double *ptr; for( i = 0; i < host->n_linhas; i++ ){ ptr = host->valor+(i*host->n_colunas); if(cublasGetVector(host->n_colunas,sizeof(double), device+i, device_col ,work->valor, 1) != CUBLAS_STATUS_SUCCESS){ printf("Error: nao foi possivel executar a copia!\n"); return -1; } for( j = 0; j < host->n_colunas; j++) ptr[j] = work->valor[j]; } return CUBLAS_STATUS_SUCCESS; }
void cuda_axpy(const cublasHandle_t blasHandle, const int n, const T alpha, const T x[], int incX, T y[], int incY, AXPY axpy) { T *d_X = NULL; T *d_Y = NULL; cudaMalloc((void**)&d_X, n*sizeof(T)); cudaMalloc((void**)&d_Y, n*sizeof(T)); cublasSetVector(n, sizeof(T), x, incX, d_X, incX); cublasSetVector(n, sizeof(T), y, incY, d_Y, incY); axpy(blasHandle, n, &alpha, d_X, incX, d_Y, incX); cublasGetVector(n, sizeof(T), d_Y, incY, y, incY); cudaFree(d_X); cudaFree(d_Y); }
SEXP magma_dgeMatrix_matrix_crossprod(SEXP x, SEXP y, SEXP trans) { #ifdef HIPLAR_WITH_MAGMA int tr = asLogical(trans);/* trans=TRUE: tcrossprod(x,y) */ SEXP val = PROTECT(NEW_OBJECT(MAKE_CLASS("dgeMatrix"))); int *xDims = INTEGER(GET_SLOT(x, Matrix_DimSym)), *yDims = INTEGER(getAttrib(y, R_DimSymbol)), *vDims, nprot = 1; int m = xDims[!tr], n = yDims[!tr];/* -> result dim */ int xd = xDims[ tr], yd = yDims[ tr];/* the conformable dims */ double one = 1.0, zero = 0.0; if (isInteger(y)) { y = PROTECT(coerceVector(y, REALSXP)); nprot++; } if (!(isMatrix(y) && isReal(y))) error(_("Argument y must be a numeric matrix")); SET_SLOT(val, Matrix_factorSym, allocVector(VECSXP, 0)); SET_SLOT(val, Matrix_DimSym, allocVector(INTSXP, 2)); vDims = INTEGER(GET_SLOT(val, Matrix_DimSym)); if (xd > 0 && yd > 0 && n > 0 && m > 0) { if (xd != yd) error(_("Dimensions of x and y are not compatible for %s"), tr ? "tcrossprod" : "crossprod"); vDims[0] = m; vDims[1] = n; SET_SLOT(val, Matrix_xSym, allocVector(REALSXP, m * n)); double *A = REAL(GET_SLOT(x, Matrix_xSym)); double *B = REAL(y); double *C = REAL(GET_SLOT(val, Matrix_xSym)); if(GPUFlag == 1) { double *d_A, *d_B, *d_C; cublasStatus retStatus; #ifdef HIPLAR_DBG R_ShowMessage("DBG: Performing dge/matrix crossprod using magmablas_dgemm"); #endif cublasAlloc(m * xd, sizeof(double), (void**)&d_A); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Memory Allocation")); /********************************************/ cublasAlloc(n * xd, sizeof(double), (void**)&d_B); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Memory Allocation")); /********************************************/ cublasAlloc(m * n, sizeof(double), (void**)&d_C); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Memory Allocation")); /********************************************/ cublasSetVector( m * xd , sizeof(double), A, 1, d_A, 1); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Data Transfer to Device")); /********************************************/ cublasSetVector( xd * n, sizeof(double), B, 1, d_B, 1 ); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Data Transfer to Device")); /********************************************/ cublasSetVector( m * n, sizeof(double), C, 1, d_C, 1 ); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Data Transfer to Device")); /********************************************/ // ******** magmablas_dgemm call Here ** //magmablas_dgemm( tr ? 'N' : 'T', tr ? 'T' : 'N', m, n, xd, one, d_A, xDims[0], d_B, yDims[0], zero, d_C, m); //CHANGE cublasDgemm( tr ? 'N' : 'T', tr ? 'T' : 'N', m, n, xd, one, d_A, xDims[0], d_B, yDims[0], zero, d_C, m); cublasGetVector( m * n , sizeof(double), d_C, 1, C, 1); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Data Transfer from Device")); /********************************************/ cublasFree(d_A); cublasFree(d_B); cublasFree(d_C); } else { #ifdef HIPLAR_DBG R_ShowMessage("DBG: Performing dge/matrix cross prod with dgemm"); #endif F77_CALL(dgemm)(tr ? "N" : "T", tr ? "T" : "N", &m, &n, &xd, &one, A , xDims, B , yDims, &zero, C, &m); } } UNPROTECT(nprot); return val; #endif return R_NilValue; }
SEXP magma_dgeMatrix_crossprod(SEXP x, SEXP trans) { #ifdef HIPLAR_WITH_MAGMA int tr = asLogical(trans);/* trans=TRUE: tcrossprod(x) */ SEXP val = PROTECT(NEW_OBJECT(MAKE_CLASS("dpoMatrix"))), nms = VECTOR_ELT(GET_SLOT(x, Matrix_DimNamesSym), tr ? 0 : 1), vDnms = ALLOC_SLOT(val, Matrix_DimNamesSym, VECSXP, 2); int *Dims = INTEGER(GET_SLOT(x, Matrix_DimSym)), *vDims = INTEGER(ALLOC_SLOT(val, Matrix_DimSym, INTSXP, 2)); int k = tr ? Dims[1] : Dims[0], n = tr ? Dims[0] : Dims[1]; double *vx = REAL(ALLOC_SLOT(val, Matrix_xSym, REALSXP, n * n)), one = 1.0, zero = 0.0; double *A = REAL(GET_SLOT(x, Matrix_xSym)); AZERO(vx, n * n); SET_SLOT(val, Matrix_uploSym, mkString("U")); ALLOC_SLOT(val, Matrix_factorSym, VECSXP, 0); vDims[0] = vDims[1] = n; SET_VECTOR_ELT(vDnms, 0, duplicate(nms)); SET_VECTOR_ELT(vDnms, 1, duplicate(nms)); if(n && GPUFlag == 1) { #ifdef HIPLAR_DBG R_ShowMessage("DBG: Performing crossproduct using cublasDsyrk"); #endif cublasStatus retStatus; double *d_A, *d_C; /*retStatus = cublasCreate(&handle); if ( retStatus != CUBLAS_STATUS_SUCCESS ) error(_("CUBLAS initialisation failed")); */ cublasAlloc(n * k, sizeof(double), (void**)&d_A); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Memory Allocation")); /********************************************/ cublasAlloc(n * n, sizeof(double), (void**)&d_C); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Memory Allocation")); /********************************************/ cublasSetVector( n * k , sizeof(double), A, 1, d_A, 1); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Data Transfer to Device")); /********************************************/ //cublasSetVector( n * n , sizeof(double), vx, 1, d_C, 1); /* Error Checking */ //retStatus = cublasGetError (); //if (retStatus != CUBLAS_STATUS_SUCCESS) // error(_("CUBLAS: Error in Data Transfer to Device")); /********************************************/ cublasDsyrk('U' , tr ? 'N' : 'T', n, k, one, d_A, Dims[0], zero, d_C, n); cublasGetVector( n * n , sizeof(double), d_C, 1, vx, 1); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Data Transfer from Device")); /********************************************/ cublasFree(d_A); cublasFree(d_C); } else if(n){ #ifdef HIPLAR_DBG R_ShowMessage("DBG: Performing cross prod with dsyrk"); #endif F77_CALL(dsyrk)("U", tr ? "N" : "T", &n, &k, &one, A, Dims, &zero, vx, &n); } SET_SLOT(val, Matrix_factorSym, allocVector(VECSXP, 0)); UNPROTECT(1); return val; #endif return R_NilValue; }
SEXP magma_dgeMatrix_matrix_mm(SEXP a, SEXP bP, SEXP right) { #ifdef HIPLAR_WITH_MAGMA SEXP b = PROTECT(mMatrix_as_dgeMatrix(bP)), val = PROTECT(NEW_OBJECT(MAKE_CLASS("dgeMatrix"))); int *adims = INTEGER(GET_SLOT(a, Matrix_DimSym)), *bdims = INTEGER(GET_SLOT(b, Matrix_DimSym)), *cdims = INTEGER(ALLOC_SLOT(val, Matrix_DimSym, INTSXP, 2)); double one = 1.0, zero = 0.0; if (asLogical(right)) { int m = bdims[0], n = adims[1], k = bdims[1]; if (adims[0] != k) error(_("Matrices are not conformable for multiplication")); cdims[0] = m; cdims[1] = n; if (m < 1 || n < 1 || k < 1) { // This was commented out error(_("Matrices with zero extents cannot be multiplied")); ALLOC_SLOT(val, Matrix_xSym, REALSXP, m * n); } else { double *B = REAL(GET_SLOT(b, Matrix_xSym)); double *A = REAL(GET_SLOT(a, Matrix_xSym)); double *C = REAL(ALLOC_SLOT(val, Matrix_xSym, REALSXP, m * n)); //TODO add magma here too if(GPUFlag == 1) { double *d_A, *d_B, *d_C; cublasStatus retStatus; #ifdef HIPLAR_DBG R_ShowMessage("DBG: Performing matrix multiplication with Right = true using magmablas_dgemm"); #endif cublasAlloc(n * k, sizeof(double), (void**)&d_A); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Memory Allocation")); /********************************************/ cublasAlloc(m * k, sizeof(double), (void**)&d_B); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Memory Allocation")); /********************************************/ cublasAlloc(m * n, sizeof(double), (void**)&d_C); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Memory Allocation")); /********************************************/ cublasSetVector( n * k , sizeof(double), A, 1, d_A, 1); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Data Transfer to Device")); /********************************************/ cublasSetVector( m * k, sizeof(double), B, 1, d_B, 1 ); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Data Transfer to Device")); /********************************************/ // ******** magmablas_dgemm call Here ** //magmablas_dgemm('N', 'N', m, n, k, one, d_B, m, d_A, k, zero, d_C, m); //CHANGED 30/07 cublasDgemm('N', 'N', m, n, k, one, d_B, m, d_A, k, zero, d_C, m); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) { error(_("CUBLAS: Error in cublasDgemm routine")); } /********************************************/ cublasGetVector( m * n , sizeof(double), d_C, 1, C, 1); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Data Transfer from Device")); /********************************************/ cublasFree(d_A); cublasFree(d_B); cublasFree(d_C); } else { #ifdef HIPLAR_DBG R_ShowMessage("DBG: Performing matrix multiplication using dgemm with right = TRUE"); #endif F77_CALL(dgemm) ("N", "N", &m, &n, &k, &one, B, &m, A , &k, &zero, C , &m); } } } else { int m = adims[0], n = bdims[1], k = adims[1]; double *A = REAL(GET_SLOT(a, Matrix_xSym)); double *B = REAL(GET_SLOT(b, Matrix_xSym)); if (bdims[0] != k) error(_("Matrices are not conformable for multiplication")); cdims[0] = m; cdims[1] = n; double *C = REAL(ALLOC_SLOT(val, Matrix_xSym, REALSXP, m * n)); if (m < 1 || n < 1 || k < 1) { // This was commented out error(_("Matrices with zero extents cannot be multiplied")); ALLOC_SLOT(val, Matrix_xSym, REALSXP, m * n); } else { if(GPUFlag == 1) { double *d_A, *d_B, *d_C; cublasStatus retStatus; #ifdef HIPLAR_DBG R_ShowMessage("DBG: Performing matrix multiplication using magmablas_dgemm"); #endif cublasAlloc(m * k, sizeof(double), (void**)&d_A); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Memory Allocation")); /********************************************/ cublasAlloc(n * k, sizeof(double), (void**)&d_B); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Memory Allocation")); /********************************************/ cublasAlloc(m * n, sizeof(double), (void**)&d_C); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Memory Allocation")); /********************************************/ cublasSetVector( m * k , sizeof(double), A, 1, d_A, 1); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Data Transfer to Device")); /********************************************/ cublasSetVector( n * k, sizeof(double), B, 1, d_B, 1 ); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Data Transfer to Device")); /********************************************/ // ******** magmablas_dgemm call Here ** //magmablas_dgemm('N', 'N', m, n, k, one, d_A, m, d_B, k, zero, d_C, m); //CHANGE cublasDgemm('N', 'N', m, n, k, one, d_A, m, d_B, k, zero, d_C, m); retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) { error(_("CUBLAS: Error in Data Transfer from Device")); /********************************************/ } cublasGetVector( m * n , sizeof(double), d_C, 1, C, 1); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Data Transfer from Device")); /********************************************/ cublasFree(d_A); cublasFree(d_B); cublasFree(d_C); } else { #ifdef HIPLAR_DBG R_ShowMessage("DBG: Performing matrix multiplication using dgemm"); #endif F77_CALL(dgemm) ("N", "N", &m, &n, &k, &one, A, &m, B, &k, &zero, C, &m); } } } ALLOC_SLOT(val, Matrix_DimNamesSym, VECSXP, 2); UNPROTECT(2); return val; #endif return R_NilValue; }
SEXP magma_dgeMatrix_matrix_solve(SEXP a, SEXP b) { #ifdef HIPLAR_WITH_MAGMA SEXP val = PROTECT(dup_mMatrix_as_dgeMatrix(b)), lu = PROTECT(magma_dgeMatrix_LU_(a, TRUE)); int *adims = INTEGER(GET_SLOT(lu, Matrix_DimSym)), *bdims = INTEGER(GET_SLOT(val, Matrix_DimSym)); int info, n = bdims[0], nrhs = bdims[1]; if (*adims != *bdims || bdims[1] < 1 || *adims < 1 || *adims != adims[1]) error(_("Dimensions of system to be solved are inconsistent")); double *A = REAL(GET_SLOT(lu, Matrix_xSym)); double *B = REAL(GET_SLOT(val, Matrix_xSym)); int *ipiv = INTEGER(GET_SLOT(lu, Matrix_permSym)); if(GPUFlag == 0) { F77_CALL(dgetrs)("N", &n, &nrhs, A, &n, ipiv, B, &n, &info); #ifdef HIPLAR_DBG R_ShowMessage("DBG: Solve using LU using dgetrs;"); #endif }else if(GPUFlag == 1) { #ifdef HIPLAR_DBG R_ShowMessage("DBG: Solve using LU using magma_dgetrs;"); #endif double *d_A, *d_B; cublasStatus retStatus; cublasAlloc(adims[0] * adims[1], sizeof(double), (void**)&d_A); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Memory Allocation of A on Device")); /********************************************/ cublasAlloc(n * nrhs, sizeof(double), (void**)&d_B); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Memory Allocation of b on Device")); /********************************************/ cublasSetVector(adims[0] * adims[1], sizeof(double), A, 1, d_A, 1); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Transferring data to advice")); /********************************************/ cublasSetVector(n * nrhs, sizeof(double), B, 1, d_B, 1); magma_dgetrs_gpu( 'N', n, nrhs, d_A, n, ipiv, d_B, n, &info ); cublasGetVector(n * nrhs, sizeof(double), d_B, 1, B, 1); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Transferring from to advice")); /********************************************/ cublasFree(d_A); cublasFree(d_B); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in freeing data")); /********************************************/ } if (info) error(_("Lapack routine dgetrs: system is exactly singular")); UNPROTECT(2); return val; #endif return R_NilValue; }
SEXP magma_dgeMatrix_solve(SEXP a) { #ifdef HIPLAR_WITH_MAGMA /* compute the 1-norm of the matrix, which is needed later for the computation of the reciprocal condition number. */ double aNorm = magma_get_norm(a, "1"); /* the LU decomposition : */ /* Given that we may be performing this operation * on the GPU we may put in an optimisation here * where if we call the LU solver we, we do not require * the decomposition to be transferred back to CPU. This is TODO */ SEXP val = PROTECT(NEW_OBJECT(MAKE_CLASS("dgeMatrix"))), lu = magma_dgeMatrix_LU_(a, TRUE); int *dims = INTEGER(GET_SLOT(lu, Matrix_DimSym)), *pivot = INTEGER(GET_SLOT(lu, Matrix_permSym)); /* prepare variables for the dgetri calls */ double *x, tmp; int info, lwork = -1; if (dims[0] != dims[1]) error(_("Solve requires a square matrix")); slot_dup(val, lu, Matrix_xSym); x = REAL(GET_SLOT(val, Matrix_xSym)); slot_dup(val, lu, Matrix_DimSym); int N2 = dims[0] * dims[0]; if(dims[0]) /* the dimension is not zero */ { /* is the matrix is *computationally* singular ? */ double rcond; F77_CALL(dgecon)("1", dims, x, dims, &aNorm, &rcond, (double *) R_alloc(4*dims[0], sizeof(double)), (int *) R_alloc(dims[0], sizeof(int)), &info); if (info) error(_("error [%d] from Lapack 'dgecon()'"), info); if(rcond < DOUBLE_EPS) error(_("Lapack dgecon(): system computationally singular, reciprocal condition number = %g"), rcond); /* only now try the inversion and check if the matrix is *exactly* singular: */ // This is also a work space query. This is not an option in magma F77_CALL(dgetri)(dims, x, dims, pivot, &tmp, &lwork, &info); lwork = (int) tmp; if( GPUFlag == 0){ F77_CALL(dgetri)(dims, x, dims, pivot, (double *) R_alloc((size_t) lwork, sizeof(double)), &lwork, &info); #ifdef HIPLAR_DBG R_ShowMessage("DBG: Solve using LU using dgetri;"); #endif } else if(GPUFlag == 1) { double *d_x, *dwork; cublasStatus retStatus; #ifdef HIPLAR_DBG R_ShowMessage("Solve using LU using magma_dgetri;"); #endif cublasAlloc(N2, sizeof(double), (void**)&d_x); //cublasAlloc(N2 , sizeof(double), (void**)&dtmp); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Memory Allocation on Device")); /********************************************/ cublasSetVector( N2, sizeof(double), x, 1, d_x, 1); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Data Transfer to Device")); /********************************************/ lwork = dims[0] * magma_get_dgetri_nb( dims[0] ); cublasAlloc(lwork, sizeof(double), (void**)&dwork); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Memory Allocation on Device")); /********************************************/ magma_dgetri_gpu(dims[0], d_x, dims[0], pivot, dwork , lwork, &info); cublasGetVector(N2, sizeof(double), d_x, 1, x, 1); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Data From to Device")); /********************************************/ cublasFree(dwork); cublasFree(d_x); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error freeing memory")); /********************************************/ } else error(_("GPUFlag not set correctly")); if (info) error(_("Lapack routine dgetri: system is exactly singular")); } UNPROTECT(1); return val; #endif return R_NilValue; }
SEXP magma_dpoMatrix_chol(SEXP x) { #ifdef HIPLAR_WITH_MAGMA SEXP val = get_factors(x, "Cholesky"), dimP = GET_SLOT(x, Matrix_DimSym), uploP = GET_SLOT(x, Matrix_uploSym); const char *uplo = CHAR(STRING_ELT(uploP, 0)); int *dims = INTEGER(dimP), info; int n = dims[0]; double *vx; cublasStatus retStatus; if (val != R_NilValue) return val; dims = INTEGER(dimP); val = PROTECT(NEW_OBJECT(MAKE_CLASS("Cholesky"))); SET_SLOT(val, Matrix_uploSym, duplicate(uploP)); SET_SLOT(val, Matrix_diagSym, mkString("N")); SET_SLOT(val, Matrix_DimSym, duplicate(dimP)); vx = REAL(ALLOC_SLOT(val, Matrix_xSym, REALSXP, n * n)); AZERO(vx, n * n); //we could put in magmablas_dlacpy but it only //copies all of the matrix F77_CALL(dlacpy)(uplo, &n, &n, REAL(GET_SLOT(x, Matrix_xSym)), &n, vx, &n); if (n > 0) { if(GPUFlag == 0){ #ifdef HIPLAR_DBG R_ShowMessage("DBG: Cholesky decomposition using dpotrf;"); #endif F77_CALL(dpotrf)(uplo, &n, vx, &n, &info); } else if(GPUFlag == 1 && Interface == 0){ #ifdef HIPLAR_DBG R_ShowMessage("DBG: Cholesky decomposition using magma_dpotrf;"); #endif int nrows, ncols; nrows = ncols = n; magma_int_t lda; lda = nrows; magma_dpotrf(uplo[0], ncols, vx, lda, &info); /* Error Checking */ retStatus = cudaGetLastError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in magma_dpotrf")); /********************************************/ } else if(GPUFlag == 1 && Interface == 1) { #ifdef HIPLAR_DBG R_ShowMessage("DBG: Cholesky decomposition using magma_dpotrf_gpu;"); #endif double *d_c; int nrows, ncols; nrows = ncols = n; int N2 = nrows * ncols; magma_int_t lda; lda = nrows; cublasAlloc(lda * ncols, sizeof(double), (void**)&d_c); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Memory Allocation")); /********************************************/ cublasSetVector(N2, sizeof(double), vx, 1, d_c, 1); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Date Transfer to Device")); /********************************************/ magma_dpotrf_gpu(uplo[0], ncols, d_c, lda, &info); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in magma_dpotrf_gpu")); /********************************************/ cublasGetVector(nrows * ncols, sizeof(double), d_c, 1, vx, 1); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Date Transfer from Device")); /********************************************/ cublasFree(d_c); } else error(_("MAGMA/LAPACK/Interface Flag not defined correctly")); } if (info) { if(info > 0) error(_("the leading minor of order %d is not positive definite"), info); else /* should never happen! */ error(_("Lapack routine %s returned error code %d"), "dpotrf", info); } UNPROTECT(1); return set_factors(x, val, "Cholesky"); #endif return R_NilValue; }
/* Main */ void mexFunction( int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { if (nrhs != 7) { mexErrMsgTxt("sgemm requires 7 input arguments"); } else if (nlhs != 1) { mexErrMsgTxt("sgemm requires 1 output argument"); } if ( !mxIsSingle(prhs[4]) || !mxIsSingle(prhs[5]) || !mxIsSingle(prhs[6])) { mexErrMsgTxt("Input arrays must be single precision."); } int ta = (int) mxGetScalar(prhs[0]); int tb = (int) mxGetScalar(prhs[1]); float alpha = (float) mxGetScalar(prhs[2]); float beta = (float) mxGetScalar(prhs[3]); float *h_A = (float*) mxGetData(prhs[4]); float *h_B = (float*) mxGetData(prhs[5]); float *h_C = (float*) mxGetData(prhs[6]); int M = mxGetM(prhs[4]); /* gets number of rows of A */ int K = mxGetN(prhs[4]); /* gets number of columns of A */ int L = mxGetM(prhs[5]); /* gets number of rows of B */ int N = mxGetN(prhs[5]); /* gets number of columns of B */ cublasOperation_t transa, transb; int MM, KK, NN; if (ta == 0) { transa = CUBLAS_OP_N; MM=M; KK=K; } else { transa = CUBLAS_OP_T; MM=K; KK=M; } if (tb == 0) { transb = CUBLAS_OP_N; NN=N; } else { transb = CUBLAS_OP_T; NN=L; } /* printf("transa=%c\n",transa); printf("transb=%c\n",transb); printf("alpha=%f\n",alpha); printf("beta=%f\n",beta); */ /* Left hand side matrix set up */ mwSize dims0[2]; dims0[0]=MM; dims0[1]=NN; plhs[0] = mxCreateNumericArray(2,dims0,mxSINGLE_CLASS,mxREAL); float *h_C_out = (float*) mxGetData(plhs[0]); cublasStatus_t status; cublasHandle_t handle; status = cublasCreate(&handle); if (status != CUBLAS_STATUS_SUCCESS) { mexErrMsgTxt("!!!! CUBLAS initialization error\n"); } float* d_A = 0; float* d_B = 0; float* d_C = 0; /* Allocate device memory for the matrices */ if (cudaMalloc((void**)&d_A, M * K * sizeof(d_A[0])) != cudaSuccess) { mexErrMsgTxt("!!!! device memory allocation error (allocate A)\n"); } if (cudaMalloc((void**)&d_B, L * N * sizeof(d_B[0])) != cudaSuccess) { mexErrMsgTxt("!!!! device memory allocation error (allocate B)\n"); } if (cudaMalloc((void**)&d_C, MM * NN * sizeof(d_C[0])) != cudaSuccess) { mexErrMsgTxt("!!!! device memory allocation error (allocate C)\n"); } /* Initialize the device matrices with the host matrices */ status = cublasSetVector(M * K, sizeof(h_A[0]), h_A, 1, d_A, 1); if (status != CUBLAS_STATUS_SUCCESS) { mexErrMsgTxt("!!!! device access error (write A)\n"); } status = cublasSetVector(L * N, sizeof(h_B[0]), h_B, 1, d_B, 1); if (status != CUBLAS_STATUS_SUCCESS) { mexErrMsgTxt("!!!! device access error (write B)\n"); } status = cublasSetVector(MM * NN, sizeof(h_C[0]), h_C, 1, d_C, 1); if (status != CUBLAS_STATUS_SUCCESS) { mexErrMsgTxt("!!!! device access error (write C)\n"); } /* Performs operation using cublas */ status = cublasSgemm(handle, transa, transb, MM, NN, KK, &alpha, d_A, M, d_B, L, &beta, d_C, MM); if (status != CUBLAS_STATUS_SUCCESS) { mexErrMsgTxt("!!!! kernel execution error.\n"); } /* Read the result back */ status = cublasGetVector(MM * NN, sizeof(h_C[0]), d_C, 1, h_C_out, 1); if (status != CUBLAS_STATUS_SUCCESS) { mexErrMsgTxt("!!!! device access error (read C)\n"); } if (cudaFree(d_A) != cudaSuccess) { mexErrMsgTxt("!!!! memory free error (A)\n"); } if (cudaFree(d_B) != cudaSuccess) { mexErrMsgTxt("!!!! memory free error (B)\n"); } if (cudaFree(d_C) != cudaSuccess) { mexErrMsgTxt("!!!! memory free error (C)\n"); } /* Shutdown */ status = cublasDestroy(handle); if (status != CUBLAS_STATUS_SUCCESS) { mexErrMsgTxt("!!!! shutdown error (A)\n"); } }
/* Main */ int test_cublas(void) { cublasStatus status; cudaError_t e; float* h_A; float* h_B; float* h_C; float* h_C_ref; float* d_A = 0; void *vp; float* d_B = 0; float* d_C = 0; float alpha = 1.0f; float beta = 0.0f; int n2 = N * N; int i; float error_norm; float ref_norm; float diff; /* Initialize CUBLAS */ printf("simpleCUBLAS test running..\n"); status = cublasInit(); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! CUBLAS initialization error\n"); return EXIT_FAILURE; } /* Allocate host memory for the matrices */ h_A = (float*)malloc(n2 * sizeof(h_A[0])); if (h_A == 0) { fprintf (stderr, "!!!! host memory allocation error (A)\n"); return EXIT_FAILURE; } h_B = (float*)malloc(n2 * sizeof(h_B[0])); if (h_B == 0) { fprintf (stderr, "!!!! host memory allocation error (B)\n"); return EXIT_FAILURE; } h_C = (float*)malloc(n2 * sizeof(h_C[0])); if (h_C == 0) { fprintf (stderr, "!!!! host memory allocation error (C)\n"); return EXIT_FAILURE; } /* Fill the matrices with test data */ for (i = 0; i < n2; i++) { h_A[i] = rand() / (float)RAND_MAX; h_B[i] = rand() / (float)RAND_MAX; h_C[i] = rand() / (float)RAND_MAX; } /* Allocate device memory for the matrices */ if (cudaMalloc(&vp, n2 * sizeof(d_A[0])) != cudaSuccess) { fprintf (stderr, "!!!! device memory allocation error (A)\n"); return EXIT_FAILURE; } d_A = (float *) vp; if (cudaMalloc(&vp, n2 * sizeof(d_B[0])) != cudaSuccess) { fprintf (stderr, "!!!! device memory allocation error (B)\n"); return EXIT_FAILURE; } d_B = (float *) vp; if (cudaMalloc(&vp, n2 * sizeof(d_C[0])) != cudaSuccess) { fprintf (stderr, "!!!! device memory allocation error (C)\n"); return EXIT_FAILURE; } d_C = (float *) vp; /* Initialize the device matrices with the host matrices */ status = cublasSetVector(n2, sizeof(h_A[0]), h_A, 1, d_A, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! device access error (write A)\n"); return EXIT_FAILURE; } status = cublasSetVector(n2, sizeof(h_B[0]), h_B, 1, d_B, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! device access error (write B)\n"); return EXIT_FAILURE; } status = cublasSetVector(n2, sizeof(h_C[0]), h_C, 1, d_C, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! device access error (write C)\n"); return EXIT_FAILURE; } /* Performs operation using plain C code */ simple_sgemm(N, alpha, h_A, h_B, beta, h_C); h_C_ref = h_C; /* Clear last error */ cublasGetError(); /* Performs operation using cublas */ cublasSgemm('n', 'n', N, N, N, alpha, d_A, N, d_B, N, beta, d_C, N); status = cublasGetError(); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! kernel execution error.\n"); return EXIT_FAILURE; } /* Allocate host memory for reading back the result from device memory */ h_C = (float*)malloc(n2 * sizeof(h_C[0])); if (h_C == 0) { fprintf (stderr, "!!!! host memory allocation error (C)\n"); return EXIT_FAILURE; } /* Read the result back */ status = cublasGetVector(n2, sizeof(h_C[0]), d_C, 1, h_C, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! device access error (read C)\n"); return EXIT_FAILURE; } /* Check result against reference */ error_norm = 0; ref_norm = 0; for (i = 0; i < n2; ++i) { diff = h_C_ref[i] - h_C[i]; error_norm += diff * diff; ref_norm += h_C_ref[i] * h_C_ref[i]; } error_norm = (float)sqrt((double)error_norm); ref_norm = (float)sqrt((double)ref_norm); if (fabs(ref_norm) < 1e-7) { fprintf (stderr, "!!!! reference norm is 0\n"); return EXIT_FAILURE; } printf( "Test %s\n", (error_norm / ref_norm < 1e-6f) ? "PASSED" : "FAILED"); /* Memory clean up */ free(h_A); free(h_B); free(h_C); free(h_C_ref); e = cudaFree(d_A); if (e != cudaSuccess) { fprintf (stderr, "!!!! memory free error (A)\n"); return EXIT_FAILURE; } e = cudaFree(d_B); if (e != cudaSuccess) { fprintf (stderr, "!!!! memory free error (B)\n"); return EXIT_FAILURE; } e = cudaFree(d_C); if (e != cudaSuccess) { fprintf (stderr, "!!!! memory free error (C)\n"); return EXIT_FAILURE; } /* Shutdown */ status = cublasShutdown(); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! shutdown error (A)\n"); return EXIT_FAILURE; } return EXIT_SUCCESS; }
float getv(float *da){ float res[1]; cublasGetVector(1, sizeof(float), da, 1, res, 1); return res[0]; }
/* Main */ int main(int argc, char **argv) { cublasStatus_t status; float *h_A; float *h_B; float *h_C; float *h_C_rnd; float *h_C_ref; float *d_A = 0; float *d_B = 0; float *d_C = 0; float alpha = 1.0f; float beta = 0.0f; int n2 = N * N; int i; cublasHandle_t handle; int dev_id; cudaDeviceProp device_prop; bool do_device_api_test = false; float host_api_test_ratio, device_api_test_ratio; /* Initialize CUBLAS */ printf("simpleCUBLAS test running...\n"); dev_id = findCudaDevice(argc, (const char **) argv); checkCudaErrors(cudaGetDeviceProperties(&device_prop, dev_id)); if ((device_prop.major << 4) + device_prop.minor >= 0x35) { printf("Host and device APIs will be tested.\n"); do_device_api_test = true; } /* else if ((device_prop.major << 4) + device_prop.minor >= 0x20) { printf("Host API will be tested.\n"); do_device_api_test = false; } */ else { fprintf(stderr, "simpleDevLibCUBLAS examples requires Compute Capability of SM 3.5 or higher\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_SUCCESS; } status = cublasCreate(&handle); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! CUBLAS initialization error\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } /* Allocate host memory for the matrices */ h_A = (float *)malloc(n2 * sizeof(h_A[0])); if (h_A == 0) { fprintf(stderr, "!!!! host memory allocation error (A)\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } h_B = (float *)malloc(n2 * sizeof(h_B[0])); if (h_B == 0) { fprintf(stderr, "!!!! host memory allocation error (B)\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } h_C_rnd = (float *)malloc(n2 * sizeof(h_C_rnd[0])); if (h_C_rnd == 0) { fprintf(stderr, "!!!! host memory allocation error (C_rnd)\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } h_C = (float *)malloc(n2 * sizeof(h_C_ref[0])); if (h_C == 0) { fprintf(stderr, "!!!! host memory allocation error (C)\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } /* Fill the matrices with test data */ for (i = 0; i < n2; i++) { h_A[i] = rand() / (float)RAND_MAX; h_B[i] = rand() / (float)RAND_MAX; h_C_rnd[i] = rand() / (float)RAND_MAX; h_C[i] = h_C_rnd[i]; } /* Allocate device memory for the matrices */ if (cudaMalloc((void **)&d_A, n2 * sizeof(d_A[0])) != cudaSuccess) { fprintf(stderr, "!!!! device memory allocation error (allocate A)\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } if (cudaMalloc((void **)&d_B, n2 * sizeof(d_B[0])) != cudaSuccess) { fprintf(stderr, "!!!! device memory allocation error (allocate B)\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } if (cudaMalloc((void **)&d_C, n2 * sizeof(d_C[0])) != cudaSuccess) { fprintf(stderr, "!!!! device memory allocation error (allocate C)\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } /* Initialize the device matrices with the host matrices */ status = cublasSetVector(n2, sizeof(h_A[0]), h_A, 1, d_A, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! device access error (write A)\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } status = cublasSetVector(n2, sizeof(h_B[0]), h_B, 1, d_B, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! device access error (write B)\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } status = cublasSetVector(n2, sizeof(h_C_rnd[0]), h_C_rnd, 1, d_C, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! device access error (write C)\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } /* * Performs operation using plain C code */ simple_sgemm(N, alpha, h_A, h_B, beta, h_C); h_C_ref = h_C; /* * Performs operation using cublas */ status = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A, N, d_B, N, &beta, d_C, N); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! kernel execution error\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } /* Allocate host memory for reading back the result from device memory */ h_C = (float *)malloc(n2 * sizeof(h_C[0])); if (h_C == 0) { fprintf(stderr, "!!!! host memory allocation error (C)\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } /* Read the result back */ status = cublasGetVector(n2, sizeof(h_C[0]), d_C, 1, h_C, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! device access error (read C)\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } /* Check result against reference */ host_api_test_ratio = check_result(h_C, h_C_ref, n2); if (do_device_api_test) { /* Reset device resident C matrix */ status = cublasSetVector(n2, sizeof(h_C_rnd[0]), h_C_rnd, 1, d_C, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! device access error (write C)\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } /* * Performs operation using the device API of CUBLAS library */ device_cublas_sgemm(N, alpha, d_A, d_B, beta, d_C); /* Read the result back */ status = cublasGetVector(n2, sizeof(h_C[0]), d_C, 1, h_C, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! device access error (read C)\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } /* Check result against reference */ device_api_test_ratio = check_result(h_C, h_C_ref, n2); } /* Memory clean up */ free(h_A); free(h_B); free(h_C); free(h_C_rnd); free(h_C_ref); if (cudaFree(d_A) != cudaSuccess) { fprintf(stderr, "!!!! memory free error (A)\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } if (cudaFree(d_B) != cudaSuccess) { fprintf(stderr, "!!!! memory free error (B)\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } if (cudaFree(d_C) != cudaSuccess) { fprintf(stderr, "!!!! memory free error (C)\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } /* Shutdown */ status = cublasDestroy(handle); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! shutdown error (A)\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); bool test_result = do_device_api_test ? host_api_test_ratio < 1e-6 && device_api_test_ratio < 1e-6 : host_api_test_ratio < 1e-6; printf("simpleCUBLAS completed, returned %s\n", test_result ? "OK" : "ERROR!"); exit(test_result ? EXIT_SUCCESS : EXIT_FAILURE); }
SEXP magma_dgeMatrix_LU_(SEXP x, Rboolean warn_sing) { #ifdef HIPLAR_WITH_MAGMA SEXP val = get_factors(x, "LU"); int *dims, npiv, info; if (val != R_NilValue) { // R_ShowMessage("already in slot"); /* nothing to do if it's there in 'factors' slot */ return val; } dims = INTEGER(GET_SLOT(x, Matrix_DimSym)); if (dims[0] < 1 || dims[1] < 1) error(_("Cannot factor a matrix with zero extents")); npiv = (dims[0] < dims[1]) ? dims[0] : dims[1]; val = PROTECT(NEW_OBJECT(MAKE_CLASS("denseLU"))); slot_dup(val, x, Matrix_xSym); slot_dup(val, x, Matrix_DimSym); double *h_R = REAL(GET_SLOT(val, Matrix_xSym)); int *ipiv = INTEGER(ALLOC_SLOT(val, Matrix_permSym, INTSXP, npiv)); if(GPUFlag == 0){ #ifdef HIPLAR_DBG R_ShowMessage("DBG: LU decomposition using dgetrf;"); #endif F77_CALL(dgetrf)(dims, dims + 1, h_R, dims, ipiv, &info); } else if(GPUFlag == 1 && Interface == 0){ #ifdef HIPLAR_DBG R_ShowMessage("DBG: LU decomposition using magma_dgetrf;"); #endif magma_dgetrf(dims[0], dims[1], h_R, dims[0], ipiv, &info); } else if(GPUFlag == 1 && Interface == 1) { #ifdef HIPLAR_DBG R_ShowMessage("DBG: LU decomposition using magma_dgetrf_gpu;"); #endif double *d_A; int N2 = dims[0] * dims[1]; cublasStatus retStatus; cublasAlloc( N2 , sizeof(double), (void**)&d_A); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Memory Allocation")); /********************************************/ cublasSetVector(N2, sizeof(double), h_R, 1, d_A, 1); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Date Transfer to Device")); /********************************************/ magma_dgetrf_gpu(dims[0],dims[1], d_A, dims[0], ipiv, &info); cublasGetVector( N2, sizeof(double), d_A, 1, h_R, 1); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Date Transfer from Device")); /********************************************/ cublasFree(d_A); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error freeing data")); /********************************************/ } else error(_("MAGMA/LAPACK/Interface Flag not defined correctly")); if (info < 0) error(_("Lapack routine %s returned error code %d"), "dgetrf", info); else if (info > 0 && warn_sing) warning(_("Exact singularity detected during LU decomposition: %s, i=%d."), "U[i,i]=0", info); UNPROTECT(1); return set_factors(x, val, "LU"); #endif return R_NilValue; }
// Single precision matrix multiplication using CUBLAS // C = A * B // // CUBLAS function called // // cublasSgemm // // CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemm_v2 ( // cublasHandle_t handle, // cublasOperation_t transa, // cublasOperation_t transb, // int m, // number of rows in matrices A and C // int n, // number of columns in matrices B and C // int k, // number of columns in A and number of rows in B // const float *alpha, /* host or device pointer */ // const float *A, // int lda, // const float *B, // int ldb, // const float *beta, /* host or device pointer */ // float *C, // int ldc); // int cublasSGEMM (float* C, float* A, float* B, int HA, int WA, int WB) { cublasStatus_t status; float *d_A = 0; float *d_B = 0; float *d_C = 0; float alpha = 1.0f; float beta = 0.0f; float error_norm; float ref_norm; float diff; cublasHandle_t handle; cudaError_t cudaStatus; // Make sure CUDA device 0 is available cudaStatus = cudaSetDevice(0); if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?"); return 1; } /* Initialize CUBLAS */ status = cublasCreate(&handle); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! CUBLAS initialization error\n"); return EXIT_FAILURE; } /* Allocate device memory for the matrices (d_A, d_B, and d_C) */ if (cudaMalloc((void **)&d_A, HA * WA * sizeof(d_A[0])) != cudaSuccess) { fprintf(stderr, "!!!! device memory allocation error (allocate A)\n"); return EXIT_FAILURE; } if (cudaMalloc((void **)&d_B, WA * WB * sizeof(d_B[0])) != cudaSuccess) { fprintf(stderr, "!!!! device memory allocation error (allocate B)\n"); return EXIT_FAILURE; } if (cudaMalloc((void **)&d_C, HA * WB * sizeof(d_C[0])) != cudaSuccess) { printf("!!!! device memory allocation error (allocate C)\n"); return EXIT_FAILURE; } /* Initialize the device matrices with the host matrices (A, B, and C) */ status = cublasSetVector(HA * WA, sizeof(A[0]), A, 1, d_A, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! device access error (write A)\n"); return EXIT_FAILURE; } status = cublasSetVector(WA * WB, sizeof(B[0]), B, 1, d_B, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! device access error (write B)\n"); return EXIT_FAILURE; } status = cublasSetVector(HA * WB, sizeof(C[0]), C, 1, d_C, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! device access error (write C)\n"); return EXIT_FAILURE; } /* Performs operation using cublas */ status = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, HA, WB, WA, &alpha, d_A, HA, d_B, WA, &beta, d_C, HA); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! kernel execution error.\n"); return EXIT_FAILURE; } /* Read the result back (to C)*/ status = cublasGetVector(HA * WB, sizeof(C[0]), d_C, 1, C, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! device access error (read C)\n"); return EXIT_FAILURE; } /* Memory clean up */ if (cudaFree(d_A) != cudaSuccess) { fprintf(stderr, "!!!! memory free error (A)\n"); return EXIT_FAILURE; } if (cudaFree(d_B) != cudaSuccess) { fprintf(stderr, "!!!! memory free error (B)\n"); return EXIT_FAILURE; } if (cudaFree(d_C) != cudaSuccess) { fprintf(stderr, "!!!! memory free error (C)\n"); return EXIT_FAILURE; } /* Shutdown */ status = cublasDestroy(handle); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! shutdown error (A)\n"); return EXIT_FAILURE; } return 0; }
SEXP magma_dpoMatrix_solve(SEXP x) { #ifdef HIPLAR_WITH_MAGMA SEXP Chol = magma_dpoMatrix_chol(x); SEXP val = PROTECT(NEW_OBJECT(MAKE_CLASS("dpoMatrix"))); int *dims = INTEGER(GET_SLOT(x, Matrix_DimSym)), info; SET_SLOT(val, Matrix_factorSym, allocVector(VECSXP, 0)); slot_dup(val, Chol, Matrix_uploSym); slot_dup(val, Chol, Matrix_xSym); slot_dup(val, Chol, Matrix_DimSym); SET_SLOT(val, Matrix_DimNamesSym, duplicate(GET_SLOT(x, Matrix_DimNamesSym))); double *A = REAL(GET_SLOT(val, Matrix_xSym)); int N = *dims; int lda = N; const char *uplo = uplo_P(val); if(GPUFlag == 0) { F77_CALL(dpotri)(uplo_P(val), dims, A, dims, &info); } else if(GPUFlag == 1 && Interface == 0) { #ifdef HIPLAR_DBG R_ShowMessage("DBG: Solving using magma_dpotri"); #endif magma_dpotri(uplo[0], N, A, lda, &info); } else if(GPUFlag == 1 && Interface == 1){ double *d_A; cublasStatus retStatus; cublasAlloc( N * lda , sizeof(double), (void**)&d_A); #ifdef HIPLAR_DBG R_ShowMessage("DBG: Solving using magma_dpotri_gpu"); #endif /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Memory Allocation")); /********************************************/ cublasSetVector( N * lda, sizeof(double), A, 1, d_A, 1); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Data Transfer to Device")); /********************************************/ magma_dpotri_gpu(uplo[0], N, d_A, lda, &info); cublasGetVector(N * lda, sizeof(double), d_A, 1, val, 1); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Data Transfer from Device")); /********************************************/ cublasFree(d_A); } else error(_("MAGMA/LAPACK/Interface Flag not defined correctly")); if (info) { if(info > 0) error(_("the leading minor of order %d is not positive definite"), info); else /* should never happen! */ error(_("Lapack routine %s returned error code %d"), "dpotrf", info); } UNPROTECT(1); return val; #endif return R_NilValue; }
void ckm( struct svm_problem *prob, struct svm_problem *pecm, float *gamma ) { cublasStatus_t status; double g_val = *gamma; long int nfa; int len_tv; int ntv; int i_v; int i_el; int i_r, i_c; int trvei; double *tv_sq; double *v_f_g; float *tr_ar; float *tva, *vtm, *DP; float *g_tva = 0, *g_vtm = 0, *g_DotProd = 0; cudaError_t cudaStat; cublasHandle_t handle; status = cublasCreate(&handle); len_tv = prob-> x[0].dim; ntv = prob-> l; nfa = len_tv * ntv; tva = (float*) malloc ( len_tv * ntv* sizeof(float) ); vtm = (float*) malloc ( len_tv * sizeof(float) ); DP = (float*) malloc ( ntv * sizeof(float) ); tr_ar = (float*) malloc ( len_tv * ntv* sizeof(float) ); tv_sq = (double*) malloc ( ntv * sizeof(double) ); v_f_g = (double*) malloc ( ntv * sizeof(double) ); for ( i_r = 0; i_r < ntv ; i_r++ ) { for ( i_c = 0; i_c < len_tv; i_c++ ) tva[i_r * len_tv + i_c] = (float)prob-> x[i_r].values[i_c]; } cudaStat = cudaMalloc((void**)&g_tva, len_tv * ntv * sizeof(float)); if (cudaStat != cudaSuccess) { free( tva ); free( vtm ); free( DP ); free( v_f_g ); free( tv_sq ); cudaFree( g_tva ); cublasDestroy( handle ); fprintf (stderr, "!!!! Device memory allocation error (A)\n"); getchar(); return; } cudaStat = cudaMalloc((void**)&g_vtm, len_tv * sizeof(float)); cudaStat = cudaMalloc((void**)&g_DotProd, ntv * sizeof(float)); for( i_r = 0; i_r < ntv; i_r++ ) for( i_c = 0; i_c < len_tv; i_c++ ) tr_ar[i_c * ntv + i_r] = tva[i_r * len_tv + i_c]; // Copy cpu vector to gpu vector status = cublasSetVector( len_tv * ntv, sizeof(float), tr_ar, 1, g_tva, 1 ); free( tr_ar ); for( i_v = 0; i_v < ntv; i_v++ ) { tv_sq[ i_v ] = 0; for( i_el = 0; i_el < len_tv; i_el++ ) tv_sq[i_v] += pow( tva[i_v*len_tv + i_el], (float)2.0 ); } for ( trvei = 0; trvei < ntv; trvei++ ) { status = cublasSetVector( len_tv, sizeof(float), &tva[trvei * len_tv], 1, g_vtm, 1 ); status = cublasSgemv( handle, CUBLAS_OP_N, ntv, len_tv, &alpha, g_tva, ntv , g_vtm, 1, &beta, g_DotProd, 1 ); status = cublasGetVector( ntv, sizeof(float), g_DotProd, 1, DP, 1 ); for ( i_c = 0; i_c < ntv; i_c++ ) v_f_g[i_c] = exp( -g_val * (tv_sq[trvei] + tv_sq[i_c]-((double)2.0)* (double)DP[i_c] )); pecm-> x[trvei].values[0] = trvei + 1; for ( i_c = 0; i_c < ntv; i_c++ ) pecm-> x[trvei].values[i_c + 1] = v_f_g[i_c]; } free( tva ); free( vtm ); free( DP ); free( v_f_g ); free( tv_sq ); cudaFree( g_tva ); cudaFree( g_vtm ); cudaFree( g_DotProd ); cublasDestroy( handle ); }
/* Main */ int main(int argc, char **argv) { cublasStatus_t status; float *h_A; float *h_B; float *h_C; float *h_C_ref; float *d_A = 0; float *d_B = 0; float *d_C = 0; float alpha = 1.0f; float beta = 0.0f; int n2 = N * N; int i; float error_norm; float ref_norm; float diff; cublasHandle_t handle; int dev = findCudaDevice(argc, (const char **) argv); if (dev == -1) { return EXIT_FAILURE; } /* Initialize CUBLAS */ printf("simpleCUBLAS test running..\n"); status = cublasCreate(&handle); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! CUBLAS initialization error\n"); return EXIT_FAILURE; } /* Allocate host memory for the matrices */ h_A = (float *)malloc(n2 * sizeof(h_A[0])); if (h_A == 0) { fprintf(stderr, "!!!! host memory allocation error (A)\n"); return EXIT_FAILURE; } h_B = (float *)malloc(n2 * sizeof(h_B[0])); if (h_B == 0) { fprintf(stderr, "!!!! host memory allocation error (B)\n"); return EXIT_FAILURE; } h_C = (float *)malloc(n2 * sizeof(h_C[0])); if (h_C == 0) { fprintf(stderr, "!!!! host memory allocation error (C)\n"); return EXIT_FAILURE; } /* Fill the matrices with test data */ for (i = 0; i < n2; i++) { h_A[i] = rand() / (float)RAND_MAX; h_B[i] = rand() / (float)RAND_MAX; h_C[i] = rand() / (float)RAND_MAX; } /* Allocate device memory for the matrices */ if (cudaMalloc((void **)&d_A, n2 * sizeof(d_A[0])) != cudaSuccess) { fprintf(stderr, "!!!! device memory allocation error (allocate A)\n"); return EXIT_FAILURE; } if (cudaMalloc((void **)&d_B, n2 * sizeof(d_B[0])) != cudaSuccess) { fprintf(stderr, "!!!! device memory allocation error (allocate B)\n"); return EXIT_FAILURE; } if (cudaMalloc((void **)&d_C, n2 * sizeof(d_C[0])) != cudaSuccess) { fprintf(stderr, "!!!! device memory allocation error (allocate C)\n"); return EXIT_FAILURE; } /* Initialize the device matrices with the host matrices */ status = cublasSetVector(n2, sizeof(h_A[0]), h_A, 1, d_A, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! device access error (write A)\n"); return EXIT_FAILURE; } status = cublasSetVector(n2, sizeof(h_B[0]), h_B, 1, d_B, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! device access error (write B)\n"); return EXIT_FAILURE; } status = cublasSetVector(n2, sizeof(h_C[0]), h_C, 1, d_C, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! device access error (write C)\n"); return EXIT_FAILURE; } /* Performs operation using plain C code */ simple_sgemm(N, alpha, h_A, h_B, beta, h_C); h_C_ref = h_C; /* Performs operation using cublas */ status = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A, N, d_B, N, &beta, d_C, N); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! kernel execution error.\n"); return EXIT_FAILURE; } /* Allocate host memory for reading back the result from device memory */ h_C = (float *)malloc(n2 * sizeof(h_C[0])); if (h_C == 0) { fprintf(stderr, "!!!! host memory allocation error (C)\n"); return EXIT_FAILURE; } /* Read the result back */ status = cublasGetVector(n2, sizeof(h_C[0]), d_C, 1, h_C, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! device access error (read C)\n"); return EXIT_FAILURE; } /* Check result against reference */ error_norm = 0; ref_norm = 0; for (i = 0; i < n2; ++i) { diff = h_C_ref[i] - h_C[i]; error_norm += diff * diff; ref_norm += h_C_ref[i] * h_C_ref[i]; } error_norm = (float)sqrt((double)error_norm); ref_norm = (float)sqrt((double)ref_norm); if (fabs(ref_norm) < 1e-7) { fprintf(stderr, "!!!! reference norm is 0\n"); return EXIT_FAILURE; } /* Memory clean up */ free(h_A); free(h_B); free(h_C); free(h_C_ref); if (cudaFree(d_A) != cudaSuccess) { fprintf(stderr, "!!!! memory free error (A)\n"); return EXIT_FAILURE; } if (cudaFree(d_B) != cudaSuccess) { fprintf(stderr, "!!!! memory free error (B)\n"); return EXIT_FAILURE; } if (cudaFree(d_C) != cudaSuccess) { fprintf(stderr, "!!!! memory free error (C)\n"); return EXIT_FAILURE; } /* Shutdown */ status = cublasDestroy(handle); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! shutdown error (A)\n"); return EXIT_FAILURE; } printf("CUBLAS program finished\n"); exit(error_norm / ref_norm < 1e-6f ? EXIT_SUCCESS : EXIT_FAILURE); }
int gpu_gemm(const real *h_A, const real *h_B, real *h_C, const real alpha, const real beta, const int N) { real *d_A = 0; real *d_B = 0; real *d_C = 0; int n2 = N * N; cublasStatus_t status; cublasHandle_t handle; status = cublasCreate(&handle); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! CUBLAS initialization error\n"); return EXIT_FAILURE; } /* Allocate device memory for the matrices */ if (cudaMalloc((void **)&d_A, n2 * sizeof(d_A[0])) != cudaSuccess) { fprintf(stderr, "!!!! device memory allocation error (allocate A)\n"); return EXIT_FAILURE; } if (cudaMalloc((void **)&d_B, n2 * sizeof(d_B[0])) != cudaSuccess) { fprintf(stderr, "!!!! device memory allocation error (allocate B)\n"); return EXIT_FAILURE; } if (cudaMalloc((void **)&d_C, n2 * sizeof(d_C[0])) != cudaSuccess) { fprintf(stderr, "!!!! device memory allocation error (allocate C)\n"); return EXIT_FAILURE; } /* Initialize the device matrices with the host matrices */ status = cublasSetVector(n2, sizeof(h_A[0]), h_A, 1, d_A, 1); status = cublasSetVector(n2, sizeof(h_B[0]), h_B, 1, d_B, 1); status = cublasSetVector(n2, sizeof(h_C[0]), h_C, 1, d_C, 1); /* Performs operation using cublas */ status = GEMM(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A, N, d_B, N, &beta, d_C, N); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! kernel execution error.\n"); return EXIT_FAILURE; } /* Read the result back */ status = cublasGetVector(n2, sizeof(h_C[0]), d_C, 1, h_C, 1); if (cudaFree(d_A) != cudaSuccess) { fprintf(stderr, "!!!! memory free error (A)\n"); return EXIT_FAILURE; } if (cudaFree(d_B) != cudaSuccess) { fprintf(stderr, "!!!! memory free error (B)\n"); return EXIT_FAILURE; } if (cudaFree(d_C) != cudaSuccess) { fprintf(stderr, "!!!! memory free error (C)\n"); return EXIT_FAILURE; } /* Shutdown */ status = cublasDestroy(handle); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! shutdown error (A)\n"); return EXIT_FAILURE; } return 0; }
SEXP magma_dpoMatrix_dgeMatrix_solve(SEXP a, SEXP b) { #ifdef HIPLAR_WITH_MAGMA SEXP Chol = magma_dpoMatrix_chol(a), val = PROTECT(NEW_OBJECT(MAKE_CLASS("dgeMatrix"))); int *adims = INTEGER(GET_SLOT(a, Matrix_DimSym)), *bdims = INTEGER(GET_SLOT(b, Matrix_DimSym)), info; /* Checking Matrix Dimensions */ if (adims[1] != bdims[0]) error(_("Dimensions of system to be solved are inconsistent")); if (adims[0] < 1 || bdims[1] < 1) error(_("Cannot solve() for matrices with zero extents")); /* ****************************************** */ SET_SLOT(val, Matrix_factorSym, allocVector(VECSXP, 0)); slot_dup(val, b, Matrix_DimSym); slot_dup(val, b, Matrix_xSym); double *A = REAL(GET_SLOT(Chol, Matrix_xSym)); double *B = REAL(GET_SLOT(val, Matrix_xSym)); if(GPUFlag == 1) { #ifdef HIPLAR_DBG R_ShowMessage("DBG: Solving system of Ax = b, A = dpo, b = dge, using dpotrs_gpu;"); #endif double *d_A, *d_B; const char *uplo = uplo_P(Chol); magma_int_t NRHS = bdims[1]; magma_int_t lda = adims[1]; magma_int_t ldb = bdims[0]; magma_int_t N = adims[0]; cublasStatus retStatus; /*if(uplo == "U") uplo = MagmaUpperStr; else if(uplo == "L") uplo = MagmaLowerStr; else uplo = MagmaUpperStr; */ cublasAlloc(N * lda, sizeof(double), (void**)&d_A); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Memory Allocation")); /********************************************/ cublasAlloc(N * NRHS, sizeof(double), (void**)&d_B); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Memory Allocation")); /********************************************/ cublasSetVector( N * lda , sizeof(double), A, 1, d_A, 1); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Data Transfer to Device")); /********************************************/ cublasSetVector( ldb * NRHS, sizeof(double), B, 1, d_B, 1 ); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Data Transfer to Device")); /********************************************/ magma_dpotrs_gpu(uplo[0], N, NRHS , d_A, lda, d_B, ldb, &info); cublasGetVector( ldb * NRHS, sizeof(double), d_B, 1, B, 1); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Data Transfer from Device")); /********************************************/ cublasFree(d_A); cublasFree(d_B); } else { #ifdef HIPLAR_DBG R_ShowMessage("DBG: Solving system of Ax = b, A = dpo, b = dge, using dpotrs;"); #endif F77_CALL(dpotrs)(uplo_P(Chol), adims, bdims + 1, A , adims, B , bdims, &info); } if (info) { if(info > 0) error(_("the leading minor of order %d is not positive definite"), info); else /* should never happen! */ error(_("Lapack routine %s returned error code %d"), "dpotrf", info); } UNPROTECT(1); return val; #endif return R_NilValue; }
/* Main */ int main(int argc, char** argv) { if (argc!=5){ fprintf (stderr, "Usage: %s <sizeM> <sizeN> <sizeK> <Nb iter>\n",argv[0]); exit(0); } const int M=strtol(argv[1],0,0); const int N=strtol(argv[2],0,0); const int K=strtol(argv[3],0,0); const int NBITER=strtol(argv[4],0,0); const int NA= M * K; const int NB= K * N; const int NC= M * N; real* h_A; real* h_B; real* h_C; const real alpha = 1.0f; const real beta = 0.0f; #ifdef NVIDIA cublasStatus status; real* d_A = 0; real* d_B = 0; real* d_C = 0; #endif #ifdef COMPARE real* h_C_ref; real error_norm; real ref_norm; real diff; #endif /* Allocate host memory for the matrices */ h_A = (real*)malloc(NA * sizeof(h_A[0])); if (h_A == 0) { fprintf (stderr, "!!!! host memory allocation error (A)\n"); return EXIT_FAILURE; } h_B = (real*)malloc(NB * sizeof(h_B[0])); if (h_B == 0) { fprintf (stderr, "!!!! host memory allocation error (B)\n"); return EXIT_FAILURE; } h_C = (real*)malloc(NC * sizeof(h_C[0])); if (h_C == 0) { fprintf (stderr, "!!!! host memory allocation error (C)\n"); return EXIT_FAILURE; } for (int i = 0; i < NA; ++i) h_A[i] = M_PI+(real)i; for (int i = 0; i < NB; ++i) h_B[i] = M_PI+(real)i; #ifdef NVIDIA /* Initialize CUBLAS */ status = cublasInit(); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! CUBLAS initialization error\n"); return EXIT_FAILURE; } /* Allocate device memory for the matrices */ status = cublasAlloc(NA, sizeof(d_A[0]), (void**)&d_A); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! device memory allocation error (A)\n"); return EXIT_FAILURE; } status = cublasAlloc(NB, sizeof(d_B[0]), (void**)&d_B); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! device memory allocation error (B)\n"); return EXIT_FAILURE; } status = cublasAlloc(NC, sizeof(d_C[0]), (void**)&d_C); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! device memory allocation error (C)\n"); return EXIT_FAILURE; } /* Initialize the device matrices with the host matrices */ status = cublasSetVector(NA, sizeof(h_A[0]), h_A, 1, d_A, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! device access error (write A)\n"); return EXIT_FAILURE; } status = cublasSetVector(NB, sizeof(h_B[0]), h_B, 1, d_B, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! device access error (write B)\n"); return EXIT_FAILURE; } status = cublasSetVector(NC, sizeof(h_C[0]), h_C, 1, d_C, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! device access error (write C)\n"); return EXIT_FAILURE; } /* Clear last error */ cublasGetError(); #endif #ifdef COMPARE /* Performs operation using plain C code */ for (int i=0;i<NBITER;i++) c_xgemm(M,N,K, alpha, h_A, h_B, beta, h_C); h_C_ref = h_C; /* Allocate host memory for reading back the result from device memory */ h_C = (real*)malloc(NC * sizeof(h_C[0])); if (h_C == 0) { fprintf (stderr, "!!!! host memory allocation error (C)\n"); return EXIT_FAILURE; } #endif #ifdef NVIDIA /* Performs operation using cublas */ for (int i=0;i<NBITER;i++) //We must Change the order of the parameter as cublas take //matrix as colomn major and C matrix is row major cublasSgemm('n', 'n', N, M, K, alpha, d_B, N, d_A, K, beta, d_C, N); status = cublasGetError(); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! kernel execution error.\n"); return EXIT_FAILURE; } /* Read the result back */ status = cublasGetVector(NC, sizeof(h_C[0]), d_C, 1, h_C, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! device access error (read C)\n"); return EXIT_FAILURE; } #elif defined( CXGEMM ) for (int i=0;i<NBITER;i++) c_xgemm(M,N,K, alpha, h_A, h_B, beta, h_C); #else char transa='N', transb='N'; for (int i=0;i<NBITER;i++) sgemm_(&transb, &transa, &N, &M, &K, &alpha, h_B, &N, h_A, &K, &beta, h_C, &N); #endif #ifdef COMPARE /* Check result against reference */ error_norm = 0; ref_norm = 0; for (int i = 0; i < NC; ++i) { diff = h_C_ref[i] - h_C[i]; error_norm += diff * diff; ref_norm += h_C_ref[i] * h_C_ref[i]; } error_norm = (float)sqrt((double)error_norm); ref_norm = (float)sqrt((double)ref_norm); if (fabs(ref_norm) < 1e-7) { fprintf (stderr, "!!!! reference norm is 0\n"); return EXIT_FAILURE; } printf( "Test %s\n", (error_norm / ref_norm < 1e-6f) ? "PASSED" : "FAILED"); #endif /* Memory clean up */ free(h_A); free(h_B); free(h_C); #ifdef NVIDIA status = cublasFree(d_A); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! memory free error (A)\n"); return EXIT_FAILURE; } status = cublasFree(d_B); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! memory free error (B)\n"); return EXIT_FAILURE; } status = cublasFree(d_C); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! memory free error (C)\n"); return EXIT_FAILURE; } /* Shutdown */ status = cublasShutdown(); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! shutdown error (A)\n"); return EXIT_FAILURE; } #endif // if (argc <= 1 || strcmp(argv[1], "-noprompt")) { // printf("\nPress ENTER to exit...\n"); // getchar(); // } return EXIT_SUCCESS; }
SEXP magma_dpoMatrix_matrix_solve(SEXP a, SEXP b) { #ifdef HIPLAR_WITH_MAGMA SEXP Chol = magma_dpoMatrix_chol(a), val = PROTECT(duplicate(b)); int *adims = INTEGER(GET_SLOT(a, Matrix_DimSym)), *bdims = INTEGER(getAttrib(b, R_DimSymbol)), info; if (!(isReal(b) && isMatrix(b))) error(_("Argument b must be a numeric matrix")); if (*adims != *bdims || bdims[1] < 1 || *adims < 1) error(_("Dimensions of system to be solved are inconsistent")); double *A = REAL(GET_SLOT(Chol, Matrix_xSym)); double *B = REAL(val); //const char *uplo = uplo_P(Chol); //int N = bdims[1]; //There is only a GPU interface for this call //so it will be the default setting if the GPU is on if(GPUFlag == 1) { #ifdef HIPLAR_DBG R_ShowMessage("DBG: Solving system of Ax = b, A = dpo, b = dge, using dpotrs_gpu;"); #endif double *d_A, *d_B; const char *uplo = uplo_P(Chol); magma_int_t NRHS = bdims[1]; magma_int_t lda = adims[1]; magma_int_t ldb = bdims[0]; magma_int_t N = adims[0]; cublasStatus retStatus; cublasAlloc(N * lda, sizeof(double), (void**)&d_A); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Memory Allocation")); /********************************************/ cublasAlloc(N * NRHS, sizeof(double), (void**)&d_B); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Memory Allocation")); /********************************************/ cublasSetVector( N * lda , sizeof(double), A, 1, d_A, 1); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Data Transfer to Device")); /********************************************/ cublasSetVector( ldb * NRHS, sizeof(double), B, 1, d_B, 1 ); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Data Transfer to Device")); /********************************************/ magma_dpotrs_gpu(uplo[0], N, NRHS , d_A, lda, d_B, ldb, &info); cublasGetVector( ldb * NRHS, sizeof(double), d_B, 1, B, 1); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Data Transfer from Device")); /********************************************/ cublasFree(d_A); cublasFree(d_B); } else { F77_CALL(dpotrs)(uplo_P(Chol), adims, bdims + 1, REAL(GET_SLOT(Chol, Matrix_xSym)), adims, REAL(val), bdims, &info); } // Error checking of MAGMA/LAPACK calls if (info) { if(info > 0) error(_("the leading minor of order %d is not positive definite"), info); else /* should never happen! */ error(_("Lapack routine %s returned error code %d"), "dpotrf", info); } UNPROTECT(1); return val; #endif return R_NilValue; }
int main(void) { cublasStatus status; float* h_image; float* h_covariance; float* d_image; float* d_covariance; float alpha = 1.0f; float beta = 0.0f; int imgsize = N * L; //int i; FILE *fp1, *fp2; /* Initialize CUBLAS */ status = cublasInit(); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! CUBLAS initialization error\n"); return EXIT_FAILURE; } /* Allocate host memory for the image */ h_image = (float*)malloc(imgsize * sizeof(float)); if (h_image == 0) { fprintf (stderr, "!!!! host memory allocation error (image)\n"); return EXIT_FAILURE; } h_covariance = (float*)calloc(L * L, sizeof(float)); if (h_covariance == 0) { fprintf (stderr, "!!!! host memory allocation error (covariance)\n"); return EXIT_FAILURE; } /* Fill the image with test data for (i = 0; i < imgsize; i++) { h_image[i] = rand() / (float)RAND_MAX; }*/ fp1 = fopen("image.dat","rb"); fread(h_image, sizeof(float), imgsize, fp1); printf("Valor de image[0]: %f\n", h_image[8]); /* Allocate device memory */ status = cublasAlloc(imgsize, sizeof(float), (void**)&d_image); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! device memory allocation error (image)\n"); return EXIT_FAILURE; } status = cublasAlloc(L * L, sizeof(float), (void**)&d_covariance); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! device memory allocation error (covariance)\n"); return EXIT_FAILURE; } /* Copy image to device memory */ status = cublasSetVector(imgsize, sizeof(float), h_image, 1, d_image, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! device access error (write A)\n"); return EXIT_FAILURE; } status = cublasSetVector(L * L, sizeof(float), h_covariance, 1, d_covariance, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! device access error (write covariance)\n"); return EXIT_FAILURE; } /* Clear last error */ cublasGetError(); /* Calculate covariance matrix using cublas */ cublasSgemm('n', 't', L, L, N, alpha, d_image, L, d_image, L, beta, d_covariance, L); status = cublasGetError(); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! kernel execution error.\n"); return EXIT_FAILURE; } /* Read the result back */ status = cublasGetVector(L * L, sizeof(float), d_covariance, 1, h_covariance, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! device access error (read covariance)\n"); return EXIT_FAILURE; } fp2 = fopen("covariance.dat","wb"); fwrite(h_covariance, sizeof(float), L*L, fp2); printf("Valor de covariance[8]: %f\n", h_covariance[8]); /* Memory clean up */ free(h_image); free(h_covariance); status = cublasFree(d_image); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! memory free error (image)\n"); return EXIT_FAILURE; } status = cublasFree(d_covariance); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! memory free error (covariance)\n"); return EXIT_FAILURE; } /* Shutdown */ status = cublasShutdown(); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! shutdown error (A)\n"); return EXIT_FAILURE; } fclose(fp1); fclose(fp2); printf("\nPress ENTER to exit...\n"); getchar(); return EXIT_SUCCESS; }